def main():
    """Export a feature matrix per persona of a busqueda, classify each one
    with a pre-trained Weka NaiveBayes model, and flag the top-5 ranked
    snippets with converging_pipelines=1.

    Usage: script OUTPUT_PATH BUSQUEDA_ID
    Exits with -1 on any missing/invalid argument.
    """
    try:
        base = DiasporaOutput(sys.argv[1])
    except Exception:  # was a bare except; keep best-effort report-and-quit
        logging.error("Missing output path")
        exit(-1)
    try:
        busqueda_id = int(sys.argv[2])
    except (IndexError, ValueError):  # missing argv entry or non-numeric id
        logging.error("Missing busqueda_id")
        exit(-1)
    busqueda = Busqueda.objects.get(pk=busqueda_id)
    for persona in busqueda.persona_set.all():
        person_file = base.write_personal_feature_matrix(persona)
        logging.info("Exporting features matrix for person " + persona.name)
        # TODO: reconnaitre 32 bits et 64 bits data model
        # j48_path = UNOPORUNO_ROOT + '/resources/classifiers/j48/J48.weka.32.data.model'
        nbtree_path = UNOPORUNO_ROOT + "/resources/classifiers/naivebayes/NaiveBayes.data.model"
        command = (
            "java weka.classifiers.bayes.NaiveBayes -l " + nbtree_path
            + " -T " + person_file + " -p 1 > " + person_file + ".out"
        )
        logging.info("classyfying with command=" + command)
        # NOTE(review): os.system does not raise on a non-zero exit status;
        # this guard only covers OS-level failures to spawn the shell.
        try:
            os.system(command)
        except Exception:
            exit(-1)
    # Pick up every generated .out prediction file and mark its top-5 snippets.
    for subdirs, dirs, files in os.walk(sys.argv[1] + "/"):
        for file in files:
            re_out = re.search(r"\.out$", file)  # raw string: match a literal dot
            if not re_out:
                continue
            top5 = get_weka_top5(sys.argv[1] + "/" + file)
            logging.info("Extracting " + str(top5) + " tuples from file:" + file)
            for s in top5:
                # s[0] is the snippet primary key as reported by Weka output.
                snippet = Snippet.objects.get(pk=int(s[0]))
                snippet.converging_pipelines = 1
                snippet.save()
def main():
    """Export a feature matrix per persona of a busqueda, classify each one
    with a pre-trained Weka SMO model, and flag the top-5 ranked snippets
    with converging_pipelines=1.

    Usage: script OUTPUT_PATH BUSQUEDA_ID
    Exits with -1 on any missing/invalid argument.
    """
    try:
        base = DiasporaOutput(sys.argv[1])
    except Exception:  # was a bare except; keep best-effort report-and-quit
        logging.error('Missing output path')
        exit(-1)
    try:
        busqueda_id = int(sys.argv[2])
    except (IndexError, ValueError):  # missing argv entry or non-numeric id
        logging.error('Missing busqueda_id')
        exit(-1)
    busqueda = Busqueda.objects.get(pk=busqueda_id)
    for persona in busqueda.persona_set.all():
        person_file = base.write_personal_feature_matrix(persona)
        logging.info('Exporting features matrix for person ' + persona.name)
        # TODO: reconnaitre 32 bits et 64 bits data model
        # j48_path = UNOPORUNO_ROOT + '/resources/classifiers/j48/J48.weka.32.data.model'
        smo_path = UNOPORUNO_ROOT + '/resources/classifiers/smo/SMO.data.model'
        command = ('java weka.classifiers.functions.SMO -l ' + smo_path
                   + ' -T ' + person_file + ' -p 1 > ' + person_file + '.out')
        logging.info('classyfying with command=' + command)
        # NOTE(review): os.system does not raise on a non-zero exit status;
        # this guard only covers OS-level failures to spawn the shell.
        try:
            os.system(command)
        except Exception:
            exit(-1)
    # Pick up every generated .out prediction file and mark its top-5 snippets.
    for subdirs, dirs, files in os.walk(sys.argv[1] + '/'):
        for file in files:
            re_out = re.search(r'\.out$', file)  # raw string: match a literal dot
            if not re_out:
                continue
            top5 = get_weka_top5(sys.argv[1] + '/' + file)
            logging.info('Extracting ' + str(top5) + ' tuples from file:' + file)
            for s in top5:
                # s[0] is the snippet primary key as reported by Weka output.
                snippet = Snippet.objects.get(pk=int(s[0]))
                snippet.converging_pipelines = 1
                snippet.save()
def main():
    """Classify every persona of a busqueda with a pre-trained Weka SMO
    model and flag the resulting top-5 snippets (converging_pipelines=1).

    Usage: script OUTPUT_PATH BUSQUEDA_ID
    Exits with -1 on any missing/invalid argument.
    """
    try:
        base = DiasporaOutput(sys.argv[1])
    except Exception:  # narrowed from a bare except; behavior unchanged
        logging.error('Missing output path')
        exit(-1)
    try:
        busqueda_id = int(sys.argv[2])
    except (IndexError, ValueError):
        logging.error('Missing busqueda_id')
        exit(-1)
    busqueda = Busqueda.objects.get(pk=busqueda_id)
    # Phase 1: dump a feature matrix per persona and run the classifier on it.
    for persona in busqueda.persona_set.all():
        person_file = base.write_personal_feature_matrix(persona)
        logging.info('Exporting features matrix for person ' + persona.name)
        # TODO: reconnaitre 32 bits et 64 bits data model
        # j48_path = UNOPORUNO_ROOT + '/resources/classifiers/j48/J48.weka.32.data.model'
        smo_path = UNOPORUNO_ROOT + '/resources/classifiers/smo/SMO.data.model'
        command = ('java weka.classifiers.functions.SMO -l ' + smo_path
                   + ' -T ' + person_file + ' -p 1 > ' + person_file + '.out')
        logging.info('classyfying with command=' + command)
        # os.system returns the command's status instead of raising on failure,
        # so this try only protects against spawn-level errors.
        try:
            os.system(command)
        except Exception:
            exit(-1)
    # Phase 2: walk the output dir and mark the top-5 snippets of each .out file.
    for subdirs, dirs, files in os.walk(sys.argv[1] + '/'):
        for file in files:
            if not re.search(r'\.out$', file):  # raw string: literal '.out' suffix
                continue
            top5 = get_weka_top5(sys.argv[1] + '/' + file)
            logging.info('Extracting ' + str(top5) + ' tuples from file:' + file)
            for s in top5:
                snippet = Snippet.objects.get(pk=int(s[0]))  # s[0] = snippet pk
                snippet.converging_pipelines = 1
                snippet.save()
except: logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.') exit(-1) logging.info('Processing busqueda ' +busqueda.nombre ) try: results_path = sys.argv[2] except: logging.error('Missing parameter path') logging.error('Usage: python unoporuno_export.py NAME|NUMBER path') exit(-1) try: person_id_list = [] person_id_file = open(sys.argv[3]) for l in person_id_file: person_id_list.append(l.strip()) except: person_id_list = None diaspora_output = DiasporaOutput(results_path) if person_id_list: logging.debug('launching export_persona with id_list=' +str(person_id_list)) diaspora_output.export_unoporuno_persona(busqueda, person_id_list) else: diaspora_output.export_unoporuno_busqueda(busqueda)
# Script tail: resolve the busqueda by name (busqueda_in is set earlier in the
# file), read the output path and an optional person-id file from argv, then
# export either the selected personas or the whole busqueda.
try:
    busqueda = Busqueda.objects.get(nombre=busqueda_in)
except Exception:  # narrowed from a bare except; behavior unchanged
    logging.error('No busqueda object with id=' + busqueda_in + ' in UNOPORUNO database.')
    exit(-1)
logging.info('Processing busqueda ' + busqueda.nombre)
try:
    results_path = sys.argv[2]
except IndexError:
    logging.error('Missing parameter path')
    logging.error('Usage: python unoporuno_export.py NAME|NUMBER path')
    exit(-1)
# Optional third argument: a file with one person id per line.  Any failure
# (missing argument, unreadable file) falls back to exporting everything.
try:
    person_id_list = []
    with open(sys.argv[3]) as person_id_file:  # fix: file handle was never closed
        for l in person_id_file:
            person_id_list.append(l.strip())
except Exception:
    person_id_list = None
diaspora_output = DiasporaOutput(results_path)
if person_id_list:
    logging.debug('launching export_persona with id_list=' + str(person_id_list))
    diaspora_output.export_unoporuno_persona(busqueda, person_id_list)
else:
    diaspora_output.export_unoporuno_busqueda(busqueda)
def classify_person_top5(busqueda_id, path, classifier, data_model_file):
    """Classify every persona of a busqueda with a pre-trained Weka model,
    tally the countries featured in positively classified snippets, derive a
    mobility prediction per persona, and mark up to five "converging"
    snippets (converging_pipelines=1) per persona.

    :param busqueda_id: primary key of the Busqueda (coerced to int).
    :param path: base output directory; results go under path/classifier/.
    :param classifier: fully qualified Weka classifier class name.
    :param data_model_file: path to the serialized Weka model (-l argument).
    Exits with -1 on bad output path or busqueda_id.
    """
    # TODO: validar cuando a) no hay snippets clasificados como positivos y
    # b) hay menos de 5 snippets clasificados como positivos
    output_path = path + '/' + classifier + '/'
    logging.info('classifying persons with busqueda_id=' + str(busqueda_id) + ', classifier'
                 + classifier + ' , data_model_file=' + data_model_file)
    try:
        base = DiasporaOutput(output_path)
    except Exception:  # narrowed from a bare except; behavior unchanged
        logging.error('Error on output path' + output_path)
        exit(-1)
    try:
        busqueda_id = int(busqueda_id)
    except (TypeError, ValueError):
        logging.error('Missing busqueda_id')
        exit(-1)
    busqueda = Busqueda.objects.get(pk=busqueda_id)
    d_personas = dict()  # persona_id -> {country_code: frequency}
    # --- Phase 1: one feature matrix per persona, classified via Weka.
    for persona in busqueda.persona_set.all():
        # fix: original had a duplicated `persona_file = persona_file =` assignment
        persona_file = base.write_personal_feature_matrix_2class(persona)
        command = ('java ' + classifier + ' -l ' + data_model_file +
                   ' -T ' + persona_file + ' -p 1 > ' + persona_file + '.out')
        # os.system does not raise on non-zero exit; guard covers spawn errors only.
        try:
            os.system(command)
        except Exception:
            exit(-1)
        logging.info('classyfying with command=' + command)
    # --- Phase 2: parse each .out file, count featured countries per persona,
    # and pre-mark classified snippets with converging_pipelines=2.
    for subdirs, dirs, files in os.walk(output_path):
        for file in files:
            if not re.search(r'\.out$', file):  # raw string: literal '.out' suffix
                continue
            classed_snippets = get_weka_top5(output_path + '/' + file)
            logging.info('Extracting ' + str(len(classed_snippets)) + ' tuples from file:' + file)
            logging.info('Classed snippets=' + str(classed_snippets))
            if len(classed_snippets):
                for tupla in classed_snippets:
                    logging.info('looking for snippet id =' + str(tupla[0]))
                    snippet = Snippet.objects.get(id=int(tupla[0]))
                    # `in` instead of the Python-2-only dict.has_key
                    if snippet.persona_id in d_personas:
                        d_paises = d_personas[snippet.persona_id]
                    else:
                        d_paises = dict()
                    lista_paises = (snippet.featured_countries.split(',')
                                    if snippet.featured_countries else [])
                    for pais in lista_paises:
                        # NOTE(review): assumes country codes are ASCII; under
                        # Python 2 encode() on a non-ascii str would raise — confirm.
                        u_pais = pais.encode('utf-8')
                        d_paises[u_pais] = d_paises.get(u_pais, 0) + 1
                    d_personas[snippet.persona_id] = d_paises
                    snippet.converging_pipelines = 2
                    snippet.RE_score = get_feature_count(snippet.RE_features)
                    snippet.save()
    # Latin-American country codes.  NOTE(review): 'C' looks like a typo (Cuba
    # would be 'CU') and 'PY' appears twice — kept as-is to preserve behavior.
    LA = ['AR', 'BZ', 'BO', 'CL', 'CO', 'CR', 'C', 'DO', 'SV', 'MX', 'GT', 'HT',
          'JM', 'NI', 'PY', 'PE', 'VE', 'TT', 'PY', 'HN', 'PA', 'UY']
    # --- Phase 3: per persona, find the most frequent LA and non-LA country,
    # set the mobility prediction, and promote up to 5 snippets to
    # converging_pipelines=1.
    for persona in busqueda.persona_set.all():
        if persona.id not in d_personas:
            d_personas[persona.id] = dict()
        logging.info('Persona ' + persona.name + ' has the following country frequencies:'
                     + str(d_personas[persona.id]) + ' and prediction=' + str(persona.prediction))
        LA_freq = ('', 0)     # (country, count) of most frequent LA country
        mundo_freq = ('', 0)  # (country, count) of most frequent non-LA country
        for pais in d_personas[persona.id].keys():
            u_pais = pais.encode('utf-8')
            if u_pais in LA:
                if d_personas[persona.id][u_pais] > LA_freq[1]:
                    LA_freq = (u_pais, d_personas[persona.id][u_pais])
            else:
                if d_personas[persona.id][u_pais] > mundo_freq[1]:
                    mundo_freq = (u_pais, d_personas[persona.id][u_pais])
        logging.info('Pais LA mas frequente:' + str(LA_freq))
        logging.info('Pais no LA mas frequente:' + str(mundo_freq))
        # 3 del país móvil más frecuente
        # 2 del país latinoamericano más frecuente
        # los demás con móviles
        if mundo_freq[1] > 0 and LA_freq[1] > 0:
            persona.prediction = 1  # evidence both inside and outside LA -> mobile
            logging.info(persona.name + ' is movil! with prediction=' + str(persona.prediction))
        elif mundo_freq[1] > 0 or LA_freq[1] > 0:
            persona.prediction = 2  # evidence on one side only -> local
            logging.info('local!')
        else:
            persona.prediction = 3  # no country evidence at all
            logging.info('no sé!')
        mobile_snippets = persona.snippet_set.filter(converging_pipelines=2).order_by('-RE_score')
        local_snippets = persona.snippet_set.filter(converging_pipelines=3).order_by('-RE_score')
        converging_count = [0, 0, 0]  # [total promoted, world hits, LA hits]
        if mundo_freq[1] > 0:
            mobile_limit = min(3, mundo_freq[1])  # at most 3 from the top world country
            LA_limit = min(2, LA_freq[1])         # at most 2 from the top LA country
            for s in mobile_snippets:
                if converging_count[0] >= 5:
                    break
                if converging_count[1] < mobile_limit:
                    if mundo_freq[0] in str(s.featured_countries):
                        s.converging_pipelines = 1
                        s.save()
                        converging_count[0] += 1
                        converging_count[1] += 1
                        logging.info('world hit!')
                elif converging_count[2] < LA_limit:
                    if LA_freq[0] in str(s.featured_countries):
                        s.converging_pipelines = 1
                        s.save()
                        converging_count[0] += 1
                        converging_count[2] += 1
                        logging.info('LA hit!')
            if converging_count[0] < 5:
                for s in local_snippets:
                    if converging_count[0] >= 5:
                        break
                    if converging_count[1] < mobile_limit:
                        if mundo_freq[0] in str(s.featured_countries):
                            s.converging_pipelines = 1
                            s.save()
                            converging_count[0] += 1
                            converging_count[1] += 1
                            logging.info('world local hit!')
                    elif converging_count[2] < LA_limit:
                        if LA_freq[0] in str(s.featured_countries):
                            s.converging_pipelines = 1
                            s.save()
                            converging_count[0] += 1
                            converging_count[2] += 1
                            logging.info('LA local hit!')
        # Fallbacks: top up to 5 promoted snippets regardless of country.
        if converging_count[0] < 5:
            for s in mobile_snippets:
                if converging_count[0] >= 5:
                    break
                if s.converging_pipelines == 1:
                    continue
                s.converging_pipelines = 1
                s.save()
                converging_count[0] += 1
        if converging_count[0] < 5:
            for s in local_snippets:
                if converging_count[0] >= 5:
                    break
                if s.converging_pipelines == 1:
                    continue
                s.converging_pipelines = 1
                s.save()
                converging_count[0] += 1
        if converging_count[0] < 5:
            todos = persona.snippet_set.filter(FG=1).exclude(RE=1).order_by('-RE_features')
            # fix: the original built `todos` but then re-iterated local_snippets
            # (already exhausted by the previous fallback), leaving `todos` unused.
            for s in todos:
                if converging_count[0] >= 5:
                    break
                if s.converging_pipelines == 1:
                    continue
                s.converging_pipelines = 1
                s.save()
                converging_count[0] += 1
        # fix: the last fallback compared the *list* converging_count to 5 and
        # did `converging_count += 1` on the list (never ran under Python 2,
        # TypeError under Python 3); use index 0 like everywhere else.
        if converging_count[0] < 5:
            for s in local_snippets:
                if converging_count[0] >= 5:
                    break
                s.converging_pipelines = 1
                s.save()
                converging_count[0] += 1
        logging.info(persona.name + ' is movil! with prediction=' + str(persona.prediction))
        persona.save()