def main():
    """Batch diaspora search, orgs pipeline only.

    Usage: argv[1] = personas CSV, argv[2] = output folder.

    For each persona, runs the organisational search pipeline, filters
    the snippets by the persona's name, and writes PipelineStats plus
    the raw snippets to the output.
    """
    logging.basicConfig(level=logging.DEBUG)
    personas_file = PersonasInput()
    personas_file.open_csv(sys.argv[1])
    personas = personas_file.read()
    buscador = BuscadorDiasporas()
    diaspora_output = DiasporaOutput(sys.argv[2])
    for p in personas:
        diaspora_output.open_person(p)
        # Lazy %-args: the message is only built if INFO is enabled.
        logging.info('batch_diaspora_search::processing %s', p.nombre)
        r = reloj()
        buscador.inicia(p.nombre, p.vinculo)
        # ******** PIPELINE orgs ********
        resultado_busqueda = buscador.genera_busquedas_organizacionales(p.orgs)
        resultado_busqueda.filtra_nominal(p.nombre)
        # NOTE(review): r.start() is never called here, while sibling
        # versions of this function call it before the search — confirm
        # whether reloj() starts timing on construction.
        r.stop()
        ps = PipelineStats()
        ps.type = 'orgs'
        ps.total_queries = resultado_busqueda.total_queries
        ps.total_snippets = len(resultado_busqueda.snippets)
        ps.tiempo_proceso = r.tiempo()[0]
        ps.encontro_vinculo = resultado_busqueda.vinculo_encontrado
        diaspora_output.write_pipeline(ps, list(resultado_busqueda.snippets))
        # The original closed the person on both the found-vinculo branch
        # (then `continue`) and the fall-through path at the end of the
        # loop body — a single unconditional close is equivalent.
        diaspora_output.close_person()
def main():
    """Run the organisational search pipeline for every persona in the
    input CSV (argv[1]), writing results to the output folder (argv[2])."""
    # Verbose logging for the whole batch run.
    logging.basicConfig(level=logging.DEBUG)
    entrada = PersonasInput()
    entrada.open_csv(sys.argv[1])
    personas = entrada.read()
    motor = BuscadorDiasporas()
    salida = DiasporaOutput(sys.argv[2])
    for persona in personas:
        salida.open_person(persona)
        logging.info('batch_diaspora_search::processing ' + persona.nombre)
        cronometro = reloj()
        motor.inicia(persona.nombre, persona.vinculo)
        # -- orgs pipeline --
        resultado = motor.genera_busquedas_organizacionales(persona.orgs)
        resultado.filtra_nominal(persona.nombre)
        cronometro.stop()
        # Collect per-pipeline statistics for the output writer.
        stats = PipelineStats()
        stats.type = 'orgs'
        stats.total_queries = resultado.total_queries
        stats.total_snippets = len(resultado.snippets)
        stats.tiempo_proceso = cronometro.tiempo()[0]
        stats.encontro_vinculo = resultado.vinculo_encontrado
        salida.write_pipeline(stats, list(resultado.snippets))
        if stats.encontro_vinculo:
            salida.close_person()
            continue
        salida.close_person()
def main():
    """Batch diaspora search.

    For every persona in the input CSV (argv[1]) run four independent
    search pipelines — name, geo, topics, orgs — writing each pipeline's
    stats and snippets under argv[2]/results_<type>.  If any pipeline
    finds the persona's vinculo, the remaining work for that persona is
    skipped.  Otherwise a convergence pass ranks the surviving snippets
    by how many pipelines returned the same link (4 = strongest) and
    writes them under argv[2]/results_converging.
    """
    logging.basicConfig(level=logging.DEBUG)
    personas_file = PersonasInput()
    personas_file.open_csv(sys.argv[1])
    personas = personas_file.read()
    buscador = BuscadorDiasporas()
    output_folder = sys.argv[2]
    for p in personas:
        logging.info('batch_diaspora_search::processing %s', p.nombre)
        r = reloj()
        buscador.inicia(p.nombre, p.vinculo)
        # The four pipelines only differ in the search generator; the
        # shared open/search/filter/stats/write cycle lives in
        # _run_pipeline (the original repeated it four times inline).
        name_list, found = _run_pipeline(
            'name', output_folder + '/results_name', p, r,
            buscador.genera_busquedas_nominales)
        if found:
            continue
        geo_list, found = _run_pipeline(
            'geo', output_folder + '/results_geo', p, r,
            lambda: buscador.genera_busquedas_geograficas(p.lugares))
        if found:
            continue
        topics_list, found = _run_pipeline(
            'topics', output_folder + '/results_topics', p, r,
            lambda: buscador.genera_busquedas_tematicas(p.temas))
        if found:
            continue
        orgs_list, found = _run_pipeline(
            'orgs', output_folder + '/results_orgs', p, r,
            lambda: buscador.genera_busquedas_organizacionales(p.orgs))
        if found:
            continue
        _converge(output_folder, p, r,
                  name_list, geo_list, orgs_list, topics_list)


def _run_pipeline(tipo, folder, p, r, genera):
    """Run one search pipeline for persona *p*.

    Opens a DiasporaOutput in *folder*, times the search produced by the
    zero-argument callable *genera*, filters the snippets by the
    persona's name, writes PipelineStats plus the raw snippets, and
    closes the person.

    Returns (filtered snippet list, vinculo-found flag).
    """
    diaspora_output = DiasporaOutput(folder)
    diaspora_output.open_person(p)
    r.start()
    resultado = genera()
    filtrados = resultado.filtra_nominal(p.nombre)
    r.stop()
    logging.debug('converging::%s_list.snippets = %d', tipo, len(filtrados))
    ps = PipelineStats()
    ps.type = tipo
    ps.total_queries = resultado.total_queries
    ps.total_snippets = len(resultado.snippets)
    ps.tiempo_proceso = r.tiempo()[0]
    ps.encontro_vinculo = resultado.vinculo_encontrado
    diaspora_output.write_pipeline(ps, list(resultado.snippets))
    # The original closed the person on both the found and not-found
    # paths — a single unconditional close is equivalent.
    diaspora_output.close_person()
    return filtrados, ps.encontro_vinculo


def _converge(output_folder, p, r, name_list, geo_list, orgs_list, topics_list):
    """Rank snippets by cross-pipeline agreement and write the results.

    A snippet whose link was returned by k different pipelines goes into
    the 'converging pipelines k' bucket; the first snippet seen for a
    link wins and the link is then consumed, so later duplicates are
    skipped.  Buckets are written strongest-first (4, 3, 2, 1),
    deduplicating links across buckets as well.
    """
    diaspora_output = DiasporaOutput(output_folder + '/results_converging')
    diaspora_output.open_person(p)
    r.start()
    # One link-set per pipeline; membership below is O(1).
    link_sets = [
        {s.link for s in name_list},
        {s.link for s in geo_list},
        {s.link for s in orgs_list},
        {s.link for s in topics_list},
    ]
    all_snippets = name_list + geo_list + orgs_list + topics_list
    logging.debug('converging::len(name_snippets_set)= %d', len(name_list))
    logging.debug('converging::len(geo_snippets_set)= %d', len(geo_list))
    logging.debug('converging::len(topics_snippets_set)= %d', len(topics_list))
    logging.debug('converging::len(orgs_snippets_set)= %d', len(orgs_list))
    logging.debug('converging::len(unique_snippets_set)= %d', len(all_snippets))
    convergent = {4: [], 3: [], 2: [], 1: []}
    for s in all_snippets:
        logging.debug('for s in unique_snippets_list.query=%s', s.query)
        # The original's 16-branch if/elif chain tested every subset of
        # the four link sets; since each branch only checks membership
        # and removes the link from exactly the sets that contain it,
        # the whole chain reduces to "how many sets still hold the link".
        hits = [links for links in link_sets if s.link in links]
        if not hits:
            continue  # link already consumed by an earlier snippet
        convergent[len(hits)].append(s)
        for links in hits:
            links.remove(s.link)
    r.stop()
    # Write each convergence level, strongest first, keeping only the
    # first snippet per link across all levels.  (The original also
    # collected the repeats into a `repeated` set that was never used;
    # that dead code is dropped.)
    unique_link_set = set()
    for level in (4, 3, 2, 1):
        unique_snippets = set()
        for s in convergent[level]:
            if s.link not in unique_link_set:
                unique_link_set.add(s.link)
                unique_snippets.add(s)
                logging.debug('convergent_%d snippet=%s title= %s ==> %s',
                              level, s.query, s.title, s.link)
            else:
                logging.debug('repeated_%d snippet=%s title= %s ==> %s',
                              level, s.query, s.title, s.link)
        ps = PipelineStats()
        ps.type = 'converging pipelines %d' % level
        diaspora_output.write_converging_pipeline(ps, list(unique_snippets),
                                                  level)
    diaspora_output.close_person()
def main():
    # Batch diaspora search.
    #
    # Usage: argv[1] = personas CSV, argv[2] = output folder.
    #
    # For each persona, runs four search pipelines (name, geo, topics,
    # orgs), writing each pipeline's stats/snippets under
    # <output_folder>/results_<type>.  If a pipeline finds the persona's
    # vinculo, the rest of the work for that persona is skipped.
    # Otherwise a final convergence pass buckets the filtered snippets
    # by how many pipelines returned the same link (4 = strongest).
    logging.basicConfig(level=logging.DEBUG)
    personas_file = PersonasInput()
    personas_file.open_csv(sys.argv[1])
    personas = personas_file.read()
    buscador = BuscadorDiasporas()
    output_folder = sys.argv[2]
    for p in personas:
        logging.info('batch_diaspora_search::processing ' + p.nombre)
        # ********
        # PIPELINE name
        # ********
        name_output_folder = output_folder + '/results_name'
        diaspora_output = DiasporaOutput(name_output_folder)
        diaspora_output.open_person(p)
        r = reloj()
        buscador.inicia(p.nombre, p.vinculo)
        r.start()
        resultado_name = buscador.genera_busquedas_nominales()
        # Keep only snippets that pass the persona-name filter.
        name_list = resultado_name.filtra_nominal(p.nombre)
        r.stop()
        logging.debug('converging::name_list.snippets = ' + str(len(name_list)))
        ps = PipelineStats()
        ps.type = 'name'
        ps.total_queries = resultado_name.total_queries
        ps.total_snippets = len(resultado_name.snippets)
        # NOTE(review): tiempo()[0] is presumably the elapsed time of the
        # last start()/stop() interval — confirm against the reloj API.
        ps.tiempo_proceso = r.tiempo()[0]
        ps.encontro_vinculo = resultado_name.vinculo_encontrado
        diaspora_output.write_pipeline(ps, list(resultado_name.snippets))
        # Vinculo found: close and skip the remaining pipelines for p.
        if ps.encontro_vinculo:
            diaspora_output.close_person()
            continue
        diaspora_output.close_person()
        # ********
        # PIPELINE geo
        # ********
        geo_output_folder = output_folder + '/results_geo'
        diaspora_output = DiasporaOutput(geo_output_folder)
        diaspora_output.open_person(p)
        r.start()
        resultado_geo = buscador.genera_busquedas_geograficas(p.lugares)
        geo_list = resultado_geo.filtra_nominal(p.nombre)
        r.stop()
        logging.debug('converging::resultado_geo.snippets = ' + str(len(geo_list)))
        ps = PipelineStats()
        ps.type = 'geo'
        ps.total_queries = resultado_geo.total_queries
        ps.total_snippets = len(resultado_geo.snippets)
        ps.tiempo_proceso = r.tiempo()[0]
        ps.encontro_vinculo = resultado_geo.vinculo_encontrado
        diaspora_output.write_pipeline(ps, list(resultado_geo.snippets))
        if ps.encontro_vinculo:
            diaspora_output.close_person()
            continue
        diaspora_output.close_person()
        # ********
        # PIPELINE topics
        # ********
        topics_output_folder = output_folder + '/results_topics'
        diaspora_output = DiasporaOutput(topics_output_folder)
        diaspora_output.open_person(p)
        r.start()
        resultado_topics = buscador.genera_busquedas_tematicas(p.temas)
        topics_list = resultado_topics.filtra_nominal(p.nombre)
        r.stop()
        logging.debug('converging::topics_list = ' + str(len(topics_list)))
        ps = PipelineStats()
        ps.type = 'topics'
        ps.total_queries = resultado_topics.total_queries
        ps.total_snippets = len(resultado_topics.snippets)
        ps.tiempo_proceso = r.tiempo()[0]
        ps.encontro_vinculo = resultado_topics.vinculo_encontrado
        diaspora_output.write_pipeline(ps, list(resultado_topics.snippets))
        if ps.encontro_vinculo:
            diaspora_output.close_person()
            continue
        diaspora_output.close_person()
        # ********
        # PIPELINE orgs
        # ********
        orgs_output_folder = output_folder + '/results_orgs'
        diaspora_output = DiasporaOutput(orgs_output_folder)
        diaspora_output.open_person(p)
        r.start()
        resultado_orgs = buscador.genera_busquedas_organizacionales(p.orgs)
        orgs_list = resultado_orgs.filtra_nominal(p.nombre)
        r.stop()
        logging.debug('converging::resultado_orgs.snippets = ' + str(len(orgs_list)))
        ps = PipelineStats()
        ps.type = 'orgs'
        ps.total_queries = resultado_orgs.total_queries
        ps.total_snippets = len(resultado_orgs.snippets)
        ps.tiempo_proceso = r.tiempo()[0]
        ps.encontro_vinculo = resultado_orgs.vinculo_encontrado
        diaspora_output.write_pipeline(ps, list(resultado_orgs.snippets))
        if ps.encontro_vinculo:
            diaspora_output.close_person()
            continue
        diaspora_output.close_person()
        # ********
        # NEW PIPELINE convergent
        # ********
        # Ranks snippets by how many pipelines produced the same link.
        conv_output_folder = output_folder + '/results_converging'
        diaspora_output = DiasporaOutput(conv_output_folder)
        diaspora_output.open_person(p)
        r.start()
        # One link-set per pipeline, used for membership tests below.
        name_links = set([])
        geo_links = set([])
        orgs_links = set([])
        topics_links = set([])
        # Despite the name, this list may contain duplicate links: it is
        # the plain concatenation of the four filtered pipeline lists.
        unique_snippets_list = name_list + geo_list + orgs_list + topics_list
        logging.debug('converging::len(name_snippets_set)= ' + str(len(name_list)))
        logging.debug('converging::len(geo_snippets_set)= ' + str(len(geo_list)))
        logging.debug('converging::len(topics_snippets_set)= ' + str(len(topics_list)))
        logging.debug('converging::len(orgs_snippets_set)= ' + str(len(orgs_list)))
        logging.debug('converging::len(unique_snippets_set)= ' + str(len(unique_snippets_list)))
        for s in name_list:
            name_links.add(s.link)
        for s in geo_list:
            geo_links.add(s.link)
        for s in orgs_list:
            orgs_links.add(s.link)
        for s in topics_list:
            topics_links.add(s.link)
        # convergent_k collects the snippets whose link appeared in
        # exactly k of the four pipelines.
        convergent_4 = []
        convergent_3 = []
        convergent_2 = []
        convergent_1 = []
        # Each branch below only tests membership and then removes the
        # link from every set that contained it, so a link is classified
        # once (on its first snippet) and later duplicates fall through
        # to the `continue` at the top.
        for s in unique_snippets_list:
            logging.debug('for s in unique_snippets_list.query=' + s.query)
            if (s.link not in name_links) and (s.link not in geo_links) and (
                    s.link not in orgs_links) and (s.link not in topics_links):
                continue
            if s.link in name_links and s.link in orgs_links and s.link in topics_links and s.link in geo_links:
                convergent_4.append(s)
                name_links.remove(s.link)
                orgs_links.remove(s.link)
                geo_links.remove(s.link)
                topics_links.remove(s.link)
            elif (s.link in name_links) and (s.link in geo_links) and (s.link in orgs_links):
                convergent_3.append(s)
                name_links.remove(s.link)
                geo_links.remove(s.link)
                orgs_links.remove(s.link)
            elif (s.link in geo_links) and (s.link in orgs_links) and (
                    s.link in topics_links):
                convergent_3.append(s)
                geo_links.remove(s.link)
                orgs_links.remove(s.link)
                topics_links.remove(s.link)
            elif (s.link in orgs_links) and (s.link in topics_links) and (
                    s.link in name_links):
                convergent_3.append(s)
                orgs_links.remove(s.link)
                topics_links.remove(s.link)
                name_links.remove(s.link)
            elif (s.link in name_links) and (s.link in geo_links) and (
                    s.link in topics_links):
                convergent_3.append(s)
                name_links.remove(s.link)
                geo_links.remove(s.link)
                topics_links.remove(s.link)
            elif (s.link in name_links) and (s.link in geo_links):
                convergent_2.append(s)
                name_links.remove(s.link)
                geo_links.remove(s.link)
            elif (s.link in name_links) and (s.link in orgs_links):
                convergent_2.append(s)
                name_links.remove(s.link)
                orgs_links.remove(s.link)
            elif (s.link in name_links) and (s.link in topics_links):
                convergent_2.append(s)
                name_links.remove(s.link)
                topics_links.remove(s.link)
            elif (s.link in geo_links) and (s.link in orgs_links):
                convergent_2.append(s)
                geo_links.remove(s.link)
                orgs_links.remove(s.link)
            elif (s.link in geo_links) and (s.link in topics_links):
                convergent_2.append(s)
                geo_links.remove(s.link)
                topics_links.remove(s.link)
            elif (s.link in orgs_links) and (s.link in topics_links):
                convergent_2.append(s)
                orgs_links.remove(s.link)
                topics_links.remove(s.link)
            elif (s.link in name_links):
                convergent_1.append(s)
                name_links.remove(s.link)
            elif (s.link in geo_links):
                convergent_1.append(s)
                geo_links.remove(s.link)
            elif (s.link in orgs_links):
                convergent_1.append(s)
                orgs_links.remove(s.link)
            elif (s.link in topics_links):
                convergent_1.append(s)
                topics_links.remove(s.link)
        r.stop()
        # Second pass: write each convergence level strongest-first,
        # keeping only the first snippet seen per link across levels.
        # `repeated` is only accumulated for logging purposes here.
        unique_link_set = set([])
        unique_convergent_4 = set([])
        unique_convergent_3 = set([])
        unique_convergent_2 = set([])
        unique_convergent_1 = set([])
        repeated = set([])
        for s in convergent_4:
            if s.link not in unique_link_set:
                unique_link_set.add(s.link)
                unique_convergent_4.add(s)
                logging.debug('convergent_4 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
            else:
                repeated.add(s)
                logging.debug('repeated_4 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
        ps4 = PipelineStats()
        ps4.type = 'converging pipelines 4'
        diaspora_output.write_converging_pipeline(ps4, list(unique_convergent_4), 4)
        for s in convergent_3:
            if s.link not in unique_link_set:
                unique_link_set.add(s.link)
                unique_convergent_3.add(s)
                logging.debug('convergent_3 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
            else:
                repeated.add(s)
                logging.debug('repeated_3 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
        ps3 = PipelineStats()
        ps3.type = 'converging pipelines 3'
        diaspora_output.write_converging_pipeline(ps3, list(unique_convergent_3), 3)
        for s in convergent_2:
            if s.link not in unique_link_set:
                unique_link_set.add(s.link)
                unique_convergent_2.add(s)
                logging.debug('convergent_2 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
            else:
                repeated.add(s)
                logging.debug('repeated_2 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
        ps2 = PipelineStats()
        ps2.type = 'converging pipelines 2'
        diaspora_output.write_converging_pipeline(ps2, list(unique_convergent_2), 2)
        for s in convergent_1:
            if s.link not in unique_link_set:
                unique_link_set.add(s.link)
                unique_convergent_1.add(s)
                logging.debug('convergent_1 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
            else:
                repeated.add(s)
                logging.debug('repeated_1 snippet=' + s.query + ' title= ' +
                              s.title + ' ==> ' + s.link)
        ps1 = PipelineStats()
        ps1.type = 'converging pipelines 1'
        diaspora_output.write_converging_pipeline(ps1, list(unique_convergent_1), 1)
        diaspora_output.close_person()