busqueda = Busqueda.objects.get(id=int(busqueda_in)) except: logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.') exit(-1) else: try: busqueda = Busqueda.objects.get(nombre=busqueda_in) except: logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.') exit(-1) logging.info('Processing busqueda ' +busqueda.nombre ) #TODO: METER TODO EN UNA CLASE DENTRO DE unoporuno/modules/dospordos/features.py OrganizationRegex = RegexFeature(UNOPORUNO_ROOT, 'organization') CountryGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',False) CountryGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',True) CityGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', False) CityGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', True) AccronymGazet = GazetteerFeature(UNOPORUNO_ROOT, 'accronym', True) BiophrasesRegex = RegexFeature(UNOPORUNO_ROOT, 'biographical phrases') ProfessionRegex = RegexFeature(UNOPORUNO_ROOT, 'profession') ProfessionGazt = GazetteerFeature(UNOPORUNO_ROOT, 'profession') DegreeRegex = RegexFeature(UNOPORUNO_ROOT, 'degree') DegreeGazt = GazetteerFeature(UNOPORUNO_ROOT, 'degree') CvRegex = RegexFeature(UNOPORUNO_ROOT, 'cv general') CvHttpRegex = RegexFeature(UNOPORUNO_ROOT, 'cv http') LatinNatRegex = RegexFeature(UNOPORUNO_ROOT, 'latin nationalities') WorldNatRegex = RegexFeature(UNOPORUNO_ROOT, 'world nationalities es') WorldNatGazt = GazetteerFeature(UNOPORUNO_ROOT, 'world nationalities en') EmailRegex = RegexFeature(UNOPORUNO_ROOT, 'email')
else: try: busqueda = Busqueda.objects.get(nombre=busqueda_in) except: logging.error('No busqueda object with id=' +busqueda_in+ ' in UNOPORUNO database.') exit(-1) logging.info('Processing busqueda ' +busqueda.nombre ) try: selection = sys.argv[2] except: selection = 'all' #TODO: METER TODO EN UNA CLASE DENTRO DE unoporuno/modules/dospordos/features.py OrganizationRegex = RegexFeature(UNOPORUNO_ROOT, 'organization') CountryGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',False) CountryGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country',True) CityGazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', False) CityGazetCase = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', True) AccronymGazet = GazetteerFeature(UNOPORUNO_ROOT, 'accronym', True) BiophrasesRegex = RegexFeature(UNOPORUNO_ROOT, 'biographical phrases') ProfessionRegex = RegexFeature(UNOPORUNO_ROOT, 'profession') ProfessionGazt = GazetteerFeature(UNOPORUNO_ROOT, 'profession') DegreeRegex = RegexFeature(UNOPORUNO_ROOT, 'degree') DegreeGazt = GazetteerFeature(UNOPORUNO_ROOT, 'degree') CvRegex = RegexFeature(UNOPORUNO_ROOT, 'cv general') CvHttpRegex = RegexFeature(UNOPORUNO_ROOT, 'cv http') LatinNatRegex = RegexFeature(UNOPORUNO_ROOT, 'latin nationalities') WorldNatRegex = RegexFeature(UNOPORUNO_ROOT, 'world nationalities es') WorldNatGazt = GazetteerFeature(UNOPORUNO_ROOT, 'world nationalities en') EmailRegex = RegexFeature(UNOPORUNO_ROOT, 'email')
# Snippet counts at which a "Top N" Vinculo is stored for a person; scanning
# stops once the last checkpoint has been reached.
_VINCULO_CHECKPOINTS = (5, 10, 15, 20)


def _load_busqueda(busqueda_in):
    """Return the Busqueda named on the command line, or exit with status -1.

    An all-digit argument is looked up by ``id``; anything else by ``nombre``.
    Only the Django "not found" / "ambiguous" errors are turned into a log
    message; any other failure (e.g. a database error) propagates unchanged.
    """
    if busqueda_in.isdigit():
        field, value = 'id', int(busqueda_in)
    else:
        field, value = 'nombre', busqueda_in
    try:
        return Busqueda.objects.get(**{field: value})
    except Busqueda.DoesNotExist:
        logging.error('No busqueda object with %s=%s in UNOPORUNO database.',
                      field, busqueda_in)
    except Busqueda.MultipleObjectsReturned:
        logging.error('More than one busqueda object with %s=%s in UNOPORUNO database.',
                      field, busqueda_in)
    sys.exit(-1)


def _collect_matches(specs, snippet, title_bytes, descr_bytes):
    """Run every feature in *specs* on the snippet title, then its description.

    Each spec is ``(feature, label, title_tag, descr_tag)``; ``feature`` must
    expose ``list_test``. Returns all matches concatenated in evaluation order.
    """
    found = []
    for feature, label, title_tag, descr_tag in specs:
        targets = (
            (title_bytes, title_tag, snippet.title),
            (descr_bytes, descr_tag, snippet.description),
        )
        for raw, tag, text in targets:
            matches = feature.list_test(raw)
            if len(matches) > 0:
                logging.debug('%s%s%s%s', label, matches, tag, text)
                found += matches
    return found


def _rank_by_frequency(values):
    """Return ``(value, count)`` pairs from construye_dict_freq, highest count first."""
    return sorted(construye_dict_freq(values).items(), key=lambda pair: -pair[1])


def _format_ranking(ranked):
    """Render ranked pairs as one ``value (count)`` line per entry."""
    return ''.join(name + ' (' + str(count) + ')\n' for name, count in ranked)


def _save_vinculo(persona, top_count, organizations, countries):
    """Store a "Top <top_count>" Vinculo summarising what has been found so far."""
    ranked_orgs = _rank_by_frequency(organizations)
    ranked_locs = _rank_by_frequency(countries)
    logging.info('VÍNCULOS ORGANIZACIONES:: %s', ranked_orgs)
    logging.info('VÍNCULOS LUGARES:: %s', ranked_locs)
    vinculo = Vinculo()
    vinculo.persona = persona
    vinculo.organizaciones = _format_ranking(ranked_orgs)
    vinculo.lugares = _format_ranking(ranked_locs)
    vinculo.descripcion = 'Top ' + str(top_count)
    vinculo.tipo = top_count
    vinculo.save()


def _build_person_links(persona, organization_specs, country_specs):
    """Rebuild the Vinculo rows of one person from its selected snippets.

    Existing Vinculo rows are deleted first. Snippets with FG=1 and not RE=1
    are scanned, those with converging_pipelines=1 first and then the rest
    ordered by descending RE_score. A Vinculo is saved each time the number
    of scanned snippets hits one of ``_VINCULO_CHECKPOINTS``.
    """
    logging.info('processing person %s', persona.name)
    persona.vinculo_set.all().delete()

    candidates = persona.snippet_set.filter(FG=1).exclude(RE=1)
    # QuerySets are lazy: the second query only runs if the first one did not
    # already provide enough snippets.
    querysets = (
        candidates.filter(converging_pipelines=1),
        candidates.exclude(converging_pipelines=1).order_by('-RE_score'),
    )

    organizations = []
    countries = []
    top_count = 0
    for queryset in querysets:
        for snippet in queryset:
            if snippet.FG == 0 or snippet.RE_features < 1:
                continue
            title_bytes = snippet.title.encode('utf-8')
            descr_bytes = snippet.description.encode('utf-8')
            # TODO: filter the snippets according to the features we look for.
            organizations += _collect_matches(
                organization_specs, snippet, title_bytes, descr_bytes)
            countries += _collect_matches(
                country_specs, snippet, title_bytes, descr_bytes)
            top_count += 1
            if top_count in _VINCULO_CHECKPOINTS:
                _save_vinculo(persona, top_count, organizations, countries)
            if top_count >= _VINCULO_CHECKPOINTS[-1]:
                # The last "Top N" link is stored; further snippets could not
                # change anything that gets saved.
                return


def main():
    """Rebuild organization/location links for every person of one busqueda.

    Reads the busqueda id or name from ``sys.argv[1]``; exits with status -1
    when the argument is missing or the busqueda cannot be loaded.
    """
    try:
        busqueda_in = sys.argv[1]
    except IndexError:
        logging.error('No parameter busqueda')
        logging.error('Usage: python batch_biographic_filter.py NAME|NUMBER path')
        sys.exit(-1)

    busqueda = _load_busqueda(busqueda_in)
    logging.info('Processing busqueda %s', busqueda.nombre)

    # TODO: move all of this into a class inside
    # unoporuno/modules/dospordos/features.py
    organization_regex = RegexFeature(UNOPORUNO_ROOT, 'organization')
    country_gazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country', False)
    country_gazet_case = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'country', True)
    city_gazet = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', False)
    city_gazet_case = QualifiedGazetteerFeature(UNOPORUNO_ROOT, 'city', True)
    accronym_gazet = GazetteerFeature(UNOPORUNO_ROOT, 'accronym', True)

    # (feature, log label, tag for title hits, tag for description hits)
    organization_specs = (
        (organization_regex, 'FOUND ORGANIZATIONS ', ' IN TITLE:\n ', ' IN DESCR:\n '),
        (accronym_gazet, 'FOUND ORG.ACCRONYMS ', ' IN TITLE:\n ', ' IN DESCR:\n '),
    )
    # City gazetteer hits are accumulated together with the country hits.
    # NOTE(review): the CI/CD tags presumably mean case-insensitive /
    # case-dependent, matching the False/True constructor flag -- confirm.
    country_specs = (
        (country_gazet, 'FOUND COUNTRIES ', ' IN TITLE:\n ', ' IN DESCR:\n '),
        (country_gazet_case, 'FOUND COUNTRIES ', ' IN TITLE:\n ', ' IN DESCR:\n '),
        (city_gazet, 'FOUND CITIES FROM COUNTRIES ', ' IN CI TITLE\n', ' IN CI DESCRIPTION\n'),
        (city_gazet_case, 'FOUND CITIES FROM COUNTRIES ', ' IN CD TITLE\n', ' IN CD DESCRIPTION\n'),
    )

    # NOTE(review): the busqueda is fetched again after the features are
    # built, as in the original code; this looks redundant -- confirm before
    # removing.
    busqueda = Busqueda.objects.get(id=busqueda.id)
    for persona in busqueda.persona_set.all():
        _build_person_links(persona, organization_specs, country_specs)