def process(self, parameters=None, data=None):
    """Run Named Entity Linking over the text of every field of the document.

    Collects all (single- and multi-valued) field values of ``data`` into one
    text, links entities in it (locally via ``Entity_Linker``; the remote
    OpenRefine REST branch is disabled in this variant) and appends each
    matched candidate's name and URI to the document under the candidate's
    type facet(s).

    :param parameters: ETL parameters; honours the boolean ``verbose`` flag.
    :param data: document fields; enriched in place with entity facets.
    :return: tuple ``(parameters, data)``.
    """
    # Fix: None sentinels instead of shared mutable dict defaults,
    # which persist (and leak state) across calls.
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = bool(parameters.get('verbose'))

    # Remote server is hard-disabled in this variant, so the local
    # Python library branch below is always taken.
    openrefine_server = False

    # Collect the text to analyze from all fields; build parts and join
    # once instead of quadratic string concatenation.
    parts = []
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                parts.append("{}\n".format(value))
    text = ''.join(parts)

    if openrefine_server:
        # use REST-API on (remote) HTTP server
        params = {'text': text}
        r = requests.post(openrefine_server, params=params)
        results = r.json()
    else:
        # use local Python library
        linker = Entity_Linker()
        linker.verbose = verbose
        results = linker.entities(text=text)

    if verbose:
        print("Named Entity Linking: {}".format(results))

    # Write the matched entities to the document's facet fields.
    for match in results:
        for candidate in results[match]['result']:
            if candidate['match']:
                for facet in candidate['type']:
                    etl.append(data, facet, candidate['name'])
                    etl.append(data, facet + '_uri_ss', candidate['id'])

    # mark the document, that it was analyzed by this plugin yet
    data['enhance_entity_linking_b'] = "true"

    return parameters, data
def process(self, parameters=None, data=None):
    """Run Named Entity Linking on a fixed set of text fields of the document.

    Unlike the analyze-everything variant, only the fields listed in
    ``analyse_fields`` are concatenated and analyzed. Matched candidates'
    names and URIs are appended to the document under the candidate's
    type facet(s).

    :param parameters: ETL parameters; honours the boolean ``verbose`` flag.
    :param data: document fields; enriched in place with entity facets.
    :return: tuple ``(parameters, data)``.
    """
    # Fix: None sentinels instead of shared mutable dict defaults.
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = bool(parameters.get('verbose'))

    # Remote server is hard-disabled in this variant.
    openrefine_server = False

    # Only these fields are analyzed in this variant.
    analyse_fields = [
        'title_txt', 'content_txt', 'description_txt', 'ocr_t',
        'ocr_descew_t'
    ]

    # Join once instead of quadratic string concatenation.
    text = ''.join(
        "{}\n".format(data[field]) for field in analyse_fields
        if field in data)

    if openrefine_server:
        # use REST-API on (remote) HTTP server
        params = {'text': text}
        r = requests.post(openrefine_server, params=params)
        results = r.json()
    else:
        # use local Python library
        linker = Entity_Linker()
        linker.verbose = verbose
        results = linker.entities(text=text)

    if verbose:
        print("Named Entity Linking: {}".format(results))

    # NOTE(review): unlike the sibling variants, this one appends every
    # candidate without checking candidate['match'] — preserved as-is,
    # but confirm whether non-matching candidates should be skipped.
    for match in results:
        for candidate in results[match]['result']:
            for facet in candidate['type']:
                etl.append(data, facet, candidate['name'])
                etl.append(data, facet + '_uri_ss', candidate['id'])

    # mark the document, that it was analyzed by this plugin yet
    data['enhance_entity_linking_b'] = "true"

    return parameters, data
def reconcile(request):
    """Django view implementing an OpenRefine Reconciliation API endpoint.

    Accepts ``queries`` (JSON, OpenRefine reconciliation format) or ``text``
    via GET or POST and returns entity-linking results; with neither present
    it returns the service metadata. Supports JSONP via a ``callback``
    parameter (used by OpenRefine).

    :param request: Django ``HttpRequest``.
    :return: ``JsonResponse`` with results, or ``HttpResponse`` with a
        JSONP-wrapped ``text/javascript`` body when ``callback`` is given.
    """
    import re  # local import: only needed to sanitize the JSONP callback

    queries = None
    if 'queries' in request.GET:
        queries = json.loads(request.GET['queries'])
    elif 'queries' in request.POST:
        queries = json.loads(request.POST['queries'])

    text = None
    if 'text' in request.POST:
        text = request.POST['text']
    elif 'text' in request.GET:
        text = request.GET['text']

    if queries or text:
        # link/normalize/disambiguate entities
        entity_linker = Entity_Linker()
        results = entity_linker.entities(queries=queries, text=text)
    else:
        # no queries, so just return service metadata
        results = {
            'name': 'Open Semantic Entity Search API',
        }

    # Open Refine uses JSONP callback
    callback = None
    if 'callback' in request.GET:
        callback = request.GET['callback']
    elif 'callback' in request.POST:
        callback = request.POST['callback']

    if callback:
        # Security fix: the callback is reflected verbatim into a
        # text/javascript response, so an unvalidated value is a
        # reflected-XSS vector. Only allow plain (dotted) JS identifiers.
        if not re.fullmatch(r'[A-Za-z_$][\w$]*(\.[A-Za-z_$][\w$]*)*',
                            callback):
            return HttpResponse('Invalid JSONP callback name',
                                'text/plain', status=400)
        # JSONP response instead of Jsonresponse
        results = '{}({});'.format(
            callback, json.dumps(results)
        )
        return HttpResponse(results, "text/javascript")
    else:
        return JsonResponse(results)
def process(self, parameters=None, data=None):
    """Run Named Entity Linking with configurable taggers and taxonomy output.

    Collects text from all fields, then runs one linking pass per configured
    tagger (default plus any document-language-dependent stemming taggers).
    Matched entities are written to type facets; stemmed/fuzzy matches go to
    tagger-specific facets, and SKOS taxonomy paths are expanded to extra
    fields via ``taxonomy2fields``.

    :param parameters: ETL parameters (``verbose``, ``entity_linking_taggers``,
        ``entity_linking_taggers_document_language_dependent``,
        ``openrefine_server``).
    :param data: document fields; enriched in place.
    :return: tuple ``(parameters, data)``.
    """
    # Fix: None sentinels instead of shared mutable dict defaults.
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = bool(parameters.get('verbose'))

    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    # add taggers for stemming
    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters[
            'entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                    data['language_s']]:
                if entity_linking_tagger not in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

    # collect/copy to be analyzed text from all fields
    # (join once instead of quadratic string concatenation)
    parts = []
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                parts.append("{}\n".format(value))
    text = ''.join(parts)

    # tag all entities (by different taggers for different analyzers/stemmers)
    for entity_linking_tagger in entity_linking_taggers:

        # call REST API
        if openrefine_server:
            # use REST-API on (remote) HTTP server
            params = {'text': text}
            r = requests.post(openrefine_server, params=params)
            results = r.json()
        else:
            # use local Python library
            linker = Entity_Linker()
            linker.verbose = verbose
            results = linker.entities(
                text=text,
                taggers=[entity_linking_tagger],
                additional_result_fields=taxonomy_fields)

        if verbose:
            print("Named Entity Linking by Tagger {}: {}".format(
                entity_linking_tagger, results))

        # write entities from result to document facets
        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        # use different facet for fuzzy/stemmed matches
                        if not entity_linking_tagger == 'all_labels_ss_tag':
                            # do not use another different facet if same stemmer
                            # but forced / not document language dependent
                            entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                '_stemming_force_', '_stemming_')
                            facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])
                        etl.append(
                            data, facet + '_preflabel_and_uri_ss',
                            candidate['name'] + ' <' + candidate['id'] + '>')

                        if 'matchtext' in candidate:
                            for matchtext in candidate['matchtext']:
                                etl.append(
                                    data, facet + '_matchtext_ss',
                                    candidate['id'] + "\t" + matchtext)

                        for taxonomy_field in taxonomy_fields:
                            if taxonomy_field in candidate:
                                separated_taxonomy_fields = taxonomy2fields(
                                    field=facet,
                                    data=candidate[taxonomy_field])
                                for separated_taxonomy_field in separated_taxonomy_fields:
                                    etl.append(
                                        data, separated_taxonomy_field,
                                        separated_taxonomy_fields[
                                            separated_taxonomy_field])

    # mark the document, that it was analyzed by this plugin yet
    data['etl_enhance_entity_linking_b'] = "true"

    return parameters, data
def process(self, parameters=None, data=None):
    """Run Named Entity Linking per tagger with retry/backoff on connection errors.

    Collects the document's text via ``etl_plugin_core.get_text``, then runs
    one linking pass per configured tagger (default plus any
    document-language-dependent stemming taggers). Connection failures and
    HTTP 503 are retried with exponential backoff; HTTP 400 and other errors
    are recorded via ``etl.error_message`` and not retried. Matched entities
    are written to type facets (tagger-specific facets for stemmed matches)
    and SKOS taxonomy paths are expanded via ``taxonomy2fields``.

    :param parameters: ETL parameters (``verbose``, ``entity_linking_taggers``,
        ``entity_linking_taggers_document_language_dependent``,
        ``openrefine_server``, ``id`` used for error reporting).
    :param data: document fields; enriched in place with entity facets.
    :return: tuple ``(parameters, data)``.
    """
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    # Default tagger matches on exact (unstemmed) labels.
    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    # add taggers for stemming
    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters[
            'entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                    data['language_s']]:
                if not entity_linking_tagger in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    # If set, a remote OpenRefine-compatible REST API is used instead of
    # the local Python library.
    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

    # collect/copy to be analyzed text from all fields
    text = etl_plugin_core.get_text(data=data)

    # tag all entities (by different taggers for different analyzers/stemmers)
    for entity_linking_tagger in entity_linking_taggers:

        results = {}

        retries = 0
        retrytime = 1
        # wait time until next retry will be doubled until reaching maximum
        # of 120 seconds (2 minutes) until next retry
        retrytime_max = 120

        no_connection = True

        while no_connection:

            try:
                if retries > 0:
                    # Sleep only before retries (not the first attempt),
                    # then double the backoff, capped at retrytime_max.
                    print(
                        'Retrying to connect to Solr tagger in {} second(s).'
                        .format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max

                # call REST API
                if openrefine_server:
                    # use REST-API on (remote) HTTP server
                    params = {'text': text}
                    r = requests.post(openrefine_server, params=params)
                    # if bad status code, raise exception
                    r.raise_for_status()
                    results = r.json()
                else:
                    # use local Python library
                    linker = Entity_Linker()
                    linker.verbose = verbose
                    results = linker.entities(
                        text=text,
                        taggers=[entity_linking_tagger],
                        additional_result_fields=taxonomy_fields)

                no_connection = False

            except KeyboardInterrupt:
                # Never swallow a user interrupt inside the retry loop.
                raise KeyboardInterrupt

            except requests.exceptions.ConnectionError as e:
                # Transient: leave no_connection True so the loop retries.
                retries += 1
                if openrefine_server:
                    sys.stderr.write(
                        "Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n"
                        .format(retrytime, e))
                else:
                    sys.stderr.write(
                        "Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n"
                        .format(retrytime, e))

            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 503:
                    # Service temporarily unavailable: retry with backoff.
                    retries += 1
                    if openrefine_server:
                        sys.stderr.write(
                            "Openrefine server temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\n"
                            .format(retrytime, e))
                    else:
                        sys.stderr.write(
                            "Solr temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\n"
                            .format(retrytime, e))
                elif e.response.status_code == 400:
                    no_connection = False
                    # if error because of empty entity index for that tagger
                    # because no entities imported yet, no error message /
                    # index as fail
                    empty_entity_index = False
                    try:
                        errorstatus = e.response.json()
                        if errorstatus['error'][
                                'msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
                            empty_entity_index = True
                    except:
                        # Best-effort parse of the error body; any failure
                        # means we treat it as a real error below.
                        pass
                    if not empty_entity_index:
                        etl.error_message(docid=parameters['id'],
                                          data=data,
                                          plugin='enhance_entity_linking',
                                          e=e)
                else:
                    # Other HTTP errors are terminal for this tagger.
                    no_connection = False
                    etl.error_message(docid=parameters['id'],
                                      data=data,
                                      plugin='enhance_entity_linking',
                                      e=e)

            except BaseException as e:
                # Any other failure: record it and stop retrying this tagger.
                no_connection = False
                etl.error_message(docid=parameters['id'],
                                  data=data,
                                  plugin='enhance_entity_linking',
                                  e=e)

        if verbose:
            print("Named Entity Linking by Tagger {}: {}".format(
                entity_linking_tagger, results))

        # write entities from result to document facets
        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        # use different facet for fuzzy/stemmed matches
                        if not entity_linking_tagger == 'all_labels_ss_tag':
                            # do not use another different facet if same
                            # stemmer but forced / not document language
                            # dependent
                            entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace(
                                '_stemming_force_', '_stemming_')
                            facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                        etl_plugin_core.append(data, facet, candidate['name'])
                        etl_plugin_core.append(data, facet + '_uri_ss',
                                               candidate['id'])
                        etl_plugin_core.append(
                            data, facet + '_preflabel_and_uri_ss',
                            candidate['name'] + ' <' + candidate['id'] + '>')

                        if 'matchtext' in candidate:
                            for matchtext in candidate['matchtext']:
                                etl_plugin_core.append(
                                    data, facet + '_matchtext_ss',
                                    candidate['id'] + "\t" + matchtext)

                        for taxonomy_field in taxonomy_fields:
                            if taxonomy_field in candidate:
                                separated_taxonomy_fields = taxonomy2fields(
                                    taxonomy=candidate[taxonomy_field],
                                    field=facet)
                                for separated_taxonomy_field in separated_taxonomy_fields:
                                    etl_plugin_core.append(
                                        data, separated_taxonomy_field,
                                        separated_taxonomy_fields[
                                            separated_taxonomy_field])

    return parameters, data
def test(self):
    """Index one test entity, then verify the linker finds it by its
    preferred label and every alternate label (including one with
    special characters), and does NOT match on a partial label."""
    entity_id = 'http://entity-unittest.local/entities/1'
    preferred_label = 'entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two'

    # add test entity to entities index
    entity_manager = Entity_Manager()
    entity_manager.add(
        id=entity_id,
        types=['entity-unittest_type_one', 'entity-unittest_type_two'],
        preferred_label=preferred_label,
        prefLabels=["entity-unittest_preferredLabels"],
        labels=[
            "entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two",
            "entity-unittest_labels_two",
            "entity-unittest_labels_umlaut_äöüß"
        ])

    # extracts and normalizes/links all known entities/names/labels
    linker = Entity_Linker()

    # check if entity is found by preferred label
    results = linker.entities(
        text=
        "I want to extract the id of entity-unittest_preferred_label_part_one entity-unittest_preferred_label_part_two from a full text."
    )
    self.assertTrue(
        is_in_resultdata(resultdata=results,
                         entity_id=entity_id,
                         fieldname='name',
                         value=preferred_label))

    # check if is_in_resultdata works ok and does not return true even on
    # not existing id
    self.assertFalse(
        is_in_resultdata(
            resultdata=results,
            entity_id=
            'http://entity-unittest.local/entities/notexistententityid',
            fieldname='name',
            value='notexistant entity'))

    # check returned types of returned entity id
    self.assertTrue(
        is_in_resultdata(
            resultdata=results,
            entity_id=entity_id,
            fieldname='type',
            value=['entity-unittest_type_one', 'entity-unittest_type_two']))

    # the entity must also be found via each of its other labels
    # (another preferred label, alternate labels, label with umlauts)
    texts_that_should_match = [
        "I want to extract the id of entity-unittest_preferredLabels from a full text.",
        "I want to extract the id of entity-unittest_labels_one_part_one entity-unittest_labels_one_part_two from a full text.",
        "I want to extract the id of entity-unittest_labels_two from a full text.",
        "I want to extract the id of entity-unittest_labels_umlaut_äöüß from a full text.",
    ]
    for sample_text in texts_that_should_match:
        results = linker.entities(text=sample_text)
        self.assertTrue(
            is_in_resultdata(resultdata=results,
                             entity_id=entity_id,
                             fieldname='name',
                             value=preferred_label))

    # entity should not be linked by only a part of the label
    results = linker.entities(
        text=
        "I dont want to extract the id of entity-unittest_labels_one_part_one (missing second part of name) from a full text."
    )
    self.assertFalse(
        is_in_resultdata(resultdata=results,
                         entity_id=entity_id,
                         fieldname='name',
                         value=preferred_label))
def process(self, parameters=None, data=None):
    """Run Named Entity Linking with configurable taggers (single pass).

    Collects text from all fields and runs one linking pass with the whole
    tagger list (default plus any document-language-dependent stemming
    taggers). Matched candidates' names and URIs are appended under their
    type facet(s).

    :param parameters: ETL parameters (``verbose``, ``entity_linking_taggers``,
        ``entity_linking_taggers_document_language_dependent``,
        ``openrefine_server``).
    :param data: document fields; enriched in place.
    :return: tuple ``(parameters, data)``.
    """
    # Fix: None sentinels instead of shared mutable dict defaults.
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = bool(parameters.get('verbose'))

    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters[
            'entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[
                    data['language_s']]:
                if entity_linking_tagger not in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    # Collect the text to analyze from all fields; join once instead of
    # quadratic string concatenation.
    parts = []
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                parts.append("{}\n".format(value))
    text = ''.join(parts)

    if openrefine_server:
        # use REST-API on (remote) HTTP server
        params = {'text': text}
        r = requests.post(openrefine_server, params=params)
        results = r.json()
    else:
        # use local Python library
        linker = Entity_Linker()
        linker.verbose = verbose
        results = linker.entities(text=text, taggers=entity_linking_taggers)

    if verbose:
        print("Named Entity Linking: {}".format(results))

    # Write the matched entities to the document's facet fields.
    for match in results:
        for candidate in results[match]['result']:
            if candidate['match']:
                for facet in candidate['type']:
                    etl.append(data, facet, candidate['name'])
                    etl.append(data, facet + '_uri_ss', candidate['id'])

    # mark the document, that it was analyzed by this plugin yet
    data['enhance_entity_linking_b'] = "true"

    return parameters, data
def process(self, parameters=None, data=None):
    """Run Named Entity Linking per tagger with retry/backoff (legacy variant).

    Collects text from all fields, then runs one linking pass per configured
    tagger (default plus any document-language-dependent stemming taggers).
    Connection failures and HTTP 503 are retried with exponential backoff;
    HTTP 400 and other errors are recorded via ``etl.error_message`` and not
    retried. Matched entities go to type facets (tagger-specific facets for
    stemmed matches); SKOS taxonomy paths are expanded via ``taxonomy2fields``.

    :param parameters: ETL parameters (``verbose``, ``entity_linking_taggers``,
        ``entity_linking_taggers_document_language_dependent``,
        ``openrefine_server``, ``id`` used for error reporting).
    :param data: document fields; enriched in place.
    :return: tuple ``(parameters, data)``.
    """
    # Fix: None sentinels instead of shared mutable dict defaults
    # (consistent with the newer variant of this method).
    if parameters is None:
        parameters = {}
    if data is None:
        data = {}

    verbose = False
    if 'verbose' in parameters:
        if parameters['verbose']:
            verbose = True

    entity_linking_taggers = ['all_labels_ss_tag']
    if 'entity_linking_taggers' in parameters:
        entity_linking_taggers = parameters['entity_linking_taggers']

    # add taggers for stemming
    entity_linking_taggers_document_language_dependent = {}
    if 'entity_linking_taggers_document_language_dependent' in parameters:
        entity_linking_taggers_document_language_dependent = parameters['entity_linking_taggers_document_language_dependent']

    if 'language_s' in data:
        # is a language specific tagger there for the detected language?
        if data['language_s'] in entity_linking_taggers_document_language_dependent:
            for entity_linking_tagger in entity_linking_taggers_document_language_dependent[data['language_s']]:
                if not entity_linking_tagger in entity_linking_taggers:
                    entity_linking_taggers.append(entity_linking_tagger)

    openrefine_server = False
    if 'openrefine_server' in parameters:
        openrefine_server = parameters['openrefine_server']

    taxonomy_fields = ['skos_broader_taxonomy_prefLabel_ss']

    # collect/copy to be analyzed text from all fields
    text = ''
    for field in data:
        values = data[field]
        if not isinstance(values, list):
            values = [values]
        for value in values:
            if value:
                text = "{}{}\n".format(text, value)

    # tag all entities (by different taggers for different analyzers/stemmers)
    for entity_linking_tagger in entity_linking_taggers:

        results = {}

        retries = 0
        retrytime = 1
        # wait time until next retry will be doubled until reaching maximum
        # of 120 seconds (2 minutes) until next retry
        retrytime_max = 120

        no_connection = True

        while no_connection:

            try:
                if retries > 0:
                    # Sleep only before retries, then double the backoff,
                    # capped at retrytime_max.
                    print('Retrying to connect to Solr tagger in {} second(s).'.format(retrytime))
                    time.sleep(retrytime)
                    retrytime = retrytime * 2
                    if retrytime > retrytime_max:
                        retrytime = retrytime_max

                # call REST API
                if openrefine_server:
                    # use REST-API on (remote) HTTP server
                    params = {'text': text}
                    r = requests.post(openrefine_server, params=params)
                    # if bad status code, raise exception
                    r.raise_for_status()
                    results = r.json()
                else:
                    # use local Python library
                    linker = Entity_Linker()
                    linker.verbose = verbose
                    results = linker.entities(text=text, taggers=[entity_linking_tagger], additional_result_fields=taxonomy_fields)

                no_connection = False

            except KeyboardInterrupt:
                # Never swallow a user interrupt inside the retry loop.
                raise KeyboardInterrupt

            except requests.exceptions.ConnectionError as e:
                # Transient: loop retries while no_connection stays True.
                retries += 1
                if openrefine_server:
                    sys.stderr.write("Connection to Openrefine server failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                else:
                    sys.stderr.write("Connection to Solr text tagger failed (will retry in {} seconds). Exception: {}\n".format(retrytime, e))

            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 503:
                    # Service temporarily unavailable: retry with backoff.
                    retries += 1
                    if openrefine_server:
                        sys.stderr.write("Openrefine server temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                    else:
                        sys.stderr.write("Solr temporary unavailable (HTTP status code 503). Will retry in {} seconds). Exception: {}\n".format(retrytime, e))
                elif e.response.status_code == 400:
                    no_connection = False
                    # if error because of empty entity index for that tagger
                    # because no entities imported yet, no error message /
                    # index as fail
                    empty_entity_index = False
                    try:
                        errorstatus = e.response.json()
                        if errorstatus['error']['msg'] == 'field ' + entity_linking_tagger + ' has no indexed data':
                            empty_entity_index = True
                    except:
                        # Best-effort parse of the error body.
                        pass
                    if not empty_entity_index:
                        etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)
                else:
                    # Other HTTP errors are terminal for this tagger.
                    no_connection = False
                    etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

            except BaseException as e:
                # Any other failure: record it and stop retrying this tagger.
                no_connection = False
                etl.error_message(docid=parameters['id'], data=data, plugin='enhance_entity_linking', e=e)

        if verbose:
            print("Named Entity Linking by Tagger {}: {}".format(entity_linking_tagger, results))

        # write entities from result to document facets
        for match in results:
            for candidate in results[match]['result']:
                if candidate['match']:
                    for facet in candidate['type']:
                        # use different facet for fuzzy/stemmed matches
                        if not entity_linking_tagger == 'all_labels_ss_tag':
                            # do not use another different facet if same
                            # stemmer but forced / not document language
                            # dependent
                            entity_linking_tagger_withoutforceoption = entity_linking_tagger.replace('_stemming_force_', '_stemming_')
                            facet = facet + entity_linking_tagger_withoutforceoption + '_ss'

                        etl.append(data, facet, candidate['name'])
                        etl.append(data, facet + '_uri_ss', candidate['id'])
                        etl.append(data, facet + '_preflabel_and_uri_ss', candidate['name'] + ' <' + candidate['id'] + '>')

                        if 'matchtext' in candidate:
                            for matchtext in candidate['matchtext']:
                                etl.append(data, facet + '_matchtext_ss', candidate['id'] + "\t" + matchtext)

                        for taxonomy_field in taxonomy_fields:
                            if taxonomy_field in candidate:
                                separated_taxonomy_fields = taxonomy2fields(field=facet, data=candidate[taxonomy_field])
                                for separated_taxonomy_field in separated_taxonomy_fields:
                                    etl.append(data, separated_taxonomy_field, separated_taxonomy_fields[separated_taxonomy_field])

    # mark the document, that it was analyzed by this plugin yet
    data['etl_enhance_entity_linking_b'] = "true"

    return parameters, data