def process_spotlight_api(text):
    """Annotate *text* with DBpedia Spotlight and return entity dicts.

    Calls the Spotlight REST endpoint, drops any annotation whose span is
    fully contained in a hyperlink found by ``HyperLink.extract_all_url``,
    and maps the survivors to this module's common entity format
    (label/startOffset/endOffset/confidence/provenance/types).

    Returns an empty list when the Spotlight call fails.
    """
    try:
        entities = spotlight.annotate(
            "http://spotlight.dbpedia.org/rest/annotate",
            text,
            confidence=0.1,
            support=0,
        )
    except Exception:  # was a bare `except:`; keep best-effort fallback
        return []

    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in entities:
        # NOTE(review): the original guarded a quote-count offset correction
        # with `if occ is not 0:` immediately after `occ = 0`, so the branch
        # could never execute (it also read a non-existent "serviceForm"
        # key). The dead branch is removed; offsets come straight from the
        # Spotlight annotation.
        start = entity["offset"]
        end = entity["offset"] + len(entity["surfaceForm"])

        # Skip entities that sit entirely inside a detected URL.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= start and link_match["end"] >= end:
                possible_link = True
                break
        if possible_link:
            continue

        e = {
            "label": entity["surfaceForm"],
            "startOffset": start,
            "endOffset": end,
            "confidence": entity["similarityScore"],
            "provenance": "dbpediaspotlight",
            "types": [],
        }
        for data_type in entity["types"].split(","):
            # Map the prefixed type name to a resource URI where possible.
            link = data_type
            if "DBpedia:" in data_type:
                link = "http://en.dbpedia.org/resource/" + data_type.split(":")[1]
            if "Freebase:" in data_type:
                link = "http://www.freebase.com" + data_type.split(":")[1]
            dbpedia_type = {
                "typeURI": None,
                "typeLabel": data_type,
                "entityURI": link,
                "confidence": entity["similarityScore"],
                "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(link),
            }
            # Append each type dict directly. The original collected dicts
            # into a local `types` list and then did e["types"].append(types),
            # leaving a nested list — inconsistent with the flat `types`
            # lists produced by the textrazor/nerd processors in this file.
            e["types"].append(dbpedia_type)
        initial_entities.append(e)
    return initial_entities
def process_textrazor_api(text):
    """Annotate *text* with TextRazor and return entity dicts.

    Discards entities fully contained in a hyperlink found by
    ``HyperLink.extract_all_url`` and converts the rest to this module's
    common entity format, with one type entry per DBpedia type, one per
    Freebase type, and a final Wikipedia-link entry.

    Returns an empty list when the TextRazor call fails.
    """
    # NOTE(review): hard-coded API key — should be loaded from config or an
    # environment variable, and this key should be rotated.
    client = TextRazor(
        api_key="67ef1ca06614f7d202b23f1444bd7ee1ea2f916b3ecf488f8d39f800",
        extractors=[
            "entities",
            "topics",
            "words",
            "phrases",
            "dependency-trees",
            "senses",
        ],
    )
    try:
        response = client.analyze(text)
    except Exception:  # was a bare `except:`; keep best-effort fallback
        return []

    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in response.entities():
        # Skip entities that sit entirely inside a detected URL.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity.starting_position and link_match["end"] >= entity.ending_position:
                possible_link = True
                break
        if possible_link:
            continue

        e = {
            "label": entity.matched_text,
            "startOffset": entity.starting_position,
            "endOffset": entity.ending_position,
            "confidence": entity.confidence_score,
            "relevance": entity.relevance_score,
            "provenance": "textrazor",
            "wikipediaLink": entity.wikipedia_link,
            "types": [],
        }
        for dbpedia_type in entity.dbpedia_types:
            wiki_link = "http://en.wikipedia.org/wiki/" + dbpedia_type
            e["types"].append({
                "typeURI": None,
                "typeLabel": dbpedia_type,
                "wikiURI": wiki_link,
                "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_link),
                "confidence": entity.confidence_score,
            })
        for freebase_type in entity.freebase_types:
            # Freebase type ids are path-like (e.g. "/people/person"), so
            # plain concatenation yields a valid URL.
            e["types"].append({
                "typeURI": None,
                "typeLabel": "Freebase:" + freebase_type.replace(" ", ""),
                "wikiURI": None,
                "entityURI": "http://www.freebase.com" + freebase_type,
                "confidence": entity.confidence_score,
            })
        # Final entry carries the entity's own Wikipedia link; typeLabel is
        # an empty list here (original behavior, kept for compatibility).
        e["types"].append({
            "typeURI": None,
            "typeLabel": [],
            "wikiURI": entity.wikipedia_link,
            "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity.wikipedia_link),
            "confidence": entity.confidence_score,
        })
        initial_entities.append(e)
    return initial_entities
def process_textrazor_api(text):
    """Run TextRazor over *text* and build the module's common entity dicts.

    Entities whose span lies inside a hyperlink (per
    ``HyperLink.extract_all_url``) are dropped; each kept entity gets one
    type record per DBpedia type, one per Freebase type, and a trailing
    Wikipedia-link record. A failed API call yields an empty list.

    NOTE(review): this is a duplicate re-definition of an earlier
    ``process_textrazor_api`` in this file and shadows it at import time.
    """
    extractor_names = [
        "entities",
        "topics",
        "words",
        "phrases",
        "dependency-trees",
        "senses",
    ]
    client = TextRazor(
        api_key="67ef1ca06614f7d202b23f1444bd7ee1ea2f916b3ecf488f8d39f800",
        extractors=extractor_names,
    )
    try:
        response = client.analyze(text)
    except:
        return []

    urls = HyperLink.extract_all_url(text)
    results = []
    for ent in response.entities():
        start, end = ent.starting_position, ent.ending_position
        inside_url = any(
            u["start"] <= start and u["end"] >= end for u in urls
        )
        if inside_url:
            continue

        score = ent.confidence_score
        type_records = []
        for db_type in ent.dbpedia_types:
            wiki_uri = "http://en.wikipedia.org/wiki/" + db_type
            type_records.append({
                "typeURI": None,
                "typeLabel": db_type,
                "wikiURI": wiki_uri,
                "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_uri),
                "confidence": score,
            })
        for fb_type in ent.freebase_types:
            type_records.append({
                "typeURI": None,
                "typeLabel": "Freebase:" + fb_type.replace(" ", ""),
                "wikiURI": None,
                "entityURI": "http://www.freebase.com" + fb_type,
                "confidence": score,
            })
        type_records.append({
            "typeURI": None,
            "typeLabel": [],
            "wikiURI": ent.wikipedia_link,
            "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(ent.wikipedia_link),
            "confidence": score,
        })

        results.append({
            "label": ent.matched_text,
            "startOffset": start,
            "endOffset": end,
            "confidence": score,
            "relevance": ent.relevance_score,
            "provenance": "textrazor",
            "wikipediaLink": ent.wikipedia_link,
            "types": type_records,
        })
    return results
def process_nerd_api(text):
    """Extract entities from *text* via the NERD combined service.

    URL-quotes the text, calls the NERD API, drops entities fully
    contained in a hyperlink found by ``HyperLink.extract_all_url``, and
    maps the rest to this module's common entity format.

    Returns an empty list when the NERD call fails.
    """
    try:
        timeout = 10
        # NOTE(review): the quoted text is also what the URL scan and the
        # entity start/end offsets below refer to — confirm that NERD's
        # offsets are relative to the quoted string, not the raw input.
        text = urllib.quote_plus(text)
        n = nerd.NERD("nerd.eurecom.fr", "akkqfgos0p85mcubcfgp82rn92d23enu")
        entities = n.extract(text, "combined", timeout)
    except Exception:  # was a bare `except:`; keep best-effort fallback
        return []

    link_matches = HyperLink.extract_all_url(text)
    initial_entities = []
    for entity in entities:
        # Skip entities that sit entirely inside a detected URL.
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["startChar"] and link_match["end"] >= entity["endChar"]:
                possible_link = True
                break
        if possible_link:
            continue

        e = {
            "label": entity["label"],
            "startOffset": entity["startChar"],
            "endOffset": entity["endChar"],
            "confidence": entity["confidence"],
            "provenance": "nerd-" + entity["extractor"],
            "types": [],
        }
        # NOTE(review): "wikiURI" is populated by
        # get_english_resource_from_english_wikipedia_link(entity["uri"]) —
        # the name suggests the inverse helper
        # (get_english_wikipedia_link_from_english_resource) was intended;
        # preserved as-is pending confirmation of the helper's semantics.
        if entity["extractorType"]:
            for extracted_type in entity["extractorType"].split(","):
                if "dbpedia" in extracted_type:
                    type_data = {
                        "typeURI": extracted_type,
                        "typeLabel": None,
                        "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                        "entityURI": entity["uri"],
                        "confidence": entity["confidence"],
                    }
                else:
                    type_data = {
                        "typeURI": None,
                        "typeLabel": extracted_type,
                        "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                        "entityURI": entity["uri"],
                        "confidence": entity["confidence"],
                    }
                e["types"].append(type_data)
        if entity["nerdType"]:
            e["types"].append({
                "typeURI": entity["nerdType"],
                # nerdType URIs use a "#" fragment for the type name.
                "typeLabel": entity["nerdType"].split("#")[1],
                "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                "entityURI": entity["uri"],
                "confidence": entity["confidence"],
            })
        initial_entities.append(e)
    return initial_entities