Example #1
def process_spotlight_api(text):
    try:
        entities = spotlight.annotate(
            "http://spotlight.dbpedia.org/rest/annotate",
            text,
            confidence=0.1,
            support=0
        )
    except Exception:
        # Fall back to an empty result if the Spotlight request fails.
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in entities:
        occ = 0
        # NOTE: occ is initialised to 0 just above, so this quote-count
        # adjustment never actually runs as written.
        if occ != 0:
            occ = text.count('"', 0, entity["offset"] + len(entity["surfaceForm"]) - 1)
        start = entity["offset"] + occ
        end = entity["offset"] + len(entity["surfaceForm"]) + occ

        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= start and link_match["end"] >= end:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["surfaceForm"],
                "startOffset": start,
                "endOffset": end,
                "confidence": entity["similarityScore"],
                "provenance": "dbpediaspotlight",
                "types": []
            }

            types = []
            for data_type in entity["types"].split(","):
                link = data_type
                if "DBpedia:" in data_type:
                    link = "http://en.dbpedia.org/resource/" + data_type.split(":")[1]
                if "Freebase:" in data_type:
                    link = "http://www.freebase.com" + data_type.split(":")[1]

                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": link,
                    "confidence": entity["similarityScore"],
                    "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(link)
                }
                types.append(dbpedia_type)

            e["types"].append(types)
            initial_entities.append(e)

    return initial_entities
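
A minimal usage sketch (not part of the original example): it assumes process_spotlight_api and its project helpers (spotlight, HyperLink, DbpediaLink) are importable and that the public Spotlight endpoint responds; the sample sentence is hypothetical.

if __name__ == "__main__":
    # Hypothetical input text; prints the fields built by process_spotlight_api.
    sample = "Barack Obama was born in Hawaii."
    for entity in process_spotlight_api(sample):
        print(entity["label"], entity["startOffset"], entity["endOffset"], entity["confidence"])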
Example #2
def process_textrazor_api(text):
    client = TextRazor(
        api_key='67ef1ca06614f7d202b23f1444bd7ee1ea2f916b3ecf488f8d39f800',
        extractors=[
            "entities",
            "topics",
            "words",
            "phrases",
            "dependency-trees",
            "senses"
        ]
    )

    try:
        response = client.analyze(text)
    except Exception:
        # Fall back to an empty result if the TextRazor request fails.
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in response.entities():
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity.starting_position and link_match["end"] >= entity.ending_position:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity.matched_text,
                "startOffset": entity.starting_position,
                "endOffset": entity.ending_position,
                "confidence": entity.confidence_score,
                "relevance": entity.relevance_score,
                "provenance": "textrazor",
                "wikipediaLink": entity.wikipedia_link,
                "types": []
            }

            for dbpedia_type in entity.dbpedia_types:
                wiki_link = "http://en.wikipedia.org/wiki/" + dbpedia_type

                dbpedia_type_list = {
                    "typeURI": None,
                    "typeLabel": dbpedia_type,
                    "wikiURI": wiki_link,
                    "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_link),
                    "confidence": entity.confidence_score
                }

                e["types"].append(dbpedia_type_list)

            for freebase_type in entity.freebase_types:
                freebase_link = "http://www.freebase.com" + freebase_type

                freebase_type_list = {
                    "typeURI": None,
                    "typeLabel": "Freebase:" + freebase_type.replace(" ", ""),
                    "wikiURI": None,
                    "entityURI": freebase_link,
                    "confidence": entity.confidence_score
                }

                e["types"].append(freebase_type_list)

            wiki_type_list = {
                "typeURI": None,
                "typeLabel": [],
                "wikiURI": entity.wikipedia_link,
                "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity.wikipedia_link),
                "confidence": entity.confidence_score
            }

            e["types"].append(wiki_type_list)

            initial_entities.append(e)

    return initial_entities
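
A quick usage sketch (an assumption, not from the original source): it requires a valid TextRazor API key and the project helpers HyperLink and DbpediaLink used above; the input sentence is made up.

if __name__ == "__main__":
    for entity in process_textrazor_api("Berlin is the capital of Germany."):
        # Each entity is the flat dict built above, with nested type dicts.
        print(entity["label"], entity["wikipediaLink"])
        for entity_type in entity["types"]:
            print(entity_type["typeLabel"], entity_type["entityURI"])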
Example #3
def process_textrazor_api(text):
    client = TextRazor(
        api_key="67ef1ca06614f7d202b23f1444bd7ee1ea2f916b3ecf488f8d39f800",
        extractors=["entities", "topics", "words", "phrases", "dependency-trees", "senses"],
    )

    try:
        response = client.analyze(text)
    except Exception:
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in response.entities():
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity.starting_position and link_match["end"] >= entity.ending_position:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity.matched_text,
                "startOffset": entity.starting_position,
                "endOffset": entity.ending_position,
                "confidence": entity.confidence_score,
                "relevance": entity.relevance_score,
                "provenance": "textrazor",
                "wikipediaLink": entity.wikipedia_link,
                "types": [],
            }

            for dbpedia_type in entity.dbpedia_types:
                wiki_link = "http://en.wikipedia.org/wiki/" + dbpedia_type

                dbpedia_type_list = {
                    "typeURI": None,
                    "typeLabel": dbpedia_type,
                    "wikiURI": wiki_link,
                    "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_link),
                    "confidence": entity.confidence_score,
                }

                e["types"].append(dbpedia_type_list)

            for freebase_type in entity.freebase_types:
                freebase_link = "http://www.freebase.com" + freebase_type

                freebase_type_list = {
                    "typeURI": None,
                    "typeLabel": "Freebase:" + freebase_type.replace(" ", ""),
                    "wikiURI": None,
                    "entityURI": freebase_link,
                    "confidence": entity.confidence_score,
                }

                e["types"].append(freebase_type_list)

            wiki_type_list = {
                "typeURI": None,
                "typeLabel": [],
                "wikiURI": entity.wikipedia_link,
                "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity.wikipedia_link),
                "confidence": entity.confidence_score,
            }

            e["types"].append(wiki_type_list)

            initial_entities.append(e)

    return initial_entities
Example #4
def process_spotlight_api(text):
    try:
        entities = spotlight.annotate(
            "http://spotlight.dbpedia.org/rest/annotate",
            text,
            confidence=0.1,
            support=0)
    except Exception:
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in entities:
        occ = 0
        # NOTE: occ is initialised to 0 just above, so this quote-count
        # adjustment never actually runs as written.
        if occ != 0:
            occ = text.count('"', 0, entity["offset"] + len(entity["surfaceForm"]) - 1)
        start = entity["offset"] + occ
        end = entity["offset"] + len(entity["surfaceForm"]) + occ

        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= start and link_match["end"] >= end:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["surfaceForm"],
                "startOffset": start,
                "endOffset": end,
                "confidence": entity["similarityScore"],
                "provenance": "dbpediaspotlight",
                "types": []
            }

            types = []
            for data_type in entity["types"].split(","):
                link = data_type
                if "DBpedia:" in data_type:
                    link = "http://en.dbpedia.org/resource/" + data_type.split(
                        ":")[1]
                if "Freebase:" in data_type:
                    link = "http://www.freebase.com" + data_type.split(":")[1]

                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": link,
                    "confidence": entity["similarityScore"],
                    "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(link)
                }
                types.append(dbpedia_type)

            e["types"].append(types)
            initial_entities.append(e)

    return initial_entities
Example #5
def process_nerd_api(text):
    try:
        timeout = 10
        text = urllib.quote_plus(text)
        n = nerd.NERD("nerd.eurecom.fr", 'akkqfgos0p85mcubcfgp82rn92d23enu')
        entities = n.extract(text, "combined", timeout)
    except Exception:
        # Fall back to an empty result if the NERD request fails.
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in entities:
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["startChar"] and link_match[
                    "end"] >= entity["endChar"]:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["label"],
                "startOffset": entity["startChar"],
                "endOffset": entity["endChar"],
                "confidence": entity["confidence"],
                "provenance": "nerd-" + entity["extractor"],
                "types": []
            }

            if entity["extractorType"]:
                all_types = entity["extractorType"].split(",")

                for extracted_type in all_types:

                    if "dbpedia" in extracted_type:
                        type_data = {
                            "typeURI": extracted_type,
                            "typeLabel": None,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"]
                        }
                    else:
                        type_data = {
                            "typeURI": None,
                            "typeLabel": extracted_type,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"]
                        }

                    e["types"].append(type_data)

                if entity["nerdType"]:
                    nerd_type_data = {
                        "typeURI": entity["nerdType"],
                        "typeLabel": entity["nerdType"].split("#")[1],
                        "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                        "entityURI": entity["uri"],
                        "confidence": entity["confidence"]
                    }

                    e["types"].append(nerd_type_data)

            initial_entities.append(e)

    return initial_entities
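
A hedged usage sketch (not in the original): it assumes the NERD service at nerd.eurecom.fr, the API key above, and the project helpers are all available; the input text is hypothetical.

if __name__ == "__main__":
    for entity in process_nerd_api("Tim Berners-Lee invented the World Wide Web."):
        print(entity["provenance"], entity["label"], entity["confidence"])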
Example #6
def process_tagme_api(text):
    try:
        lang = "en"
        url = "http://tagme.di.unipi.it/tag?"
        include_categories = "true"
        include_all_spots = "true"
        api_key = read_api_key()
        link_matches = HyperLink.extract_all_url(text)

        request = "key=" + api_key + \
                  "&include_categories=" + include_categories + \
                  "&lang=" + lang + \
                  "&include_all_spots=" + include_all_spots + \
                  "&text=" + urllib.quote_plus(text)

        entities = requests.post(url + request)
        entities = json.loads(entities.text)
    except Exception:
        # Fall back to an empty result if the TagMe request fails.
        return []

    initial_entities = []
    for entity in entities["annotations"]:
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["start"] and link_match[
                    "end"] >= entity["end"]:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["spot"],
                "startOffset": entity["start"],
                "endOffset": entity["end"],
                "confidence": entity["rho"],
                "provenance": "tagme",
                "types": []
            }

            types = []
            for data_type in entity["dbpedia_categories"]:
                wiki_url = "http://en.wikipedia.org/wiki/" + data_type.replace(" ", "_")
                dbpedia_type = {
                    "typeURI": None,
                    "typeLabel": data_type,
                    "entityURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(wiki_url),
                    "confidence": None,
                    "wikiURI": wiki_url
                }
                types.append(dbpedia_type)

            e["types"].append(types)
            initial_entities.append(e)

    initial_entities = {v["label"]: v for v in initial_entities}.values()
    return initial_entities
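
A small usage sketch (an illustration, not from the source project): it assumes read_api_key() returns a valid TagMe key and that the tagme.di.unipi.it endpoint is reachable; note that results are de-duplicated by label before being returned.

if __name__ == "__main__":
    for entity in process_tagme_api("Rome is the capital of Italy."):
        print(entity["label"], entity["confidence"])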
Example #7
def process_nerd_api(text):
    try:
        timeout = 10
        text = urllib.quote_plus(text)
        n = nerd.NERD("nerd.eurecom.fr", "akkqfgos0p85mcubcfgp82rn92d23enu")
        entities = n.extract(text, "combined", timeout)
    except Exception:
        return []

    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity in entities:
        possible_link = False
        for link_match in link_matches:
            if link_match["start"] <= entity["startChar"] and link_match["end"] >= entity["endChar"]:
                possible_link = True

        if not possible_link:
            e = {
                "label": entity["label"],
                "startOffset": entity["startChar"],
                "endOffset": entity["endChar"],
                "confidence": entity["confidence"],
                "provenance": "nerd-" + entity["extractor"],
                "types": [],
            }

            if entity["extractorType"]:
                all_types = entity["extractorType"].split(",")

                for extracted_type in all_types:

                    if "dbpedia" in extracted_type:
                        type_data = {
                            "typeURI": extracted_type,
                            "typeLabel": None,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"],
                        }
                    else:
                        type_data = {
                            "typeURI": None,
                            "typeLabel": extracted_type,
                            "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                            "entityURI": entity["uri"],
                            "confidence": entity["confidence"],
                        }

                    e["types"].append(type_data)

                if entity["nerdType"]:
                    nerd_type_data = {
                        "typeURI": entity["nerdType"],
                        "typeLabel": entity["nerdType"].split("#")[1],
                        "wikiURI": DbpediaLink.get_english_resource_from_english_wikipedia_link(entity["uri"]),
                        "entityURI": entity["uri"],
                        "confidence": entity["confidence"],
                    }

                    e["types"].append(nerd_type_data)

            initial_entities.append(e)

    return initial_entities
Example #8
def process_tdh_api(text):
    url = "http://entityclassifier.eu/thd/api/v2/extraction?"
    lang = "en"
    content_format = "json"
    entity_types = ["ne", "ce"]
    priority_entity_linking = "true"
    api_key = read_api_key()
    link_matches = HyperLink.extract_all_url(text)

    initial_entities = []
    for entity_type in entity_types:
        request = "apikey=" + api_key + \
                  "&format=" + content_format + \
                  "&lang=" + lang + \
                  "&priority_entity_linking=" + priority_entity_linking + \
                  "&entity_type=" + entity_type

        try:
            entities = requests.post(url + request, urllib.quote_plus(text))
            entities = json.loads(entities.text)
        except Exception:
            # Abort and return no entities if the THD request fails.
            return []

        for entity in entities:
            possible_link = False
            for link_match in link_matches:
                if link_match["start"] <= entity["startOffset"] and link_match[
                        "end"] >= entity["endOffset"]:
                    possible_link = True

            if not possible_link:
                e = {
                    "label": entity["underlyingString"],
                    "startOffset": entity["startOffset"],
                    "endOffset": entity["endOffset"],
                    "confidence": None,
                    "provenance": "thd",
                    "types": []
                }

                types = []
                try:
                    for data_type in entity["types"]:
                        thd_type = {
                            "typeURI": data_type["typeURI"],
                            "typeLabel": data_type["typeLabel"],
                            "entityURI": data_type["entityURI"],
                            "confidence": data_type["salience"]["confidence"],
                            "wikiURI": DbpediaLink.get_english_wikipedia_link_from_english_resource(data_type["entityURI"]),
                        }
                        types.append(thd_type)

                    e["types"].append(types)
                except KeyError:
                    continue
                initial_entities.append(e)

    initial_entities = {v["label"]: v for v in initial_entities}.values()
    return initial_entities
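
A closing usage sketch (an assumption, not part of the original example): it presumes read_api_key() returns a valid entityclassifier.eu key and that the project helpers above are importable; the sample sentence is invented.

if __name__ == "__main__":
    for entity in process_tdh_api("Prague is the capital of the Czech Republic."):
        print(entity["label"], entity["provenance"], [t["typeLabel"] for t in entity["types"]])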