Пример #1
0
def cyc_to_conceptnet_uri(labels, unlabels, uri):
    """
    Convert a Cyc URI to a ConceptNet URI, with the following rules:

    - Use the RDF label as the text. (Alternate labels appear to provide
      synonyms, but these are generally automatically generated and aren't
      particularly accurate.)
    - The part of speech is always 'n'. Cyc describes its concepts in a
      noun-like way. At worst, they're gerunds -- instead of "to eat", Cyc
      would define an event of "Eating".
    - If two different Cyc URIs have the same text, we will attempt to
      disambiguate them using the last component of the Cyc URI.
    - Remove the camel-casing from the Cyc URI component. If the phrase we
      get is the same as the natural-language label, disregard it as an
      uninformative disambiguation. Otherwise, that is the disambiguation text.

    A possible objection: Our disambiguation doesn't distinguish Cyc URIs that
    differ in capitalization, or differ by using underscores instead of
    camel-case. However, I've noticed that such URIs are usually
    *unintentional* duplicates that are okay to merge. If they were really
    unrelated concepts that needed to be distinguished, someone would have
    given them different names.

    Even so, we end up with some unnecessary word senses, such as different
    senses for "mens clothing", "men's clothing", and "men s clothing".
    """
    label = filter_stopwords(labels[uri])
    if len(unlabels[label]) >= 2:
        disambig = filter_stopwords(un_camel_case(resource_name(uri)))
        if simple_tokenize(disambig) != simple_tokenize(label):
            return standardized_concept_uri('en', label, 'n', 'opencyc', disambig)
    return standardized_concept_uri('en', label, 'n')
Пример #2
0
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if 'album' in sense or 'film' in sense or 'series' in sense or 'disambiguation' in sense or 'song' in sense or 'album' in sense or 'band' in sense:
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Пример #3
0
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(
            open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(
            open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels:
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(
                opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label,
                             obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif rel_name == 'sameAs' and web_subj in labels and web_obj.startswith(
                'http://umbel.org/'):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
Пример #4
0
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if 'album' in sense or 'film' in sense or 'series' in sense or 'disambiguation' in sense or 'song' in sense or 'album' in sense or 'band' in sense:
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Пример #5
0
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels:
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif rel_name == 'sameAs' and web_subj in labels and web_obj.startswith('http://umbel.org/'):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
Пример #6
0
def map_dbpedia_relation(url):
    """
    Recognize some relations that we can extract from DBPedia, and convert
    them to ConceptNet relations. If the relation is specific to DBPedia, it'll
    be in the '/r/dbpedia' namespace.

    >>> map_dbpedia_relation('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    '/r/InstanceOf'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/location')
    '/r/AtLocation'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/genre')
    '/r/dbpedia/genre'
    """
    name = resource_name(url)
    if name in RELATIONS:
        return RELATIONS[name]
    else:
        return None
Пример #7
0
def map_dbpedia_relation(url):
    """
    Recognize some relations that we can extract from DBPedia, and convert
    them to ConceptNet relations. If the relation is specific to DBPedia, it'll
    be in the '/r/dbpedia' namespace.

    >>> map_dbpedia_relation('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    '/r/InstanceOf'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/location')
    '/r/AtLocation'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/genre')
    '/r/dbpedia/genre'
    """
    name = resource_name(url)
    if name in RELATIONS:
        return RELATIONS[name]
    else:
        return None
Пример #8
0
def map_dbpedia_relation(url):
    """
    Recognize three relations that we can extract from DBPedia, and convert
    them to ConceptNet relations.

    >>> map_dbpedia_relation('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    '/r/IsA'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/location')
    '/r/AtLocation'
    """
    name = resource_name(url)
    if name == 'type':
        return '/r/IsA'
    elif name.startswith('isPartOf'):
        return '/r/PartOf'
    elif name.startswith('location'):
        return '/r/AtLocation'
    else:
        return None
Пример #9
0
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are rarely used or are the result of
    parsing glitches. The URL itself is a stable thing that we can build a
    ConceptNet URI from, on the other hand.
    """
    if '__' in url or 'dbpedia.org' not in url:
        return None
    parsed = parse_url(url)
    domain = parsed.netloc
    if '.' not in domain:
        return None

    if domain == 'dbpedia.org':
        # Handle old DBPedia URLs that had no language code
        domain = 'en.dbpedia.org'

    domain_parts = domain.split('.', 1)
    if domain_parts[1] == 'dbpedia.org':
        lang = domain_parts[0]
        if lang in LCODE_ALIASES:
            lang = LCODE_ALIASES[lang]
        if lang not in ALL_LANGUAGES:
            return None
        text = resource_name(url).replace('_', ' ')
        uri = topic_to_concept(lang, text)
        if uri in CONCEPT_BLACKLIST:
            return None
        else:
            return uri
    else:
        return None
Пример #10
0
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are rarely used or are the result of
    parsing glitches. The URL itself is a stable thing that we can build a
    ConceptNet URI from, on the other hand.
    """
    if '__' in url or 'dbpedia.org' not in url:
        return None
    parsed = parse_url(url)
    domain = parsed.netloc
    if '.' not in domain:
        return None

    if domain == 'dbpedia.org':
        # Handle old DBPedia URLs that had no language code
        domain = 'en.dbpedia.org'

    domain_parts = domain.split('.', 1)
    if domain_parts[1] == 'dbpedia.org':
        lang = domain_parts[0]
        if lang in LCODE_ALIASES:
            lang = LCODE_ALIASES[lang]
        if lang not in ALL_LANGUAGES:
            return None
        text = resource_name(url).replace('_', ' ')
        uri = topic_to_concept(lang, text)
        if uri in CONCEPT_BLACKLIST:
            return None
        else:
            return uri
    else:
        return None
Пример #11
0
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are the result of parsing glitches.
    The URL itself is a stable thing that we can build a ConceptNet URI from,
    on the other hand.
    """
    parsed = parse_url(url)
    domain = parsed.netloc

    if domain == "dbpedia.org":
        # Handle old DBPedia URLs that had no language code
        lang = "en"
    else:
        domain_parts = domain.split(".", 1)
        if domain_parts[1] == "dbpedia.org":
            lang = domain_parts[0]

            # If we can't name this language in English, it's probably
            # not really a language.
            if langcodes.get(lang).language_name("en") == lang:
                return None
        else:
            return None

    # Some Semantic Web URLs are camel-cased. ConceptNet URIs use underscores
    # between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return standardized_concept_uri(lang, *pieces)
Пример #12
0
def translate_dbpedia_url(url):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are the result of parsing glitches.
    The URL itself is a stable thing that we can build a ConceptNet URI from,
    on the other hand.
    """
    parsed = parse_url(url)
    domain = parsed.netloc

    if domain == 'dbpedia.org':
        # Handle old DBPedia URLs that had no language code
        lang = 'en'
    else:
        domain_parts = domain.split('.', 1)
        if domain_parts[1] == 'dbpedia.org':
            lang = domain_parts[0]

            # If we can't name this language in English, it's probably
            # not really a language.
            if langcodes.get(lang).language_name('en') == lang:
                return None
        else:
            return None

    # Some Semantic Web URLs are camel-cased. ConceptNet URIs use underscores
    # between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return standardized_concept_uri(lang, *pieces)
Пример #13
0
def translate_dbpedia_url(url, lang='en'):
    """
    Convert an object that's defined by a DBPedia URL to a ConceptNet
    URI. We do this by finding the part of the URL that names the object,
    and using that as surface text for ConceptNet.

    This is, in some ways, abusing a naming convention in the Semantic Web.
    The URL of an object doesn't have to mean anything at all. The
    human-readable name is supposed to be a string, specified by the "name"
    relation.

    The problem here is that the "name" relation is not unique in either
    direction. A URL can have many names, and the same name can refer to
    many URLs, and some of these names are the result of parsing glitches.
    The URL itself is a stable thing that we can build a ConceptNet URI from,
    on the other hand.
    """
    # Some Semantic Web URLs are camel-cased. ConceptNet URIs use underscores
    # between words.
    pieces = parse_topic_name(resource_name(url))
    pieces[0] = un_camel_case(pieces[0])
    return normalized_concept_uri(lang, *pieces)
Пример #14
0
def map_dbpedia_relation(url):
    """
    Recognize some relations that we can extract from DBPedia, and convert
    them to ConceptNet relations. If the relation is specific to DBPedia, it'll
    be in the '/r/dbpedia' namespace.

    >>> map_dbpedia_relation('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    '/r/InstanceOf'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/location')
    '/r/AtLocation'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/genre')
    '/r/dbpedia/genre'
    """
    name = resource_name(url)
    if name in {"type", "occupation"}:
        return "/r/InstanceOf"
    elif name.startswith("location"):
        return "/r/AtLocation"
    elif name == "sameAs":
        return "/r/TranslationOf"
    elif name in SPECIFIC_RELATION_WHITELIST:
        return "/r/dbpedia/%s" % name
    else:
        return None
Пример #15
0
def map_dbpedia_relation(url):
    """
    Recognize some relations that we can extract from DBPedia, and convert
    them to ConceptNet relations. If the relation is specific to DBPedia, it'll
    be in the '/r/dbpedia' namespace.

    >>> map_dbpedia_relation('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
    '/r/InstanceOf'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/location')
    '/r/AtLocation'
    >>> map_dbpedia_relation('http://dbpedia.org/ontology/genre')
    '/r/dbpedia/genre'
    """
    name = resource_name(url)
    if name in {'type', 'occupation'}:
        return '/r/InstanceOf'
    elif name.startswith('location'):
        return '/r/AtLocation'
    elif name == 'sameAs':
        return '/r/TranslationOf'
    elif name in SPECIFIC_RELATION_WHITELIST:
        return '/r/dbpedia/%s' % name
    else:
        return None
Пример #16
0
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = '-'.join(parts).replace('_s_', "'s_").replace(
                    '_s-',
                    "'s_").replace("s__",
                                   "s'_").replace("s_-",
                                                  "s'-").replace('_', ' ')
                synset_canonical_labels[subj] = label

        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(
            key=lambda label: (label in used_labels, ) + label_sort_key(label))
        if (synset not in synset_canonical_labels
                or synset_canonical_labels[synset][0].isupper()
                and synset_domains.get(synset) == 'person'):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'
        if category_name in ('pert', 'all', 'tops'):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn',
                                             category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn',
                                                     category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(
                    label, canon)
                edge = make_edge(rel_uri,
                                 other_uri,
                                 canon_uri,
                                 dataset=DATASET,
                                 surfaceText=surface,
                                 license=Licenses.cc_attribution,
                                 sources=[SOURCE],
                                 weight=2.0)
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == 'hypernym' and pos == 'v':
                relname = 'hypernym-v'
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn',
                                                   sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(rel_uri,
                             subj_uri,
                             obj_uri,
                             dataset=DATASET,
                             surfaceText=surface,
                             license=license,
                             sources=[SOURCE],
                             weight=2.0)
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge('/r/ExternalURL',
                         cn_uri,
                         wn_url,
                         dataset=DATASET,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE],
                         weight=1.0)
        out.write(edge)

    out.close()
Пример #17
0
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name('en', name) != standardized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri('en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri('en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
Пример #18
0
def handle_triple(line, reader, out, map_out):
    subj, pred, obj, tag = reader.parse_line(line)
    if tag != "URL":
        return

    # Ignore types of edges that we don't care about:
    #   - Homepage links
    #   - GIS features
    #   - Assertions that something "is a thing"
    #   - Anonymous nodes identified with double-underscores, such as the node
    #     "Alfred_Nobel__1", which means "Alfred Nobel's occupation, whatever
    #     it is"
    #   - Nodes that are articles named "List of X" on Wikipedia
    if (
        "foaf/0.1/homepage" in pred
        or "_Feature" in obj
        or "#Thing" in obj
        or "__" in subj
        or "__" in obj
        or "List_of" in subj
        or "List_of" in obj
        or "Wikidata:" in obj
    ):
        return

    # We don't try to parse URIs from outside of dbpedia.org's namespace.
    if "dbpedia.org" not in obj:
        return

    subj_concept = translate_dbpedia_url(subj)
    obj_concept = translate_dbpedia_url(obj)
    subj_text = un_camel_case(parse_topic_name(resource_name(subj))[0])
    obj_text = un_camel_case(parse_topic_name(resource_name(obj))[0])
    if subj_concept is None or obj_concept is None:
        return

    # DBPedia categorizes a lot of things as 'works', which causes unnecessary
    # ambiguity. Disregard these edges; there will almost always be a more
    # specific edge calling it a 'creative work' anyway.
    if obj_concept in CONCEPT_BLACKLIST:
        return

    rel = map_dbpedia_relation(pred)
    if rel is None:
        return

    if rel in {"/r/IsA", "/r/TranslationOf"}:
        obj_text = obj_text.lower()

    # We've successfully converted this Semantic Web triple to ConceptNet URIs.
    # Now write the results to the 'sw_map' file so others can follow this
    # mapping.
    mapped_pairs = [(pred, rel), (subj, subj_concept), (obj, obj_concept)]
    for sw_url, conceptnet_uri in mapped_pairs:
        conceptnet_url = full_conceptnet_url(conceptnet_uri)
        map_out.write_link(conceptnet_url, sw_url)

    edge = make_edge(
        rel,
        subj_concept,
        obj_concept,
        dataset="/d/dbpedia/en",
        license=Licenses.cc_sharealike,
        sources=["/s/dbpedia/2014"],
        surfaceText=make_surface_text(rel, subj_text, obj_text),
        weight=0.5,
    )

    out.write(edge)
Пример #19
0
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl',
                     'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel,
                                 subj,
                                 obj,
                                 dataset='/d/wordnet/3.0',
                                 license='/l/CC/By',
                                 sources=SOURCE,
                                 weight=2.0)
                out.write(edge)
Пример #20
0
def acceptable_node(url):
    return not (url.endswith('Type') or url.endswith('Concept')
                or resource_name(url) in IGNORED_NODES)
Пример #21
0
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name(
                        'en', name) != standardized_concept_name(
                            'en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri(
                            'en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(
                                name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri(
                                'en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(
                            umbel_edge('/r/Synonym', alt_label, main_label,
                                       surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(
            dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
Пример #22
0
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        if ('Category:' in subj_url or 'File:' in subj_url
                or 'List_of' in subj_url or '__' in subj_url
                or 'Template:' in subj_url):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge('/r/IsA',
                                     subj_concept,
                                     obj_concept,
                                     dataset='/d/dbpedia/en',
                                     license=Licenses.cc_sharealike,
                                     sources=[{
                                         'contributor':
                                         '/s/resource/dbpedia/2015/en'
                                     }],
                                     weight=0.5,
                                     surfaceStart=url_to_label(subj['url']),
                                     surfaceEnd=url_to_label(obj['url']))
                    out.write(edge)
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge('/r/ExternalURL',
                                            subj_concept,
                                            other_url,
                                            dataset='/d/dbpedia/en',
                                            license=Licenses.cc_sharealike,
                                            sources=[{
                                                'contributor':
                                                '/s/resource/dbpedia/2015/en'
                                            }],
                                            weight=1.0)
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept,
                                other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{
                                    'contributor':
                                    '/s/resource/dbpedia/2015/en'
                                }],
                                weight=1.0)
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept,
                                subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{
                                    'contributor':
                                    '/s/resource/dbpedia/2015/en'
                                }],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url))
                            out.write(edge)

    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (subj_concept and obj_concept and subj['url'] in mapped_urls
                and obj['url'] in mapped_urls):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(rel,
                                 subj_concept,
                                 obj_concept,
                                 dataset='/d/dbpedia/en',
                                 license=Licenses.cc_sharealike,
                                 sources=[{
                                     'contributor':
                                     '/s/resource/dbpedia/2015/en'
                                 }],
                                 weight=0.5,
                                 surfaceStart=url_to_label(subj['url']),
                                 surfaceEnd=url_to_label(obj['url']))
                out.write(edge)

    out.close()
Пример #23
0
def handle_triple(line, reader, out, map_out):
    subj, pred, obj, tag = reader.parse_line(line)
    if tag != 'URL':
        return

    # Ignore types of edges that we don't care about:
    #   - Homepage links
    #   - GIS features
    #   - Assertions that something "is a thing"
    #   - Anonymous nodes identified with double-underscores, such as the node
    #     "Alfred_Nobel__1", which means "Alfred Nobel's occupation, whatever
    #     it is"
    #   - Nodes that are articles named "List of X" on Wikipedia
    if (
        'foaf/0.1/homepage' in pred or '_Feature' in obj or '#Thing' in obj or
        '__' in subj or '__' in obj or 'List_of' in subj or 'List_of' in obj
        or 'Wikidata:' in obj
    ):
        return

    # We don't try to parse URIs from outside of dbpedia.org's namespace.
    if 'dbpedia.org' not in obj:
        return

    subj_concept = translate_dbpedia_url(subj)
    obj_concept = translate_dbpedia_url(obj)
    subj_text = un_camel_case(parse_topic_name(resource_name(subj))[0])
    obj_text = un_camel_case(parse_topic_name(resource_name(obj))[0])
    if subj_concept is None or obj_concept is None:
        return

    # DBPedia categorizes a lot of things as 'works', which causes unnecessary
    # ambiguity. Disregard these edges; there will almost always be a more
    # specific edge calling it a 'creative work' anyway.
    if obj_concept in CONCEPT_BLACKLIST:
        return

    rel = map_dbpedia_relation(pred)
    if rel is None:
        return

    if rel in {'/r/IsA', '/r/TranslationOf'}:
        obj_text = obj_text.lower()

    # We've successfully converted this Semantic Web triple to ConceptNet URIs.
    # Now write the results to the 'sw_map' file so others can follow this
    # mapping.
    mapped_pairs = [
        (pred, rel),
        (subj, subj_concept),
        (obj, obj_concept)
    ]
    for sw_url, conceptnet_uri in mapped_pairs:
        conceptnet_url = full_conceptnet_url(conceptnet_uri)
        map_out.write_link(conceptnet_url, sw_url)

    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/dbpedia/en',
                     license=Licenses.cc_sharealike,
                     sources=['/s/dbpedia/2014'],
                     surfaceText=make_surface_text(rel, subj_text, obj_text),
                     weight=0.5)

    out.write(edge)
Пример #24
0
def url_to_label(url):
    return resource_name(url).replace('_', ' ')
Пример #25
0
def acceptable_node(url):
    return not (url.endswith('Type') or url.endswith('Concept')
                or resource_name(url) in IGNORED_NODES)
Пример #26
0
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        objtext = obj_dict.get("text")

        relname = resource_name(rel)
        if relname == "label":
            if obj_dict["lang"] == "en":
                synset_labels[subj].append(objtext)
        elif relname == "sameAs":
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split("-")[1:-2]

                # Handle missing apostrophes
                label = (
                    "-".join(parts)
                    .replace("_s_", "'s_")
                    .replace("_s-", "'s_")
                    .replace("s__", "s'_")
                    .replace("s_-", "s'-")
                    .replace("_", " ")
                )
                synset_canonical_labels[subj] = label

        elif relname == "domain_category":
            synset_categories[subj] = obj
        elif relname == "lexical_domain":
            target = resource_name(obj)
            if "." in target:
                domain = target.split(".")[1]
                synset_domains[subj] = domain
        elif relname == "gloss":
            synset_glosses[subj] = objtext
        elif relname == "reference":
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels
            or synset_canonical_labels[synset][0].isupper()
            and synset_domains.get(synset) == "person"
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split("#")[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in "nvarsp", synset
        if pos == "s":
            pos = "a"
        elif pos == "p":
            pos = "-"
        if category_name in ("pert", "all", "tops"):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri("en", canon, pos, "wn", category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri(
                    "en", label, pos, "wn", category_name
                )
                rel_uri = "/r/Synonym"
                surface = "[[{0}]] is a synonym of [[{1}]]".format(label, canon)
                edge = make_edge(
                    rel_uri,
                    other_uri,
                    canon_uri,
                    dataset=DATASET,
                    surfaceText=surface,
                    license=Licenses.cc_attribution,
                    sources=[SOURCE],
                    weight=2.0,
                )
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == "hypernym" and pos == "v":
                relname = "hypernym-v"
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith("~"):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = "/r/" + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict["text"]
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or "!" in text:
                    continue
                lang = obj_dict["lang"]
                obj_uri = standardized_concept_uri(lang, text, pos, "wn", sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split("/")[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format("[[%s]]" % subj_label, "[[%s]]" % obj_label)

            edge = make_edge(
                rel_uri,
                subj_uri,
                obj_uri,
                dataset=DATASET,
                surfaceText=surface,
                license=license,
                sources=[SOURCE],
                weight=2.0,
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            "/r/ExternalURL",
            cn_uri,
            wn_url,
            dataset=DATASET,
            license=Licenses.cc_sharealike,
            sources=[SOURCE],
            weight=1.0,
        )
        out.write(edge)

    out.close()
Пример #27
0
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
Пример #28
0
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        if (
            'Category:' in subj_url or 'File:' in subj_url or
            'List_of' in subj_url or '__' in subj_url or
            'Template:' in subj_url
        ):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge(
                        '/r/IsA', subj_concept, obj_concept,
                        dataset='/d/dbpedia/en',
                        license=Licenses.cc_sharealike,
                        sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                        weight=0.5,
                        surfaceStart=url_to_label(subj['url']),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                    out.write(edge)
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge(
                            '/r/ExternalURL',
                            subj_concept, other_url,
                            dataset='/d/dbpedia/en',
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                            weight=1.0
                        )
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept, other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=1.0
                            )
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept, subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url)
                            )
                            out.write(edge)

    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (
            subj_concept and obj_concept and
            subj['url'] in mapped_urls and obj['url'] in mapped_urls
        ):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(
                    rel, subj_concept, obj_concept,
                    dataset='/d/dbpedia/en',
                    license=Licenses.cc_sharealike,
                    sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                    weight=0.5,
                    surfaceStart=url_to_label(subj['url']),
                    surfaceEnd=url_to_label(obj['url'])
                )
                out.write(edge)

    out.close()
Пример #29
0
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = '-'.join(parts).replace('_s_', "'s_").replace('_s-', "'s_").replace("s__", "s'_").replace("s_-", "s'-").replace('_', ' ')
                synset_canonical_labels[subj] = label

        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels or
            synset_canonical_labels[synset][0].isupper() and synset_domains.get(synset) == 'person'
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'
        if category_name in ('pert', 'all', 'tops'):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn', category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn', category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(label, canon)
                edge = make_edge(
                    rel_uri, other_uri, canon_uri, dataset=DATASET, surfaceText=surface,
                    license=Licenses.cc_attribution, sources=[SOURCE], weight=2.0
                )
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                pos, sense = synset_disambig.get(subj, (None, None))
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(
                rel_uri, subj_uri, obj_uri, dataset=DATASET, surfaceText=surface,
                license=license, sources=[SOURCE], weight=2.0
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            '/r/ExternalURL', cn_uri, wn_url, dataset=DATASET,
            license=Licenses.cc_sharealike, sources=[SOURCE], weight=1.0
        )
        out.write(edge)

    out.close()
Пример #30
0
def url_to_label(url):
    return resource_name(url).replace('_', ' ')