예제 #1
0
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data and write ConceptNet edges to
    `output_file`.

    Two kinds of edges go to the same msgpack stream: '/r/IsA' edges built
    from 'subClassOf' statements, and external-URL edges that map ConceptNet
    URIs to Semantic Web URLs (the OpenCyc URLs of both ends of each IsA
    edge, plus umbel.org URLs found through 'sameAs' statements).

    Parameters:
        input_file: path of the N-Quads file. It is opened as UTF-8 text and
            read twice.
        output_file: destination passed to `MsgpackStreamWriter`.
    """
    out = MsgpackStreamWriter(output_file)
    try:
        labels = {}
        unlabels = defaultdict(set)
        seen_external_urls = set()

        def write_external_url(concept_uri, web_url):
            # Emit each (ConceptNet URI, external URL) pairing only once.
            if (concept_uri, web_url) not in seen_external_urls:
                out.write(external_url_edge(concept_uri, web_url))
                seen_external_urls.add((concept_uri, web_url))

        # Read through the file once, finding all the "preferred labels". We
        # will use these as the surface texts for the nodes.
        with open(input_file, encoding='utf-8') as infile:
            for subj, pred, obj, _graph in parse_nquads(infile):
                if pred['url'] == RDF_LABEL:
                    labels[subj['url']] = obj['text']
                    unlabels[obj['text']].add(subj['url'])

        # Read through the file again and extract ConceptNet edges.
        with open(input_file, encoding='utf-8') as infile:
            for subj, pred, obj, _graph in parse_nquads(infile):
                rel_name = resource_name(pred['url'])
                # Either end may lack a 'url' key, in which case it is None.
                web_subj = subj.get('url')
                web_obj = obj.get('url')
                if (
                    rel_name == 'subClassOf' and web_obj is not None
                    and web_subj in labels and web_obj in labels
                ):
                    subj_label = labels[web_subj]
                    obj_label = labels[web_obj]
                    if '_' in subj_label or '_' in obj_label:
                        continue
                    if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                        continue
                    subj_words = set(simple_tokenize(subj_label))
                    obj_words = set(simple_tokenize(obj_label))
                    if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                        continue
                    # Skip labels with more than four distinct tokens.
                    if len(subj_words) > 4 or len(obj_words) > 4:
                        continue

                    subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
                    obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
                    out.write(
                        opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label,
                                     obj_label))
                    write_external_url(subj_uri, web_subj)
                    write_external_url(obj_uri, web_obj)
                elif (
                    rel_name == 'sameAs' and web_obj is not None
                    and web_subj in labels
                    and web_obj.startswith('http://umbel.org/')
                ):
                    # The None check keeps a 'sameAs' statement whose object
                    # has no URL from crashing on None.startswith.
                    subj_label = labels[web_subj]
                    subj_uri = standardized_concept_uri('en', subj_label)
                    write_external_url(subj_uri, web_obj)
    finally:
        # Close the writer even when parsing fails part-way through.
        out.close()
예제 #2
0
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data and write ConceptNet edges to
    `output_file`.

    Two kinds of edges go to the same msgpack stream: '/r/IsA' edges built
    from 'subClassOf' statements, and external-URL edges that map ConceptNet
    URIs to Semantic Web URLs (the OpenCyc URLs of both ends of each IsA
    edge, plus umbel.org URLs found through 'sameAs' statements).

    Parameters:
        input_file: path of the N-Quads file. It is opened as UTF-8 text and
            read twice.
        output_file: destination passed to `MsgpackStreamWriter`.
    """
    out = MsgpackStreamWriter(output_file)
    try:
        labels = {}
        unlabels = defaultdict(set)
        seen_external_urls = set()

        def write_external_url(concept_uri, web_url):
            # Emit each (ConceptNet URI, external URL) pairing only once.
            if (concept_uri, web_url) not in seen_external_urls:
                out.write(external_url_edge(concept_uri, web_url))
                seen_external_urls.add((concept_uri, web_url))

        # Read through the file once, finding all the "preferred labels". We
        # will use these as the surface texts for the nodes.
        with open(input_file, encoding='utf-8') as infile:
            for subj, pred, obj, _graph in parse_nquads(infile):
                if pred['url'] == RDF_LABEL:
                    labels[subj['url']] = obj['text']
                    unlabels[obj['text']].add(subj['url'])

        # Read through the file again and extract ConceptNet edges.
        with open(input_file, encoding='utf-8') as infile:
            for subj, pred, obj, _graph in parse_nquads(infile):
                rel_name = resource_name(pred['url'])
                # Either end may lack a 'url' key, in which case it is None.
                web_subj = subj.get('url')
                web_obj = obj.get('url')
                if (
                    rel_name == 'subClassOf' and web_obj is not None
                    and web_subj in labels and web_obj in labels
                ):
                    subj_label = labels[web_subj]
                    obj_label = labels[web_obj]
                    if '_' in subj_label or '_' in obj_label:
                        continue
                    if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                        continue
                    subj_words = set(simple_tokenize(subj_label))
                    obj_words = set(simple_tokenize(obj_label))
                    if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                        continue
                    # Skip labels with more than four distinct tokens.
                    if len(subj_words) > 4 or len(obj_words) > 4:
                        continue

                    subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
                    obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
                    out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
                    write_external_url(subj_uri, web_subj)
                    write_external_url(obj_uri, web_obj)
                elif (
                    rel_name == 'sameAs' and web_obj is not None
                    and web_subj in labels
                    and web_obj.startswith('http://umbel.org/')
                ):
                    # The None check keeps a 'sameAs' statement whose object
                    # has no URL from crashing on None.startswith.
                    subj_label = labels[web_subj]
                    subj_uri = standardized_concept_uri('en', subj_label)
                    write_external_url(subj_uri, web_obj)
    finally:
        # Close the writer even when parsing fails part-way through.
        out.close()
예제 #3
0
def interlanguage_mapping(interlang_path, ok_concepts):
    """
    Map DBPedia subject URLs to the URLs they are linked to in a
    bz2-compressed N-Quads file of interlanguage links.

    Consecutive quads with the same subject are handled as one group. A
    subject is kept only when the prefix of its translated concept URI is in
    `ok_concepts`; subjects whose URI has at least six pieces and whose sixth
    piece contains 'album', 'film', 'series', 'disambiguation', 'song' or
    'band' are skipped.

    Parameters:
        interlang_path: path of the .bz2 file (converted with `str()`).
        ok_concepts: container of accepted concept-URI prefixes.

    Returns:
        A dict from subject URL to a list that starts with the subject URL
        itself, followed by the object URLs of its quads, leaving out any
        that contain 'www.wikidata.org'. Reading stops at the first
        'http://wikidata.dbpedia.org/' object whose numeric ID is 1000000 or
        higher; the subject being processed at that moment is not included.
    """
    excluded_senses = (
        'album', 'film', 'series', 'disambiguation', 'song', 'band'
    )
    mapping = {}
    # The context manager closes the file on the early return below as well.
    with bz2.open(str(interlang_path), 'rt') as infile:
        quads = parse_nquads(infile)
        for subj, values in itertools.groupby(quads, itemgetter(0)):
            subj_url = subj['url']
            subj_concept = translate_dbpedia_url(subj_url)
            pieces = split_uri(subj_concept)
            if len(pieces) >= 6:
                sense = pieces[5]
                if any(word in sense for word in excluded_senses):
                    continue
            if uri_prefix(subj_concept) in ok_concepts:
                targets = [subj_url]

                for _subj, _pred, obj, _graph in values:
                    url = obj['url']
                    if 'www.wikidata.org' in url:
                        continue
                    if url.startswith('http://wikidata.dbpedia.org/'):
                        wikidata_id = resource_name(url)

                        # Return early when we see a high-numbered Wikidata
                        # ID. The first character of the ID is dropped before
                        # the number is parsed.
                        if int(wikidata_id[1:]) >= 1000000:
                            return mapping
                    targets.append(url)

                mapping[subj_url] = targets
    return mapping
예제 #4
0
def interlanguage_mapping(interlang_path, ok_concepts):
    """
    Map DBPedia subject URLs to the URLs they are linked to in a
    bz2-compressed N-Quads file of interlanguage links.

    Consecutive quads with the same subject are handled as one group. A
    subject is kept only when the prefix of its translated concept URI is in
    `ok_concepts`; subjects whose URI has at least six pieces and whose sixth
    piece contains 'album', 'film', 'series', 'disambiguation', 'song' or
    'band' are skipped.

    Parameters:
        interlang_path: path of the .bz2 file (converted with `str()`).
        ok_concepts: container of accepted concept-URI prefixes.

    Returns:
        A dict from subject URL to a list that starts with the subject URL
        itself, followed by the object URLs of its quads, leaving out any
        that contain 'www.wikidata.org'. Reading stops at the first
        'http://wikidata.dbpedia.org/' object whose numeric ID is 1000000 or
        higher; the subject being processed at that moment is not included.
    """
    excluded_senses = (
        'album', 'film', 'series', 'disambiguation', 'song', 'band'
    )
    mapping = {}
    # The context manager closes the file on the early return below as well.
    with bz2.open(str(interlang_path), 'rt') as infile:
        quads = parse_nquads(infile)
        for subj, values in itertools.groupby(quads, itemgetter(0)):
            subj_url = subj['url']
            subj_concept = translate_dbpedia_url(subj_url)
            pieces = split_uri(subj_concept)
            if len(pieces) >= 6:
                sense = pieces[5]
                if any(word in sense for word in excluded_senses):
                    continue
            if uri_prefix(subj_concept) in ok_concepts:
                targets = [subj_url]

                for _subj, _pred, obj, _graph in values:
                    url = obj['url']
                    if 'www.wikidata.org' in url:
                        continue
                    if url.startswith('http://wikidata.dbpedia.org/'):
                        wikidata_id = resource_name(url)

                        # Return early when we see a high-numbered Wikidata
                        # ID. The first character of the ID is dropped before
                        # the number is parsed.
                        if int(wikidata_id[1:]) >= 1000000:
                            return mapping
                    targets.append(url)

                mapping[subj_url] = targets
    return mapping
예제 #5
0
def run_wordnet(input_file, output_file):
    """
    Convert an RDF WordNet N-Quads file into ConceptNet edges.

    The input is read twice. The first pass gathers per-synset data (English
    labels, a canonical label, a category and a lexical domain). From that,
    each labelled synset gets a ConceptNet URI, and '/r/Synonym' edges are
    written from its other labels to its canonical label. The second pass
    writes one edge for every quad whose relation name is in `REL_MAPPING`.
    Finally, one '/r/ExternalURL' edge per synset links its ConceptNet URI
    to its WordNet URL.

    Parameters:
        input_file: path of the N-Quads file, opened as UTF-8 text.
        output_file: destination passed to `MsgpackStreamWriter`.
    """
    out = MsgpackStreamWriter(output_file)
    try:
        # NOTE(review): synset_senses, sense_synsets and synset_glosses are
        # filled in by the first pass but never read in this function.
        synset_senses = defaultdict(list)
        sense_synsets = {}
        synset_labels = defaultdict(list)
        synset_canonical_labels = {}
        synset_categories = {}
        synset_domains = {}
        synset_glosses = {}
        synset_disambig = {}
        synset_uris = {}

        # First pass: find data about synsets
        with open(input_file, encoding='utf-8') as infile:
            for subj_dict, rel_dict, obj_dict, _graph in parse_nquads(infile):
                if 'url' not in subj_dict or 'url' not in rel_dict:
                    continue
                subj = subj_dict['url']
                rel = rel_dict['url']
                obj = obj_dict.get('url')
                objtext = obj_dict.get('text')

                relname = resource_name(rel)
                if relname == 'label':
                    if obj_dict['lang'] == 'en':
                        synset_labels[subj].append(objtext)
                elif relname == 'sameAs':
                    # An object without a URL is None; skip it rather than
                    # crash on None.startswith.
                    if obj is not None and obj.startswith(WN20_URL):
                        # If we have a link to RDF WordNet 2.0, the URL (URI?
                        # IRI?) will contain a standardized label for this
                        # concept, which we should use when we want to use
                        # this synset as the name of a disambiguation
                        # category. RDF WordNet 3.1 assigns synsets a number
                        # of labels in no particular order, making it hard to
                        # determine from 3.1 alone what to name a category.
                        objname = resource_name(obj)
                        parts = objname.split('-')[1:-2]

                        # Handle missing apostrophes
                        label = (
                            '-'.join(parts)
                            .replace('_s_', "'s_")
                            .replace('_s-', "'s_")
                            .replace("s__", "s'_")
                            .replace("s_-", "s'-")
                            .replace('_', ' ')
                        )
                        synset_canonical_labels[subj] = label

                elif relname == 'domain_category':
                    synset_categories[subj] = obj
                elif relname == 'lexical_domain':
                    target = resource_name(obj)
                    if '.' in target:
                        domain = target.split('.')[1]
                        synset_domains[subj] = domain
                elif relname == 'gloss':
                    synset_glosses[subj] = objtext
                elif relname == 'reference':
                    lemma = resource_name(subj)
                    synset = obj
                    synset_senses[synset].append(lemma)
                    sense_synsets[lemma] = synset

        # Choose a canonical label for synsets that have none yet, or whose
        # current one starts with an uppercase letter while the synset's
        # lexical domain is 'person'. Labels not used so far sort first.
        used_labels = set(synset_canonical_labels.values())
        for synset, values in synset_labels.items():
            values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
            if (
                synset not in synset_canonical_labels or
                synset_canonical_labels[synset][0].isupper() and synset_domains.get(synset) == 'person'
            ):
                label = values[0]
                synset_canonical_labels[synset] = label
                used_labels.add(label)

        # Assign a ConceptNet URI to every labelled synset and link its
        # non-canonical labels to the canonical one.
        for synset, labels in synset_labels.items():
            if synset in synset_categories:
                category_name = synset_canonical_labels[synset_categories[synset]]
            else:
                category_name = synset_domains.get(synset, None)
            # The part of speech is the last character before any '#'.
            synset_no_fragment = synset.split('#')[0]
            pos = synset_no_fragment[-1].lower()
            assert pos in 'nvarsp', synset
            if pos == 's':
                pos = 'a'
            elif pos == 'p':
                pos = '-'
            if category_name in ('pert', 'all', 'tops'):
                category_name = None
            synset_disambig[synset] = (pos, category_name)

            canon = synset_canonical_labels[synset]
            canon_uri = standardized_concept_uri('en', canon, pos, 'wn', category_name)
            synset_uris[synset] = canon_uri

            for label in labels:
                if label != canon:
                    other_uri = standardized_concept_uri('en', label, pos, 'wn', category_name)
                    rel_uri = '/r/Synonym'
                    surface = '[[{0}]] is a synonym of [[{1}]]'.format(label, canon)
                    edge = make_edge(
                        rel_uri, other_uri, canon_uri, dataset=DATASET, surfaceText=surface,
                        license=Licenses.cc_attribution, sources=[SOURCE], weight=2.0
                    )
                    out.write(edge)

        # Second pass: turn the mapped relations into edges
        with open(input_file, encoding='utf-8') as infile:
            for subj_dict, rel_dict, obj_dict, _graph in parse_nquads(infile):
                if 'url' not in subj_dict or 'url' not in rel_dict:
                    continue
                subj = subj_dict['url']
                rel = rel_dict['url']
                obj = obj_dict.get('url')
                relname = resource_name(rel)
                if relname not in REL_MAPPING:
                    continue

                rel, frame = REL_MAPPING[relname]
                # A leading '~' means subject and object are swapped below.
                reversed_frame = False
                if rel.startswith('~'):
                    rel = rel[1:]
                    reversed_frame = True
                rel_uri = '/r/' + rel
                if obj is not None:
                    # NOTE(review): obj_uri is None when obj has a canonical
                    # label but no entry in synset_uris.
                    obj_uri = synset_uris.get(obj)
                    if obj not in synset_canonical_labels:
                        continue
                    obj_label = synset_canonical_labels[obj]
                else:
                    text = obj_dict['text']
                    # Some WordNets use strings with "!" in them to indicate
                    # out-of-band information, such as a missing translation
                    if (not text) or '!' in text:
                        continue
                    lang = obj_dict['lang']
                    pos, sense = synset_disambig.get(subj, (None, None))
                    obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
                    obj_label = text

                if subj not in synset_uris or subj not in synset_canonical_labels:
                    continue
                subj_uri = synset_uris[subj]
                subj_label = synset_canonical_labels[subj]
                # Named edge_license so the builtin `license` is not shadowed.
                edge_license = Licenses.cc_attribution
                langcode = subj_uri.split('/')[2]
                if langcode in SHAREALIKE_LANGUAGES:
                    edge_license = Licenses.cc_sharealike

                if reversed_frame:
                    subj_uri, obj_uri = obj_uri, subj_uri
                    subj_label, obj_label = obj_label, subj_label

                surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

                edge = make_edge(
                    rel_uri, subj_uri, obj_uri, dataset=DATASET, surfaceText=surface,
                    license=edge_license, sources=[SOURCE], weight=2.0
                )
                out.write(edge)

        for wn_url in sorted(synset_uris):
            cn_uri = synset_uris[wn_url]
            edge = make_edge(
                '/r/ExternalURL', cn_uri, wn_url, dataset=DATASET,
                license=Licenses.cc_sharealike, sources=[SOURCE], weight=1.0
            )
            out.write(edge)
    finally:
        # Close the writer even when a pass fails part-way through.
        out.close()
예제 #6
0
def run_wordnet(input_file, output_file):
    """
    Convert an RDF WordNet N-Quads file into ConceptNet edges.

    The input is read twice. The first pass gathers per-synset data (English
    labels, a canonical label, a category and a lexical domain). From that,
    each labelled synset gets a ConceptNet URI, and '/r/Synonym' edges are
    written from its other labels to its canonical label. The second pass
    writes one edge for every quad whose relation name is in `REL_MAPPING`;
    a 'hypernym' relation on a verb synset is looked up as 'hypernym-v'.
    Finally, one '/r/ExternalURL' edge per synset links its ConceptNet URI
    to its WordNet URL.

    Parameters:
        input_file: path of the N-Quads file, opened as UTF-8 text.
        output_file: destination passed to `MsgpackStreamWriter`.
    """
    out = MsgpackStreamWriter(output_file)
    try:
        # NOTE(review): synset_senses, sense_synsets and synset_glosses are
        # filled in by the first pass but never read in this function.
        synset_senses = defaultdict(list)
        sense_synsets = {}
        synset_labels = defaultdict(list)
        synset_canonical_labels = {}
        synset_categories = {}
        synset_domains = {}
        synset_glosses = {}
        synset_disambig = {}
        synset_uris = {}

        # First pass: find data about synsets
        with open(input_file, encoding='utf-8') as infile:
            for subj_dict, rel_dict, obj_dict, _graph in parse_nquads(infile):
                if 'url' not in subj_dict or 'url' not in rel_dict:
                    continue
                subj = subj_dict['url']
                rel = rel_dict['url']
                obj = obj_dict.get('url')
                objtext = obj_dict.get('text')

                relname = resource_name(rel)
                if relname == 'label':
                    if obj_dict['lang'] == 'en':
                        synset_labels[subj].append(objtext)
                elif relname == 'sameAs':
                    # An object without a URL is None; skip it rather than
                    # crash on None.startswith.
                    if obj is not None and obj.startswith(WN20_URL):
                        # If we have a link to RDF WordNet 2.0, the URL (URI?
                        # IRI?) will contain a standardized label for this
                        # concept, which we should use when we want to use
                        # this synset as the name of a disambiguation
                        # category. RDF WordNet 3.1 assigns synsets a number
                        # of labels in no particular order, making it hard to
                        # determine from 3.1 alone what to name a category.
                        objname = resource_name(obj)
                        parts = objname.split('-')[1:-2]

                        # Handle missing apostrophes
                        label = (
                            '-'.join(parts)
                            .replace('_s_', "'s_")
                            .replace('_s-', "'s_")
                            .replace("s__", "s'_")
                            .replace("s_-", "s'-")
                            .replace('_', ' ')
                        )
                        synset_canonical_labels[subj] = label

                elif relname == 'domain_category':
                    synset_categories[subj] = obj
                elif relname == 'lexical_domain':
                    target = resource_name(obj)
                    if '.' in target:
                        domain = target.split('.')[1]
                        synset_domains[subj] = domain
                elif relname == 'gloss':
                    synset_glosses[subj] = objtext
                elif relname == 'reference':
                    lemma = resource_name(subj)
                    synset = obj
                    synset_senses[synset].append(lemma)
                    sense_synsets[lemma] = synset

        # Choose a canonical label for synsets that have none yet, or whose
        # current one starts with an uppercase letter while the synset's
        # lexical domain is 'person'. Labels not used so far sort first.
        used_labels = set(synset_canonical_labels.values())
        for synset, values in synset_labels.items():
            values.sort(
                key=lambda label: (label in used_labels, ) + label_sort_key(label))
            if (synset not in synset_canonical_labels
                    or synset_canonical_labels[synset][0].isupper()
                    and synset_domains.get(synset) == 'person'):
                label = values[0]
                synset_canonical_labels[synset] = label
                used_labels.add(label)

        # Assign a ConceptNet URI to every labelled synset and link its
        # non-canonical labels to the canonical one.
        for synset, labels in synset_labels.items():
            if synset in synset_categories:
                category_name = synset_canonical_labels[synset_categories[synset]]
            else:
                category_name = synset_domains.get(synset, None)
            # The part of speech is the last character before any '#'.
            synset_no_fragment = synset.split('#')[0]
            pos = synset_no_fragment[-1].lower()
            assert pos in 'nvarsp', synset
            if pos == 's':
                pos = 'a'
            elif pos == 'p':
                pos = '-'
            if category_name in ('pert', 'all', 'tops'):
                category_name = None
            synset_disambig[synset] = (pos, category_name)

            canon = synset_canonical_labels[synset]
            canon_uri = standardized_concept_uri('en', canon, pos, 'wn',
                                                 category_name)
            synset_uris[synset] = canon_uri

            for label in labels:
                if label != canon:
                    other_uri = standardized_concept_uri('en', label, pos, 'wn',
                                                         category_name)
                    rel_uri = '/r/Synonym'
                    surface = '[[{0}]] is a synonym of [[{1}]]'.format(
                        label, canon)
                    edge = make_edge(rel_uri,
                                     other_uri,
                                     canon_uri,
                                     dataset=DATASET,
                                     surfaceText=surface,
                                     license=Licenses.cc_attribution,
                                     sources=[SOURCE],
                                     weight=2.0)
                    out.write(edge)

        # Second pass: turn the mapped relations into edges
        with open(input_file, encoding='utf-8') as infile:
            for subj_dict, rel_dict, obj_dict, _graph in parse_nquads(infile):
                if 'url' not in subj_dict or 'url' not in rel_dict:
                    continue
                subj = subj_dict['url']
                rel = rel_dict['url']
                obj = obj_dict.get('url')
                relname = resource_name(rel)
                if relname not in REL_MAPPING:
                    continue

                pos, sense = synset_disambig.get(subj, (None, None))
                # Verb hypernyms have their own entry in REL_MAPPING.
                if relname == 'hypernym' and pos == 'v':
                    relname = 'hypernym-v'
                rel, frame = REL_MAPPING[relname]
                # A leading '~' means subject and object are swapped below.
                reversed_frame = False
                if rel.startswith('~'):
                    rel = rel[1:]
                    reversed_frame = True
                rel_uri = '/r/' + rel
                if obj is not None:
                    # NOTE(review): obj_uri is None when obj has a canonical
                    # label but no entry in synset_uris.
                    obj_uri = synset_uris.get(obj)
                    if obj not in synset_canonical_labels:
                        continue
                    obj_label = synset_canonical_labels[obj]
                else:
                    text = obj_dict['text']
                    # Some WordNets use strings with "!" in them to indicate
                    # out-of-band information, such as a missing translation
                    if (not text) or '!' in text:
                        continue
                    lang = obj_dict['lang']
                    obj_uri = standardized_concept_uri(lang, text, pos, 'wn',
                                                       sense)
                    obj_label = text

                if subj not in synset_uris or subj not in synset_canonical_labels:
                    continue
                subj_uri = synset_uris[subj]
                subj_label = synset_canonical_labels[subj]
                # Named edge_license so the builtin `license` is not shadowed.
                edge_license = Licenses.cc_attribution
                langcode = subj_uri.split('/')[2]
                if langcode in SHAREALIKE_LANGUAGES:
                    edge_license = Licenses.cc_sharealike

                if reversed_frame:
                    subj_uri, obj_uri = obj_uri, subj_uri
                    subj_label, obj_label = obj_label, subj_label

                surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

                edge = make_edge(rel_uri,
                                 subj_uri,
                                 obj_uri,
                                 dataset=DATASET,
                                 surfaceText=surface,
                                 license=edge_license,
                                 sources=[SOURCE],
                                 weight=2.0)
                out.write(edge)

        for wn_url in sorted(synset_uris):
            cn_uri = synset_uris[wn_url]
            edge = make_edge('/r/ExternalURL',
                             cn_uri,
                             wn_url,
                             dataset=DATASET,
                             license=Licenses.cc_sharealike,
                             sources=[SOURCE],
                             weight=1.0)
            out.write(edge)
    finally:
        # Close the writer even when a pass fails part-way through.
        out.close()
예제 #7
0
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.

    Three files are read from `input_dir`:

    - 'interlanguage_links_en.tql.bz2' gives, through
      `interlanguage_mapping`, the set of subject URLs to keep and the URLs
      each one is linked to.
    - 'instance_types_en.tql.bz2' yields '/r/IsA' edges, plus
      '/r/ExternalURL' and '/r/Synonym' edges for the linked URLs.
    - 'mappingbased_objects_en.tql.bz2' yields edges for relations listed in
      `RELATIONS` whose subject and object are both kept URLs.

    Parameters:
        input_dir: directory holding the three .bz2 files.
        output_file: destination passed to `MsgpackStreamWriter`.
        concept_file: passed to `read_concept_file` to get the accepted
            concepts.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    # Values shared by every edge written below.
    dataset = '/d/dbpedia/en'
    contributor = '/s/resource/dbpedia/2015/en'
    skipped_markers = ('Category:', 'File:', 'List_of', '__', 'Template:')

    out = MsgpackStreamWriter(output_file)
    try:
        types_path = input_path / 'instance_types_en.tql.bz2'
        with bz2.open(str(types_path), 'rt') as infile:
            for subj, _pred, obj, _graph in parse_nquads(infile):
                subj_url = subj['url']
                if any(marker in subj_url for marker in skipped_markers):
                    continue
                if subj_url not in mapped_urls:
                    continue

                subj_concept = translate_dbpedia_url(subj_url)
                obj_type = un_camel_case(resource_name(obj['url']))
                if obj_type in TYPE_BLACKLIST:
                    continue

                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge(
                        '/r/IsA', subj_concept, obj_concept,
                        dataset=dataset,
                        license=Licenses.cc_sharealike,
                        sources=[{'contributor': contributor}],
                        weight=0.5,
                        surfaceStart=url_to_label(subj_url),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                    out.write(edge)

                # The linked URLs are written even when the IsA edge above
                # was dropped by CONCEPT_BLACKLIST.
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge(
                            '/r/ExternalURL',
                            subj_concept, other_url,
                            dataset=dataset,
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': contributor}],
                            weight=1.0
                        )
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept, other_url,
                                dataset=dataset,
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': contributor}],
                                weight=1.0
                            )
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept, subj_concept,
                                dataset=dataset,
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': contributor}],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url)
                            )
                            out.write(edge)

        relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
        with bz2.open(str(relations_path), 'rt') as infile:
            for subj, pred, obj, _graph in parse_nquads(infile):
                subj_concept = translate_dbpedia_url(subj['url'])
                obj_concept = translate_dbpedia_url(obj['url'])
                rel_name = resource_name(pred['url'])
                if (
                    subj_concept and obj_concept and
                    subj['url'] in mapped_urls and obj['url'] in mapped_urls
                ):
                    if rel_name in RELATIONS:
                        rel = RELATIONS[rel_name]
                        edge = make_edge(
                            rel, subj_concept, obj_concept,
                            dataset=dataset,
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': contributor}],
                            weight=0.5,
                            surfaceStart=url_to_label(subj['url']),
                            surfaceEnd=url_to_label(obj['url'])
                        )
                        out.write(edge)
    finally:
        # Close the writer even when reading fails part-way through.
        out.close()
예제 #8
0
def run_wordnet(input_file, output_file):
    """
    Convert an RDF WordNet N-Quads file into ConceptNet edges.

    The input is read twice. The first pass gathers per-synset data (English
    labels, a canonical label, a category and a lexical domain). From that,
    each labelled synset gets a ConceptNet URI, and "/r/Synonym" edges are
    written from its other labels to its canonical label. The second pass
    writes one edge for every quad whose relation name is in `REL_MAPPING`;
    a "hypernym" relation on a verb synset is looked up as "hypernym-v".
    Finally, one "/r/ExternalURL" edge per synset links its ConceptNet URI
    to its WordNet URL.

    Parameters:
        input_file: path of the N-Quads file, opened as UTF-8 text.
        output_file: destination passed to `MsgpackStreamWriter`.
    """
    out = MsgpackStreamWriter(output_file)
    try:
        # NOTE(review): synset_senses, sense_synsets and synset_glosses are
        # filled in by the first pass but never read in this function.
        synset_senses = defaultdict(list)
        sense_synsets = {}
        synset_labels = defaultdict(list)
        synset_canonical_labels = {}
        synset_categories = {}
        synset_domains = {}
        synset_glosses = {}
        synset_disambig = {}
        synset_uris = {}

        # First pass: find data about synsets
        with open(input_file, encoding="utf-8") as infile:
            for subj_dict, rel_dict, obj_dict, _graph in parse_nquads(infile):
                if "url" not in subj_dict or "url" not in rel_dict:
                    continue
                subj = subj_dict["url"]
                rel = rel_dict["url"]
                obj = obj_dict.get("url")
                objtext = obj_dict.get("text")

                relname = resource_name(rel)
                if relname == "label":
                    if obj_dict["lang"] == "en":
                        synset_labels[subj].append(objtext)
                elif relname == "sameAs":
                    # An object without a URL is None; skip it rather than
                    # crash on None.startswith.
                    if obj is not None and obj.startswith(WN20_URL):
                        # If we have a link to RDF WordNet 2.0, the URL (URI?
                        # IRI?) will contain a standardized label for this
                        # concept, which we should use when we want to use
                        # this synset as the name of a disambiguation
                        # category. RDF WordNet 3.1 assigns synsets a number
                        # of labels in no particular order, making it hard to
                        # determine from 3.1 alone what to name a category.
                        objname = resource_name(obj)
                        parts = objname.split("-")[1:-2]

                        # Handle missing apostrophes
                        label = (
                            "-".join(parts)
                            .replace("_s_", "'s_")
                            .replace("_s-", "'s_")
                            .replace("s__", "s'_")
                            .replace("s_-", "s'-")
                            .replace("_", " ")
                        )
                        synset_canonical_labels[subj] = label

                elif relname == "domain_category":
                    synset_categories[subj] = obj
                elif relname == "lexical_domain":
                    target = resource_name(obj)
                    if "." in target:
                        domain = target.split(".")[1]
                        synset_domains[subj] = domain
                elif relname == "gloss":
                    synset_glosses[subj] = objtext
                elif relname == "reference":
                    lemma = resource_name(subj)
                    synset = obj
                    synset_senses[synset].append(lemma)
                    sense_synsets[lemma] = synset

        # Choose a canonical label for synsets that have none yet, or whose
        # current one starts with an uppercase letter while the synset's
        # lexical domain is "person". Labels not used so far sort first.
        used_labels = set(synset_canonical_labels.values())
        for synset, values in synset_labels.items():
            values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
            if (
                synset not in synset_canonical_labels
                or synset_canonical_labels[synset][0].isupper()
                and synset_domains.get(synset) == "person"
            ):
                label = values[0]
                synset_canonical_labels[synset] = label
                used_labels.add(label)

        # Assign a ConceptNet URI to every labelled synset and link its
        # non-canonical labels to the canonical one.
        for synset, labels in synset_labels.items():
            if synset in synset_categories:
                category_name = synset_canonical_labels[synset_categories[synset]]
            else:
                category_name = synset_domains.get(synset, None)
            # The part of speech is the last character before any "#".
            synset_no_fragment = synset.split("#")[0]
            pos = synset_no_fragment[-1].lower()
            assert pos in "nvarsp", synset
            if pos == "s":
                pos = "a"
            elif pos == "p":
                pos = "-"
            if category_name in ("pert", "all", "tops"):
                category_name = None
            synset_disambig[synset] = (pos, category_name)

            canon = synset_canonical_labels[synset]
            canon_uri = standardized_concept_uri("en", canon, pos, "wn", category_name)
            synset_uris[synset] = canon_uri

            for label in labels:
                if label != canon:
                    other_uri = standardized_concept_uri(
                        "en", label, pos, "wn", category_name
                    )
                    rel_uri = "/r/Synonym"
                    surface = "[[{0}]] is a synonym of [[{1}]]".format(label, canon)
                    edge = make_edge(
                        rel_uri,
                        other_uri,
                        canon_uri,
                        dataset=DATASET,
                        surfaceText=surface,
                        license=Licenses.cc_attribution,
                        sources=[SOURCE],
                        weight=2.0,
                    )
                    out.write(edge)

        # Second pass: turn the mapped relations into edges
        with open(input_file, encoding="utf-8") as infile:
            for subj_dict, rel_dict, obj_dict, _graph in parse_nquads(infile):
                if "url" not in subj_dict or "url" not in rel_dict:
                    continue
                subj = subj_dict["url"]
                rel = rel_dict["url"]
                obj = obj_dict.get("url")
                relname = resource_name(rel)
                if relname not in REL_MAPPING:
                    continue

                pos, sense = synset_disambig.get(subj, (None, None))
                # Verb hypernyms have their own entry in REL_MAPPING.
                if relname == "hypernym" and pos == "v":
                    relname = "hypernym-v"
                rel, frame = REL_MAPPING[relname]
                # A leading "~" means subject and object are swapped below.
                reversed_frame = False
                if rel.startswith("~"):
                    rel = rel[1:]
                    reversed_frame = True
                rel_uri = "/r/" + rel
                if obj is not None:
                    # NOTE(review): obj_uri is None when obj has a canonical
                    # label but no entry in synset_uris.
                    obj_uri = synset_uris.get(obj)
                    if obj not in synset_canonical_labels:
                        continue
                    obj_label = synset_canonical_labels[obj]
                else:
                    text = obj_dict["text"]
                    # Some WordNets use strings with "!" in them to indicate
                    # out-of-band information, such as a missing translation
                    if (not text) or "!" in text:
                        continue
                    lang = obj_dict["lang"]
                    obj_uri = standardized_concept_uri(lang, text, pos, "wn", sense)
                    obj_label = text

                if subj not in synset_uris or subj not in synset_canonical_labels:
                    continue
                subj_uri = synset_uris[subj]
                subj_label = synset_canonical_labels[subj]
                # Named edge_license so the builtin `license` is not shadowed.
                edge_license = Licenses.cc_attribution
                langcode = subj_uri.split("/")[2]
                if langcode in SHAREALIKE_LANGUAGES:
                    edge_license = Licenses.cc_sharealike

                if reversed_frame:
                    subj_uri, obj_uri = obj_uri, subj_uri
                    subj_label, obj_label = obj_label, subj_label

                surface = frame.format("[[%s]]" % subj_label, "[[%s]]" % obj_label)

                edge = make_edge(
                    rel_uri,
                    subj_uri,
                    obj_uri,
                    dataset=DATASET,
                    surfaceText=surface,
                    license=edge_license,
                    sources=[SOURCE],
                    weight=2.0,
                )
                out.write(edge)

        for wn_url in sorted(synset_uris):
            cn_uri = synset_uris[wn_url]
            edge = make_edge(
                "/r/ExternalURL",
                cn_uri,
                wn_url,
                dataset=DATASET,
                license=Licenses.cc_sharealike,
                sources=[SOURCE],
                weight=1.0,
            )
            out.write(edge)
    finally:
        # Close the writer even when a pass fails part-way through.
        out.close()
예제 #9
0
def _dbpedia_edge(rel, start, end, weight, **surface):
    """
    Build one edge in the English DBPedia dataset.

    Every edge produced by this reader shares the same dataset
    ('/d/dbpedia/en'), license (ShareAlike) and single contributor source;
    only the relation, the two endpoints and the weight vary. Any extra
    keyword arguments (`surfaceStart`, `surfaceEnd`) are passed through to
    `make_edge` unchanged, and are simply omitted when not given.
    """
    return make_edge(
        rel,
        start,
        end,
        dataset='/d/dbpedia/en',
        license=Licenses.cc_sharealike,
        # Build a fresh list for every edge so edges never share a mutable
        # `sources` value.
        sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
        weight=weight,
        **surface
    )


def _write_instance_type_edges(out, types_path, mapped_urls):
    """
    Write the edges derived from the DBPedia instance-types file at
    `types_path` to `out`.

    Quads are skipped when the subject URL contains one of the markers
    'Category:', 'File:', 'List_of', '__' or 'Template:', when the subject
    is not a key of `mapped_urls`, or when the un-camel-cased type name is
    in TYPE_BLACKLIST. For each remaining quad:

    - an /r/IsA edge from the subject to its type is written, unless the
      type's concept URI is in CONCEPT_BLACKLIST;
    - for each URL in `mapped_urls[subj_url]`, either an /r/ExternalURL edge
      from the subject concept (for 'http://wikidata.dbpedia.org/' URLs), or
      an /r/ExternalURL edge plus an /r/Synonym edge for the concept that
      URL translates to (if it translates to anything).
    """
    skipped_markers = ('Category:', 'File:', 'List_of', '__', 'Template:')

    # Open explicitly as UTF-8 so decoding does not depend on the locale,
    # and use a context manager so the file is closed even on errors.
    with bz2.open(str(types_path), 'rt', encoding='utf-8') as types_file:
        for subj, _pred, obj, _graph in parse_nquads(types_file):
            subj_url = subj['url']
            if any(marker in subj_url for marker in skipped_markers):
                continue
            if subj_url not in mapped_urls:
                continue

            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type in TYPE_BLACKLIST:
                continue

            obj_concept = standardized_concept_uri('en', obj_type, 'n')
            if obj_concept not in CONCEPT_BLACKLIST:
                out.write(
                    _dbpedia_edge(
                        '/r/IsA',
                        subj_concept,
                        obj_concept,
                        0.5,
                        surfaceStart=url_to_label(subj_url),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                )

            # The links are written even when the IsA edge above was
            # suppressed by CONCEPT_BLACKLIST.
            for other_url in mapped_urls[subj_url]:
                if other_url.startswith('http://wikidata.dbpedia.org/'):
                    out.write(
                        _dbpedia_edge(
                            '/r/ExternalURL', subj_concept, other_url, 1.0
                        )
                    )
                    continue

                other_concept = translate_dbpedia_url(other_url)
                if not other_concept:
                    continue
                out.write(
                    _dbpedia_edge(
                        '/r/ExternalURL', other_concept, other_url, 1.0
                    )
                )
                out.write(
                    _dbpedia_edge(
                        '/r/Synonym',
                        other_concept,
                        subj_concept,
                        0.5,
                        surfaceStart=url_to_label(other_url),
                        surfaceEnd=url_to_label(subj_url)
                    )
                )


def _write_relation_edges(out, relations_path, mapped_urls):
    """
    Write the edges derived from the DBPedia mapping-based objects file at
    `relations_path` to `out`.

    An edge is written only when both endpoints translate to a concept, both
    endpoint URLs are keys of `mapped_urls`, and the predicate's resource
    name is a key of RELATIONS (whose value becomes the edge's relation).
    """
    with bz2.open(str(relations_path), 'rt',
                  encoding='utf-8') as relations_file:
        for subj, pred, obj, _graph in parse_nquads(relations_file):
            subj_url = subj['url']
            obj_url = obj['url']
            subj_concept = translate_dbpedia_url(subj_url)
            obj_concept = translate_dbpedia_url(obj_url)
            rel_name = resource_name(pred['url'])

            if not (subj_concept and obj_concept):
                continue
            if subj_url not in mapped_urls or obj_url not in mapped_urls:
                continue
            if rel_name not in RELATIONS:
                continue

            out.write(
                _dbpedia_edge(
                    RELATIONS[rel_name],
                    subj_concept,
                    obj_concept,
                    0.5,
                    surfaceStart=url_to_label(subj_url),
                    surfaceEnd=url_to_label(obj_url)
                )
            )


def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.

    `input_dir` must contain 'interlanguage_links_en.tql.bz2',
    'instance_types_en.tql.bz2' and 'mappingbased_objects_en.tql.bz2'.
    `concept_file` is read with `read_concept_file`, and its result is used
    by `interlanguage_mapping` to build the mapping of DBPedia URLs that
    restricts which assertions are kept.

    The output writer is closed when processing finishes, including when an
    error interrupts it.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)
    try:
        _write_instance_type_edges(
            out, input_path / 'instance_types_en.tql.bz2', mapped_urls
        )
        _write_relation_edges(
            out, input_path / 'mappingbased_objects_en.tql.bz2', mapped_urls
        )
    finally:
        out.close()