Example #1
def handle_file(input_filename, output_file):
    builder = CN4Builder(weight=0.05)
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        # Get a line from the file
        for new_obj in handle_line(line, builder):
            out.write(new_obj)
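
All of these examples follow the same pattern: construct a MsgpackStreamWriter on an output path, call write() once per object, and (ideally) close() when done. As a rough mental model only, here is a minimal sketch of such a writer built on the standard msgpack package; the real class in the ConceptNet codebase may differ in detail.

import msgpack


class MinimalMsgpackStreamWriter:
    """Sketch only: append each object to a file as one msgpack message."""

    def __init__(self, filename):
        self.stream = open(filename, 'wb')

    def write(self, obj):
        # Consecutive packb() outputs form a concatenated stream that
        # msgpack.Unpacker can decode incrementally.
        self.stream.write(msgpack.packb(obj, use_bin_type=True))

    def close(self):
        self.stream.close()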
Example #2
def subwords_to_edges(language, input, output):
    """
    Morfessor hypothesizes ways to break words into sub-word chunks. Produce
    edges from these sub-words that can be used in retrofitting.
    """
    writer = MsgpackStreamWriter(output)
    for line in input:
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Remove the unnecessary count ("1 ") from the start of each line
        line = line.split(' ', 1)[1]
        chunks = line.split(' + ')

        # Strip a possible trailing underscore, which would particularly show
        # up in the way we segment ATOMIC_SPACE_LANGUAGES (Vietnamese)
        full_text = ''.join(chunks).strip('_')
        end = join_uri('c', language, full_text)
        for chunk in chunks:
            if chunk != '_':
                start = join_uri('x', language, chunk.strip('_'))
                edge = make_edge(
                    '/r/SubwordOf',
                    start,
                    end,
                    dataset='/d/morphology',
                    license=Licenses.cc_attribution,
                    sources=MORPH_SOURCES,
                    weight=0.01,
                )
                writer.write(edge)
    writer.close()
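
To see what the parsing above does, take a hypothetical Morfessor output line (the word and its segmentation are invented for illustration):

line = '1 over + work + ed_'              # hypothetical Morfessor output
line = line.split(' ', 1)[1]              # drop the count: 'over + work + ed_'
chunks = line.split(' + ')                # ['over', 'work', 'ed_']
full_text = ''.join(chunks).strip('_')    # 'overworked'

Each chunk, minus its underscores, then becomes the start node of a /r/SubwordOf edge pointing at the full word.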
Example #3
def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in codecs.open(input_filename, encoding='utf-8'):
        line = line.strip()
        if line:
            for new_obj in handle_raw_assertion(line):
                out.write(new_obj)
Example #4
def handle_file(input_filename, output_file):
    builder = CN4Builder()
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        # Get a line from the file
        for new_obj in handle_line(line, builder):
            out.write(new_obj)
Example #5
def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    out_stream = MsgpackStreamWriter(output_filename)
    for obj in read_json_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
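
A hedged usage sketch, with invented file names; read_msgpack_stream (which appears in later examples) is assumed to be the counterpart reader from the same formats module.

json_to_msgpack('edges.jsons', 'edges.msgpack')   # hypothetical paths
for obj in read_msgpack_stream('edges.msgpack'):  # read the converted stream back
    print(obj)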
Example #6
def handle_file(input_file, output_file):
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    lang = root[0][1].attrib['type']
    for annotation in root[1]:
        for word in strip_words(annotation.text):
            start = standardized_concept_uri('mul', annotation.attrib['cp'])
            end = standardized_concept_uri(lang, word)
            edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
            out.write(edge)
Example #7
def test_msgpack_to_json():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')

        writer = MsgpackStreamWriter(msgpack_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        msgpack_to_json(msgpack_path, json_path)
        reader = read_json_stream(json_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)
Example #8
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(
            open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(
            open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels:
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(
                opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label,
                             obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif (rel_name == 'sameAs' and web_subj in labels and web_obj is not None
              and web_obj.startswith('http://umbel.org/')):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
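
The loops above unpack each quad into dictionaries. As a hedged illustration of the shape this code assumes (the URLs and text are invented), a label quad would look roughly like:

subj = {'url': 'http://sw.opencyc.org/concept/Mx4rExample'}
pred = {'url': 'http://www.w3.org/2000/01/rdf-schema#label'}  # i.e. RDF_LABEL
obj = {'text': 'example concept'}
_graph = None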
Example #9
def run_wiktionary(input_file, output_file, titledb=None, language='en',
                   verbosity=0, logger=None):
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language, titledb=titledb, trace=trace,
                              logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)
Example #10
def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels:
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif rel_name == 'sameAs' and web_subj in labels and web_obj is not None and web_obj.startswith('http://umbel.org/'):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
Example #11
def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        parts = line.rstrip('\n').split('\t')
        uri, start, rel, end, weight, source = parts
        if uri == 'uri':
            continue  # skip the header row

        edge = make_edge(
            rel=rel,
            start=start,
            end=end,
            dataset=DATASET,
            sources=[{'activity': SOURCE}],
            license=Licenses.cc_attribution,
            weight=WEIGHT_TABLE[weight],
        )
        out.write(edge)
Example #12
def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        parts = line.rstrip('\n').split('\t')
        uri, start, rel, end, weight, source = parts
        if uri == 'uri':
            # 'return' here would abort on the header row; skip just that line
            continue

        edge = make_edge(rel=rel,
                         start=start,
                         end=end,
                         dataset=DATASET,
                         sources=[{
                             'activity': SOURCE
                         }],
                         license=Licenses.cc_attribution,
                         weight=WEIGHT_TABLE[weight])
        out.write(edge)
Example #13
def handle_file(input_file, output_file):
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    lang = root[0][1].attrib['type']  # language is at position [1] within the child node [0]

    if len(root) >= 2:
        for annotation in root[1]:
            for word in strip_words(annotation.text):
                start = standardized_concept_uri('mul', annotation.attrib['cp'])
                end = standardized_concept_uri(lang, word)
                edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
                out.write(edge)
    else:
        print("No emoji data in {!r}".format(input_file))

    out.close()
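
The index expressions root[0][1] and root[1] assume a CLDR-style annotation layout. A runnable sketch of that assumed structure (the content is invented):

import xml.etree.ElementTree as ET

sample = (
    '<ldml>'
    '<identity><version number="x"/><language type="en"/></identity>'
    '<annotations><annotation cp="🐶">dog | puppy</annotation></annotations>'
    '</ldml>'
)
root = ET.fromstring(sample)
print(root[0][1].attrib['type'])                 # 'en'
print(root[1][0].attrib['cp'], root[1][0].text)  # '🐶' 'dog | puppy'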
Example #14
def run_wiktionary(input_file,
                   output_file,
                   titledb=None,
                   language='en',
                   verbosity=0,
                   logger=None):
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language,
                              titledb=titledb,
                              trace=trace,
                              logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)
Example #15
def json_to_msgpack(input_filename, output_filename):
    out_stream = MsgpackStreamWriter(output_filename)
    for obj in read_json_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()
Example #16
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        if (
            'Category:' in subj_url or 'File:' in subj_url or
            'List_of' in subj_url or '__' in subj_url or
            'Template:' in subj_url
        ):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge(
                        '/r/IsA', subj_concept, obj_concept,
                        dataset='/d/dbpedia/en',
                        license=Licenses.cc_sharealike,
                        sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                        weight=0.5,
                        surfaceStart=url_to_label(subj['url']),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                    out.write(edge)
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge(
                            '/r/ExternalURL',
                            subj_concept, other_url,
                            dataset='/d/dbpedia/en',
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                            weight=1.0
                        )
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept, other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=1.0
                            )
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept, subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url)
                            )
                            out.write(edge)

    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (
            subj_concept and obj_concept and
            subj['url'] in mapped_urls and obj['url'] in mapped_urls
        ):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(
                    rel, subj_concept, obj_concept,
                    dataset='/d/dbpedia/en',
                    license=Licenses.cc_sharealike,
                    sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                    weight=0.5,
                    surfaceStart=url_to_label(subj['url']),
                    surfaceEnd=url_to_label(obj['url'])
                )
                out.write(edge)

    out.close()
Example #17
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {
            'contributor': web_source,
            'process': PARSER_RULE
        }

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            cpage = standardized_concept_uri(wlang, title)
            ld_edge = make_edge(
                '/r/ExternalURL', cpage, web_url,
                dataset=dataset, weight=0.25, sources=[source],
                license=Licenses.cc_sharealike
            )
            out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang for lang in language_etym_counts
            if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(
                language, tfrom, assumed_languages, db,
                use_etyms=(lang1 in polysemous_languages)
            )
            cpage = cfrom
            cto = transform_term(
                language, tto, assumed_languages, db,
                use_etyms=(lang2 in polysemous_languages)
            )

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(rel, cfrom, cto, dataset=dataset, weight=weight,
                             sources=[source],
                             surfaceStart=tfrom['text'],
                             surfaceEnd=tto['text'],
                             license=Licenses.cc_sharealike)
            out.write(edge)

    out.close()
Example #18
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a MsgpackStreamWriter.
    """
    out = MsgpackStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # As far as I can tell, GlobalMind used the same namespace of
        # usernames as the original Open Mind.
        user_source = "/s/contributor/omcs/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)

        # Avoid duplication with the ConceptNet reader, but still save every edge so that we can
        # handle translations.
        if username != 'openmind':
            out.write(edge)

        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
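
For orientation, the assertion loop above expects YAML records shaped roughly like this (field names are taken from the code; the values are invented):

assertion = {
    'pk': 1,
    'fields': {
        'frame': 10,       # key into the frames dict
        'author': 42,      # key into the users dict
        'lcode': 'en',     # key into LANG_CODES
        'node1': 'a dog',
        'node2': 'a pet',
    },
}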
Example #19
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(
            language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {'contributor': web_source, 'process': PARSER_RULE}

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            if valid_language(wlang):
                cpage = standardized_concept_uri(wlang, title)
                ld_edge = make_edge('/r/ExternalURL',
                                    cpage,
                                    web_url,
                                    dataset=dataset,
                                    weight=0.25,
                                    sources=[source],
                                    license=Licenses.cc_sharealike)
                out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang
            for lang in language_etym_counts if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1
                          not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2
                          not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(language,
                                   tfrom,
                                   assumed_languages,
                                   db,
                                   use_etyms=(lang1 in polysemous_languages))
            cpage = cfrom
            cto = transform_term(language,
                                 tto,
                                 assumed_languages,
                                 db,
                                 use_etyms=(lang2 in polysemous_languages))

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(rel,
                             cfrom,
                             cto,
                             dataset=dataset,
                             weight=weight,
                             sources=[source],
                             surfaceStart=tfrom['text'],
                             surfaceEnd=tto['text'],
                             license=Licenses.cc_sharealike)
            out.write(edge)

    out.close()
Example #20
 def transform_file(self, input_filename, output_file):
     out = MsgpackStreamWriter(output_file)
     for obj in read_json_stream(input_filename):
         for new_obj in self.handle_assertion(obj):
             out.write(new_obj)
Example #21
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        objtext = obj_dict.get("text")

        relname = resource_name(rel)
        if relname == "label":
            if obj_dict["lang"] == "en":
                synset_labels[subj].append(objtext)
        elif relname == "sameAs":
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split("-")[1:-2]

                # Handle missing apostrophes
                label = (
                    "-".join(parts)
                    .replace("_s_", "'s_")
                    .replace("_s-", "'s_")
                    .replace("s__", "s'_")
                    .replace("s_-", "s'-")
                    .replace("_", " ")
                )
                synset_canonical_labels[subj] = label

        elif relname == "domain_category":
            synset_categories[subj] = obj
        elif relname == "lexical_domain":
            target = resource_name(obj)
            if "." in target:
                domain = target.split(".")[1]
                synset_domains[subj] = domain
        elif relname == "gloss":
            synset_glosses[subj] = objtext
        elif relname == "reference":
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels
            or synset_canonical_labels[synset][0].isupper()
            and synset_domains.get(synset) == "person"
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split("#")[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in "nvarsp", synset
        if pos == "s":
            pos = "a"
        elif pos == "p":
            pos = "-"
        if category_name in ("pert", "all", "tops"):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri("en", canon, pos, "wn", category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri(
                    "en", label, pos, "wn", category_name
                )
                rel_uri = "/r/Synonym"
                surface = "[[{0}]] is a synonym of [[{1}]]".format(label, canon)
                edge = make_edge(
                    rel_uri,
                    other_uri,
                    canon_uri,
                    dataset=DATASET,
                    surfaceText=surface,
                    license=Licenses.cc_attribution,
                    sources=[SOURCE],
                    weight=2.0,
                )
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == "hypernym" and pos == "v":
                relname = "hypernym-v"
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith("~"):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = "/r/" + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict["text"]
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or "!" in text:
                    continue
                lang = obj_dict["lang"]
                obj_uri = standardized_concept_uri(lang, text, pos, "wn", sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split("/")[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format("[[%s]]" % subj_label, "[[%s]]" % obj_label)

            edge = make_edge(
                rel_uri,
                subj_uri,
                obj_uri,
                dataset=DATASET,
                surfaceText=surface,
                license=license,
                sources=[SOURCE],
                weight=2.0,
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            "/r/ExternalURL",
            cn_uri,
            wn_url,
            dataset=DATASET,
            license=Licenses.cc_sharealike,
            sources=[SOURCE],
            weight=1.0,
        )
        out.write(edge)

    out.close()
Example #22
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name('en', name) != standardized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri('en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri('en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
Example #23
def handle_file(filename, output_file):
    out = MsgpackStreamWriter(output_file)

    for line in gzip.open(filename, 'rt'):

        # skip the intro information
        if line.startswith('#'):
            continue

        # Parse the line to extract the traditional form, the simplified form and the English definitions
        traditional, simplified, definitions = re.match(LINE_REGEX, line).groups()

        # Make an edge between the traditional and simplified version
        edge = make_edge(
            rel='/r/Synonym',
            start=standardized_concept_uri('zh-Hant', traditional),
            end=standardized_concept_uri('zh-Hans', simplified),
            dataset=DATASET,
            license=LICENSE,
            sources=SOURCE,
        )
        out.write(edge)

        for definition in re.split(DEFINITIONS_REGEX, definitions):

            # Skip pronunciation information
            if 'Taiwan pr.' in definition or 'also pr.' in definition:
                continue

            # Check if the definition matches the "person" syntax, i.e. includes a date range
            person_match = re.match(DATE_RANGE_REGEX, definition)
            if person_match:
                persons = extract_person(person_match)
                for person in persons:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Check if a word is a measure word
            if definition.startswith('CL:'):
                related_words = extract_measure_words(definition)
                for word in related_words:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Remove clarifying information in parentheses
            definition = PAREN_REGEX.sub('', definition)

            # Handle variants/word forms and abbreviations
            if re.match(VARIANT_REGEX, definition) or re.match(ABBR_REGEX, definition):
                variants = extract_han_characters(definition)
                for variant in variants:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            if re.match(SEE_ALSO_REGEX, definition):
                references = extract_han_characters(definition)
                for reference in references:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

            # Remove 'lit.', 'fig.'
            definition = LIT_FIG_REGEX.sub('', definition)

            # Expand sth and sb
            definition = SB_REGEX.sub('someone', definition)
            definition = STH_REGEX.sub('something', definition)

            # Additional cleanups
            definition = remove_reference_syntax(definition)
            definition = remove_additional_info(definition)

            # Skip long definitions and make an edge out of remaining information
            if len(definition.split()) < 6:
                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hant', traditional),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)

                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hans', simplified),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)
Example #24
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(
            os.path.join(input_dir,
                         'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in ('wordnet-attribute.ttl', 'wordnet-causes.ttl',
                     'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
                     'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
                     'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
                     'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
                     'wordnet-substancemeronym.ttl',
                     'full/wordnet-antonym.ttl',
                     'full/wordnet-derivationallyrelated.ttl',
                     'full/wordnet-participleof.ttl',
                     'full/wordnet-pertainsto.ttl',
                     'full/wordnet-seealso.ttl'):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(
                    filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(rel,
                                 subj,
                                 obj,
                                 dataset='/d/wordnet/3.0',
                                 license='/l/CC/By',
                                 sources=SOURCE,
                                 weight=2.0)
                out.write(edge)
Example #25
def handle_file(filename, output_file):
    out = MsgpackStreamWriter(output_file)

    for line in gzip.open(filename, 'rt'):

        # skip the intro information
        if line.startswith('#'):
            continue

        # Parse the line to extract the traditional form, the simplified form and the English definitions
        traditional, simplified, definitions = re.match(LINE_REGEX,
                                                        line).groups()

        # Make an edge between the traditional and simplified version
        edge = make_edge(
            rel='/r/Synonym',
            start=standardized_concept_uri('zh-Hant', traditional),
            end=standardized_concept_uri('zh-Hans', simplified),
            dataset=DATASET,
            license=LICENSE,
            sources=SOURCE,
        )
        out.write(edge)

        for definition in re.split(DEFINITIONS_REGEX, definitions):

            # Skip pronunciation information
            if 'Taiwan pr.' in definition or 'also pr.' in definition:
                continue

            # Check if the definition matches a person entry, i.e. one that
            # includes a date range
            person_match = re.match(DATE_RANGE_REGEX, definition)
            if person_match:
                persons = extract_person(person_match)
                for person in persons:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Check if a word is a measure word
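            # (In CC-CEDICT, measure-word definitions take the form
            # 'CL:個|个[ge4]', listing the classifiers used with the noun.)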
            if definition.startswith('CL:'):
                related_words = extract_measure_words(definition)
                for word in related_words:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Remove clarifying information in parentheses
            definition = PAREN_REGEX.sub('', definition)

            # Handle variants/word forms and abbreviations
            if re.match(VARIANT_REGEX, definition) or re.match(
                    ABBR_REGEX, definition):
                variants = extract_han_characters(definition)
                for variant in variants:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            if re.match(SEE_ALSO_REGEX, definition):
                references = extract_han_characters(definition)
                for reference in references:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

            # Remove 'lit.', 'fig.'
            definition = LIT_FIG_REGEX.sub('', definition)

            # Expand sth and sb
            definition = SB_REGEX.sub('someone', definition)
            definition = STH_REGEX.sub('something', definition)

            # Additional cleanups
            definition = remove_reference_syntax(definition)
            definition = remove_additional_info(definition)

            # Skip long definitions and make an edge out of remaining information
            if len(definition.split()) < 6:
                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hant', traditional),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)

                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hans', simplified),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)
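
LINE_REGEX, DEFINITIONS_REGEX, and the other patterns used above are defined elsewhere in the module. A minimal sketch of the two central ones, assuming the standard CC-CEDICT line format 'Traditional Simplified [pin1 yin1] /definition 1/definition 2/':

import re

# Hypothetical reconstructions, for illustration only:
LINE_REGEX = re.compile(r'(\S+)\s(\S+)\s\[[^\]]+\]\s/(.+)/\s*$')
DEFINITIONS_REGEX = re.compile(r'/')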
Example No. 30
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
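                # For example, a WordNet 2.0 URL ending in
                # 'synset-kick_the_bucket-verb-1' yields the label
                # 'kick the bucket', and one ending in
                # 'synset-hobson_s_choice-noun-1' yields "hobson's choice"
                # once the apostrophe is restored below.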
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = (
                    '-'.join(parts)
                    .replace('_s_', "'s_")
                    .replace('_s-', "'s-")
                    .replace('s__', "s'_")
                    .replace('s_-', "s'-")
                    .replace('_', ' ')
                )
                synset_canonical_labels[subj] = label

        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels or
            (synset_canonical_labels[synset][0].isupper() and
             synset_domains.get(synset) == 'person')
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'
        if category_name in ('pert', 'all', 'tops'):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn', category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn', category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(label, canon)
                edge = make_edge(
                    rel_uri, other_uri, canon_uri, dataset=DATASET, surfaceText=surface,
                    license=Licenses.cc_attribution, sources=[SOURCE], weight=2.0
                )
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                pos, sense = synset_disambig.get(subj, (None, None))
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(
                rel_uri, subj_uri, obj_uri, dataset=DATASET, surfaceText=surface,
                license=license, sources=[SOURCE], weight=2.0
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            '/r/ExternalURL', cn_uri, wn_url, dataset=DATASET,
            license=Licenses.cc_sharealike, sources=[SOURCE], weight=1.0
        )
        out.write(edge)

    out.close()
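
label_sort_key, used above to choose a canonical label per synset, is defined elsewhere. A plausible sketch, assuming the goal is to prefer short, simple labels; the exact criteria are an assumption:

def label_sort_key(label):
    # Hypothetical sketch: prefer labels with fewer words, then shorter
    # labels, breaking ties alphabetically. It must return a tuple, since
    # the caller concatenates it with another tuple.
    return (label.count(' '), len(label), label)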
Example No. 31
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = (
                    '-'.join(parts)
                    .replace('_s_', "'s_")
                    .replace('_s-', "'s-")
                    .replace('s__', "s'_")
                    .replace('s_-', "s'-")
                    .replace('_', ' ')
                )
                synset_canonical_labels[subj] = label

        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(
            key=lambda label: (label in used_labels, ) + label_sort_key(label))
        if (synset not in synset_canonical_labels
                or (synset_canonical_labels[synset][0].isupper()
                    and synset_domains.get(synset) == 'person')):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'
        if category_name in ('pert', 'all', 'tops'):
            category_name = None
        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn',
                                             category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn',
                                                     category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(
                    label, canon)
                edge = make_edge(rel_uri,
                                 other_uri,
                                 canon_uri,
                                 dataset=DATASET,
                                 surfaceText=surface,
                                 license=Licenses.cc_attribution,
                                 sources=[SOURCE],
                                 weight=2.0)
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
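            # Verb hypernyms express "manner of" rather than "is a", so they
            # presumably map to a different ConceptNet relation than the
            # nominal 'hypernym' entry does.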
            if relname == 'hypernym' and pos == 'v':
                relname = 'hypernym-v'
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn',
                                                   sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(rel_uri,
                             subj_uri,
                             obj_uri,
                             dataset=DATASET,
                             surfaceText=surface,
                             license=license,
                             sources=[SOURCE],
                             weight=2.0)
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge('/r/ExternalURL',
                         cn_uri,
                         wn_url,
                         dataset=DATASET,
                         license=Licenses.cc_sharealike,
                         sources=[SOURCE],
                         weight=1.0)
        out.write(edge)

    out.close()
Example No. 32
def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)

    writer = MsgpackStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #             clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break

        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/DistinctFrom.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/DistinctFrom'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue

        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the square root of the score, divided by 10. All
        # divisions are floating point.
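        #
        # For example, a clue given 3 times (freq = 3) with a sounds-like
        # score of 0.2 and an orderscore of 400 gets:
        #   score  = (3*2 - 1) * (1 - 0.2) * (1 - 400/1000) = 5 * 0.8 * 0.6 = 2.4
        #   weight = 2.4 ** 0.5 / 10 ≈ 0.155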
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 1.:
            outcomes['low score'] += 1
            continue

        weight = score ** .5 / 10

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/process/split_words' to track that this
        # happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [word for word in right.split(' ') if word not in STOPWORDS]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            source = {'contributor': '/s/resource/verbosity'}
            if i > 0:
                source['process'] = '/s/process/split_words'

            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)

            count += 1
            outcomes['success'] += 1
            leftc = standardized_concept_uri('en', left)
            rightc = standardized_concept_uri('en', rightword)
            edge = make_edge(
                rel,
                leftc,
                rightc,
                dataset='/d/verbosity',
                license=Licenses.cc_attribution,
                sources=[source],
                surfaceText=text,
                weight=weight,
            )
            writer.write(edge)
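
sounds_like_score is defined elsewhere in the Verbosity reader. A minimal sketch of the idea, assuming a plain string-similarity heuristic (the real implementation may also compare pronunciations):

from difflib import SequenceMatcher

def sounds_like_score(target, clue):
    # Hypothetical sketch: return the best string similarity between the
    # target word and any word of the clue, on a 0..1 scale. High values
    # suggest the clue is a pun or rhyme rather than a semantic relation.
    best = 0.0
    for word in clue.split():
        similarity = SequenceMatcher(None, target.lower(), word.lower()).ratio()
        best = max(best, similarity)
    return best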
Example No. 33
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        if ('Category:' in subj_url or 'File:' in subj_url
                or 'List_of' in subj_url or '__' in subj_url
                or 'Template:' in subj_url):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge('/r/IsA',
                                     subj_concept,
                                     obj_concept,
                                     dataset='/d/dbpedia/en',
                                     license=Licenses.cc_sharealike,
                                     sources=[{
                                         'contributor':
                                         '/s/resource/dbpedia/2015/en'
                                     }],
                                     weight=0.5,
                                     surfaceStart=url_to_label(subj['url']),
                                     surfaceEnd=url_to_label(obj['url']))
                    out.write(edge)
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge('/r/ExternalURL',
                                            subj_concept,
                                            other_url,
                                            dataset='/d/dbpedia/en',
                                            license=Licenses.cc_sharealike,
                                            sources=[{
                                                'contributor':
                                                '/s/resource/dbpedia/2015/en'
                                            }],
                                            weight=1.0)
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL',
                                other_concept,
                                other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{
                                    'contributor':
                                    '/s/resource/dbpedia/2015/en'
                                }],
                                weight=1.0)
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym',
                                other_concept,
                                subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{
                                    'contributor':
                                    '/s/resource/dbpedia/2015/en'
                                }],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url))
                            out.write(edge)

    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (subj_concept and obj_concept and subj['url'] in mapped_urls
                and obj['url'] in mapped_urls):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(rel,
                                 subj_concept,
                                 obj_concept,
                                 dataset='/d/dbpedia/en',
                                 license=Licenses.cc_sharealike,
                                 sources=[{
                                     'contributor':
                                     '/s/resource/dbpedia/2015/en'
                                 }],
                                 weight=0.5,
                                 surfaceStart=url_to_label(subj['url']),
                                 surfaceEnd=url_to_label(obj['url']))
                out.write(edge)

    out.close()
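
translate_dbpedia_url, url_to_label, and interlanguage_mapping come from elsewhere in the codebase. A minimal sketch of the URL translation, assuming only English resource URLs need handling:

def translate_dbpedia_url(url):
    # Hypothetical sketch: turn a DBPedia resource URL such as
    # 'http://dbpedia.org/resource/Semantic_Web' into a ConceptNet URI
    # like '/c/en/semantic_web'. Returns None for URLs it can't handle.
    if '/resource/' not in url:
        return None
    label = url.rsplit('/resource/', 1)[1].replace('_', ' ')
    return standardized_concept_uri('en', label)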
Example No. 34
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
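            # e.g. 'a small (usually evergreen) tree' -> 'a small tree'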
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their synsets
                # first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of ConceptNet
                    # relations. Change the word 'meronym' to 'holonym' if
                    # necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)
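
PARTS_OF_SPEECH, which maps the part-of-speech component of a synset name to a one-letter code, is defined elsewhere; presumably something like:

# A plausible sketch of PARTS_OF_SPEECH; the exact keys are an assumption.
PARTS_OF_SPEECH = {
    'noun': 'n',
    'verb': 'v',
    'adjective': 'a',
    'adjectivesatellite': 'a',
    'adverb': 'r',
}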
Example No. 35
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name(
                        'en', name) != standardized_concept_name(
                            'en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri(
                            'en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(
                                name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of speech, so use
                            # '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri(
                                'en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(
                            umbel_edge('/r/Synonym', alt_label, main_label,
                                       surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(
            dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(
                web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(
                        umbel_edge(rel_uri, subj_uri, obj_uri, surface,
                                   LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
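
umbel_edge wraps make_edge with the Umbel-specific defaults used throughout this reader. A plausible sketch, hypothetical apart from the make_edge call pattern seen in the other examples:

def umbel_edge(rel, start, end, surface, source):
    # Hypothetical sketch: every Umbel edge shares the same dataset,
    # license, and weight; only the endpoints, surface text, and source
    # vary between call sites.
    return make_edge(
        rel, start, end,
        dataset='/d/umbel',
        license=Licenses.cc_attribution,
        sources=[source],
        surfaceText=surface,
        weight=1.0,
    )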
Example No. 36
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate
    it into ConceptNet 5 edges, and write those edges to disk using
    a MsgpackStreamWriter.
    """
    out = MsgpackStreamWriter(output_file)
    userdata = yaml.safe_load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.safe_load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.safe_load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # As far as I can tell, GlobalMind used the same namespace of
        # usernames as the original Open Mind.
        user_source = "/s/contributor/omcs/%s" % username

        sources = [user_source, "/s/activity/globalmind/assert"]

        lang = LANG_CODES[obj['lcode']]
        start = standardized_concept_uri(lang, obj['node1'])
        end = standardized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # Fix messy English: "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # Fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]',
                                      ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = (
            frametext.replace('//', '')
            .replace('[node1]', node1)
            .replace('[node2]', node2)
        )
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)

        # Avoid duplication with the ConceptNet reader, but still save every edge so that we can
        # handle translations.
        if username != 'openmind':
            out.write(edge)

        assertions[assertion['pk']] = edge

    translationdata = yaml.safe_load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1,
                                                             text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale,
                                                               username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [user_source, "/s/activity/globalmind/translate"]
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)
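
get_lang, used above when labeling translations, is defined elsewhere. A minimal sketch, assuming ConceptNet URIs of the form '/c/<lang>/<term>':

def get_lang(assertion):
    # Hypothetical sketch: '/c/en/dog' -> 'en'
    return assertion['start'].split('/')[2]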
Example No. 37
def json_to_msgpack(input_filename, output_filename):
    out_stream = MsgpackStreamWriter(output_filename)
    for obj in read_json_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()