Example #1
def can_skip(parts_dict):
    """
    Skip the kinds of data that we don't want to import from ConceptNet 4's
    database dump.

    The activity called 'testing' was actually collecting preliminary data for
    a dataset about subjective medical experiences. This data looks really out
    of place now.
    """
    lang = parts_dict['lang']
    if lang.startswith('zh'):
        # Chinese assertions from GlobalMind are not reliable enough. We'll get
        # Chinese from the `ptt_petgame` module instead.
        return True
    if lang == 'ja' and parts_dict["activity"] != 'nadya.jp':
        # Use Japanese data collected from nadya.jp, but not earlier attempts.
        return True
    if parts_dict["goodness"] < 1:
        return True
    if 'spatial concept' in parts_dict["startText"]:
        return True
    if not parts_dict["startText"] or not parts_dict["endText"]:
        return True
    if 'rubycommons' in parts_dict["activity"]:
        return True
    if 'Verbosity' in parts_dict["activity"]:
        return True
    if 'testing' in parts_dict["activity"]:
        return True
    if not (valid_concept_name(parts_dict["startText"])
            and valid_concept_name(parts_dict["endText"])):
        return True
    return False
Example #2
def can_skip(parts_dict):
    """
    Skip the kinds of data that we don't want to import from ConceptNet 4's
    database dump.

    The activity called 'testing' was actually collecting preliminary data for
    someone's dataset about subjective medical experiences. This data looks
    really out of place now.
    """
    lang = parts_dict["lang"]
    if lang.startswith("zh"):
        # Chinese assertions from GlobalMind are not reliable enough. We'll get
        # Chinese from the `ptt_petgame` module instead.
        return True
    if lang == "ja" and parts_dict["activity"] != "nadya.jp":
        # Use Japanese data collected from nadya.jp, but not earlier attempts.
        return True
    if parts_dict["goodness"] <= 1:
        return True
    if "spatial concept" in parts_dict["startText"]:
        return True
    if not parts_dict["startText"] or not parts_dict["endText"]:
        return True
    if "rubycommons" in parts_dict["activity"]:
        return True
    if "Verbosity" in parts_dict["activity"]:
        return True
    if "testing" in parts_dict["activity"]:
        return True
    if parts_dict["activity"] in ACTIVITY_BLACKLIST:
        return True
    if not (valid_concept_name(parts_dict["startText"])
            and valid_concept_name(parts_dict["endText"])):
        return True
    return False
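
For illustration, here is a hypothetical record in the shape can_skip() expects. The field names come from the dictionary lookups above; the values (and the assumption that 'omcs' is an acceptable activity) are invented for this sketch, not taken from the real ConceptNet 4 dump.

# A made-up parts_dict with the fields can_skip() reads. Assuming 'omcs' is not
# in ACTIVITY_BLACKLIST and valid_concept_name() accepts both texts,
# can_skip(example_parts) would return False, so this assertion would be kept.
example_parts = {
    'lang': 'en',
    'activity': 'omcs',
    'goodness': 3,
    'startText': 'a dog',
    'endText': 'an animal',
}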
Example #3
# Imports used directly by this function. The other names it relies on
# (MsgpackStreamWriter, get_list, parse_gloss, fix_context, convert_lang_code,
# standardized_concept_uri, output_edge, valid_concept_name, STRING_TYPE, and
# the *_TYPES constants) are assumed to be provided by the surrounding module.
import codecs

import xmltodict


def handle_file(filename, output_file):
    """
    JMdict is a Japanese translation dictionary, targeting multiple languages,
    released under a Creative Commons Attribution-ShareAlike license. That's
    awesome.

    It's released as a kind of annoying XML structure, using fancy XML features
    like entities, so in order to read it we need a full-blown XML parser. Python's
    built-in XML parsers are horrifying, so here we use the 'xmltodict' module, which
    is also horrifying but gets the job done.

    The majorly weird thing about xmltodict that we have to work around is that
    it gives results of different *types* when you get 0, 1, or many child nodes.
    This is what get_list is for.
    """
    # Read the XML file as UTF-8, and parse it into a dictionary.
    out = MsgpackStreamWriter(output_file)
    with codecs.open(filename, encoding='utf-8') as file:
        data = file.read()
    xml = xmltodict.parse(data)

    # The dictionary contains a list of <entry> tags.
    root_node = xml['JMdict']
    for entry in get_list(root_node, 'entry'):
        # From JMdict's documentation: "The kanji element, or in its absence,
        # the reading element, is the defining component of each entry."
        #
        # Quick summary of what's going on here: most Japanese words can be
        # written using kanji or kana.
        #
        # Kana are phonetic characters. Every word can be written in kana, in
        # one of two alphabets (hiragana or katakana). Words that are homonyms
        # have the same kana, unless they're written in different alphabets.
        #
        # Kanji are Chinese-based characters that are related to the meaning of
        # the word. They're compact and good at disambiguating homonyms, so
        # kanji are usually used as the canonical representation of a word.
        # However, some words have no kanji.
        #
        # The kana version of a word written in kanji is called its 'reading'.
        # Words that are pronounced differently in different contexts have
        # multiple readings.
        #
        # Okay, done with the intro to Japanese orthography. In JMdict, if
        # a word can be written in kanji, it has a <k_ele> element, containing
        # a <keb> element that contains the text. Every word also has an
        # <r_ele> element, containing one or more <reb> elements that are phonetic
        # readings of the word.
        #
        # We get the "defining text" of a word by taking its <keb> if it exists,
        # or all of its <reb>s if not. There's no way to tell which <reb> is the
        # most "defining" in the case where there's no <keb>.
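        #
        # For orientation, a simplified, hypothetical entry might look like
        # this (the word and its glosses are invented for illustration and are
        # not copied from JMdict):
        #
        #   <entry>
        #     <k_ele><keb>辞書</keb></k_ele>
        #     <r_ele><reb>じしょ</reb></r_ele>
        #     <sense>
        #       <gloss>dictionary</gloss>
        #       <gloss xml:lang="fre">dictionnaire</gloss>
        #     </sense>
        #   </entry>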
        headwords = [word['keb'] for word in get_list(entry, 'k_ele')]
        if not headwords:
            headwords = [word['reb'] for word in get_list(entry, 'r_ele')]

        # An entry might have different word senses that are translated
        # differently to other languages. Ideally, we'd remember that they're
        # different senses. However, we have no way to refer to the different
        # senses. So for now, we disregard word senses. One day we might have
        # a better overall plan for word senses in ConceptNet.
        senses = get_list(entry, 'sense')
        for sense_num, sense in enumerate(senses):
            # Glosses are translations of the word to different languages.
            # If the word is a loan-word, the foreign word it was derived from
            # will be marked with the <lsource> tag instead of <gloss>.
            #
            # Get all the glosses, including the lsource if it's there.
            glosses = get_list(sense, 'gloss') + get_list(sense, 'lsource')
            contexts = [
                fix_context(context)
                for context in get_list(sense, 'field')
            ]
            pos = '_'
            for pos_tag in get_list(sense, 'pos'):
                if pos_tag[:10] in NOUN_TYPES:
                    pos = 'n'
                elif pos_tag[:10] in VERB_TYPES:
                    pos = 'v'
                elif pos_tag[:10] in ADJ_TYPES:
                    pos = 'a'
                elif pos_tag[:10] in ADV_TYPES:
                    pos = 'r'

            for gloss in glosses:
                if '#text' in gloss:
                    # A gloss node might be marked with a 'lang' attribute. If so,
                    # xmltodict represents it as a dictionary with '#text' and
                    # '@xml:lang' elements.
                    text = parse_gloss(gloss['#text'])
                    lang = convert_lang_code(gloss['@xml:lang'])
                elif isinstance(gloss, STRING_TYPE):
                    # If there's no 'lang' attribute, the gloss is in English,
                    # and xmltodict gives it to us as a plain Unicode string.
                    lang = 'en'
                    text = parse_gloss(gloss)
                else:
                    # Anything else is a node we don't know how to interpret.
                    # Skip it rather than reusing `text` and `lang` from an
                    # earlier gloss (or hitting a NameError on the first one).
                    continue

                # If we parsed the node at all and the text looks good, then we can
                # add edges to ConceptNet.
                #
                # We don't want to deal with texts with periods (these might be
                # dictionary-style abbreviations, which are sort of unhelpful when
                # we can't expand them), and we also don't want to deal with texts
                # that are more than five words long.
                if (
                    text is not None and '.' not in text and
                    text.count(' ') <= 4 and valid_concept_name(text)
                ):
                    for head in headwords:
                        if len(senses) >= 2:
                            sensekey = '%d' % (sense_num + 1)
                            ja_concept = standardized_concept_uri('ja', head, pos, 'jmdict', sensekey)
                        else:
                            ja_concept = standardized_concept_uri('ja', head, pos)
                        other_concept = standardized_concept_uri(lang, text)
                        output_edge(out, '/r/Synonym', ja_concept, other_concept)

                        for context in contexts:
                            context_node = standardized_concept_uri('en', context)
                            output_edge(out, '/r/HasContext', ja_concept, context_node)
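
The handle_file() code above leans on a helper called get_list that isn't shown. Based only on the docstring's note that xmltodict returns different types for zero, one, or many child nodes, a minimal sketch of such a helper might look like this; the real helper's exact behavior is an assumption. Normalizing everything to a list is what lets the loops above treat the zero, one, and many cases uniformly.

def get_list(node, tag):
    """
    Return the children of `node` named `tag`, always as a list.

    This is an illustrative sketch, not the implementation handle_file() was
    written against.
    """
    value = node.get(tag)
    if value is None:
        # 0 children: xmltodict omits the key entirely.
        return []
    if isinstance(value, list):
        # Many children: xmltodict already gives us a list.
        return value
    # Exactly 1 child: xmltodict gives the bare value, so wrap it.
    return [value]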