import os
import pathlib
import sqlite3
from collections import Counter

# Import paths below assume the conceptnet5 package layout. Helpers such as
# make_tables, add_title, add_form, etym_label, disambiguate_language,
# transform_relation, segmented_stream, and PARSER_RULE are defined elsewhere
# in this module.
from conceptnet5.edges import make_edge
from conceptnet5.formats.json_stream import read_json_stream
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
from conceptnet5.languages import ALL_LANGUAGES, valid_language
from conceptnet5.nodes import standardized_concept_uri
from conceptnet5.uri import Licenses, uri_prefix


def prepare_db(inputs, dbfile):
    """
    Build a SQLite database that extracts some information from our parsed
    versions of Wiktionary. This is information that is needed by later
    reader steps, such as which words are known in which languages, and
    which words are forms of other words.
    """
    # If the database already exists, delete it first
    try:
        os.unlink(dbfile)
    except FileNotFoundError:
        pass

    db = sqlite3.connect(dbfile)
    make_tables(db)
    try:
        for filename in inputs:
            filepath = pathlib.Path(filename)
            file_language = filepath.name.split('.')[0]
            for item in read_json_stream(filename):
                if 'rel' in item:
                    tfrom = item['from']
                    tto = item['to']

                    # For all non-definition relations, record the fact that
                    # the given entry name exists in the given language. We'll
                    # use these to disambiguate definitions later.
                    if item['rel'] != 'definition':
                        if 'language' in tfrom and valid_language(tfrom['language']):
                            add_title(
                                db, file_language, tfrom['language'], tfrom['text']
                            )
                        if 'language' in tto and valid_language(tto['language']):
                            add_title(db, file_language, tto['language'], tto['text'])

                    # Record word forms so we can build a lemmatizer from them.
                    if item['rel'].startswith('form/'):
                        form_name = item['rel'][5:]
                        # Look for the part of speech, first in the 'from' term,
                        # then in the 'to' term.
                        pos = tfrom.get('pos', tto.get('pos', '?'))

                        # Use only Etymology 1 entries for learning word forms.
                        if (tfrom.get('etym') or '1') == '1':
                            language = tfrom.get('language', tto.get('language'))
                            if (
                                valid_language(language)
                                and tfrom['text'] != tto['text']
                            ):
                                add_form(
                                    db,
                                    file_language,
                                    language,
                                    tfrom['text'],
                                    pos,
                                    tto['text'],
                                    form_name,
                                )
        db.commit()
    finally:
        db.close()
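
# The table layout below is a hypothetical sketch, inferred only from the
# add_title and add_form calls in prepare_db above; the real make_tables,
# add_title, and add_form are defined elsewhere in this module and may
# differ. The *_sketch names are illustrative, not part of the codebase.
def make_tables_sketch(db):
    # One row per (site language, entry language, title): this title exists.
    db.execute(
        "CREATE TABLE titles (site_language text, language text, title text)"
    )
    db.execute(
        "CREATE UNIQUE INDEX titles_uniq "
        "ON titles (site_language, language, title)"
    )
    # One row per word form, linking an inflected word to its root form.
    db.execute(
        "CREATE TABLE forms (site_language text, language text, word text, "
        "pos text, root text, form text)"
    )


def add_title_sketch(db, file_language, language, title):
    # INSERT OR IGNORE deduplicates repeated sightings of the same title.
    db.execute(
        "INSERT OR IGNORE INTO titles VALUES (?, ?, ?)",
        (file_language, language, title),
    )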
def transform_term(data_language, termdata, assumed_languages, db, use_etyms=True):
    text = termdata['text']
    # Sometimes - is used to fill a slot in a Wiktionary template where the
    # term would usually be. It typically means "don't show this part", with
    # the implication "the term in question is obvious from context".
    #
    # Context is hard, so let's just cope with a hyphen as the term by
    # discarding it.
    if text == '-':
        return None

    language = termdata.get('language')
    if language is None:
        language = disambiguate_language(text, assumed_languages, db)
    if not valid_language(language):
        return None

    # Remove unnecessary subtags from the Wiktionary language
    if '-' in language and language not in ALL_LANGUAGES:
        language = language.split('-')[0]

    if 'pos' not in termdata:
        return standardized_concept_uri(language, text)
    else:
        pos = termdata['pos']
        etym_sense = None
        if use_etyms:
            etym_sense = etym_label(data_language, termdata)
        if etym_sense is not None:
            return standardized_concept_uri(language, text, pos, 'wikt', etym_sense)
        else:
            return standardized_concept_uri(language, text, pos)
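
def _transform_term_examples(db):
    # Hypothetical demo, not part of the original module: it shows the two
    # main paths through transform_term, assuming a db built by prepare_db.
    # An explicit, valid language plus a POS yields a POS-tagged concept URI,
    # roughly '/c/en/dogs/n' (the exact text comes from
    # standardized_concept_uri).
    uri = transform_term(
        'en',
        {'text': 'dogs', 'language': 'en', 'pos': 'n'},
        ['en'],
        db,
        use_etyms=False,
    )
    # A bare hyphen is a Wiktionary placeholder term, so it is discarded.
    dropped = transform_term('en', {'text': '-'}, ['en'], db)
    return uri, dropped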
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have
    already been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {'contributor': web_source, 'process': PARSER_RULE}

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from']
            and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            if valid_language(wlang):
                cpage = standardized_concept_uri(wlang, title)
                ld_edge = make_edge(
                    '/r/ExternalURL',
                    cpage,
                    web_url,
                    dataset=dataset,
                    weight=0.25,
                    sources=[source],
                    license=Licenses.cc_sharealike,
                )
                out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang for lang in language_etym_counts if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(
                language,
                tfrom,
                assumed_languages,
                db,
                use_etyms=(lang1 in polysemous_languages),
            )
            cpage = cfrom
            cto = transform_term(
                language,
                tto,
                assumed_languages,
                db,
                use_etyms=(lang2 in polysemous_languages),
            )

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.0
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(
                rel,
                cfrom,
                cto,
                dataset=dataset,
                weight=weight,
                sources=[source],
                surfaceStart=tfrom['text'],
                surfaceEnd=tto['text'],
                license=Licenses.cc_sharealike,
            )
            out.write(edge)

    out.close()
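
# Hypothetical end-to-end invocation, not part of the original module. It
# assumes parsed Wiktionary streams named '<site language>.jsons' -- the site
# language is whatever precedes the first dot in the filename, which is what
# prepare_db reads. All paths here are illustrative.
if __name__ == '__main__':
    inputs = ['parsed/en.jsons', 'parsed/de.jsons']
    prepare_db(inputs, 'wiktionary.db')
    for stream in inputs:
        lang = pathlib.Path(stream).name.split('.')[0]
        read_wiktionary(
            stream, 'wiktionary.db', 'wiktionary-{}.msgpack'.format(lang)
        )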