def __init__(self, out_filename='wiktionary.json'):
    """Initialize parser state and open the output edge writer."""
    # SAX position flags
    self.inArticle = False
    self.inTitle = False
    # Accumulators for the page currently being read
    self.curTitle = ''
    self.curText = ''
    # Parsing context tracked while scanning article lines
    self.lang = None
    self.langcode = None
    self.curSense = None
    self.curRelation = None
    self.locales = []
    # Destination for the edges this parser emits
    self.writer = FlatEdgeWriter(out_filename)
def __init__(self, out_filename='wiktionary_ja.json'):
    """Initialize parser state and open the output edge writer."""
    # SAX position flags
    self.inArticle = False
    self.inTitle = False
    # Accumulators for the page currently being read
    self.curTitle = ''
    self.curText = ''
    # Parsing context tracked while scanning article lines
    self.lang = None
    self.langcode = None
    self.curSense = None
    self.curRelation = None
    self.locales = []
    # Destination for the edges this parser emits
    self.writer = FlatEdgeWriter(out_filename)
    # State machine for the non-sense-specific translation section:
    # None = outside, 1 = maybe starting, 2 = inside.
    self.nosensetrans = None  # non-sense-specific translation
def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Convert the WordNet 3.0 RDF (Turtle) dump under `input_dir` into
    ConceptNet edges written to `output_file`, plus a semantic-web URI ->
    ConceptNet URI mapping written to `sw_map_file`.
    """
    mapping = {}    # RDF synset/sense URI -> ConceptNet concept URI
    labels = {}     # RDF URI -> human-readable label
    prefixes = {}   # Turtle @prefix abbreviation -> full URI prefix
    glossary = {}   # synset URI -> cleaned-up gloss text
    synset_senses = defaultdict(list)       # synset URI -> [sense URIs]
    synset_sense_names = defaultdict(list)  # synset URI -> [sense labels]
    sense_name_synsets = defaultdict(list)  # sense label -> [synset URIs]
    sense_synsets = defaultdict(list)       # sense URI -> synset URI

    parts_of_speech = {
        'noun': 'n',
        'verb': 'v',
        'adjective': 'a',
        'adjectivesatellite': 'a',
        'adverb': 'r',
    }

    # WordNet relation name -> ConceptNet relation. A leading '~' means
    # subject and object must be swapped. NOTE(review): the '******' value
    # for sameVerbGroupAs looks like a placeholder -- confirm the intended
    # relation before relying on those edges.
    rel_mapping = {
        'attribute': 'Attribute',
        'causes': 'Causes',
        'classifiedByRegion': 'HasContext',
        'classifiedByUsage': 'HasContext',
        'classifiedByTopic': 'HasContext',
        'entails': 'Entails',
        'hyponymOf': 'IsA',
        'instanceOf': 'InstanceOf',
        'memberMeronymOf': 'MemberOf',
        'partMeronymOf': 'PartOf',
        'sameVerbGroupAs': '******',
        'similarTo': 'SimilarTo',
        'substanceMeronymOf': '~MadeOf',
        'antonymOf': 'Antonym',
        'derivationallyRelated': '~DerivedFrom',
        'pertainsTo': 'PertainsTo',
        'seeAlso': 'RelatedTo',
    }

    def resolve_prefix(entry):
        # Expand 'prefix:name' using the @prefix declarations seen so far.
        prefix, name = entry.split(':')
        return prefixes[prefix] + name

    def handle_line(line):
        """
        Get the (subj, pred, obj) parts of a line, unless it's a blank line
        or a prefix definition, in which case return None.
        """
        line = line.decode('utf-8').strip()
        if not line:
            return None
        parts = line.split(None, 2)
        if parts[0] == '@prefix':
            prefix = parts[1].strip(': ')
            value = parts[2].strip('<>. ')
            prefixes[prefix] = value
            return None
        return parts[0], parts[1], parts[2].strip('. ')

    def open_lines(*filenames):
        # Yield the lines of each file in turn, closing every file once it
        # is exhausted (the previous version leaked all the file handles).
        for filename in filenames:
            handle = open(filename)
            try:
                for line in handle:
                    yield line
            finally:
                handle.close()

    # First, get the human-readable label and gloss for every synset.
    for line in open_lines(
        input_dir + '/wordnet-synset.ttl',
        input_dir + '/full/wordnet-wordsensesandwords.ttl',
        input_dir + '/wordnet-glossary.ttl',
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'rdfs:label':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            labels[subj] = obj
        elif parts[1] == 'wn20schema:gloss':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            glossary[subj] = obj.split(';')[0]
            # Strip parenthesized asides, repeating so nested parentheses
            # are also removed. BUG FIX: the old loop condition was
            # "'(' in text and ')' in text", which spins forever on text
            # such as ") (" where the regex can never match; iterate to a
            # fixed point instead.
            while True:
                stripped = re.sub(r"\([^)]+\) ?", r"", glossary[subj])
                if stripped == glossary[subj]:
                    break
                glossary[subj] = stripped

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    for line in open_lines(
        input_dir + '/full/wordnet-wordsense-synset-relations.ttl',
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'wn20schema:containsWordSense':
            subj = resolve_prefix(parts[0])
            obj = resolve_prefix(parts[2].strip('. '))
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj
            sense_name = labels[obj]
            synset_sense_names[subj].append(sense_name)
            sense_name_synsets[sense_name].append(subj)

    # Assign every synset a disambiguation name: its cleaned-up gloss.
    # TODO: take into account domains, uniquely-named senses, etc.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = parts_of_speech[synset_pos]
        disambig = glossary[synset].replace('/', '_')
        node = make_concept_uri(synset_name, 'en', pos + '/' + disambig)
        if synset not in mapping:
            mapping[synset] = node

    # Map senses to the same nodes as their synsets.
    for sense, synset in sense_synsets.items():
        mapping[sense] = mapping[synset]

    sources = ['/s/wordnet/3.0']
    writer = FlatEdgeWriter(output_file)
    sw_map = FlatEdgeWriter(sw_map_file)
    sw_map_used = set()  # (semantic-web URI, ConceptNet URI) pairs written

    for line in open_lines(
        input_dir + '/wordnet-attribute.ttl',
        input_dir + '/wordnet-causes.ttl',
        input_dir + '/wordnet-classifiedby.ttl',
        input_dir + '/wordnet-entailment.ttl',
        input_dir + '/wordnet-hyponym.ttl',
        input_dir + '/wordnet-instances.ttl',
        input_dir + '/wordnet-membermeronym.ttl',
        input_dir + '/wordnet-partmeronym.ttl',
        input_dir + '/wordnet-sameverbgroupas.ttl',
        input_dir + '/wordnet-similarity.ttl',
        input_dir + '/wordnet-substancemeronym.ttl',
        input_dir + '/full/wordnet-antonym.ttl',
        input_dir + '/full/wordnet-derivationallyrelated.ttl',
        input_dir + '/full/wordnet-participleof.ttl',
        input_dir + '/full/wordnet-pertainsto.ttl',
        input_dir + '/full/wordnet-seealso.ttl',
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        web_subj = resolve_prefix(parts[0])
        web_rel = resolve_prefix(parts[1])
        web_obj = resolve_prefix(parts[2])
        subj = mapping[web_subj]
        obj = mapping[web_obj]
        pred_label = parts[1].split(':')[-1]
        if pred_label in rel_mapping:
            mapped = rel_mapping[pred_label]
            if mapped.startswith('~'):
                # Swapped relation: the meronym data becomes a holonym edge.
                subj, obj = obj, subj
                web_subj, web_obj = web_obj, web_subj
                web_rel = web_rel.replace('meronym', 'holonym')
                mapped = mapped[1:]
            pred = '/r/' + mapped
        else:
            pred = '/r/wordnet/' + pred_label

        # Record each semantic-web -> ConceptNet mapping only once.
        for sw_uri, cn_uri in ((web_rel, pred), (web_subj, subj),
                               (web_obj, obj)):
            if (sw_uri, cn_uri) not in sw_map_used:
                sw_map.write({'from': sw_uri, 'to': cn_uri})
                sw_map_used.add((sw_uri, cn_uri))

        edge = make_edge(
            pred, subj, obj, '/d/wordnet/3.0',
            license='/l/CC/By', sources=sources,
            context='/ctx/all', weight=2.0)
        writer.write(edge)

    writer.close()
    sw_map.close()
class FindTranslations(ContentHandler):
    """
    SAX handler that scans a Japanese Wiktionary XML dump and writes
    ConceptNet edges (translations and monolingual relations) through a
    FlatEdgeWriter.

    NOTE(review): LANGUAGE_HEADER, TRANS_TOP, TRANS_BOTTOM, TRANS,
    TRANS_TAG, CHINESE_TAG, LANGUAGES_3_TO_2, SOURCE, TRANSLATE,
    MONOLINGUAL, INTERLINGUAL, get_language_code, make_concept_uri_safe,
    make_edge and FlatEdgeWriter are defined elsewhere in this project.
    """

    def __init__(self, out_filename='wiktionary_ja.json'):
        # Current language section of the article being scanned
        self.lang = None
        self.langcode = None
        # SAX position flags
        self.inArticle = False
        self.inTitle = False
        # Current "pos/sense" disambiguation, None outside a translation block
        self.curSense = None
        self.curTitle = ''
        # Reset to a list in startElement('page'); the string here is only
        # the pre-first-page placeholder
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)
        self.nosensetrans = None # non-sense-specific translation

    def startElement(self, name, attrs):
        # Start of a <page> or <title> element in the XML dump.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        # End of a <page>: hand the accumulated wikitext to handleArticle.
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        # Accumulate character data into the title or the article body.
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out: article too large to be worth parsing
                self.inArticle = False

    def handleArticle(self, title, text):
        """Parse one article's wikitext, line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one wikitext line to the right extraction logic."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        # NOTE(review): trans_tag_match and chinese_match are computed but
        # never used in this method -- candidates for removal.
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if language_match:
            self.langcode = get_language_code(language_match.group(1))

        ### Get sense-specific translation
        if trans_top_match: # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos+'/'+sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match: # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{': # get translation
            lang = line[5:].split('}')[0] # get language of translation
            if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language; the first [[..]] link
            # is skipped
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:]
            for translation in translations: # iterate over translations
                self.output_sense_translation(lang, translation, title, \
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='): # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode: # within relation part
            if line.startswith('*'): # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn': # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv': # derivative
                        self.curRelation = 'Derivative'
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation, \
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo' # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match:
            self.nosensetrans = 1 # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2 # start non-sense-specific translation
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')

    def output_monolingual(self, lang, relation, term1, term2):
        """Write a monolingual relation edge between term1 and term2."""
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return
        if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/'+relation, source, target,
                         '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated,
                                 disambiguation):
        """Write a TranslationOf edge from a foreign word to its translation."""
        # Skip namespaced titles (e.g. "Wiktionary:") and templates.
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri_safe(
            translated, self.langcode, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            # Show only the sense part of "pos/sense" in the surface text.
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (
                foreign, surfaceRel, translated,
                disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel,
                                                translated)
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, japanese, locale=''):
        """Write a TranslationOf edge from a foreign word to Japanese."""
        source = make_concept_uri_safe(
            unicodedata.normalize('NFKC', foreign),
            self.langcode+locale
        )
        target = make_concept_uri_safe(
            japanese, 'ja'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
""" Get data from DBPedia. """ __author__ = 'Justin Venezuela ([email protected]), Rob Speer ([email protected])' from metanl.english import normalize_topic, un_camel_case from conceptnet5.nodes import make_concept_uri, normalize_uri from conceptnet5.edges import make_edge, MultiWriter, FlatEdgeWriter import urllib import urllib2 source = '/s/web/dbpedia.org' WRITER_NUM = 1 writer = MultiWriter('dbpedia.%d' % WRITER_NUM) sw_map = FlatEdgeWriter('data/sw/dbpedia.map.json') sw_map_used = set() def cycle_writer(): global writer, WRITER_NUM writer.close() WRITER_NUM += 1 writer = MultiWriter('dbpedia.%d' % WRITER_NUM) def translate_wp_url(url): url = urllib.unquote(url).decode('utf-8', 'ignore') return un_camel_case(url.strip('/').split('/')[-1].split('#')[-1])
def run_verbosity(infile, outfile):
    """
    Convert tab-separated Verbosity game data in `infile` into ConceptNet
    edges written to `outfile`.

    Expected fields per line: left term, relation phrase, right clue,
    frequency, order score. Clues are filtered out (flagged words, too
    short, trailing single letters, editing instructions, too similar in
    sound to the left term), then scored; survivors become /r/RelatedTo
    or /r/Antonym edges.
    """
    count = 0
    counts = defaultdict(int)   # tally of why lines were kept or skipped
    text_similarities = []      # all sounds-like scores, for inspection
    sources = ['/s/site/verbosity']
    writer = FlatEdgeWriter(outfile)

    input_lines = open(infile)
    try:
        for line in input_lines:
            parts = line.strip().split('\t')
            # BUG FIX: ''.split('\t') returns [''], which is truthy, so the
            # old `if not parts` check never caught blank lines and the
            # 5-way unpacking below raised ValueError. Require all five
            # fields explicitly.
            if len(parts) < 5:
                counts['blank'] += 1
                continue
            left, relation, right, freq, orderscore = parts[:5]

            # Discard clues containing flagged words.
            flagged = False
            for rword in right.split():
                if bad_regex_no_biscuit.match(rword):
                    flagged = True
                    break
            if flagged:
                counts['flag word'] += 1
                continue
            if len(right) < 3:
                counts['clue too short'] += 1
                continue
            if len(right.split()[-1]) == 1:
                # Clue ends in a single letter: probably spelling the answer.
                counts['letter'] += 1
                continue
            if right.startswith(('add', 'delete', 'remove')):
                # Looks like an instruction to the game, not a clue.
                counts['flag word'] += 1
                continue

            freq = int(freq)
            orderscore = int(orderscore)
            rel = '/r/RelatedTo'
            reltext = 'is related to'
            if right.startswith('not '):
                rel = '/r/Antonym'
                right = right[4:]
                reltext = 'is not'
            if relation == 'it is the opposite of':
                rel = '/r/Antonym'
                reltext = 'is the opposite of'

            # Consider the whole clue, plus each of its words if multi-word.
            rightwords = [right]
            if ' ' in right:
                rightwords.extend(right.split(' '))

            sls = sounds_like_score(left, right)
            text_similarities.append(sls)
            if sls > 0.35:
                counts['text similarity'] += 1
                continue

            for i, rightword in enumerate(rightwords):
                edge_sources = list(sources)
                if i > 0:
                    # Words split out of a multi-word clue carry an extra
                    # provenance marker.
                    # NOTE(review): edge_sources is assembled but the
                    # make_edge call below still passes `sources` -- confirm
                    # whether edge_sources was meant to be used.
                    edge_sources.append('/s/rule/split_words')
                text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
                sls = sounds_like_score(left, rightword)
                text_similarities.append(sls)
                if sls > 0.35:
                    counts['text similarity'] += 1
                    continue
                score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000
                if score <= 0:
                    counts['low score'] += 1
                    continue
                weight = score / 100.0
                count += 1
                counts['success'] += 1
                leftc = make_concept_uri(unicode(left), 'en')
                rightc = make_concept_uri(unicode(rightword), 'en')
                edge = make_edge(rel, leftc, rightc, '/d/verbosity',
                                 '/l/CC/By', sources, surfaceText=text,
                                 weight=weight)
                writer.write(edge)
    finally:
        input_lines.close()
# break # if disambig is None: # disambig = glossary[synset] #if disambig is None: # disambig = '*' node = make_concept_uri(synset_name, 'en', pos+'/'+disambig) if synset not in mapping: mapping[synset] = node # Map senses to the same nodes. for sense, synset in sense_synsets.items(): mapping[sense] = mapping[synset] sources = ['/s/wordnet/3.0'] writer = MultiWriter('wordnet3') sw_map = FlatEdgeWriter('data/sw/wordnet30.map.json') sw_map_used = set() for line in chain( open('raw_data/wordnet-attribute.ttl'), open('raw_data/wordnet-causes.ttl'), open('raw_data/wordnet-classifiedby.ttl'), open('raw_data/wordnet-entailment.ttl'), open('raw_data/wordnet-hyponym.ttl'), open('raw_data/wordnet-instances.ttl'), open('raw_data/wordnet-membermeronym.ttl'), open('raw_data/wordnet-partmeronym.ttl'), open('raw_data/wordnet-sameverbgroupas.ttl'), open('raw_data/wordnet-similarity.ttl'), open('raw_data/wordnet-substancemeronym.ttl'), open('raw_data/full/wordnet-antonym.ttl'),
class FindTranslations(ContentHandler):
    """
    SAX handler that scans the English Wiktionary XML dump and writes
    ConceptNet edges (translations, definitions-as-translations, and
    monolingual relations) through a FlatEdgeWriter.

    NOTE(review): LANGUAGE_HEADER, TRANS_TOP, TRANS_TAG, CHINESE_TAG,
    WIKILINK, LANGUAGES, PARTS_OF_SPEECH, SOURCE, TRANSLATE, MONOLINGUAL,
    INTERLINGUAL, filter_line, ascii_enough, langs, make_concept_uri,
    make_edge and FlatEdgeWriter are defined elsewhere in this project.
    """

    def __init__(self, out_filename='wiktionary.json'):
        # Current language section of the article being scanned
        self.lang = None
        self.langcode = None
        # SAX position flags
        self.inArticle = False
        self.inTitle = False
        # Current "pos/sense" disambiguation, None outside a translation block
        self.curSense = None
        self.curTitle = ''
        # Reset to a list in startElement('page'); the string here is only
        # the pre-first-page placeholder
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)

    def startElement(self, name, attrs):
        # Start of a <page> or <title> element in the XML dump.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        # End of a <page>: hand the accumulated wikitext to handleArticle.
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        # Accumulate character data into the title or the article body.
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out: article too large to be worth parsing
                self.inArticle = False

    def handleArticle(self, title, text):
        """Parse one article's wikitext, line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one wikitext line to the right extraction logic."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            # A section header: may name a relation or a part of speech.
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                # NOTE(review): Wiktionary's header is usually plural
                # ("Antonyms") -- confirm this singular match is intended.
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # {{zh-...}} tag: 's' means simplified, 't' means traditional.
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and \
             self.lang is not None:
            # A definition line in a foreign-language section: treat the
            # English gloss as a translation.
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Horizontal rule separates language sections: reset state.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            # {{trans-top}}: start of a translation block for one sense.
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos+'/'+sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            # A bullet inside a relation section: link it to the title.
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write a monolingual relation edge between term1 and term2."""
        # Skip Wiktionary-namespace links and templates.
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/'+relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english,
                                 disambiguation):
        """Write a TranslationOf edge from a foreign word to English."""
        # Skip Wiktionary-namespace links and templates.
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
            english, 'en', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # Show only the sense part of "pos/sense" in the surface text.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign word to English."""
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign),
            self.langcode+locale
        )
        target = make_concept_uri(
            english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
class FindTranslations(ContentHandler):
    """
    SAX handler that scans the English Wiktionary XML dump and writes
    ConceptNet edges (translations, definitions-as-translations, and
    monolingual relations) through a FlatEdgeWriter.

    NOTE(review): this class re-defines FindTranslations, shadowing an
    identical earlier definition in this file -- the duplicates should be
    consolidated. LANGUAGE_HEADER, TRANS_TOP, TRANS_TAG, CHINESE_TAG,
    WIKILINK, LANGUAGES, PARTS_OF_SPEECH, SOURCE, TRANSLATE, MONOLINGUAL,
    INTERLINGUAL, filter_line, ascii_enough, langs, make_concept_uri,
    make_edge and FlatEdgeWriter are defined elsewhere in this project.
    """

    def __init__(self, out_filename='wiktionary.json'):
        # Current language section of the article being scanned
        self.lang = None
        self.langcode = None
        # SAX position flags
        self.inArticle = False
        self.inTitle = False
        # Current "pos/sense" disambiguation, None outside a translation block
        self.curSense = None
        self.curTitle = ''
        # Reset to a list in startElement('page'); the string here is only
        # the pre-first-page placeholder
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)

    def startElement(self, name, attrs):
        # Start of a <page> or <title> element in the XML dump.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        # End of a <page>: hand the accumulated wikitext to handleArticle.
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        # Accumulate character data into the title or the article body.
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out: article too large to be worth parsing
                self.inArticle = False

    def handleArticle(self, title, text):
        """Parse one article's wikitext, line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one wikitext line to the right extraction logic."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            # A section header: may name a relation or a part of speech.
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            elif pos == 'Antonym':
                # NOTE(review): Wiktionary's header is usually plural
                # ("Antonyms") -- confirm this singular match is intended.
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
                if pos in PARTS_OF_SPEECH:
                    self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            # {{zh-...}} tag: 's' means simplified, 't' means traditional.
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and \
             self.lang is not None:
            # A definition line in a foreign-language section: treat the
            # English gloss as a translation.
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # Horizontal rule separates language sections: reset state.
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            # {{trans-top}}: start of a translation block for one sense.
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            # A bullet inside a relation section: link it to the title.
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write a monolingual relation edge between term1 and term2."""
        # Skip Wiktionary-namespace links and templates.
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english,
                                 disambiguation):
        """Write a TranslationOf edge from a foreign word to English."""
        # Skip Wiktionary-namespace links and templates.
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  lang)
        target = make_concept_uri(english, 'en', disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # Show only the sense part of "pos/sense" in the surface text.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign word to English."""
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)
class FindTranslations(ContentHandler):
    """
    SAX handler that scans a Japanese Wiktionary XML dump and writes
    ConceptNet edges (translations and monolingual relations) through a
    FlatEdgeWriter.

    NOTE(review): this class re-defines FindTranslations, shadowing an
    identical earlier definition in this file -- the duplicates should be
    consolidated. LANGUAGE_HEADER, TRANS_TOP, TRANS_BOTTOM, TRANS,
    TRANS_TAG, CHINESE_TAG, LANGUAGES_3_TO_2, SOURCE, TRANSLATE,
    MONOLINGUAL, INTERLINGUAL, get_language_code, make_concept_uri_safe,
    make_edge and FlatEdgeWriter are defined elsewhere in this project.
    """

    def __init__(self, out_filename='wiktionary_ja.json'):
        # Current language section of the article being scanned
        self.lang = None
        self.langcode = None
        # SAX position flags
        self.inArticle = False
        self.inTitle = False
        # Current "pos/sense" disambiguation, None outside a translation block
        self.curSense = None
        self.curTitle = ''
        # Reset to a list in startElement('page'); the string here is only
        # the pre-first-page placeholder
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = FlatEdgeWriter(out_filename)
        self.nosensetrans = None  # non-sense-specific translation

    def startElement(self, name, attrs):
        # Start of a <page> or <title> element in the XML dump.
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        # End of a <page>: hand the accumulated wikitext to handleArticle.
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        # Accumulate character data into the title or the article body.
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out: article too large to be worth parsing
                self.inArticle = False

    def handleArticle(self, title, text):
        """Parse one article's wikitext, line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one wikitext line to the right extraction logic."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        # NOTE(review): trans_tag_match and chinese_match are computed but
        # never used in this method -- candidates for removal.
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if language_match:
            self.langcode = get_language_code(language_match.group(1))

        ### Get sense-specific translation
        if trans_top_match:  # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos + '/' + sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match:  # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{':  # get translation
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language; the first [[..]] link
            # is skipped
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:]
            for translation in translations:  # iterate over translations
                self.output_sense_translation(lang, translation, title, \
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='):  # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode:  # within relation part
            if line.startswith('*'):  # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn':  # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv':  # derivative
                        self.curRelation = 'Derivative'
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation, \
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo'  # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match:
            self.nosensetrans = 1  # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2  # start non-sense-specific translation
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')

    def output_monolingual(self, lang, relation, term1, term2):
        """Write a monolingual relation edge between term1 and term2."""
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return
        if lang in LANGUAGES_3_TO_2:  # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated,
                                 disambiguation):
        """Write a TranslationOf edge from a foreign word to its translation."""
        # Skip namespaced titles (e.g. "Wiktionary:") and templates.
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(unicodedata.normalize('NFKC', foreign),
                                       lang)
        target = make_concept_uri_safe(translated, self.langcode,
                                       disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            # Show only the sense part of "pos/sense" in the surface text.
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (
                foreign, surfaceRel, translated,
                disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel,
                                                translated)
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, TRANSLATE],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, japanese, locale=''):
        """Write a TranslationOf edge from a foreign word to Japanese."""
        source = make_concept_uri_safe(unicodedata.normalize('NFKC', foreign),
                                       self.langcode + locale)
        target = make_concept_uri_safe(japanese, 'ja')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA',
                         sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all',
                         weight=1.0,
                         surfaceText=surfaceText)
        self.writer.write(edge)