def run_single_process():
    """Convert every RawAssertion in the database to edges and write them all."""
    output = MultiWriter('conceptnet4_nadya')
    for assertion in RawAssertion.objects.filter():
        for edge in handle_raw_assertion(assertion):
            output.write(edge)
def sum_assertions(file_index):
    """Aggregate duplicate assertions from one temp core file and write totals.

    Reads CURRENT_DIR/data/temp/core_<file_index>.txt (tab-separated edge
    rows), sums the weights of rows sharing a URI, and writes one summed
    edge per CC-By URI to the 'assertion_totals_core' writer.
    """
    weights = defaultdict(float)  # uri -> running weight total
    assertions = {}               # uri -> (rel, start, end, context, total so far)
    ccby = defaultdict(bool)      # uri -> True if relicensable as CC-By
    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        # Skip the header row; only aggregate the global '/ctx/all' context.
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            # reverb/wiktionary/dbpedia data is CC-By-SA only; everything
            # else can be released under plain CC-By.
            if not (dataset.startswith('/d/reverb') or
                    dataset.startswith('/d/wiktionary') or
                    dataset.startswith('/d/dbpedia')):
                ccby[uri] = True
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only core (CC-By) edges are written; the -SA writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
def run_single_process():
    """Parse every flat assertion file under ./raw_data/ and write the edges."""
    output = MultiWriter("conceptnet4_nadya")
    data_dir = "./raw_data/"
    for name in os.listdir(data_dir):
        stream = codecs.open(data_dir + name, encoding="utf-8", errors="replace")
        for raw_line in stream:
            for edge in handle_raw_flat_assertion(raw_line):
                output.write(edge)
def run_single_process():
    """Parse every raw assertion file under ./raw_data/ and write the edges."""
    output = MultiWriter('conceptnet4')
    data_dir = "./raw_data/"
    for name in os.listdir(data_dir):
        stream = codecs.open(data_dir + name, encoding='utf-8', errors='replace')
        for raw_line in stream:
            for edge in handle_raw_assertion(raw_line):
                output.write(edge)
def run_single_process():
    """Aggregate all lines from ./raw_data/, then emit edges per aggregated assertion."""
    output = MultiWriter('conceptnet4_zh')
    data_dir = "./raw_data/"
    # First pass: fold every input line into the module-level assertion_map.
    for name in os.listdir(data_dir):
        stream = codecs.open(data_dir + name, encoding='utf-8', errors='replace')
        for entry in stream:
            aggregate_assertion(entry)
    # Second pass: turn each aggregated (assertion, users) pair into edges.
    for assertion, users in assertion_map.items():
        for edge in handle_raw_assertion((assertion, users)):
            output.write(edge)
def build_core_from_csvs(csv_files):
    """Sum duplicate assertion weights across CSV files and write core totals.

    csv_files: iterable of tab-separated edge files. Rows sharing a URI have
    their weights accumulated; one summed edge per CC-By URI is written to
    the 'assertion_totals_core' writer.
    """
    weights = defaultdict(float)  # uri -> running weight total
    assertions = {}               # uri -> (rel, start, end, context, total so far)
    ccby = defaultdict(bool)      # uri -> True if relicensable as CC-By
    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
            # Skip the header row; only aggregate the global '/ctx/all' context.
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                # reverb/wiktionary/dbpedia data is CC-By-SA only.
                if not (dataset.startswith('/d/reverb') or
                        dataset.startswith('/d/wiktionary') or
                        dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True
    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # Only core (CC-By) edges are written; the -SA writer is disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
def sum_assertions(file_index):
    """Sum weights of duplicate assertions in one temp core file.

    Reads CURRENT_DIR/data/temp/core_<file_index>.txt, accumulates weights
    per assertion URI, and writes one summed edge per CC-By URI through
    the 'assertion_totals_core' MultiWriter.
    """
    weights = defaultdict(float)  # uri -> accumulated weight
    assertions = {}               # uri -> (rel, start, end, context, accumulated weight)
    ccby = defaultdict(bool)      # uri -> eligible for plain CC-By license
    for line in codecs.open(
            CURRENT_DIR + '/data/temp/core_' + str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
        # Ignore the header row and any non-global context.
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            # Datasets outside reverb/wiktionary/dbpedia may be CC-By.
            if not (dataset.startswith('/d/reverb') or
                    dataset.startswith('/d/wiktionary') or
                    dataset.startswith('/d/dbpedia')):
                ccby[uri] = True
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # The CC-By-SA output path is currently disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
def build_core_from_csvs(csv_files):
    """Aggregate assertion weights across several CSV edge files.

    For each URI that appears in multiple rows, the weights are summed;
    a single combined edge per CC-By URI is written to
    'assertion_totals_core'.
    """
    weights = defaultdict(float)  # uri -> accumulated weight
    assertions = {}               # uri -> (rel, start, end, context, accumulated weight)
    ccby = defaultdict(bool)      # uri -> eligible for plain CC-By license
    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9]
            # Ignore the header row and any non-global context.
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                # Datasets outside reverb/wiktionary/dbpedia may be CC-By.
                if not (dataset.startswith('/d/reverb') or
                        dataset.startswith('/d/wiktionary') or
                        dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True
    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        # The CC-By-SA output path is currently disabled.
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
class FindTranslations(ContentHandler):
    """SAX handler that extracts translations and relations from the German
    Wiktionary dump and writes ConceptNet edges through a MultiWriter.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary')
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out of overly long articles
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one wiki article line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Dispatch one stripped wikitext line: track translation sections
        and relation templates, emitting edges as they are recognized."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        ### Get translation
        if trans_top_match:
            # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:
            # end translation part
            self.trans = False
        if self.trans and line.startswith('*{{'):
            # get translation: the two-letter language code follows '*{{'
            lang = line[3:5]
            # find all translations of that language
            translations = re.findall(u"\\{\\{Ü.*?\\|.*?\\|(.*?)\\}\\}", line)
            for translation in translations:
                self.output_sense_translation(lang, translation, title, '')

        ### Get relation
        if line.startswith('{{Synonyme}}'):  # synonym
            self.curRelation = 'synonym'
        elif line.startswith(u'{{Gegenwörter}}'):  # antonym
            self.curRelation = 'antonym'
        elif line.startswith('{{Oberbegriffe}}'):  # hypernym
            self.curRelation = 'hypernym'
        elif line.startswith('{{Unterbegriffe}}'):  # hyponym
            self.curRelation = 'hyponym'
        elif line.startswith('{{Redewendungen}}'):  # idiom
            self.curRelation = 'idiom'
        elif line.startswith('{{Charakteristische Wortkombinationen}}'):
            # word combination
            self.curRelation = 'word combination'
        elif line.startswith('{{Wortbildungen}}'):  # morphology
            self.curRelation = 'morphology'

        if self.curRelation and line == '':
            # a blank line ends the relation section
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual('deu', self.curRelation,
                                        related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one German-internal relation edge (term1 -relation-> term2)."""
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA', sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        """Write a TranslationOf edge: `foreign` (in `lang`) -> `german`."""
        if 'Wik' in foreign or 'Wik' in german:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
            german, 'de', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # BUG FIX: the original referenced the undefined name `english`,
        # which raised NameError; the translated headword here is `german`.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, german,
            disambiguation.split('/')[-1].replace('_', ' '))
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA', sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign word to its English form."""
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), self.langcode + locale
        )
        target = make_concept_uri(
            english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA', sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000 if score <= 0: counts['low score'] += 1 weak_out.write(line) continue count += 1 counts['success'] += 1 good_out.write(line) if make_json: edge = make_edge(rel, left, right, '/d/verbosity', '/l/CC/By', sources, surfaceText=text, weight = score/10.0) writer.write(edge) if make_json: writer.close() flag_out.close() good_out.close() weak_out.close() similar_out.close() simout = open('data/output/similarity-scores.txt', 'w') for sim in text_similarities: print >> simout, sim simout.close()
class FindTranslations(ContentHandler):
    """SAX handler that walks the English Wiktionary dump, emitting
    translation and intra-language relation edges through a MultiWriter.
    """

    def __init__(self):
        self.lang = None          # current language section name
        self.langcode = None      # its ConceptNet language code
        self.inArticle = False
        self.inTitle = False
        self.curSense = None      # 'pos/sense' key for the open translation table
        self.curTitle = ''
        self.curText = ''
        self.locales = []         # Chinese script locales (_CN/_TW)
        self.curRelation = None   # relation implied by the current section
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """State machine over one stripped wikitext line."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            # NOTE(review): Wiktionary's section header is usually
            # 'Antonyms' (plural) — confirm this branch can ever fire.
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
            if pos in PARTS_OF_SPEECH:
                self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            # a definition line in a non-English section: treat the English
            # gloss as a translation of the headword
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # horizontal rule separates language sections; reset state
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one same-language relation edge (term1 -relation-> term2)."""
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA', sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        """Write a sense-specific TranslationOf edge: `foreign` -> `english`."""
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri(
            english, 'en', disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA', sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign headword to English."""
        source = make_concept_uri(
            unicodedata.normalize('NFKC', foreign), self.langcode + locale
        )
        target = make_concept_uri(
            english, 'en'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA', sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
class FindTranslations(ContentHandler):
    """SAX handler that walks the Japanese Wiktionary dump, emitting
    translation and relation edges through a MultiWriter.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None      # 'pos/sense' key for the open translation table
        self.curTitle = ''
        self.curText = ''
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter('wiktionary_ja')
        self.nosensetrans = None  # non-sense-specific translation (state: None/1/2)

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """State machine over one stripped wikitext line."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_match = TRANS.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if language_match:
            self.langcode = get_language_code(language_match.group(1))

        ### Get sense-specific translation
        if trans_top_match:
            # start translation part
            pos = self.pos or 'n'
            # get translation sense
            if trans_top_match.group(1):
                sense = trans_top_match.group(1).lstrip('|')
                self.curSense = pos + '/' + sense
                return
            else:
                self.curSense = pos
                return
        if trans_bottom_match:
            # end translation part
            self.curSense = None
            return
        if self.curSense and line[0:5] == '*[[{{':
            # get translation
            lang = line[5:].split('}')[0]  # get language of translation
            if lang in LANGUAGES_3_TO_2:
                # convert 3-letter code to 2-letter code
                lang = LANGUAGES_3_TO_2[lang]
            # find all translations of that language; the first wikilink
            # is the language name itself, so skip it with [1:]
            translations = re.findall(r"\[\[(.*?)\]\]", line)[1:]
            for translation in translations:
                # iterate over translations
                self.output_sense_translation(lang, translation, title,
                                              self.curSense)
            return

        ### Get relation
        if line.startswith('===={{rel}}===='):
            # start relation part
            self.curRelation = 'ConceptuallyRelatedTo'
            return
        if self.curRelation and self.langcode:
            # within relation part
            if line.startswith('*'):
                # get relation
                relations = re.findall(r"\{\{(.*?)\}\}", line)
                if len(relations) > 0:
                    if relations[0] == 'syn':  # synonym
                        self.curRelation = 'Synonym'
                    if relations[0] == 'drv':  # derivative
                        self.curRelation = 'Derivative'
                related_words = re.findall(r"\[\[(.*?)\]\]", line)
                for related_word in related_words:
                    self.output_monolingual(self.langcode, self.curRelation,
                                            related_word, title)
                self.curRelation = 'ConceptuallyRelatedTo'  # back to default
            else:
                self.curRelation = None

        ### Get non-sense-specific translation
        if trans_match:
            self.nosensetrans = 1  # *maybe* start non-sense-specific translation
        if self.nosensetrans == 1 and line.startswith('{{top}}'):
            self.nosensetrans = 2  # start non-sense-specific translation
        if self.nosensetrans == 2:
            if line.startswith('{{bottom}}'):
                self.nosensetrans = None
                return
            if line.startswith('*{{'):
                lang = line[3:].split('}')[0]
                if lang in LANGUAGES_3_TO_2:
                    # convert 3-letter code to 2-letter code
                    lang = LANGUAGES_3_TO_2[lang]
                translations = re.findall(r"\[\[(.*?)\]\]", line)
                for translation in translations:
                    self.output_sense_translation(lang, translation, title, '')

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one same-language relation edge (term1 -relation-> term2)."""
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return
        if lang in LANGUAGES_3_TO_2:
            # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri_safe(term1, lang)
        if self.pos:
            target = make_concept_uri_safe(term2, lang, self.pos)
        else:
            target = make_concept_uri_safe(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/ja/%s' % (lang),
                         license='/l/CC/By-SA', sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, translated, disambiguation):
        """Write a TranslationOf edge: `foreign` (in `lang`) -> `translated`."""
        if u':' in foreign or u':' in translated:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri_safe(
            unicodedata.normalize('NFKC', foreign), lang
        )
        target = make_concept_uri_safe(
            translated, self.langcode, disambiguation
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        if disambiguation and '/' in disambiguation:
            surfaceText = "[[%s]] %s [[%s (%s)]]" % (
                foreign, surfaceRel, translated,
                disambiguation.split('/')[-1].replace('_', ' '))
        else:
            surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
        #print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % (self.langcode),
                         license='/l/CC/By-SA', sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, japanese, locale=''):
        """Write a TranslationOf edge from a foreign headword to Japanese."""
        source = make_concept_uri_safe(
            unicodedata.normalize('NFKC', foreign), self.langcode + locale
        )
        target = make_concept_uri_safe(
            japanese, 'ja'
        )
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, japanese)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/ja/%s' % self.langcode,
                         license='/l/CC/By-SA', sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
class FindTranslations(ContentHandler):
    """SAX handler that extracts translations and relations from the German
    Wiktionary dump and writes ConceptNet edges through a MultiWriter.
    """

    def __init__(self):
        self.lang = None
        self.langcode = None
        self.inArticle = False
        self.inTitle = False
        self.curSense = None
        self.curTitle = ""
        self.curText = ""
        self.locales = []
        self.curRelation = None
        self.writer = MultiWriter("wiktionary")
        self.trans = False  # in translation mode

    def startElement(self, name, attrs):
        if name == "page":
            self.inArticle = True
            self.curText = []
        elif name == "title":
            self.inTitle = True
            self.curTitle = ""

    def endElement(self, name):
        if name == "page":
            self.inArticle = False
            self.handleArticle(self.curTitle, "".join(self.curText))
        elif name == "title":
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out of overly long articles
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line."""
        lines = text.split("\n")
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """Track translation sections and relation templates on one line,
        emitting edges as they are recognized."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_bottom_match = TRANS_BOTTOM.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)

        ### Get translation
        if trans_top_match:
            # start translation part
            self.trans = True
        if self.trans and trans_bottom_match:
            # end translation part
            self.trans = False
        if self.trans and line.startswith("*{{"):
            # get translation: the two-letter language code follows '*{{'
            lang = line[3:5]
            # find all translations of that language
            translations = re.findall(u"\\{\\{Ü.*?\\|.*?\\|(.*?)\\}\\}", line)
            for translation in translations:
                self.output_sense_translation(lang, translation, title, "")

        ### Get relation
        if line.startswith("{{Synonyme}}"):  # synonym
            self.curRelation = "synonym"
        elif line.startswith(u"{{Gegenwörter}}"):  # antonym
            self.curRelation = "antonym"
        elif line.startswith("{{Oberbegriffe}}"):  # hypernym
            self.curRelation = "hypernym"
        elif line.startswith("{{Unterbegriffe}}"):  # hyponym
            self.curRelation = "hyponym"
        elif line.startswith("{{Redewendungen}}"):  # idiom
            self.curRelation = "idiom"
        elif line.startswith("{{Charakteristische Wortkombinationen}}"):
            # word combination
            self.curRelation = "word combination"
        elif line.startswith("{{Wortbildungen}}"):  # morphology
            self.curRelation = "morphology"

        if self.curRelation and line == "":
            # a blank line ends the relation section
            self.curRelation = None
        if self.curRelation:
            related_words_or_phrases = re.findall(r"\[\[(.*?)\]\]", line)
            for related_word in related_words_or_phrases:
                self.output_monolingual("deu", self.curRelation,
                                        related_word, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one German-internal relation edge (term1 -relation-> term2)."""
        if "Wik" in term1 or "Wik" in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        edge = make_edge(
            "/r/" + relation,
            source,
            target,
            "/d/wiktionary/%s/%s" % (lang, lang),
            license="/l/CC/By-SA",
            sources=[SOURCE, MONOLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, german, disambiguation):
        """Write a TranslationOf edge: `foreign` (in `lang`) -> `german`."""
        if "Wik" in foreign or "Wik" in german:
            return
        if lang == "zh-cn":
            lang = "zh_CN"
        elif lang == "zh-tw":
            lang = "zh_TW"
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign), lang)
        target = make_concept_uri(german, "de", disambiguation)
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        # BUG FIX: the original referenced the undefined name `english`,
        # which raised NameError; the translated headword here is `german`.
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign,
            surfaceRel,
            german,
            disambiguation.split("/")[-1].replace("_", " "),
        )
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % lang,
            license="/l/CC/By-SA",
            sources=[SOURCE, TRANSLATE],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=""):
        """Write a TranslationOf edge from a foreign word to its English form."""
        source = make_concept_uri(unicodedata.normalize("NFKC", foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, "en")
        relation = "/r/TranslationOf"
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(
            relation,
            source,
            target,
            "/d/wiktionary/en/%s" % self.langcode,
            license="/l/CC/By-SA",
            sources=[SOURCE, INTERLINGUAL],
            context="/ctx/all",
            weight=1.5,
            surfaceText=surfaceText,
        )
        self.writer.write(edge)
class FindTranslations(ContentHandler):
    """SAX handler for the English Wiktionary dump; emits translation and
    relation edges through a MultiWriter (debug prints enabled).
    """

    def __init__(self):
        self.lang = None          # current language section name
        self.langcode = None      # its ConceptNet language code
        self.inArticle = False
        self.inTitle = False
        self.curSense = None      # 'pos/sense' key for the open translation table
        self.curTitle = ''
        self.curText = ''
        self.locales = []         # Chinese script locales (_CN/_TW)
        self.curRelation = None   # relation implied by the current section
        self.writer = MultiWriter('wiktionary')

    def startElement(self, name, attrs):
        if name == 'page':
            self.inArticle = True
            self.curText = []
        elif name == 'title':
            self.inTitle = True
            self.curTitle = ''

    def endElement(self, name):
        if name == 'page':
            self.inArticle = False
            self.handleArticle(self.curTitle, ''.join(self.curText))
        elif name == 'title':
            self.inTitle = False

    def characters(self, text):
        if self.inTitle:
            self.curTitle += text
        elif self.inArticle:
            self.curText.append(text)
            if len(self.curText) > 10000:
                # bail out
                self.inArticle = False

    def handleArticle(self, title, text):
        """Process one article's wikitext line by line."""
        lines = text.split('\n')
        self.pos = None
        for line in lines:
            self.handleLine(title, line.strip())

    def handleLine(self, title, line):
        """State machine over one stripped wikitext line."""
        language_match = LANGUAGE_HEADER.match(line)
        trans_top_match = TRANS_TOP.match(line)
        trans_tag_match = TRANS_TAG.search(line)
        chinese_match = CHINESE_TAG.search(line)
        if line.startswith('===') and line.endswith('==='):
            pos = line.strip('= ')
            if pos == 'Synonyms':
                self.curRelation = 'Synonym'
            # NOTE(review): Wiktionary's section header is usually
            # 'Antonyms' (plural) — confirm this branch can ever fire.
            elif pos == 'Antonym':
                self.curRelation = 'Antonym'
            elif pos == 'Related terms':
                self.curRelation = 'ConceptuallyRelatedTo'
            elif pos == 'Derived terms':
                if not line.startswith('===='):
                    # this is at the same level as the part of speech;
                    # now we don't know what POS these apply to
                    self.pos = None
                self.curRelation = 'DerivedFrom'
            else:
                self.curRelation = None
            if pos in PARTS_OF_SPEECH:
                self.pos = PARTS_OF_SPEECH[pos]
        elif language_match:
            self.lang = language_match.group(1)
            self.langcode = LANGUAGES.get(self.lang)
        elif chinese_match:
            scripttag = chinese_match.group(2)
            self.locales = []
            if 's' in scripttag:
                self.locales.append('_CN')
            if 't' in scripttag:
                self.locales.append('_TW')
        elif line[0:1] == '#' and self.lang != 'English' and self.lang is not None:
            # definition line in a non-English section: the English gloss
            # is treated as a translation of the headword
            defn = line[1:].strip()
            if defn[0:1] not in ':*#':
                for defn2 in filter_line(defn):
                    if not ascii_enough(defn2):
                        continue
                    if 'Index:' in title:
                        continue
                    if self.langcode == 'zh':
                        for locale in self.locales:
                            self.output_translation(title, defn2, locale)
                    elif self.langcode:
                        self.output_translation(title, defn2)
        elif line[0:4] == '----':
            # horizontal rule separates language sections; reset state
            self.pos = None
            self.lang = None
            self.langcode = None
            self.curRelation = None
        elif trans_top_match:
            pos = self.pos or 'n'
            sense = trans_top_match.group(1).split(';')[0].strip('.')
            if 'translations' in sense.lower():
                self.curSense = None
            else:
                self.curSense = pos + '/' + sense
        elif trans_tag_match:
            lang = trans_tag_match.group(1)
            translation = trans_tag_match.group(2)
            if self.curSense is not None and self.lang == 'English':
                # handle Chinese separately
                if lang not in ('cmn', 'yue', 'zh-yue', 'zh'):
                    self.output_sense_translation(lang, translation, title,
                                                  self.curSense)
        elif '{{trans-bottom}}' in line:
            self.curSense = None
        elif line.startswith('* ') and self.curRelation and self.langcode:
            relatedmatch = WIKILINK.search(line)
            if relatedmatch:
                related = relatedmatch.group(1)
                self.output_monolingual(self.langcode, self.curRelation,
                                        related, title)

    def output_monolingual(self, lang, relation, term1, term2):
        """Write one same-language relation edge (term1 -relation-> term2)."""
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        print surfaceText
        edge = make_edge('/r/' + relation, source, target,
                         '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA', sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_sense_translation(self, lang, foreign, english, disambiguation):
        """Write a sense-specific TranslationOf edge: `foreign` -> `english`."""
        if 'Wik' in foreign or 'Wik' in english:
            return
        if lang == 'zh-cn':
            lang = 'zh_CN'
        elif lang == 'zh-tw':
            lang = 'zh_TW'
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
        target = make_concept_uri(english, 'en', disambiguation)
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(lang))
        except KeyError:
            surfaceRel = "is [language %s] for" % lang
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, english,
            disambiguation.split('/')[-1].replace('_', ' '))
        print surfaceText
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % lang,
                         license='/l/CC/By-SA', sources=[SOURCE, TRANSLATE],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)

    def output_translation(self, foreign, english, locale=''):
        """Write a TranslationOf edge from a foreign headword to English."""
        source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                                  self.langcode + locale)
        target = make_concept_uri(english, 'en')
        relation = '/r/TranslationOf'
        try:
            surfaceRel = "is %s for" % (langs.english_name(self.langcode))
        except KeyError:
            surfaceRel = "is [language %s] for" % self.langcode
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
        edge = make_edge(relation, source, target,
                         '/d/wiktionary/en/%s' % self.langcode,
                         license='/l/CC/By-SA', sources=[SOURCE, INTERLINGUAL],
                         context='/ctx/all', weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
for line in codecs.open('data/flat/CORE', encoding='utf-8'): uri, rel, start, end, context, weight, sources, id, dataset = line.split('\t')[:9] if uri != 'uri' and context == '/ctx/all': weight = float(weight) weights[uri] += float(weight) assertions[uri] = (rel, start, end, context, weight) if not (dataset.startswith('/d/reverb') or dataset.startswith('/d/wiktionary') or dataset.startswith('/d/dbpedia')): ccby[uri] = True print 'writing' writer_core = MultiWriter('assertion_totals_core') #writer_sa = MultiWriter('assertion_totals_sa') for uri, weight in assertions.iteritems(): if ccby[uri]: license = '/l/CC/By' dataset = '/d/conceptnet/5/combined-core' else: license = '/l/CC/By-SA' dataset = '/d/conceptnet/5/combined-sa' relation, start, end, context, weight = assertions[uri] edge = make_edge(relation, start, end, dataset, license, ['/s/rule/sum_edges'], '/ctx/all', weight=weight) if license == '/l/CC/By': writer_core.write(edge) #else: # writer_sa.write(edge) writer_core.close() #writer_sa.close()
if score <= 0: counts['low score'] += 1 weak_out.write(line) continue count += 1 counts['success'] += 1 good_out.write(line) if make_json: left = make_concept_uri(unicode(left), 'en') right = make_concept_uri(unicode(right), 'en') edge = make_edge(rel, left, right, '/d/verbosity', '/l/CC/By', sources, surfaceText=text, weight = score/10.0) writer.write(edge) if make_json: writer.close() flag_out.close() good_out.close() weak_out.close() similar_out.close() simout = open('data/output/similarity-scores.txt', 'w') for sim in text_similarities: print >> simout, sim simout.close()
# Tail of the core-summing script. NOTE(review): `uri` and `dataset` here
# come from the row-reading loop above (outside this chunk) — confirm the
# original indentation placed this `if` inside that loop.
if not (dataset.startswith('/d/reverb') or
        dataset.startswith('/d/wiktionary') or
        dataset.startswith('/d/dbpedia')):
    ccby[uri] = True
print 'writing'
writer_core = MultiWriter('assertion_totals_core')
#writer_sa = MultiWriter('assertion_totals_sa')
# `weight` is first bound to the dict value tuple, then immediately
# rebound to the unpacked weight component below.
for uri, weight in assertions.iteritems():
    if ccby[uri]:
        license = '/l/CC/By'
        dataset = '/d/conceptnet/5/combined-core'
    else:
        license = '/l/CC/By-SA'
        dataset = '/d/conceptnet/5/combined-sa'
    relation, start, end, context, weight = assertions[uri]
    edge = make_edge(relation, start, end, dataset, license,
                     ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
    # Only core (CC-By) edges are written; the -SA writer is disabled.
    if license == '/l/CC/By':
        writer_core.write(edge)
    #else:
    #    writer_sa.write(edge)
writer_core.close()
#writer_sa.close()