def handle_raw_assertion(line):
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']
    surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace(
        '{2}', '[[' + concept2 + ']]'
    )
    # We mark surface texts with * if {2} comes before {1}.
    if ftext.find('{2}') < ftext.find('{1}'):
        surfaceText = '*' + surfaceText
    start = standardized_concept_uri('zh_TW', concept1)
    end = standardized_concept_uri('zh_TW', concept2)
    source = {
        'contributor': '/s/contributor/petgame/' + user,
        'activity': '/s/activity/ptt/petgame',
    }
    yield make_edge(
        rel, start, end,
        dataset='/d/conceptnet/4/zh',
        license=Licenses.cc_attribution,
        sources=[source],
        surfaceText=surfaceText,
        weight=1,
    )
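# A minimal usage sketch of handle_raw_assertion above. The input line is
# hypothetical, and it assumes FRAME_DATA maps frame ID '21' to a frame
# like {'text': '{1} 是一種 {2}', 'relation': '/r/IsA'}:
for edge in handle_raw_assertion('someuser, 21, 貓, 動物'):
    print(edge['start'], edge['rel'], edge['end'])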
def output_monolingual(self, lang, relation, term1, term2):
    # skip Wiktionary: links and templates
    if u'ウィク' in term1 or u'ウィク' in term2:
        return
    if u'テンプレート' in term1 or u'テンプレート' in term2:
        return
    if lang in LANGUAGES_3_TO_2:
        # convert 3-letter code to 2-letter code
        lang = LANGUAGES_3_TO_2[lang]
    source = make_concept_uri_safe(term1, lang)
    if self.pos:
        target = make_concept_uri_safe(term2, lang, self.pos)
    else:
        target = make_concept_uri_safe(term2, lang)
    surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
    edge = make_edge('/r/' + relation, source, target,
                     '/d/wiktionary/ja/%s' % lang,
                     license='/l/CC/By-SA',
                     sources=[SOURCE, MONOLINGUAL],
                     context='/ctx/all',
                     weight=1.0,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def output_assertion(out, **kwargs):
    uri = kwargs.pop('uri')
    source_tree = make_disjunction_uri(set(kwargs.pop('sources')))
    assertion = make_edge(sources=source_tree, **kwargs)
    assert assertion['uri'] == uri, (assertion['uri'], uri)
    line = json.dumps(assertion, ensure_ascii=False)
    print >> out, line
def handle_triple(line):
    items = line.split()
    for i in xrange(3):
        if not (items[i].startswith('<') and items[i].endswith('>')):
            return
        items[i] = items[i][1:-1]
    subj, pred, obj = items[:3]
    if ('foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj
            or '#Thing' in obj or '__' in subj or '__' in obj
            or 'List_of' in subj or 'List_of' in obj):
        return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        return
    rel = normalize_uri('/r/' + webrel)
    if (pred, rel) not in sw_map_used:
        sw_map_used.add((pred, rel))
        sw_map.write({'from': pred, 'to': rel})
    if (subj, subj_concept) not in sw_map_used:
        sw_map_used.add((subj, subj_concept))
        sw_map.write({'from': subj, 'to': subj_concept})
    if (obj, obj_concept) not in sw_map_used:
        sw_map_used.add((obj, obj_concept))
        sw_map.write({'from': obj, 'to': obj_concept})
    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/dbpedia/en',
                     license='/l/CC/By-SA',
                     sources=['/s/dbpedia/3.7'],
                     context='/ctx/all',
                     weight=0.5)
    writer.write(edge)
def handle_raw_db_assertion(raw_assertion):
    try:
        if can_skip(raw_assertion):
            return []

        # build the assertion
        frame_text = build_frame_text(raw_assertion)
        relation = build_relation(raw_assertion)
        start = build_start(raw_assertion)
        end = build_end(raw_assertion)
        dataset = build_data_set(raw_assertion)
        sources = build_sources(raw_assertion)

        edges = []
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            if by_bedume_and_bad(source_list, start, end, raw_assertion):
                return []
            else:
                edge = make_edge(relation, start, end, dataset, LICENSE,
                                 source_list, '/ctx/all', frame_text,
                                 weight=weight)
                edges.append(edge)
        return edges
    except Exception:
        import traceback
        #traceback.print_exc()
        return []
def handle_raw_flat_assertion(flat_assertion):
    try:
        parts_dict = extract_parts(flat_assertion)
        if can_skip(parts_dict):
            return []

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set()
        sources = build_sources(parts_dict)

        edges = []
        for source_list, weight in sources:
            if "commons2_reject" in " ".join(source_list):
                weight = -1
            else:
                edge = make_edge(
                    relation, start, end, dataset, LICENSE,
                    source_list, "/ctx/all", frame_text, weight=weight
                )
                edges.append(edge)
        return edges
    except Exception:
        # import traceback
        # print "failed on a flat_assertion"
        # traceback.print_exc()
        return []
def subwords_to_edges(language, input, output):
    """
    Morfessor hypothesizes ways to break words into sub-word chunks.
    Produce edges from these sub-words that can be used in retrofitting.
    """
    writer = MsgpackStreamWriter(output)
    for line in input:
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Remove the unnecessary count ("1 ") from the start of each line
        line = line.split(' ', 1)[1]
        chunks = line.split(' + ')

        # Strip a possible trailing underscore, which would particularly show
        # up in the way we segment ATOMIC_SPACE_LANGUAGES (Vietnamese)
        full_text = ''.join(chunks).strip('_')
        end = join_uri('c', language, full_text)
        for chunk in chunks:
            if chunk != '_':
                start = join_uri('x', language, chunk.strip('_'))
                edge = make_edge(
                    '/r/SubwordOf', start, end,
                    dataset='/d/morphology',
                    license=Licenses.cc_attribution,
                    sources=MORPH_SOURCES,
                    weight=0.01,
                )
                writer.write(edge)
    writer.close()
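# A minimal usage sketch of the function above, with a hypothetical one-line
# Morfessor segmentation ("count chunk + chunk") as the input stream:
import io

subwords_to_edges('en', io.StringIO('1 un + locked\n'), 'subwords.msgpack')
# writes /r/SubwordOf edges from /x/en/un and /x/en/locked to /c/en/unlocked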
def output_sense_translation(self, lang, foreign, german, disambiguation):
    if 'Wik' in foreign or 'Wik' in german:
        return
    if lang == 'zh-cn':
        lang = 'zh_CN'
    elif lang == 'zh-tw':
        lang = 'zh_TW'
    source = make_concept_uri(
        unicodedata.normalize('NFKC', foreign), lang
    )
    target = make_concept_uri(german, 'de', disambiguation)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (langs.english_name(lang))
    except KeyError:
        surfaceRel = "is [language %s] for" % lang
    # This formerly referenced an undefined name `english`; the translated
    # term here is `german`.
    surfaceText = "[[%s]] %s [[%s (%s)]]" % (
        foreign, surfaceRel, german,
        disambiguation.split('/')[-1].replace('_', ' ')
    )
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % lang,
                     license='/l/CC/By-SA',
                     sources=[SOURCE, TRANSLATE],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def output_sense_translation(self, lang, foreign, english, disambiguation):
    if 'Wik' in foreign or 'Wik' in english:
        return
    if lang == 'zh-cn':
        lang = 'zh_CN'
    elif lang == 'zh-tw':
        lang = 'zh_TW'
    source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
    target = make_concept_uri(english, 'en', disambiguation)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (langs.english_name(lang))
    except KeyError:
        surfaceRel = "is [language %s] for" % lang
    surfaceText = "[[%s]] %s [[%s (%s)]]" % (
        foreign, surfaceRel, english,
        disambiguation.split('/')[-1].replace('_', ' '))
    print surfaceText
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % lang,
                     license='/l/CC/By-SA',
                     sources=[SOURCE, TRANSLATE],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def handle_raw_assertion(flat_assertion):
    try:
        parts_dict = extract_parts(flat_assertion)
        if can_skip(parts_dict):
            return []

        # build the assertion
        frame_text = build_frame_text(parts_dict)
        relation = build_relation(parts_dict)
        start = build_start(parts_dict)
        end = build_end(parts_dict)
        dataset = build_data_set(parts_dict)
        sources = build_sources(parts_dict)

        edges = []
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            if by_bedume_and_bad(source_list, start, end):
                return []
            else:
                edge = make_edge(relation, start, end, dataset, LICENSE,
                                 source_list, '/ctx/all', frame_text,
                                 weight=weight)
                edges.append(edge)
        return edges
    except Exception:
        import traceback
        print "failed on flat_assertion: " + str(flat_assertion)
        traceback.print_exc()
        return []
def handle_raw_assertion(raw_assertion):
    try:
        if can_skip(raw_assertion):
            return []
        frame_text = build_frame_text(raw_assertion)
        relation = build_relation(raw_assertion)
        start = build_start(raw_assertion)
        end = build_end(raw_assertion)
        dataset = build_data_set()
        sources = build_sources(raw_assertion)
        edges = []
        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            edge = make_edge(relation, start, end, dataset, LICENSE,
                             source_list, '/ctx/all', frame_text,
                             weight=weight)
            edges.append(edge)
        return edges
    except Exception:
        import traceback
        #traceback.print_exc()
        return []
def handle_raw_assertion(raw_assertion):
    line = raw_assertion.strip()
    edges = []
    if line:
        parts = line.split(', ')
        user, frame_id, concept1, concept2 = parts
        frame = Frame.objects.get(id=int(frame_id))
        ftext = frame.text
        relation = frame.relation.name
        rel = '/r/' + relation
        surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
            u'{2}', u'[[' + concept2 + u']]')
        start = make_concept_uri(concept1, 'zh_TW')
        end = make_concept_uri(concept2, 'zh_TW')
        # '/s/activity/ntt/petgame' appears to be a typo for the
        # PTT petgame activity used everywhere else
        sources = ['/s/contributor/petgame/' + user,
                   '/s/activity/ptt/petgame']
        edge = make_edge(rel, start, end,
                         dataset='/d/conceptnet/4/zh',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        edges.append(edge)
    return edges
def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' +
                            str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = \
            line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
def output_sense_translation(self, lang, foreign, english, sense):
    pos, disambiguation = sense
    if ('Wik' in foreign or 'Wik' in english
            or term_is_bad(foreign) or term_is_bad(english)):
        return

    # Quick fix that drops definitions written in Lojban syntax
    if lang == 'jbo' and re.search(r'x[1-5]', english):
        return

    if lang == 'zh-cn':
        lang = 'zh_CN'
    elif lang == 'zh-tw':
        lang = 'zh_TW'
    source = normalized_concept_uri(
        lang, unicodedata.normalize('NFKC', foreign)
    )
    target = normalized_concept_uri('en', english, pos, disambiguation)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (CODE_TO_ENGLISH_NAME[lang.split('_')[0]])
    except KeyError:
        surfaceRel = "is [language %s] for" % lang
    surfaceText = "[[%s]] %s [[%s (%s)]]" % (
        foreign, surfaceRel, english,
        disambiguation.split('/')[-1].replace('_', ' ')
    )
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % lang,
                     license=Licenses.cc_sharealike,
                     sources=[SOURCE, TRANSLATE],
                     weight=1.0,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def output_edge(obj, writer):
    objsource = obj['sources'][0]
    if obj['arg1'].startswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].startswith(objsource):
        obj['arg2'] = objsource
    if obj['arg1'].endswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].endswith(objsource):
        obj['arg2'] = objsource
    start = make_concept_uri(obj['arg1'], 'en')
    end = make_concept_uri(obj['arg2'], 'en')
    if obj['rel'][0] in string.uppercase:
        rel = '/r/' + obj['rel']
    else:
        rel = make_concept_uri(obj['rel'], 'en')
    if (start.startswith('/c/en/this_') or start.startswith('/c/en/these_')
            or end.startswith('/c/en/this_')
            or end.startswith('/c/en/these_')):
        return
    context = make_concept_uri(objsource, 'en')
    source = "/s/web/en.wikipedia.org/wiki/%s" % (objsource.replace(' ', '_'))
    rules = ['/s/rule/reverb', '/s/rule/reverb_filter_apr2012']
    surfaceText = u"[[%s]] %s [[%s]]" % (obj['arg1'],
                                         obj.get('surfaceRel', obj['rel']),
                                         obj['arg2'])
    # Cube ReVerb's confidence score (and halve it) so that low-confidence
    # extractions get very low edge weights
    weight = float(obj['weight']) ** 3 / 2
    edge = make_edge(rel, start, end,
                     dataset='/d/reverb/wp_frontpage',
                     license='/l/CC/By-SA',
                     sources=[source] + rules,
                     context=context,
                     surfaceText=surfaceText,
                     weight=weight)
    writer.write(edge)
def output_translation(self, foreign, english, locale=''):
    if term_is_bad(foreign) or term_is_bad(english):
        return

    # Quick fix that drops definitions written in Lojban syntax
    if self.langcode == 'jbo' and re.search(r'x[1-5]', english):
        return

    source = normalized_concept_uri(self.langcode + locale, foreign)
    target = normalized_concept_uri('en', english)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (
            CODE_TO_ENGLISH_NAME[self.langcode.split('_')[0]])
    except KeyError:
        surfaceRel = "is [language %s] for" % self.langcode
    surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/en/%s' % self.langcode,
                     license=Licenses.cc_sharealike,
                     sources=[SOURCE, INTERLINGUAL],
                     weight=1.0,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def sum_assertions(file_index):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for line in codecs.open(CURRENT_DIR + '/data/temp/core_' +
                            str(file_index) + '.txt', 'r', 'utf-8'):
        uri, rel, start, end, context, weight, sources, id, dataset = \
            line.split('\t')[:9]
        if uri != 'uri' and context == '/ctx/all':
            weight = float(weight)
            weights[uri] += float(weight)
            assertions[uri] = (rel, start, end, context, weights[uri])
            if not (dataset.startswith('/d/reverb')
                    or dataset.startswith('/d/wiktionary')
                    or dataset.startswith('/d/dbpedia')):
                ccby[uri] = True

    writer_core = MultiWriter('assertion_totals_core')
    writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            # CC-By edges belong in both the core and the By-SA build
            writer_core.write(edge)
            writer_sa.write(edge)
        else:
            writer_sa.write(edge)
    writer_core.close()
    writer_sa.close()
def handle_assertion(self, parts_dict):
    """
    Process one assertion from ConceptNet 4, which appears in the input
    file as a dictionary.

    Use the 'raw' text -- the text that's not yet reduced to a normalized
    form -- so we can run ConceptNet 5's normalization on it instead.

    Each assertion becomes a number of ConceptNet 5 edges, which will
    probably be grouped together into an assertion again.
    """
    if can_skip(parts_dict):
        return

    # fix the result of some process that broke prepositions ages ago
    preposition_fix = False
    if '} around {' in parts_dict['frame_text']:
        for prep in AROUND_PREPOSITIONS:
            if parts_dict['endText'].startswith(prep + ' '):
                parts_dict['endText'] = parts_dict['endText'][len(prep) + 1:]
                replacement = '} %s {' % prep
                parts_dict['frame_text'] = parts_dict['frame_text'].replace(
                    '} around {', replacement
                )
                preposition_fix = True
                break

    if can_skip(parts_dict):
        return

    # build the assertion
    frame_text = build_frame_text(parts_dict)
    relation = build_relation(parts_dict)
    start = build_start(parts_dict)
    end = build_end(parts_dict)
    dataset = build_data_set(parts_dict)
    weighted_sources = build_sources(parts_dict, preposition_fix)

    if relation in RELATIONS_TO_DROP:
        return

    if relation == '/r/DesireOf':
        # Fix an inconsistently-named relation from GlobalMind
        relation = '/r/Desires'

    for source_dict in weighted_sources:
        if not skip_assertion(source_dict, start, end):
            weight = source_dict.pop('weight')
            yield make_edge(
                rel=relation, start=start, end=end, dataset=dataset,
                license=Licenses.cc_attribution, sources=[source_dict],
                surfaceText=frame_text,
                # The edge weight is the weight computed by build_sources,
                # times the multiplier set on this instance
                weight=weight * self.weight,
            )
def handle_raw_assertion(raw_assertion):
    edges = []
    assertion, users = raw_assertion
    frame_id, concept1, concept2 = assertion
    frame = Frame.objects.get(id=int(frame_id))
    ftext = frame.text
    relation = frame.relation.name
    rel = '/r/' + relation
    surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
        u'{2}', u'[[' + concept2 + u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame']
    for user in users:
        sources.append('/s/contributor/petgame/' + user)
    edge = make_edge(rel, start, end,
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surfaceText,
                     weight=len(users))
    edges.append(edge)
    return edges
def output_sense_translation(self, lang, foreign, translated, disambiguation):
    if u':' in foreign or u':' in translated:
        return
    if lang == 'zh-cn':
        lang = 'zh_CN'
    elif lang == 'zh-tw':
        lang = 'zh_TW'
    source = make_concept_uri_safe(
        unicodedata.normalize('NFKC', foreign), lang
    )
    target = make_concept_uri_safe(translated, self.langcode, disambiguation)
    relation = '/r/TranslationOf'
    try:
        surfaceRel = "is %s for" % (langs.english_name(lang))
    except KeyError:
        surfaceRel = "is [language %s] for" % lang
    if disambiguation and '/' in disambiguation:
        surfaceText = "[[%s]] %s [[%s (%s)]]" % (
            foreign, surfaceRel, translated,
            disambiguation.split('/')[-1].replace('_', ' '))
    else:
        surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, translated)
    edge = make_edge(relation, source, target,
                     '/d/wiktionary/ja/%s' % (self.langcode),
                     license='/l/CC/By-SA',
                     sources=[SOURCE, TRANSLATE],
                     context='/ctx/all',
                     weight=1.0,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def external_url_edge(start, end):
    return make_edge(
        rel='/r/ExternalURL', start=start, end=end,
        dataset='/d/opencyc',
        license=Licenses.cc_attribution,
        sources=[SOURCE],
        weight=1.0
    )
def output_edge(outfile, subj_concept, obj_concept):
    rel = '/r/TranslationOf'
    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/jmdict',
                     license='/l/CC/By-SA',
                     sources=['/s/jmdict/1.07'],
                     context='/ctx/all',
                     weight=0.5)
    print >> outfile, json.dumps(edge, ensure_ascii=False)
def output_assertion(out, **kwargs):
    """
    Output an assertion to the given output stream. All keyword arguments
    become arguments to `make_edge`. (An assertion is a kind of edge.)
    """
    # Build the assertion object.
    assertion = make_edge(**kwargs)

    # Output the result in a Msgpack stream.
    out.write(assertion)
def output_edge(out, rel, subj_concept, obj_concept):
    """
    Write an edge to `out`, an instance of MsgpackStreamWriter.
    """
    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/jmdict',
                     license=Licenses.cc_sharealike,
                     sources=[{'contributor': '/s/resource/jmdict/1.07'}],
                     weight=2.0)
    out.write(edge)
def handle_raw_assertion(raw, writer):
    try:
        lang = raw.language_id
        assert lang == 'ja'
        if raw.frame.goodness < 1:
            return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity:
            return

        # build the assertion
        frame_text = raw.frame.text
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace(
            '{2}', '[[%s]]' % raw.text2)

        activity_node = normalize_uri(u'/s/site/nadya.jp')

        startText = ' '.join(JA.normalize_list(raw.text1))
        endText = ' '.join(JA.normalize_list(raw.text2))
        if startText != raw.text1:
            print raw.text1.encode('utf-8'), '=>', startText.encode('utf-8')
        # The `start =` assignment had been dropped, leaving a bare
        # expression; restore it to mirror the `end =` line below
        start = normalize_uri('/text/' + lang + '/' + startText)
        end = normalize_uri('/text/' + lang + '/' + endText)

        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'

        if polarity > 0:
            relation = normalize_uri('/r/' + relname)
        else:
            relation = normalize_uri('/r/Not' + relname)

        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score
        sources = [([activity_node], score / 5.)]

        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE,
                             source_list, '/ctx/all', frame_text,
                             weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
def output_edge(out, subj_concept, obj_concept):
    """
    Write an edge to `out`, an instance of MsgpackStreamWriter.
    """
    rel = '/r/TranslationOf'
    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/jmdict',
                     license=Licenses.cc_sharealike,
                     sources=['/s/jmdict/1.07'],
                     weight=0.5)
    out.write(edge)
def output_assertion(out, **kwargs):
    uri = kwargs.pop('uri')
    source_tree = make_disjunction_uri(set(kwargs.pop('sources')))
    assertion = make_edge(sources=source_tree, **kwargs)

    # Scale the combined weight logarithmically (base 2), so that many
    # redundant sources don't produce an overwhelmingly large weight
    current_weight = assertion['weight']
    log_weight = math.log(max(1, current_weight + 1)) / math.log(2)
    assertion['weight'] = log_weight

    assert assertion['uri'] == uri, (assertion['uri'], uri)
    line = json.dumps(assertion, ensure_ascii=False)
    print >> out, line
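# For instance, the scaling above maps a summed weight of 1.0 to
# log2(1.0 + 1) = 1.0, and a summed weight of 3.0 to log2(3.0 + 1) = 2.0,
# so piles of redundant evidence grow the weight only logarithmically.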
def umbel_edge(rel, start, end, surface, source):
    """
    Get the ConceptNet representation of an UMBEL edge.
    """
    return make_edge(
        rel=rel, start=start, end=end,
        dataset='/d/umbel',
        license=Licenses.cc_attribution,
        sources=[source],
        weight=1.0,
        surfaceText=surface
    )
def handle_file(input_file, output_file):
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    lang = root[0][1].attrib['type']
    for annotation in root[1]:
        for word in strip_words(annotation.text):
            start = standardized_concept_uri('mul', annotation.attrib['cp'])
            end = standardized_concept_uri(lang, word)
            edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
            out.write(edge)
def handle_assertion(self, parts_dict):
    """
    Process one assertion from ConceptNet 4, which appears in the input
    file as a dictionary.

    Use the 'raw' text -- the text that's not yet reduced to a normalized
    form -- so we can run ConceptNet 5's normalization on it instead.

    Each assertion becomes a number of ConceptNet 5 edges, which will
    probably be grouped together into an assertion again.
    """
    if can_skip(parts_dict):
        return

    # fix the result of some process that broke prepositions ages ago
    preposition_fix = False
    if '} around {' in parts_dict['frame_text']:
        for prep in AROUND_PREPOSITIONS:
            if parts_dict['endText'].startswith(prep + ' '):
                parts_dict['endText'] = parts_dict['endText'][len(prep) + 1:]
                replacement = '} %s {' % prep
                parts_dict['frame_text'] = parts_dict['frame_text'].replace(
                    '} around {', replacement
                )
                preposition_fix = True
                break

    if can_skip(parts_dict):
        return

    # build the assertion
    frame_text = build_frame_text(parts_dict)
    relation = build_relation(parts_dict)
    start = build_start(parts_dict)
    end = build_end(parts_dict)
    dataset = build_data_set(parts_dict)
    weighted_sources = build_sources(parts_dict, preposition_fix)

    for source_list, weight in weighted_sources:
        if 'commons2_reject' in ' '.join(source_list):
            return

    for source_list, weight in weighted_sources:
        if not by_bedume_and_bad(source_list, start, end):
            yield make_edge(
                rel=relation, start=start, end=end, dataset=dataset,
                license=Licenses.cc_attribution, sources=source_list,
                surfaceText=frame_text, weight=weight
            )
def handle_triple(line, reader, out, map_out):
    subj, pred, obj, tag = reader.parse_line(line)
    if tag != 'URL':
        return

    # Ignore types of edges that we don't care about:
    #   - Homepage links
    #   - GIS features
    #   - Assertions that something "is a thing"
    #   - Anonymous nodes identified with double-underscores, such as the
    #     node "Alfred_Nobel__1", which means "Alfred Nobel's occupation,
    #     whatever it is"
    #   - Nodes that are articles named "List of X" on Wikipedia
    if ('foaf/0.1/homepage' in pred or '_Feature' in obj or '#Thing' in obj
            or '__' in subj or '__' in obj
            or 'List_of' in subj or 'List_of' in obj):
        return

    # We don't try to parse URIs from outside of dbpedia.org's namespace.
    if 'dbpedia.org' not in obj:
        return

    subj_concept = translate_dbpedia_url(subj, 'en')
    obj_concept = translate_dbpedia_url(obj, 'en')

    # DBPedia categorizes a lot of things as 'works', which causes
    # unnecessary ambiguity. Disregard these edges; there will almost
    # always be a more specific edge calling it a 'creative work' anyway.
    if obj_concept == '/c/en/work':
        return

    rel = map_dbpedia_relation(pred)
    if rel is None:
        return

    # We've successfully converted this Semantic Web triple to ConceptNet
    # URIs. Now write the results to the 'sw_map' file so others can follow
    # this mapping.
    mapped_pairs = [
        (pred, rel),
        (subj, subj_concept),
        (obj, obj_concept)
    ]
    for sw_url, conceptnet_uri in mapped_pairs:
        conceptnet_url = full_conceptnet_url(conceptnet_uri)
        map_out.write_link(conceptnet_url, sw_url)

    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/dbpedia/en',
                     license=Licenses.cc_sharealike,
                     sources=['/s/dbpedia/3.7'],
                     weight=0.5)
    out.write(edge)
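# A sketch of the kind of N-Triples line handle_triple above handles. The
# triple is hypothetical, and it assumes reader.parse_line splits a line
# into subject, predicate, object, and a tag classifying the object:
#
#   <http://dbpedia.org/resource/Tokyo> <http://dbpedia.org/ontology/country> <http://dbpedia.org/resource/Japan> .
#
# which, if map_dbpedia_relation knows the predicate, becomes an edge
# between /c/en/tokyo and /c/en/japan.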
def opencyc_edge(rel, start, end, start_text, end_text):
    """
    Get the ConceptNet representation of an OpenCyc edge.
    """
    return make_edge(
        rel=rel, start=start, end=end,
        dataset='/d/opencyc',
        license=Licenses.cc_attribution,
        sources=[SOURCE],
        weight=1.0,
        surfaceStart=start_text,
        surfaceEnd=end_text
    )
def handle_raw_assertion(self, flat_assertion):
    parts_dict = json.loads(flat_assertion)
    if can_skip(parts_dict):
        return

    # fix the result of some process that broke prepositions ages ago
    preposition_fix = False
    if '} around {' in parts_dict['frame_text']:
        for prep in AROUND_PREPOSITIONS:
            if parts_dict['endText'].startswith(prep + ' '):
                parts_dict['endText'] = parts_dict['endText'][len(prep) + 1:]
                replacement = '} %s {' % prep
                parts_dict['frame_text'] = parts_dict['frame_text'].replace(
                    '} around {', replacement
                )
                preposition_fix = True
                break

    # build the assertion
    frame_text = build_frame_text(parts_dict)
    relation = build_relation(parts_dict)
    start = build_start(parts_dict)
    end = build_end(parts_dict)
    dataset = build_data_set(parts_dict)
    sources = build_sources(parts_dict, preposition_fix)

    reject = False
    for source_list, weight in sources:
        if 'commons2_reject' in ' '.join(source_list):
            reject = True

    if not reject:
        for source_list, weight in sources:
            if not by_bedume_and_bad(source_list, start, end):
                contributors = [s for s in source_list
                                if s.startswith('/s/contributor')]
                assert len(contributors) <= 1, contributors
                edge = make_edge(relation, start, end, dataset, LICENSE,
                                 source_list, '/ctx/all', frame_text,
                                 weight=weight)
                okay = True
                if contributors:
                    uri = edge['uri']
                    contributor = contributors[0]
                    if (uri, contributor) in self.seen_sources:
                        okay = False
                    else:
                        self.seen_sources.add((uri, contributor))
                if okay:
                    yield json.dumps(edge, ensure_ascii=False)
def _make_assertion(line_group):
    """
    When a generator of tab-separated lines has been grouped by their
    assertion URI, this function takes all the lines with the same URI and
    makes a single assertion out of them.
    """
    lines = [line.rstrip() for line in line_group]
    lines = [line for line in lines if line]
    if not lines:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ
    # based on word senses. These don't get merged together, but they
    # should.
    uri, rel, start, end, _ = lines[0].split('\t')
    if not (keep_concept(start) and keep_concept(end)):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']
    surface_text = None
    sources = []
    seen_sources = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                sources.append(subsource)
                seen_sources.add(conjunction)

    weight = weight_scale(unscaled_weight)
    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(
        rel=rel, start=start, end=end, weight=weight,
        dataset=dataset, license=license, sources=sources,
        surfaceText=surface_text,
    )
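# A sketch (hypothetical values) of the grouped input _make_assertion reads.
# Every line in a group shares the same assertion URI in column 0, followed
# by rel, start, end, and a JSON blob whose weights, licenses, and sources
# get merged above:
#
#   <uri>  /r/IsA  /c/en/cat  /c/en/animal  {"weight": 1.0, "license": ..., "dataset": ..., "sources": [...]}
#   <uri>  /r/IsA  /c/en/cat  /c/en/animal  {"weight": 0.5, "license": ..., "dataset": ..., "sources": [...]}
#
# (columns are tab-separated; both lines collapse into one edge)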
def handle_raw_assertion(line):
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']
    surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace(
        '{2}', '[[' + concept2 + ']]')
    start = normalized_concept_uri('zh_TW', concept1)
    end = normalized_concept_uri('zh_TW', concept2)
    sources = ['/s/activity/ptt/petgame',
               '/s/contributor/petgame/' + user]
    yield make_edge(rel, start, end,
                    dataset='/d/conceptnet/4/zh',
                    license='/l/CC/By',
                    sources=sources,
                    surfaceText=surfaceText,
                    weight=1)
def output_edge(out, subj_concept, obj_concept):
    """
    Write an edge to `out`, an instance of JSONFileWriter.
    """
    rel = "/r/TranslationOf"
    edge = make_edge(
        rel, subj_concept, obj_concept,
        dataset="/d/jmdict",
        license=Licenses.cc_sharealike,
        sources=["/s/jmdict/1.07"],
        weight=0.5,
    )
    out.write(edge)
def make_assertion(line_group):
    lines = [line.rstrip() for line in line_group]
    lines = [line for line in lines if line]
    if not lines:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ
    # based on word senses. These don't get merged together, but they
    # should.
    uri, rel, start, end, _ = lines[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)

    if not (keep_concept(start) and keep_concept(end)):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}
    dataset = info_dicts[0]['dataset']
    surface_text = None
    sources = []
    seen_sources = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction not in seen_sources:
                sources.append(subsource)
                seen_sources.add(conjunction)

    weight = weight_scale(unscaled_weight)
    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(rel=rel, start=start, end=end, weight=weight,
                     dataset=dataset, license=license, sources=sources,
                     surfaceText=surface_text)
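# A small check (hypothetical URIs) of the sense-stripping step above,
# assuming uri_prefix behaves like conceptnet5.uri.uri_prefix and keeps at
# most the first four URI components:
assert uri_prefix('/c/en/run/v/move_fast', 4) == '/c/en/run/v'
assert uri_prefix('/c/en/run', 4) == '/c/en/run'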
def output_monolingual(self, lang, relation, term1, term2):
    if term_is_bad(term1) or term_is_bad(term2):
        return
    source = normalized_concept_uri(lang, term1)
    if self.pos:
        target = normalized_concept_uri(lang, term2, self.pos)
    else:
        target = normalized_concept_uri(lang, term2)
    surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
    edge = make_edge('/r/' + relation, source, target,
                     '/d/wiktionary/%s/%s' % (lang, lang),
                     license=Licenses.cc_sharealike,
                     sources=[SOURCE, MONOLINGUAL],
                     weight=1.0,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def handle_raw_assertion(line):
    if not line:
        return
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']
    surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
        u'{2}', u'[[' + concept2 + u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame',
               '/s/contributor/petgame/' + user]
    edge = make_edge(rel, start, end,
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surfaceText,
                     weight=1)
    yield json.dumps(edge, ensure_ascii=False)
def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        parts = line.rstrip('\n').split('\t')
        uri, start, rel, end, weight, source = parts

        # Skip the header row; the original `return` here would have
        # aborted the whole file as soon as it saw the header
        if uri == 'uri':
            continue

        edge = make_edge(rel=rel, start=start, end=end,
                         dataset=DATASET,
                         sources=[{'activity': SOURCE}],
                         license=Licenses.cc_attribution,
                         weight=WEIGHT_TABLE[weight])
        out.write(edge)
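# A sketch of the tab-separated input that handle_file above expects. The
# data row is hypothetical, and it assumes WEIGHT_TABLE maps symbolic weight
# labels (here 'high') to numeric edge weights:
#
#   uri <TAB> start <TAB> rel <TAB> end <TAB> weight <TAB> source   (header, skipped)
#   /a/x <TAB> /c/ja/猫 <TAB> /r/RelatedTo <TAB> /c/ja/動物 <TAB> high <TAB> sample-source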
def output_monolingual(self, lang, relation, term1, term2):
    if 'Wik' in term1 or 'Wik' in term2:
        return
    source = make_concept_uri(term1, lang)
    if self.pos:
        target = make_concept_uri(term2, lang, self.pos)
    else:
        target = make_concept_uri(term2, lang)
    surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
    edge = make_edge('/r/' + relation, source, target,
                     '/d/wiktionary/%s/%s' % (lang, lang),
                     license='/l/CC/By-SA',
                     sources=[SOURCE, MONOLINGUAL],
                     context='/ctx/all',
                     weight=1.5,
                     surfaceText=surfaceText)
    self.writer.write(edge)
def build_core_from_csvs(csv_files):
    weights = defaultdict(float)
    assertions = {}
    ccby = defaultdict(bool)
    for csv_file in csv_files:
        print "currently in file: " + str(csv_file)
        for line in codecs.open(csv_file, encoding='utf-8'):
            uri, rel, start, end, context, weight, sources, id, dataset = \
                line.split('\t')[:9]
            if uri != 'uri' and context == '/ctx/all':
                weight = float(weight)
                weights[uri] += float(weight)
                assertions[uri] = (rel, start, end, context, weights[uri])
                if not (dataset.startswith('/d/reverb')
                        or dataset.startswith('/d/wiktionary')
                        or dataset.startswith('/d/dbpedia')):
                    ccby[uri] = True

    print 'writing'
    writer_core = MultiWriter('assertion_totals_core')
    #writer_sa = MultiWriter('assertion_totals_sa')
    for uri, values in assertions.iteritems():
        relation, start, end, context, weight = values
        if ccby[uri]:
            license = '/l/CC/By'
            dataset = '/d/conceptnet/5/combined-core'
        else:
            license = '/l/CC/By-SA'
            dataset = '/d/conceptnet/5/combined-sa'
        edge = make_edge(relation, start, end, dataset, license,
                         ['/s/rule/sum_edges'], '/ctx/all', weight=weight)
        if license == '/l/CC/By':
            writer_core.write(edge)
        #else:
            #writer_sa.write(edge)
    writer_core.close()
def handle_file(input_file, output_file):
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()

    # language is at position [1] within the child node [0]
    lang = root[0][1].attrib['type']

    if len(root) >= 2:
        for annotation in root[1]:
            for word in strip_words(annotation.text):
                start = standardized_concept_uri('mul',
                                                 annotation.attrib['cp'])
                end = standardized_concept_uri(lang, word)
                edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
                out.write(edge)
    else:
        print("No emoji data in {!r}".format(input_file))

    out.close()
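# A sketch (abridged, with hypothetical values) of the Unicode CLDR
# annotation XML that the indexing above assumes: root[0] is <identity>,
# whose child [1] carries the language code, and root[1] holds the
# <annotation> elements keyed by code point:
#
#   <ldml>
#     <identity>
#       <version number="..."/>
#       <language type="en"/>
#     </identity>
#     <annotations>
#       <annotation cp="🐱">cat | kitten</annotation>
#     </annotations>
#   </ldml>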