Example #1
def handle_triple(line):
    items = line.split()
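    # An N-Triples line has the form "<subj> <pred> <obj> ."; the first three
    # whitespace-separated tokens must be angle-bracketed URIs.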
    for i in xrange(3):
        if not (items[i].startswith('<') and items[i].endswith('>')):
            return
        items[i] = items[i][1:-1]
    subj, pred, obj = items[:3]
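    # Skip triples with no useful semantics: FOAF homepages, generic schema
    # types, and "List of ..." articles.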
    if ('foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj
            or '#Thing' in obj or '__' in subj or '__' in obj
            or 'List_of' in subj or 'List_of' in obj):
        return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        return
    rel = normalize_uri('/r/' + webrel)

    if (pred, rel) not in sw_map_used:
        sw_map_used.add((pred, rel))
        sw_map.write({'from': pred, 'to': rel})
    if (subj, subj_concept) not in sw_map_used:
        sw_map_used.add((subj, subj_concept))
        sw_map.write({'from': subj, 'to': subj_concept})
    if (obj, obj_concept) not in sw_map_used:
        sw_map_used.add((obj, obj_concept))
        sw_map.write({'from': obj, 'to': obj_concept})

    edge = make_edge(rel,
                     subj_concept,
                     obj_concept,
                     dataset='/d/dbpedia/en',
                     license='/l/CC/By-SA',
                     sources=['/s/dbpedia/3.7'],
                     context='/ctx/all',
                     weight=0.5)
    writer.write(edge)
Example #2
def output_edge(obj, writer):
    objsource = obj['sources'][0]
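    # If either argument starts or ends with the first source (a Wikipedia
    # page title), collapse it to that title.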
    if obj['arg1'].startswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].startswith(objsource):
        obj['arg2'] = objsource
    if obj['arg1'].endswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].endswith(objsource):
        obj['arg2'] = objsource
    start = make_concept_uri(obj['arg1'], 'en')
    end = make_concept_uri(obj['arg2'], 'en')
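    # A capitalized relation name is a ConceptNet relation; anything else is
    # represented as a concept URI.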
    if obj['rel'][0] in string.uppercase:
        rel = '/r/'+obj['rel']
    else:
        rel = make_concept_uri(obj['rel'], 'en')
    if (start.startswith('/c/en/this_') or start.startswith('/c/en/these_')
            or end.startswith('/c/en/this_') or end.startswith('/c/en/these_')):
        return
    context = make_concept_uri(objsource, 'en')
    source = "/s/web/en.wikipedia.org/wiki/%s" % (objsource.replace(' ', '_'))
    rules = ['/s/rule/reverb', '/s/rule/reverb_filter_apr2012']
    surfaceText = u"[[%s]] %s [[%s]]" % (obj['arg1'], obj.get('surfaceRel', obj['rel']), obj['arg2'])
    weight = float(obj['weight']) ** 3 / 2
    edge = make_edge(rel, start, end,
                     dataset='/d/reverb/wp_frontpage',
                     license='/l/CC/By-SA',
                     sources=[source] + rules,
                     context=context,
                     surfaceText=surfaceText,
                     weight=weight)
    writer.write(edge)
Example #3
 def output_sense_translation(self, lang, foreign, german, disambiguation):
     if "Wik" in foreign or "Wik" in german:
         return
     if lang == "zh-cn":
         lang = "zh_CN"
     elif lang == "zh-tw":
         lang = "zh_TW"
     source = make_concept_uri(unicodedata.normalize("NFKC", foreign), lang)
     target = make_concept_uri(german, "de", disambiguation)
     relation = "/r/TranslationOf"
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         surfaceRel = "is [language %s] for" % lang
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (
         foreign,
         surfaceRel,
         german,
         disambiguation.split("/")[-1].replace("_", " "),
     )
     # print surfaceText
     edge = make_edge(
         relation,
         source,
         target,
         "/d/wiktionary/en/%s" % lang,
         license="/l/CC/By-SA",
         sources=[SOURCE, TRANSLATE],
         context="/ctx/all",
         weight=1.5,
         surfaceText=surfaceText,
     )
     self.writer.write(edge)
Example #4
 def output_sense_translation(self, lang, foreign, english, disambiguation):
     if 'Wik' in foreign or 'Wik' in english:
         return
     if lang == 'zh-cn':
         lang = 'zh_CN'
     elif lang == 'zh-tw':
         lang = 'zh_TW'
     source = make_concept_uri(unicodedata.normalize('NFKC', foreign), lang)
     target = make_concept_uri(english, 'en', disambiguation)
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         surfaceRel = "is [language %s] for" % lang
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (
         foreign, surfaceRel, english,
         disambiguation.split('/')[-1].replace('_', ' '))
     print surfaceText
     edge = make_edge(relation,
                      source,
                      target,
                      '/d/wiktionary/en/%s' % lang,
                      license='/l/CC/By-SA',
                      sources=[SOURCE, TRANSLATE],
                      context='/ctx/all',
                      weight=1.5,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #5
def read_jmdict(filename, outfilename):
    file = open(filename)
    outfile = codecs.open(outfilename, 'w', encoding='utf-8')
    data = file.read().decode('utf-8')
    file.close()

    xml = xmltodict.parse(data)
    entries = xml['JMdict']['entry']
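    # Prefer kanji headwords (k_ele); fall back to kana readings (r_ele) when
    # an entry has no kanji form.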
    for entry in entries:
        headwords = [word['keb'] for word in get_list(entry, 'k_ele')]
        if not headwords:
            headwords = [word['reb'] for word in get_list(entry, 'r_ele')]
        
        for sense in get_list(entry, 'sense'):
            pos = get_one(sense, 'pos')
            glosses = get_list(sense, 'gloss') + get_list(sense, 'lsource')
            for gloss in glosses:
                if '#text' in gloss:
                    text = parse_gloss(gloss['#text'])
                    if '.' not in text:
                        lang = convert_lang_code(gloss['@xml:lang'])
                        for head in headwords:
                            ja_concept = make_concept_uri(head, 'ja')
                            other_concept = make_concept_uri(text, lang)
                            if len(other_concept.split('_')) <= 5:
                                output_edge(outfile, ja_concept, other_concept)
    outfile.close()
Example #6
def handle_raw_assertion(raw_assertion):
    edges = []
    assertion, users = raw_assertion
    frame_id, concept1, concept2 = assertion
    frame = Frame.objects.get(id=int(frame_id))
    ftext = frame.text
    relation = frame.relation.name
    rel = '/r/' + relation

    surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
        u'{2}', u'[[' + concept2 + u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame']
    for user in users:
        sources.append('/s/contributor/petgame/' + user)
    edge = make_edge(rel,
                     start,
                     end,
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surfaceText,
                     weight=len(users))
    edges.append(edge)
    return edges
Example #7
def handle_raw_assertion(raw_assertion):
    line = raw_assertion.strip()
    edges = []
    if line:
        parts = line.split(', ')
        user, frame_id, concept1, concept2 = parts
        frame = Frame.objects.get(id=int(frame_id))
        ftext = frame.text
        relation = frame.relation.name
        rel = '/r/' + relation

        surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
            u'{2}', u'[[' + concept2 + u']]')
        start = make_concept_uri(concept1, 'zh_TW')
        end = make_concept_uri(concept2, 'zh_TW')
        sources = ['/s/contributor/petgame/' + user, '/s/activity/ntt/petgame']
        edge = make_edge(rel,
                         start,
                         end,
                         dataset='/d/conceptnet/4/zh',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        edges.append(edge)
    return edges
Example #8
def handle_triple(line):
    items = line.split()
    for i in xrange(3):
        if not (items[i].startswith('<') and items[i].endswith('>')):
            return
        items[i] = items[i][1:-1]
    subj, pred, obj = items[:3]
    if ('foaf/0.1/homepage' in pred or obj == 'work' or '_Feature' in obj
            or '#Thing' in obj or '__' in subj or '__' in obj
            or 'List_of' in subj or 'List_of' in obj):
        return
    subj_concept = make_concept_uri(translate_wp_url(subj), 'en')
    obj_concept = make_concept_uri(translate_wp_url(obj), 'en')
    webrel = map_web_relation(pred)
    if webrel is None:
        return
    rel = normalize_uri('/r/'+webrel)

    if (pred, rel) not in sw_map_used:
        sw_map_used.add((pred, rel))
        sw_map.write({'from': pred, 'to': rel})
    if (subj, subj_concept) not in sw_map_used:
        sw_map_used.add((subj, subj_concept))
        sw_map.write({'from': subj, 'to': subj_concept})
    if (obj, obj_concept) not in sw_map_used:
        sw_map_used.add((obj, obj_concept))
        sw_map.write({'from': obj, 'to': obj_concept})

    edge = make_edge(rel, subj_concept, obj_concept,
                     dataset='/d/dbpedia/en',
                     license='/l/CC/By-SA',
                     sources=['/s/dbpedia/3.7'],
                     context='/ctx/all',
                     weight=0.5)
    writer.write(edge)
Example #10
 def output_sense_translation(self, lang, foreign, german, disambiguation):
     if 'Wik' in foreign or 'Wik' in german:
         return
     if lang == 'zh-cn':
         lang = 'zh_CN'
     elif lang == 'zh-tw':
         lang = 'zh_TW'
     source = make_concept_uri(
       unicodedata.normalize('NFKC', foreign), lang
     )
     target = make_concept_uri(
       german, 'de', disambiguation
     )
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(lang))
     except KeyError:
         surfaceRel = "is [language %s] for" % lang
     surfaceText = "[[%s]] %s [[%s (%s)]]" % (foreign, surfaceRel, german, disambiguation.split('/')[-1].replace('_', ' '))
     #print surfaceText
     edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % lang,
                      license='/l/CC/By-SA',
                      sources=[SOURCE, TRANSLATE],
                      context='/ctx/all',
                      weight=1.5,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #11
def handle_raw_assertion(raw, writer):
    try:
        lang = raw.language_id
        assert lang == 'ja'
        if raw.frame.goodness < 1: return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return

        # build the assertion
        frame_text = raw.frame.text
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace(
            '{2}', '[[%s]]' % raw.text2)

        activity_node = normalize_uri(u'/s/site/nadya.jp')

        startText = ' '.join(JA.normalize_list(raw.text1))
        endText = ' '.join(JA.normalize_list(raw.text2))
        if startText != raw.text1:
            print raw.text1.encode('utf-8'), '=>', startText.encode('utf-8')
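        # Note: the result of this normalize_uri call is discarded; start and
        # end are rebuilt below with make_concept_uri.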
        normalize_uri('/text/' + lang + '/' + startText)
        end = normalize_uri('/text/' + lang + '/' + endText)

        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'

        if polarity > 0:
            relation = normalize_uri('/r/' + relname)
        else:
            relation = normalize_uri('/r/Not' + relname)

        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score

        sources = [([activity_node], score / 5.)]

        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation,
                             start,
                             end,
                             dataset,
                             LICENSE,
                             source_list,
                             '/ctx/all',
                             frame_text,
                             weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
Example #12
def handle_raw_assertion(raw, writer):
    try:
        lang = raw.language_id
        assert lang == 'ja'
        if raw.frame.goodness < 1: return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return

        # build the assertion
        frame_text = raw.frame.text
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace('{2}', '[[%s]]' % raw.text2)

        activity_node = normalize_uri(u'/s/site/nadya.jp')
        
        startText = ' '.join(JA.normalize_list(raw.text1))
        endText = ' '.join(JA.normalize_list(raw.text2))
        if startText != raw.text1:
            print raw.text1.encode('utf-8'), '=>',  startText.encode('utf-8')
        normalize_uri('/text/'+lang+'/'+startText)
        end = normalize_uri('/text/'+lang+'/'+endText)

        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'

        if polarity > 0:
            relation = normalize_uri('/r/'+relname)
        else:
            relation = normalize_uri('/r/Not'+relname)

        dataset = normalize_uri('/d/nadya.jp')
        score = raw.score

        sources = [([activity_node], score/5.)]

        for source_list, weight in sources:
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
            writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
Example #13
def handle_raw_assertion(line):
    if not line:
        return
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']

    surfaceText = ftext.replace(u'{1}', u'[['+concept1+u']]').replace(u'{2}', u'[['+concept2+u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user]
    edge = make_edge(rel, start, end, dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By', sources=sources,
                     surfaceText=surfaceText, weight=1)
    yield json.dumps(edge, ensure_ascii=False)
Example #14
    def output_monolingual(self, lang, relation, term1, term2):
        if 'Wik' in term1 or 'Wik' in term2:
            return
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #15
def make_concept_uri_safe(term, lang, disambiguation=None):
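    # Wiki link text may contain '|' (display text) and '#' (section anchor);
    # keep only the target title.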
    if term is None:
        raise ValueError('term must not be None')
    if lang is None:
        raise ValueError('lang must not be None')
    if '|' in term:
        term = term.split('|')[0]
    if '#' in term:
        term = term.split('#')[0]
    return make_concept_uri(term, lang, disambiguation)
Example #17
 def output_translation(self, foreign, english, locale=''):
     source = make_concept_uri(unicodedata.normalize('NFKC', foreign),
                               self.langcode + locale)
     target = make_concept_uri(english, 'en')
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(self.langcode))
     except KeyError:
         surfaceRel = "is [language %s] for" % self.langcode
     surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
     edge = make_edge(relation,
                      source,
                      target,
                      '/d/wiktionary/en/%s' % self.langcode,
                      license='/l/CC/By-SA',
                      sources=[SOURCE, INTERLINGUAL],
                      context='/ctx/all',
                      weight=1.5,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #18
def handle_raw_assertion(raw_assertion):
    edges = []
    assertion, users = raw_assertion
    frame_id, concept1, concept2 = assertion
    frame = Frame.objects.get(id=int(frame_id))
    ftext = frame.text
    relation = frame.relation.name
    rel = '/r/'+relation

    surfaceText = ftext.replace(u'{1}', u'[['+concept1+u']]').replace(u'{2}', u'[['+concept2+u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame']
    for user in users:
        sources.append('/s/contributor/petgame/'+user)
    edge = make_edge(rel, start, end, dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By', sources=sources,
                     surfaceText=surfaceText, weight=len(users))
    edges.append(edge)
    return edges
Example #19
def handle_raw_assertion(raw_assertion):
    line = raw_assertion.strip()
    edges = []
    if line:
        parts = line.split(', ')
        user, frame_id, concept1, concept2 = parts
        frame = Frame.objects.get(id=int(frame_id))
        ftext = frame.text
        relation = frame.relation.name
        rel = '/r/'+relation

        surfaceText = ftext.replace(u'{1}', u'[['+concept1+u']]').replace(u'{2}', u'[['+concept2+u']]')
        start = make_concept_uri(concept1, 'zh_TW')
        end = make_concept_uri(concept2, 'zh_TW')
        sources = ['/s/contributor/petgame/'+user, '/s/activity/ntt/petgame']
        edge = make_edge(rel, start, end, dataset='/d/conceptnet/4/zh',
                         license='/l/CC/By', sources=sources,
                         surfaceText=surfaceText, weight=1)
        edges.append(edge)
    return edges
Example #20
 def output_translation(self, foreign, english, locale=''):
     source = make_concept_uri(
       unicodedata.normalize('NFKC', foreign),
       self.langcode+locale
     )
     target = make_concept_uri(
       english, 'en'
     )
     relation = '/r/TranslationOf'
     try:
         surfaceRel = "is %s for" % (langs.english_name(self.langcode))
     except KeyError:
         surfaceRel = "is [language %s] for" % self.langcode
     surfaceText = "[[%s]] %s [[%s]]" % (foreign, surfaceRel, english)
     edge = make_edge(relation, source, target, '/d/wiktionary/en/%s' % self.langcode,
                      license='/l/CC/By-SA',
                      sources=[SOURCE, INTERLINGUAL],
                      context='/ctx/all',
                      weight=1.5,
                      surfaceText=surfaceText)
     self.writer.write(edge)
Example #21
def handle_raw_assertion(line):
    if not line:
        return
    parts = line.split(', ')
    user, frame_id, concept1, concept2 = parts
    fdata = FRAME_DATA[frame_id]
    ftext = fdata['text']
    rel = fdata['relation']

    surfaceText = ftext.replace(u'{1}', u'[[' + concept1 + u']]').replace(
        u'{2}', u'[[' + concept2 + u']]')
    start = make_concept_uri(concept1, 'zh_TW')
    end = make_concept_uri(concept2, 'zh_TW')
    sources = ['/s/activity/ptt/petgame', '/s/contributor/petgame/' + user]
    edge = make_edge(rel,
                     start,
                     end,
                     dataset='/d/conceptnet/4/zh',
                     license='/l/CC/By',
                     sources=sources,
                     surfaceText=surfaceText,
                     weight=1)
    yield json.dumps(edge, ensure_ascii=False)
Example #22
def output_edge(obj, writer):
    objsource = obj['sources'][0]
    if obj['arg1'].startswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].startswith(objsource):
        obj['arg2'] = objsource
    if obj['arg1'].endswith(objsource):
        obj['arg1'] = objsource
    if obj['arg2'].endswith(objsource):
        obj['arg2'] = objsource
    start = make_concept_uri(obj['arg1'], 'en')
    end = make_concept_uri(obj['arg2'], 'en')
    if obj['rel'][0] in string.uppercase:
        rel = '/r/' + obj['rel']
    else:
        rel = make_concept_uri(obj['rel'], 'en')
    if start.startswith('/c/en/this_') or start.startswith(
            '/c/en/these_') or end.startswith('/c/en/this_') or end.startswith(
                '/c/en/these_'):
        return
    context = make_concept_uri(objsource, 'en')
    source = "/s/web/en.wikipedia.org/wiki/%s" % (objsource.replace(' ', '_'))
    rules = ['/s/rule/reverb', '/s/rule/reverb_filter_apr2012']
    surfaceText = u"[[%s]] %s [[%s]]" % (
        obj['arg1'], obj.get('surfaceRel', obj['rel']), obj['arg2'])
    weight = float(obj['weight'])**3 / 2
    edge = make_edge(rel,
                     start,
                     end,
                     dataset='/d/reverb/wp_frontpage',
                     license='/l/CC/By-SA',
                     sources=[source] + rules,
                     context=context,
                     surfaceText=surfaceText,
                     weight=weight)
    writer.write(edge)
Example #23
    def output_monolingual(self, lang, relation, term1, term2):
        # skip Wiktionary: links and templates
        if u'ウィク' in term1 or u'ウィク' in term2:
            return
        if u'テンプレート' in term1 or u'テンプレート' in term2:
            return

        if lang in LANGUAGES_3_TO_2: # convert 3-letter code to 2-letter code
            lang = LANGUAGES_3_TO_2[lang]
        source = make_concept_uri(term1, lang)
        if self.pos:
            target = make_concept_uri(term2, lang, self.pos)
        else:
            target = make_concept_uri(term2, lang)
        surfaceText = "[[%s]] %s [[%s]]" % (term1, relation, term2)
        #print surfaceText

        edge = make_edge('/r/'+relation, source, target, '/d/wiktionary/%s/%s' % (lang, lang),
                         license='/l/CC/By-SA',
                         sources=[SOURCE, MONOLINGUAL],
                         context='/ctx/all',
                         weight=1.5,
                         surfaceText=surfaceText)
        self.writer.write(edge)
Example #24
    if right.startswith('not '):
        right = right[4:]
        relation = 'it is not'
    if relation == 'it is the opposite of':
        relation = 'it is not'

    freq = int(freq)
    orderscore = int(orderscore)
    if relation == 'about the same size as':
        relation = 'it is about the same size as'
    elif relation == 'it looks like':
        relation = 'it is related to'
    rel = mapping.get(relation)
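    # The normalized relation strings all start with "it ", which is sliced
    # off for the surface text.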
    reltext = relation[3:]
    if rel is None:
        rel = make_concept_uri(unicode(reltext), 'en')
    text = '[[%s]] %s [[%s]]' % (left, reltext, right)

    if relation == 'it is' and\
       (right.startswith('a ') or right.startswith('an ')
        or right.startswith('the ')):
        rel = '/r/IsA'

    sls = sounds_like_score(left, right)
    text_similarities.append(sls)
    if sls > 0.35:
        #print "* %s sounds like %s (%4.4f)" % (left, right, sls)
        counts['text similarity'] += 1
        similar_out.write('%4.4d\t%s' % (sls, line))
        continue
Example #25
def run_wordnet(input_dir, output_file, sw_map_file):
    mapping = {}
    labels = {}
    prefixes = {}
    glossary = {}
    synset_senses = defaultdict(list)
    synset_sense_names = defaultdict(list)
    sense_name_synsets = defaultdict(list)
    sense_synsets = defaultdict(list)

    parts_of_speech = {
        'noun': 'n',
        'verb': 'v',
        'adjective': 'a',
        'adjectivesatellite': 'a',
        'adverb': 'r',
    }

    rel_mapping = {
        'attribute': 'Attribute',
        'causes': 'Causes',
        'classifiedByRegion': 'HasContext',
        'classifiedByUsage': 'HasContext',
        'classifiedByTopic': 'HasContext',
        'entails': 'Entails',
        'hyponymOf': 'IsA',
        'instanceOf': 'InstanceOf',
        'memberMeronymOf': 'MemberOf',
        'partMeronymOf': 'PartOf',
        'sameVerbGroupAs': 'SimilarTo',
        'similarTo': 'SimilarTo',
        'substanceMeronymOf': '~MadeOf',
        'antonymOf': 'Antonym',
        'derivationallyRelated': '~DerivedFrom',
        'pertainsTo': 'PertainsTo',
        'seeAlso': 'RelatedTo',
    }

    def resolve_prefix(entry):
        prefix, name = entry.split(':')
        return prefixes[prefix] + name

    def handle_line(line):
        """
        Get the (subj, pred, obj) parts of a line, unless it's a blank line
        or a prefix definition, in which case return None.
        """
        line = line.decode('utf-8').strip()
        if not line:
            return None
        parts = line.split(None, 2)
        if parts[0] == '@prefix':
            prefix = parts[1].strip(': ')
            value = parts[2].strip('<>. ')
            prefixes[prefix] = value
            return None
        return parts[0], parts[1], parts[2].strip('. ')

    # First, get the human-readable label and gloss for every synset.
    for line in chain(
        open(input_dir + '/wordnet-synset.ttl'),
        open(input_dir + '/full/wordnet-wordsensesandwords.ttl'),
        open(input_dir + '/wordnet-glossary.ttl')
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'rdfs:label':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            labels[subj] = obj
        elif parts[1] == 'wn20schema:gloss':
            subj = resolve_prefix(parts[0])
            obj = parts[2].split('"')[1]
            glossary[subj] = obj.split(';')[0]
            while '(' in glossary[subj] and ')' in glossary[subj]:
                glossary[subj] = re.sub(r"\([^)]+\) ?", r"", glossary[subj])

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    for line in open(input_dir + '/full/wordnet-wordsense-synset-relations.ttl'):
        parts = handle_line(line)
        if parts is None:
            continue
        if parts[1] == 'wn20schema:containsWordSense':
            subj = resolve_prefix(parts[0])
            obj = resolve_prefix(parts[2].strip('. '))
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj
            sense_name = labels[obj]
            synset_sense_names[subj].append(sense_name)
            sense_name_synsets[sense_name].append(subj)

    # Assign every synset a disambiguation name.
    for synset in synset_senses:
        senses = sorted(synset_senses[synset])
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = parts_of_speech[synset_pos]
        disambig = glossary[synset].replace('/', '_')
        # TODO: take into account domains, etc.
        #
        #if len(sense_name_synsets[synset_name]) > 1:
        #    for sense in senses:
        #        sense_name = labels[sense]
        #        more_synsets = sense_name_synsets[sense_name]
        #        if len(more_synsets) == 1:
        #            disambig = sense_name
        #            break
        #    if disambig is None:
        #        disambig = glossary[synset]
        #if disambig is None:
        #    disambig = '*'
        node = make_concept_uri(synset_name, 'en', pos+'/'+disambig)
        if synset not in mapping:
            mapping[synset] = node

    # Map senses to the same nodes.
    for sense, synset in sense_synsets.items():
        mapping[sense] = mapping[synset]

    sources = ['/s/wordnet/3.0']
    writer = FlatEdgeWriter(output_file)
    sw_map = FlatEdgeWriter(sw_map_file)
    sw_map_used = set()

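    # Convert each RDF triple from the relation files into a ConceptNet edge.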
    for line in chain(
        open(input_dir + '/wordnet-attribute.ttl'),
        open(input_dir + '/wordnet-causes.ttl'),
        open(input_dir + '/wordnet-classifiedby.ttl'),
        open(input_dir + '/wordnet-entailment.ttl'),
        open(input_dir + '/wordnet-hyponym.ttl'),
        open(input_dir + '/wordnet-instances.ttl'),
        open(input_dir + '/wordnet-membermeronym.ttl'),
        open(input_dir + '/wordnet-partmeronym.ttl'),
        open(input_dir + '/wordnet-sameverbgroupas.ttl'),
        open(input_dir + '/wordnet-similarity.ttl'),
        open(input_dir + '/wordnet-substancemeronym.ttl'),
        open(input_dir + '/full/wordnet-antonym.ttl'),
        open(input_dir + '/full/wordnet-derivationallyrelated.ttl'),
        open(input_dir + '/full/wordnet-participleof.ttl'),
        open(input_dir + '/full/wordnet-pertainsto.ttl'),
        open(input_dir + '/full/wordnet-seealso.ttl'),
    ):
        parts = handle_line(line)
        if parts is None:
            continue
        web_subj = resolve_prefix(parts[0])
        web_rel = resolve_prefix(parts[1])
        web_obj = resolve_prefix(parts[2])
        subj = mapping[web_subj]
        obj = mapping[web_obj]
        pred_label = parts[1].split(':')[-1]
        if pred_label in rel_mapping:
            mapped = rel_mapping[pred_label]
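            # A '~' prefix marks relations whose ConceptNet direction is the
            # reverse of the WordNet RDF direction.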
            if mapped.startswith('~'):
                subj, obj = obj, subj
                web_subj, web_obj = web_obj, web_subj
                web_rel = web_rel.replace('meronym', 'holonym')
                mapped = mapped[1:]
            pred = '/r/'+mapped
        else:
            pred = '/r/wordnet/'+pred_label

        if (web_rel, pred) not in sw_map_used:
            sw_map.write({'from': web_rel, 'to': pred})
            sw_map_used.add((web_rel, pred))
        if (web_subj, subj) not in sw_map_used:
            sw_map.write({'from': web_subj, 'to': subj})
            sw_map_used.add((web_subj, subj))
        if (web_obj, obj) not in sw_map_used:
            sw_map.write({'from': web_obj, 'to': obj})
            sw_map_used.add((web_obj, obj))

        edge = make_edge(
            pred, subj, obj, '/d/wordnet/3.0',
            license='/l/CC/By', sources=sources,
            context='/ctx/all', weight=2.0
        )
        writer.write(edge)

    writer.close()
    sw_map.close()
Example #26
def build_start(parts_dict):
    lang = parts_dict['lang']
    startText = parts_dict["startText"]
    start = make_concept_uri(startText, lang)
    return start
Example #27
def build_end(raw_assertion):
    lang = raw_assertion.language_id
    endText = ' '.join(JA.normalize_list(raw_assertion.text2))
    end = make_concept_uri(endText, lang)
    return end
def build_start(raw_assertion):
    lang = raw_assertion.language_id
    startText = raw_assertion.text1
    start = make_concept_uri(startText, lang)
    return start
def build_end(raw_assertion):
    lang = raw_assertion.language_id
    endText = raw_assertion.text2
    end = make_concept_uri(endText, lang)
    return end
def build_start(parts_dict):
    lang = parts_dict['lang']
    startText = ' '.join(JA.normalize_list(parts_dict["startText"]))
    start = make_concept_uri(startText, lang)
    return start
Example #31
assertions = {}
for assertion in assertiondata:
    obj = assertion['fields']
    frame = frames[obj['frame']]
    frametext = frame['text']
    userinfo = users[obj['author']]
    username = userinfo['fields']['username']
    userlocale = userinfo['fields']['ccode'].lower()
    if userlocale:
        userlocale += '/'
    sources = [
        "/s/contributor/globalmind/%s%s" % (userlocale, username),
        "/s/activity/globalmind/assert"
    ]
    lang = lang_codes[obj['lcode']]
    start = make_concept_uri(obj['node1'], lang)
    end = make_concept_uri(obj['node2'], lang)
    rel = '/r/'+rel_change.get(frame['relation'], frame['relation'])
    
    # fix messy english "around in"
    if ' around ' in frametext:
        if obj['node2'].startswith('in '):
            frametext = frametext.replace(' around ', ' in ')
            obj['node2'] = obj['node2'][3:]
        else:
            frametext = frametext.replace(' around ', ' near ')
            rel = '/r/LocatedNear'
    
    # fix more awkward English. I wonder how bad the other languages are.
    frametext = frametext.replace('hits your head', 'comes to mind')
    frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')
Example #32
def handle_raw_assertion(raw, writer):
    try:
        lang = raw.language_id
        if raw.frame.goodness < 1: return
        if lang.startswith('zh'): return
        polarity = raw.frame.frequency.value
        activity = raw.sentence.activity.name
        if 'rubycommons' in activity: return

        # build the assertion
        frame_text = raw.frame.text
        if polarity > 0:
            frame_text = frame_text.replace('{%}', '')
        else:
            frame_text = frame_text.replace('{%}', 'not')
        frame_text = frame_text.replace('{1}', '[[%s]]' % raw.text1).replace('{2}', '[[%s]]' % raw.text2)

        creator_node = normalize_uri(u'/s/contributor/omcs/'+raw.creator.username)
        activity_node = normalize_uri(u'/s/activity/omcs/'+activity)
        
        startText = raw.text1
        endText = raw.text2
        normalize_uri('/text/'+lang+'/'+raw.text1)
        end = normalize_uri('/text/'+lang+'/'+raw.text2)

        relname = raw.frame.relation.name
        if relname == 'ConceptuallyRelatedTo':
            relname = 'RelatedTo'

        if polarity > 0:
            relation = normalize_uri('/r/'+relname)
        else:
            relation = normalize_uri('/r/Not'+relname)

        dataset = normalize_uri('/d/conceptnet/4/'+lang)

        sources = [([creator_node, activity_node], 1)]

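        # Each vote adds another (source_list, weight) pair; the loop below
        # writes one edge per pair.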
        for vote in raw.votes.all():
            sources.append(([normalize_uri('/s/contributor/omcs/'+vote.user.username),
                             normalize_uri(u'/s/activity/omcs/vote')], vote.vote))
        
        for source_list, weight in sources:
            bad = False
            if 'commons2_reject' in ' '.join(source_list):
                weight = -1
            start = make_concept_uri(startText, lang)
            end = make_concept_uri(endText, lang)
            if 'bedume' in ' '.join(source_list):
                for flagged in BEDUME_FLAGGED_CONCEPTS + BEDUME_FLAGGED_PLACES:
                    check = '/'+flagged.replace(' ', '_')
                    if start.endswith(check) or end.endswith(check):
                        bad = True
                        print "flagged:", str(raw)
                        break
            if not bad:
                edge = make_edge(relation, start, end, dataset, LICENSE, source_list, '/ctx/all', frame_text, weight=weight)
                writer.write(edge)
    except Exception:
        import traceback
        traceback.print_exc()
Example #33
def run_verbosity(infile, outfile):
    maxscore = 0
    count = 0
    counts = defaultdict(int)
    text_similarities = []

    sources = ['/s/site/verbosity']

    writer = FlatEdgeWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            counts['blank'] += 1
            continue
        left, relation, right, freq, orderscore = parts[:5]


        # catch bad stuff
        flagged = False

        for rword in right.split():
            if bad_regex_no_biscuit.match(rword):
                flagged = True
                break
        if flagged:
            #print "FLAGGED:", right
            counts['flag word'] += 1
            continue
        if len(right) < 3:
            counts['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            counts['letter'] += 1
            continue
        if right.startswith('add') or right.startswith('delete') or right.startswith('remove'):
            counts['flag word'] += 1
            continue

        freq = int(freq)
        orderscore = int(orderscore)
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/Antonym'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

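        # Multi-word clues also generate an edge for each individual word,
        # tagged with the split_words rule.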
        rightwords = [right]
        if ' ' in right:
            rightwords.extend(right.split(' '))

        sls = sounds_like_score(left, right)
        text_similarities.append(sls)
        if sls > 0.35:
            counts['text similarity'] += 1
            continue
        
        for i, rightword in enumerate(rightwords):
            edge_sources = list(sources)
            if i > 0:
                edge_sources.append('/s/rule/split_words')
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)
            
            sls = sounds_like_score(left, rightword)
            text_similarities.append(sls)
            if sls > 0.35:
                counts['text similarity'] += 1
                continue
            
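            # Combine the frequency, the order score, and the sounds-like
            # penalty into a single score; non-positive scores are dropped.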
            score = (freq*2-1) * (1000-orderscore) * (1-sls) / 1000
            if score <= 0:
                counts['low score'] += 1
                continue

            #weight = math.log(1 + score/10.0) / math.log(2)
            weight = score / 100.0

            count += 1
            counts['success'] += 1
            
            leftc = make_concept_uri(unicode(left), 'en')
            rightc = make_concept_uri(unicode(rightword), 'en')
            edge = make_edge(rel, leftc, rightc, '/d/verbosity',
                             '/l/CC/By', sources, surfaceText=text,
                             weight=weight)
            writer.write(edge)
Example #34
import os
import codecs
from conceptnet5.nodes import make_concept_uri
from conceptnet5.edges import make_edge, MultiWriter

path = "./raw_data/"
sparse_pieces = []
for filename in os.listdir(path):
    if filename.startswith('conceptnet_zh_'):
        print filename
        writer = MultiWriter(filename.split('.')[0])
        for line in codecs.open(path + filename, encoding='utf-8', errors='replace'):
            line = line.strip()
            if line:
                parts = line.split(', ')
                user, frame_id, concept1, concept2 = parts
                frame = Frame.objects.get(id=int(frame_id))
                ftext = frame.text
                relation = frame.relation.name
                rel = '/r/'+relation

                surfaceText = ftext.replace(u'{1}', u'[['+concept1+u']]').replace(u'{2}', u'[['+concept2+u']]')
                start = make_concept_uri(concept1, 'zh_TW')
                end = make_concept_uri(concept2, 'zh_TW')
                sources = ['/s/contributor/petgame/'+user, '/s/activity/ntt/petgame']
                edge = make_edge(rel, start, end, dataset='/d/conceptnet/4/zh',
                                 license='/l/CC/By', sources=sources,
                                 surfaceText=surfaceText, weight=1)
                writer.write(edge)
        writer.close()

Example #35
    if right.startswith('not '):
        right = right[4:]
        relation = 'it is not'
    if relation == 'it is the opposite of':
        relation = 'it is not'

    freq = int(freq)
    orderscore = int(orderscore)
    if relation == 'about the same size as':
        relation = 'it is about the same size as'
    elif relation == 'it looks like':
        relation = 'it is related to'
    rel = mapping.get(relation)
    reltext = relation[3:]
    if rel is None:
        rel = make_concept_uri(reltext, 'en')
    text = '[[%s]] %s [[%s]]' % (left, reltext, right)
    
    if relation == 'it is' and\
       (right.startswith('a ') or right.startswith('an ')
        or right.startswith('the ')):
        rel = '/r/IsA'
    
    sls = sounds_like_score(left, right)
    text_similarities.append(sls)
    if sls > 0.35:
        #print "* %s sounds like %s (%4.4f)" % (left, right, sls)
        counts['text similarity'] += 1
        similar_out.write('%4.4d\t%s' % (sls, line))
        continue
    
Example #36
def build_end(parts_dict):
    lang = parts_dict['lang']
    endText = parts_dict["endText"]
    end = make_concept_uri(endText, lang)
    return end
Example #37
        writer = MultiWriter(filename.split('.')[0])
        for line in codecs.open(filename, encoding='utf-8', errors='replace'):
            line = line.strip()
            if line:
                parts = line.split(', ')
                user, frame_id, concept1, concept2 = parts
                frame = Frame.objects.get(id=int(frame_id))
                ftext = frame.text
                relation = frame.relation.name
                rel = '/r/' + relation

                surfaceText = ftext.replace(u'{1}',
                                            u'[[' + concept1 + u']]').replace(
                                                u'{2}',
                                                u'[[' + concept2 + u']]')
                start = make_concept_uri(concept1, 'zh_TW')
                end = make_concept_uri(concept2, 'zh_TW')
                sources = [
                    '/s/contributor/petgame/' + user, '/s/activity/ntt/petgame'
                ]
                edge = make_edge(rel,
                                 start,
                                 end,
                                 dataset='/d/conceptnet/4/zh',
                                 license='/l/CC/By',
                                 sources=sources,
                                 surfaceText=surfaceText,
                                 weight=1)
                writer.write(edge)
        writer.close()
Example #38
def search():
    keyword = request.form.get('keyword')
    lang = request.form.get('language')
    return redirect(site + web_root + make_concept_uri(keyword, lang))
def build_end(parts_dict):
    lang = parts_dict['lang']
    endText = ' '.join(JA.normalize_list(parts_dict["endText"]))
    end = make_concept_uri(endText, lang)
    return end
def build_start(raw_assertion):
    lang = raw_assertion.language_id
    startText = raw_assertion.text1
    start = make_concept_uri(startText, lang)
    return start
Example #41
    pos = parts_of_speech[synset_pos]
    disambig = glossary[synset].replace('/', '_')
    # TODO: take into account domains, etc.
    #
    #if len(sense_name_synsets[synset_name]) > 1:
    #    for sense in senses:
    #        sense_name = labels[sense]
    #        more_synsets = sense_name_synsets[sense_name]
    #        if len(more_synsets) == 1:
    #            disambig = sense_name
    #            break
    #    if disambig is None:
    #        disambig = glossary[synset]
    #if disambig is None:
    #    disambig = '*'
    node = make_concept_uri(synset_name, 'en', pos+'/'+disambig)
    if synset not in mapping:
        mapping[synset] = node

# Map senses to the same nodes.
for sense, synset in sense_synsets.items():
    mapping[sense] = mapping[synset]

sources = ['/s/wordnet/3.0']
writer = MultiWriter('wordnet3')
sw_map = FlatEdgeWriter('data/sw/wordnet30.map.json')
sw_map_used = set()

for line in chain(
    open('raw_data/wordnet-attribute.ttl'),
    open('raw_data/wordnet-causes.ttl'),
def build_end(raw_assertion):
    lang = raw_assertion.language_id
    endText = raw_assertion.text2
    end = make_concept_uri(endText, lang)
    return end
Example #44
def build_start(raw_assertion):
    lang = raw_assertion.language_id
    startText = ' '.join(JA.normalize_list(raw_assertion.text1))
    start = make_concept_uri(startText, lang)
    return start
Example #45
def build_start(parts_dict):
    lang = parts_dict["lang"]
    startText = " ".join(JA.normalize_list(parts_dict["startText"]))
    start = make_concept_uri(startText, lang)
    return start
Example #49
def build_end(raw_assertion):
    lang = raw_assertion.language_id
    endText = ' '.join(JA.normalize_list(raw_assertion.text2))
    end = make_concept_uri(endText, lang)
    return end
Example #50
def build_end(parts_dict):
    lang = parts_dict["lang"]
    endText = " ".join(JA.normalize_list(parts_dict["endText"]))
    end = make_concept_uri(endText, lang)
    return end
Example #51
def search():
    keyword = request.form.get('keyword')
    lang = request.form.get('language')
    return redirect(site + web_root + make_concept_uri(keyword, lang))
Example #53
def build_from_dir(dirname):
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            userlocale += '/'
        sources = [
            "/s/contributor/globalmind/%s%s" % (userlocale, username),
            "/s/activity/globalmind/assert"
        ]
        lang = lang_codes[obj['lcode']]
        obj['node1'] = unicode(obj['node1'])
        obj['node2'] = unicode(obj['node2'])
        start = make_concept_uri(obj['node1'], lang)
        end = make_concept_uri(obj['node2'], lang)
        rel = '/r/'+rel_change.get(frame['relation'], frame['relation'])
        
        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'
        
        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        yield json.dumps(edge, ensure_ascii=False)
        assertions[assertion['pk']] = edge

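    # Translations pair two existing assertions, so these edges connect
    # assertion URIs rather than concept URIs.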
    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = lang_names[get_lang(assertion1)]
        lang2 = lang_names[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']
        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            userlocale += '/'
        sources = [
            "/s/contributor/globalmind/%s%s" % (userlocale, username),
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        yield json.dumps(edge, ensure_ascii=False)