def handle_file(input_filename, output_file):
    builder = CN4Builder(weight=0.05)
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        # Get a line from the file
        for new_obj in handle_line(line, builder):
            out.write(new_obj)

def subwords_to_edges(language, input, output):
    """
    Morfessor hypothesizes ways to break words into sub-word chunks. Produce
    edges from these sub-words that can be used in retrofitting.
    """
    writer = MsgpackStreamWriter(output)
    for line in input:
        line = line.rstrip()
        if not line or line.startswith('#'):
            continue

        # Remove the unnecessary count ("1 ") from the start of each line
        line = line.split(' ', 1)[1]
        chunks = line.split(' + ')

        # Strip a possible trailing underscore, which would particularly show
        # up in the way we segment ATOMIC_SPACE_LANGUAGES (Vietnamese)
        full_text = ''.join(chunks).strip('_')
        end = join_uri('c', language, full_text)
        for chunk in chunks:
            if chunk != '_':
                start = join_uri('x', language, chunk.strip('_'))
                edge = make_edge(
                    '/r/SubwordOf',
                    start,
                    end,
                    dataset='/d/morphology',
                    license=Licenses.cc_attribution,
                    sources=MORPH_SOURCES,
                    weight=0.01,
                )
                writer.write(edge)
    writer.close()

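# The input that subwords_to_edges() consumes is Morfessor's segmentation
# output, one word per line: a count, then the chunks joined by ' + ', with a
# possible trailing underscore. A minimal sketch with invented lines (the
# words, counts, and output path are hypothetical, only for illustration):
import io

morfessor_output = io.StringIO(
    "# lines starting with '#' and blank lines are skipped\n"
    "1 cat + s\n"
    "1 walk + ing_\n"
)
# Each chunk yields one /r/SubwordOf edge pointing at the whole word, e.g.
#   /x/en/cat  -> /c/en/cats      /x/en/s   -> /c/en/cats
#   /x/en/walk -> /c/en/walking   /x/en/ing -> /c/en/walking
# subwords_to_edges('en', morfessor_output, 'subwords.msgpack')
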
def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in codecs.open(input_filename, encoding='utf-8'):
        line = line.strip()
        if line:
            for new_obj in handle_raw_assertion(line):
                out.write(new_obj)

def handle_file(input_filename, output_file):
    builder = CN4Builder()
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        # Get a line from the file
        for new_obj in handle_line(line, builder):
            out.write(new_obj)

def json_to_msgpack(input_filename, output_filename):
    """
    Convert a JSON stream (with one object per line) to a msgpack stream.
    """
    out_stream = MsgpackStreamWriter(output_filename)
    for obj in read_json_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()

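# For reference, the "JSON stream" that json_to_msgpack() reads is just one
# JSON object per line, with no enclosing array. A made-up two-line example;
# the file names and edge values are illustrative only:
import json

with open('edges.jsons', 'w', encoding='utf-8') as f:
    f.write(json.dumps({'rel': '/r/RelatedTo', 'start': '/c/en/a', 'end': '/c/en/b'}) + '\n')
    f.write(json.dumps({'rel': '/r/Synonym', 'start': '/c/en/c', 'end': '/c/en/d'}) + '\n')
# json_to_msgpack('edges.jsons', 'edges.msgpack')
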
def handle_file(input_file, output_file):
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    lang = root[0][1].attrib['type']

    for annotation in root[1]:
        for word in strip_words(annotation.text):
            start = standardized_concept_uri('mul', annotation.attrib['cp'])
            end = standardized_concept_uri(lang, word)
            edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
            out.write(edge)

def test_msgpack_to_json():
    with TemporaryDirectory(prefix='conceptnet-test') as tmpdir:
        json_path = os.path.join(tmpdir, 'test.jsons')
        msgpack_path = os.path.join(tmpdir, 'test.msgpack')

        # Write msgpack data to the msgpack file, convert it to JSON, and
        # check that the JSON stream round-trips the original items.
        writer = MsgpackStreamWriter(msgpack_path)
        for item in DATA:
            writer.write(item)
        writer.close()

        msgpack_to_json(msgpack_path, json_path)
        reader = read_json_stream(json_path)
        for known, read in zip_longest(DATA, reader):
            eq_(known, read)

def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of ConceptNet
    edges and a file of mappings between the Semantic Web and ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels:
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif rel_name == 'sameAs' and web_subj in labels and web_obj.startswith('http://umbel.org/'):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()

def run_wiktionary(input_file, output_file, titledb=None, language='en',
                   verbosity=0, logger=None):
    if titledb is None:
        titledb = os.path.dirname(input_file) + '/titles.db'

    trace = (verbosity >= 2)
    sem = SEMANTICS[language](language, titledb=titledb, trace=trace, logger=logger)
    output = MsgpackStreamWriter(output_file)
    for structure in read_msgpack_stream(input_file):
        for edge in sem.parse_structured_entry(structure):
            if verbosity >= 1:
                print(edge['rel'], edge['start'], edge['end'])
            output.write(edge)

def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        parts = line.rstrip('\n').split('\t')
        uri, start, rel, end, weight, source = parts
        if uri == 'uri':
            continue
        edge = make_edge(
            rel=rel,
            start=start,
            end=end,
            dataset=DATASET,
            sources=[{'activity': SOURCE}],
            license=Licenses.cc_attribution,
            weight=WEIGHT_TABLE[weight],
        )
        out.write(edge)

def handle_file(input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for line in open(input_filename, encoding='utf-8'):
        parts = line.rstrip('\n').split('\t')
        uri, start, rel, end, weight, source = parts
        if uri == 'uri':
            return
        edge = make_edge(
            rel=rel,
            start=start,
            end=end,
            dataset=DATASET,
            sources=[{'activity': SOURCE}],
            license=Licenses.cc_attribution,
            weight=WEIGHT_TABLE[weight],
        )
        out.write(edge)

def handle_file(input_file, output_file):
    tree = ET.parse(input_file)
    out = MsgpackStreamWriter(output_file)
    root = tree.getroot()
    lang = root[0][1].attrib['type']  # language is at position [1] within the child node [0]

    if len(root) >= 2:
        for annotation in root[1]:
            for word in strip_words(annotation.text):
                start = standardized_concept_uri('mul', annotation.attrib['cp'])
                end = standardized_concept_uri(lang, word)
                edge = make_edge(REL, start, end, DATASET, LICENSE, SOURCE)
                out.write(edge)
    else:
        print("No emoji data in {!r}".format(input_file))

    out.close()

def json_to_msgpack(input_filename, output_filename):
    out_stream = MsgpackStreamWriter(output_filename)
    for obj in read_json_stream(input_filename):
        out_stream.write(obj)
    out_stream.close()

def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        if (
            'Category:' in subj_url or 'File:' in subj_url or
            'List_of' in subj_url or '__' in subj_url or
            'Template:' in subj_url
        ):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge(
                        '/r/IsA', subj_concept, obj_concept,
                        dataset='/d/dbpedia/en',
                        license=Licenses.cc_sharealike,
                        sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                        weight=0.5,
                        surfaceStart=url_to_label(subj['url']),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                    out.write(edge)
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge(
                            '/r/ExternalURL', subj_concept, other_url,
                            dataset='/d/dbpedia/en',
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                            weight=1.0
                        )
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL', other_concept, other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=1.0
                            )
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym', other_concept, subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url)
                            )
                            out.write(edge)

    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (
            subj_concept and obj_concept and
            subj['url'] in mapped_urls and obj['url'] in mapped_urls
        ):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(
                    rel, subj_concept, obj_concept,
                    dataset='/d/dbpedia/en',
                    license=Licenses.cc_sharealike,
                    sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                    weight=0.5,
                    surfaceStart=url_to_label(subj['url']),
                    surfaceEnd=url_to_label(obj['url'])
                )
                out.write(edge)

    out.close()

def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate it into
    ConceptNet 5 edges, and write those edges to disk using a
    MsgpackStreamWriter.
    """
    out = MsgpackStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # As far as I can tell, GlobalMind used the same namespace of
        # usernames as the original Open Mind.
        user_source = "/s/contributor/omcs/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = normalized_concept_uri(lang, obj['node1'])
        end = normalized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # fix messy english "around in"
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license='/l/CC/By',
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)

        # Avoid duplication with the ConceptNet reader, but still save every
        # edge so that we can handle translations.
        if username != 'openmind':
            out.write(edge)

        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(rel, start, end,
                         dataset='/d/globalmind',
                         license=Licenses.cc_attribution,
                         sources=sources,
                         surfaceText=surfaceText,
                         weight=1)
        out.write(edge)

def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.
    """
    db = sqlite3.connect(db_file)
    out = MsgpackStreamWriter(output_file)
    for heading, items in segmented_stream(input_file):
        language = heading['language']
        title = heading['title']
        dataset = '/d/wiktionary/{}'.format(language)
        url_title = heading['title'].replace(' ', '_')
        web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title)
        web_source = '/s/resource/wiktionary/{}'.format(language)

        source = {
            'contributor': web_source,
            'process': PARSER_RULE
        }

        # Scan through the 'from' items, such as the start nodes of
        # translations, looking for distinct etymologies. If we get more than
        # one etymology for a language, we need to distinguish them as
        # different senses in that language.
        all_etyms = {
            (item['from']['language'], etym_label(language, item['from']))
            for item in items
            if 'language' in item['from'] and item['from']['text'] == title
            and etym_label(language, item['from']) is not None
        }
        word_languages = {wlang for (wlang, _) in all_etyms}
        for wlang in sorted(word_languages):
            if valid_language(wlang):
                cpage = standardized_concept_uri(wlang, title)
                ld_edge = make_edge(
                    '/r/ExternalURL', cpage, web_url,
                    dataset=dataset, weight=0.25, sources=[source],
                    license=Licenses.cc_sharealike
                )
                out.write(ld_edge)
        etym_to_translation_sense = {}
        language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
        polysemous_languages = {
            lang for lang in language_etym_counts
            if language_etym_counts[lang] > 1
        }

        for item in items:
            tfrom = item['from']
            tto = item['to']
            assumed_languages = [language]
            lang1 = tfrom.get('language')
            lang2 = tto.get('language')
            if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                assumed_languages.append(lang1)
            if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                assumed_languages.append(lang2)

            cfrom = transform_term(
                language, tfrom, assumed_languages, db,
                use_etyms=(lang1 in polysemous_languages)
            )
            cpage = cfrom
            cto = transform_term(
                language, tto, assumed_languages, db,
                use_etyms=(lang2 in polysemous_languages)
            )

            if cfrom is None or cto is None:
                continue
            if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                continue

            rel, switch = transform_relation(item['rel'])
            if rel is None:
                continue
            if switch:
                cfrom, cto = cto, cfrom

            # When translations are separated by sense, use only the first
            # sense we see for each etymology. That will have the most
            # representative translations.
            if item['rel'] == 'translation':
                etym_key = (tfrom['language'], etym_label(language, tfrom))
                sense = tfrom.get('sense', '')
                if etym_key in etym_to_translation_sense:
                    if etym_to_translation_sense[etym_key] != sense:
                        continue
                else:
                    etym_to_translation_sense[etym_key] = sense

            weight = 1.
            if rel == '/r/EtymologicallyRelatedTo':
                weight = 0.25
            edge = make_edge(
                rel, cfrom, cto, dataset=dataset, weight=weight,
                sources=[source],
                surfaceStart=tfrom['text'], surfaceEnd=tto['text'],
                license=Licenses.cc_sharealike
            )
            out.write(edge)

    out.close()

def transform_file(self, input_filename, output_file):
    out = MsgpackStreamWriter(output_file)
    for obj in read_json_stream(input_filename):
        for new_obj in self.handle_assertion(obj):
            out.write(new_obj)

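# All of the readers above share the same output pattern: open a
# MsgpackStreamWriter, call write() once per edge dictionary, and close it.
# A minimal round-trip sketch of that pattern follows; the import path, file
# name, and edge values are assumptions for illustration, not taken from any
# reader above.
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter, read_msgpack_stream

writer = MsgpackStreamWriter('example_edges.msgpack')
writer.write({'rel': '/r/RelatedTo', 'start': '/c/en/example', 'end': '/c/en/sample'})
writer.write({'rel': '/r/Synonym', 'start': '/c/en/test', 'end': '/c/en/trial'})
writer.close()

# Records stream back in the order they were written.
for edge in read_msgpack_stream('example_edges.msgpack'):
    print(edge['rel'], edge['start'], edge['end'])
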
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        objtext = obj_dict.get("text")

        relname = resource_name(rel)
        if relname == "label":
            if obj_dict["lang"] == "en":
                synset_labels[subj].append(objtext)
        elif relname == "sameAs":
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name of
                # a disambiguation category. RDF WordNet 3.1 assigns synsets
                # a number of labels in no particular order, making it hard to
                # determine from 3.1 alone what to name a category.
                objname = resource_name(obj)
                parts = objname.split("-")[1:-2]

                # Handle missing apostrophes
                label = (
                    "-".join(parts)
                    .replace("_s_", "'s_")
                    .replace("_s-", "'s_")
                    .replace("s__", "s'_")
                    .replace("s_-", "s'-")
                    .replace("_", " ")
                )
                synset_canonical_labels[subj] = label
        elif relname == "domain_category":
            synset_categories[subj] = obj
        elif relname == "lexical_domain":
            target = resource_name(obj)
            if "." in target:
                domain = target.split(".")[1]
                synset_domains[subj] = domain
        elif relname == "gloss":
            synset_glosses[subj] = objtext
        elif relname == "reference":
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels
            or synset_canonical_labels[synset][0].isupper()
            and synset_domains.get(synset) == "person"
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split("#")[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in "nvarsp", synset
        if pos == "s":
            pos = "a"
        elif pos == "p":
            pos = "-"

        if category_name in ("pert", "all", "tops"):
            category_name = None

        synset_disambig[synset] = (pos, category_name)
        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri("en", canon, pos, "wn", category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri(
                    "en", label, pos, "wn", category_name
                )
                rel_uri = "/r/Synonym"
                surface = "[[{0}]] is a synonym of [[{1}]]".format(label, canon)
                edge = make_edge(
                    rel_uri,
                    other_uri,
                    canon_uri,
                    dataset=DATASET,
                    surfaceText=surface,
                    license=Licenses.cc_attribution,
                    sources=[SOURCE],
                    weight=2.0,
                )
                out.write(edge)

    quads = parse_nquads(open(input_file, encoding="utf-8"))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if "url" not in subj_dict or "url" not in rel_dict:
            continue
        subj = subj_dict["url"]
        rel = rel_dict["url"]
        obj = obj_dict.get("url")
        relname = resource_name(rel)
        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == "hypernym" and pos == "v":
                relname = "hypernym-v"
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith("~"):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = "/r/" + rel
            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict["text"]
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or "!" in text:
                    continue
                lang = obj_dict["lang"]
                obj_uri = standardized_concept_uri(lang, text, pos, "wn", sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split("/")[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format("[[%s]]" % subj_label, "[[%s]]" % obj_label)

            edge = make_edge(
                rel_uri,
                subj_uri,
                obj_uri,
                dataset=DATASET,
                surfaceText=surface,
                license=license,
                sources=[SOURCE],
                weight=2.0,
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            "/r/ExternalURL",
            cn_uri,
            wn_url,
            dataset=DATASET,
            license=Licenses.cc_sharealike,
            sources=[SOURCE],
            weight=1.0,
        )
        out.write(edge)

    out.close()

def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name('en', name) != standardized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri('en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of
                            # speech, so use '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri('en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

def handle_file(filename, output_file):
    out = MsgpackStreamWriter(output_file)

    for line in gzip.open(filename, 'rt'):
        # skip the intro information
        if line.startswith('#'):
            continue

        # parse the data to extract the traditional form, simplified form and the English definition
        traditional, simplified, definitions = re.match(LINE_REGEX, line).groups()

        # Make an edge between the traditional and simplified version
        edge = make_edge(
            rel='/r/Synonym',
            start=standardized_concept_uri('zh-Hant', traditional),
            end=standardized_concept_uri('zh-Hans', simplified),
            dataset=DATASET,
            license=LICENSE,
            sources=SOURCE,
        )
        out.write(edge)

        for definition in re.split(DEFINITIONS_REGEX, definitions):
            # Skip pronunciation information
            if 'Taiwan pr.' in definition or 'also pr.' in definition:
                continue

            # Check if the definition matches a person syntax, i.e. includes a date range
            person_match = re.match(DATE_RANGE_REGEX, definition)
            if person_match:
                persons = extract_person(person_match)
                for person in persons:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('en', person),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Check if a word is a measure word
            if definition.startswith('CL:'):
                related_words = extract_measure_words(definition)
                for word in related_words:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', word),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            # Remove clarifying information in parentheses
            definition = PAREN_REGEX.sub('', definition)

            # Handle variants/word forms and abbreviations
            if re.match(VARIANT_REGEX, definition) or re.match(ABBR_REGEX, definition):
                variants = extract_han_characters(definition)
                for variant in variants:
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                    edge = make_edge(
                        rel='/r/Synonym',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', variant),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                continue

            if re.match(SEE_ALSO_REGEX, definition):
                references = extract_han_characters(definition)
                for reference in references:
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hant', traditional),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)
                    edge = make_edge(
                        rel='/r/RelatedTo',
                        start=standardized_concept_uri('zh-Hans', simplified),
                        end=standardized_concept_uri('zh', reference),
                        dataset=DATASET,
                        license=LICENSE,
                        sources=SOURCE,
                    )
                    out.write(edge)

            # Remove 'lit.', 'fig.'
            definition = LIT_FIG_REGEX.sub('', definition)

            # Expand sth and sb
            definition = SB_REGEX.sub('someone', definition)
            definition = STH_REGEX.sub('something', definition)

            # Additional cleanups
            definition = remove_reference_syntax(definition)
            definition = remove_additional_info(definition)

            # Skip long definitions and make an edge out of remaining information
            if len(definition.split()) < 6:
                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hant', traditional),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)
                edge = make_edge(
                    rel='/r/Synonym',
                    start=standardized_concept_uri('zh-Hans', simplified),
                    end=standardized_concept_uri('en', definition),
                    dataset=DATASET,
                    license=LICENSE,
                    sources=SOURCE,
                )
                out.write(edge)

def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}

    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]
        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl', 'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their
                # synsets first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of
                    # ConceptNet relations. Change the word 'meronym' to
                    # 'holonym' if necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj, dataset='/d/wordnet/3.0',
                    license='/l/CC/By', sources=SOURCE, weight=2.0
                )
                out.write(edge)

def handle_file(infile, outfile):
    count = 0
    outcomes = defaultdict(int)

    writer = MsgpackStreamWriter(outfile)

    for line in open(infile):
        parts = line.strip().split('\t')
        if not parts:
            outcomes['blank'] += 1
            continue

        # The first 5 columns of the Verbosity output file are:
        #
        #   left: the word being clued
        #   relation: the relation between the word and the clue that the
        #     clue-giver chose, in a form such as "it is part of"
        #   right: the one or two words used as the clue
        #   freq: the number of different times this clue was given
        #   orderscore: the average position in the list of clues
        #
        # 'orderscore' is a number from 0 to 999, representing the average
        # quantile of its position in the list of clues. (It's like a
        # percentile, except there are 1000 of them, not 100.)
        #
        # A clue that's always given first has an orderscore of 0. A clue
        # that always appears halfway through the list has an orderscore of
        # 500.
        #
        # This may seem like a strange thing to measure, and I didn't come up
        # with it, but it actually turns out to be somewhat informative.
        # A clue with an orderscore of 0 is probably a good common-sense
        # relation, representing the first thing that comes to mind. A clue
        # with a high order score may be a move of desperation after several
        # other clues have failed. It causes the guesser to get the answer
        # soon afterward, but perhaps because it's a "cheating" move. So,
        # low orderscores represent better common sense relations.
        left, relation, right, freq, orderscore = parts[:5]
        freq = int(freq)
        orderscore = int(orderscore)

        # Test each word
        flagged = False
        for rword in right.split():
            if BAD_CLUE_REGEX.match(rword):
                flagged = True
                break
        if flagged:
            outcomes['flag word'] += 1
            continue
        if len(right) < 3:
            outcomes['clue too short'] += 1
            continue
        if len(right.split()[-1]) == 1:
            outcomes['letter'] += 1
            continue

        # The Verbosity interface and gameplay did not particularly encourage
        # players to choose an appropriate relation. In practice, players seem
        # to have used them all interchangeably, except for the negative
        # relation "it is the opposite of", expressing /r/Antonym.
        #
        # Another way that players expressed negative relations was to use
        # 'not' as the first word of their clue; we make that into an instance
        # of /r/DistinctFrom.
        #
        # In other cases, the relation is a positive relation, so we replace it
        # with the most general positive relation, /r/RelatedTo.
        rel = '/r/RelatedTo'
        reltext = 'is related to'
        if right.startswith('not '):
            rel = '/r/DistinctFrom'
            right = right[4:]
            reltext = 'is not'
        if relation == 'it is the opposite of':
            rel = '/r/Antonym'
            reltext = 'is the opposite of'

        # The "sounds-like score" determines whether this clue seems to be a
        # pun or rhyme, rather than an actual common-sense relationship. If
        # the sounds-like score is over 0.35, skip the assertion.
        sls = sounds_like_score(left, right)
        if sls > 0.35:
            outcomes['text similarity'] += 1
            continue

        # Calculate a score for the assertion:
        #
        #   - The number of times it's been used as a clue
        #   - ...with a linear penalty for a high sounds-like score
        #   - ...and a linear penalty for high orderscores
        #
        # The penalties are multiplicative factors from 0 to 1, which decrease
        # linearly as the relevant penalties increase. If a clue is given N
        # times, with a sounds-like score of 0 and an orderscore of 0, it will
        # get an overall score of 2N - 1. This is a formula we should probably
        # revisit.
        #
        # The weight is the square root of the score, divided by 10. All
        # divisions are floating point.
        score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)
        if score <= 1.:
            outcomes['low score'] += 1
            continue

        weight = score ** .5 / 10

        # If the clue on the right is a two-word phrase, we make additional
        # connections to both words individually. We label them with the
        # rule-based source '/s/process/split_words' to track that this
        # happened.
        rightwords = [right]
        if ' ' in right:
            morewords = [word for word in right.split(' ') if word not in STOPWORDS]
            rightwords.extend(morewords)

        for i, rightword in enumerate(rightwords):
            source = {'contributor': '/s/resource/verbosity'}
            if i > 0:
                source['process'] = '/s/process/split_words'

            # Build the natural-language-ish surface text for this clue
            text = '[[%s]] %s [[%s]]' % (left, reltext, rightword)

            count += 1
            outcomes['success'] += 1

            leftc = standardized_concept_uri('en', left)
            rightc = standardized_concept_uri('en', rightword)
            edge = make_edge(
                rel,
                leftc,
                rightc,
                dataset='/d/verbosity',
                license=Licenses.cc_attribution,
                sources=[source],
                surfaceText=text,
                weight=weight,
            )
            writer.write(edge)

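# A small worked example of the scoring formula above, with invented numbers:
# a clue given 8 times, sounds-like score 0.1, orderscore 200.
freq, sls, orderscore = 8, 0.1, 200
score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000)  # 15 * 0.9 * 0.8 = 10.8
weight = score ** .5 / 10                                     # sqrt(10.8) / 10 ≈ 0.33
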
def run_wordnet(input_dir, output_file, sw_map_file):
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    synset_senses = defaultdict(list)
    sense_synsets = {}
    labels = {}
    glossary = {}
    concept_map = {}
    sense_to_synset = {}

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-synset.ttl')):
        if resource_name(rel) == 'label':
            # Everything in WordNet is in English
            assert objtag == 'en'
            labels[subj] = obj

    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'wordnet-glossary.ttl')):
        if resource_name(rel) == 'gloss':
            assert objtag == 'en'

            # Take the definition up to the first semicolon
            text = obj.split(';')[0]

            # Remove introductory phrases with a colon
            text = text.split(': ', 1)[-1]

            # Remove parenthesized expressions
            while True:
                newtext = re.sub(r'\(.+?\) ?', '', text).strip()
                if newtext == text or newtext == '':
                    break
                else:
                    text = newtext

            glossary[subj] = text.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    for subj, rel, obj, objtag in reader.parse_file(os.path.join(input_dir, 'full/wordnet-wordsense-synset-relations.ttl')):
        if resource_name(rel) == 'containsWordSense':
            synset_senses[subj].append(obj)
            sense_synsets[obj] = subj

    # Assign every synset to a disambiguated concept.
    for synset in synset_senses:
        synset_name = labels[synset]
        synset_pos = synset.split('-')[-2]
        pos = PARTS_OF_SPEECH[synset_pos]
        disambig = glossary[synset]

        concept = standardized_concept_uri('en', synset_name, pos, disambig)
        concept_map[synset] = concept

    # Map senses to their synsets.
    for sense, synset in sense_synsets.items():
        sense_to_synset[sense] = synset

    for filename in (
        'wordnet-attribute.ttl', 'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl', 'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl', 'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl', 'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl', 'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl', 'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl', 'full/wordnet-seealso.ttl'
    ):
        filepath = os.path.join(input_dir, filename)
        if os.path.exists(filepath):
            for web_subj, web_rel, web_obj, objtag in reader.parse_file(filepath):
                # If this relation involves word senses, map them to their
                # synsets first.
                if web_subj in sense_to_synset:
                    web_subj = sense_to_synset[web_subj]
                if web_obj in sense_to_synset:
                    web_obj = sense_to_synset[web_obj]
                subj = concept_map[web_subj]
                obj = concept_map[web_obj]
                pred_label = resource_name(web_rel)
                if pred_label in REL_MAPPING:
                    mapped_rel = REL_MAPPING[pred_label]

                    # Handle WordNet relations that are the reverse of
                    # ConceptNet relations. Change the word 'meronym' to
                    # 'holonym' if necessary.
                    if mapped_rel.startswith('~'):
                        subj, obj = obj, subj
                        web_subj, web_obj = web_obj, web_subj
                        web_rel = web_rel.replace('meronym', 'holonym')
                        mapped_rel = mapped_rel[1:]
                    rel = join_uri('r', mapped_rel)
                else:
                    rel = join_uri('r', 'wordnet', pred_label)

                map_out.write_link(web_rel, full_conceptnet_url(rel))
                map_out.write_link(web_subj, full_conceptnet_url(subj))
                map_out.write_link(web_obj, full_conceptnet_url(obj))
                edge = make_edge(
                    rel, subj, obj,
                    dataset='/d/wordnet/3.0',
                    license='/l/CC/By',
                    sources=SOURCE,
                    weight=2.0
                )
                out.write(edge)
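# Illustrative sketch (not part of the original reader): run_wordnet() uses a
# convention where a mapped relation beginning with '~' means "this WordNet
# relation points the opposite way from the ConceptNet relation", so subject
# and object are swapped before the edge is written. This helper restates
# that step in isolation; the '~PartOf' mapping in the doctest is a
# hypothetical example, not copied from the real REL_MAPPING table.
def apply_reversed_mapping(mapped_rel, subj, obj):
    """
    >>> apply_reversed_mapping('~PartOf', '/c/en/wheel', '/c/en/car')
    ('/r/PartOf', '/c/en/car', '/c/en/wheel')
    """
    if mapped_rel.startswith('~'):
        subj, obj = obj, subj
        mapped_rel = mapped_rel[1:]
    return '/r/' + mapped_rel, subj, obj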
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read N-Triples files containing Umbel data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
            # cannot be described except as a CW, we're probably not
            # interested in it.
            if 'CW' not in web_obj.split() and 'PCW' not in web_obj.split():
                labels[web_subj] = web_obj
        if resource_name(web_rel).endswith('Label'):
            text = standardize_text(web_obj)
            label_sets[text].add(web_subj)

    # Read through umbel.nt again and extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if web_subj in labels and web_obj in labels:
                subj_uri = standardized_concept_uri('en', labels[web_subj])
                obj_uri = standardized_concept_uri('en', labels[web_obj])
                rel_name = resource_name(web_rel)
                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to Semantic
                    # Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (labels[web_subj], labels[web_obj])
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        elif web_rel.endswith('altLabel'):
            # Make sure we know what's being labeled.
            if web_subj in labels:
                name = web_obj
                words = name.split(' ')
                if standardized_concept_name('en', name) != standardized_concept_name('en', labels[web_subj]):
                    if not set(words) & IGNORED_WORDS:
                        main_label = standardized_concept_uri('en', labels[web_subj])
                        name_text = standardize_text(name)
                        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
                            disambig = un_camel_case(resource_name(web_subj))

                            # Cyc does not distinguish texts by their part of
                            # speech, so use '_' as the part of speech symbol.
                            alt_label = standardized_concept_uri('en', name, '_', disambig)
                        else:
                            alt_label = standardized_concept_uri('en', name)
                        surface = SYN_FRAME % (name, labels[web_subj])
                        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            if web_obj in labels:
                subj_label = resource_name(web_subj).replace('_', ' ')
                subj_uri = translate_dbpedia_url(web_subj)
                obj_label = labels[web_obj]
                obj_uri = standardized_concept_uri('en', obj_label)
                rel_name = resource_name(web_rel)
                if rel_name in REL_MAPPING:
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (subj_label, obj_label)
                    out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
                    map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
                    map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
                    map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
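# Illustrative sketch (not part of the original reader): in run_umbel(), the
# values of REL_MAPPING are (relation URI, surface frame) pairs, and the
# frame is filled in with the two labels to build the surface text. The entry
# below is hypothetical and exists only to show the shape of the data; see
# the real REL_MAPPING defined elsewhere in this module for the actual
# contents.
EXAMPLE_REL_MAPPING = {
    'subClassOf': ('/r/IsA', '[[%s]] is a type of [[%s]]'),
}

example_rel_uri, example_frame = EXAMPLE_REL_MAPPING['subClassOf']
example_surface = example_frame % ('violin', 'musical instrument')
# example_surface == '[[violin]] is a type of [[musical instrument]]'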
def build_from_dir(dirname, output_file):
    """
    Read a GlobalMind database exported in YAML files, translate it into
    ConceptNet 5 edges, and write those edges to disk using a
    MsgpackStreamWriter.
    """
    out = MsgpackStreamWriter(output_file)
    userdata = yaml.load_all(open(dirname + '/GMUser.yaml'))
    users = {}

    for userinfo in userdata:
        users[userinfo['pk']] = userinfo

    frame_data = yaml.load_all(open(dirname + '/GMFrame.yaml'))
    frames = {}
    for frame in frame_data:
        frames[frame['pk']] = frame['fields']

    assertiondata = yaml.load_all(open(dirname + '/GMAssertion.yaml'))
    assertions = {}
    for assertion in assertiondata:
        obj = assertion['fields']
        frame = frames[obj['frame']]
        frametext = frame['text']
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        # As far as I can tell, GlobalMind used the same namespace of
        # usernames as the original Open Mind.
        user_source = "/s/contributor/omcs/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/assert"
        ]

        lang = LANG_CODES[obj['lcode']]
        start = standardized_concept_uri(lang, obj['node1'])
        end = standardized_concept_uri(lang, obj['node2'])
        rel = '/r/' + RELATION_MAP.get(frame['relation'], frame['relation'])

        # Fix the messy English frame text containing "around": turn it into
        # "in" or "near" depending on how the second node is phrased.
        if ' around ' in frametext:
            if obj['node2'].startswith('in '):
                frametext = frametext.replace(' around ', ' in ')
                obj['node2'] = obj['node2'][3:]
            else:
                frametext = frametext.replace(' around ', ' near ')
                rel = '/r/LocatedNear'

        # Fix more awkward English. I wonder how bad the other languages are.
        frametext = frametext.replace('hits your head', 'comes to mind')
        frametext = frametext.replace(': [node1], [node2]', ' [node1] and [node2]')

        node1 = u'[[' + obj['node1'] + u']]'
        node2 = u'[[' + obj['node2'] + u']]'
        surfaceText = frametext.replace('//', '').replace('[node1]', node1).replace('[node2]', node2)
        edge = make_edge(
            rel, start, end,
            dataset='/d/globalmind',
            license='/l/CC/By',
            sources=sources,
            surfaceText=surfaceText,
            weight=1
        )

        # Avoid duplication with the ConceptNet reader, but still save every
        # edge so that we can handle translations.
        if username != 'openmind':
            out.write(edge)

        assertions[assertion['pk']] = edge

    translationdata = yaml.load_all(open(dirname + '/GMTranslation.yaml'))
    for translation in translationdata:
        obj = translation['fields']
        assertion1 = assertions[obj['assertion1']]
        assertion2 = assertions[obj['assertion2']]
        start = assertion1['uri']
        end = assertion2['uri']
        rel = '/r/TranslationOf'
        text1 = assertion1['surfaceText'].replace('[[', '').replace(']]', '')
        text2 = assertion2['surfaceText'].replace('[[', '').replace(']]', '')
        lang1 = LANG_NAMES[get_lang(assertion1)]
        lang2 = LANG_NAMES[get_lang(assertion2)]
        surfaceText = u"[[%s]] in %s means [[%s]] in %s." % (text1, lang1, text2, lang2)
        userinfo = users[obj['author']]
        username = userinfo['fields']['username']

        userlocale = userinfo['fields']['ccode'].lower()
        if userlocale:
            user_source = "/s/contributor/globalmind/%s/%s" % (userlocale, username)
        else:
            user_source = "/s/contributor/globalmind/%s" % username

        sources = [
            user_source,
            "/s/activity/globalmind/translate"
        ]
        edge = make_edge(
            rel, start, end,
            dataset='/d/globalmind',
            license=Licenses.cc_attribution,
            sources=sources,
            surfaceText=surfaceText,
            weight=1
        )
        out.write(edge)
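# Illustrative sketch (not part of the original reader): build_from_dir()
# calls get_lang() on an assertion edge to recover its language for the
# "[[X]] in <language> means [[Y]] in <language>" surface text. Edge start
# URIs have the form '/c/<lang>/<text>...', so the helper presumably pulls
# out the second path component; this restatement is an assumption, not the
# actual implementation.
def get_lang_sketch(assertion):
    return assertion['start'].split('/')[2]


# get_lang_sketch({'start': '/c/ja/犬'}) == 'ja'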