def handle_triple(line, reader, out, map_out):
    """
    Turn one line of DBPedia triples into a ConceptNet edge, if it is one
    we want to keep.

    The line is parsed with `reader.parse_line`. When the triple passes all
    of the filters below, three Semantic Web -> ConceptNet links are written
    to `map_out` and one edge is written to `out`. Nothing is returned.
    """
    subj, pred, obj, tag = reader.parse_line(line)

    # Only triples whose object is tagged as a URL are considered.
    if tag != 'URL':
        return

    # Ignore types of edges that we don't care about. Each pair is
    # (substring, text that must not contain it), checked in this order:
    #   - Homepage links
    #   - GIS features
    #   - Assertions that something "is a thing"
    #   - Anonymous nodes identified with double-underscores, such as the
    #     node "Alfred_Nobel__1", which means "Alfred Nobel's occupation,
    #     whatever it is"
    #   - Nodes that are articles named "List of X" on Wikipedia
    unwanted_markers = (
        ('foaf/0.1/homepage', pred),
        ('_Feature', obj),
        ('#Thing', obj),
        ('__', subj),
        ('__', obj),
        ('List_of', subj),
        ('List_of', obj),
    )
    if any(marker in text for marker, text in unwanted_markers):
        return

    # We don't try to parse URIs from outside of dbpedia.org's namespace.
    if 'dbpedia.org' not in obj:
        return

    subj_concept = translate_dbpedia_url(subj, 'en')
    obj_concept = translate_dbpedia_url(obj, 'en')

    # DBPedia categorizes a lot of things as 'works', which causes
    # unnecessary ambiguity. Disregard these edges; there will almost always
    # be a more specific edge calling it a 'creative work' anyway.
    if obj_concept == '/c/en/work':
        return

    rel = map_dbpedia_relation(pred)
    if rel is None:
        return

    # The triple has been converted to ConceptNet URIs. Record each
    # correspondence in the 'sw_map' output so others can follow the mapping.
    for sw_url, conceptnet_uri in (
        (pred, rel),
        (subj, subj_concept),
        (obj, obj_concept),
    ):
        map_out.write_link(full_conceptnet_url(conceptnet_uri), sw_url)

    out.write(
        make_edge(
            rel,
            subj_concept,
            obj_concept,
            dataset='/d/dbpedia/en',
            license=Licenses.cc_sharealike,
            sources=['/s/dbpedia/3.7'],
            weight=0.5,
        )
    )
def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Read the WordNet RDF files under `input_dir`, writing ConceptNet edges
    to `output_file` and Semantic Web mappings to `sw_map_file`.

    Relation files that do not exist under `input_dir` are skipped.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    def triples(relative_path):
        # Parse one input file, addressed relative to input_dir.
        return reader.parse_file(os.path.join(input_dir, relative_path))

    # Collect the label of each synset from lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    labels = {}
    for synset_url, rel_url, label_text, lang in triples('wordnet-synset.ttl'):
        if resource_name(rel_url) != 'label':
            continue
        # Everything in WordNet is in English
        assert lang == 'en'
        labels[synset_url] = label_text

    # Collect a short gloss for each synset, to be used for disambiguation.
    glossary = {}
    for synset_url, rel_url, gloss, lang in triples('wordnet-glossary.ttl'):
        if resource_name(rel_url) != 'gloss':
            continue
        assert lang == 'en'

        # Take the definition up to the first semicolon
        definition = gloss.split(';')[0]
        # Remove introductory phrases with a colon
        definition = definition.split(': ', 1)[-1]
        # Remove parenthesized expressions, repeating until nothing changes.
        # If removing them would leave nothing, keep the text as it is.
        while True:
            reduced = re.sub(r'\(.+?\) ?', '', definition).strip()
            if reduced in (definition, ''):
                break
            definition = reduced

        glossary[synset_url] = definition.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    synset_senses = defaultdict(list)
    sense_synsets = {}
    for synset_url, rel_url, sense_url, _tag in triples(
        'full/wordnet-wordsense-synset-relations.ttl'
    ):
        if resource_name(rel_url) == 'containsWordSense':
            synset_senses[synset_url].append(sense_url)
            sense_synsets[sense_url] = synset_url

    # Assign every synset to a disambiguated concept. The part of speech is
    # the second-to-last hyphen-separated piece of the synset identifier.
    concept_map = {}
    for synset in synset_senses:
        name = labels[synset]
        pos = PARTS_OF_SPEECH[synset.split('-')[-2]]
        gloss = glossary[synset]
        concept_map[synset] = standardized_concept_uri('en', name, pos, gloss)

    # Map senses to their synsets.
    sense_to_synset = dict(sense_synsets)

    relation_files = (
        'wordnet-attribute.ttl',
        'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl',
        'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl',
        'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl',
        'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl',
        'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl',
        'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl',
    )
    for filename in relation_files:
        filepath = os.path.join(input_dir, filename)
        if not os.path.exists(filepath):
            continue

        for web_subj, web_rel, web_obj, _tag in reader.parse_file(filepath):
            # If this relation involves word senses, map them to their
            # synsets first.
            web_subj = sense_to_synset.get(web_subj, web_subj)
            web_obj = sense_to_synset.get(web_obj, web_obj)

            subj = concept_map[web_subj]
            obj = concept_map[web_obj]
            pred_label = resource_name(web_rel)

            if pred_label not in REL_MAPPING:
                # No ConceptNet equivalent: keep it as a WordNet-specific
                # relation.
                rel = join_uri('r', 'wordnet', pred_label)
            else:
                mapped_rel = REL_MAPPING[pred_label]
                # Handle WordNet relations that are the reverse of
                # ConceptNet relations. Change the word 'meronym' to
                # 'holonym' if necessary.
                if mapped_rel.startswith('~'):
                    subj, obj = obj, subj
                    web_subj, web_obj = web_obj, web_subj
                    web_rel = web_rel.replace('meronym', 'holonym')
                    mapped_rel = mapped_rel[1:]
                rel = join_uri('r', mapped_rel)

            map_out.write_link(web_rel, full_conceptnet_url(rel))
            map_out.write_link(web_subj, full_conceptnet_url(subj))
            map_out.write_link(web_obj, full_conceptnet_url(obj))

            out.write(
                make_edge(
                    rel,
                    subj,
                    obj,
                    dataset='/d/wordnet/3.0',
                    license='/l/CC/By',
                    sources=SOURCE,
                    weight=2.0,
                )
            )
def handle_triple(line, reader, out, map_out):
    """
    Convert a single line of DBPedia triples into a ConceptNet edge with a
    surface text, unless the triple is one we choose to ignore.

    The line is parsed with `reader.parse_line`. For an accepted triple,
    three Semantic Web -> ConceptNet links go to `map_out` and the edge goes
    to `out`. Nothing is returned.
    """

    def readable_name(url):
        # Readable text for a node, taken from the first element of the
        # parsed topic name of the URL's resource name.
        return un_camel_case(parse_topic_name(resource_name(url))[0])

    subj, pred, obj, tag = reader.parse_line(line)
    if tag != "URL":
        return

    # Ignore types of edges that we don't care about:
    # - Homepage links
    if "foaf/0.1/homepage" in pred:
        return
    # - GIS features, and assertions that something "is a thing"
    if "_Feature" in obj or "#Thing" in obj:
        return
    # - Anonymous nodes identified with double-underscores, such as the node
    #   "Alfred_Nobel__1", which means "Alfred Nobel's occupation, whatever
    #   it is"
    if "__" in subj or "__" in obj:
        return
    # - Nodes that are articles named "List of X" on Wikipedia
    if "List_of" in subj or "List_of" in obj:
        return
    # - Objects containing "Wikidata:"
    if "Wikidata:" in obj:
        return

    # We don't try to parse URIs from outside of dbpedia.org's namespace.
    if "dbpedia.org" not in obj:
        return

    subj_concept = translate_dbpedia_url(subj)
    obj_concept = translate_dbpedia_url(obj)
    subj_text = readable_name(subj)
    obj_text = readable_name(obj)
    if subj_concept is None or obj_concept is None:
        return

    # DBPedia categorizes a lot of things as 'works', which causes
    # unnecessary ambiguity. Disregard these edges; there will almost always
    # be a more specific edge calling it a 'creative work' anyway.
    if obj_concept in CONCEPT_BLACKLIST:
        return

    rel = map_dbpedia_relation(pred)
    if rel is None:
        return

    # For these two relations the object's text is shown in lowercase.
    if rel in {"/r/IsA", "/r/TranslationOf"}:
        obj_text = obj_text.lower()

    # The triple has been converted to ConceptNet URIs. Record each
    # correspondence in the 'sw_map' output so others can follow the mapping.
    for sw_url, conceptnet_uri in (
        (pred, rel),
        (subj, subj_concept),
        (obj, obj_concept),
    ):
        map_out.write_link(full_conceptnet_url(conceptnet_uri), sw_url)

    out.write(
        make_edge(
            rel,
            subj_concept,
            obj_concept,
            dataset="/d/dbpedia/en",
            license=Licenses.cc_sharealike,
            sources=["/s/dbpedia/2014"],
            surfaceText=make_surface_text(rel, subj_text, obj_text),
            weight=0.5,
        )
    )
def run_wordnet(input_dir, output_file, sw_map_file):
    """
    Convert the WordNet RDF files in `input_dir` into ConceptNet edges
    (written to `output_file`) plus a file of Semantic Web mappings
    (written to `sw_map_file`).

    Relation files that are missing from `input_dir` are skipped.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    def without_parens(text):
        # One pass of removing parenthesized expressions, then trimming.
        return re.sub(r'\(.+?\) ?', '', text).strip()

    # Parse lines such as:
    #   wn30:synset-Aeolian-noun-2 rdfs:label "Aeolian"@en-us .
    labels = {}
    label_path = os.path.join(input_dir, 'wordnet-synset.ttl')
    for node, predicate, value, value_tag in reader.parse_file(label_path):
        if resource_name(predicate) == 'label':
            # Everything in WordNet is in English
            assert value_tag == 'en'
            labels[node] = value

    glossary = {}
    gloss_path = os.path.join(input_dir, 'wordnet-glossary.ttl')
    for node, predicate, value, value_tag in reader.parse_file(gloss_path):
        if resource_name(predicate) == 'gloss':
            assert value_tag == 'en'

            # Take the definition up to the first semicolon
            short = value.split(';')[0]
            # Remove introductory phrases with a colon
            short = short.split(': ', 1)[-1]

            # Remove parenthesized expressions. Keep applying the removal
            # while it still changes the text, but never accept an empty
            # result.
            candidate = without_parens(short)
            while candidate != short and candidate != '':
                short = candidate
                candidate = without_parens(short)

            glossary[node] = short.replace('/', '_')

    # Get the list of word senses in each synset, and make a bidirectional
    # mapping.
    #
    # Example line:
    #   wn30:synset-Aeolian-noun-2 wn20schema:containsWordSense wn30:wordsense-Aeolian-noun-2 .
    synset_senses = defaultdict(list)
    sense_synsets = {}
    sense_path = os.path.join(
        input_dir, 'full/wordnet-wordsense-synset-relations.ttl'
    )
    for node, predicate, value, value_tag in reader.parse_file(sense_path):
        if resource_name(predicate) == 'containsWordSense':
            synset_senses[node].append(value)
            sense_synsets[value] = node

    # Assign every synset to a disambiguated concept.
    concept_map = {}
    for synset in synset_senses:
        synset_name = labels[synset]
        # The part of speech is the second-to-last hyphen-separated piece.
        pos_key = synset.split('-')[-2]
        concept_map[synset] = standardized_concept_uri(
            'en', synset_name, PARTS_OF_SPEECH[pos_key], glossary[synset]
        )

    # Map senses to their synsets.
    sense_to_synset = {}
    sense_to_synset.update(sense_synsets)

    for filename in (
        'wordnet-attribute.ttl',
        'wordnet-causes.ttl',
        'wordnet-classifiedby.ttl',
        'wordnet-entailment.ttl',
        'wordnet-hyponym.ttl',
        'wordnet-instances.ttl',
        'wordnet-membermeronym.ttl',
        'wordnet-partmeronym.ttl',
        'wordnet-sameverbgroupas.ttl',
        'wordnet-similarity.ttl',
        'wordnet-substancemeronym.ttl',
        'full/wordnet-antonym.ttl',
        'full/wordnet-derivationallyrelated.ttl',
        'full/wordnet-participleof.ttl',
        'full/wordnet-pertainsto.ttl',
        'full/wordnet-seealso.ttl',
    ):
        filepath = os.path.join(input_dir, filename)
        if not os.path.exists(filepath):
            continue

        for web_subj, web_rel, web_obj, value_tag in reader.parse_file(filepath):
            # If this relation involves word senses, map them to their
            # synsets first.
            web_subj = sense_to_synset.get(web_subj, web_subj)
            web_obj = sense_to_synset.get(web_obj, web_obj)

            subj = concept_map[web_subj]
            obj = concept_map[web_obj]
            pred_label = resource_name(web_rel)

            if pred_label in REL_MAPPING:
                mapped_rel = REL_MAPPING[pred_label]
                is_reversed = mapped_rel.startswith('~')
                if is_reversed:
                    # This WordNet relation is the reverse of a ConceptNet
                    # relation: swap both ends, rename 'meronym' to
                    # 'holonym' in the WordNet URL, and drop the '~' marker.
                    subj, obj = obj, subj
                    web_subj, web_obj = web_obj, web_subj
                    web_rel = web_rel.replace('meronym', 'holonym')
                    mapped_rel = mapped_rel[1:]
                rel = join_uri('r', mapped_rel)
            else:
                rel = join_uri('r', 'wordnet', pred_label)

            for sw_url, conceptnet_uri in (
                (web_rel, rel),
                (web_subj, subj),
                (web_obj, obj),
            ):
                map_out.write_link(sw_url, full_conceptnet_url(conceptnet_uri))

            edge = make_edge(
                rel,
                subj,
                obj,
                dataset='/d/wordnet/3.0',
                license='/l/CC/By',
                sources=SOURCE,
                weight=2.0,
            )
            out.write(edge)
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Read the Umbel N-Triples files in `input_dir`. ConceptNet edges are
    written to `output_file`, and mappings between the Semantic Web and
    ConceptNet are written to `sw_map_file`.
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()

    # Preferred label of each node, and the set of nodes that share each
    # standardized label text.
    labels = {}
    label_sets = defaultdict(set)

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # First pass over umbel.nt: find all the "preferred labels". These become
    # the surface texts for the nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if resource_name(web_rel) == 'prefLabel':
            # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a
            # node cannot be described except as a CW, we're probably not
            # interested in it.
            tokens = web_obj.split()
            if 'CW' not in tokens and 'PCW' not in tokens:
                labels[web_subj] = web_obj
        # Every kind of label counts toward the ambiguity of its text.
        if resource_name(web_rel).endswith('Label'):
            label_sets[standardize_text(web_obj)].add(web_subj)

    # Second pass over umbel.nt: extract ConceptNet edges.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(main_file):
        if objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if not (web_subj in labels and web_obj in labels):
                continue

            subj_uri = standardized_concept_uri('en', labels[web_subj])
            obj_uri = standardized_concept_uri('en', labels[web_obj])
            rel_name = resource_name(web_rel)

            # Skip relations we don't handle.
            if rel_name not in REL_MAPPING:
                continue

            # Write the ConceptNet edge and the mappings to Semantic Web URLs.
            rel_uri, frame = REL_MAPPING[rel_name]
            surface = frame % (labels[web_subj], labels[web_obj])
            out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
            map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
            map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
            map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
            continue

        # From here on, the triple was not accepted as a URL edge above.
        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations.
        if not web_rel.endswith('altLabel'):
            continue

        # Make sure we know what's being labeled.
        if web_subj not in labels:
            continue

        name = web_obj
        words = name.split(' ')

        # An alternate label that standardizes to the same name as the
        # preferred label adds nothing.
        if standardized_concept_name('en', name) == standardized_concept_name(
            'en', labels[web_subj]
        ):
            continue
        if set(words) & IGNORED_WORDS:
            continue

        main_label = standardized_concept_uri('en', labels[web_subj])
        name_text = standardize_text(name)
        if len(label_sets[name_text]) >= 2 or len(name_text) <= 3:
            # The text is shared by several nodes, or is very short, so
            # disambiguate it with the node's own name.
            disambig = un_camel_case(resource_name(web_subj))
            # Cyc does not distinguish texts by their part of speech, so use
            # '_' as the part of speech symbol.
            alt_label = standardized_concept_uri('en', name, '_', disambig)
        else:
            alt_label = standardized_concept_uri('en', name)

        surface = SYN_FRAME % (name, labels[web_subj])
        out.write(umbel_edge('/r/Synonym', alt_label, main_label, surface, SOURCE))

    # Finally, read the links between DBPedia nodes and Umbel nodes.
    for web_subj, web_rel, web_obj, objtag in reader.parse_file(dbpedia_link_file):
        if not (objtag == 'URL' and acceptable_node(web_obj) and acceptable_node(web_subj)):
            continue
        if web_obj not in labels:
            continue

        subj_label = resource_name(web_subj).replace('_', ' ')
        subj_uri = translate_dbpedia_url(web_subj)
        obj_label = labels[web_obj]
        obj_uri = standardized_concept_uri('en', obj_label)
        rel_name = resource_name(web_rel)
        if rel_name not in REL_MAPPING:
            continue

        rel_uri, frame = REL_MAPPING[rel_name]
        surface = frame % (subj_label, obj_label)
        out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, LINK_SOURCE))
        map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
        map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
        map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
def run_umbel(input_dir, output_file, sw_map_file):
    """
    Process the N-Triples files of Umbel data found in `input_dir`,
    producing a file of ConceptNet edges (`output_file`) and a file of
    mappings between the Semantic Web and ConceptNet (`sw_map_file`).
    """
    out = MsgpackStreamWriter(output_file)
    map_out = NTriplesWriter(sw_map_file)
    reader = NTriplesReader()
    labels = {}
    label_sets = defaultdict(set)

    def write_mappings(*pairs):
        # Record each (Semantic Web URL, ConceptNet URI) pair, in order.
        for sw_url, conceptnet_uri in pairs:
            map_out.write_link(sw_url, full_conceptnet_url(conceptnet_uri))

    # There are two files we want to parse:
    # - umbel.nt, a transformation of umbel.n3, which is available from
    #   https://github.com/structureddynamics/UMBEL/.
    # - umbel_links.nt, distributed with DBPedia 3.9.
    #
    # We parse them both in this file so that umbel_links can reuse the
    # concept names extracted from umbel.nt.
    main_file = os.path.join(input_dir, 'umbel.nt')
    dbpedia_link_file = os.path.join(input_dir, 'umbel_links.nt')

    # Read through umbel.nt once, finding all the "preferred labels". We
    # will use these as the surface texts for the nodes.
    for node, predicate, value, value_tag in reader.parse_file(main_file):
        # 'CW' and 'PCW' are Cyc jargon for 'conceptual works'. If a node
        # cannot be described except as a CW, we're probably not interested
        # in it.
        if resource_name(predicate) == 'prefLabel' and not {
            'CW',
            'PCW',
        }.intersection(value.split()):
            labels[node] = value
        if resource_name(predicate).endswith('Label'):
            text = standardize_text(value)
            label_sets[text].add(node)

    # Read through umbel.nt again and extract ConceptNet edges.
    for node, predicate, value, value_tag in reader.parse_file(main_file):
        if value_tag == 'URL' and acceptable_node(value) and acceptable_node(node):
            # Only use nodes for which we've seen preferred labels.
            # (This skips some anonymous OWL-cruft nodes.)
            if node in labels and value in labels:
                start_text = labels[node]
                end_text = labels[value]
                start_uri = standardized_concept_uri('en', start_text)
                end_uri = standardized_concept_uri('en', end_text)
                rel_name = resource_name(predicate)

                # Check if this is a relation we want to handle.
                if rel_name in REL_MAPPING:
                    # Write the ConceptNet edges and the mappings to
                    # Semantic Web URLs.
                    rel_uri, frame = REL_MAPPING[rel_name]
                    surface = frame % (start_text, end_text)
                    out.write(
                        umbel_edge(rel_uri, start_uri, end_uri, surface, SOURCE)
                    )
                    write_mappings(
                        (predicate, rel_uri),
                        (node, start_uri),
                        (value, end_uri),
                    )

        # altLabel relations assign different texts to the same node. We'll
        # represent those in ConceptNet with Synonym relations. This branch
        # is only considered when the URL test above did not pass, and only
        # when we know what's being labeled.
        elif predicate.endswith('altLabel') and node in labels:
            name = value
            words = name.split(' ')
            differs_from_main = standardized_concept_name(
                'en', name
            ) != standardized_concept_name('en', labels[node])
            if differs_from_main and not set(words) & IGNORED_WORDS:
                main_label = standardized_concept_uri('en', labels[node])
                name_text = standardize_text(name)
                needs_disambiguation = (
                    len(label_sets[name_text]) >= 2 or len(name_text) <= 3
                )
                if needs_disambiguation:
                    disambig = un_camel_case(resource_name(node))
                    # Cyc does not distinguish texts by their part of
                    # speech, so use '_' as the part of speech symbol.
                    alt_label = standardized_concept_uri(
                        'en', name, '_', disambig
                    )
                else:
                    alt_label = standardized_concept_uri('en', name)

                surface = SYN_FRAME % (name, labels[node])
                out.write(
                    umbel_edge(
                        '/r/Synonym', alt_label, main_label, surface, SOURCE
                    )
                )

    # Read the file linking DBPedia nodes to Umbel nodes.
    for node, predicate, value, value_tag in reader.parse_file(dbpedia_link_file):
        if (
            value_tag == 'URL'
            and acceptable_node(value)
            and acceptable_node(node)
            and value in labels
        ):
            start_text = resource_name(node).replace('_', ' ')
            start_uri = translate_dbpedia_url(node)
            end_text = labels[value]
            end_uri = standardized_concept_uri('en', end_text)
            rel_name = resource_name(predicate)
            if rel_name in REL_MAPPING:
                rel_uri, frame = REL_MAPPING[rel_name]
                surface = frame % (start_text, end_text)
                out.write(
                    umbel_edge(rel_uri, start_uri, end_uri, surface, LINK_SOURCE)
                )
                write_mappings(
                    (predicate, rel_uri),
                    (node, start_uri),
                    (value, end_uri),
                )