# Stdlib imports used below. The remaining helpers and constants
# (MsgpackStreamWriter, parse_nquads, resource_name, make_edge,
# standardized_concept_uri, simple_tokenize, translate_dbpedia_url,
# split_uri, uri_prefix, un_camel_case, url_to_label, read_concept_file,
# cyc_to_conceptnet_uri, opencyc_edge, external_url_edge, label_sort_key,
# Licenses, RDF_LABEL, WN20_URL, REL_MAPPING, RELATIONS, DATASET, SOURCE,
# SHAREALIKE_LANGUAGES, and the blacklists) are assumed to be imported
# from the surrounding ConceptNet package.
import bz2
import itertools
import pathlib
from collections import defaultdict
from operator import itemgetter


def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()  # avoid writing duplicate ExternalURL edges

    # Read through the file once, finding all the "preferred labels". We
    # will use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if (
            rel_name == 'subClassOf' and web_obj is not None
            and web_subj in labels and web_obj in labels
        ):
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif (
            rel_name == 'sameAs' and web_subj in labels
            # web_obj can be None when the object is a literal, so guard
            # before calling .startswith() on it
            and web_obj is not None
            and web_obj.startswith('http://umbel.org/')
        ):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
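# Usage sketch (hypothetical paths; the real build pipeline supplies these
# arguments). Note that run_opencyc reads the input twice -- once to collect
# preferred labels and once to emit edges -- so `input_file` must be a real
# file path rather than a one-shot stream:
#
#     run_opencyc('data/raw/opencyc/opencyc.nq', 'data/edges/opencyc.msgpack')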
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if any(
                word in sense
                for word in ('album', 'film', 'series', 'disambiguation', 'song', 'band')
            ):
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]
            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)
            mapping[subj_url] = targets
    return mapping
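# Note on the groupby above: itertools.groupby only merges *consecutive*
# items sharing a key, so this function assumes the interlanguage-links
# file lists all quads for a given subject contiguously. A toy
# illustration of that behavior:
#
#     >>> from itertools import groupby
#     >>> from operator import itemgetter
#     >>> rows = [('a', 1), ('a', 2), ('b', 3)]
#     >>> [(k, [v for _, v in g]) for k, g in groupby(rows, itemgetter(0))]
#     [('a', [1, 2]), ('b', [3])]
#
# If the same subject reappeared later in the file, it would start a new
# group and overwrite the earlier entry in `mapping`.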
def run_wordnet(input_file, output_file):
    out = MsgpackStreamWriter(output_file)

    synset_senses = defaultdict(list)
    sense_synsets = {}
    synset_labels = defaultdict(list)
    synset_canonical_labels = {}
    synset_categories = {}
    synset_domains = {}
    synset_glosses = {}
    synset_disambig = {}
    synset_uris = {}

    # First pass: find data about synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        objtext = obj_dict.get('text')

        relname = resource_name(rel)
        if relname == 'label':
            if obj_dict['lang'] == 'en':
                synset_labels[subj].append(objtext)
        elif relname == 'sameAs':
            if obj.startswith(WN20_URL):
                # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?)
                # will contain a standardized label for this concept, which
                # we should use when we want to use this synset as the name
                # of a disambiguation category. RDF WordNet 3.1 assigns
                # synsets a number of labels in no particular order, making
                # it hard to determine from 3.1 alone what to name a
                # category.
                objname = resource_name(obj)
                parts = objname.split('-')[1:-2]

                # Handle missing apostrophes
                label = (
                    '-'.join(parts)
                    .replace('_s_', "'s_")
                    .replace('_s-', "'s_")
                    .replace('s__', "s'_")
                    .replace('s_-', "s'-")
                    .replace('_', ' ')
                )
                synset_canonical_labels[subj] = label
        elif relname == 'domain_category':
            synset_categories[subj] = obj
        elif relname == 'lexical_domain':
            target = resource_name(obj)
            if '.' in target:
                domain = target.split('.')[1]
                synset_domains[subj] = domain
        elif relname == 'gloss':
            synset_glosses[subj] = objtext
        elif relname == 'reference':
            lemma = resource_name(subj)
            synset = obj
            synset_senses[synset].append(lemma)
            sense_synsets[lemma] = synset

    used_labels = set(synset_canonical_labels.values())
    for synset, values in synset_labels.items():
        # Prefer labels that aren't already in use as canonical labels
        values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label))
        if (
            synset not in synset_canonical_labels
            or synset_canonical_labels[synset][0].isupper()
            and synset_domains.get(synset) == 'person'
        ):
            label = values[0]
            synset_canonical_labels[synset] = label
            used_labels.add(label)

    for synset, labels in synset_labels.items():
        if synset in synset_categories:
            category_name = synset_canonical_labels[synset_categories[synset]]
        else:
            category_name = synset_domains.get(synset, None)
        synset_no_fragment = synset.split('#')[0]
        pos = synset_no_fragment[-1].lower()
        assert pos in 'nvarsp', synset
        if pos == 's':
            pos = 'a'
        elif pos == 'p':
            pos = '-'

        if category_name in ('pert', 'all', 'tops'):
            category_name = None

        synset_disambig[synset] = (pos, category_name)

        canon = synset_canonical_labels[synset]
        canon_uri = standardized_concept_uri('en', canon, pos, 'wn', category_name)
        synset_uris[synset] = canon_uri

        for label in labels:
            if label != canon:
                other_uri = standardized_concept_uri('en', label, pos, 'wn', category_name)
                rel_uri = '/r/Synonym'
                surface = '[[{0}]] is a synonym of [[{1}]]'.format(label, canon)
                edge = make_edge(
                    rel_uri, other_uri, canon_uri, dataset=DATASET,
                    surfaceText=surface, license=Licenses.cc_attribution,
                    sources=[SOURCE], weight=2.0
                )
                out.write(edge)

    # Second pass: extract relations between synsets
    quads = parse_nquads(open(input_file, encoding='utf-8'))
    for subj_dict, rel_dict, obj_dict, _graph in quads:
        if 'url' not in subj_dict or 'url' not in rel_dict:
            continue
        subj = subj_dict['url']
        rel = rel_dict['url']
        obj = obj_dict.get('url')
        relname = resource_name(rel)

        if relname in REL_MAPPING:
            pos, sense = synset_disambig.get(subj, (None, None))
            if relname == 'hypernym' and pos == 'v':
                relname = 'hypernym-v'
            rel, frame = REL_MAPPING[relname]
            reversed_frame = False
            if rel.startswith('~'):
                rel = rel[1:]
                reversed_frame = True
            rel_uri = '/r/' + rel

            if obj is not None:
                obj_uri = synset_uris.get(obj)
                if obj not in synset_canonical_labels:
                    continue
                obj_label = synset_canonical_labels[obj]
            else:
                text = obj_dict['text']
                # Some WordNets use strings with "!" in them to indicate
                # out-of-band information, such as a missing translation
                if (not text) or '!' in text:
                    continue
                lang = obj_dict['lang']
                obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense)
                obj_label = text

            if subj not in synset_uris or subj not in synset_canonical_labels:
                continue
            subj_uri = synset_uris[subj]
            subj_label = synset_canonical_labels[subj]
            license = Licenses.cc_attribution
            langcode = subj_uri.split('/')[2]
            if langcode in SHAREALIKE_LANGUAGES:
                license = Licenses.cc_sharealike

            if reversed_frame:
                subj_uri, obj_uri = obj_uri, subj_uri
                subj_label, obj_label = obj_label, subj_label

            surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label)

            edge = make_edge(
                rel_uri, subj_uri, obj_uri, dataset=DATASET,
                surfaceText=surface, license=license, sources=[SOURCE],
                weight=2.0
            )
            out.write(edge)

    for wn_url in sorted(synset_uris):
        cn_uri = synset_uris[wn_url]
        edge = make_edge(
            '/r/ExternalURL', cn_uri, wn_url, dataset=DATASET,
            license=Licenses.cc_sharealike, sources=[SOURCE], weight=1.0
        )
        out.write(edge)

    out.close()
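# Illustration of the WordNet 2.0 label extraction in the 'sameAs' branch
# above. This helper is not used by the reader; it just repeats the same
# string transformation on a standalone resource name:

def _wn20_label_from_name(objname):
    """For a WordNet 2.0 resource name such as 'synset-hot_dog-noun-1',
    return the human-readable label, here 'hot dog'."""
    parts = objname.split('-')[1:-2]
    return (
        '-'.join(parts)
        .replace('_s_', "'s_")
        .replace('_s-', "'s_")
        .replace('s__', "s'_")
        .replace('s_-', "s'-")
        .replace('_', ' ')
    )

# _wn20_label_from_name('synset-hot_dog-noun-1') == 'hot dog'
# _wn20_label_from_name('synset-dog_s_dinner-noun-1') == "dog's dinner"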
def process_dbpedia(input_dir, output_file, concept_file):
    """
    Read through multiple DBPedia files and output filtered assertions to
    `output_file`.
    """
    ok_concepts = read_concept_file(concept_file)

    input_path = pathlib.Path(input_dir)
    interlang_path = input_path / 'interlanguage_links_en.tql.bz2'
    mapped_urls = interlanguage_mapping(interlang_path, ok_concepts)

    out = MsgpackStreamWriter(output_file)

    types_path = input_path / 'instance_types_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(types_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_url = subj['url']
        if (
            'Category:' in subj_url
            or 'File:' in subj_url
            or 'List_of' in subj_url
            or '__' in subj_url
            or 'Template:' in subj_url
        ):
            continue
        if subj_url in mapped_urls:
            subj_concept = translate_dbpedia_url(subj_url)
            obj_type = un_camel_case(resource_name(obj['url']))
            if obj_type not in TYPE_BLACKLIST:
                obj_concept = standardized_concept_uri('en', obj_type, 'n')
                if obj_concept not in CONCEPT_BLACKLIST:
                    edge = make_edge(
                        '/r/IsA', subj_concept, obj_concept,
                        dataset='/d/dbpedia/en',
                        license=Licenses.cc_sharealike,
                        sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                        weight=0.5,
                        surfaceStart=url_to_label(subj['url']),
                        surfaceEnd=url_to_label(obj['url'])
                    )
                    out.write(edge)
                for other_url in mapped_urls[subj_url]:
                    if other_url.startswith('http://wikidata.dbpedia.org/'):
                        urledge = make_edge(
                            '/r/ExternalURL', subj_concept, other_url,
                            dataset='/d/dbpedia/en',
                            license=Licenses.cc_sharealike,
                            sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                            weight=1.0
                        )
                        out.write(urledge)
                    else:
                        other_concept = translate_dbpedia_url(other_url)
                        if other_concept:
                            urledge = make_edge(
                                '/r/ExternalURL', other_concept, other_url,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=1.0
                            )
                            out.write(urledge)
                            edge = make_edge(
                                '/r/Synonym', other_concept, subj_concept,
                                dataset='/d/dbpedia/en',
                                license=Licenses.cc_sharealike,
                                sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                                weight=0.5,
                                surfaceStart=url_to_label(other_url),
                                surfaceEnd=url_to_label(subj_url)
                            )
                            out.write(edge)

    relations_path = input_path / 'mappingbased_objects_en.tql.bz2'
    quads = parse_nquads(bz2.open(str(relations_path), 'rt'))
    for subj, pred, obj, _graph in quads:
        subj_concept = translate_dbpedia_url(subj['url'])
        obj_concept = translate_dbpedia_url(obj['url'])
        rel_name = resource_name(pred['url'])
        if (
            subj_concept and obj_concept
            and subj['url'] in mapped_urls
            and obj['url'] in mapped_urls
        ):
            if rel_name in RELATIONS:
                rel = RELATIONS[rel_name]
                edge = make_edge(
                    rel, subj_concept, obj_concept,
                    dataset='/d/dbpedia/en',
                    license=Licenses.cc_sharealike,
                    sources=[{'contributor': '/s/resource/dbpedia/2015/en'}],
                    weight=0.5,
                    surfaceStart=url_to_label(subj['url']),
                    surfaceEnd=url_to_label(obj['url'])
                )
                out.write(edge)

    out.close()
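# Usage sketch (hypothetical paths, shown for illustration): process_dbpedia
# expects `input_dir` to contain the three bzip2-compressed DBPedia dumps
# referenced above:
#
#     input_dir/
#         interlanguage_links_en.tql.bz2
#         instance_types_en.tql.bz2
#         mappingbased_objects_en.tql.bz2
#
#     process_dbpedia('data/raw/dbpedia', 'data/edges/dbpedia.msgpack',
#                     'data/concepts_to_keep.txt')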