def read_edges(root):
    """
    Read all csv files (assumed to be ConceptNet edge files) under the given
    root (e.g. $CONCEPTNET_DATA/edges or a single edge file) and generate one
    five-tuple for every edge they contain, consisting of the relation, the
    (uri-prefixes of the) left and right endpoints, the dataset, and the data
    sources (the value stored under 'sources' in the edge's JSON data).

    If `root` is not a directory, it is treated as the path of a single edge
    file, whatever its extension. Directories are searched recursively for
    files whose names end in '.csv' (case-insensitively).

    Each line must have exactly five tab-separated fields, the last of which
    is a JSON object with 'dataset' and 'sources' keys.
    """
    if os.path.isdir(root):
        files = [
            os.path.join(root_dir, filename)
            for root_dir, _, filenames in os.walk(root)
            for filename in filenames
            if filename.lower().endswith('.csv')
        ]
    else:
        files = [root]

    for path in files:
        with open(path, 'rt', encoding='utf-8') as fp:
            for line in fp:
                _, rel, left, right, json_data = line.split('\t')
                left = uri_prefix(left)
                right = uri_prefix(right)
                # The file is already decoded as UTF-8 text, and json.loads()
                # rejects an `encoding` argument on Python 3.9 and later, so
                # none is passed here.
                data = json.loads(json_data)
                yield (rel, left, right, data['dataset'], data['sources'])
def make_conceptnet_association_graph(filename, save_edge_list=True,
                                      concept_filter=None,
                                      bad_concept=concept_is_bad,
                                      bad_relation=is_negative_relation):
    """
    Read an association file and build an (undirected) graph from it.

    Each line of the file is split into five tab-separated fields: left,
    right, value, dataset, and relation. A line contributes an edge unless:

    - `bad_concept` accepts either raw endpoint, or `bad_relation` accepts
      the relation;
    - `concept_filter` rejects either endpoint's URI prefix;
    - the value is zero, or both endpoints have the same URI prefix.

    Passing None for `concept_filter` keeps every concept; passing None for
    `bad_concept` or `bad_relation` disables that rejection test.

    Returns the ConceptNetAssociationGraph that was built.
    """
    def keep_everything(_item):
        return True

    def reject_nothing(_item):
        return False

    graph = ConceptNetAssociationGraph(save_edge_list)
    if concept_filter is None:
        concept_filter = keep_everything
    if bad_concept is None:
        bad_concept = reject_nothing
    if bad_relation is None:
        bad_relation = reject_nothing

    with open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, value, dataset, rel = row.rstrip().split('\t', 4)
            if bad_concept(left) or bad_concept(right) or bad_relation(rel):
                continue
            numeric_value = float(value)
            left_prefix = uri_prefix(left)
            right_prefix = uri_prefix(right)
            if not (concept_filter(left_prefix) and concept_filter(right_prefix)):
                continue
            if numeric_value == 0 or left_prefix == right_prefix:
                continue
            # The value is handed to the graph as the original string
            graph.add_edge(left_prefix, right_prefix, value, dataset, rel)
    return graph
def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
    """
    Read a file of tab-separated associations and return the set of concepts
    that occur often enough to be worth keeping.

    Each line must have exactly five tab-separated fields: left, right, value,
    dataset, and relation. Lines whose relation is '/r/SenseOf' are ignored.
    On every other line, an endpoint (reduced with `uri_prefix`) is counted
    once if the opposite endpoint satisfies `is_concept`.

    A concept is kept when its count is at least `en_cutoff`, or when it does
    not satisfy `is_concept` and its count is at least `cutoff`.

    NOTE(review): the name `en_cutoff` suggests a threshold for English
    concepts, but the test applied here is `is_concept` -- confirm intent.
    """
    occurrences = defaultdict(int)
    with open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, _value, _dataset, rel = row.rstrip().split('\t')
            if rel == '/r/SenseOf':
                continue
            left_prefix = uri_prefix(left)
            right_prefix = uri_prefix(right)
            if is_concept(right_prefix):
                occurrences[left_prefix] += 1
            if is_concept(left_prefix):
                occurrences[right_prefix] += 1

    def is_frequent(concept, count):
        if count >= en_cutoff:
            return True
        return not is_concept(concept) and count >= cutoff

    return {
        concept
        for concept, count in occurrences.items()
        if is_frequent(concept, count)
    }
def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
    """
    Take in a file of tab-separated associations and return the set of
    concepts that pass a minimum-frequency filter.

    Every line is split into exactly five tab-separated fields (left, right,
    value, dataset, relation). '/r/SenseOf' lines are skipped. Otherwise each
    endpoint's URI prefix gets one count whenever the endpoint on the other
    side satisfies `is_concept`.

    The result contains each counted concept whose count reaches `en_cutoff`,
    plus each one that fails `is_concept` and whose count reaches `cutoff`.

    NOTE(review): `en_cutoff` reads like an English-only threshold, yet the
    code tests `is_concept` rather than a language -- confirm intent.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            fields = line.rstrip().split('\t')
            left, right, _value, _dataset, rel = fields
            if rel != '/r/SenseOf':
                gleft = uri_prefix(left)
                gright = uri_prefix(right)
                # Each side is credited only when its partner is a concept
                for counted, partner in ((gleft, gright), (gright, gleft)):
                    if is_concept(partner):
                        counts[counted] += 1

    filtered_concepts = set()
    for concept, count in counts.items():
        if count >= en_cutoff or (not is_concept(concept) and count >= cutoff):
            filtered_concepts.add(concept)
    return filtered_concepts
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare terms (e.g. of the form
    'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'). Rows whose labels get the same
    standardized URI are combined into a weighted average, with earlier rows
    given more weight.

    Note that the index of `frame` itself is replaced in the process.

    Args:
        frame (DataFrame): term vectors, indexed by term.
        language (str, default='en'): language passed to `standardized_uri`
            when re-labeling rows.
        forms (bool, default=True): also fold half of each row into the row
            of its lemma (per `lemmatize_uri`), when that row exists.

    Returns:
        DataFrame: one row per distinct label, in descending order of
        combined weight.
    """
    # Rows 10-19 are sampled to detect the 'lang/term' format (exactly one
    # slash). Such labels carry their own language code, so each is split on
    # its slash and the term is standardized in that language.
    if all(label.count('/') == 1 for label in frame.index[10:20]):
        split_labels = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(label_language, text))
            for label_language, _slash, text in split_labels
        ]

    # Standardize every label with the `language` argument. The resulting
    # labels are not necessarily unique.
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Row n (counting from 0) gets weight 1/(n+1): earlier rows count more
    row_weights = 1.0 / np.arange(1, frame.shape[0] + 1)
    weight_by_label = pd.Series(row_weights, index=frame.index)

    # Add up the weighted rows, and the weights, that share a label
    weighted_rows = frame.mul(row_weights, axis='rows')
    relabeled = weighted_rows.sort_index().groupby(level=0).sum()
    combined_weights = weight_by_label.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemma = lemmatize_uri(label)
            if lemma == label or lemma not in relabeled.index:
                continue
            relabeled.loc[lemma] += relabeled.loc[label] / 2
            combined_weights.loc[lemma] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Heaviest labels first, similar to the order of word2vec and GloVe files
    ordered_weights = combined_weights.sort_values(ascending=False)
    return scaled.loc[ordered_weights.index]
def combine_assertions(input_filename, core_filename, output_filename):
    """
    Take in a tab-separated, sorted "CSV" file, indicated by
    `input_filename`, whose lines should be grouped together into assertions.
    Output a msgpack stream of assertions to the file indicated by
    `output_filename`; rejected assertions are written to a second stream at
    `output_filename + '.reject'`.

    The input file should be made from multiple sources of assertions by
    concatenating and sorting them. This process requires its input to be
    sorted so that all edges for the same assertion (the same first column)
    appear consecutively.

    `core_filename` names a file with one URI per line; their 3-piece URI
    prefixes form the set of "core" terms used to vet ExternalURL edges.

    An assertion is rejected when its weight is not positive, when the
    blocklist blocks it, or when its 'rel' is 'ExternalURL' and the prefix of
    its start node is not among the core prefixes.
    """
    def group_func(line):
        "Group lines by their URI (their first column)."
        return line.split('\t', 1)[0]

    out = MsgpackStreamWriter(output_filename)
    out_bad = MsgpackStreamWriter(output_filename + '.reject')
    try:
        with open(core_filename, encoding='utf-8') as core_file:
            core_prefixes = {uri_prefix(line.strip(), 3) for line in core_file}

        # Scan through the assertions twice to add derived words to the
        # blocklist
        blocklist = Blocklist.load(get_support_data_filename(BLOCK_FILENAME))
        for _scan in range(2):
            with open(input_filename, encoding='utf-8') as stream:
                for line in stream:
                    tmp_assertion = _make_assertion([line.strip()])
                    if tmp_assertion is None:
                        continue
                    blocklist.propagate_blocks(tmp_assertion)

        with open(input_filename, encoding='utf-8') as stream:
            for _key, line_group in itertools.groupby(stream, group_func):
                assertion = _make_assertion(line_group)
                if assertion is None:
                    continue
                destination = out
                if assertion['weight'] <= 0:
                    destination = out_bad
                if blocklist.is_blocked(assertion):
                    destination = out_bad
                if assertion['rel'] == 'ExternalURL':
                    # discard ExternalURL edges for things that aren't
                    # otherwise in ConceptNet
                    prefix = uri_prefix(assertion['start'], 3)
                    if prefix not in core_prefixes:
                        destination = out_bad
                destination.write(assertion)
    finally:
        # Close both writers even if reading or combining failed part-way
        out.close()
        out_bad.close()
def make_assertion(line_group):
    """
    Combine a group of tab-separated edge lines into a single assertion.

    Blank lines are ignored; if nothing is left, None is returned. The
    relation, start and end come from the first line, with start and end cut
    down to 4-piece URI prefixes. If `keep_concept` rejects either of them,
    None is returned.

    The fifth column of every line is parsed as JSON. The weights are summed
    and passed through `weight_scale`; the dataset is that of the first line;
    the surface text is the first non-None 'surfaceText' found; sources are
    collected in order, skipping any whose conjunction URI was already seen.
    The license is cc_sharealike if any line has it, else cc_attribution.
    """
    lines = [
        stripped
        for stripped in (line.rstrip() for line in line_group)
        if stripped
    ]
    if not lines:
        return None

    # FIXME: the steps leading up to this produce URIs that can differ based
    # on word senses. These don't get merged together, but they should.
    _uri, rel, start, end, _ = lines[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)
    if not keep_concept(start) or not keep_concept(end):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    total_weight = sum(info['weight'] for info in info_dicts)
    licenses = {info['license'] for info in info_dicts}

    surface_text = None
    sources = []
    seen_conjunctions = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction in seen_conjunctions:
                continue
            seen_conjunctions.add(conjunction)
            sources.append(subsource)

    license = (
        Licenses.cc_sharealike
        if Licenses.cc_sharealike in licenses
        else Licenses.cc_attribution
    )
    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight_scale(total_weight),
        dataset=info_dicts[0]['dataset'],
        license=license,
        sources=sources,
        surfaceText=surface_text,
    )
def interlanguage_mapping(interlang_path, ok_concepts):
    """
    Build a dictionary mapping DBPedia subject URLs to lists of equivalent
    URLs, from a bz2-compressed file of N-Quads.

    Quads are grouped by consecutive subject. A subject is skipped when the
    sixth piece of its translated URI contains one of the words 'album',
    'film', 'series', 'disambiguation', 'song' or 'band', or when the URI
    prefix of its translation is not in `ok_concepts`.

    For each remaining subject, the mapped list starts with the subject URL
    itself, followed by the object URL of each of its quads, except URLs
    containing 'www.wikidata.org'.

    Reading stops, and the mapping built so far is returned, as soon as an
    object URL under 'http://wikidata.dbpedia.org/' has a resource name whose
    number (after the first character) is 1000000 or more. The subject being
    processed at that point is not added.
    """
    skipped_sense_words = (
        'album', 'film', 'series', 'disambiguation', 'song', 'band'
    )
    mapping = {}
    # The context manager closes the compressed file on every exit path,
    # including the early return below.
    with bz2.open(str(interlang_path), 'rt') as interlang_file:
        quads = parse_nquads(interlang_file)
        for subj, values in itertools.groupby(quads, itemgetter(0)):
            subj_url = subj['url']
            subj_concept = translate_dbpedia_url(subj_url)
            pieces = split_uri(subj_concept)
            if len(pieces) >= 6:
                sense = pieces[5]
                if any(word in sense for word in skipped_sense_words):
                    continue
            if uri_prefix(subj_concept) in ok_concepts:
                targets = [subj_url]

                for _subj, _pred, obj, _graph in values:
                    url = obj['url']
                    if 'www.wikidata.org' in url:
                        continue
                    if url.startswith('http://wikidata.dbpedia.org/'):
                        wikidata_id = resource_name(url)

                        # Return early when we see a high-numbered Wikidata ID
                        if int(wikidata_id[1:]) >= 1000000:
                            return mapping
                    targets.append(url)

                mapping[subj_url] = targets
    return mapping
def read_concept_file(concept_file):
    """
    Read a UTF-8 text file with one concept URI per line, and return the set
    of their URI prefixes (as computed by `uri_prefix`).

    Leading and trailing whitespace is stripped from each line before it is
    converted. The file is closed before returning.
    """
    with open(concept_file, encoding='utf-8') as file:
        return {uri_prefix(line.strip()) for line in file}
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare English terms to one whose row
    labels are standardized ConceptNet URIs (with some extra word2vec-style
    normalization of digits). Rows whose labels get the same standardized URI
    get combined, with earlier rows given more weight.

    The index of `frame` itself is replaced by the standardized labels.

    If `forms` is true, half of each row is additionally folded into the row
    of its lemma (per `lemmatize_uri`), when such a row exists.

    Returns a new DataFrame with one row per distinct label, in descending
    order of combined weight.
    """
    # Re-label the DataFrame with standardized, non-unique row labels
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Assign row n a weight of 1/(n+1) for weighted averaging
    nrows = frame.shape[0]
    weights = 1.0 / np.arange(1, nrows + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # groupby(level=0).sum() means to add rows that have the same label
    relabeled = frame.mul(weights, axis='rows').sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized != label and lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Rearrange the items in descending order of weight, similar to the order
    # we get them in from word2vec and GloVe. Series.sort() is gone from
    # current pandas; sort_values() is its replacement.
    combined_weights = combined_weights.sort_values(ascending=False)
    return scaled.loc[combined_weights.index]
def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
    """
    Given a list of weighted terms as (term, weight) tuples, add terms that
    are one step away in ConceptNet at a lower weight.

    Every input term is added a second time at a tenth of its weight. When
    `include_neighbors` is true, the term has no row in the frame, and a
    finder is available, up to `limit_per_term` edges are looked up; for each
    edge that matches the term on exactly one side, the term on the other
    side is added with weight `weight * min(10, edge weight) * 0.001`.

    Returns (uri_prefix(term), weight) pairs whose absolute weights sum to 1,
    or an empty list if all weights are zero.
    """
    self.load()
    expanded = terms[:]
    for term, weight in terms:
        expanded.append((term, weight / 10))
        if not include_neighbors or term in self.frame.index or self.finder is None:
            continue
        for edge in self.finder.lookup(term, limit=limit_per_term):
            start_term = edge['start']['term']
            end_term = edge['end']['term']
            matches_start = field_match(start_term, term)
            matches_end = field_match(end_term, term)
            # The neighbor is the side that does NOT match the query term;
            # edges matching on both sides or neither side are skipped.
            if matches_start and not matches_end:
                neighbor = end_term
            elif matches_end and not matches_start:
                neighbor = start_term
            else:
                continue
            neighbor_weight = weight * min(10, edge['weight']) * 0.001
            expanded.append((neighbor, neighbor_weight))

    total_weight = sum(abs(weight) for _term, weight in expanded)
    if total_weight == 0:
        return []
    return [
        (uri_prefix(term), weight / total_weight) for term, weight in expanded
    ]
def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
    """
    Given a list of weighted terms as (term, weight) tuples, add terms that
    are one step away in ConceptNet at a lower weight, terms in English that
    share the surface form with these terms, and the terms which share prefix
    with these terms, if the terms are OOV.

    This helps increase the recall power of the vector space, because it
    means you can find terms that are too infrequent to have their own vector
    by looking up their neighbors, etc.

    This forms a reasonable approximation of the vector an infrequent term
    would have anyway.

    The expansion is only done when `include_neighbors` is true, the term has
    no row in the frame, and a finder is available. Returns
    (uri_prefix(term), weight) pairs whose absolute weights sum to 1, or an
    empty list if all weights are zero.
    """
    self.load()
    expanded = terms[:]
    for term, weight in terms:
        if include_neighbors and term not in self.frame.index and self.finder is not None:
            for edge in self.finder.lookup(term, limit=limit_per_term):
                if field_match(edge['start']['term'], term) and not field_match(
                        edge['end']['term'], term):
                    neighbor = edge['end']['term']
                elif field_match(edge['end']['term'], term) and not field_match(
                        edge['start']['term'], term):
                    neighbor = edge['start']['term']
                else:
                    continue
                # TODO: explain this formula
                neighbor_weight = weight * min(10, edge['weight']) * 0.01
                expanded.append((neighbor, neighbor_weight))

            prefix_weight = 0.01
            if not term.startswith('/c/en/'):
                # FIXME: better language code handling
                englishified = '/c/en/' + term[6:]
                expanded.append((englishified, prefix_weight))

            # Shorten the term one character at a time until some known terms
            # share the prefix.
            prefix = term
            while prefix:
                # Skip excessively general lookups, for either an entire
                # language, or all terms starting with a single
                # non-ideographic letter. The slice (rather than prefix[-2])
                # keeps this check from raising IndexError once the prefix
                # is down to a single character.
                if prefix.endswith('/') or (
                        prefix[-2:-1] == '/' and prefix[-1] < chr(0x3000)):
                    break
                prefixed = self.terms_with_prefix(prefix)
                if prefixed:
                    n_prefixed = len(prefixed)
                    for prefixed_term in prefixed:
                        expanded.append(
                            (prefixed_term, prefix_weight / n_prefixed))
                    break
                prefix = prefix[:-1]

    total_weight = sum(abs(weight) for _term, weight in expanded)
    if total_weight == 0:
        return []
    return [
        (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
    ]
def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
    """
    Expand a list of (term, weight) tuples with terms that are one step away
    in ConceptNet, at a lower weight.

    Each input term is repeated at a tenth of its weight. If
    `include_neighbors` is true, the term is missing from the frame's index,
    and a finder exists, then up to `limit_per_term` edges are looked up and
    the far end of every edge that matches the term on exactly one side is
    added with weight `weight * min(10, edge weight) * 0.001`.

    The returned list holds (uri_prefix(term), weight) pairs, normalized so
    the absolute weights sum to 1. It is empty when the total weight is zero.
    """
    self.load()
    expanded = terms[:]

    for term, weight in terms:
        expanded.append((term, weight / 10))
        wants_neighbors = (
            include_neighbors
            and term not in self.frame.index
            and self.finder is not None
        )
        if wants_neighbors:
            for edge in self.finder.lookup(term, limit=limit_per_term):
                start_term = edge["start"]["term"]
                end_term = edge["end"]["term"]
                start_hit = field_match(start_term, term)
                end_hit = field_match(end_term, term)
                if start_hit and not end_hit:
                    neighbor = end_term
                elif end_hit and not start_hit:
                    neighbor = start_term
                else:
                    # matched both ends or neither: no distinct neighbor
                    continue
                scaled = weight * min(10, edge["weight"]) * 0.001
                expanded.append((neighbor, scaled))

    total_weight = sum(abs(item_weight) for _item, item_weight in expanded)
    if total_weight == 0:
        return []
    normalized = []
    for item, item_weight in expanded:
        normalized.append((uri_prefix(item), item_weight / total_weight))
    return normalized
def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
    """
    Given a list of weighted terms as (term, weight) tuples, add
    approximations for the terms that are out of vocabulary (OOV): terms one
    step away in ConceptNet, the English version of the term, and terms that
    share a prefix with it.

    The expansion happens only when `oov_vector` is true, the term has no row
    in the frame, and a finder is available. Neighbors come from
    `_find_neighbors`; the result of `_englishify` is added with weight 0.01
    for terms whose language is not 'en'; prefix matches come from
    `_match_prefix`.

    Returns (uri_prefix(term), weight) pairs whose absolute weights sum to 1,
    or an empty list if all weights are zero.
    """
    self.load()
    expanded = terms[:]
    for term, weight in terms:
        if not oov_vector or term in self.frame.index or self.finder is None:
            continue
        expanded.extend(self._find_neighbors(term, limit_per_term, weight))

        prefix_weight = 0.01
        if get_uri_language(term) != 'en':
            expanded.append((self._englishify(term), prefix_weight))

        expanded.extend(self._match_prefix(term, prefix_weight))

    total_weight = sum(abs(weight) for _term, weight in expanded)
    if total_weight == 0:
        return []
    return [
        (uri_prefix(term), weight / total_weight) for term, weight in expanded
    ]
def expand_terms(self, terms, oov_vector=True):
    """
    Given a list of weighted terms as (term, weight) tuples, find
    approximations for any terms that are out of vocabulary (OOV): the same
    term in English, or terms that share a prefix with the given term.

    A term is expanded only when `oov_vector` is true and the term has no row
    in the frame. For a term whose language is not 'en', the result of
    `_englishify` is added with weight 0.01 unless it is None. The matches
    returned by `_match_prefix` are added as well.

    Returns (uri_prefix(term), weight) pairs whose absolute weights sum to 1,
    or an empty list if all weights are zero.
    """
    expanded = terms[:]
    for term, _weight in terms:
        if not oov_vector or term in self.frame.index:
            continue
        prefix_weight = 0.01
        if get_uri_language(term) != 'en':
            englishified = self._englishify(term)
            if englishified is not None:
                expanded.append((englishified, prefix_weight))
        expanded.extend(self._match_prefix(term, prefix_weight))

    total_weight = sum(abs(weight) for _term, weight in expanded)
    if total_weight == 0:
        return []
    return [
        (uri_prefix(term), weight / total_weight) for term, weight in expanded
    ]
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.

    The dictionary always has '@id' and 'label' (computed with `uri_to_label`
    when no label is given). Terms additionally get 'language', 'term', a
    'sense_label' made of any URI pieces past the third, and '@type' 'Node'.
    URIs starting with 'http' get 'site', 'term', 'site_available' and
    '@type' 'Node'. URIs starting with '/r/' get '@type' 'Relation'.
    """
    node = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }

    if is_term(uri):
        pieces = split_uri(uri)
        node['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            node['sense_label'] = '/'.join(pieces[3:])
        node.update({'term': uri_prefix(uri), '@type': 'Node'})
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        node.update({
            'site': domain,
            'term': uri,
            # OpenCyc is down and UMBEL doesn't host their vocabulary on the
            # Web. This property indicates whether you can follow a link
            # via HTTP and retrieve more information.
            'site_available': domain not in {'sw.opencyc.org', 'umbel.org'},
            '@type': 'Node',
        })
    elif uri.startswith('/r/'):
        node['@type'] = 'Relation'
    return node
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Take in a file of tab-separated simple associations, and write a reduced
    copy to `output_filename`, without uncommon associations and associations
    unlikely to be useful.

    First pass: for every line whose relation is not '/r/SenseOf', each
    endpoint's URI prefix is counted once if the other endpoint's prefix
    starts with '/c/'. A concept is kept when its count is at least
    `en_cutoff`, or when it does not start with '/c/en/' and its count is at
    least `cutoff`.

    Second pass: a line is written out (with its endpoints replaced by their
    URI prefixes) unless an endpoint is bad per `concept_is_bad`, the relation
    is negative per `is_negative_relation`, an endpoint was not kept, the
    value is zero, or both endpoints have the same prefix.
    """
    occurrences = defaultdict(int)
    with open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, _value, _dataset, rel = row.rstrip().split('\t')
            if rel == '/r/SenseOf':
                continue
            left_prefix = uri_prefix(left)
            right_prefix = uri_prefix(right)
            if right_prefix.startswith('/c/'):
                occurrences[left_prefix] += 1
            if left_prefix.startswith('/c/'):
                occurrences[right_prefix] += 1

    kept = set()
    for concept, count in occurrences.items():
        if count >= en_cutoff:
            kept.add(concept)
        elif not concept.startswith('/c/en/') and count >= cutoff:
            kept.add(concept)

    with open(output_filename, 'w', encoding='utf-8') as out, \
            open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, value, dataset, rel = row.rstrip().split('\t', 4)
            if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
                continue
            numeric_value = float(value)
            left_prefix = uri_prefix(left)
            right_prefix = uri_prefix(right)
            if left_prefix not in kept or right_prefix not in kept:
                continue
            if numeric_value == 0 or left_prefix == right_prefix:
                continue
            fields = [left_prefix, right_prefix, value, dataset, rel]
            print('\t'.join(fields), file=out)
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare terms (e.g. of the form
    'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'). Rows whose labels get the same
    standardized URI are combined into a weighted average, with earlier rows
    given more weight.

    The index of `frame` itself is replaced in the process. If `forms` is
    true, half of each row is also folded into the row of its lemma (per
    `lemmatize_uri`), when such a row exists.

    Returns a new DataFrame with one row per distinct label, in descending
    order of combined weight.
    """
    # The first five labels are sampled to detect the 'lang/term' format
    # (exactly one slash), which the original comment attributes to fastText
    # training on OpenSubtitles data. Such labels carry their own language
    # code, so each is split on its slash and the term is standardized in
    # that language.
    if all(label.count('/') == 1 for label in frame.index[0:5]):
        split_labels = [label.partition('/') for label in frame.index]
        frame.index = [
            uri_prefix(standardized_uri(label_language, text))
            for label_language, _slash, text in split_labels
        ]

    # Standardize every label with the `language` argument. The resulting
    # labels are not necessarily unique.
    frame.index = [
        uri_prefix(standardized_uri(language, label)) for label in frame.index
    ]

    # Row n (counting from 0) gets weight 1/(n+1): earlier rows count more
    row_weights = 1.0 / np.arange(1, frame.shape[0] + 1)
    weight_by_label = pd.Series(row_weights, index=frame.index)

    # Add up the weighted rows, and the weights, that share a label
    weighted_rows = frame.mul(row_weights, axis='rows')
    relabeled = weighted_rows.sort_index().groupby(level=0).sum()
    combined_weights = weight_by_label.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemma = lemmatize_uri(label)
            if lemma == label or lemma not in relabeled.index:
                continue
            relabeled.loc[lemma] += relabeled.loc[label] / 2
            combined_weights.loc[lemma] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Heaviest labels first, similar to the order of word2vec and GloVe files
    ordered_weights = combined_weights.sort_values(ascending=False)
    return scaled.loc[ordered_weights.index]
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.

    The text is split with `wordfreq.simple_tokenize`; each token becomes the
    URI prefix of its standardized URI in `language`, with weight 1. The
    weighted terms are passed to `get_vector` with `oov_vector=False`.
    """
    weighted_terms = []
    for token in wordfreq.simple_tokenize(text):
        term = uri_prefix(standardized_uri(language, token))
        weighted_terms.append((term, 1.))
    return self.get_vector(weighted_terms, oov_vector=False)
def reduce_assoc(filename, output_filename, cutoff=3, en_cutoff=3):
    """
    Take in a file of tab-separated simple associations, and write a reduced
    copy to `output_filename`, without uncommon associations and associations
    unlikely to be useful.

    First pass: for every line whose relation is not '/r/SenseOf', both
    endpoints' URI prefixes are counted once. A concept is kept when its
    count is at least `en_cutoff`, or when it does not start with '/c/en/'
    and its count is at least `cutoff`.

    Second pass: a line is written out (with its endpoints replaced by their
    URI prefixes) unless an endpoint is bad per `concept_is_bad`, the relation
    is negative per `is_negative_relation`, an endpoint was not kept, the
    value is zero, or both endpoints have the same prefix.
    """
    occurrences = defaultdict(int)
    with open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, _value, _dataset, rel = row.rstrip().split('\t')
            if rel == '/r/SenseOf':
                continue
            occurrences[uri_prefix(left)] += 1
            occurrences[uri_prefix(right)] += 1

    def is_frequent(concept, count):
        if count >= en_cutoff:
            return True
        return not concept.startswith('/c/en/') and count >= cutoff

    kept = {
        concept
        for concept, count in occurrences.items()
        if is_frequent(concept, count)
    }

    with open(output_filename, 'w', encoding='utf-8') as out, \
            open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, value, dataset, rel = row.rstrip().split('\t', 4)
            if concept_is_bad(left) or concept_is_bad(right) or is_negative_relation(rel):
                continue
            numeric_value = float(value)
            left_prefix = uri_prefix(left)
            right_prefix = uri_prefix(right)
            if left_prefix not in kept or right_prefix not in kept:
                continue
            if numeric_value == 0 or left_prefix == right_prefix:
                continue
            fields = [left_prefix, right_prefix, value, dataset, rel]
            print('\t'.join(fields), file=out)
def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True):
    """
    Read an association file and build an (undirected) graph from it.

    If `filtered_concepts` is not None, it should be a collection of
    concepts; only edges whose two endpoints both belong to it are added.
    If it is None (the default), no such filtering is done: every concept is
    acceptable, not none of them.

    If `reject_negative_relations` is true (the default), edges whose
    relation is negative per `is_negative_relation` are left out.

    Regardless of these options, a line is skipped when `concept_is_bad`
    accepts either endpoint, when its value is zero, or when both endpoints
    have the same URI prefix.
    """
    graph = cls()
    restrict = filtered_concepts is not None

    with open(filename, encoding='utf-8') as assoc_file:
        for row in assoc_file:
            left, right, value, dataset, rel = row.rstrip().split('\t', 4)
            if concept_is_bad(left) or concept_is_bad(right):
                continue
            if reject_negative_relations and is_negative_relation(rel):
                continue

            numeric_value = float(value)
            left_prefix = uri_prefix(left)
            right_prefix = uri_prefix(right)
            if numeric_value == 0 or left_prefix == right_prefix:
                continue
            if restrict and (
                left_prefix not in filtered_concepts
                or right_prefix not in filtered_concepts
            ):
                continue

            # The value is handed to the graph as the original string
            graph.add_edge(left_prefix, right_prefix, value, dataset, rel)
    return graph
def standardize_row_labels(frame, language='en', forms=True):
    """
    Convert a frame whose row labels are bare terms (e.g. of the form
    'en/term') to one whose row labels are standardized ConceptNet URIs
    (e.g. of the form '/c/en/term'). Rows that end up with the same
    standardized URI are merged into a weighted average in which earlier
    rows weigh more.

    This replaces the index of `frame` itself. With `forms` true, each row
    also contributes half of itself to the row of its lemma (as given by
    `lemmatize_uri`) if that row is present.

    Returns a new DataFrame with one row per distinct label, sorted by
    descending combined weight.
    """
    original_labels = frame.index

    # Labels of the form 'lang/term' (exactly one slash, judged from the
    # first five) carry their own language code; the original comment
    # attributes this format to fastText training on OpenSubtitles data.
    # Each label is split on its slash and the term is standardized in the
    # label's own language.
    if all(label.count('/') == 1 for label in original_labels[0:5]):
        partitioned = [label.partition('/') for label in original_labels]
        frame.index = [
            uri_prefix(standardized_uri(own_language, text))
            for own_language, _slash, text in partitioned
        ]

    # Re-label every row using `language`; duplicates are expected
    standardized = []
    for label in frame.index:
        standardized.append(uri_prefix(standardized_uri(language, label)))
    frame.index = standardized

    # Weight 1/(n+1) for the row at position n
    weights = 1.0 / np.arange(1, frame.shape[0] + 1)
    label_weights = pd.Series(weights, index=frame.index)

    # Sum weighted vectors and weights over rows that share a label
    weighted = frame.mul(weights, axis='rows')
    relabeled = weighted.sort_index().groupby(level=0).sum()
    combined_weights = label_weights.sort_index().groupby(level=0).sum()

    # Optionally adjust words to be more like their word forms
    if forms:
        for label in relabeled.index:
            lemmatized = lemmatize_uri(label)
            if lemmatized == label:
                continue
            if lemmatized in relabeled.index:
                relabeled.loc[lemmatized] += relabeled.loc[label] / 2
                combined_weights.loc[lemmatized] += combined_weights.loc[label] / 2

    scaled = relabeled.div(combined_weights, axis='rows')

    # Descending weight order, like word2vec and GloVe output
    by_weight = combined_weights.sort_values(ascending=False)
    return scaled.loc[by_weight.index]
def uri_to_label(uri):
    """
    Convert a ConceptNet uri into a label to be used in nodes.

    URIs starting with '/c/' are first reduced with `uri_prefix`. The label
    is whatever follows the last slash, with underscores replaced by spaces:
    '/c/en/example' becomes 'example', and '/c/en/canary_islands' becomes
    'canary islands'.
    """
    if uri.startswith('/c/'):
        uri = uri_prefix(uri)
    _head, _slash, last_piece = uri.rpartition('/')
    return last_piece.replace('_', ' ')
def text_to_vector(self, language, text):
    """
    Used in Story Cloze Test to create a vector for text.

    The text is split with `wordfreq.tokenize` for the given language; each
    token becomes the URI prefix of its standardized URI, with weight 1. The
    weighted terms are passed to `get_vector` with `oov_vector=False`.
    """
    weighted_terms = []
    for token in wordfreq.tokenize(text, language):
        term = uri_prefix(standardized_uri(language, token))
        weighted_terms.append((term, 1.))
    return self.get_vector(weighted_terms, oov_vector=False)
def make_assertion(line_group):
    """
    Merge a group of tab-separated edge lines into one assertion.

    Lines that are empty after stripping trailing whitespace are dropped, and
    None is returned if none remain. The relation, start and end are read
    from the first line; start and end are truncated to 4-piece URI prefixes,
    and None is returned if `keep_concept` rejects either one.

    Each line's fifth column is parsed as JSON. The result combines them:
    weights are summed and rescaled with `weight_scale`, the dataset comes
    from the first line, the surface text is the first non-None 'surfaceText'
    found, and sources are kept in order without repeating a conjunction URI.
    The license is cc_sharealike if any line uses it, else cc_attribution.
    """
    lines = []
    for raw_line in line_group:
        stripped = raw_line.rstrip()
        if stripped:
            lines.append(stripped)
    if not lines:
        return None

    _uri, rel, start, end, _ = lines[0].split('\t')

    # We can't distinguish word senses well enough yet, so only keep them
    # up to the part of speech
    start = uri_prefix(start, 4)
    end = uri_prefix(end, 4)
    if not keep_concept(start) or not keep_concept(end):
        return None

    info_dicts = [json.loads(line.split('\t')[4]) for line in lines]
    unscaled_weight = 0
    licenses = set()
    for info in info_dicts:
        unscaled_weight += info['weight']
    for info in info_dicts:
        licenses.add(info['license'])
    dataset = info_dicts[0]['dataset']

    surface_text = None
    sources = []
    seen_sources = set()
    for info in info_dicts:
        if surface_text is None and 'surfaceText' in info:
            surface_text = info['surfaceText']
        for subsource in info['sources']:
            conjunction = conjunction_uri(*sorted(subsource.values()))
            if conjunction in seen_sources:
                continue
            seen_sources.add(conjunction)
            sources.append(subsource)

    if Licenses.cc_sharealike in licenses:
        license = Licenses.cc_sharealike
    else:
        license = Licenses.cc_attribution

    return make_edge(
        rel=rel,
        start=start,
        end=end,
        weight=weight_scale(unscaled_weight),
        dataset=dataset,
        license=license,
        sources=sources,
        surfaceText=surface_text,
    )
def propagate_blocks(self, edge, verbose=False):
    """
    Scan an edge and see if it is a DerivedFrom or FormOf edge whose right
    side matches a derivation block. If so, add (the 3-piece URI prefix of)
    its left side as a simple block and a derivation block.

    An edge qualifies when its 'rel' ends with 'DerivedFrom' or 'FormOf' and
    any of the prefixes of its 'end' is already in `self.derivation_blocks`.
    With `verbose`, a message is printed for each block added.
    """
    if not edge['rel'].endswith(('DerivedFrom', 'FormOf')):
        return
    end_prefixes = set(uri_prefixes(edge['end']))
    if not (end_prefixes & self.derivation_blocks):
        return

    prefix = uri_prefix(edge['start'], 3)
    self.simple_blocks.add(prefix)
    self.derivation_blocks.add(prefix)
    if verbose:
        print(f"Added derivation block: {prefix}")
def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
    """
    Given a list of weighted terms as (term, weight) tuples, add
    approximations for terms that have no vector of their own: terms one step
    away in ConceptNet, an English counterpart, and terms sharing the longest
    possible prefix.

    The expansion is only done when `include_neighbors` is true, the term has
    no row in the frame, and a finder is available:

    - the far end of every looked-up edge that matches the term on exactly
      one side is added with weight `weight * min(10, edge weight) * 0.01`;
    - a term not starting with '/c/en/' gets '/c/en/' plus everything after
      its sixth character, with weight 0.01;
    - the term is shortened one character at a time (stopping at a trailing
      slash) until `index_prefix_range` finds rows with that prefix, which
      then share a total weight of 0.01.

    Returns (uri_prefix(term), weight) pairs whose absolute weights sum to 1,
    or an empty list if all weights are zero.
    """
    self.load()
    expanded = terms[:]
    for term, weight in terms:
        # TODO: this disagrees with the docstring about whether neighbors
        # are added to non-OOV terms
        if not include_neighbors or term in self.frame.index or self.finder is None:
            continue

        for edge in self.finder.lookup(term, limit=limit_per_term):
            start_term = edge['start']['term']
            end_term = edge['end']['term']
            matches_start = field_match(start_term, term)
            matches_end = field_match(end_term, term)
            if matches_start and not matches_end:
                neighbor = end_term
            elif matches_end and not matches_start:
                neighbor = start_term
            else:
                continue
            # TODO: explain this formula
            neighbor_weight = weight * min(10, edge['weight']) * 0.01
            expanded.append((neighbor, neighbor_weight))

        prefix_weight = 0.01
        if not term.startswith('/c/en/'):
            # FIXME: better language code handling
            englishified = '/c/en/' + term[6:]
            expanded.append((englishified, prefix_weight))

        prefix = term
        while prefix and not prefix.endswith('/'):
            start_idx, end_idx = index_prefix_range(self.frame, prefix)
            if end_idx > start_idx:
                share = prefix_weight / (end_idx - start_idx)
                expanded.extend(
                    (prefixed_term, share)
                    for prefixed_term in self.frame.index[start_idx:end_idx]
                )
                break
            prefix = prefix[:-1]

    total_weight = sum(abs(weight) for _term, weight in expanded)
    if total_weight == 0:
        return []
    return [
        (uri_prefix(term), weight / total_weight) for term, weight in expanded
    ]
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.

    The dictionary always has '@id' and 'label' (computed with `uri_to_label`
    when no label is given). URIs starting with '/c/' additionally get
    'language', 'term' and, when the URI has more than three pieces, a
    'sense_label' joining the remaining pieces. URIs starting with 'http'
    get 'site' (the network location) and 'term' (the URI itself).
    """
    node = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }

    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        node['language'] = get_language(uri)
        sense_pieces = pieces[3:]
        if sense_pieces:
            node['sense_label'] = '/'.join(sense_pieces)
        node['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        node.update(site=urlparse(uri).netloc, term=uri)
    return node
def ld_node(uri, label=None):
    """
    Build the Linked Data dictionary for a URI.

    '@id' and 'label' are always present; a missing `label` is filled in
    with `uri_to_label(uri)`. For URIs starting with '/c/', the second piece
    of the split URI is stored as 'language', any pieces past the third are
    joined with '/' into 'sense_label', and `uri_prefix(uri)` is stored as
    'term'. For URIs starting with 'http', the network location is stored as
    'site' and the URI itself as 'term'.
    """
    result = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }
    is_concept_uri = uri.startswith('/c/')
    if is_concept_uri:
        parts = split_uri(uri)
        result['language'] = parts[1]
        extra = parts[3:]
        if len(parts) > 3:
            result['sense_label'] = '/'.join(extra)
        result['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        result['site'] = urlparse(uri).netloc
        result['term'] = uri
    return result
def ld_node(uri, label=None):
    """
    Describe a URI as a dictionary suitable for Linked Data.

    '@id' and 'label' are always present; a missing `label` is filled in
    with `uri_to_label(uri)`. Three kinds of URI get extra entries:

    - term URIs (as judged by `is_term`): 'language', optionally
      'sense_label', 'term', and '@type' = 'Node'
    - URIs starting with 'http': 'site', 'term', 'site_available', 'path',
      and '@type' = 'Node'
    - URIs starting with '/r/': '@type' = 'Relation'
    """
    if label is None:
        label = uri_to_label(uri)
    node = {'@id': uri, 'label': label}

    if is_term(uri):
        parts = split_uri(uri)
        node['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term. Usually it is
        # the part of speech, but when fine-grained information from
        # Wikipedia or WordNet is present, the last component is added too.
        if len(parts) > 3:
            sense = parts[3]
            if len(parts) > 4 and parts[4] in ('wp', 'wn'):
                sense = sense + ', ' + parts[-1]
            node['sense_label'] = sense

        node['term'] = uri_prefix(uri)
        node['@type'] = 'Node'
    elif uri.startswith('http'):
        parsed = urlparse(uri)
        site = parsed.netloc
        node['site'] = site
        node['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        unavailable_sites = {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}
        node['site_available'] = site not in unavailable_sites
        node['path'] = parsed.path
        node['@type'] = 'Node'
    elif uri.startswith('/r/'):
        node['@type'] = 'Relation'
    return node
def ld_node(uri, label=None):
    """
    Build the Linked Data dictionary for a URI.

    The dictionary always carries '@id' and 'label' (taken from
    `uri_to_label(uri)` when `label` is None). Term URIs, as judged by
    `is_term`, add 'language', an optional 'sense_label', 'term', and
    '@type' = 'Node'. URIs starting with 'http' add 'site', 'term',
    'site_available', and '@type' = 'Node'. URIs starting with '/r/' add
    '@type' = 'Relation'.
    """
    result = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }

    if is_term(uri):
        parts = split_uri(uri)
        result['language'] = get_uri_language(uri)

        # The sense label is usually just the part of speech; senses that
        # come from Wikipedia or WordNet also include the final component
        # to keep them reasonably distinct.
        has_sense = len(parts) > 3
        if has_sense:
            result['sense_label'] = parts[3]
        if len(parts) > 4 and parts[4] in ('wp', 'wn'):
            result['sense_label'] = result['sense_label'] + ', ' + parts[-1]

        result['term'] = uri_prefix(uri)
        result['@type'] = 'Node'
        return result

    if uri.startswith('http'):
        site = urlparse(uri).netloc
        result['site'] = site
        result['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        result['site_available'] = not (site in {'sw.opencyc.org', 'umbel.org'})
        result['@type'] = 'Node'
        return result

    if uri.startswith('/r/'):
        result['@type'] = 'Relation'
    return result
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase crediting the sources of our data.

    `sources` is an iterable of source dictionaries; only their 'activity'
    and 'contributor' entries are consulted.

    If `specific` is True, Open Mind Common Sense contributors are listed
    individually (up to MAX_INDIVIDUALS, followed by an "N more" entry when
    there are more), and the PTT Pet Game and nadya.jp credits say how many
    players contributed. If False, only each source as a whole is credited;
    specific=False is used for the credit at the top of a page.

    Returns a Markup object that begins with "Source:" when exactly one
    credit was produced and with "Sources:" otherwise.
    """
    omcs_links = []
    n_omcs = 0
    n_ptt = 0
    n_nadya = 0
    other_links = set()

    for src in sources:
        activity = src.get('activity')
        if activity == '/s/activity/omcs/nadya.jp':
            n_nadya += 1
            continue
        if activity == '/s/activity/kyoto_yahoo':
            other_links.add(source_link(activity, KYOTO_YAHOO_CREDIT))
            continue
        if 'contributor' not in src:
            continue

        contributor = src['contributor']
        group = uri_prefix(contributor, 3)
        if group == '/s/contributor/omcs':
            # Link only the first MAX_INDIVIDUALS contributors, but count all.
            if len(omcs_links) < MAX_INDIVIDUALS:
                omcs_links.append(source_link(contributor, split_uri(contributor)[-1]))
            n_omcs += 1
        elif group == '/s/contributor/petgame':
            n_ptt += 1
        else:
            display_name = (
                CONTRIBUTOR_NAME_MAP[contributor]
                if contributor in CONTRIBUTOR_NAME_MAP
                else contributor
            )
            other_links.add(source_link(contributor, display_name))

    chunks = []

    if omcs_links:
        if not specific:
            chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
        else:
            if n_omcs > MAX_INDIVIDUALS:
                omcs_links.append("{} more".format(n_omcs - MAX_INDIVIDUALS))
            chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                    oxford_comma(omcs_links)
                )
            )

    if n_ptt:
        if not specific:
            chunks.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')
        else:
            players = "a player" if n_ptt == 1 else "{} players".format(n_ptt)
            chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(players)
            )

    if n_nadya:
        if not specific:
            chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')
        else:
            players = "a player" if n_nadya == 1 else "{} players".format(n_nadya)
            chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(players)
            )

    chunks.extend(sorted(other_links))

    if len(chunks) == 1:
        markup_text = "<strong>Source:</strong> {}".format(chunks[0])
    else:
        markup_text = "<strong>Sources:</strong> {}".format(oxford_comma(chunks))
    return Markup(markup_text)
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.

    Parameters:
        input_file: handed to `segmented_stream`, which is expected to yield
            (heading, items) pairs, one per Wiktionary page.
        db_file: path of the SQLite database; the open connection is passed
            to `transform_term`.
        output_file: destination handed to `MsgpackStreamWriter`.

    Every edge is written to the output stream; nothing is returned. The
    database connection and the output writer are both closed when the
    function finishes, including when an exception interrupts it.
    """
    # Imported here so this function does not rely on the module-level
    # import block providing it.
    from contextlib import closing

    with closing(sqlite3.connect(db_file)) as db, \
            closing(MsgpackStreamWriter(output_file)) as out:
        for heading, items in segmented_stream(input_file):
            language = heading['language']
            title = heading['title']
            dataset = '/d/wiktionary/{}'.format(language)
            url_title = heading['title'].replace(' ', '_')
            web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title)
            web_source = '/s/resource/wiktionary/{}'.format(language)

            source = {
                'contributor': web_source,
                'process': PARSER_RULE
            }

            # Scan through the 'from' items, such as the start nodes of
            # translations, looking for distinct etymologies. If we get more
            # than one etymology for a language, we need to distinguish them
            # as different senses in that language.
            all_etyms = {
                (item['from']['language'], etym_label(language, item['from']))
                for item in items
                if 'language' in item['from'] and item['from']['text'] == title
                and etym_label(language, item['from']) is not None
            }

            # Link the page itself to each language its title appears in.
            word_languages = {wlang for (wlang, _) in all_etyms}
            for wlang in sorted(word_languages):
                cpage = standardized_concept_uri(wlang, title)
                ld_edge = make_edge(
                    '/r/ExternalURL', cpage, web_url,
                    dataset=dataset, weight=0.25, sources=[source],
                    license=Licenses.cc_sharealike
                )
                out.write(ld_edge)

            etym_to_translation_sense = {}
            language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
            polysemous_languages = {
                lang for lang in language_etym_counts
                if language_etym_counts[lang] > 1
            }

            for item in items:
                tfrom = item['from']
                tto = item['to']
                assumed_languages = [language]
                lang1 = tfrom.get('language')
                lang2 = tto.get('language')
                if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                    assumed_languages.append(lang1)
                if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                    assumed_languages.append(lang2)

                cfrom = transform_term(
                    language, tfrom, assumed_languages, db,
                    use_etyms=(lang1 in polysemous_languages)
                )
                cto = transform_term(
                    language, tto, assumed_languages, db,
                    use_etyms=(lang2 in polysemous_languages)
                )

                if cfrom is None or cto is None:
                    continue
                # Skip edges whose endpoints share the same three-piece prefix.
                if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                    continue

                rel, switch = transform_relation(item['rel'])
                if rel is None:
                    continue
                if switch:
                    cfrom, cto = cto, cfrom

                # When translations are separated by sense, use only the first
                # sense we see for each etymology. That will have the most
                # representative translations.
                if item['rel'] == 'translation':
                    etym_key = (tfrom['language'], etym_label(language, tfrom))
                    sense = tfrom.get('sense', '')
                    if etym_to_translation_sense.setdefault(etym_key, sense) != sense:
                        continue

                weight = 0.25 if rel == '/r/EtymologicallyRelatedTo' else 1.
                edge = make_edge(rel, cfrom, cto, dataset=dataset, weight=weight,
                                 sources=[source],
                                 surfaceStart=tfrom['text'],
                                 surfaceEnd=tto['text'],
                                 license=Licenses.cc_sharealike)
                out.write(edge)
def make_edge(rel, start, end, dataset, license, sources,
              surfaceText=None, surfaceStart=None, surfaceEnd=None, weight=1.0):
    """
    Package the information representing an edge (a justified assertion)
    as a dictionary.

    `sources` must be a list of dictionaries. When either `surfaceStart` or
    `surfaceEnd` is missing, both are taken from
    `extract_surface_terms(surfaceText)`. The 'features' entry is filled in
    only when the prefixes of both endpoints are concepts; otherwise it is
    an empty list.

    >>> from pprint import pprint
    >>> from conceptnet5.uri import Licenses
    >>> e = make_edge(rel='/r/HasProperty',
    ...               start='/c/en/fire',
    ...               end='/c/en/hot',
    ...               dataset='/d/conceptnet/4/en',
    ...               license=Licenses.cc_attribution,
    ...               sources=[{'contributor': '/s/contributor/omcs/dev'}],
    ...               surfaceText='[[Fire]] is [[hot]]',
    ...               weight=1.0)
    >>> pprint(e)
    {'dataset': '/d/conceptnet/4/en',
     'end': '/c/en/hot',
     'features': ['/c/en/fire /r/HasProperty -',
                  '/c/en/fire - /c/en/hot',
                  '- /r/HasProperty /c/en/hot'],
     'license': 'cc:by/4.0',
     'rel': '/r/HasProperty',
     'sources': [{'contributor': '/s/contributor/omcs/dev'}],
     'start': '/c/en/fire',
     'surfaceEnd': 'hot',
     'surfaceStart': 'Fire',
     'surfaceText': '[[Fire]] is [[hot]]',
     'uri': '/a/[/r/HasProperty/,/c/en/fire/,/c/en/hot/]',
     'weight': 1.0}
    """
    start_prefix = uri_prefix(start)
    end_prefix = uri_prefix(end)

    features = []
    if is_concept(start_prefix) and is_concept(end_prefix):
        features.append("%s %s -" % (start_prefix, rel))
        features.append("%s - %s" % (start_prefix, end_prefix))
        features.append("- %s %s" % (rel, end_prefix))

    edge_uri = assertion_uri(rel, start, end)

    assert isinstance(sources, list), sources
    assert all(isinstance(entry, dict) for entry in sources), sources

    # Surface terms are only trusted when both were supplied explicitly.
    if surfaceStart is None or surfaceEnd is None:
        surfaceStart, surfaceEnd = extract_surface_terms(surfaceText)

    return {
        'uri': edge_uri,
        'rel': rel,
        'start': start,
        'end': end,
        'dataset': dataset,
        'sources': sources,
        'features': features,
        'license': license,
        'weight': weight,
        'surfaceText': surfaceText,
        'surfaceStart': surfaceStart,
        'surfaceEnd': surfaceEnd
    }
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing where our data came from.

    Each entry of `sources` is a source dictionary, of which only the
    'activity' and 'contributor' entries are read.

    With `specific` set to True, sources that have many known individual
    contributors list up to MAX_INDIVIDUALS of them (plus an "N more" entry
    for the rest), and player-based sources state how many players there
    were. With `specific` set to False, only the source as a whole is
    credited; that form is used for the credit at the top of a page.

    The result is a Markup object that starts with "Source:" when exactly
    one credit was produced and with "Sources:" otherwise.
    """
    def player_phrase(count):
        # "a player" for one contributor, "<n> players" otherwise.
        if count == 1:
            return "a player"
        return "{} players".format(count)

    omcs_links = []
    omcs_total = 0
    ptt_total = 0
    nadya_total = 0
    extra_links = set()

    for entry in sources:
        has_activity = 'activity' in entry
        if has_activity and entry['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_total += 1
        elif has_activity and entry['activity'] == '/s/activity/kyoto_yahoo':
            extra_links.add(source_link(entry['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in entry:
            who = entry['contributor']
            family = uri_prefix(who, 3)
            if family == '/s/contributor/omcs':
                if len(omcs_links) < MAX_INDIVIDUALS:
                    short_name = split_uri(who)[-1]
                    omcs_links.append(source_link(who, short_name))
                omcs_total += 1
            elif family == '/s/contributor/petgame':
                ptt_total += 1
            elif who in CONTRIBUTOR_NAME_MAP:
                extra_links.add(source_link(who, CONTRIBUTOR_NAME_MAP[who]))
            else:
                extra_links.add(source_link(who, who))

    pieces = []

    if omcs_links and specific:
        overflow = omcs_total - MAX_INDIVIDUALS
        if omcs_total > MAX_INDIVIDUALS:
            omcs_links.append("{} more".format(overflow))
        pieces.append(
            '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_links)
            )
        )
    elif omcs_links:
        pieces.append('<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors')

    if ptt_total and specific:
        pieces.append(
            '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(
                player_phrase(ptt_total)
            )
        )
    elif ptt_total:
        pieces.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_total and specific:
        pieces.append(
            '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(
                player_phrase(nadya_total)
            )
        )
    elif nadya_total:
        pieces.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    pieces.extend(sorted(extra_links))

    if len(pieces) == 1:
        return Markup("<strong>Source:</strong> {}".format(pieces[0]))
    return Markup("<strong>Sources:</strong> {}".format(oxford_comma(pieces)))
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase crediting the sources of our data.

    Each entry of `sources` is a source dictionary; only its 'activity' and
    'contributor' entries are read.

    When `specific` is true, Open Mind Common Sense contributors are listed
    individually (up to MAX_INDIVIDUALS, then an "N more" entry), and the
    PTT Pet Game and nadya.jp credits say how many players contributed.
    When it is false, each source is credited only as a whole.

    Returns a Markup object beginning with "Source:" when exactly one credit
    was produced and with "Sources:" otherwise.
    """
    # Display names for the Wiktionary editions that get a dedicated credit.
    wiktionary_names = {
        '/s/resource/en.wiktionary.org': "English Wiktionary",
        '/s/resource/de.wiktionary.org': "German Wiktionary",
        '/s/resource/fr.wiktionary.org': "French Wiktionary",
    }

    omcs_links = []
    n_omcs = 0
    n_ptt = 0
    n_nadya = 0
    other_links = set()

    for src in sources:
        if src.get('activity') == '/s/activity/omcs/nadya.jp':
            n_nadya += 1
            continue
        if 'contributor' not in src:
            continue

        contributor = src['contributor']
        group = uri_prefix(contributor, 3)
        if group == '/s/contributor/omcs':
            # Link only the first MAX_INDIVIDUALS contributors, but count all.
            if len(omcs_links) < MAX_INDIVIDUALS:
                omcs_links.append(source_link(contributor, split_uri(contributor)[-1]))
            n_omcs += 1
        elif group == '/s/contributor/petgame':
            n_ptt += 1
        elif group in wiktionary_names:
            other_links.add(source_link(group, wiktionary_names[group]))
        elif contributor in CONTRIBUTOR_NAME_MAP:
            other_links.add(source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor]))
        else:
            other_links.add(source_link(contributor, contributor))

    chunks = []

    if omcs_links:
        if not specific:
            chunks.append('<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors')
        else:
            if n_omcs > MAX_INDIVIDUALS:
                omcs_links.append("{} more".format(n_omcs - MAX_INDIVIDUALS))
            chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                    oxford_comma(omcs_links)
                )
            )

    if n_ptt:
        if not specific:
            chunks.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')
        else:
            players = "a player" if n_ptt == 1 else "{} players".format(n_ptt)
            chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(players)
            )

    if n_nadya:
        if not specific:
            chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')
        else:
            players = "a player" if n_nadya == 1 else "{} players".format(n_nadya)
            chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(players)
            )

    chunks.extend(sorted(other_links))

    if len(chunks) == 1:
        markup_text = "<strong>Source:</strong> {}".format(chunks[0])
    else:
        markup_text = "<strong>Sources:</strong> {}".format(oxford_comma(chunks))
    return Markup(markup_text)
def uri_to_label(uri):
    """
    Turn a URI into a short text label.

    URIs starting with '/c/' are first reduced with `uri_prefix`. The label
    is whatever follows the last '/' of the URI, with underscores replaced
    by spaces.
    """
    if uri.startswith('/c/'):
        uri = uri_prefix(uri)
    _, _, last_piece = uri.rpartition('/')
    return last_piece.replace('_', ' ')
def build_features_from_conceptnet_table(filename):
    """
    Read a tab-separated ConceptNet table and build a sparse matrix whose
    rows are concepts and whose columns are features.

    Each line of the file must have exactly five tab-separated fields:
    concept1, concept2, value, dataset, and relation. Features are strings
    made of a concept prefix, a relation, and a marker: '<concept> <rel> ~'
    for relations in SYMMETRIC_RELATIONS, and '<concept> <rel> -' or
    '- <concept> <rel>' otherwise. A feature is only built around a concept
    whose language is in CORE_LANGUAGES.

    Returns a tuple of (matrix, concept index, feature index), where the
    matrix is `mat.tocsr(shape)` and the indexes are pandas Index objects
    built from the concept and feature labels.
    """
    # NOTE(review): the nesting below was reconstructed from
    # whitespace-collapsed source; confirm it against the intended structure.
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            if relation in SYMMETRIC_RELATIONS:
                # NOTE(review): feature_pairs is only reset in this branch, so
                # the other branch appends to whatever list is left over from
                # an earlier line, and fails with a NameError if no symmetric
                # relation has been seen yet -- confirm whether that is
                # intended.
                feature_pairs = []
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1))
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1))

            # Count how often each feature occurs in the current pair list.
            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            # Only features that occur more than once are stored; each one is
            # recorded for every prefix that uri_prefixes(concept, 3) yields.
            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        # add() is used for the index it returns.
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    # NOTE(review): `value` here is whatever the last line of the file left
    # behind, and concept_labels.add() is called while concept_labels is
    # being iterated -- confirm both are intended.
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
def uri_to_label(uri):
    """
    Derive a readable label from a URI.

    A URI that starts with '/c/' is passed through `uri_prefix` first. The
    label is the final '/'-separated piece of the URI, with underscores
    turned into spaces.
    """
    source = uri_prefix(uri) if uri.startswith('/c/') else uri
    final_piece = source.rsplit('/', 1)[-1]
    return final_piece.replace('_', ' ')
def text_to_vector(self, language, text):
    """
    Create a vector for a piece of text; used in the Story Cloze Test.

    The text is tokenized with `wordfreq.tokenize`, each token is turned
    into a URI prefix with weight 1, and the weighted terms are passed to
    `self.get_vector` with neighbor expansion turned off.
    """
    weighted_terms = []
    for token in wordfreq.tokenize(text, language):
        term = uri_prefix(standardized_uri(language, token))
        weighted_terms.append((term, 1.))
    return self.get_vector(weighted_terms, include_neighbors=False)
def build_features_from_conceptnet_table(filename):
    """
    Build a sparse concept-by-feature matrix from a tab-separated ConceptNet
    table.

    Every line of the file is split into exactly five tab-separated fields:
    concept1, concept2, value, dataset, and relation. A feature is a string
    combining a concept prefix, a relation, and a marker: '<concept> <rel> ~'
    when the relation is in SYMMETRIC_RELATIONS, and '<concept> <rel> -' or
    '- <concept> <rel>' otherwise. Features are only built around concepts
    whose language is in CORE_LANGUAGES.

    Returns (matrix, concept index, feature index): the matrix comes from
    `mat.tocsr(shape)`, and the two indexes are pandas Index objects over
    the concept labels and the feature labels.
    """
    # NOTE(review): the nesting below was reconstructed from
    # whitespace-collapsed source; confirm it against the intended structure.
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            if relation in SYMMETRIC_RELATIONS:
                # NOTE(review): this is the only place feature_pairs is
                # reset. The other branch appends to the list left over from
                # an earlier line, and raises NameError if it runs before any
                # symmetric relation has been read -- confirm the intent.
                feature_pairs = []
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1)
                    )
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2)
                    )
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1)
                    )

            # Tally the occurrences of each feature in the current pair list.
            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            # Features seen only once are dropped. The rest are stored for
            # every prefix that uri_prefixes(concept, 3) yields.
            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        # add() is used for the index it returns.
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    # NOTE(review): `value` is left over from the last line of the file, and
    # concept_labels.add() runs while concept_labels is being iterated --
    # confirm both are intended.
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
def read_wiktionary(input_file, db_file, output_file):
    """
    Convert a stream of parsed Wiktionary data into ConceptNet edges.

    A `db_file` containing all known words in all languages must have already
    been prepared from the same data.

    Parameters:
        input_file: handed to `segmented_stream`, which is expected to yield
            (heading, items) pairs, one per Wiktionary page.
        db_file: path of the SQLite database; the open connection is passed
            to `transform_term`.
        output_file: destination handed to `MsgpackStreamWriter`.

    Every edge is written to the output stream; nothing is returned. The
    database connection and the output writer are both closed when the
    function finishes, including when an exception interrupts it.
    """
    # Imported here so this function does not rely on the module-level
    # import block providing it.
    from contextlib import closing

    with closing(sqlite3.connect(db_file)) as db, \
            closing(MsgpackStreamWriter(output_file)) as out:
        for heading, items in segmented_stream(input_file):
            language = heading['language']
            title = heading['title']
            dataset = '/d/wiktionary/{}'.format(language)
            url_title = heading['title'].replace(' ', '_')
            web_url = 'http://{}.wiktionary.org/wiki/{}'.format(
                language, url_title)
            web_source = '/s/resource/wiktionary/{}'.format(language)

            source = {'contributor': web_source, 'process': PARSER_RULE}

            # Scan through the 'from' items, such as the start nodes of
            # translations, looking for distinct etymologies. If we get more
            # than one etymology for a language, we need to distinguish them
            # as different senses in that language.
            all_etyms = {
                (item['from']['language'], etym_label(language, item['from']))
                for item in items
                if 'language' in item['from'] and item['from']['text'] == title
                and etym_label(language, item['from']) is not None
            }

            # Link the page itself to each valid language its title appears
            # in.
            word_languages = {wlang for (wlang, _) in all_etyms}
            for wlang in sorted(word_languages):
                if valid_language(wlang):
                    cpage = standardized_concept_uri(wlang, title)
                    ld_edge = make_edge('/r/ExternalURL',
                                        cpage,
                                        web_url,
                                        dataset=dataset,
                                        weight=0.25,
                                        sources=[source],
                                        license=Licenses.cc_sharealike)
                    out.write(ld_edge)

            etym_to_translation_sense = {}
            language_etym_counts = Counter(lang for (lang, etym) in all_etyms)
            polysemous_languages = {
                lang
                for lang in language_etym_counts
                if language_etym_counts[lang] > 1
            }

            for item in items:
                tfrom = item['from']
                tto = item['to']
                assumed_languages = [language]
                lang1 = tfrom.get('language')
                lang2 = tto.get('language')
                if lang1 and (lang1 not in assumed_languages) and valid_language(lang1):
                    assumed_languages.append(lang1)
                if lang2 and (lang2 not in assumed_languages) and valid_language(lang2):
                    assumed_languages.append(lang2)

                cfrom = transform_term(language,
                                       tfrom,
                                       assumed_languages,
                                       db,
                                       use_etyms=(lang1 in polysemous_languages))
                cto = transform_term(language,
                                     tto,
                                     assumed_languages,
                                     db,
                                     use_etyms=(lang2 in polysemous_languages))

                if cfrom is None or cto is None:
                    continue
                # Skip edges whose endpoints share the same three-piece prefix.
                if uri_prefix(cfrom, 3) == uri_prefix(cto, 3):
                    continue

                rel, switch = transform_relation(item['rel'])
                if rel is None:
                    continue
                if switch:
                    cfrom, cto = cto, cfrom

                # When translations are separated by sense, use only the first
                # sense we see for each etymology. That will have the most
                # representative translations.
                if item['rel'] == 'translation':
                    etym_key = (tfrom['language'], etym_label(language, tfrom))
                    sense = tfrom.get('sense', '')
                    if etym_to_translation_sense.setdefault(etym_key, sense) != sense:
                        continue

                weight = 0.25 if rel == '/r/EtymologicallyRelatedTo' else 1.
                edge = make_edge(rel,
                                 cfrom,
                                 cto,
                                 dataset=dataset,
                                 weight=weight,
                                 sources=[source],
                                 surfaceStart=tfrom['text'],
                                 surfaceEnd=tto['text'],
                                 license=Licenses.cc_sharealike)
                out.write(edge)
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing where our data came from.

    Only the 'activity' and 'contributor' entries of each source dictionary
    in `sources` are read.

    With `specific` true, Open Mind Common Sense contributors are named
    individually (at most MAX_INDIVIDUALS of them, followed by an "N more"
    entry), and the PTT Pet Game and nadya.jp credits mention the number of
    players. With `specific` false, each source is credited only as a whole.

    The result is a Markup object that starts with "Source:" when exactly
    one credit was produced and with "Sources:" otherwise.
    """
    def player_phrase(count):
        # "a player" for one contributor, "<n> players" otherwise.
        return "a player" if count == 1 else "{} players".format(count)

    # Display names for the Wiktionary editions that get a dedicated credit.
    wiktionary_credit = {
        '/s/resource/en.wiktionary.org': "English Wiktionary",
        '/s/resource/de.wiktionary.org': "German Wiktionary",
        '/s/resource/fr.wiktionary.org': "French Wiktionary",
    }

    omcs_links = []
    omcs_total = 0
    ptt_total = 0
    nadya_total = 0
    extra_links = set()

    for entry in sources:
        if 'activity' in entry and entry['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_total += 1
        elif 'contributor' in entry:
            who = entry['contributor']
            family = uri_prefix(who, 3)
            if family == '/s/contributor/omcs':
                if len(omcs_links) < MAX_INDIVIDUALS:
                    short_name = split_uri(who)[-1]
                    omcs_links.append(source_link(who, short_name))
                omcs_total += 1
            elif family == '/s/contributor/petgame':
                ptt_total += 1
            elif family in wiktionary_credit:
                extra_links.add(source_link(family, wiktionary_credit[family]))
            elif who in CONTRIBUTOR_NAME_MAP:
                extra_links.add(source_link(who, CONTRIBUTOR_NAME_MAP[who]))
            else:
                extra_links.add(source_link(who, who))

    pieces = []

    if omcs_links and specific:
        if omcs_total > MAX_INDIVIDUALS:
            omcs_links.append("{} more".format(omcs_total - MAX_INDIVIDUALS))
        pieces.append(
            '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_links)
            )
        )
    elif omcs_links:
        pieces.append('<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors')

    if ptt_total and specific:
        pieces.append(
            '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(
                player_phrase(ptt_total)
            )
        )
    elif ptt_total:
        pieces.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_total and specific:
        pieces.append(
            '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(
                player_phrase(nadya_total)
            )
        )
    elif nadya_total:
        pieces.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    pieces.extend(sorted(extra_links))

    if len(pieces) == 1:
        return Markup("<strong>Source:</strong> {}".format(pieces[0]))
    return Markup("<strong>Sources:</strong> {}".format(oxford_comma(pieces)))