def expand_terms(terms, limit_per_term=20):
    """
    Expand a list of term URIs with their ConceptNet neighbors.

    For each input term, look up edges in the global FINDER and collect the
    English node on the other side of each matching edge. Each neighbor gets
    a weight capped at 10, negated when the relation starts with '/r/Not'.

    Returns a list of (term, weight, normalized_weight) triples, where
    normalized_weight is the weight divided by the total absolute weight.
    Returns [] when the total weight is zero.
    """
    start = itemgetter('start')
    end = itemgetter('end')
    results = []
    # NOTE(review): `uris` collects prefixes of every neighbor but is never
    # returned or read here -- possibly vestigial; confirm before removing.
    uris = set()
    expanded = terms[:]
    for term in expanded:
        for edge in FINDER.lookup(term, limit=limit_per_term):
            # Keep only edges where one endpoint matches the query term and
            # the opposite endpoint is an English concept.
            if field_match(start(edge), term) and split_uri(end(edge))[1] == 'en':
                neighbor = edge['end']
            elif field_match(end(edge), term) and split_uri(start(edge))[1] == 'en':
                neighbor = edge['start']
            else:
                continue
            # Cap the influence of any single edge at weight 10.
            neighbor_weight = 1.0 * min(10, edge['weight'])
            # Negated relations ('/r/NotDesires', etc.) contribute negatively.
            if edge['rel'].startswith('/r/Not'):
                neighbor_weight *= -1
            for prefix in uri_prefixes(neighbor):
                uris.add(prefix)
            results.append((neighbor, neighbor_weight))
    total_weight = sum(abs(weight) for (term, weight) in results)
    if total_weight == 0:
        return []
    return [(term, weight, weight / total_weight) for (term, weight) in results]
def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
    """
    Given a list of weighted terms as (term, weight) tuples, add terms that
    are one step away in ConceptNet at a lower weight, terms in English
    that share the surface form with these terms, and the terms which
    share prefix with these terms, if the terms are OOV.

    This helps increase the recall power of the vector space, because it
    means you can find terms that are too infrequent to have their own
    vector by looking up their neighbors, etc.

    This forms a reasonable approximation of the vector an infrequent
    term would have anyway.

    Returns a list of (prefix, normalized_weight) tuples whose absolute
    weights sum to 1, or [] if the total weight is zero.
    """
    self.load()
    expanded = terms[:]
    for term, weight in terms:
        # Only search for neighbors of out-of-vocabulary terms, and only
        # when a ConceptNet finder is available.
        if include_neighbors and term not in self.frame.index and self.finder is not None:
            for edge in self.finder.lookup(term, limit=limit_per_term):
                # Take the endpoint that is NOT the query term.
                if field_match(edge['start']['term'], term) and not field_match(
                        edge['end']['term'], term):
                    neighbor = edge['end']['term']
                elif field_match(edge['end']['term'], term) and not field_match(
                        edge['start']['term'], term):
                    neighbor = edge['start']['term']
                else:
                    continue
                # TODO: explain this formula
                neighbor_weight = weight * min(10, edge['weight']) * 0.01
                expanded.append((neighbor, neighbor_weight))
        prefix_weight = 0.01
        # Non-English terms also contribute their English surface form,
        # at the same small prefix weight.
        if get_uri_language(term) != 'en':
            splits = split_uri(term)
            if len(splits) > 2:
                englishified = '/c/en/' + splits[2]
                expanded.append((englishified, prefix_weight))
        while term:
            # Skip excessively general lookups, for either an entire
            # language, or all terms starting with a single
            # non-ideographic letter
            if len(split_uri(term))< 3 \
                    or term.endswith('/') \
                    or (term[-2] == '/' and term[-1] < chr(0x3000)):
                break
            prefixed = self.terms_with_prefix(term)
            if prefixed:
                # Split the prefix weight evenly among all matches found
                # at this prefix length, then stop shortening.
                n_prefixed = len(prefixed)
                for prefixed_term in prefixed:
                    expanded.append((prefixed_term, prefix_weight / n_prefixed))
                break
            term = term[:-1]
    total_weight = sum(abs(weight) for term, weight in expanded)
    if total_weight == 0:
        return []
    else:
        return [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    node = {'@id': uri, 'label': label}
    if is_term(uri):
        parts = split_uri(uri)
        node['language'] = get_uri_language(uri)
        # Everything past the text component describes the word sense.
        if len(parts) > 3:
            node['sense_label'] = '/'.join(parts[3:])
        node['term'] = uri_prefix(uri)
        node['@type'] = 'Node'
    elif uri.startswith('http'):
        site = urlparse(uri).netloc
        node['site'] = site
        node['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        node['site_available'] = site not in {'sw.opencyc.org', 'umbel.org'}
        node['@type'] = 'Node'
    elif uri.startswith('/r/'):
        node['@type'] = 'Relation'
    return node
def browse_concept(uri):
    """
    Flask view rendering a concept page, with the concept's edges grouped
    by feature.
    """
    req_args = flask.request.args
    concept = '/c/%s' % uri
    pieces = split_uri(concept)
    # Only a language code was given; show the language node instead.
    if len(pieces) <= 2:
        return browse_node('c', pieces[1])
    limit = get_int(req_args, 'limit', 20, 0, 1000)
    filters = {}
    for key in VALID_KEYS:
        if key != 'node' and key in req_args:
            filters[key] = req_args[key]
    results = responses.lookup_grouped_by_feature(concept, filters, group_limit=limit)
    sources = []
    if 'error' in results:
        return flask.render_template('error.html', error=results['error'])
    for feature in results['features']:
        rel = feature['feature']['rel']
        # Pick a human-readable heading; fall back to a generic
        # "<rel> {term}" / "{term} <rel>" pair for unknown relations.
        if rel in REL_HEADINGS['en']:
            label_choices = REL_HEADINGS['en'][rel]
        else:
            label_choices = ['%s {0}' % rel, '{0} %s' % rel]
        # Choose the variant where the concept fills the right slot.
        if feature['symmetric'] or 'end' in feature['feature']:
            feat_label = label_choices[0]
        else:
            feat_label = label_choices[1]
        feature['label'] = feat_label.format(results['label'])
        for edge in feature['edges']:
            sources.extend(edge['sources'])
    return flask.render_template(
        'node_by_feature.html', term=results,
        features=results['features'], sources=sources)
def keep_concept(uri):
    """
    Decide whether to keep a concept: absolute URLs always pass; concept
    URIs pass when their language is known and they have a text component.
    """
    if is_absolute_url(uri):
        return True
    language = get_uri_language(uri)
    if language in ALL_LANGUAGES:
        return bool(split_uri(uri)[2])
    return False
def interlanguage_mapping(interlang_path, ok_concepts):
    """
    Read DBPedia's interlanguage-links N-quads file and map each subject URL
    to the list of equivalent URLs in other languages.

    Subjects whose ConceptNet sense looks like a proper-noun collection
    (albums, films, series, songs, bands, disambiguation pages) are skipped,
    as are targets on www.wikidata.org. Iteration stops early (returning the
    mapping built so far) once a Wikidata ID >= Q1000000 is seen.
    """
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            # Skip proper-noun-heavy senses. (The original or-chain listed
            # 'album' twice; the set below is the deduplicated equivalent.)
            if any(word in sense for word in
                   ('album', 'film', 'series', 'disambiguation', 'song', 'band')):
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]
            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)
                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)
            mapping[subj_url] = targets
    return mapping
def term_freq(term):
    """
    Estimate the frequency of a concept's text using the 'wordfreq' library.
    English uses the 'large' wordlist; other core languages use the default
    list; anything else gets frequency 0.
    """
    _c, lang, text = split_uri(term)[:3]
    if lang == 'en':
        return wordfreq.word_frequency(text, 'en', 'large')
    if lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(text, lang)
    return 0.
def keep_concept(uri):
    """
    Keep absolute URLs unconditionally; keep concept URIs only when their
    language is recognized and the text component is non-empty.
    """
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    return bool(split_uri(uri)[2])
def browse_concept(uri):
    """
    Flask view for a concept page. If edge filters appear in the query
    string, return a filtered edge list; otherwise render the concept's
    edges grouped by feature, with ExternalURL edges shown separately.
    """
    req_args = flask.request.args
    concept = '/c/%s' % uri
    pieces = split_uri(concept)
    # Only a language code was given; show the language node instead.
    if len(pieces) <= 2:
        return browse_node('c', pieces[1])
    limit = get_int(req_args, 'limit', 20, 0, 1000)
    # Offset is not used when grouping by features
    offset = get_int(req_args, 'offset', 0, 0, 10000)
    filters = {}
    for key in responses.VALID_KEYS:
        if key != 'node' and key in req_args:
            filters[key] = req_args[key]
    if filters:
        filters['node'] = concept
        # A flat edge list supports a larger default page size.
        limit = get_int(req_args, 'limit', 100, 0, 1000)
        return edge_list_query(filters, offset=offset, limit=limit)
    else:
        results = responses.lookup_grouped_by_feature(
            concept, filters, feature_limit=limit)
        sources = []
        external_links = []
        if 'error' in results:
            return flask.render_template('error.html', error=results['error'])
        rendered_features = []
        for feature in results['features']:
            rel = feature['feature']['rel']
            # Pick a human-readable heading; fall back to a generic
            # "<rel> {term}" / "{term} <rel>" pair for unknown relations.
            if rel in REL_HEADINGS['en']:
                label_choices = REL_HEADINGS['en'][rel]
            else:
                label_choices = ['%s {0}' % rel, '{0} %s' % rel]
            if feature['symmetric'] or 'end' in feature['feature']:
                feat_label = label_choices[0]
            else:
                feat_label = label_choices[1]
            feature['label'] = feat_label.format(results['label'])
            for edge in feature['edges']:
                sources.extend(edge['sources'])
            # ExternalURL edges get their own section of the page.
            if rel == '/r/ExternalURL':
                external_links = feature['edges']
            else:
                rendered_features.append(feature)
        return flask.render_template(
            'node_by_feature.html', term=results,
            features=rendered_features, sources=sources,
            external_links=external_links)
def get_uri_language(uri):
    """
    Extract the language from a concept URI. If the URI points to an
    assertion, get the language of its first concept.
    """
    if uri.startswith('/c/'):
        return split_uri(uri)[1]
    if uri.startswith('/a/'):
        # Recurse into the first concept of the assertion.
        return get_uri_language(parse_possible_compound_uri('a', uri)[0])
    return None
def _englishify(term):
    """
    Change the language of a /c/ term to English.

    Return None if the input isn't a /c/ term, or if it has no text
    component to carry over.
    """
    # Validate the URI shape before doing any work on it; the original
    # split the URI first and checked the prefix afterward.
    if not term.startswith('/c/'):
        return None
    splits = split_uri(term)
    if len(splits) > 2:
        return '/c/en/' + splits[2]
    # Make the no-text fallthrough explicit instead of implicit.
    return None
def keep_concept(uri):
    """
    Decide whether a concept URI should be kept: absolute URLs always pass;
    concept URIs must be in a known, valid language and have a non-empty
    text component.
    """
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    # Hoisted: the original called get_uri_language(uri) twice.
    language = get_uri_language(uri)
    if language not in ALL_LANGUAGES:
        return False
    if not valid_language(language):
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
def uri2term(arg, include_more=False):
    """
    Extract the bare term text from a URI-like string.

    For '/c/...' URIs, take the piece after the language code; otherwise
    take the last slash-separated piece. Strips a leading 'be_' or 'to_'
    prefix. With include_more=True, return everything after the language
    code joined with '/' instead.
    """
    parts = arg.split('/')
    if arg.startswith('/c/'):
        # '/c/lang' or shorter: there is no term component to extract.
        if len(parts) <= 3:
            return parts[-1]
        result = parts[3]
    else:
        result = parts[-1]
    # Strip infinitive/copula markers such as 'to_sleep' or 'be_happy'.
    if result.startswith(('be_', 'to_')):
        result = result[3:]
    if include_more:
        result = '/'.join(split_uri(arg)[2:])
    return result
def lemmatize_uri(self, uri):
    """
    Return `uri` with its text component replaced by its root form, looked
    up via self.lookup(language, text, pos). URIs with no text component
    are returned unchanged.
    """
    pieces = split_uri(uri)
    # Bug fix: the guard was `len(pieces) < 2`, which let two-piece URIs
    # such as '/c/en' through and crashed on pieces[2] below. We need at
    # least ['c', language, text].
    if len(pieces) < 3:
        return uri
    language = pieces[1]
    text = pieces[2]
    rest = pieces[3:]
    # The first extra piece, when present, is the part of speech.
    pos = rest[0] if rest else None
    root, _form = self.lookup(language, text, pos)
    return join_uri('c', language, root, *rest)
def get_results(feature):
    """
    Query the ConceptNet 5.4 web API for English terms associated with
    `feature`, returning (term_text, 1 - score) pairs for associations
    whose score is at least 0.5.
    """
    feature = feature.lower()
    ret = []
    with urlopen('http://conceptnet5.media.mit.edu/data/5.4/assoc'
                 + quote('/c/en/' + feature)
                 + "?filter=/c/en&limit=100") as response:
        html = response.read().decode('utf8')
    result = json.loads(html)
    for u, score in result["similar"]:
        s = float(score)
        if s >= 0.5:
            tmp = split_uri(u)
            if tmp[-1] != "neg":  # ignore negative relationships
                ret.append((tmp[2], 1 - s))  # invert score priority
    return ret
def term_freq(term):
    """
    Get an estimate of the frequency of this term from the 'wordfreq'
    library. When miniaturizing, we use this as a cutoff for which words
    to include in the vocabulary.

    Because we have the most data for English, we allow lower word
    frequencies in English (by reading in the 'large' list, whose
    frequencies can go below 1e-6).
    """
    pieces = split_uri(term)[:3]
    lang = pieces[1]
    text = pieces[2]
    if lang == 'en':
        return wordfreq.word_frequency(text, 'en', 'large')
    elif lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(text, lang)
    else:
        return 0.
def _match_prefix(self, term, prefix_weight):
    """
    Find indexed terms sharing a prefix with `term`, trimming one character
    at a time until something matches, and split `prefix_weight` evenly
    among the matches.
    """
    matches = []
    while term:
        # Stop before the lookup becomes too general: a bare language
        # prefix, a trailing slash, or a single non-ideographic letter.
        too_general = (
            len(split_uri(term)) < 3
            or term.endswith('/')
            or (term[-2] == '/' and term[-1] < chr(0x3000))
        )
        if too_general:
            break
        candidates = self._terms_with_prefix(term)
        if candidates:
            share = prefix_weight / len(candidates)
            matches.extend((candidate, share) for candidate in candidates)
            break
        term = term[:-1]
    return matches
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    node = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }
    if uri.startswith('/c/'):
        node['language'] = get_language(uri)
        segments = split_uri(uri)
        # Everything past the text component labels the word sense.
        if len(segments) > 3:
            node['sense_label'] = '/'.join(segments[3:])
        node['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        node['site'] = urlparse(uri).netloc
        node['term'] = uri
    return node
def _match_prefix(self, term, prefix_weight):
    """
    Look up indexed terms that share a prefix of `term`, shortening the
    prefix until a match is found, and divide `prefix_weight` evenly
    among the matched terms.
    """
    results = []
    for cut in range(len(term), 0, -1):
        prefix = term[:cut]
        # Skip excessively general lookups: an entire language, or all
        # terms starting with a single non-ideographic letter.
        if len(split_uri(prefix)) < 3:
            break
        if prefix.endswith('/'):
            break
        if prefix[-2] == '/' and prefix[-1] < chr(0x3000):
            break
        found = self._terms_with_prefix(prefix)
        if found:
            weight_each = prefix_weight / len(found)
            results = [(item, weight_each) for item in found]
            break
    return results
def uri_to_lemmas(uri):
    """
    Given a normalized concept URI, extract the list of words (in their root
    form) that it contains in its text.

    >>> # This is the lemmatized concept meaning 'United States'
    >>> uri_to_lemmas('/c/en/unite_state')
    ['unite', 'state']
    >>> uri_to_lemmas('/c/en/township/n/united_states')
    ['township', 'unite', 'state']
    """
    pieces = split_uri(uri)
    lemmas = pieces[2].split('_')
    # A URI with 5+ pieces carries a disambiguation phrase; lemmatize
    # that phrase as well, unless it's a known-bad name.
    if len(pieces) >= 5:
        disambig_text = pieces[4].replace('_', ' ')
        if disambig_text not in BAD_NAMES_FOR_THINGS:
            normalized = normalized_concept_name(pieces[1], disambig_text)
            lemmas += normalized.split('_')
    return lemmas
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against
    surface text.

    Additionally, remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    # Fold both Chinese region variants into plain 'zh'.
    if parts[1] in ('zh_CN', 'zh_TW'):
        parts[1] = 'zh'
    return join_uri(*parts[:3])
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    ld = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }
    if uri.startswith('/c/'):
        segments = split_uri(uri)
        ld['language'] = segments[1]
        # Everything past the text component labels the word sense.
        if len(segments) > 3:
            ld['sense_label'] = '/'.join(segments[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        ld['site'] = urlparse(uri).netloc
        ld['term'] = uri
    return ld
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        # Get a reasonably-distinct sense label for the term. Usually it's
        # the part of speech, but when we have fine-grained information
        # from Wikipedia or WordNet, include the last component too.
        if len(pieces) > 3:
            sense = pieces[3]
            if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
                sense = sense + ', ' + pieces[-1]
            ld['sense_label'] = sense
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        parsed = urlparse(uri)
        ld['site'] = parsed.netloc
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        unavailable_sites = {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}
        ld['site_available'] = parsed.netloc not in unavailable_sites
        ld['path'] = parsed.path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def browse_concept(uri):
    """
    Flask view rendering a concept page, with the concept's edges grouped
    by feature.
    """
    req_args = flask.request.args
    concept = '/c/%s' % uri
    pieces = split_uri(concept)
    # Only a language code was given; show the language node instead.
    if len(pieces) <= 2:
        return browse_node('c', pieces[1])
    limit = get_int(req_args, 'limit', 20, 0, 1000)
    filters = {}
    for key in VALID_KEYS:
        if key != 'node' and key in req_args:
            filters[key] = req_args[key]
    results = responses.lookup_grouped_by_feature(concept, filters, group_limit=limit)
    sources = []
    if 'error' in results:
        return flask.render_template('error.html', error=results['error'])
    for feature in results['features']:
        rel = feature['feature']['rel']
        # Pick a human-readable heading; fall back to a generic
        # "<rel> {term}" / "{term} <rel>" pair for unknown relations.
        if rel in REL_HEADINGS['en']:
            label_choices = REL_HEADINGS['en'][rel]
        else:
            label_choices = ['%s {0}' % rel, '{0} %s' % rel]
        if feature['symmetric'] or 'end' in feature['feature']:
            feat_label = label_choices[0]
        else:
            feat_label = label_choices[1]
        feature['label'] = feat_label.format(results['label'])
        for edge in feature['edges']:
            sources.extend(edge['sources'])
    return flask.render_template('node_by_feature.html', term=results,
                                 features=results['features'], sources=sources)
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against
    surface text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    # NOTE(review): '[' looks like a sentinel for an unparseable language
    # tag -- confirm where that value originates before changing this check.
    if parts[1] != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    ld = {'@id': uri, 'label': uri_to_label(uri) if label is None else label}
    if is_term(uri):
        components = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        # Build a reasonably-distinct sense label: usually the part of
        # speech, plus the final component when Wikipedia or WordNet gave
        # us fine-grained sense information.
        if len(components) > 3:
            label_parts = [components[3]]
            if len(components) > 4 and components[4] in ('wp', 'wn'):
                label_parts.append(components[-1])
            ld['sense_label'] = ', '.join(label_parts)
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase crediting the sources of our data.

    When `specific` is True, up to MAX_INDIVIDUALS OMCS contributors are
    named individually; otherwise only the projects are credited.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()
    # Tally each source by where it came from.
    for source in sources:
        if 'activity' in source and source[
                'activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                # Name only the first MAX_INDIVIDUALS contributors; the
                # rest are summarized as a count.
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif prefix == '/s/resource/en.wiktionary.org':
                more_sources.add(source_link(prefix, "English Wiktionary"))
            elif prefix == '/s/resource/de.wiktionary.org':
                more_sources.add(source_link(prefix, "German Wiktionary"))
            elif prefix == '/s/resource/fr.wiktionary.org':
                more_sources.add(source_link(prefix, "French Wiktionary"))
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))
    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))
            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors))
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(count_str))
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>')
    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(count_str))
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')
    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks))
    return Markup(source_markup)
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing the sources of our data.

    If `specific` is True, sources with many known individual contributors
    will list up to MAX_INDIVIDUALS of those contributors. If False, only
    the source as a whole will be credited. specific=False is used for the
    credit at the top of a page.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()
    # Tally each source by where it came from.
    for source in sources:
        if 'activity' in source and source[
                'activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'activity' in source and source[
                'activity'] == '/s/activity/kyoto_yahoo':
            more_sources.add(
                source_link(source['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                # Name only the first MAX_INDIVIDUALS contributors; the
                # rest are summarized as a count.
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))
    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))
            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors))
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(count_str))
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>')
    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(count_str))
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')
    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks))
    return Markup(source_markup)
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing the sources of our data.

    If `specific` is True, sources with many known individual contributors
    will list up to MAX_INDIVIDUALS of those contributors. If False, only
    the source as a whole will be credited. specific=False is used for the
    credit at the top of a page.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()
    # Tally each source by where it came from.
    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'activity' in source and source['activity'] == '/s/activity/kyoto_yahoo':
            more_sources.add(source_link(source['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                # Name only the first MAX_INDIVIDUALS contributors; the
                # rest are summarized as a count.
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor])
                )
            else:
                more_sources.add(source_link(contributor, contributor))
    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))
            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors)
            )
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(
                    count_str
                )
            )
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>'
            )
    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(
                    count_str
                )
            )
        else:
            source_chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')
    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks)
        )
    return Markup(source_markup)
def uri_split(uri):
    """
    Return the text component of an English concept URI, or None for URIs
    in other languages.
    """
    # Slice instead of an exact 3-way unpack, so sense-tagged URIs such as
    # '/c/en/cat/n' don't raise ValueError. This matches the
    # split_uri(term)[:3] pattern used by term_freq. Also renamed the first
    # target so it no longer shadows the built-in `type`.
    _type, lang, term = split_uri(uri)[:3]
    if lang == 'en':
        return term
def _englishify(term):
    """Rewrite a concept URI's language to English, keeping its text."""
    pieces = split_uri(term)
    if len(pieces) <= 2:
        # No text component to carry over.
        return None
    return '/c/en/' + pieces[2]
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        # (uri, dataset) pairs whose /r/SenseOf line was already written.
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            # Keep only edges whose endpoints are both in common languages.
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES
                    and get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']
            # Link each sense-tagged URI to its 3-piece prefix, once per
            # (uri, dataset), as a weight-1 /r/SenseOf association.
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri, end=prefix, weight=1., dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)
            # Rewrite "people want/don't want X" as "X is good/bad".
            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]
            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)
        # Report the mean edge weight per dataset for diagnostics.
        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase crediting the sources of our data.

    When `specific` is True, up to MAX_INDIVIDUALS OMCS contributors are
    named individually; otherwise only the projects are credited.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()
    # Tally each source by where it came from.
    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                # Name only the first MAX_INDIVIDUALS contributors; the
                # rest are summarized as a count.
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif prefix == '/s/resource/en.wiktionary.org':
                more_sources.add(source_link(prefix, "English Wiktionary"))
            elif prefix == '/s/resource/de.wiktionary.org':
                more_sources.add(source_link(prefix, "German Wiktionary"))
            elif prefix == '/s/resource/fr.wiktionary.org':
                more_sources.add(source_link(prefix, "French Wiktionary"))
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))
    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))
            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors)
            )
            source_chunks.append(omcs_str)
        else:
            source_chunks.append('<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors')
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(count_str)
            )
        else:
            source_chunks.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')
    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(count_str)
            )
        else:
            source_chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')
    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(oxford_comma(source_chunks))
    return Markup(source_markup)
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        # (uri, dataset) pairs whose /r/SenseOf line was already written.
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            # Keep only edges whose endpoints are both in common languages.
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']
            # Link each sense-tagged URI to its 3-piece prefix, once per
            # (uri, dataset), as a weight-1 /r/SenseOf association.
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri, end=prefix, weight=1., dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)
            # Rewrite "people want/don't want X" as "X is good/bad".
            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]
            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)
        # Report the mean edge weight per dataset for diagnostics.
        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.  The relation is mostly ignored, except:

    - An assertion meaning "People want X" in English or Chinese becomes an
      association between X and "good"
    - An assertion meaning "People don't want X" becomes an association
      between X and "bad"

    The relation is mostly ignored because we have not yet found a good way
    to take it into account in dimensionality reduction.  The result can be
    used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.
    """
    row = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        # (uri, dataset) pairs that already produced a SenseOf line.
        prefixed = set()

        def emit(start, end, weight, dataset, rel):
            # Write one association line and fold it into the per-dataset
            # weight and count totals.
            weight_by_dataset[dataset] += weight
            count_by_dataset[dataset] += 1
            print(row(start=start, end=end, weight=weight,
                      dataset=dataset, rel=rel), file=out_stream)

        for assertion in read_msgpack_stream(input_filename):
            start_uri = assertion['start']
            end_uri = assertion['end']
            usable = (
                start_uri.startswith('/c/')
                and end_uri.startswith('/c/')
                and get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            )
            if not usable:
                continue
            rel = assertion['rel']
            weight = assertion['weight']
            dataset = assertion['dataset']
            # Link each sense-specific term (more than 3 URI pieces) to its
            # 3-piece prefix, once per dataset.
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) <= 3 or (uri, dataset) in prefixed:
                    continue
                prefixed.add((uri, dataset))
                emit(uri, join_uri(*pieces[:3]), 1., dataset, '/r/SenseOf')
            # Rewrite "people (don't) want X" into "good"/"bad" starts.
            if start_uri == '/c/en/person' and rel == '/r/Desires':
                start_uri = '/c/en/good'
            elif start_uri == '/c/en/person' and rel == '/r/NotDesires':
                start_uri = '/c/en/bad'
            elif start_uri == '/c/zh/人' and rel == '/r/Desires':
                start_uri = '/c/zh/良好'
            elif start_uri == '/c/zh/人' and rel == '/r/NotDesires':
                start_uri = '/c/zh/不良'
            emit(start_uri, end_uri, weight, dataset, rel)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations, one per line:

        start <TAB> end <TAB> weight <TAB> dataset <TAB> relation

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is
      converted to an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    (Only the start term is replaced in those cases; the relation label is
    written out unchanged.)

    The relation is mostly ignored because we have not yet found a good way
    to take the relation into account in dimensionality reduction.  The
    result is used for dimensionality reduction of the ConceptNet graph;
    this was historically done by the `assoc_space` package, and is now
    done by `conceptnet5.vectors`.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        # Per-dataset totals, used to report average weights at the end.
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        # (uri, dataset) pairs that already produced a SenseOf line, so each
        # is emitted at most once per dataset.
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            # Keep only assertions whose endpoints are both in common
            # languages.
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']
            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                # A URI with more than 3 pieces names a specific word sense;
                # link it to its general 3-piece prefix with weight 1.
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri, end=prefix, weight=1., dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)
            # Rewrite "people (don't) want X" assertions so the association
            # starts at "good"/"bad" (English and Chinese variants).
            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]
            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)
        # Report the average association weight contributed by each dataset.
        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)