Example #1
def expand_terms(terms, limit_per_term=20):
	start = itemgetter('start')
	end = itemgetter('end')
	results = []
	uris = set()
	expanded = terms[:]
	for term in expanded:
		for edge in FINDER.lookup(term, limit=limit_per_term):

			if field_match(start(edge), term) and split_uri(end(edge))[1] == 'en':
				neighbor = edge['end']
			elif field_match(end(edge), term) and split_uri(start(edge))[1] == 'en':
				neighbor = edge['start']
			else:
				continue
			neighbor_weight = 1.0 * min(10, edge['weight'])
			if edge['rel'].startswith('/r/Not'):
				neighbor_weight *= -1
			for prefix in uri_prefixes(neighbor):
				uris.add(prefix)
			results.append((neighbor, neighbor_weight))
	total_weight = sum(abs(weight) for (term, weight) in results)
	if total_weight == 0:
		return []
	return [(term, weight, weight / total_weight) for (term, weight) in results]
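
For reference, a minimal sketch of the normalization step at the end of this example, run on a hand-written results list (the URIs and weights below are made up):

results = [('/c/en/cat', 10.0), ('/c/en/dog', 5.0), ('/c/en/pet', -5.0)]
total_weight = sum(abs(weight) for (_term, weight) in results)   # 20.0
normalized = [(term, weight, weight / total_weight) for (term, weight) in results]
# -> [('/c/en/cat', 10.0, 0.5), ('/c/en/dog', 5.0, 0.25), ('/c/en/pet', -5.0, -0.25)]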
Example #2
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'], term) and not field_match(
                            edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'], term) and not field_match(
                            edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    # TODO: explain this formula
                    neighbor_weight = weight * min(10, edge['weight']) * 0.01
                    expanded.append((neighbor, neighbor_weight))

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    splits = split_uri(term)
                    if len(splits) > 2:
                        englishified = '/c/en/' + splits[2]
                        expanded.append((englishified, prefix_weight))

                while term:
                    # Skip excessively general lookups, for either an entire
                    # language, or all terms starting with a single
                    # non-ideographic letter
                    if len(split_uri(term)) < 3 \
                       or term.endswith('/') \
                       or (term[-2] == '/' and term[-1] < chr(0x3000)):
                        break
                    prefixed = self.terms_with_prefix(term)
                    if prefixed:
                        n_prefixed = len(prefixed)
                        for prefixed_term in prefixed:
                            expanded.append((prefixed_term, prefix_weight / n_prefixed))
                        break
                    term = term[:-1]

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]
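
A small, hypothetical illustration of the two expansion weights used above; the edge weight and URIs are invented, and splits[2] is assumed to be the surface text of the term:

weight = 1.0
edge_weight = 4.0                                        # hypothetical ConceptNet edge weight
neighbor_weight = weight * min(10, edge_weight) * 0.01   # -> 0.04

splits = ['c', 'fr', 'chat']                             # what split_uri('/c/fr/chat') yields
englishified = '/c/en/' + splits[2]                      # -> '/c/en/chat'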
Example #3
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
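
Roughly the dictionary this returns for a term URI, assuming uri_to_label('/c/en/cat/n/feline') yields the plain label 'cat' (not verified here):

expected = {
    '@id': '/c/en/cat/n/feline',
    'label': 'cat',              # assumed output of uri_to_label
    'language': 'en',
    'sense_label': 'n/feline',
    'term': '/c/en/cat',
    '@type': 'Node',
}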
Example #4
def browse_concept(uri):
    req_args = flask.request.args
    concept = '/c/%s' % uri
    pieces = split_uri(concept)
    if len(pieces) <= 2:
        return browse_node('c', pieces[1])
    limit = get_int(req_args, 'limit', 20, 0, 1000)
    filters = {}
    for key in VALID_KEYS:
        if key != 'node' and key in req_args:
            filters[key] = req_args[key]
    results = responses.lookup_grouped_by_feature(concept, filters, group_limit=limit)
    sources = []

    if 'error' in results:
        return flask.render_template('error.html', error=results['error'])

    for feature in results['features']:
        rel = feature['feature']['rel']
        if rel in REL_HEADINGS['en']:
            label_choices = REL_HEADINGS['en'][rel]
        else:
            label_choices = ['%s {0}' % rel, '{0} %s' % rel]

        if feature['symmetric'] or 'end' in feature['feature']:
            feat_label = label_choices[0]
        else:
            feat_label = label_choices[1]
        feature['label'] = feat_label.format(results['label'])
        for edge in feature['edges']:
            sources.extend(edge['sources'])

    return flask.render_template(
        'node_by_feature.html', term=results, features=results['features'], sources=sources
    )
Example #5
def keep_concept(uri):
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
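
The final bool(pieces[2]) check rejects URIs whose text piece is empty. A quick sketch of the assumption it relies on (split_uri splitting on slashes after dropping the leading one):

from conceptnet5.uri import split_uri

split_uri('/c/en/cat')   # ['c', 'en', 'cat'] -> kept
split_uri('/c/en/')      # ['c', 'en', ''] (assumed), so bool(pieces[2]) is False and the URI is dropped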
Example #6
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if any(word in sense for word in
                   ('album', 'film', 'series', 'disambiguation', 'song', 'band')):
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Example #7
def interlanguage_mapping(interlang_path, ok_concepts):
    quads = parse_nquads(bz2.open(str(interlang_path), 'rt'))
    mapping = {}
    for subj, values in itertools.groupby(quads, itemgetter(0)):
        subj_url = subj['url']
        subj_concept = translate_dbpedia_url(subj_url)
        pieces = split_uri(subj_concept)
        if len(pieces) >= 6:
            sense = pieces[5]
            if any(word in sense for word in
                   ('album', 'film', 'series', 'disambiguation', 'song', 'band')):
                continue
        if uri_prefix(subj_concept) in ok_concepts:
            targets = [subj_url]

            for _subj, _pred, obj, _graph in values:
                url = obj['url']
                if 'www.wikidata.org' in url:
                    continue
                if url.startswith('http://wikidata.dbpedia.org/'):
                    wikidata_id = resource_name(url)

                    # Return early when we see a high-numbered Wikidata ID
                    if int(wikidata_id[1:]) >= 1000000:
                        return mapping
                targets.append(url)

            mapping[subj_url] = targets
    return mapping
Example #8
def term_freq(term):
    _c, lang, term = split_uri(term)[:3]
    if lang == 'en':
        return wordfreq.word_frequency(term, 'en', 'large')
    elif lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(term, lang)
    else:
        return 0.
Example #9
def keep_concept(uri):
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
Example #10
def browse_concept(uri):
    req_args = flask.request.args
    concept = '/c/%s' % uri
    pieces = split_uri(concept)
    if len(pieces) <= 2:
        return browse_node('c', pieces[1])
    limit = get_int(req_args, 'limit', 20, 0, 1000)

    # Offset is not used when grouping by features
    offset = get_int(req_args, 'offset', 0, 0, 10000)

    filters = {}
    for key in responses.VALID_KEYS:
        if key != 'node' and key in req_args:
            filters[key] = req_args[key]

    if filters:
        filters['node'] = concept
        limit = get_int(req_args, 'limit', 100, 0, 1000)
        return edge_list_query(filters, offset=offset, limit=limit)
    else:
        results = responses.lookup_grouped_by_feature(concept,
                                                      filters,
                                                      feature_limit=limit)
        sources = []
        external_links = []

        if 'error' in results:
            return flask.render_template('error.html', error=results['error'])

        rendered_features = []

        for feature in results['features']:
            rel = feature['feature']['rel']
            if rel in REL_HEADINGS['en']:
                label_choices = REL_HEADINGS['en'][rel]
            else:
                label_choices = ['%s {0}' % rel, '{0} %s' % rel]

            if feature['symmetric'] or 'end' in feature['feature']:
                feat_label = label_choices[0]
            else:
                feat_label = label_choices[1]
            feature['label'] = feat_label.format(results['label'])
            for edge in feature['edges']:
                sources.extend(edge['sources'])

            if rel == '/r/ExternalURL':
                external_links = feature['edges']
            else:
                rendered_features.append(feature)

        return flask.render_template('node_by_feature.html',
                                     term=results,
                                     features=rendered_features,
                                     sources=sources,
                                     external_links=external_links)
Example #11
def get_uri_language(uri):
    """
    Extract the language from a concept URI. If the URI points to an assertion,
    get the language of its first concept.
    """
    if uri.startswith('/a/'):
        return get_uri_language(parse_possible_compound_uri('a', uri)[0])
    elif uri.startswith('/c/'):
        return split_uri(uri)[1]
    else:
        return None
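
The pieces[1] indexing assumes split_uri breaks a URI on slashes with the leading slash removed, e.g.:

from conceptnet5.uri import split_uri

split_uri('/c/en/cat/n/feline')   # -> ['c', 'en', 'cat', 'n', 'feline']; index 1 is the language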
Example #12
 def _englishify(term):
     """
     Change the language of a /c/ term to English. If the input isn't a term,
     return None.
     """
     splits = split_uri(term)
     if not term.startswith('/c/'):
         return None
     if len(splits) > 2:
         englishified = '/c/en/' + splits[2]
         return englishified
Example #13
def get_uri_language(uri):
    """
    Extract the language from a concept URI. If the URI points to an assertion,
    get the language of its first concept.
    """
    if uri.startswith('/a/'):
        return get_uri_language(parse_possible_compound_uri('a', uri)[0])
    elif uri.startswith('/c/'):
        return split_uri(uri)[1]
    else:
        return None
Example #14
def keep_concept(uri):
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    if not valid_language(get_uri_language(uri)):
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
Example #15
def uri2term(arg, include_more=False):
	if arg.startswith('/c/'):
		if len(arg.split('/')) <= 3:
			return arg.split('/')[-1]
		result = arg.split('/')[3]
	else:
		result = arg.split('/')[-1]
	if result.startswith('be_') or result.startswith('to_'):
		result = result[3:]
	if include_more:
		result = '/'.join(split_uri(arg)[2:])
	return result
Example #16
    def lemmatize_uri(self, uri):
        pieces = split_uri(uri)
        if len(pieces) < 3:
            # Not a full /c/<language>/<text> URI; return it unchanged
            return uri
        language = pieces[1]
        text = pieces[2]
        rest = pieces[3:]
        if rest:
            pos = rest[0]
        else:
            pos = None

        root, _form = self.lookup(language, text, pos)
        return join_uri('c', language, root, *rest)
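
For context, a sketch of how the pieces are reassembled at the end, assuming join_uri simply re-joins them with slashes:

from conceptnet5.uri import join_uri

join_uri('c', 'en', 'cat', 'n')   # -> '/c/en/cat/n'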
Example #17
    def lemmatize_uri(self, uri):
        pieces = split_uri(uri)
        if len(pieces) < 3:
            # Not a full /c/<language>/<text> URI; return it unchanged
            return uri
        language = pieces[1]
        text = pieces[2]
        rest = pieces[3:]
        if rest:
            pos = rest[0]
        else:
            pos = None

        root, _form = self.lookup(language, text, pos)
        return join_uri('c', language, root, *rest)
Example #18
def get_results(feature):
    feature = feature.lower()
    ret = []
    with urlopen('http://conceptnet5.media.mit.edu/data/5.4/assoc' +
                 quote('/c/en/' + feature) +
                 "?filter=/c/en&limit=100") as response:
        html = response.read().decode('utf8')
        result = json.loads(html)
        for u, score in result["similar"]:
            s = float(score)
            if s >= 0.5:
                tmp = split_uri(u)
                if tmp[-1] != "neg":  #ignore negative relationships
                    ret.append((tmp[2], 1 - s))  #invert score priority
    return ret
Example #19
def term_freq(term):
    """
    Get an estimate of the frequency of this term from the 'wordfreq' library.
    When miniaturizing, we use this as a cutoff for which words to include
    in the vocabulary.

    Because we have the most data for English, we allow lower word frequencies
    in English (by reading in the 'large' list, whose frequencies can go
    below 1e-6).
    """
    _c, lang, term = split_uri(term)[:3]
    if lang == 'en':
        return wordfreq.word_frequency(term, 'en', 'large')
    elif lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(term, lang)
    else:
        return 0.
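
A quick illustration of the underlying wordfreq calls; exact values and the set of available wordlist names depend on the installed wordfreq version:

import wordfreq

wordfreq.word_frequency('cat', 'en', 'large')   # a small float, on the order of 1e-4
wordfreq.word_frequency('chat', 'fr')           # frequency from the default French wordlist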
Example #20
 def _match_prefix(self, term, prefix_weight):
     results = []
     while term:
         # Skip excessively general lookups, for either an entire
         # language, or all terms starting with a single
         # non-ideographic letter
         if (len(split_uri(term)) < 3 or term.endswith('/')
                 or (term[-2] == '/' and term[-1] < chr(0x3000))):
             break
         prefixed = self._terms_with_prefix(term)
         if prefixed:
             n_prefixed = len(prefixed)
             for prefixed_term in prefixed:
                 results.append((prefixed_term, prefix_weight / n_prefixed))
             break
         term = term[:-1]
     return results
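
The guard against "excessively general lookups" is terse; here is a hypothetical walk-through of what it skips, using chr(0x3000) (the ideographic space, U+3000) as the boundary below which characters count as non-ideographic:

from conceptnet5.uri import split_uri

term = '/c/en'
len(split_uri(term)) < 3                         # True: just a language, too general

term = '/c/en/c'
term[-2] == '/' and term[-1] < chr(0x3000)       # True: a single Latin letter, too general

term = '/c/ja/猫'
term[-2] == '/' and term[-1] < chr(0x3000)       # False: an ideographic character, so a
                                                 # one-character prefix is still allowed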
Example #21
def term_freq(term):
    """
    Get an estimate of the frequency of this term from the 'wordfreq' library.
    When miniaturizing, we use this as a cutoff for which words to include
    in the vocabulary.

    Because we have the most data for English, we allow lower word frequencies
    in English (by reading in the 'large' list, whose frequencies can go
    below 1e-6).
    """
    _c, lang, term = split_uri(term)[:3]
    if lang == 'en':
        return wordfreq.word_frequency(term, 'en', 'large')
    elif lang in CORE_LANGUAGES:
        return wordfreq.word_frequency(term, lang)
    else:
        return 0.
Example #22
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        ld['language'] = get_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #23
 def _match_prefix(self, term, prefix_weight):
     results = []
     while term:
         # Skip excessively general lookups, for either an entire
         # language, or all terms starting with a single
         # non-ideographic letter
         if (
             len(split_uri(term)) < 3
             or term.endswith('/')
             or (term[-2] == '/' and term[-1] < chr(0x3000))
         ):
             break
         prefixed = self._terms_with_prefix(term)
         if prefixed:
             n_prefixed = len(prefixed)
             for prefixed_term in prefixed:
                 results.append((prefixed_term, prefix_weight / n_prefixed))
             break
         term = term[:-1]
     return results
Example #24
def uri_to_lemmas(uri):
    """
    Given a normalized concept URI, extract the list of words (in their root
    form) that it contains in its text.

    >>> # This is the lemmatized concept meaning 'United States'
    >>> uri_to_lemmas('/c/en/unite_state')
    ['unite', 'state']
    >>> uri_to_lemmas('/c/en/township/n/united_states')
    ['township', 'unite', 'state']
    """
    uri_pieces = split_uri(uri)
    lemmas = uri_pieces[2].split('_')
    if len(uri_pieces) >= 5:
        lang = uri_pieces[1]
        text = uri_pieces[4].replace('_', ' ')
        if text not in BAD_NAMES_FOR_THINGS:
            disambig = normalized_concept_name(lang, text)
            lemmas.extend(disambig.split('_'))
    return lemmas
Example #25
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    # Unify simplified and traditional Chinese in associations.
    if parts[1] == 'zh_CN' or parts[1] == 'zh_TW':
        parts[1] = 'zh'
    return join_uri(*parts[:3])
Example #26
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {
        '@id': uri,
        'label': label
    }
    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        ld['language'] = pieces[1]
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #27
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, remove the region tag from Chinese assertions, so they are
    considered simply as assertions about Chinese regardless of whether it is
    Traditional or Simplified Chinese. In the cases where they overlap, this
    helps to make the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    # Unify simplified and traditional Chinese in associations.
    if parts[1] == 'zh_CN' or parts[1] == 'zh_TW':
        parts[1] = 'zh'
    return join_uri(*parts[:3])
Example #28
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = True
        if domain in {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}:
            ld['site_available'] = False
        ld['path'] = urlparse(uri).path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
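
A hypothetical example of the sense label this builds for a WordNet-style URI (the URI is made up to show the shape):

uri = '/c/en/cat/n/wn/animal'
pieces = ['c', 'en', 'cat', 'n', 'wn', 'animal']   # split_uri(uri)
# pieces[3] is 'n'; pieces[4] is 'wn', so the last piece is appended:
sense_label = pieces[3] + ', ' + pieces[-1]        # -> 'n, animal'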
Example #29
def browse_concept(uri):
    req_args = flask.request.args
    concept = '/c/%s' % uri
    pieces = split_uri(concept)
    if len(pieces) <= 2:
        return browse_node('c', pieces[1])
    limit = get_int(req_args, 'limit', 20, 0, 1000)
    filters = {}
    for key in VALID_KEYS:
        if key != 'node' and key in req_args:
            filters[key] = req_args[key]
    results = responses.lookup_grouped_by_feature(concept,
                                                  filters,
                                                  group_limit=limit)
    sources = []

    if 'error' in results:
        return flask.render_template('error.html', error=results['error'])

    for feature in results['features']:
        rel = feature['feature']['rel']
        if rel in REL_HEADINGS['en']:
            label_choices = REL_HEADINGS['en'][rel]
        else:
            label_choices = ['%s {0}' % rel, '{0} %s' % rel]

        if feature['symmetric'] or 'end' in feature['feature']:
            feat_label = label_choices[0]
        else:
            feat_label = label_choices[1]
        feature['label'] = feat_label.format(results['label'])
        for edge in feature['edges']:
            sources.extend(edge['sources'])

    return flask.render_template('node_by_feature.html',
                                 term=results,
                                 features=results['features'],
                                 sources=sources)
Example #30
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are considered
    simply as assertions about Chinese regardless of whether it is Traditional
    or Simplified Chinese. In the cases where they overlap, this helps to make
    the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if parts[1] != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
Example #31
def reduce_concept(concept):
    """
    Remove the part of speech and disambiguation (if present) from a concept,
    leaving a potentially ambiguous concept that can be matched against surface
    text.

    Additionally, simplify language tags to a bare language. The main purpose
    is to remove the region tag from Chinese assertions, so they are considered
    simply as assertions about Chinese regardless of whether it is Traditional
    or Simplified Chinese. In the cases where they overlap, this helps to make
    the information more complete.

    >>> reduce_concept('/c/en/cat/n/feline')
    '/c/en/cat'
    >>> reduce_concept('/c/zh_TW/良好')
    '/c/zh/良好'
    """
    parts = split_uri(concept)
    langtag = parts[1]
    if parts[1] != '[':
        langcode = langcodes.get(langtag).language
        if langcode:
            parts[1] = langcode
    return join_uri(*parts[:3])
Example #32
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #33
def describe_sources(sources, specific=True):
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source[
                'activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif prefix == '/s/resource/en.wiktionary.org':
                more_sources.add(source_link(prefix, "English Wiktionary"))
            elif prefix == '/s/resource/de.wiktionary.org':
                more_sources.add(source_link(prefix, "German Wiktionary"))
            elif prefix == '/s/resource/fr.wiktionary.org':
                more_sources.add(source_link(prefix, "French Wiktionary"))
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor,
                                CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count -
                                                          MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors))
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.
                format(count_str))
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.
                format(count_str))
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks))
    return Markup(source_markup)
Example #34
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing the sources of our data.

    If `specific` is True, sources with many known individual contributors
    will list up to MAX_INDIVIDUALS of those contributors. If False, only
    the source as a whole will be credited. specific=False is used for the
    credit at the top of a page.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source[
                'activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'activity' in source and source[
                'activity'] == '/s/activity/kyoto_yahoo':
            more_sources.add(
                source_link(source['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor,
                                CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count -
                                                          MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors))
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.
                format(count_str))
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.
                format(count_str))
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks))
    return Markup(source_markup)
Example #35
def describe_sources(sources, specific=True):
    """
    Build a marked-up text phrase describing the sources of our data.

    If `specific` is True, sources with many known individual contributors
    will list up to MAX_INDIVIDUALS of those contributors. If False, only
    the source as a whole will be credited. specific=False is used for the
    credit at the top of a page.
    """
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'activity' in source and source['activity'] == '/s/activity/kyoto_yahoo':
            more_sources.add(source_link(source['activity'], KYOTO_YAHOO_CREDIT))
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(
                    source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor])
                )
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors)
            )
            source_chunks.append(omcs_str)
        else:
            source_chunks.append(
                '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors'
            )
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(
                    count_str
                )
            )
        else:
            source_chunks.append(
                'the <a href="/s/contributor/petgame">PTT Pet Game</a>'
            )

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(
                    count_str
                )
            )
        else:
            source_chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(
            oxford_comma(source_chunks)
        )
    return Markup(source_markup)
Example #36
def uri_split(uri):
	# Take only the first three pieces so sense-tagged URIs don't break the unpacking
	_type, lang, term = split_uri(uri)[:3]
	if lang == 'en':
		return term
Example #37
 def _englishify(term):
     splits = split_uri(term)
     if len(splits) > 2:
         englishified = '/c/en/' + splits[2]
         return englishified
Example #38
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES
                    and get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start,
                    end=end,
                    weight=weight,
                    dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
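
A hypothetical example of the extra '/r/SenseOf' line emitted for a sense-tagged URI; the dataset URI here is illustrative:

uri = '/c/en/cat/n'
prefix = '/c/en/cat'                     # join_uri(*split_uri(uri)[:3])
line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
    start=uri, end=prefix, weight=1., dataset='/d/conceptnet/4/en', rel='/r/SenseOf')
# line == '/c/en/cat/n\t/c/en/cat\t1.0\t/d/conceptnet/4/en\t/r/SenseOf'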
Example #39
def describe_sources(sources, specific=True):
    omcs_contributors = []
    omcs_count = 0
    ptt_count = 0
    nadya_count = 0
    more_sources = set()

    for source in sources:
        if 'activity' in source and source['activity'] == '/s/activity/omcs/nadya.jp':
            nadya_count += 1
        elif 'contributor' in source:
            contributor = source['contributor']
            prefix = uri_prefix(contributor, 3)
            if prefix == '/s/contributor/omcs':
                if len(omcs_contributors) < MAX_INDIVIDUALS:
                    name = split_uri(contributor)[-1]
                    omcs_contributors.append(source_link(contributor, name))
                omcs_count += 1
            elif prefix == '/s/contributor/petgame':
                ptt_count += 1
            elif prefix == '/s/resource/en.wiktionary.org':
                more_sources.add(source_link(prefix, "English Wiktionary"))
            elif prefix == '/s/resource/de.wiktionary.org':
                more_sources.add(source_link(prefix, "German Wiktionary"))
            elif prefix == '/s/resource/fr.wiktionary.org':
                more_sources.add(source_link(prefix, "French Wiktionary"))
            elif contributor in CONTRIBUTOR_NAME_MAP:
                more_sources.add(source_link(contributor, CONTRIBUTOR_NAME_MAP[contributor]))
            else:
                more_sources.add(source_link(contributor, contributor))

    source_chunks = []
    if omcs_contributors:
        if specific:
            if omcs_count > MAX_INDIVIDUALS:
                omcs_contributors.append("{} more".format(omcs_count - MAX_INDIVIDUALS))

            omcs_str = '<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors {}'.format(
                oxford_comma(omcs_contributors)
            )
            source_chunks.append(omcs_str)
        else:
            source_chunks.append('<a href="/s/activity/omcs">Open Mind Common Sense</a> contributors')
    if ptt_count:
        if specific:
            if ptt_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(ptt_count)
            source_chunks.append(
                '{} of the <a href="/s/contributor/petgame">PTT Pet Game</a>'.format(count_str)
            )
        else:
            source_chunks.append('the <a href="/s/contributor/petgame">PTT Pet Game</a>')

    if nadya_count:
        if specific:
            if nadya_count == 1:
                count_str = "a player"
            else:
                count_str = "{} players".format(nadya_count)
            source_chunks.append(
                '{} of <a href="/s/activity/omcs/nadya.jp">nadya.jp</a>'.format(count_str)
            )
        else:
            source_chunks.append('<a href="/s/activity/omcs/nadya.jp">nadya.jp</a>')

    source_chunks.extend(sorted(more_sources))
    if len(source_chunks) == 1:
        source_markup = "<strong>Source:</strong> {}".format(source_chunks[0])
    else:
        source_markup = "<strong>Sources:</strong> {}".format(oxford_comma(source_chunks))
    return Markup(source_markup)
Example #40
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset, rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #41
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                start_uri.startswith('/c/') and end_uri.startswith('/c/') and
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #42
 def _englishify(term):
     splits = split_uri(term)
     if len(splits) > 2:
         englishified = '/c/en/' + splits[2]
         return englishified
Example #43
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations using ConceptNet by using
    dimensionality reduction, as in the `assoc_space` package.

    FIXME: the above is out of date, we use conceptnet5.vectors now

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES and
                get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf'
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset,
                    rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)