Example #1
0
def keep_concept(uri):
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    if not valid_language(get_uri_language(uri)):
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
Example #2
0
def keep_concept(uri):
    # FIXME: possibly we should use the 'is_valid_concept' check that we use
    # elsewhere
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    if not valid_language(get_uri_language(uri)):
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])
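
Illustrative usage sketch (not from the original code): a minimal, runnable approximation of keep_concept, with simplified stand-ins for the conceptnet5 helpers and a tiny ALL_LANGUAGES set.

# Simplified stand-ins for the conceptnet5 helpers; these are assumptions
# made for the sketch, not the real implementations.
ALL_LANGUAGES = {'en', 'fr', 'ja'}

def is_absolute_url(uri):
    return uri.startswith('http://') or uri.startswith('https://')

def split_uri(uri):
    return uri.lstrip('/').split('/')

def get_uri_language(uri):
    return split_uri(uri)[1]

def valid_language(lang):
    return lang in ALL_LANGUAGES

def keep_concept(uri):
    # Same logic as the example above.
    if is_absolute_url(uri):
        return True
    if get_uri_language(uri) not in ALL_LANGUAGES:
        return False
    if not valid_language(get_uri_language(uri)):
        return False
    pieces = split_uri(uri)
    return bool(pieces[2])

print(keep_concept('/c/en/example'))                         # True: known language, non-empty term
print(keep_concept('/c/xx/example'))                         # False: unknown language
print(keep_concept('http://dbpedia.org/resource/Example'))   # True: absolute URL
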
Example #3
0
    def expand_terms(self, terms, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, if any of the terms
        are OOV, find approximations to those terms: the same term in English, or terms
        that share a prefix that's as long as possible with the given term.

        This helps increase the recall power of the vector space, because it means
        you can look up terms that are too infrequent to have their own vector and
        still get a reasonable guess at the vector they might have.
        """
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index:
                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    if englishified is not None:
                        expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]
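
A standalone sketch of the final normalization step, which rescales the expanded weights so their absolute values sum to 1 (uri_prefix is replaced here by an identity stand-in):

def uri_prefix(term):
    # Stand-in: the real uri_prefix trims a term URI to its main pieces.
    return term

expanded = [('/c/en/cat', 1.0), ('/c/en/feline', 0.5), ('/c/en/dog', -0.5)]
total_weight = sum(abs(weight) for term, weight in expanded)
normalized = [(uri_prefix(term), weight / total_weight) for (term, weight) in expanded]
print(normalized)
# [('/c/en/cat', 0.5), ('/c/en/feline', 0.25), ('/c/en/dog', -0.25)]
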
Example #4
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
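
As an illustration of the shape of the result for a term URI (written out by hand, assuming uri_to_label('/c/en/lead/n') returns 'lead' and uri_prefix trims the URI to '/c/en/lead'; this is not captured program output):

example_node = {
    '@id': '/c/en/lead/n',
    'label': 'lead',
    'language': 'en',
    'sense_label': 'n',
    'term': '/c/en/lead',
    '@type': 'Node',
}
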
Example #5
0
    def combined_index_and_new_term_sets(self, frame):
        """
        Find the sets of terms in the frame, and of additional terms in the
        graph not in the frame, but at finite distance in the graph from the 
        terms of the frame, in English and not.  Using these construct a 
        combined index satisfying the constraints required for input to 
        propagation (terms of the frame must come first, then terms from the 
        (vertices of) the graph not in the frame, with the non-English terms 
        not from the frame preceeding the English terms not from the frame).
        
        Returns the combined index, the set of non-English terms from the 
        graph not in the frame, and the set of English terms from the graph 
        not in the frame.
        """
        frame_terms = set(frame.index)

        # Get the ranks of the vertices of the graph with respect to the frame,
        # and use them to eliminate all vertices not a finite distance from the
        # frame.
        ranks = self.rank_vertices(frame)
        graph_terms = set(term for term in self.vertices if ranks[term] != -1)
        
        all_terms = graph_terms | frame_terms
        new_terms = all_terms - frame_terms
        new_english_terms = set(
            term for term in new_terms if get_uri_language(term) == 'en'
        )
        new_non_english_terms = new_terms - new_english_terms

        combined_index = pd.Index(
            list(frame_terms) +
            list(new_non_english_terms) +
            list(new_english_terms)
        )
        return combined_index, new_non_english_terms, new_english_terms
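
A toy, runnable sketch of the ordering constraint that the combined index must satisfy, using made-up URIs and plain pandas:

import pandas as pd

frame_terms = ['/c/en/cat', '/c/fr/chat']
new_non_english_terms = ['/c/de/katze']
new_english_terms = ['/c/en/kitten']

# Frame terms come first, then new non-English terms, then new English terms.
combined_index = pd.Index(frame_terms + new_non_english_terms + new_english_terms)
print(list(combined_index))
# ['/c/en/cat', '/c/fr/chat', '/c/de/katze', '/c/en/kitten']
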
Example #6
0
def prepare_vocab_for_morphology(language, input, output):
    """
    Morfessor's input is a list of terms with their counts. Here, we
    read a ConceptNet vocabulary file with counts (core_concept_counts.txt),
    filter it for a single language, and convert it into the input form that
    Morfessor expects.

    We're stripping out the word sense information here, which would cause
    the same term to appear multiple times. Because of that, we build up
    a new dictionary of counts, summing all occurrences of a term.

    We use _ to represent all spaces. In languages where the space-separated
    segments are atomic (Vietnamese), we use _ to represent the locations where
    subwords are allowed to end, and thus add _ to the end of the term as well.
    """
    vocab_counts = defaultdict(int)
    for line in input:
        countstr, uri = line.strip().split(' ', 1)
        if get_uri_language(uri) == language:
            term = split_uri(uri)[2]
            if language in ATOMIC_SPACE_LANGUAGES:
                term += '_'
            vocab_counts[term] += int(countstr)

    for term, count in sorted(list(vocab_counts.items())):
        print(count, term, file=output)
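
A self-contained sketch of the conversion, feeding a few fake count lines through io.StringIO; the URI helpers and ATOMIC_SPACE_LANGUAGES are simplified stand-ins, not the real conceptnet5 definitions:

import io
from collections import defaultdict

ATOMIC_SPACE_LANGUAGES = {'vi'}               # stand-in

def split_uri(uri):                           # stand-in
    return uri.lstrip('/').split('/')

def get_uri_language(uri):                    # stand-in
    return split_uri(uri)[1]

def prepare_vocab_for_morphology(language, input, output):
    vocab_counts = defaultdict(int)
    for line in input:
        countstr, uri = line.strip().split(' ', 1)
        if get_uri_language(uri) == language:
            term = split_uri(uri)[2]
            if language in ATOMIC_SPACE_LANGUAGES:
                term += '_'
            vocab_counts[term] += int(countstr)
    for term, count in sorted(vocab_counts.items()):
        print(count, term, file=output)

fake_input = io.StringIO("12 /c/en/example/n\n3 /c/en/example\n5 /c/fr/exemple\n")
out = io.StringIO()
prepare_vocab_for_morphology('en', fake_input, out)
print(out.getvalue(), end='')   # "15 example": the two senses merge, the French line is skipped
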
Example #7
0
    def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index and self.finder is not None:
                neighbors = self._find_neighbors(term, limit_per_term, weight)
                expanded.extend(neighbors)

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [
                (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
            ]
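
_englishify is not shown in this snippet, but the later variant in Example #10 builds the English fallback as '/c/en/' plus the term's text; a tiny sketch of that behavior (split_uri is a stand-in):

def split_uri(uri):                  # stand-in
    return uri.lstrip('/').split('/')

def englishify(term):
    # Keep the surface text but swap the language tag for 'en'.
    return '/c/en/' + split_uri(term)[2]

print(englishify('/c/fr/exemple'))   # /c/en/exemple
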
Example #8
0
    def combined_index_and_new_term_sets(self, frame):
        """
        Find the sets of terms in the frame, and of additional terms in the
        graph not in the frame, but at finite distance in the graph from the 
        terms of the frame, in English and not.  Using these construct a 
        combined index satisfying the constraints required for input to 
        propagation (terms of the frame must come first, then terms from the 
        (vertices of) the graph not in the frame, with the non-English terms 
        not from the frame preceeding the English terms not from the frame).
        
        Returns the combined index, the set of non-English terms from the 
        graph not in the frame, and the set of English terms from the graph 
        not in the frame.
        """
        frame_terms = set(frame.index)

        # Get the ranks of the vertices of the graph with respect to the frame,
        # and use them to eliminate all vertices not a finite distance from the
        # frame.
        ranks = self.rank_vertices(frame)
        graph_terms = set(term for term in self.vertices if ranks[term] != -1)

        all_terms = graph_terms | frame_terms
        new_terms = all_terms - frame_terms
        new_english_terms = set(term for term in new_terms
                                if get_uri_language(term) == 'en')
        new_non_english_terms = new_terms - new_english_terms

        combined_index = pd.Index(
            list(frame_terms) +
            list(new_non_english_terms) +
            list(new_english_terms)
        )
        return combined_index, new_non_english_terms, new_english_terms
Example #9
0
    def expand_terms(self, terms, limit_per_term=10, oov_vector=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if oov_vector and term not in self.frame.index and self.finder is not None:
                neighbors = self._find_neighbors(term, limit_per_term, weight)
                expanded.extend(neighbors)

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = self._englishify(term)
                    expanded.append((englishified, prefix_weight))

                prefix_matches = self._match_prefix(term, prefix_weight)
                expanded.extend(prefix_matches)

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [
                (uri_prefix(term), weight / total_weight) for (term, weight) in expanded
            ]
Example #10
0
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'],
                                   term) and not field_match(
                                       edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'],
                                     term) and not field_match(
                                         edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    # TODO: explain this formula
                    neighbor_weight = weight * min(10, edge['weight']) * 0.01
                    expanded.append((neighbor, neighbor_weight))

                prefix_weight = 0.01
                if get_uri_language(term) != 'en':
                    englishified = '/c/en/' + split_uri(term)[2]
                    expanded.append((englishified, prefix_weight))

                while term:
                    # Skip excessively general lookups, for either an entire
                    # language, or all terms starting with a single
                    # non-ideographic letter
                    if term.endswith('/') or (term[-2] == '/'
                                              and term[-1] < chr(0x3000)):
                        break
                    prefixed = self.terms_with_prefix(term)
                    if prefixed:
                        n_prefixed = len(prefixed)
                        for prefixed_term in prefixed:
                            expanded.append(
                                (prefixed_term, prefix_weight / n_prefixed))
                        break
                    term = term[:-1]

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]
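
A runnable toy version of the two fallbacks above: the neighbor-weight formula (edge weight clipped at 10, then scaled by 0.01) and the prefix back-off loop, with a tiny stand-in vocabulary in place of terms_with_prefix:

vocabulary = ['/c/en/toast', '/c/en/toaster', '/c/en/toasting']   # stand-in

def terms_with_prefix(prefix):
    return [t for t in vocabulary if t.startswith(prefix)]

# Neighbor weighting: clip the edge weight at 10 and scale it down by 100.
weight, edge_weight = 1.0, 4.2
neighbor_weight = weight * min(10, edge_weight) * 0.01            # 0.042

# Prefix back-off: strip trailing characters until some vocabulary terms
# share the prefix, then split the prefix weight among them.
term, prefix_weight, expanded = '/c/en/toastful', 0.01, []
while term:
    if term.endswith('/') or (term[-2] == '/' and term[-1] < chr(0x3000)):
        break
    prefixed = terms_with_prefix(term)
    if prefixed:
        n_prefixed = len(prefixed)
        for prefixed_term in prefixed:
            expanded.append((prefixed_term, prefix_weight / n_prefixed))
        break
    term = term[:-1]

print(neighbor_weight)
print(expanded)   # each of the three 'toast...' terms gets 0.01 / 3
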
Example #11
0
def merge_intersect(frames, subsample=20, k=300):
    """
    Combine the vector knowledge contained in `frames` over the vocabulary
    that they agree on, and use dimensionality reduction to mitigate the
    redundancy of learning the same thing multiple ways.

    If their vocabularies result from retrofitting, then the resulting
    vocabulary will be the vocabulary of the retrofit knowledge graph,
    plus any other terms that happen to be in all of the frames.
    """
    # Find the intersected vocabulary of the frames, and concatenate their
    # vectors over that vocabulary.
    joined = concat_intersect(frames)

    # Find a subset of the labels that we'll use for calculating the
    # dimensionality-reduced version. The labels we particularly care about
    # are single words in our CORE_LANGUAGES. Even those are too numerous,
    # so we take an arbitrary 1/n sample of them, where n is given by the
    # `subsample` parameter.
    filtered_labels = pd.Series([
        label for (i, label) in enumerate(joined.index) if i % subsample == 0
        and '_' not in label and get_uri_language(label) in CORE_LANGUAGES
    ])

    # Mean-center and L_2-normalize the data, to prevent artifacts
    # in dimensionality reduction.
    adjusted = joined.loc[filtered_labels]
    adjusted -= joined.mean(0)
    normalize(adjusted.values, norm='l2', copy=False)

    # The SVD of this normalized matrix will give us its projection into
    # a lower-dimensional space (`projected`), as well as the operator that
    # performs that projection (`projection`) and the relative weights of the
    # columns (`eigenvalues`).
    projected, eigenvalues, projection = dataframe_svd_projection(adjusted, k)

    # We don't actually need this smaller matrix or its projection anymore;
    # what we learned is how to project _any_ matrix into this space.
    del adjusted
    del projected

    # Project the original `joined` matrix into this space using the
    # `projection` operator.
    reprojected = joined.dot(projection)
    del joined

    # `projection` (V) is an orthogonal matrix, so when we multiply by it, we
    # get a `reprojected` that approximately preserves distances (U * Σ).
    #
    # But these distances reflect redundant features among the input matrices.
    # To mitigate this redundancy, and to match Levy and Goldberg's observation
    # that U * Σ ** (1/2) is a better SVD projection for word-representation
    # purposes than U * Σ, we divide by Σ ** (1/2).
    np.divide(reprojected.values, eigenvalues**.5, out=reprojected.values)
    normalize(reprojected.values, norm='l2', copy=False)

    # Return our unified vectors, and the projection that could map other
    # concatenated vectors into the same vector space.
    return reprojected, projection
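
A numpy-only sketch of the SVD relationships the comments above rely on: for a centered, normalized matrix X with truncated SVD X ≈ U Σ Vᵀ, multiplying X by V gives U Σ, and dividing columnwise by Σ ** (1/2) yields the U Σ ** (1/2) representation. dataframe_svd_projection itself is not reproduced; this only demonstrates the underlying linear algebra.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 8))
X -= X.mean(axis=0)                             # mean-center
X /= np.linalg.norm(X, axis=1, keepdims=True)   # L2-normalize rows

k = 4
U, sigma, Vt = np.linalg.svd(X, full_matrices=False)
U, sigma, Vt = U[:, :k], sigma[:k], Vt[:k]
projection = Vt.T                               # analogous to `projection` (V)

reprojected = X @ projection                    # equals U * Σ (up to rounding)
assert np.allclose(reprojected, U * sigma)

reprojected = reprojected / sigma ** 0.5        # rescale to U * Σ ** (1/2)
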
Example #12
0
def make_big_frame(frame, language):
    """
     Choose the vocabulary for the big frame and make the big frame. Eliminate the terms which
     are in languages other than the language specified.
    """
    vocabulary = [term for term in frame.index if get_uri_language(term) == language]
    big_frame = frame.ix[vocabulary]
    return big_frame
Example #13
0
def make_big_frame(frame, language):
    """
     Choose the vocabulary for the big frame and make the big frame. Eliminate the terms which
     are in languages other than the language specified.
    """
    vocabulary = [
        term for term in frame.index if get_uri_language(term) == language
    ]
    big_frame = frame.ix[vocabulary]
    return big_frame
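
A toy, self-contained run of the same filtering (get_uri_language is a stand-in; the frame is a small identity matrix with made-up term URIs):

import numpy as np
import pandas as pd

def get_uri_language(uri):          # stand-in
    return uri.lstrip('/').split('/')[1]

def make_big_frame(frame, language):
    vocabulary = [term for term in frame.index if get_uri_language(term) == language]
    return frame.loc[vocabulary]

frame = pd.DataFrame(np.eye(3), index=['/c/en/cat', '/c/fr/chat', '/c/en/dog'])
print(make_big_frame(frame, 'en').index.tolist())   # ['/c/en/cat', '/c/en/dog']
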
Example #14
0
def prepare_vocab_for_morphology(language, input, output):
    vocab_counts = defaultdict(int)
    for line in input:
        countstr, uri = line.strip().split(' ', 1)
        if get_uri_language(uri) == language:
            term = split_uri(uri)[2]
            if language in ATOMIC_SPACE_LANGUAGES:
                term += '_'
            vocab_counts[term] += int(countstr)

    for term, count in sorted(list(vocab_counts.items())):
        print(count, term, file=output)
Example #15
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {
        '@id': uri,
        'label': label
    }
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #16
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = True
        if domain in {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}:
            ld['site_available'] = False
        ld['path'] = urlparse(uri).path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
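
The sense-label logic can be traced by hand for a made-up WordNet-style URI such as '/c/en/lead/n/wn/metal':

pieces = ['c', 'en', 'lead', 'n', 'wn', 'metal']    # split_uri('/c/en/lead/n/wn/metal')
sense_label = pieces[3]                             # 'n' (part of speech)
if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
    sense_label += ', ' + pieces[-1]
print(sense_label)                                  # 'n, metal'
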
Example #17
0
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #18
0
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES
                    and get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start,
                    end=end,
                    weight=weight,
                    dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
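
For reference, each output line is a five-column tab-separated record. A hedged sketch of one rewritten '/r/Desires' assertion, with made-up weight and dataset values:

# Made-up values for illustration only.
start, end = '/c/en/good', '/c/en/rest'     # "/c/en/person /r/Desires X" rewritten
weight, dataset, rel = 2.0, '/d/conceptnet/4/en', '/r/Desires'
line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
    start=start, end=end, weight=weight, dataset=dataset, rel=rel
)
print(line)   # columns: start, end, weight, dataset, rel, separated by tabs
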
Example #19
0
def build_features_from_conceptnet_table(filename):
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = (
                line.strip().split('\t')
            )
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_uri_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1),
                                          relation), concept2))
                if get_uri_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2),
                                          relation), concept1))
            else:
                if get_uri_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1),
                                          relation), concept2))
                if get_uri_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2),
                                          relation), concept1))

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
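
A small sketch of the feature-string convention used above: symmetric relations get a '~' placeholder, while directed relations put '-' in the slot of the concept being predicted (uri_prefix is an identity stand-in here):

def uri_prefix(uri):                 # stand-in
    return uri

concept1, concept2, relation = '/c/en/cat', '/c/en/animal', '/r/IsA'

# Directed relation: one feature from each side of the edge.
left_feature = '{} {} -'.format(uri_prefix(concept1), relation)
right_feature = '- {} {}'.format(uri_prefix(concept2), relation)
print(left_feature)    # '/c/en/cat /r/IsA -'
print(right_feature)   # '- /c/en/animal /r/IsA'
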
Example #20
0
def make_adjacency_matrix(assoc_filename, embedding_vocab):
    """
    Build a sparse adjacency matrix for the ConceptNet graph presented 
    in the given assoc file, including all terms from the given embedding 
    vocabulary and removing all terms from connected components of the graph 
    that do not overlap that vocabulary.  
    
    Also builds an index giving all terms from the resulting joined 
    graph+embedding vocabulary in the order corresponding to the rows and 
    columns of the matrix.  Note that it is guaranteed that the terms from 
    the embedding vocabulary will preceed the remaining terms in that index, 
    and that among the remaining terms the terms in English will follow all 
    the others.
    
    Returns the matrix and index, and the number of new English terms.
    """
    # First eliminate all connected components of the graph that don't
    # overlap the vocabulary of the embedding; we can't do anything with
    # those terms.

    graph = ConceptNetAssociationGraphForPropagation.from_csv(
        assoc_filename, reject_negative_relations=False)
    component_labels = graph.find_components()

    # Get the labels of components that overlap the embedding vocabulary.
    good_component_labels = set(label
                                for term, label in component_labels.items()
                                if term in embedding_vocab)

    # Now get the concepts in those components.
    good_concepts = set(term for term, label in component_labels.items()
                        if label in good_component_labels)

    del component_labels, good_component_labels

    # Put terms from the embedding first, then terms from the good part
    # of the graph neither from the embedding nor in English, then terms
    # from the good part of the graph in English but not from the embedding.
    new_vocab = good_concepts - set(embedding_vocab)
    good_concepts = embedding_vocab.append(
        pd.Index([term for term in new_vocab if get_uri_language(term) != 'en']))
    n_good_concepts_not_new_en = len(good_concepts)
    good_concepts = good_concepts.append(
        pd.Index([term for term in new_vocab if get_uri_language(term) == 'en']))
    del new_vocab
    n_new_english = len(good_concepts) - n_good_concepts_not_new_en

    good_concepts_map = {term: i for i, term in enumerate(good_concepts)}

    # Convert the good part of the graph to an adjacency matrix representation.

    # Note: the edges added differ slightly from the way it is done in (e.g.)
    # build_from_conceptnet_table (in sparse_matrix_builder.py), in that we
    # do not add edges linking specific senses of terms to their more general
    # forms (as defined by uri_prefixes).  Currently no such specific senses
    # show up in the input to retrofitting (i.e. the output of
    # build_from_conceptnet_table), so it doesn't matter, but in the future
    # we may want to add such edges here as well.

    builder = SparseMatrixBuilder()
    for v, w in graph.edges:
        try:
            index0 = good_concepts_map[v]
            index1 = good_concepts_map[w]
            builder[index0, index1] = 1
            builder[index1, index0] = 1
        except KeyError:
            pass  # one of v, w wasn't good
    del graph

    adjacency_matrix = builder.tocsr(shape=(len(good_concepts),
                                            len(good_concepts)),
                                     dtype=np.int8)

    return adjacency_matrix, good_concepts, n_new_english
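
A toy sketch of the symmetric adjacency-matrix construction at the end, using scipy.sparse directly in place of SparseMatrixBuilder (assumed here to wrap a similar COO-to-CSR conversion):

import numpy as np
from scipy import sparse

good_concepts = ['/c/en/cat', '/c/en/dog', '/c/en/pet']
good_concepts_map = {term: i for i, term in enumerate(good_concepts)}
edges = [('/c/en/cat', '/c/en/pet'), ('/c/en/dog', '/c/en/pet')]

rows, cols = [], []
for v, w in edges:
    i, j = good_concepts_map[v], good_concepts_map[w]
    rows.extend([i, j])        # add both directions to keep the matrix symmetric
    cols.extend([j, i])

adjacency_matrix = sparse.coo_matrix(
    (np.ones(len(rows), dtype=np.int8), (rows, cols)),
    shape=(len(good_concepts), len(good_concepts)),
).tocsr()
print(adjacency_matrix.toarray())
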
Example #21
0
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    The relation is mostly ignored, except:

    - An assertion that means "People want X" in English or Chinese is converted to
      an association between X and "good"
    - An assertion that "People don't want X" is converted to an association
      between X and "bad"

    The result can be used to predict word associations from ConceptNet by
    applying dimensionality reduction, as in the `assoc_space` package.

    FIXME: the above is out of date, we use conceptnet5.vectors now

    The relation is mostly ignored because we have not yet found a good way to
    take the relation into account in dimensionality reduction.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (get_uri_language(start_uri) in COMMON_LANGUAGES
                    and get_uri_language(end_uri) in COMMON_LANGUAGES):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            pairs = []
            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf')
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start,
                    end=end,
                    weight=weight,
                    dataset=dataset,
                    rel=rel)
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)
Example #22
0
def merge_intersect(frame_filenames, subsample=20, k=300):
    """
    Combine the vector knowledge contained in the frames over the vocabulary
    that they agree on, and use dimensionality reduction to mitigate the
    redundancy of learning the same thing multiple ways.

    If their vocabularies result from retrofitting, then the resulting
    vocabulary will be the vocabulary of the retrofit knowledge graph,
    plus any other terms that happen to be in all of the frames.
    """
    # Find the intersected vocabulary of the frames, and concatenate their
    # vectors over that vocabulary.
    joined = concat_intersect(frame_filenames)

    # Find a subset of the labels that we'll use for calculating the
    # dimensionality-reduced version. The labels we particularly care about
    # are single words in our CORE_LANGUAGES. Even those are too numerous,
    # so we take an arbitrary 1/n sample of them, where n is given by the
    # `subsample` parameter.
    filtered_labels = pd.Series(
        [
            label
            for (i, label) in enumerate(joined.index)
            if i % subsample == 0
            and '_' not in label
            and get_uri_language(label) in CORE_LANGUAGES
        ]
    )

    # Mean-center and L_2-normalize the data, to prevent artifacts
    # in dimensionality reduction.
    adjusted = joined.loc[filtered_labels]
    adjusted -= joined.mean(0)
    normalize(adjusted.values, norm='l2', copy=False)

    # The SVD of this normalized matrix will give us its projection into
    # a lower-dimensional space (`projected`), as well as the operator that
    # performs that projection (`projection`) and the relative weights of the
    # columns (`eigenvalues`).
    projected, eigenvalues, projection = dataframe_svd_projection(adjusted, k)

    # We don't actually need this smaller matrix or its projection anymore;
    # what we learned is how to project _any_ matrix into this space.
    del adjusted
    del projected

    # Project the original `joined` matrix into this space using the
    # `projection` operator.
    reprojected = joined.dot(projection)
    del joined

    # `projection` (V) is an orthogonal matrix, so when we multiply by it, we
    # get a `reprojected` that approximately preserves distances (U * Σ).
    #
    # But these distances reflect redundant features among the input matrices.
    # To mitigate this redundancy, and to match Levy and Goldberg's observation
    # that U * Σ ** (1/2) is a better SVD projection for word-representation
    # purposes than U * Σ, we divide by Σ ** (1/2).
    np.divide(reprojected.values, eigenvalues ** .5, out=reprojected.values)
    normalize(reprojected.values, norm='l2', copy=False)

    # Return our unified vectors, and the projection that could map other
    # concatenated vectors into the same vector space.
    return reprojected, projection
Example #23
0
def make_adjacency_matrix(assoc_filename, embedding_vocab):
    """
    Build a sparse adjacency matrix for the ConceptNet graph presented
    in the given assoc file, including all terms from the given embedding
    vocabulary and removing all terms from connected components of the graph
    that do not overlap that vocabulary.

    Also builds an index giving all terms from the resulting joined
    graph+embedding vocabulary in the order corresponding to the rows and
    columns of the matrix.  Note that it is guaranteed that the terms from
    the embedding vocabulary will precede the remaining terms in that index,
    and that among the remaining terms the terms in English will follow all
    the others.

    Returns the matrix and index, and the number of new English terms.
    """
    # First eliminate all connected components of the graph that don't
    # overlap the vocabulary of the embedding; we can't do anything with
    # those terms.

    graph = ConceptNetAssociationGraphForPropagation.from_csv(
        assoc_filename, reject_negative_relations=False
    )
    component_labels = graph.find_components()

    # Get the labels of components that overlap the embedding vocabulary.
    good_component_labels = set(
        label for term, label in component_labels.items() if term in embedding_vocab
    )

    # Now get the concepts in those components.
    good_concepts = set(
        term
        for term, label in component_labels.items()
        if label in good_component_labels
    )

    del component_labels, good_component_labels

    # Put terms from the embedding first, then terms from the good part
    # of the graph neither from the embedding nor in English, then terms
    # from the good part of the graph in English but not from the embedding.
    #
    # (In the corner case where either of these additional sets of terms is
    # empty, constructing a pandas index from a generator expression rather
    # than a list comprehension would fail.)
    new_vocab = good_concepts - set(embedding_vocab)
    good_concepts = embedding_vocab.append(
        pd.Index([term for term in new_vocab if get_uri_language(term) != 'en'])
    )
    n_good_concepts_not_new_en = len(good_concepts)
    good_concepts = good_concepts.append(
        pd.Index([term for term in new_vocab if get_uri_language(term) == 'en'])
    )
    del new_vocab
    n_new_english = len(good_concepts) - n_good_concepts_not_new_en

    good_concepts_map = {term: i for i, term in enumerate(good_concepts)}

    # Convert the good part of the graph to an adjacency matrix representation.

    # Note: the edges added differ slightly from the way it is done in (e.g.)
    # build_from_conceptnet_table (in sparse_matrix_builder.py), in that we
    # do not add edges linking specific senses of terms to their more general
    # forms (as defined by uri_prefixes).  Currently no such specific senses
    # show up in the input to retrofitting (i.e. the output of
    # build_from_conceptnet_table), so it doesn't matter, but in the future
    # we may want to add such edges here as well.

    builder = SparseMatrixBuilder()
    for v, w in graph.edges:
        try:
            index0 = good_concepts_map[v]
            index1 = good_concepts_map[w]
            builder[index0, index1] = 1
        except KeyError:
            pass  # one of v, w wasn't good
    del graph

    adjacency_matrix = builder.tocsr(
        shape=(len(good_concepts), len(good_concepts)), dtype=np.int8
    )

    return adjacency_matrix, good_concepts, n_new_english
Example #24
0
def msgpack_to_assoc(input_filename, output_filename):
    """
    Convert a msgpack stream to a tab-separated "CSV" of concept-to-concept
    associations.

    As a special case, we convert some "Desires" and "NotDesires" relations
    to "HasProperty" relations, so that:

    - An assertion that means "People want X" in English or Chinese is converted
      to an association meaning "X is good"
    - An assertion that "People don't want X" is converted to an association
      meaning "X is bad"

    The result is used to build machine-learning models that recognize
    semantic similarities between words, and particularly the ConceptNet
    Numberbatch embedding space.
    """
    with open(output_filename, 'w', encoding='utf-8') as out_stream:
        weight_by_dataset = defaultdict(float)
        count_by_dataset = defaultdict(int)
        prefixed = set()
        for info in read_msgpack_stream(input_filename):
            start_uri = info['start']
            end_uri = info['end']
            if not (
                get_uri_language(start_uri) in COMMON_LANGUAGES
                and get_uri_language(end_uri) in COMMON_LANGUAGES
            ):
                continue
            rel = info['rel']
            weight = info['weight']
            dataset = info['dataset']

            for uri in (start_uri, end_uri):
                pieces = split_uri(uri)
                if len(pieces) > 3 and (uri, dataset) not in prefixed:
                    prefix = join_uri(*pieces[:3])
                    prefixed.add((uri, dataset))
                    line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                        start=uri,
                        end=prefix,
                        weight=1.,
                        dataset=dataset,
                        rel='/r/SenseOf',
                    )
                    weight_by_dataset[dataset] += 1.
                    count_by_dataset[dataset] += 1
                    print(line, file=out_stream)

            if start_uri == '/c/en/person' or start_uri == '/c/en/people':
                if rel == '/r/Desires':
                    pairs = [('/c/en/good', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/en/bad', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            elif start_uri == '/c/zh/人':
                if rel == '/r/Desires':
                    pairs = [('/c/zh/良好', end_uri)]
                elif rel == '/r/NotDesires':
                    pairs = [('/c/zh/不良', end_uri)]
                else:
                    pairs = [(start_uri, end_uri)]
            else:
                pairs = [(start_uri, end_uri)]

            for (start, end) in pairs:
                line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format(
                    start=start, end=end, weight=weight, dataset=dataset, rel=rel
                )
                weight_by_dataset[dataset] += weight
                count_by_dataset[dataset] += 1
                print(line, file=out_stream)

        avg_weight_by_dataset = {
            dataset: weight_by_dataset[dataset] / count_by_dataset[dataset]
            for dataset in count_by_dataset
        }
        print("Average weights:")
        print(avg_weight_by_dataset)