Example #1
def merge_intersect(frames, subsample=20, k=300):
    """
    Combine the vector knowledge contained in `frames` over the vocabulary
    that they agree on, and use dimensionality reduction to mitigate the
    redundancy of learning the same thing multiple ways.

    If their vocabularies result from retrofitting, then the resulting
    vocabulary will be the vocabulary of the retrofit knowledge graph,
    plus any other terms that happen to be in all of the frames.
    """
    # Find the intersected vocabulary of the frames, and concatenate their
    # vectors over that vocabulary.
    joined = concat_intersect(frames)

    # Find a subset of the labels that we'll use for calculating the
    # dimensionality-reduced version. The labels we particularly care about
    # are single words in our CORE_LANGUAGES. Even those are too numerous,
    # so we take an arbitrary 1/n sample of them, where n is given by the
    # `subsample` parameter.
    filtered_labels = pd.Series([
        label for (i, label) in enumerate(joined.index)
        if i % subsample == 0 and '_' not in label
        and get_language(label) in CORE_LANGUAGES
    ])

    # Mean-center and L_2-normalize the data, to prevent artifacts
    # in dimensionality reduction.
    adjusted = joined.loc[filtered_labels]
    adjusted -= joined.mean(0)
    normalize(adjusted.values, norm='l2', copy=False)

    # The SVD of this normalized matrix will give us its projection into
    # a lower-dimensional space (`projected`), as well as the operator that
    # performs that projection (`projection`) and the relative weights of the
    # columns (`eigenvalues`).
    projected, eigenvalues, projection = dataframe_svd_projection(adjusted, k)

    # We don't actually need this smaller matrix or its projection anymore;
    # what we learned is how to project _any_ matrix into this space.
    del adjusted
    del projected

    # Project the original `joined` matrix into this space using the
    # `projection` operator.
    reprojected = joined.dot(projection)
    del joined

    # `projection` (V) is an orthogonal matrix, so when we multiply by it, we
    # get a `reprojected` that approximately preserves distances (U * Σ).
    #
    # But these distances reflect redundant features among the input matrices.
    # To mitigate this redundancy, and to match Levy and Goldberg's observation
    # that U * Σ ** (1/2) is a better SVD projection for word-representation
    # purposes than U * Σ, we divide by Σ ** (1/2).
    np.divide(reprojected.values, eigenvalues ** .5, out=reprojected.values)
    normalize(reprojected.values, norm='l2', copy=False)

    # Return our unified vectors, and the projection that could map other
    # concatenated vectors into the same vector space.
    return reprojected, projection
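# A minimal usage sketch on toy data (hypothetical frames; the real inputs are
# large word-vector DataFrames indexed by ConceptNet URIs, and this assumes the
# module's helpers such as concat_intersect and dataframe_svd_projection are
# importable):
#
#   import numpy as np
#   import pandas as pd
#
#   toy_index = ['/c/en/cat', '/c/en/dog', '/c/en/fish']
#   frame_a = pd.DataFrame(np.random.randn(3, 4), index=toy_index)
#   frame_b = pd.DataFrame(np.random.randn(3, 5), index=toy_index)
#   merged, projection = merge_intersect([frame_a, frame_b], subsample=1, k=2)
#
# `merged` has one row per shared label and k columns; `projection` can map any
# other matrix with the same concatenated columns into the same reduced space.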
Example #2
    def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
        """
        Given a list of weighted terms as (term, weight) tuples, add terms that
        are one step away in ConceptNet at a lower weight, terms in English that share the
        surface form with these terms, and the terms which share prefix with these terms,
        if the terms are OOV.

        This helps increase the recall power of the vector space, because it
        means you can find terms that are too infrequent to have their own
        vector by looking up their neighbors, etc.

        This forms a reasonable approximation of the vector an infrequent term would have anyway.
        """
        self.load()
        expanded = terms[:]
        for term, weight in terms:
            if include_neighbors and term not in self.frame.index and self.finder is not None:
                for edge in self.finder.lookup(term, limit=limit_per_term):
                    if field_match(edge['start']['term'],
                                   term) and not field_match(
                                       edge['end']['term'], term):
                        neighbor = edge['end']['term']
                    elif field_match(edge['end']['term'],
                                     term) and not field_match(
                                         edge['start']['term'], term):
                        neighbor = edge['start']['term']
                    else:
                        continue
                    # Scale the neighbor's weight by the edge weight (capped
                    # at 10) and by 0.01, so a neighbor contributes at most
                    # 10% of the original term's weight.
                    neighbor_weight = weight * min(10, edge['weight']) * 0.01
                    expanded.append((neighbor, neighbor_weight))

                prefix_weight = 0.01
                if get_language(term) != 'en':
                    englishified = '/c/en/' + split_uri(term)[2]
                    expanded.append((englishified, prefix_weight))

                while term:
                    # Skip excessively general lookups, for either an entire
                    # language, or all terms starting with a single
                    # non-ideographic letter
                    if term.endswith('/') or (term[-2] == '/'
                                              and term[-1] < chr(0x3000)):
                        break
                    prefixed = self.terms_with_prefix(term)
                    if prefixed:
                        n_prefixed = len(prefixed)
                        for prefixed_term in prefixed:
                            expanded.append(
                                (prefixed_term, prefix_weight / n_prefixed))
                        break
                    term = term[:-1]

        total_weight = sum(abs(weight) for term, weight in expanded)
        if total_weight == 0:
            return []
        else:
            return [(uri_prefix(term), weight / total_weight)
                    for (term, weight) in expanded]
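# A minimal usage sketch (hypothetical filename; assumes a VectorSpaceWrapper
# whose frame and ConceptNet edge finder have already been loaded):
#
#   wrap = VectorSpaceWrapper('path/to/mini.h5')
#   expanded = wrap.expand_terms([('/c/en/quokka', 1.0)])
#
# `expanded` is again a list of (term, weight) tuples: the original term plus
# any ConceptNet neighbors, English fallbacks, and prefix matches it picked up,
# with the absolute weights normalized to sum to 1.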
Example #3
def make_big_frame(frame, language):
    """
    Choose the vocabulary for the big frame and build the big frame,
    keeping only the terms that are in the specified language.
    """
    vocabulary = [term for term in frame.index if get_language(term) == language]
    big_frame = frame.loc[vocabulary]
    return big_frame
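# A minimal sketch (the real frame is indexed by ConceptNet URIs such as
# '/c/en/example' and '/c/fr/exemple'):
#
#   english_frame = make_big_frame(frame, 'en')
#
# `english_frame` keeps only the rows whose URI is in the 'en' language.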
Example #4
def merge_intersect(frames, subsample=20, vocab_cutoff=200000, k=300):
    label_intersection = set(frames[0].index)
    for frame in frames[1:]:
        label_intersection &= set(frame.index)
    filtered_labels = pd.Series([
        label for label in sorted(label_intersection)
        if '_' not in label and get_language(label) in CORE_LANGUAGES
    ])
    filtered_frames = [frame.loc[filtered_labels].astype('f') for frame in frames]
    joined = pd.concat(filtered_frames, join='inner', axis=1, ignore_index=True)
    joined = joined.fillna(0.)
    adjusted = l2_normalize_rows(joined.iloc[::subsample] - joined.mean(0))

    # Search the frames for significant terms that we've missed.
    # Significant terms are those that appear in 3 different vocabularies,
    # or in 2 different vocabularies and in the first `vocab_cutoff` rows of
    # one of them.

    vocabulary = frames[0].index
    for frame in frames[1:]:
        vocabulary = vocabulary.union(frame.index)
    term_scores = pd.Series(0., index=vocabulary)
    for frame in frames[1:]:
        term_scores.loc[frame.index] += 1
        term_scores.loc[frame.index[:vocab_cutoff]] += 1
    new_terms = vocabulary[term_scores >= 3].difference(joined.index)
    new_vecs = [frame.reindex(new_terms) for frame in frames]

    joined2 = pd.concat([
        joined,
        pd.concat(new_vecs, join='outer', axis=1,
                  ignore_index=True).astype('f').fillna(0.)
    ])

    del new_vecs
    projected, eigenvalues, projection = dataframe_svd_projection(adjusted, k)
    del adjusted
    del projected

    reprojected = joined2.dot(projection)
    reprojected /= (eigenvalues**.5)
    del joined2
    reprojected = l2_normalize_rows(reprojected, offset=1e-6)
    reprojected.sort_index(inplace=True)
    return reprojected, projection
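# A toy illustration of the significant-term scoring above (hypothetical
# vocabularies; only the counting logic is shown). A term gets +1 for each
# vocabulary after the first that contains it, plus +1 when it falls within
# that vocabulary's first `vocab_cutoff` rows; terms scoring >= 3 that aren't
# already in `joined` get appended:
#
#   import pandas as pd
#
#   indexes = [pd.Index(['a', 'b']), pd.Index(['a', 'b', 'c']),
#              pd.Index(['b', 'c', 'd'])]
#   vocabulary = indexes[0]
#   for idx in indexes[1:]:
#       vocabulary = vocabulary.union(idx)
#   scores = pd.Series(0., index=vocabulary)
#   for idx in indexes[1:]:
#       scores.loc[idx] += 1
#       scores.loc[idx[:2]] += 1   # vocab_cutoff = 2 in this toy example
#   # scores: a -> 2.0, b -> 4.0, c -> 3.0, d -> 1.0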
Example #5
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        ld['language'] = get_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
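# A sketch of ld_node's output (a plausible example; the exact label text
# comes from uri_to_label):
#
#   ld_node('/c/en/example/n')
#   # -> {'@id': '/c/en/example/n', 'label': 'example', 'language': 'en',
#   #     'sense_label': 'n', 'term': '/c/en/example'}
#
#   ld_node('http://dbpedia.org/resource/Example')
#   # -> a dict whose 'site' is 'dbpedia.org' and whose 'term' is the URL itself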
def build_features_from_conceptnet_table(filename):
    """
    Read a tab-separated table whose rows are (concept1, concept2, value,
    dataset, relation), and build a sparse concept-by-feature matrix. A
    "feature" is a string pairing a concept with a relation, where '~' marks
    a symmetric relation and '-' marks which side of an asymmetric relation
    the concept fills.

    Returns the matrix in CSR form, along with Pandas indexes giving the row
    (concept) and column (feature) labels.
    """
    mat = SparseMatrixBuilder()

    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip(
            ).split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)
            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1),
                                          relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2),
                                          relation), concept1))
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1),
                                          relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2),
                                          relation), concept1))

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
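# A sketch of the expected input and the features it produces (hypothetical
# file contents; the real tables are built earlier in the pipeline). Each
# input line is tab-separated:
#
#   /c/en/dog <tab> /c/en/bark <tab> 1.0 <tab> /d/conceptnet/4/en <tab> /r/CapableOf
#
# For an asymmetric relation like this one, the generated features would be
# '/c/en/dog /r/CapableOf -' (dog fills the start of the edge) and
# '- /r/CapableOf /c/en/bark' (bark fills the end), each paired with the
# concept at the other end of the edge.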
def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=0):
    """
    Retrofitting is a process of combining information from a machine-learned
    space of term vectors with further structured information about those
    terms. It was originally presented in this 2015 NAACL paper by Manaal
    Faruqui, Jesse Dodge, Sujay Jauhar, Chris Dyer, Eduard Hovy, and Noah
    Smith, "Retrofitting Word Vectors to Semantic Lexicons":

        https://www.cs.cmu.edu/~hovy/papers/15HLT-retrofitting-word-vectors.pdf

    This function implements a variant that I've been calling "wide
    retrofitting", which extends the process to learn vectors for terms that
    were outside the original space.

    `row_labels` is the list of terms that we want to have vectors for.

    `dense_frame` is a DataFrame assigning vectors to some of these terms.

    `sparse_csr` is a SciPy sparse square matrix, whose rows and columns are
    implicitly labeled with `row_labels`. The entries of this matrix are
    positive for terms that we know are related from our structured data.
    (This is an awkward form of input, but unfortunately there is no good
    way to represent sparse labeled data in Pandas.)

    `sharded_retrofit` is responsible for building `row_labels` and `sparse_csr`
    appropriately.
    """
    # Initialize a DataFrame with rows that we know
    retroframe = pd.DataFrame(
        index=row_labels, columns=dense_frame.columns, dtype='f'
    )
    retroframe.update(dense_frame)
    # orig_weights = 1 for known vectors, 0 for unknown vectors
    orig_weights = 1 - retroframe.iloc[:, 0].isnull().astype(int)
    weight_array = orig_weights.values[:, np.newaxis].astype('f')
    orig_vecs = retroframe.fillna(0).values

    # Divide up the labels by what language they're in -- we'll use this to
    # subtract the mean of each language, reducing clumping by language and
    # improving multilingual alignment.
    rows_by_language = defaultdict(list)
    for i, label in enumerate(row_labels):
        lang = get_language(label)
        rows_by_language[lang].append(i)
    all_languages = sorted(rows_by_language)
    row_groups = [rows_by_language[lang] for lang in all_languages]

    # Subtract the mean so that vectors don't just clump around common
    # hypernyms
    for row_group in row_groups:
        orig_vecs[row_group] -= orig_vecs[row_group].mean(0)

    # Delete the frame we built; we won't need its indices again until the end
    del retroframe

    vecs = orig_vecs
    for iteration in range(iterations):
        if verbosity >= 1:
            print('Retrofitting: Iteration %s of %s' % (iteration+1, iterations))

        vecs = sparse_csr.dot(vecs)
        # Re-center the propagated vectors within each language, as above
        for row_group in row_groups:
            vecs[row_group] -= vecs[row_group].mean(0)

        # use sklearn's normalize, because it normalizes in place and
        # leaves zero-rows at 0
        normalize(vecs, norm='l2', copy=False)

        # Average each row with its original vector: known rows (weight 1)
        # become (propagated + original) / 2, while unknown rows (weight 0)
        # keep the propagated value, divided by 1.
        vecs += orig_vecs
        vecs /= (weight_array + 1.)

    retroframe = pd.DataFrame(data=vecs, index=row_labels, columns=dense_frame.columns)
    return retroframe
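# A minimal usage sketch on toy data (hypothetical terms and edges; the real
# inputs are produced by sharded_retrofit). The adjacency matrix says the two
# terms are related; only '/c/en/known' starts with a vector, and retrofitting
# propagates a vector to '/c/en/new':
#
#   import numpy as np
#   import pandas as pd
#   from scipy import sparse
#
#   row_labels = ['/c/en/known', '/c/en/new']
#   dense_frame = pd.DataFrame([[0.1, 0.9]], index=['/c/en/known'])
#   adjacency = sparse.csr_matrix(np.array([[0., 1.], [1., 0.]]))
#   result = retrofit(row_labels, dense_frame, adjacency, iterations=5)
#
# `result` is a DataFrame with a row for both terms; the row for '/c/en/new'
# is inferred from its neighbor rather than left at zero.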