Example #1
def truncate(
    space,
    output,
    size=('', 2000, 'New vector length.'),
    nvaa=('', False, 'Use only nouns, verbs, adjectives and adverbs as features.'),
    tagset=('', '', 'Tagset'),
):
    assert space.matrix.shape[1] >= size

    features = space.column_labels
    if nvaa:
        if tagset == 'bnc':
            features = features[features.index.get_level_values('tag').isin(['SUBST', 'VERB', 'ADJ', 'ADV'])]
        else:
            features = features[features.index.get_level_values('tag').isin(['N', 'V', 'J', 'R'])]

    # It's important to sort by id to make sure that the most frequent features are selected.
    features = features.sort('id').head(size)
    matrix = sparse.csc_matrix(space.matrix)[:, features['id']]

    assert len(features) == size

    # Reindex features
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )

    new_space.write(output)
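The core of `truncate` is selecting the columns with the lowest feature ids (assumed to be the most frequent features) from a sparse matrix. A minimal sketch of that column selection, using hypothetical toy data rather than a real space:

import numpy as np
from scipy import sparse

size = 3

# A toy 4x5 co-occurrence matrix; CSC format makes column slicing cheap,
# which is why the example converts to csc_matrix before selecting features.
matrix = sparse.csc_matrix(np.arange(20).reshape(4, 5))

# Keep the first `size` columns (a stand-in for the `size` lowest feature ids).
truncated = matrix[:, :size]

assert truncated.shape == (4, size)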
Example #2
def truncate(
        space,
        output,
        size=('', 2000, 'New vector length.'),
        nvaa=('', False,
              'Use only nouns, verbs, adjectives and adverbs as features.'),
        tagset=('', '', 'Tagset'),
):
    assert space.matrix.shape[1] >= size

    features = space.column_labels
    if nvaa:
        if tagset == 'bnc':
            features = features[features.index.get_level_values('tag').isin(
                ['SUBST', 'VERB', 'ADJ', 'ADV'])]
        else:
            features = features[features.index.get_level_values('tag').isin(
                ['N', 'V', 'J', 'R'])]

    # It's important to sort by id to make sure that the most frequent features are selected.
    features = features.sort('id').head(size)
    matrix = sparse.csc_matrix(space.matrix)[:, features['id']]

    assert len(features) == size

    # Reindex features
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )

    new_space.write(output)
Example #3
def to_space(
    word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
    output=('o', 'space.h5', 'The output space file.'),
    word2vec_format=('', False, 'Word2vec_format.'),
    pos_separator=('', '', 'POS separator.'),
):
    """Read a word2vec file and save it as a space file."""
    from gensim.models import Word2Vec

    if word2vec_format:
        model = Word2Vec.load_word2vec_format(word2vec, binary=True)
    else:
        model = Word2Vec.load(word2vec)

    if not pos_separator:
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': model.index2word,
                'tag': '_',
            },
        )
    else:
        tokens = [s.rsplit(pos_separator, maxsplit=1) for s in model.index2word]
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': [n for n, _ in tokens],
                'tag': [t for _, t in tokens],
            },
        )

    targets.set_index(['ngram', 'tag'], inplace=True)

    context = pd.DataFrame(
        {
            'id': range(model.syn0.shape[1]),
            'ngram': range(model.syn0.shape[1]),
            'tag': '_'
        },
    )
    context.set_index(['ngram', 'tag'], inplace=True)

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )

    space.write(output)
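The example above relies on the pre-1.0 gensim API (`Word2Vec.load_word2vec_format`, `model.index2word`, `model.syn0`). As a rough sketch, assuming a recent gensim (4.x), the same vectors would be loaded through `KeyedVectors` instead:

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

words = kv.index_to_key   # replaces model.index2word
vectors = kv.vectors      # replaces model.syn0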
Example #4
def to_space(
        word2vec=('', 'GoogleNews-vectors-negative300.bin.gz',
                  'Path to word2vec vectors.'),
        output=('o', 'space.h5', 'The output space file.'),
):
    """Read a word2vec file and save it as a space file."""
    from gensim.models import Word2Vec

    model = Word2Vec.load_word2vec_format(word2vec, binary=True)

    targets = pd.DataFrame({'id': range(len(model.index2word))},
                           index=model.index2word)
    targets.index.name = 'ngram'

    context = pd.DataFrame({'id': range(model.syn0.shape[1])})
    context.index.name = 'ngram'

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )

    space.write(output)
Example #5
def ittf(
        space,
        output,
        raw_space=('', '', 'Space with feature co-occurrence counts.'),
        times=('', ('n', 'logn'),
               'Multiply the resulting values by n or logn.'),
):
    raw_space = read_space_from_file(raw_space)

    feature_cardinality = np.array(
        [v.nnz for v in raw_space.get_target_rows(*space.column_labels.index)])

    n = space.matrix.todense()

    ittf = np.log(feature_cardinality) - np.log(n + 1)

    if times == 'n':
        matrix = np.multiply(n, ittf)
    elif times == 'logn':
        matrix = np.multiply(np.log(n + 1), ittf)

    Space(matrix, space.row_labels, space.column_labels).write(output)
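As a toy illustration of the ITTF weighting above, with a small dense matrix standing in for the space and the per-column non-zero count standing in for the feature cardinality taken from the raw space (hypothetical numbers):

import numpy as np

n = np.array([[3., 0., 1.],
              [1., 2., 0.]])

# How many targets each feature co-occurs with (a stand-in for the nnz counts
# read from the raw co-occurrence space).
feature_cardinality = (n > 0).sum(axis=0)

ittf = np.log(feature_cardinality) - np.log(n + 1)

weighted_by_n = np.multiply(n, ittf)                  # --times n
weighted_by_logn = np.multiply(np.log(n + 1), ittf)   # --times logn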
Example #6
def transitive_verb_space(
    space_file,
    transitive_verb_arguments,
    execnet_hub,
    output=('o', 'space.h5', 'Output verb vector space.'),
    chunk_size=('', 100, 'The length of a chunk.'),
):

    data_to_send = (
        'data',
        pickle.dumps(
            {
                'space_file': space_file,
            },
        )
    )

    def init(channel):
        channel.send(data_to_send)

    groups = transitive_verb_arguments.groupby(
        # ['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag'],
        ['verb_stem', 'verb_tag']
    )

    groups = Bar(
        'Subject object Kronecker products',
        max=len(groups),
        suffix='%(index)d/%(max)d',
    ).iter(
        pickle.dumps(g) for g in groups
    )

    results = execnet_hub.run(
        remote_func=verb_space_builder,
        iterable=groups,
        init_func=init,
        verbose=False,
    )

    result = next(results)

    for r in results:
        for k, v in r.items():
            if k in result:
                result[k] += v
            else:
                result[k] = v

    result = list(result.items())

    verb_labels = [l for l, _ in result]
    verb_vectors = [v for _, v in result]

    del result

    matrix = sparse.vstack(verb_vectors)
    del verb_vectors

    row_labels = pd.DataFrame(
        {
            'ngram': [l[0] for l in verb_labels],
            'tag': [l[1] for l in verb_labels],
            'id': [i for i, _ in enumerate(verb_labels)],
        }
    ).set_index(['ngram', 'tag'])

    column_labels = pd.DataFrame(
        {
            'ngram': list(range(matrix.shape[1])),
            'tag': list(range(matrix.shape[1])),
            'id': list(range(matrix.shape[1])),
        }
    ).set_index(['ngram', 'tag'])

    space = Space(
        matrix,
        row_labels=row_labels,
        column_labels=column_labels,
    )

    space.write(output)
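The reduction loop above merges the per-worker dictionaries returned by `verb_space_builder` by summing values that share a key. A minimal sketch of that merge pattern with plain numbers instead of sparse vectors (hypothetical data):

partial_results = [{'give': 1, 'take': 2}, {'give': 3, 'eat': 4}]

merged = {}
for partial in partial_results:
    for key, value in partial.items():
        if key in merged:
            merged[key] += value
        else:
            merged[key] = value

assert merged == {'give': 4, 'take': 2, 'eat': 4}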
Example #7
def pmi(
        space,
        output,
        dictionary,
        column_dictionary=('', '', 'The frequencies of column labels.'),
        column_dictionary_key=('', 'dictionary',
                               'An identifier for the group in the store.'),
        no_log=('', False, 'Do not take logarithm of the probability ratio.'),
        remove_missing=('', False,
                        'Remove items that are not in the dictionary.'),
        conditional_probability=('', False, 'Compute only P(c|t).'),
        keep_negative_values=('', False, 'Keep negative values.'),
        times=('', ('', 'n', 'logn'),
               'Multiply the resulting values by n or log(n+1).'),
        window_size=('', 10, 'The size of the window.'),
):
    """
    Weight elements using the positive PMI measure [3]: max(0, log(P(c|t) / P(c))).

    [1] and [2] use a measure similar to PMI, but without the log, so it's just
    P(c|t) / P(c), which is sometimes called the likelihood ratio.

    `--dictionary` provides word frequencies for rows. In case columns are
    labelled differently, provide `--column-dictionary`.

    `--keep-negative-values` keeps negative values but replaces negative
    infinity with 0. This is equivalent to replacing P(c, t) with just P(c) when
    P(c, t) is 0.

    [1] Mitchell, Jeff, and Mirella Lapata. "Vector-based Models of Semantic
    Composition." ACL. 2008.

    [2] Grefenstette, Edward, and Mehrnoosh Sadrzadeh. "Experimental support for
    a categorical compositional distributional model of meaning." Proceedings
    of the Conference on Empirical Methods in Natural Language Processing.
    Association for Computational Linguistics, 2011.

    [3] http://en.wikipedia.org/wiki/Pointwise_mutual_information

    """
    def set_index(dictionary):
        dictionary.set_index(
            [c for c in dictionary.columns if c != 'count'],
            inplace=True,
        )

    set_index(dictionary)

    if column_dictionary:
        column_dictionary = pd.read_hdf(column_dictionary,
                                        key=column_dictionary_key)
        set_index(column_dictionary)
    else:
        column_dictionary = dictionary

    # These are the target frequency counts in the whole corpus, N(t)
    row_totals = dictionary.loc[space.row_labels.sort('id').index]['count']

    missing_rows = ~np.isfinite(row_totals)
    if missing_rows.any():
        if not remove_missing:
            raise ValueError('These rows are not finite!',
                             row_totals[missing_rows])
        else:
            logger.warning('Removing the following rows: %s',
                           row_totals[missing_rows])
            row_totals = row_totals[~missing_rows]

    row_totals = row_totals.values[:, np.newaxis]

    # These are the context probabilities in the whole corpus, P(c)
    column_totals = (column_dictionary.loc[space.column_labels.sort(
        'id').index].values.flatten() / dictionary['count'].sum())

    # Elements in the matrix are N(c, t): the co-occurrence counts
    n = space.matrix.astype(float).todense()

    if remove_missing:
        n = n[~missing_rows.values]

    # The elements in the matrix are P(c|t)
    matrix = n / row_totals / window_size

    max_row_sum = matrix.sum(axis=1).max()
    assert max_row_sum < 1.0 or np.isclose(max_row_sum, 1.0)

    if not conditional_probability:
        if not no_log:
            # The elements in the matrix are log(P(c|t) / P(c))
            matrix = np.log(matrix) - np.log(column_totals)
            if keep_negative_values:
                matrix[matrix == -np.inf] = -np.log(dictionary['count'].sum())
            else:
                matrix[matrix < 0] = 0.0
        else:
            # The elements in the matrix are P(c|t) / P(c)
            matrix /= column_totals

    if times == 'n':
        matrix = np.multiply(n, matrix)
    if times == 'logn':
        matrix = np.multiply(np.log(n + 1), matrix)

    Space(matrix, space.row_labels, space.column_labels).write(output)
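A minimal numpy sketch of the positive PMI weighting described in the docstring, max(0, log(P(c|t) / P(c))), on hypothetical toy counts (the window size is ignored for brevity):

import numpy as np

n = np.array([[10., 0., 2.],
              [3., 5., 1.]])                # N(c, t) co-occurrence counts
row_totals = np.array([[100.], [50.]])      # N(t) from the dictionary
column_probs = np.array([0.1, 0.05, 0.02])  # P(c)

p_c_given_t = n / row_totals                # P(c|t)

with np.errstate(divide='ignore'):
    pmi = np.log(p_c_given_t) - np.log(column_probs)

pmi[pmi < 0] = 0.0                          # positive PMI: clip negatives and -inf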
Example #8
def pmi(
    space,
    output,
    dictionary,
    column_dictionary=('', '', 'The frequencies of column labels.'),
    column_dictionary_key=('', 'dictionary', 'An identifier for the group in the store.'),
    no_log=('', False, 'Do not take logarithm of the probability ratio.'),
    remove_missing=('', False, 'Remove items that are not in the dictionary.'),
    conditional_probability=('', False, 'Compute only P(c|t).'),
    keep_negative_values=('', False, 'Keep negative values.'),
    neg=('', 1.0, 'The K parameter for shifted PPMI.'),
    log_base=('', np.e, 'The logarithm base to use.'),
    times=('', ('', 'n', 'logn'), 'Multiply the resulting values by n or log(n+1).'),
    window_size=('', 10, 'The size of the window.'),
    cds=('', float('nan'), 'Context distribution smoothing coefficient.'),
    smoothing=('', ('minprob', 'chance', 'compress'), 'How to deal with unseen co-occurrence probability.'),
):
    """
    Weight elements using the positive PMI measure [3]: max(0, log(P(c|t) / P(c))).

    [1] and [2] use a measure similar to PMI, but without the log, so it's just
    P(c|t) / P(c), which is sometimes called the likelihood ratio.

    `--dictionary` provides word frequencies for rows. In case columns are
    labelled differently, provide `--column-dictionary`.

    [1] Mitchell, Jeff, and Mirella Lapata. "Vector-based Models of Semantic
    Composition." ACL. 2008.

    [2] Grefenstette, Edward, and Mehrnoosh Sadrzadeh. "Experimental support for
    a categorical compositional distributional model of meaning." Proceedings
    of the Conference on Empirical Methods in Natural Language Processing.
    Association for Computational Linguistics, 2011.

    [3] http://en.wikipedia.org/wiki/Pointwise_mutual_information

    """

    if log_base == np.e:
        log = np.log
        log1p = np.log1p
    else:
        def log(x, out=None):
            result = np.log(x, out)
            result /= np.log(log_base)

            return result

        def log1p(x, out=None):
            result = np.log1p(x, out)
            result /= np.log(log_base)

            return result

    def set_index(dictionary):
        dictionary.set_index(
            [c for c in dictionary.columns if c != 'count'],
            inplace=True,
        )

    set_index(dictionary)

    if column_dictionary:
        column_dictionary = pd.read_hdf(column_dictionary, key=column_dictionary_key)
        set_index(column_dictionary)
    else:
        column_dictionary = dictionary

    # These are the target frequency counts in the whole corpus, N(t)
    row_totals = dictionary.loc[space.row_labels.sort('id').index]['count']

    missing_rows = ~np.isfinite(row_totals)
    if missing_rows.any():
        if not remove_missing:
            raise ValueError('These rows are not finite!', row_totals[missing_rows])
        else:
            logger.warning('Non-finite rows: %s', row_totals[missing_rows])

    N = dictionary['count'].sum()

    row_totals[missing_rows] = 1
    row_totals = row_totals.values[:, np.newaxis] / N

    if np.isnan(cds):
        # Use dictionary for context total counts.
        column_totals = (
            column_dictionary.loc[space.column_labels.index].values.flatten() / N
        )
    else:
        # Use co-occurrence matrix for context co-occurrence counts.

        # Prepare for the Context Distribution Smoothing.
        smoothed_context_counts = np.array(space.matrix.sum(axis=0)).flatten() ** cds

        # These are the context probabilities in the whole corpus, P(c)
        column_totals = smoothed_context_counts / smoothed_context_counts.sum()

    # Elements in the matrix are N(c, t): the co-occurrence counts
    n = space.matrix.astype(float).todense()

    # The elements in the matrix are P(c, t)
    matrix = n / (N * window_size)

    matrix_sum = matrix.sum()
    assert matrix_sum < 1.0 or np.isclose(matrix_sum, 1.0)

    # # Check that P(c|t) <= 1.
    # max_row_sum = (matrix / (column_totals * row_totals)).sum(axis=1).max()
    # assert max_row_sum < 1.0 or np.isclose(max_row_sum, 1.0)

    if not conditional_probability:
        if not no_log:
            # PMI
            zero_counts = matrix == 0

            if smoothing == 'minprob':
                # Pretend that unseen pairs occurred once.
                matrix[zero_counts] = 1 / (N * window_size)

            if smoothing != 'compress':
                # The elements in the matrix are log(P(c, t))
                log(matrix, matrix)

                # log(P(c, t)) - (log(P(c)) + log(P(t)))
                matrix -= log(column_totals)
                matrix -= log(row_totals)
            else:
                matrix /= column_totals * row_totals
                matrix = log1p(matrix, matrix)

            if smoothing in ('chance', 'compress'):
                matrix[zero_counts] = 0

            if not keep_negative_values:
                # PPMI
                if smoothing == 'compress':
                    matrix -= log(2)

                if neg != 1.0:
                    matrix -= log(neg)

                matrix[matrix < 0] = 0.0

        else:
            # Ratio
            # The elements in the matrix are P(c,t) / ((P(c) * P(t)))
            matrix /= column_totals * row_totals
    else:
        # Conditional: P(c|t)
        matrix /= row_totals
        max_row_sum = (matrix).sum(axis=1).max()
        assert max_row_sum < 1.0 or np.isclose(max_row_sum, 1.0)

    if times == 'n':
        matrix = np.multiply(n, matrix)
    if times == 'logn':
        matrix = np.multiply(np.log(n + 1), matrix)

    Space(matrix, space.row_labels, space.column_labels).write(output)
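The `cds` branch above implements context distribution smoothing: raising context counts to a power below 1 flattens P(c), which reduces PMI's bias towards rare contexts. A sketch with hypothetical counts and the commonly used exponent 0.75:

import numpy as np

context_counts = np.array([1000., 10., 1.])
cds = 0.75

smoothed = context_counts ** cds
p_context = smoothed / smoothed.sum()   # smoothed P(c), as in the cds branch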
Example #9
def pmi(
        space,
        output,
        dictionary,
        column_dictionary=('', '', 'The frequencies of column labels.'),
        column_dictionary_key=('', 'dictionary',
                               'An identifier for the group in the store.'),
        no_log=('', False, 'Do not take logarithm of the probability ratio.'),
):
    """
    Weight elements using the positive PMI measure [3]: max(0, log(P(c|t) / P(c))).

    [1] and [2] use a measure similar to PMI, but without the log, so it's just
    P(c|t) / P(c).

    `--dictionary` provides word frequencies for rows. In case columns are
    labelled differently, provide `--column-dictionary`.

    [1] Mitchell, Jeff, and Mirella Lapata. "Vector-based Models of Semantic
    Composition." ACL. 2008.

    [2] Grefenstette, Edward, and Mehrnoosh Sadrzadeh. "Experimental support for
    a categorical compositional distributional model of meaning." Proceedings
    of the Conference on Empirical Methods in Natural Language Processing.
    Association for Computational Linguistics, 2011.

    [3] http://en.wikipedia.org/wiki/Pointwise_mutual_information

    """
    def set_index(dictionary):
        dictionary.set_index(
            [c for c in dictionary.columns if c != 'count'],
            inplace=True,
        )

    set_index(dictionary)

    if column_dictionary:
        column_dictionary = pd.read_hdf(column_dictionary,
                                        key=column_dictionary_key)
        set_index(column_dictionary)
    else:
        column_dictionary = dictionary

    # These are the target frequency counts in the whole corpus, N(t)
    row_totals = dictionary.loc[space.row_labels.index]['count']
    assert np.isfinite(row_totals.values).all()
    row_totals = row_totals.values[:, np.newaxis]

    # This is the total number of words in the corpus
    N = dictionary['count'].sum()
    # These are the context probabilities in the whole corpus, P(c)
    column_totals = column_dictionary.loc[
        space.column_labels.index].values.flatten() / N

    # Elements in the matrix are N(c, t): the co-occurrence counts
    matrix = space.matrix.astype(float).todense()

    # The elements in the matrix are P(c|t)
    matrix /= row_totals

    if not no_log:
        # The elements in the matrix are log(P(c|t) / P(c))
        new_matrix = np.log(matrix) - np.log(column_totals)
        new_matrix[new_matrix < 0] = 0.0
    else:
        # The elements in the matrix are P(c|t) / P(c)
        new_matrix = matrix / column_totals

    Space(new_matrix, space.row_labels, space.column_labels).write(output)