def truncate(
    space,
    output,
    size=('', 2000, 'New vector length.'),
    nvaa=('', False, 'Use only nouns, verbs, adjectives and adverbs as features.'),
    tagset=('', '', 'Tagset'),
):
    assert space.matrix.shape[1] >= size

    features = space.column_labels
    if nvaa:
        if tagset == 'bnc':
            features = features[
                features.index.get_level_values('tag').isin(['SUBST', 'VERB', 'ADJ', 'ADV'])
            ]
        else:
            features = features[
                features.index.get_level_values('tag').isin(['N', 'V', 'J', 'R'])
            ]

    # It's important to sort by id to make sure that the most frequent
    # features are selected.
    features = features.sort('id').head(size)

    matrix = sparse.csc_matrix(space.matrix)[:, features['id']]

    assert len(features) == size

    # Reindex features
    features['id'] = list(range(size))

    new_space = Space(
        matrix,
        row_labels=space.row_labels,
        column_labels=features,
    )

    new_space.write(output)
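# A minimal, self-contained sketch of the column truncation above, using plain
# scipy and pandas instead of the Space class (assumed to wrap a sparse matrix
# plus row/column label frames).  The helper name is illustrative only, and it
# uses sort_values() where the command above uses the older DataFrame.sort().
def _truncate_columns_sketch(matrix, column_labels, size):
    """Keep the `size` columns with the smallest ids (the most frequent features)."""
    from scipy import sparse

    # Sorting by id first guarantees that the most frequent features survive.
    kept = column_labels.sort_values('id').head(size)
    truncated = sparse.csc_matrix(matrix)[:, kept['id'].values]

    # Reindex the surviving features so that ids are again contiguous from 0.
    kept = kept.copy()
    kept['id'] = range(len(kept))

    return truncated, kept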
def to_space(
    word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
    output=('o', 'space.h5', 'The output space file.'),
    word2vec_format=('', False, 'The input is in the word2vec binary format.'),
    pos_separator=('', '', 'POS separator.'),
):
    """Read a word2vec file and save it as a space file."""
    from gensim.models import Word2Vec

    if word2vec_format:
        model = Word2Vec.load_word2vec_format(word2vec, binary=True)
    else:
        model = Word2Vec.load(word2vec)

    if not pos_separator:
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': model.index2word,
                'tag': '_',
            },
        )
    else:
        tokens = [s.rsplit(pos_separator, maxsplit=1) for s in model.index2word]
        targets = pd.DataFrame(
            {
                'id': range(len(model.index2word)),
                'ngram': [n for n, _ in tokens],
                'tag': [t for _, t in tokens],
            },
        )

    targets.set_index(['ngram', 'tag'], inplace=True)

    context = pd.DataFrame(
        {
            'id': range(model.syn0.shape[1]),
            'ngram': range(model.syn0.shape[1]),
            'tag': '_',
        },
    )
    context.set_index(['ngram', 'tag'], inplace=True)

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )

    space.write(output)
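# A small standalone sketch of how the (ngram, tag) MultiIndex row labels are
# built when a POS separator is used.  Illustrative only: `words` stands in
# for gensim's model.index2word, and every word is assumed to contain the
# separator exactly as in the command above.
def _word2vec_labels_sketch(words, pos_separator='/'):
    import pandas as pd

    tokens = [w.rsplit(pos_separator, 1) for w in words]
    labels = pd.DataFrame(
        {
            'id': range(len(words)),
            'ngram': [ngram for ngram, _ in tokens],
            'tag': [tag for _, tag in tokens],
        },
    )
    return labels.set_index(['ngram', 'tag'])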
def to_space(
    word2vec=('', 'GoogleNews-vectors-negative300.bin.gz', 'Path to word2vec vectors.'),
    output=('o', 'space.h5', 'The output space file.'),
):
    """Read a word2vec file and save it as a space file."""
    from gensim.models import Word2Vec

    model = Word2Vec.load_word2vec_format(word2vec, binary=True)

    targets = pd.DataFrame({'id': range(len(model.index2word))}, index=model.index2word)
    targets.index.name = 'ngram'

    context = pd.DataFrame({'id': range(model.syn0.shape[1])})
    context.index.name = 'ngram'

    space = Space(
        data_ij=model.syn0,
        row_labels=targets,
        column_labels=context,
    )

    space.write(output)
def ittf(
    space,
    output,
    raw_space=('', '', 'Space with feature co-occurrence counts.'),
    times=('', ('n', 'logn'), 'Multiply the resulting values by n or logn.'),
):
    raw_space = read_space_from_file(raw_space)

    feature_cardinality = np.array(
        [v.nnz for v in raw_space.get_target_rows(*space.column_labels.index)]
    )

    n = space.matrix.todense()

    ittf = np.log(feature_cardinality) - np.log(n + 1)

    if times == 'n':
        matrix = np.multiply(n, ittf)
    elif times == 'logn':
        matrix = np.multiply(np.log(n + 1), ittf)

    Space(matrix, space.row_labels, space.column_labels).write(output)
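# A minimal NumPy sketch of the ITTF weighting above: each cell gets
# log(cardinality of the feature) - log(n + 1), then is multiplied back by
# either n or log(n + 1).  Standalone and illustrative; `counts` is a dense
# co-occurrence matrix and `feature_cardinality[j]` is the number of non-zero
# entries for feature j in the raw space (assumed to be at least 1).
def _ittf_sketch(counts, feature_cardinality, times='logn'):
    import numpy as np

    counts = np.asarray(counts, dtype=float)
    ittf = np.log(feature_cardinality) - np.log(counts + 1)

    if times == 'n':
        return counts * ittf
    return np.log(counts + 1) * ittf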
def transitive_verb_space(
    space_file,
    transitive_verb_arguments,
    execnet_hub,
    output=('o', 'space.h5', 'Output verb vector space.'),
    chunk_size=('', 100, 'The length of a chunk.'),
):
    data_to_send = (
        'data',
        pickle.dumps(
            {
                'space_file': space_file,
            },
        )
    )

    def init(channel):
        channel.send(data_to_send)

    groups = transitive_verb_arguments.groupby(
        # ['subj_stem', 'subj_tag', 'obj_stem', 'obj_tag'],
        ['verb_stem', 'verb_tag'],
    )

    groups = Bar(
        'Subject object Kronecker products',
        max=len(groups),
        suffix='%(index)d/%(max)d',
    ).iter(
        pickle.dumps(g) for g in groups
    )

    results = execnet_hub.run(
        remote_func=verb_space_builder,
        iterable=groups,
        init_func=init,
        verbose=False,
    )

    result = next(results)
    for r in results:
        for k, v in r.items():
            if k in result:
                result[k] += v
            else:
                result[k] = v

    result = list(result.items())

    verb_labels = [l for l, _ in result]
    verb_vectors = [v for _, v in result]

    del result

    matrix = sparse.vstack(verb_vectors)

    del verb_vectors

    row_labels = pd.DataFrame(
        {
            'ngram': [l[0] for l in verb_labels],
            'tag': [l[1] for l in verb_labels],
            'id': [i for i, _ in enumerate(verb_labels)],
        }
    ).set_index(['ngram', 'tag'])

    column_labels = pd.DataFrame(
        {
            'ngram': list(range(matrix.shape[1])),
            'tag': list(range(matrix.shape[1])),
            'id': list(range(matrix.shape[1])),
        }
    ).set_index(['ngram', 'tag'])

    space = Space(
        matrix,
        row_labels=row_labels,
        column_labels=column_labels,
    )

    space.write(output)
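# A standalone sketch of what verb_space_builder is assumed to compute for a
# single verb: the sum, over all (subject, object) pairs the verb occurs with,
# of the Kronecker products of the subject and object vectors, flattened into
# one long sparse row.  Illustrative only; in the command above the real work
# is distributed over execnet workers.
def _verb_matrix_sketch(subject_vectors, object_vectors):
    from scipy import sparse

    assert len(subject_vectors) == len(object_vectors)

    rows = (
        sparse.kron(sparse.csr_matrix(s), sparse.csr_matrix(o))
        for s, o in zip(subject_vectors, object_vectors)
    )

    # Accumulate the outer products into a single 1 x (n * m) row.
    result = None
    for row in rows:
        result = row if result is None else result + row
    return result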
def pmi(
    space,
    output,
    dictionary,
    column_dictionary=('', '', 'The frequencies of column labels.'),
    column_dictionary_key=('', 'dictionary', 'An identifier for the group in the store.'),
    no_log=('', False, 'Do not take logarithm of the probability ratio.'),
    remove_missing=('', False, 'Remove items that are not in the dictionary.'),
    conditional_probability=('', False, 'Compute only P(c|t).'),
    keep_negative_values=('', False, 'Keep negative values.'),
    times=('', ('', 'n', 'logn'), 'Multiply the resulting values by n or log(n+1).'),
    window_size=('', 10, 'The size of the window.'),
):
    """
    Weight elements using the positive PMI measure [3].

    max(0, log(P(c|t) / P(c)))

    [1] and [2] use a measure similar to PMI, but without the log, so it's
    just P(c|t) / P(c), which is sometimes called the likelihood ratio.

    `--dictionary` provides word frequencies for rows. In case columns are
    labelled differently, provide `--column-dictionary`.

    `--keep-negative-values` keeps negative values but replaces negative
    infinity with 0. This is equivalent to replacing P(c, t) with just P(c)
    when P(c, t) is 0.

    [1] Mitchell, Jeff, and Mirella Lapata. "Vector-based Models of Semantic
        Composition." ACL. 2008.

    [2] Grefenstette, Edward, and Mehrnoosh Sadrzadeh. "Experimental support
        for a categorical compositional distributional model of meaning."
        Proceedings of the Conference on Empirical Methods in Natural
        Language Processing. Association for Computational Linguistics, 2011.

    [3] http://en.wikipedia.org/wiki/Pointwise_mutual_information

    """
    def set_index(dictionary):
        dictionary.set_index(
            [c for c in dictionary.columns if c != 'count'],
            inplace=True,
        )

    set_index(dictionary)

    if column_dictionary:
        column_dictionary = pd.read_hdf(column_dictionary, key=column_dictionary_key)
        set_index(column_dictionary)
    else:
        column_dictionary = dictionary

    # These are the target frequency counts in the whole corpus, N(t).
    row_totals = dictionary.loc[space.row_labels.sort('id').index]['count']

    missing_rows = ~np.isfinite(row_totals)
    if missing_rows.any():
        if not remove_missing:
            raise ValueError('These rows are not finite!', row_totals[missing_rows])
        else:
            logger.warning('Removing the following rows: %s', row_totals[missing_rows])

    row_totals = row_totals[~missing_rows]
    row_totals = row_totals.values[:, np.newaxis]

    # These are the context probabilities in the whole corpus, P(c).
    column_totals = (
        column_dictionary.loc[space.column_labels.sort('id').index].values.flatten()
        / dictionary['count'].sum()
    )

    # Elements in the matrix are N(c, t): the co-occurrence counts.
    n = space.matrix.astype(float).todense()
    if remove_missing:
        n = n[~missing_rows.values]

    # The elements in the matrix are P(c|t).
    matrix = n / row_totals / window_size
    max_row_sum = matrix.sum(axis=1).max()
    assert max_row_sum < 1.0 or np.isclose(max_row_sum, 1.0)

    if not conditional_probability:
        if not no_log:
            # The elements in the matrix are log(P(c|t) / P(c)).
            matrix = np.log(matrix) - np.log(column_totals)

            if keep_negative_values:
                matrix[matrix == -np.inf] = -np.log(dictionary['count'].sum())
            else:
                matrix[matrix < 0] = 0.0
        else:
            # The elements in the matrix are P(c|t) / P(c).
            matrix /= column_totals

    if times == 'n':
        matrix = np.multiply(n, matrix)
    if times == 'logn':
        matrix = np.multiply(np.log(n + 1), matrix)

    Space(matrix, space.row_labels, space.column_labels).write(output)
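# A compact NumPy sketch of the positive PMI weighting implemented above,
# under the simplifying assumption that both row totals and context
# probabilities are derived from the co-occurrence matrix itself rather than
# from an external dictionary.  Illustrative only; assumes every target and
# context occurs at least once.
def _ppmi_sketch(counts, window_size=1):
    import numpy as np

    counts = np.asarray(counts, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)            # ~ N(t)
    column_probabilities = counts.sum(axis=0) / counts.sum()  # ~ P(c)

    conditional = counts / (row_totals * window_size)         # ~ P(c|t)

    with np.errstate(divide='ignore'):
        pmi = np.log(conditional) - np.log(column_probabilities)

    # The positive cut-off also zeroes the -inf cells produced by unseen pairs.
    pmi[pmi < 0] = 0.0
    return pmi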
def pmi(
    space,
    output,
    dictionary,
    column_dictionary=('', '', 'The frequencies of column labels.'),
    column_dictionary_key=('', 'dictionary', 'An identifier for the group in the store.'),
    no_log=('', False, 'Do not take logarithm of the probability ratio.'),
    remove_missing=('', False, 'Remove items that are not in the dictionary.'),
    conditional_probability=('', False, 'Compute only P(c|t).'),
    keep_negative_values=('', False, 'Keep negative values.'),
    neg=('', 1.0, 'The K parameter for shifted PPMI.'),
    log_base=('', np.e, 'The logarithm base to use.'),
    times=('', ('', 'n', 'logn'), 'Multiply the resulting values by n or log(n+1).'),
    window_size=('', 10, 'The size of the window.'),
    cds=('', float('nan'), 'Context distribution smoothing coefficient.'),
    smoothing=('', ('minprob', 'chance', 'compress'), 'How to deal with unseen co-occurrence probability.'),
):
    """
    Weight elements using the positive PMI measure [3].

    max(0, log(P(c|t) / P(c)))

    [1] and [2] use a measure similar to PMI, but without the log, so it's
    just P(c|t) / P(c), which is sometimes called the likelihood ratio.

    `--dictionary` provides word frequencies for rows. In case columns are
    labelled differently, provide `--column-dictionary`.

    [1] Mitchell, Jeff, and Mirella Lapata. "Vector-based Models of Semantic
        Composition." ACL. 2008.

    [2] Grefenstette, Edward, and Mehrnoosh Sadrzadeh. "Experimental support
        for a categorical compositional distributional model of meaning."
        Proceedings of the Conference on Empirical Methods in Natural
        Language Processing. Association for Computational Linguistics, 2011.

    [3] http://en.wikipedia.org/wiki/Pointwise_mutual_information

    """
    if log_base == np.e:
        log = np.log
        log1p = np.log1p
    else:
        def log(x, out=None):
            result = np.log(x, out)
            result /= np.log(log_base)
            return result

        def log1p(x, out=None):
            result = np.log1p(x, out)
            result /= np.log(log_base)
            return result

    def set_index(dictionary):
        dictionary.set_index(
            [c for c in dictionary.columns if c != 'count'],
            inplace=True,
        )

    set_index(dictionary)

    if column_dictionary:
        column_dictionary = pd.read_hdf(column_dictionary, key=column_dictionary_key)
        set_index(column_dictionary)
    else:
        column_dictionary = dictionary

    # These are the target frequency counts in the whole corpus, N(t).
    row_totals = dictionary.loc[space.row_labels.sort('id').index]['count']

    missing_rows = ~np.isfinite(row_totals)
    if missing_rows.any():
        if not remove_missing:
            raise ValueError('These rows are not finite!', row_totals[missing_rows])
        else:
            logger.warning('Not finite rows: %s', row_totals[missing_rows])

    N = dictionary['count'].sum()

    row_totals[missing_rows] = 1
    row_totals = row_totals.values[:, np.newaxis] / N

    if np.isnan(cds):
        # Use the dictionary for context total counts.
        column_totals = (
            column_dictionary.loc[space.column_labels.index].values.flatten() / N
        )
    else:
        # Use the co-occurrence matrix for context co-occurrence counts.
        # Prepare for the context distribution smoothing.
        smoothed_context_counts = np.array(space.matrix.sum(axis=0)).flatten() ** cds

        # These are the context probabilities in the whole corpus, P(c).
        column_totals = smoothed_context_counts / smoothed_context_counts.sum()

    # Elements in the matrix are N(c, t): the co-occurrence counts.
    n = space.matrix.astype(float).todense()

    # The elements in the matrix are P(c, t).
    matrix = n / (N * window_size)

    matrix_sum = matrix.sum()
    assert matrix_sum < 1.0 or np.isclose(matrix_sum, 1.0)

    # Check that P(c|t) <= 1.
    # max_row_sum = (matrix / (column_totals * row_totals)).sum(axis=1).max()
    # assert max_row_sum < 1.0 or np.isclose(max_row_sum, 1.0)

    if not conditional_probability:
        if not no_log:
            # PMI
            zero_counts = matrix == 0

            if smoothing == 'minprob':
                # Pretend that unseen pairs occurred once.
                matrix[zero_counts] = 1 / (N * window_size)

            if smoothing != 'compress':
                # The elements in the matrix are log(P(c, t)).
                log(matrix, matrix)

                # log(P(c, t)) - (log(P(c)) + log(P(t)))
                matrix -= log(column_totals)
                matrix -= log(row_totals)
            else:
                matrix /= column_totals * row_totals
                matrix = log1p(matrix, matrix)

            if smoothing in ('chance', 'compress'):
                matrix[zero_counts] = 0

            if not keep_negative_values:
                # PPMI
                if smoothing == 'compress':
                    matrix -= log(2)

                if neg != 1.0:
                    matrix -= log(neg)

                matrix[matrix < 0] = 0.0
        else:
            # Ratio
            # The elements in the matrix are P(c, t) / (P(c) * P(t)).
            matrix /= column_totals * row_totals
    else:
        # Conditional: P(c|t)
        matrix /= row_totals

        max_row_sum = matrix.sum(axis=1).max()
        assert max_row_sum < 1.0 or np.isclose(max_row_sum, 1.0)

    if times == 'n':
        matrix = np.multiply(n, matrix)
    if times == 'logn':
        matrix = np.multiply(np.log(n + 1), matrix)

    Space(matrix, space.row_labels, space.column_labels).write(output)
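# A standalone sketch of the shifted PPMI with context distribution smoothing
# that the variant above supports via the --neg and --cds options (in the
# spirit of Levy & Goldberg).  Probabilities are taken from the co-occurrence
# matrix itself, and unseen pairs are handled roughly as in the 'chance'
# branch.  Illustrative only; assumes every row and column has at least one
# non-zero count.
def _shifted_ppmi_sketch(counts, neg=1.0, cds=0.75):
    import numpy as np

    counts = np.asarray(counts, dtype=float)
    total = counts.sum()

    joint = counts / total                                           # P(t, c)
    row_probabilities = counts.sum(axis=1, keepdims=True) / total    # P(t)

    # Context distribution smoothing: raise context counts to the cds power
    # before normalising, which flattens the context distribution.
    smoothed = counts.sum(axis=0) ** cds
    column_probabilities = smoothed / smoothed.sum()                 # smoothed P(c)

    zero = joint == 0
    with np.errstate(divide='ignore'):
        pmi = np.log(joint) - np.log(row_probabilities) - np.log(column_probabilities)

    pmi[zero] = 0.0            # unseen pairs contribute nothing ('chance')
    pmi -= np.log(neg)         # the shift: SPPMI_k = max(PMI - log k, 0)
    pmi[pmi < 0] = 0.0
    return pmi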
def pmi(
    space,
    output,
    dictionary,
    column_dictionary=('', '', 'The frequencies of column labels.'),
    column_dictionary_key=('', 'dictionary', 'An identifier for the group in the store.'),
    no_log=('', False, 'Do not take logarithm of the probability ratio.'),
):
    """
    Weight elements using the positive PMI measure [3].

    max(0, log(P(c|t) / P(c)))

    [1] and [2] use a measure similar to PMI, but without the log, so it's
    just P(c|t) / P(c).

    `--dictionary` provides word frequencies for rows. In case columns are
    labelled differently, provide `--column-dictionary`.

    [1] Mitchell, Jeff, and Mirella Lapata. "Vector-based Models of Semantic
        Composition." ACL. 2008.

    [2] Grefenstette, Edward, and Mehrnoosh Sadrzadeh. "Experimental support
        for a categorical compositional distributional model of meaning."
        Proceedings of the Conference on Empirical Methods in Natural
        Language Processing. Association for Computational Linguistics, 2011.

    [3] http://en.wikipedia.org/wiki/Pointwise_mutual_information

    """
    def set_index(dictionary):
        dictionary.set_index(
            [c for c in dictionary.columns if c != 'count'],
            inplace=True,
        )

    set_index(dictionary)

    if column_dictionary:
        column_dictionary = pd.read_hdf(column_dictionary, key=column_dictionary_key)
        set_index(column_dictionary)
    else:
        column_dictionary = dictionary

    # These are the target frequency counts in the whole corpus, N(t).
    row_totals = dictionary.loc[space.row_labels.index]['count']
    assert np.isfinite(row_totals.values).all()
    row_totals = row_totals.values[:, np.newaxis]

    # This is the total number of words in the corpus.
    N = dictionary['count'].sum()

    # These are the context probabilities in the whole corpus, P(c).
    column_totals = column_dictionary.loc[space.column_labels.index].values.flatten() / N

    # Elements in the matrix are N(c, t): the co-occurrence counts.
    matrix = space.matrix.astype(float).todense()

    # The elements in the matrix are P(c|t).
    matrix /= row_totals

    if not no_log:
        # The elements in the matrix are log(P(c|t) / P(c)).
        new_matrix = np.log(matrix) - np.log(column_totals)
        new_matrix[new_matrix < 0] = 0.0
    else:
        # The elements in the matrix are P(c|t) / P(c).
        new_matrix = matrix / column_totals

    Space(new_matrix, space.row_labels, space.column_labels).write(output)
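# Worked toy example of the weighting above: suppose the target t occurs
# N(t) = 100 times, the context c occurs with probability P(c) = 0.01, and
# they co-occur N(c, t) = 5 times.  Then P(c|t) = 5 / 100 = 0.05 and the cell
# becomes max(0, log(0.05 / 0.01)) = log(5) ≈ 1.609 (natural log).  If
# instead N(c, t) = 0, log(0) = -inf is clipped to 0 by the positive cut-off.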