def test_with_different_separators():
    DocumentFeature.recompile_pattern(pos_separator='_', ngram_separator='!')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
        DocumentFeature.from_string('very_RB!big_J')

    DocumentFeature.recompile_pattern(pos_separator='-', ngram_separator=' ')
    assert DocumentFeature('1-GRAM', (Token('very', 'RB'),)) == DocumentFeature.from_string('very-RB')
    assert DocumentFeature('2-GRAM', (Token('very', 'RB'), Token('big', 'J'))) == \
        DocumentFeature.from_string('very-RB big-J')
def remove_overlapping_neighbours(cls, entry, to_insert):
    """
    :type entry: DocumentFeature or str
    :type to_insert: list of (str, float) tuples
    """
    if isinstance(entry, (six.string_types, six.text_type)):
        entry = DocumentFeature.from_string(entry)
    features = [(DocumentFeature.from_string(x[0]), x[1]) for x in to_insert]
    to_insert = [(f[0].tokens_as_str(), f[1]) for f in features
                 if not any(t in entry.tokens for t in f[0].tokens)]
    return to_insert
def get_all_document_features(include_unigrams=False, remove_pos=False):
    """
    Finds all noun-noun and adj-noun compounds (and optionally adjs and nouns) in all labelled corpora
    mentioned in the conf files.
    :param include_unigrams: if False, only NPs will be returned
    :param remove_pos: whether to remove PoS tags if present, result will be either "cat/N" or "cat"
    :rtype: set of DocumentFeature
    """
    result = set()
    accepted_df_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'} if include_unigrams else {'AN', 'NN', 'VO', 'SVO'}
    for corpus_name, _ in get_all_corpora():
        path = os.path.abspath(os.path.join(__file__, '..', '..', '..', ROOT, '%s_all_features.txt' % corpus_name))
        with open(path) as infile:
            for line in infile:
                df = DocumentFeature.from_string(line.strip())
                if df.type in accepted_df_types:
                    if remove_pos:
                        # todo these are of type str, in the other branch it's DocumentFeature. things will likely break
                        result.add(df.ngram_separator.join(t.text for t in df.tokens))
                    else:
                        result.add(df)

    logging.info('Found a total of %d features in all corpora', len(result))
    if not remove_pos:
        logging.info('Their types are %r', Counter(df.type for df in result))
    if include_unigrams:
        logging.info('PoS tags of unigrams are %r',
                     Counter(df.tokens[0].pos for df in result if df.type == '1-GRAM'))
    else:
        logging.info('Unigram features not included!')
    return result
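# Usage sketch (not part of the original source): assuming get_all_document_features and
# DocumentFeature are importable from this module, this is one way to pull out just the
# adjective-noun compounds found in the labelled corpora. Names are illustrative only.
def _example_list_adjective_noun_compounds():
    features = get_all_document_features(include_unigrams=False, remove_pos=False)
    # each element is a DocumentFeature, e.g. DocumentFeature.from_string('big/J_cat/N')
    return sorted(str(f) for f in features if f.type == 'AN')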
def contains_impl(self, feature):
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    if feature.type not in self.entry_types:
        # no point in composing single-word document features
        return False
    return str(feature[self.hardcoded_index]) in self.unigram_source
def to_tsv(self, events_path, entries_path='', features_path='',
           entry_filter=lambda x: True, row_transform=lambda x: x, gzipped=False,
           enforce_word_entry_pos_format=True, dense_hd5=False):
    """
    Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
    process converts all entries to a DocumentFeature, so all entries must be parsable into one.
    May reorder the features of each entry.

    :param events_path: file to write to
    :param entry_filter: Called for every DocumentFeature that is an entry in this thesaurus. The vector
    will only be written if this callable returns true
    :param row_transform: Callable, any transformation that might need to be done to each entry before
    converting it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader)
    are not directly convertible (needs to be african/J_leader/N). Use this if the entries cannot be converted
    to DocumentFeature, e.g. if the data isn't PoS tagged.
    :param dense_hd5: if true, convert to a pandas `DataFrame` and write to a compressed HDF file. This is
    about 30% faster and produces 30% smaller files than using `gzipped`. This is only suitable for matrices
    with a small number of columns- this method enforces a hard limit of 1000. Requires PyTables and HDF5.
    :return: the file name
    """
    if enforce_word_entry_pos_format:
        rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
    else:
        rows = {i: feat for (feat, i) in self.name2row.items()}

    if dense_hd5 and len(self.columns) <= 1000:
        write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
    else:
        write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                              features_path=features_path, entries_path=entries_path,
                              entry_filter=entry_filter, gzipped=gzipped)
    return events_path
def reformat_socher_vectors():
    """
    Formats the files output by Socher (2011)'s Matlab code into Byblo-compatible files.

    Before running this, a list of all phrases needs to be extracted from the labelled data, and these
    need to be composed with Socher's Matlab code. See note "Socher vectors" in Evernote.
    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # Socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Let's read the input to his code instead and
    # get the corresponding output vectors

    # get a list of all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # get a list of all phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]

    # pick out just the phrases that composed successfully
    composed_phrases = itemgetter(*success)(composed_phrases)

    # load all vectors, remove those containing unknown words
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',')
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat), composed_phrases,
                         ['RAE-feat%d' % i for i in range(100)],  # Socher provides 100-dimensional vectors
                         socher_composed_vectors_file)
def train_verb_tensors(svos_file, noun_vectors_file, output_filename):
    """
    Trains Verb-bar matrices, as described in Milajevs et al. (EMNLP-14, §3)
    :param svos_file: file containing a list of all SVOs in unlabelled data, one per line. May contain
    other document features too. Such a file is output by `find_all_NPs.py`, which is called from `observed_vectors.py`
    :param noun_vectors_file: a vector store containing noun vectors
    :param output_filename: name of output file- must identify the noun vectors and the unlabelled corpus
    """
    mkdirs_if_not_exists(os.path.dirname(output_filename))

    v = Vectors.from_tsv(noun_vectors_file)

    with open(svos_file) as infile:
        phrases = set()
        for line in infile:
            if DocumentFeature.from_string(line.strip()).type == 'SVO':
                phrases.add(tuple(line.strip().split('_')))
    phrases = [(subj, verb, obj) for subj, verb, obj in phrases if subj in v and obj in v]
    phrases = sorted(phrases, key=itemgetter(1))
    logging.info('Found %d SVOs in list', len(phrases))

    verb_tensors = dict()
    for verb, svos in groupby(phrases, itemgetter(1)):
        svos = list(svos)
        if len(svos) < MIN_SVO_PER_VERB:
            continue
        logging.info('Training matrix for %s from %d SVOs', verb, len(svos))
        # the verb matrix is the sum of outer products of subject and object vectors
        vt = np.sum(np.outer(v.get_vector(subj).A, v.get_vector(obj).A) for subj, _, obj in svos)
        verb_tensors[verb] = vt

    logging.info('Trained %d verb matrices, saving...', len(verb_tensors))
    for verb, tensor in verb_tensors.items():
        df = pd.DataFrame(tensor)
        df.to_hdf(output_filename, verb.split('/')[0], complevel=9, complib='zlib')
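# Illustrative sketch (an assumption, not from the original source) of the construction used above:
# each verb's matrix is the sum over its training SVO triples of the outer product of the subject and
# object noun vectors. The toy two-dimensional vectors below are made up.
import numpy as np

def _example_verb_matrix():
    nouns = {'cat/N': np.array([1.0, 0.0]), 'mouse/N': np.array([0.0, 2.0]),
             'dog/N': np.array([1.0, 1.0]), 'bone/N': np.array([2.0, 0.0])}
    svos = [('cat/N', 'chase/V', 'mouse/N'), ('dog/N', 'chase/V', 'bone/N')]
    # sum of outer products subj (x) obj over all SVOs the verb participates in -> a 2x2 matrix
    return sum(np.outer(nouns[s], nouns[o]) for s, _, o in svos)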
def test_write_vectors_to_disk(resources, tmpdir):
    """
    Checks the entries/features files; the events file is checked by
    thesisgenerator.tests.test_thesaurus.test_to_file
    :type th: Thesaurus
    """
    th, expected_entries, expected_features, filter_callable = resources
    events_file = str(tmpdir.join('events.txt'))
    entries_file = str(tmpdir.join('entries.txt'))
    features_file = str(tmpdir.join('features.txt'))

    if not th:  # empty thesaurus should raise an error
        with pytest.raises(ValueError):
            matrix, cols, rows = th.to_sparse_matrix()
    else:
        matrix, cols, rows = th.to_sparse_matrix()
        rows = [DocumentFeature.from_string(x) for x in rows]
        write_vectors_to_disk(sp.coo_matrix(matrix), rows, cols, events_file,
                              features_file, entries_file,
                              entry_filter=filter_callable)

        if expected_entries:
            # the file will not be written at all if there's nothing to put in it
            entries = [x.split('\t')[0] for x in _read_and_strip_lines(entries_file)]
            assert set(entries) == set(expected_entries)
        else:
            assert not os.path.exists(entries_file)

        if expected_features:
            features = [x.split('\t')[0] for x in _read_and_strip_lines(features_file)]
            assert features == expected_features
        else:
            assert not os.path.exists(features_file)
def __contains__(self, feature):
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    # this is an SVO and we have a verb tensor and vectors for both arguments
    return feature.type in self.entry_types and \
        str(feature[1]) in self.verb_tensors and \
        str(feature[0]) in self.unigram_source and \
        str(feature[2]) in self.unigram_source
def get_vector(self, feature):
    """
    :type feature: DocumentFeature
    :rtype: scipy.sparse.csr_matrix
    """
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    return sp.csr_matrix(reduce(self.function,
                                [self.unigram_source.get_vector(str(t)).A for t in feature[:]]))
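# Illustrative sketch (an assumption): with self.function = np.add, the get_vector above amounts to
# element-wise addition of the unigram vectors of the phrase's tokens; np.multiply would give
# point-wise multiplication instead. Toy dense vectors stand in for self.unigram_source here.
import numpy as np
from functools import reduce

def _example_additive_composition():
    unigrams = {'big/J': np.array([1.0, 2.0, 0.0]), 'cat/N': np.array([0.0, 1.0, 3.0])}
    return reduce(np.add, [unigrams[t] for t in ('big/J', 'cat/N')])  # -> array([1., 3., 3.])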
def test_document_feature_slicing():
    DocumentFeature.recompile_pattern()
    x = DocumentFeature.from_string('big/J_cat/N')
    assert x[0] == DocumentFeature.from_string('big/J')
    assert x[1] == DocumentFeature.from_string('cat/N')
    assert x[1] == DocumentFeature('1-GRAM', (Token('cat', 'N', 1), ))
    assert x[0:] == DocumentFeature.from_string('big/J_cat/N')

    x = DocumentFeature.from_string('cat/N')
    assert x[0] == DocumentFeature.from_string('cat/N')
    assert x[0:] == DocumentFeature.from_string('cat/N')
    assert x[:] == DocumentFeature.from_string('cat/N')
def __contains__(self, feature):
    # both head and modifier need to have unigram vectors.
    # I don't see why the modifier needs a vector, given that we're using
    # its matrix representation instead, but that is what dissect does
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    if feature.type not in self.entry_types:
        # no point in trying
        return False
    return all(str(f) in self.unigram_source for f in feature[:])
def _paraphrase(self, feature, vocabulary, j_indices, values, stats, **kwargs):
    """
    Replaces term with its k nearest neighbours from the thesaurus

    Parameters
    ----------
    neighbour_source : callable, returns a thesaurus-like object (a list of
        (neighbour, sim) tuples, sorted by highest sim first, acts as a defaultdict(list)).
        The callable takes one parameter for compatibility purposes- one of the possible
        callables I want to use here requires access to the vocabulary. The default
        behaviour is to return a callable pointing to the currently loaded thesaurus.
    """
    # logging.debug('Paraphrasing %r in doc %d', feature, doc_id)
    neighbours = self.thesaurus.get_nearest_neighbours(feature)
    if self.thesaurus.__class__.__name__ == 'Thesaurus':
        # todo this will also activate for DenseVectors, because they are also instances of Thesaurus
        # the check needs to be self.thesaurus.__class__.__name__ == 'Thesaurus', but then
        # we need to make sure init_sims is called with the correct vocabulary so that all neighbours are IV
        # precomputed thesauri do not guarantee that the returned neighbours will be in vocabulary
        # these should by now only be used in testing though
        neighbours = [(neighbour, sim) for (neighbour, sim) in neighbours
                      if DocumentFeature.from_string(neighbour) in vocabulary]
    event = [str(feature), len(neighbours)]
    for neighbour, sim in neighbours[:self.k]:
        # the document may already contain the feature we are about to insert into it,
        # so a merging strategy is required, e.g. what do we do if the document has the
        # word X in it and we encounter X again. By default, scipy uses addition
        df = DocumentFeature.from_string(neighbour)
        j_indices.append(vocabulary.get(df))
        values.append(self.sim_transformer(sim))

        # track the event
        event.extend([neighbour, sim])
    stats.register_paraphrase(tuple(event))
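# Worked sketch (an assumption; values are made up): for an out-of-vocabulary feature with
# neighbours [('dog/N', 0.8), ('terrier/N', 0.5)] and k=2, the loop above appends the neighbours'
# vocabulary indices to j_indices and their (possibly transformed) similarities to values, so the
# document vector receives mass at the neighbours' columns instead of the unseen feature's.
def _example_paraphrase_effect():
    vocabulary = {'dog/N': 0, 'terrier/N': 1}
    neighbours = [('dog/N', 0.8), ('terrier/N', 0.5)]
    j_indices, values = [], []
    for neighbour, sim in neighbours[:2]:  # k = 2
        j_indices.append(vocabulary[neighbour])
        values.append(sim)  # identity sim_transformer assumed
    return j_indices, values  # ([0, 1], [0.8, 0.5])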
def filter_out_infrequent_entries(desired_counts_per_feature_type, vectors):
    logging.info('Converting thesaurus to sparse matrix')
    mat, cols, rows = vectors.to_sparse_matrix()
    logging.info('Got a data matrix of shape %r', mat.shape)

    # convert to DocumentFeature for access to the PoS tag
    document_features = [DocumentFeature.from_string(r) for r in rows]

    # don't want to do dimensionality reduction on composed vectors
    feature_types = [df.type for df in document_features]
    assert all(x == '1-GRAM' or x == 'AN' or x == 'NN' for x in feature_types), Counter(feature_types)

    # get the PoS tags of each row in the matrix
    pos_tags = np.array([df.tokens[0].pos if df.type == '1-GRAM' else df.type for df in document_features])

    # find the rows of the matrix that correspond to the most frequent nouns, verbs, ...,
    # as measured by sum of feature counts. This is Byblo's definition of frequency (which is in fact a marginal),
    # but it is strongly correlated with what one normally thinks of as entry frequency
    desired_rows = []
    if desired_counts_per_feature_type is not None:
        for desired_pos, desired_count in desired_counts_per_feature_type:
            row_of_current_pos = pos_tags == desired_pos  # boolean mask of rows with the right PoS tag
            # indices of the array sorted by row sum, and where the pos == desired_pos
            if desired_count > 0:
                sorted_idx_by_sum = np.ravel(mat.sum(1)).argsort()
                row_of_current_pos = row_of_current_pos[sorted_idx_by_sum]
                sorted_idx_and_pos_matching = sorted_idx_by_sum[row_of_current_pos]
                # slice off the top desired_count and store them
                desired_rows.extend(list(sorted_idx_and_pos_matching[-desired_count:]))
            else:
                # do not include
                pass

            logging.info('Frequency filter keeping %d/%d %s entries ',
                         desired_count, sum(row_of_current_pos), desired_pos)
    else:
        logging.info('Not filtering any of the entries')
        desired_rows = range(len(vectors))

    # remove the vectors for infrequent entries, update list of pos tags too
    if desired_counts_per_feature_type is not None:
        # if some rows have been removed update respective data structures
        mat = mat[desired_rows, :]
        rows = itemgetter(*desired_rows)(document_features)
        pos_tags = pos_tags[desired_rows]

        # removing rows may empty some columns, remove these as well. This is probably not very likely to occur
        # as we have already filtered out infrequent features, so the column count will stay roughly the same
        desired_cols = np.ravel(mat.sum(0)) > 0
        mat = mat[:, desired_cols]
        col_indices = list(np.where(desired_cols)[0])
        cols = itemgetter(*col_indices)(cols)

    logging.info('Selected only the most frequent entries, matrix size is now %r', mat.shape)
    assert mat.shape == (len(rows), len(cols))
    return mat, pos_tags, rows, cols
def contains_impl(self, feature):
    """
    Contains all sequences of words where we have a distributional vector for each unigram they contain.
    Rejects unigrams.
    """
    # if isinstance(feature, six.string_types):
    #     feature = DocumentFeature.from_string(feature)
    feat_str = str(feature) if isinstance(feature, DocumentFeature) else feature
    feat_df = feature if isinstance(feature, DocumentFeature) else DocumentFeature.from_string(feature)
    if feat_df.type not in self.entry_types:
        # no point in trying
        return False
    return all(f in self.unigram_source for f in feat_str.split(DocumentFeature.ngram_separator))
def run_glove():
    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert their format to ours
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))
    # remove any shit-looking tokens, they'll get in the way later
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)
    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
def __iter__(self):
    for fname in self.files:
        filename = join(self.dirname, fname)
        infile = gzip.open(filename) if is_gzipped(filename) else open(filename)
        with contextlib.closing(infile):
            for line in infile:
                # yield gensim.utils.tokenize(line, lower=True)
                if isinstance(line, bytes):
                    line = line.decode()
                res = [DocumentFeature.smart_lower(w) for w in line.split()
                       if DocumentFeature.from_string(w).type != 'EMPTY']
                if len(res) > 8:
                    # ignore short sentences, they are probably noise
                    if self.remove_pos:
                        yield [x.split('/')[0] for x in res]
                    else:
                        yield res
def from_tsv(cls, tsv_file, sim_threshold=-1e20, lowercasing=False, ngram_separator='_',
             row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
             max_neighbours=1e8, merge_duplicates=True, immutable=True, **kwargs):
    """
    Changes the default value of the sim_threshold parameter of super. Features can have any value,
    including negative (especially when working with neural embeddings).
    :rtype: Vectors
    """
    # For vectors, disallowing lexical overlap does not make sense at construction time, but should be
    # implemented in get_nearest_neighbours. A Thesaurus can afford to do the filtering when reading the
    # ready-made thesaurus from disk.
    allow_lexical_overlap = kwargs.pop('allow_lexical_overlap', True)

    if is_hdf(tsv_file):
        import pandas as pd

        df = pd.read_hdf(tsv_file, 'matrix')
        logging.info('Found a DF of shape %r in HDF file %s', df.shape, tsv_file)
        # pytables doesn't like unicode values and replaces them with an empty string.
        # pandas doesn't like duplicate values in index
        # remove these, we don't want to work with them anyway
        df = df[df.index != '']
        row_filter_mask = [row_filter(f, DocumentFeature.from_string(f)) for f in df.index]
        df = df[row_filter_mask]
        logging.info('Dropped non-ascii rows and applied row filter. Shape is now %r', df.shape)
        return DenseVectors(df, immutable=immutable,
                            allow_lexical_overlap=allow_lexical_overlap,
                            **kwargs)

    th = Thesaurus.from_tsv(tsv_file, sim_threshold=sim_threshold,
                            ngram_separator=ngram_separator,
                            allow_lexical_overlap=True,
                            row_filter=row_filter, column_filter=column_filter,
                            max_len=max_len, max_neighbours=max_neighbours,
                            merge_duplicates=merge_duplicates,
                            **kwargs)

    # get underlying dict from thesaurus
    if not th._obj:
        raise ValueError('No entries left over after filtering')
    return Vectors(th._obj, immutable=immutable,
                   allow_lexical_overlap=allow_lexical_overlap, **kwargs)
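# Usage sketch (an assumption; the file path is hypothetical): a row_filter receives both the raw
# entry string and its parsed DocumentFeature, so a noun-only vector store can be loaded like this.
def _example_load_noun_vectors_only():
    return Vectors.from_tsv('vectors.events.strings',
                            row_filter=lambda entry, df: df.type == '1-GRAM' and df.tokens[0].pos == 'N')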
def extract_features_from_tree_list(self, doc_sentences):
    """
    Turn a document (a list of sentences, each stored as a parse tree) into a sequence of features.
    Can extract features from the dependency trees (e.g. noun phrases) or traditional n-gram features.
    """
    features = []

    # extract sentence-internal token n-grams
    for parse_tree in doc_sentences:
        if not parse_tree:  # the sentence segmenter sometimes returns empty sentences
            continue

        features.extend(self.extract_features_from_single_dependency_tree(parse_tree))

        # extract sentence-internal n-grams of the right PoS tag
        if self.extract_unigram_features:
            # just unigrams, can get away without sorting the tokens
            for token in parse_tree.nodes_iter():
                if token.pos not in self.extract_unigram_features:
                    continue
                features.append(DocumentFeature('1-GRAM', (token,)))

        # some tests use standard bigrams, extract them too
        if self.standard_ngram_features > 1:
            # the tokens are stored as nodes in the parse tree in ANY order, sort them
            sentence = sorted(parse_tree.nodes(), key=attrgetter('index'))
            n_tokens = len(sentence)
            for n in range(2, min(self.standard_ngram_features + 1, n_tokens + 1)):
                for i in range(n_tokens - n + 1):
                    feature = DocumentFeature('%d-GRAM' % n, tuple(sentence[i: i + n]))
                    features.append(feature)

    # it doesn't matter where in the sentence/document these features were found
    # erase their index
    for feature in features:
        for token in feature.tokens:
            token.index = 'any'

    # remove all features that aren't right- they are there because the code above doesn't
    # put the features through the validation code in DocumentFeature.from_string
    # e.g. the verb phrase "123/V_$$$/N" is not put through validation, so it will be returned as a feature
    return [f for f in features if DocumentFeature.from_string(str(f)).type != 'EMPTY']
def __contains__(self, feature):
    """
    Accept all adjective-noun or noun-noun phrases where we have a corpus-observed vector for the head
    and a learnt matrix (through PLSR) for the modifier
    """
    # todo expand unit tests now that we have a real composer
    if feature.type not in self.entry_types:
        # ignore non-AN features
        return False

    modifier, head = feature.tokens
    assert ('J', 'N') == (modifier.pos, head.pos) or ('N', 'N') == (modifier.pos, head.pos)

    # if DocumentFeature('1-GRAM', (noun,)) not in self.unigram_source:
    if DocumentFeature.from_string(str(head)) not in self.unigram_source:
        # ignore ANs containing unknown nouns
        return False

    # ignore ANs containing unknown adjectives
    return str(modifier) in self.available_modifiers
def to_tsv(self, events_path, entries_path='', features_path='',
           entry_filter=lambda x: True, row_transform=lambda x: x):
    """
    Writes this thesaurus to a Byblo-compatible file like the one it was most likely read from. In the
    process converts all entries to a DocumentFeature, so all entries must be parsable into one.
    May reorder the features of each entry.

    :param events_path: file to write to
    :param entry_filter: Called for every DocumentFeature that is an entry in this thesaurus. The vector
    will only be written if this callable returns true
    :param row_transform: Callable, any transformation that might need to be done to each entry before
    converting it to a DocumentFeature. This is needed because some entries (e.g. african/J:amod-HEAD:leader)
    are not directly convertible (needs to be african/J_leader/N). Use this if the entries cannot be converted
    to DocumentFeature, e.g. if the data isn't PoS tagged.
    :return: the file name
    """
    rows = {i: DocumentFeature.from_string(row_transform(feat)) for (feat, i) in self.name2row.items()}
    write_vectors_to_disk(self.matrix.tocoo(), rows, self.columns, events_path,
                          features_path=features_path, entries_path=entries_path,
                          entry_filter=entry_filter)
    return events_path
def test_left_right_compose_all(left_comp):
    original_matrix, original_cols, original_rows = left_comp.unigram_source.to_sparse_matrix()
    matrix, cols, rows = left_comp.compose_all(['cat/N_game/N',
                                                DocumentFeature.from_string('dog/N_game/N'),
                                                'cat/N_a/N', 'cat/N_b/N', 'cat/N_c/N', 'cat/N_d/N',
                                                ])

    # the columns should remain unchanged
    assert original_cols == cols

    # the first rows are for the unigrams that existed before composition- 7 of them
    assert_array_equal(original_matrix.A, matrix.A[:7, :])

    # new rows should appear, one for each composed feature
    # this should be reflected in both the index and the matrix
    assert rows['cat/N_game/N'] == 7
    assert rows['dog/N_game/N'] == 8
    assert matrix.shape == (13, 7) == (len(rows), len(cols))
    assert_array_equal(matrix.A[7, :], left_comp.unigram_source.get_vector('cat/N').A.ravel())
    assert_array_equal(matrix.A[8, :], left_comp.unigram_source.get_vector('dog/N').A.ravel())
    for i in range(9, 12):
        assert_array_equal(matrix.A[i, :], left_comp.unigram_source.get_vector('cat/N').A.ravel())
def filter_preextracted_features(self, feature_list):
    """
    Takes a list of features and keeps only those mentioned in the constructor parameters. This is a minor
    optimisation- extraction is a little slow, so we can just extract tons of features in advance and then
    just filter them dynamically for each experiment
    :param feature_list:
    :return:
    """
    res = []
    for feat_str in feature_list:
        feat = DocumentFeature.from_string(feat_str)
        if feat.type == 'EMPTY':
            continue
        if feat.type == '1-GRAM' and feat.tokens[0].pos not in self.extract_unigram_features:
            continue
        if feat.type != '1-GRAM' and feat.type not in self.extract_phrase_features:
            continue
        if self.remove_features_with_NER and set(t.ner for t in feat.tokens) != {'O'}:
            continue
        if self.remove_pos:
            for token in feat.tokens:
                token.pos = None
        res.append(feat)
    return res
def get_vector(self, feature):
    if isinstance(feature, six.string_types):
        feature = DocumentFeature.from_string(feature)
    return self.unigram_source.get_vector(str(feature[self.hardcoded_index]))
def do_svd(input_path, output_prefix,
           desired_counts_per_feature_type=[('N', 8), ('V', 4), ('J', 4), ('RB', 2), ('AN', 2)],
           reduce_to=[3, 10, 15], apply_to=None, write=3, use_hdf=True):
    """
    Performs truncated SVD. A copy of the trained sklearn SVD estimator will also be saved

    :param input_path: list of files containing vectors in TSV format. All vectors will be reduced together.
    :type input_path: list of file names or a Vectors object
    :param output_prefix: Where to output the reduced files. An extension will be added.
    :param desired_counts_per_feature_type: how many entries to keep of each DocumentFeature type, by frequency.
    This is the PoS tag for unigram features and the feature type otherwise. For instance, pass in
    [('N', 2), ('AN', 0)] to select 2 unigrams of PoS N and 0 bigrams of type adjective-noun. Types that are
    not explicitly given a positive desired count are treated as if the desired count is 0. If this is None,
    no filtering is performed.
    :param reduce_to: list of integers, what dimensionalities to reduce to
    :param apply_to: a file path. After SVD has been trained on input_path, it can be applied to apply_to.
    Output will be written to the same file
    :param write: Once SVD is trained on A and applied to B, output either A, B or vstack(A, B). Use
    values 1, 2, and 3 respectively. Default is 3.
    :param use_hdf: if true, store results as a pandas DF in HDF. This will enforce some constraints like
    not having duplicate entries in the index, which I deliberately break with some of the unit tests.
    This switch is the easiest way to avoid modifying the unit tests
    :type write: int
    :raise ValueError: If the loaded thesaurus is empty
    """
    if not 1 <= write <= 3:
        raise ValueError('value of parameter write must be 1, 2 or 3')

    if not isinstance(input_path, Vectors):
        thesaurus = Vectors.from_tsv(input_path, lowercasing=False)
    else:
        thesaurus = input_path
    if not thesaurus:
        raise ValueError('Empty thesaurus %r', input_path)
    mat, _, rows, cols = filter_out_infrequent_entries(desired_counts_per_feature_type, thesaurus)

    if apply_to:
        cols = set(cols)
        if not isinstance(apply_to, Vectors):
            thes_to_apply_to = Vectors.from_tsv(apply_to, lowercasing=False,
                                                column_filter=lambda foo: foo in cols)
        else:
            thes_to_apply_to = apply_to
        # get the names of each thesaurus entry
        extra_rows = [x for x in thes_to_apply_to.keys()]
        # vectorize second matrix with the vocabulary (columns) of the first thesaurus to ensure shapes match
        # "project" second thesaurus into space of first thesaurus
        thesaurus.v.vocabulary_ = {x: i for i, x in enumerate(list(cols))}
        extra_matrix = thesaurus.v.transform([dict(fv) for fv in thes_to_apply_to.values()])
        # make sure the shape is right
        assert extra_matrix.shape[1] == mat.shape[1]

        if write == 3:
            # extend the list of names
            rows = list(rows) + [DocumentFeature.from_string(x) for x in extra_rows]
        elif write == 2:
            rows = [DocumentFeature.from_string(x) for x in extra_rows]
        # no need to do anything if write == 1

    for n_components in reduce_to:
        method, reduced_mat = _do_svd_single(mat, n_components)
        if not method:
            continue
        if apply_to:
            logging.info('Applying learned SVD transform to matrix of shape %r', extra_matrix.shape)
            # apply learned transform to new data
            if write == 3:
                # append to old data
                reduced_mat = np.vstack((reduced_mat, method.transform(extra_matrix)))
            elif write == 2:
                reduced_mat = method.transform(extra_matrix)

        path = '{}-SVD{}'.format(output_prefix, n_components)
        _write_to_disk(scipy.sparse.coo_matrix(reduced_mat), path, rows, use_hdf=use_hdf)
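# Usage sketch (an assumption; all paths and counts are hypothetical): reduce a unigram vector store
# to 100 and 300 dimensions, keeping only the most frequent nouns and adjectives, and apply the learned
# projection to a second file of observed phrase vectors, writing both the train and apply data (write=3).
def _example_do_svd():
    do_svd('unigram_vectors.events.strings', 'reduced/unigrams',
           desired_counts_per_feature_type=[('N', 8000), ('J', 4000)],
           reduce_to=[100, 300],
           apply_to='observed_phrase_vectors.events.strings',
           write=3)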
def train_grefenstette_multistep_composer(all_vectors_file, root_dir):
    """
    Train Grefenstette et al.'s multistep regression VO/SVO model
    Adapted from dissect's ex19.py
    :param all_vectors_file: file containing N, V, VO and SVO vectors
    :param root_dir: where to write temp files and output
    """
    mkdirs_if_not_exists(root_dir)
    vo_composer_output_file = join(root_dir, 'vo_comp.pkl')
    svo_composer_output_file = join(root_dir, 'svo_comp.pkl')

    filename = basename(all_vectors_file)
    noun_events_file = join(root_dir, '%s-onlyN.tmp' % filename)
    # verb_events_file = join(root_dir, '%s-onlyV.tmp' % filename)
    # vo_events_file = join(root_dir, '%s-onlyVO.tmp' % filename)
    svo_events_file = join(root_dir, '%s-onlySVO.tmp' % filename)

    # this has unigrams and observed phrases
    thes = Vectors.from_tsv(all_vectors_file)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)
    # thes.to_tsv(verb_events_file,
    #             entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'V')
    # _translate_byblo_to_dissect(verb_events_file)
    # thes.to_tsv(vo_events_file,
    #             entry_filter=lambda x: x.type == 'VO')
    # _translate_byblo_to_dissect(vo_events_file)
    thes.to_tsv(svo_events_file,
                entry_filter=lambda x: x.type == 'SVO')
    _translate_byblo_to_dissect(svo_events_file)

    train_vo_data, train_v_data = [], []
    for phrase in thes.keys():
        df = DocumentFeature.from_string(phrase)
        if df.type == 'SVO':
            train_vo_data.append((str(df[1:]), str(df[0]), str(df)))
        if df.type == 'VO':
            train_v_data.append((str(df[0]), str(df[1]), str(df)))
    # logging.info('train_vo_data %r', len(train_vo_data))
    # logging.info('train_v_data %r', len(train_v_data))

    # load N and SVO spaces
    n_space = Space.build(data=noun_events_file + '.sm',
                          cols=noun_events_file + '.cols',
                          format="sm")
    svo_space = Space.build(data=svo_events_file + '.sm',
                            cols=svo_events_file + '.cols',
                            format="sm")

    logging.info("Input SVO training space:")
    logging.info(svo_space.id2row)
    # logging.info(svo_space.cooccurrence_matrix)

    # 1. train a model to learn VO functions on train data: VO N -> SVO
    logging.info("Step 1 training")
    vo_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)  # Gref et al 2013, §5 says 3
    vo_model.train(train_vo_data, n_space, svo_space)
    io_utils.save(vo_model, vo_composer_output_file)

    # 2. train a model to learn V functions on train data: V N -> VO
    # where VO space: function space learned in step 1
    logging.info("Step 2 training")
    vo_space = vo_model.function_space
    v_model = LexicalFunction(learner=RidgeRegressionLearner(), min_samples=2)
    v_model.train(train_v_data, n_space, vo_space)
    io_utils.save(v_model, svo_composer_output_file)
def train_baroni_guevara_composers(all_vectors,
                                   ROOT_DIR,
                                   baroni_output_path, guevara_output_path,
                                   baroni_threshold=10):
    """
    :type all_vectors: str; path to vectors file containing both N and observed AN vectors
    :type ROOT_DIR: str; where to write temp files
    :type baroni_output_path: str; where to write pickled baroni composer
    :type guevara_output_path: str
    :type baroni_threshold: int
    """
    SVD_DIMS = 100
    baroni_training_phrase_types = {'AN', 'NN'}  # what kind of NPs to train Baroni composer for

    # prepare the input files to be fed into Dissect
    mkdirs_if_not_exists(ROOT_DIR)

    filename = basename(all_vectors)
    noun_events_file = join(ROOT_DIR, '%s-onlyN-SVD%d.tmp' % (filename, SVD_DIMS))
    NPs_events_file = join(ROOT_DIR, '%s-onlyPhrases-SVD%d.tmp' % (filename, SVD_DIMS))

    thes = Vectors.from_tsv(all_vectors, lowercasing=False)
    thes.to_tsv(noun_events_file,
                entry_filter=lambda x: x.type == '1-GRAM' and x.tokens[0].pos == 'N')
    _translate_byblo_to_dissect(noun_events_file)

    thes.to_tsv(NPs_events_file,
                entry_filter=lambda x: x.type in baroni_training_phrase_types,
                row_transform=lambda x: str(x).replace(' ', '_'))
    _translate_byblo_to_dissect(NPs_events_file)

    my_space = Space.build(data="{}.sm".format(noun_events_file),
                           rows="{}.rows".format(noun_events_file),
                           cols="{}.cols".format(noun_events_file),
                           format="sm")
    logging.info('Each unigram vector has dimensionality %r', my_space.element_shape)

    # create a peripheral space
    my_per_space = PeripheralSpace.build(my_space,
                                         data="{}.sm".format(NPs_events_file),
                                         rows="{}.rows".format(NPs_events_file),
                                         # The columns of the peripheral space have to be identical to those
                                         # in the core space (including their order)!
                                         cols="{}.cols".format(NPs_events_file),
                                         format="sm")
    logging.info('Each phrase vector has dimensionality %r', my_per_space.element_shape)

    # use the model to compose words in my_space
    all_data = []
    for phrase in my_per_space._row2id:
        # make sure there are only NPs here
        if DocumentFeature.from_string(phrase.replace(' ', '_')).type in baroni_training_phrase_types:
            adj, noun = phrase.split('_')
            all_data.append((adj, noun, '%s_%s' % (adj, noun)))

    # train a composition model on the data and save it
    baroni = LexicalFunction(min_samples=baroni_threshold, learner=RidgeRegressionLearner())
    guevara = FullAdditive(learner=RidgeRegressionLearner())
    for composer, out_path in zip([baroni, guevara],
                                  [baroni_output_path, guevara_output_path]):
        composer.train(all_data, my_space, my_per_space)
        io_utils.save(composer, out_path)
        logging.info('Saved trained composer to %s', out_path)
def from_tsv(cls, tsv_file, sim_threshold=0, include_self=False,
             lowercasing=False, ngram_separator='_', allow_lexical_overlap=True,
             row_filter=lambda x, y: True, column_filter=lambda x: True, max_len=50,
             max_neighbours=1e8, merge_duplicates=False, immutable=True,
             enforce_word_entry_pos_format=True, tar=False, **kwargs):
    """
    Create a Thesaurus by parsing a Byblo-compatible TSV file (events or sims).
    If duplicate values are encountered during parsing, only the latest will be kept.

    :param tsv_file: path to input TSV file
    :type tsv_file: str
    :param sim_threshold: min similarity between an entry and its neighbour for the neighbour to be included
    :type sim_threshold: float
    :param include_self: whether to include self as nearest neighbour.
    :type include_self: bool
    :param lowercasing: if true, most of what is read will be lowercased (excluding PoS tags), so
    Cat/N -> cat/N. This is desirable when reading thesauri with this class. If False, no lowercasing
    will take place. This might be desirable when reading feature lists or already lowercased neighbour
    lists. FET + Byblo thesauri are already lowercased.
    :type lowercasing: bool
    :param ngram_separator: When n-gram entries are read in, what are the individual tokens separated by
    :param column_filter: A function that takes a string (column in the file) and returns whether or not
    the string should be kept
    :param row_filter: takes a string and its corresponding DocumentFeature and determines if it should be
    loaded. If `enforce_word_entry_pos_format` is `False`, the second parameter to this function will be `None`
    :param allow_lexical_overlap: whether neighbours/features are allowed to overlap lexically with the entry
    they are neighbours/features of. NOTE: THE BEHAVIOUR OF THIS PARAMETER IS SLIGHTLY DIFFERENT FROM THE
    EQUIVALENT IN VECTORS. SEE COMMENT THERE.
    :param max_len: maximum length (in characters) of permissible **entries**. Longer entries are ignored.
    :param max_neighbours: maximum neighbours per entry. This is applied AFTER the filtering defined by
    column_filter and allow_lexical_overlap is finished.
    :param merge_duplicates: whether to raise an error if multiple entries exist, or concatenate/add them
    together. The former is appropriate for `Thesaurus`, and the latter for `Vectors`
    :param enforce_word_entry_pos_format: if true, entries that are not in a `word/POS` format are skipped.
    This must be true for `allow_lexical_overlap` to work.
    :param tar: whether the file is compressed by running `tar -zcvf file.gz file.txt`. Assuming the tar
    contains a single file.
    """
    if not tsv_file:
        raise ValueError("No thesaurus specified")

    to_return = dict()
    logging.info('Loading thesaurus %s from disk', tsv_file)

    gz_file = tsv_file + '.gz'
    if os.path.exists(gz_file) and tar:
        logging.warning('Using .gz version of thesaurus')
        tsv_file = gz_file

    if not allow_lexical_overlap:
        logging.warning('DISALLOWING LEXICAL OVERLAP')

    if not allow_lexical_overlap and not enforce_word_entry_pos_format:
        raise ValueError('allow_lexical_overlap requires entries to be converted to a DocumentFeature. '
                         'Please enable enforce_word_entry_pos_format')

    FILTERED = '___FILTERED___'.lower()

    if tar:
        tarf = tarfile.open(tsv_file, 'r')
        members = tarf.getmembers()
        if len(members) != 1:
            # todo this is odd, I don't know why it is happening
            # on some machines tar adds a second hidden file to the archive
            logging.warning('Tar archive contains multiple files: %r' % members)
            logging.warning('Using the last file in the tar')
        fhandle = tarf.extractfile(members[-1])
    else:
        fhandle = open(tsv_file)

    with fhandle as infile:
        for line in infile.readlines():
            if tar:
                # this is a byte stream, needs to be decoded
                tokens = line.decode('UTF8').strip().split('\t')
            else:
                tokens = line.strip().split('\t')

            if len(tokens) % 2 == 0:
                # must have an odd number of things, one for the entry
                # and pairs for (neighbour, similarity)
                logging.warning('Skipping dodgy line in thesaurus file: %s\n %s', tsv_file, line)
                continue

            if tokens[0] != FILTERED:
                key = DocumentFeature.smart_lower(tokens[0], ngram_separator, lowercasing)
                dfkey = DocumentFeature.from_string(key) if enforce_word_entry_pos_format else None

                if enforce_word_entry_pos_format and dfkey.type == 'EMPTY':
                    # do not load things in the wrong format, they'll get in the way later
                    logging.warning('%s is not in the word/POS format, skipping', tokens[0])
                    continue

                if (not row_filter(key, dfkey)) or len(key) > max_len:
                    logging.warning('Skipping entry for %s', key)
                    continue

                to_insert = [(DocumentFeature.smart_lower(word, ngram_separator, lowercasing), float(sim))
                             for (word, sim) in walk_nonoverlapping_pairs(tokens, 1)
                             if word.lower() != FILTERED and column_filter(word) and float(sim) > sim_threshold]

                if not allow_lexical_overlap:
                    to_insert = cls.remove_overlapping_neighbours(dfkey, to_insert)

                if len(to_insert) > max_neighbours:
                    to_insert = to_insert[:max_neighbours]

                if include_self:
                    to_insert.insert(0, (key, 1.0))

                # the steps above may filter out all neighbours of an entry. if this happens,
                # do not bother adding it
                if len(to_insert) > 0:
                    if key in to_return:
                        # this is a duplicate entry, merge it or raise an error
                        if merge_duplicates:
                            logging.warning('Multiple entries for "%s" found. Merging.', tokens[0])
                            c = Counter(dict(to_return[key]))
                            c.update(dict(to_insert))
                            to_return[key] = [(k, v) for k, v in c.items()]
                        else:
                            raise ValueError('Multiple entries for "%s" found.' % tokens[0])
                    else:
                        to_return[key] = to_insert
                else:
                    logging.warning('Nothing survived filtering for %r', key)

    return Thesaurus(to_return, immutable=immutable)
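# Format sketch (inferred from the parsing code above; the entry and similarity values are made up):
# each line of a Byblo-compatible sims file is tab-separated, with the entry first followed by
# (neighbour, similarity) pairs, i.e. an odd number of fields per line.
def _example_parse_sims_line():
    line = 'cat/N\tdog/N\t0.81\tfeline/N\t0.76'
    tokens = line.strip().split('\t')
    entry, pairs = tokens[0], list(zip(tokens[1::2], map(float, tokens[2::2])))
    return entry, pairs  # ('cat/N', [('dog/N', 0.81), ('feline/N', 0.76)])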
def _write_features_of_single_corpus_to_file(all_phrases, corpus_name):
    ALL_FEATURES_FILE = '%s/%s_all_features.txt' % (ROOT, corpus_name)
    NP_MODIFIERS_FILE = '%s/%s_np_modifiers.txt' % (ROOT, corpus_name)
    VERBS_FILE = '%s/%s_verbs.txt' % (ROOT, corpus_name)
    SOCHER_FILE = '%s/%s_socher.txt' % (ROOT, corpus_name)

    logging.info('Writing %d unique document features to files in %s', len(all_phrases), ROOT)

    # How the Stanford parser formats NPs and VPs
    # (ROOT
    #  (NP (NN acquisition) (NN pact)))
    #
    # (ROOT
    #  (NP (JJ pacific) (NN stock)))
    stanford_NP_pattern = '(ROOT\n (NP ({} {}) ({} {})))\n\n'

    # (ROOT
    #  (S
    #   (NP (NNS cats))
    #   (VP (VBP eat)
    #    (NP (NNS dogs)))))
    stanford_SVO_pattern = '(ROOT\n (S\n (NP (NN {}))\n (VP (VB {})\n (NP (NN {})))))\n\n'

    # (ROOT
    #  (S
    #   (VP (VB eat)
    #    (NP (NNS cats)))))
    stanford_VO_pattern = '(ROOT\n (S\n (VP (VB {})\n (NP (NN {})))))\n\n'

    # (ROOT
    #  (NP (NN roads)))
    # I checked that this extracts the neural word embedding for the word
    stanford_unigram_pattern = '(ROOT\n (NP ({} {})))\n\n'

    mkdirs_if_not_exists(ROOT)
    logging.info('Writing all document features to files')
    seen_modifiers, seen_verbs = set(), set()
    with open(SOCHER_FILE, 'w') as outf_socher, \
            open(NP_MODIFIERS_FILE, 'w') as outf_mods, \
            open(VERBS_FILE, 'w') as outf_verbs, \
            open(ALL_FEATURES_FILE, 'w') as outf_plain:
        for item in all_phrases:
            item = DocumentFeature.from_string(item)
            # write in my underscore-separated format
            outf_plain.write(str(item) + '\n')

            if item.type in {'AN', 'NN'}:
                # write the phrase in Socher's format
                string = stanford_NP_pattern.format(item.tokens[0].pos * 2, item.tokens[0].text,
                                                    item.tokens[1].pos * 2, item.tokens[1].text)
                outf_socher.write(string)

            if item.type in {'VO', 'SVO'}:
                verb = str(item.tokens[-2])
                if verb not in seen_verbs:
                    seen_verbs.add(verb)
                    outf_verbs.write(verb)
                    outf_verbs.write('\n')

            if item.type == 'VO':
                string = stanford_VO_pattern.format(*[x.tokens[0].text for x in item])
                outf_socher.write(string)

            if item.type == 'SVO':
                string = stanford_SVO_pattern.format(*[x.tokens[0].text for x in item])
                outf_socher.write(string)

            if item.type in {'AN', 'NN'}:
                # write just the modifier separately
                first = str(item.tokens[0])
                second = str(item.tokens[1])
                if first not in seen_modifiers:
                    outf_mods.write('%s\n' % first)
                    seen_modifiers.add(first)

            if item.type == '1-GRAM':
                string = stanford_unigram_pattern.format(item.tokens[0].pos * 2,
                                                         item.tokens[0].text)
                outf_socher.write(string)

            if item.type not in {'1-GRAM', 'AN', 'NN', 'VO', 'SVO'}:
                # there shouldn't be any other features
                raise ValueError('Item %r has the wrong feature type: %s' % (item, item.type))
def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes,
                              remove_pos=False,
                              pretrained_Baroni_composer_file=None,
                              pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None,
                              categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Extracts all composable features from a labelled classification corpus and dumps a composed vector
    for each of them to disk. The output file will also contain all unigram vectors that were passed in,
    and only unigrams!
    :param unigram_vectors: a file in Byblo events format that contains vectors for all unigrams OR
    a Vectors object. This will be used in the composition process.
    :type unigram_vectors: str or Vectors
    :param classification_corpora: Corpora to extract features from. Dict {corpus_path: conf_file}
    :param pretrained_Baroni_composer_file: path to pre-trained Baroni AN/NN composer file
    :param output_dir:
    :param composer_classes: what composers to use
    :type composer_classes: list
    """
    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)

    # if this isn't a Vectors object, assume it's the name of a file containing vectors and load them
    if not isinstance(unigram_vectors, Vectors):
        # ensure there's only unigrams in the set of unigram vectors
        # composers do not need any ngram vectors contained in this file, they may well be
        # observed ones
        unigram_vectors = Vectors.from_tsv(unigram_vectors, row_filter=row_filter)
    logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # doing this loop in parallel isn't worth it as pickling or shelving `vectors` is so slow
    # it negates any gains from using multiple cores
    for composer_class in composer_classes:
        if composer_class == BaroniComposer:
            assert pretrained_Baroni_composer_file is not None
            composer = BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
        elif composer_class == GuevaraComposer:
            assert pretrained_Guevara_composer_file is not None
            composer = GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
        elif composer_class == GrefenstetteMultistepComposer:
            assert pretrained_Gref_composer_file is not None
            composer = GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
        elif composer_class in [CopyObject, FrobeniusAdd, FrobeniusMult]:
            composer = composer_class(categorical_vector_matrix_file, unigram_vectors)
        else:
            composer = composer_class(unigram_vectors)

        try:
            # compose_all returns all unigrams and composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            events_path = os.path.join(output_dir,
                                       'composed_%s_%s.events.filtered.strings' % (short_vector_dataset_name,
                                                                                   composer.name))
            if dense_hd5:
                write_vectors_to_hdf(mat, rows, cols, events_path)
            else:
                rows2idx = {i: DocumentFeature.from_string(x) for (x, i) in rows.items()}
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=lambda x: x.type in {'AN', 'NN', 'VO', 'SVO', '1-GRAM'},
                                      gzipped=gzipped)
        except ValueError as e:
            logging.error('RED ALERT, RED ALERT')
            logging.error(e)
            continue
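# Usage sketch (an assumption; all file paths and the dataset name are hypothetical): compose every
# phrase found in the labelled corpora with a pre-trained Baroni composer and dump the results
# alongside the original unigram vectors as an HDF file.
def _example_compose_and_write():
    compose_and_write_vectors('unigram_vectors.events.strings', 'wiki-100',
                              [BaroniComposer],
                              pretrained_Baroni_composer_file='baroni_AN_NN.pkl',
                              output_dir='composed_vectors', dense_hd5=True)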