def reformat_socher_vectors():
    """
    Convert the output of Socher (2011)'s matlab code into a Byblo-compatible
    HDF file.

    Before running this, a list of all phrases needs to be extracted from the
    labelled data, and these need to be composed with Socher's matlab code.
    See note "Socher vectors" in Evernote.

    Reads the module-level paths `plaintext_socher_input_file`,
    `socher_output_phrases_file` and `socher_output_vectors_file`, and writes
    to `socher_composed_vectors_file`. Returns nothing.
    """
    logging.info('Reformatting events file %s ---> %s',
                 socher_output_vectors_file, socher_composed_vectors_file)

    # socher's code removes all PoS tags, so we can't translate his output
    # back to a DocumentFeature. Read the input to his code instead and
    # pair it up, line by line, with the corresponding output vectors.

    # all phrases that we attempted to compose
    with open(plaintext_socher_input_file) as infile:
        composed_phrases = [DocumentFeature.from_string(line.strip()) for line in infile]

    # indices of the phrases where composition worked (no unknown words)
    with open(socher_output_phrases_file) as infile:
        success = [i for i, line in enumerate(infile) if '*UNKNOWN*' not in line]

    # Keep just the phrases that composed successfully. A plain comprehension
    # is used instead of operator.itemgetter because itemgetter returns a bare
    # element (not a sequence) for a single index and raises TypeError when
    # given no indices at all.
    composed_phrases = [composed_phrases[i] for i in success]

    # Load all vectors and drop the rows containing unknown words.
    # ndmin=2 keeps the matrix two-dimensional even if the file has one row.
    mat = np.loadtxt(socher_output_vectors_file, delimiter=',', ndmin=2)
    mat = mat[success, :]
    assert len(composed_phrases) == mat.shape[0]  # same number of rows

    # Name one feature per column actually present (Socher provides
    # 100-dimensional vectors, so this is normally RAE-feat0..RAE-feat99).
    feature_names = ['RAE-feat%d' % i for i in range(mat.shape[1])]

    # do the actual writing
    write_vectors_to_hdf(sp.coo_matrix(mat), composed_phrases, feature_names,
                         socher_composed_vectors_file)
def to_tsv(self, events_path, entries_path='', features_path='',
           entry_filter=lambda x: True, row_transform=lambda x: x,
           gzipped=False, enforce_word_entry_pos_format=True, dense_hd5=False):
    """
    Write this thesaurus to disk in a Byblo-compatible format, like the file
    it was most likely read from. May reorder the features of each entry.

    :param events_path: file to write to
    :param entries_path: forwarded to `write_vectors_to_disk` (text output only)
    :param features_path: forwarded to `write_vectors_to_disk` (text output only)
    :param entry_filter: called for every entry of this thesaurus; the vector
     is only written if this callable returns true. Forwarded to
     `write_vectors_to_disk`, so it only applies to the text output.
    :param row_transform: callable applied to each entry name before it is
     parsed into a DocumentFeature. Needed because some entries
     (e.g. african/J:amod-HEAD:leader) are not directly convertible and must
     first be rewritten (e.g. to african/J_leader/N).
    :param gzipped: forwarded to `write_vectors_to_disk` (text output only)
    :param enforce_word_entry_pos_format: if true, every entry name is passed
     through `row_transform` and parsed with `DocumentFeature.from_string`,
     so all entries must be parsable. If false, entry names are used as they
     are, e.g. when the data isn't PoS tagged.
    :param dense_hd5: if true and there are at most 1000 columns, write via
     `write_vectors_to_hdf` instead of the text format. The original notes
     report this as about 30% faster with about 30% smaller files than
     `gzipped`, and that it requires PyTables and HDF5. With more than 1000
     columns the text format is used regardless.
    :return: the file name (`events_path`)
    """
    if enforce_word_entry_pos_format:
        def as_entry(name):
            return DocumentFeature.from_string(row_transform(name))
    else:
        def as_entry(name):
            return name

    # row index -> entry; built up front regardless of the output format
    rows = {idx: as_entry(name) for name, idx in self.name2row.items()}

    # the dense writer is only used for matrices with few columns
    use_hdf = dense_hd5 and len(self.columns) <= 1000
    if not use_hdf:
        write_vectors_to_disk(coo_matrix(self.matrix), rows, self.columns, events_path,
                              features_path=features_path,
                              entries_path=entries_path,
                              entry_filter=entry_filter,
                              gzipped=gzipped)
        return events_path

    write_vectors_to_hdf(self.matrix, self.row_names, self.columns, events_path)
    return events_path
def merge_vectors(composed_dir, unigrams, output, workers=4, chunk_size=10000):
    """
    Reduce unigram vectors with a truncated SVD and project composed phrase
    vectors into the same reduced space, writing the results as HDF files.

    The unigram matrix is written to "<output>-unigrams-SVD<DIMS>". Composed
    vectors are processed in chunks of `chunk_size` files, each chunk being
    written to "<output>-phrases-chunk<i>-SVD<DIMS>".

    Side effect: sets `DocumentFeature.ngram_separator` to a single space.

    :param composed_dir: directory searched for "*apt.vec.gz" files
    :param unigrams: passed to `Vectors.from_tsv`; only 1-GRAM rows are kept
    :param output: prefix of all output file names
    :param workers: number of parallel jobs used to read composed vectors
    :param chunk_size: how many composed-vector files to process at a time
    """
    # this particular dataset uses spaces instead of underscores.
    # State this to avoid parsing issues
    DocumentFeature.ngram_separator = " "
    DIMS = 100  # SVD dimensionality

    def _svd_feature_names(matrix):
        # one zero-padded column name per column of the reduced matrix
        return ["SVD:feat{0:03d}".format(j) for j in range(matrix.shape[1])]

    files = glob(os.path.join(composed_dir, "*apt.vec.gz"))
    logging.info("Found %d composed phrase files", len(files))

    # ignore stuff that isn't unigrams, it will cause problems later
    unigrams = Vectors.from_tsv(unigrams, row_filter=lambda x, y: y.type == "1-GRAM")
    logging.info("Found %d unigram vectors", len(unigrams))

    mat, cols, rows = unigrams.to_sparse_matrix()
    # pin the vectorizer's vocabulary to the column order of the unigram
    # matrix so that later transforms produce matching columns
    unigrams.v.vocabulary_ = dict((name, pos) for pos, name in enumerate(cols))
    cols = set(cols)

    svd = TruncatedSVD(DIMS, random_state=0)
    logging.info("Reducing dimensionality of matrix of shape %r...", mat.shape)
    start = time.time()
    reduced_mat = svd.fit_transform(mat)
    elapsed = time.time() - start
    logging.info(
        "Reduced using {} from shape {} to shape {} in {} seconds".format(
            svd, mat.shape, reduced_mat.shape, elapsed
        )
    )

    write_vectors_to_hdf(
        reduced_mat,
        rows,
        _svd_feature_names(reduced_mat),
        "%s-unigrams-SVD%d" % (output, DIMS),
    )
    del mat  # no longer needed once the reduced matrix is on disk

    for i, chunk in enumerate(grouper(chunk_size, files)):
        logging.info("Reading composed vectors, chunk %d...", i)
        # falsy entries of the chunk are skipped (presumably padding added by grouper)
        parsed = Parallel(n_jobs=workers)(delayed(_read_vector)(f) for f in chunk if f)
        d = {phrase: features for phrase, features in parsed if features}
        logging.info("Found %d non-empty composed vectors in this chunk, running SVD now...", len(d))
        if not d:
            continue

        composed_vec = Vectors(d, column_filter=lambda foo: foo in cols)
        # vectorize second matrix with the vocabulary (columns) of the first
        # thesaurus to ensure shapes match, i.e. "project" the composed matrix
        # into the space of the unigram thesaurus
        extra_matrix = unigrams.v.transform([dict(fv) for fv in composed_vec.values()])
        assert extra_matrix.shape == (len(composed_vec), len(cols))
        logging.info("Composed matrix is of shape %r before SVD", extra_matrix.shape)

        extra_matrix = svd.transform(extra_matrix)
        write_vectors_to_hdf(
            extra_matrix,
            list(composed_vec.keys()),
            _svd_feature_names(extra_matrix),
            "%s-phrases-chunk%d-SVD%d" % (output, i, DIMS),
        )
        del composed_vec
def _write_to_disk(reduced_mat, prefix, rows, use_hdf=True):
    """
    Write a reduced matrix to "<prefix>.events.filtered.strings".

    :param reduced_mat: matrix to write; one column name is generated per column
    :param prefix: output path without the ".events.filtered.strings" suffix
    :param rows: row labels, handed to the writer unchanged
    :param use_hdf: if true use `write_vectors_to_hdf`, otherwise
     `write_vectors_to_disk`
    """
    # zero-padded names, e.g. SVD:feat000, SVD:feat001, ...
    column_names = ['SVD:feat{0:03d}'.format(i) for i in range(reduced_mat.shape[1])]
    events_file = prefix + '.events.filtered.strings'

    writer = write_vectors_to_hdf if use_hdf else write_vectors_to_disk
    writer(reduced_mat, rows, column_names, events_file)
def run_glove():
    """
    Train GloVe vectors with the external shell script and convert the result
    to a Byblo-compatible HDF file.

    Runs `glove_script` on `unlabelled_data` from inside `args.glove_dir`,
    reads the space-separated vectors from `raw_vectors_file`, drops
    junk-looking entries and writes the rest to `formatted_vectors_file`
    (creating `output_dir` if needed). All of these are module-level names.
    """
    import csv  # local import: only needed for the quoting constant below

    logging.info('Starting training')
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # Convert their format to ours. The first column holds arbitrary corpus
    # tokens, so two pandas defaults must be switched off:
    #  - quoting: a token starting with '"' would otherwise open a quoted
    #    field and could swallow the following lines;
    #  - default NA strings: tokens such as "null", "nan", "NA" or "None"
    #    would otherwise be turned into float NaN index entries, which the
    #    string-based filtering below cannot handle.
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None,
                     quoting=csv.QUOTE_NONE, keep_default_na=False)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))

    # remove any junk-looking tokens, they'll get in the way later:
    # keep entries that parse to a non-EMPTY feature and are 4 to 19 characters long
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)

    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
def compose_and_write_vectors(unigram_vectors, short_vector_dataset_name, composer_classes,
                              remove_pos=False,
                              pretrained_Baroni_composer_file=None,
                              pretrained_Guevara_composer_file=None,
                              pretrained_Gref_composer_file=None,
                              categorical_vector_matrix_file=None,
                              output_dir='.', gzipped=True, dense_hd5=False,
                              row_filter=default_row_filter):
    """
    Compose a vector for every phrase returned by `get_all_document_features`
    and dump the result to disk, one output file per composer.

    Each output file is named
    "composed_<short_vector_dataset_name>_<composer.name>.events.filtered.strings"
    and placed in `output_dir`. A composer that raises ValueError while
    composing or writing is logged and skipped; the remaining composers are
    still run.

    :param unigram_vectors: a file in Byblo events format that contains
     vectors for all unigrams OR a Vectors object. Used in the composition
     process.
    :type unigram_vectors: str or Vectors
    :param short_vector_dataset_name: label embedded in the output file names
    :param composer_classes: what composers to use
    :type composer_classes: list
    :param remove_pos: forwarded to `get_all_document_features`
    :param pretrained_Baroni_composer_file: path to pre-trained Baroni
     composer file; required when BaroniComposer is requested
    :param pretrained_Guevara_composer_file: required when GuevaraComposer is
     requested
    :param pretrained_Gref_composer_file: required when
     GrefenstetteMultistepComposer is requested
    :param categorical_vector_matrix_file: first constructor argument of the
     CopyObject, FrobeniusAdd and FrobeniusMult composers
    :param output_dir: directory the output files are written to
    :param gzipped: forwarded to `write_vectors_to_disk` (text output only)
    :param dense_hd5: if true write with `write_vectors_to_hdf`, otherwise
     with `write_vectors_to_disk`
    :param row_filter: forwarded to `Vectors.from_tsv` when `unigram_vectors`
     has to be loaded from a file
    """
    phrases_to_compose = get_all_document_features(remove_pos=remove_pos)

    # anything that isn't a Vectors object is assumed to be the name of a
    # file containing vectors. Composers do not need any ngram vectors
    # contained in that file (they may well be observed ones), hence the filter.
    if not isinstance(unigram_vectors, Vectors):
        unigram_vectors = Vectors.from_tsv(unigram_vectors, row_filter=row_filter)
    logging.info('Starting composition with %d unigram vectors', len(unigram_vectors))

    # feature types that make it into the text output
    written_types = {'AN', 'NN', 'VO', 'SVO', '1-GRAM'}

    # doing this loop in parallel isn't worth it as pickling or shelving the
    # vectors is so slow it negates any gains from using multiple cores
    for composer_class in composer_classes:
        if composer_class == BaroniComposer:
            assert pretrained_Baroni_composer_file is not None
            composer = BaroniComposer(unigram_vectors, pretrained_Baroni_composer_file)
        elif composer_class == GuevaraComposer:
            assert pretrained_Guevara_composer_file is not None
            composer = GuevaraComposer(unigram_vectors, pretrained_Guevara_composer_file)
        elif composer_class == GrefenstetteMultistepComposer:
            assert pretrained_Gref_composer_file is not None
            composer = GrefenstetteMultistepComposer(unigram_vectors, pretrained_Gref_composer_file)
        elif composer_class in (CopyObject, FrobeniusAdd, FrobeniusMult):
            # these take the categorical matrix file first
            composer = composer_class(categorical_vector_matrix_file, unigram_vectors)
        else:
            composer = composer_class(unigram_vectors)

        try:
            # per the original notes, compose_all returns all unigrams and
            # composed phrases
            mat, cols, rows = composer.compose_all(phrases_to_compose)

            file_name = 'composed_%s_%s.events.filtered.strings' % (short_vector_dataset_name,
                                                                    composer.name)
            events_path = os.path.join(output_dir, file_name)

            if not dense_hd5:
                # the text writer wants row index -> DocumentFeature
                rows2idx = {}
                for name, idx in rows.items():
                    rows2idx[idx] = DocumentFeature.from_string(name)
                write_vectors_to_disk(mat.tocoo(), rows2idx, cols, events_path,
                                      entry_filter=lambda x: x.type in written_types,
                                      gzipped=gzipped)
            else:
                write_vectors_to_hdf(mat, rows, cols, events_path)
        except ValueError as e:
            # log and move on to the next composer
            logging.error('RED ALERT, RED ALERT')
            logging.error(e)