def calculate_unigram_vectors(byblo_conf_file, byblo_base_dir):
    # find out where the conf file said output should go
    opts, _ = parse_byblo_conf_file(byblo_conf_file)
    byblo_output_prefix = join(opts.output, basename(opts.input))

    # get byblo to calculate vectors for all entries
    set_stage_in_byblo_conf_file(byblo_conf_file, 1)
    with temp_chdir(byblo_base_dir):
        run_byblo(byblo_conf_file)
    set_stage_in_byblo_conf_file(byblo_conf_file, 0)

    # get vectors as strings
    unindex_all_byblo_vectors(byblo_output_prefix)
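
# calculate_unigram_vectors leans on temp_chdir to run Byblo from its own checkout and then
# return to the original working directory. That helper is not shown in this excerpt; the
# sketch below is only an assumption of how such a context manager is typically written,
# not the project's actual implementation.
import os
from contextlib import contextmanager


@contextmanager
def temp_chdir_sketch(path):
    """Temporarily switch the working directory, restoring it on exit (even on error)."""
    old_dir = os.getcwd()
    os.chdir(path)
    try:
        yield
    finally:
        os.chdir(old_dir)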
def run_glove():
    logging.info('Starting training')
    # train GloVe by shelling out to its script from its own directory
    with temp_chdir(args.glove_dir):
        run_and_log_output('sh {} {}'.format(glove_script, unlabelled_data))

    # convert their format to ours
    df = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(df))

    # drop any junk-looking tokens, they'll get in the way later
    mask = [DocumentFeature.from_string(x).type != 'EMPTY' and 3 < len(x) < 20 for x in df.index]
    logging.info('Keeping %d entries', sum(mask))
    logging.info('Shape of vectors before filtering %r', df.shape)
    df = df[mask]
    logging.info('Shape of vectors after filtering %r', df.shape)

    cols = ['f%d' % i for i in range(df.shape[1])]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(df.values, df.index, cols, formatted_vectors_file)
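
# For reference, the raw GloVe vectors file parsed above is plain text: one token per line,
# followed by its vector components, all space-separated. The self-contained snippet below
# uses made-up tokens and values to show why read_csv(sep=' ', index_col=0, header=None)
# yields an entries-by-dimensions DataFrame whose index holds the tokens.
import io
import pandas as pd

_demo_glove_output = io.StringIO('the 0.41 -0.12 0.07\n'
                                 'cat/N 0.33 0.90 -0.45\n')  # hypothetical vectors
_demo_df = pd.read_csv(_demo_glove_output, sep=' ', index_col=0, header=None)
assert _demo_df.shape == (2, 3) and list(_demo_df.index) == ['the', 'cat/N']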
def build_thesaurus_out_of_vectors(vectors_path, out_dir, threads=4, num_neighbours=100, sim_function='Cosine'):
    """
    Builds a Byblo thesaurus out of the provided vectors, however these were constructed.
    This function will make an uncompressed copy of the provided vectors file, which might be
    slow and use up a lot of extra disk space.

    :param vectors_path: input vectors in Byblo format, compressed or not
    :param out_dir: where to put the thesaurus and all temp files
    :param threads: number of Byblo threads
    :param num_neighbours: number of nearest neighbours per entry to output
    :param sim_function: similarity measure between vectors to use. See the Byblo docs.
    """
    from discoutils.thesaurus_loader import Vectors

    BYBLO_BASE_DIR = '/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/Byblo-2.2.0'
    vectors_path = os.path.abspath(vectors_path)
    out_dir = os.path.abspath(out_dir)
    mkdirs_if_not_exists(out_dir)

    v = Vectors.from_tsv(vectors_path)

    # prepare the files that byblo expects
    outf_basename = os.path.join(out_dir, 'input')
    events_file = outf_basename + '.events.filtered.strings'
    entries_file = outf_basename + '.entries.filtered.strings'
    features_file = outf_basename + '.features.filtered.strings'
    v.to_plain_txt(events_file, entries_file, features_file)

    # write the byblo conf file, one option or value per line
    conf = '--input {} --output {} --threads {} --similarity-min 0.01 -k {} ' \
           '--measure {} --stages allpairs,knn,unenumerate'.format(outf_basename, out_dir, threads,
                                                                   num_neighbours, sim_function)
    conf_path = os.path.join(out_dir, 'conf.txt')
    with open(conf_path, 'w') as outf:
        for line in conf.split():
            outf.write(line)
            outf.write('\n')

    # go baby go
    with temp_chdir(BYBLO_BASE_DIR):
        reindex_all_byblo_vectors(outf_basename)
        run_byblo(conf_path, touch_input_file=True)
        unindex_all_byblo_vectors(outf_basename)
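
# The conf-writing loop above emits one whitespace-separated token per line, so the
# conf.txt handed to run_byblo ends up with each Byblo option and value on its own line.
# A standalone illustration of that layout, using hypothetical values and an in-memory
# buffer instead of a real file:
import io

_conf = '--input /out/input --output /out --threads 4 --similarity-min 0.01 -k 100 ' \
        '--measure Cosine --stages allpairs,knn,unenumerate'
_buf = io.StringIO()
for _tok in _conf.split():
    _buf.write(_tok)
    _buf.write('\n')
# _buf.getvalue() starts with '--input\n/out/input\n--output\n/out\n...'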
def run_socher_code():
    # symlink the file Socher's code expects to the list of phrases I'm interested in
    force_symlink(phrases_to_compose, socher_input_file)
    with temp_chdir(socher_base_dir):
        run_and_log_output('./phrase2Vector.sh')  # this takes a while
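
# force_symlink is assumed to overwrite any existing link so that repeated runs do not fail;
# the stand-in below is a plausible sketch under that assumption (Python 3), not the
# project's actual helper.
import os


def force_symlink_sketch(source, link_name):
    """Create a symlink at link_name pointing to source, replacing any existing link."""
    try:
        os.symlink(source, link_name)
    except FileExistsError:
        os.remove(link_name)
        os.symlink(source, link_name)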