def calculate_unigram_vectors(byblo_conf_file, byblo_base_dir):
    """
    Runs Byblo with the stage in its conf file set to 1, then un-indexes the vectors it produced.

    The stage is only changed for the duration of the run: it is put back to 0 even if Byblo fails, so a crashed
    run does not leave a modified conf file behind for the next caller.

    :param byblo_conf_file: path to the Byblo conf file. Its input and output options determine the prefix of
     the files that get un-indexed
    :param byblo_base_dir: directory that is made the working directory (via temp_chdir) while Byblo runs
    """
    # find out where the conf file said output should go
    opts, _ = parse_byblo_conf_file(byblo_conf_file)
    byblo_output_prefix = join(opts.output, basename(opts.input))

    # get byblo to calculate vectors for all entries
    set_stage_in_byblo_conf_file(byblo_conf_file, 1)
    with temp_chdir(byblo_base_dir):
        try:
            run_byblo(byblo_conf_file)
        finally:
            # restore the conf file whether or not byblo succeeded
            set_stage_in_byblo_conf_file(byblo_conf_file, 0)
        # get vectors as strings
        unindex_all_byblo_vectors(byblo_output_prefix)
def run_glove():
    """
    Runs the GloVe training script, then filters the vectors it produced and stores them as HDF.

    Works entirely off module-level settings (args.glove_dir, glove_script, unlabelled_data, raw_vectors_file,
    output_dir and formatted_vectors_file). The raw vectors are read as a space-separated table without a header,
    whose first column holds the token. A token is kept only if DocumentFeature can parse it into something other
    than 'EMPTY' and it is between 4 and 19 characters long.
    """

    def _worth_keeping(token):
        # unparsable, very short and very long tokens would only get in the way later
        return DocumentFeature.from_string(token).type != 'EMPTY' and 3 < len(token) < 20

    logging.info('Starting training')
    training_command = 'sh {} {}'.format(glove_script, unlabelled_data)
    with temp_chdir(args.glove_dir):
        run_and_log_output(training_command)

    # convert their format to ours
    vectors = pd.read_csv(raw_vectors_file, sep=' ', index_col=0, header=None)
    logging.info('Done training, filtering junk and converting %d vectors to Byblo-compatible format', len(vectors))

    keep = [_worth_keeping(token) for token in vectors.index]
    logging.info('Keeping %d entries', sum(keep))
    logging.info('Shape of vectors before filtering %r', vectors.shape)
    vectors = vectors[keep]
    logging.info('Shape of vectors after filtering %r', vectors.shape)

    # the output format wants named columns: f0, f1, ...
    num_dimensions = vectors.shape[1]
    column_names = ['f%d' % dim for dim in range(num_dimensions)]
    mkdirs_if_not_exists(output_dir)
    write_vectors_to_hdf(vectors.values, vectors.index, column_names, formatted_vectors_file)
# Example #3
# 0
def build_thesaurus_out_of_vectors(vectors_path, out_dir, threads=4, num_neighbours=100, sim_function='Cosine',
                                   byblo_base_dir='/lustre/scratch/inf/mmb28/FeatureExtractionToolkit/Byblo-2.2.0'):
    """
    Builds a Byblo thesaurus out of the provided vectors, however these were constructed. This function will make an
    uncompressed copy of the provided vectors file- might be slow and use up a lot of extra space.

    All intermediate files, the generated Byblo conf file (conf.txt) and the output are placed in out_dir, which
    is created if needed.

    :param vectors_path: input vectors in byblo format, compressed or not
    :param out_dir: where to put the thesaurus and all temp files
    :param threads: number of byblo threads
    :param num_neighbours: number of nearest neighbours per entry to output
    :param sim_function: similarity measure between vectors to use. see byblo docs
    :param byblo_base_dir: directory that is made the working directory while Byblo runs. Defaults to the
     location this function has always used
    """
    # imported here rather than at module level, as in the original code
    from discoutils.thesaurus_loader import Vectors

    vectors_path = os.path.abspath(vectors_path)
    out_dir = os.path.abspath(out_dir)
    mkdirs_if_not_exists(out_dir)
    v = Vectors.from_tsv(vectors_path)

    # prepare the files that byblo expects; outf_basename is already absolute and inside out_dir
    outf_basename = os.path.join(out_dir, 'input')
    events_file = outf_basename + '.events.filtered.strings'
    entries_file = outf_basename + '.entries.filtered.strings'
    features_file = outf_basename + '.features.filtered.strings'

    v.to_plain_txt(events_file, entries_file, features_file)

    # write the byblo conf file, one token per line. The tokens are kept in a list instead of being split out of
    # a formatted string, so that a path containing whitespace stays on a single line
    conf_tokens = ['--input', outf_basename,
                   '--output', out_dir,
                   '--threads', threads,
                   '--similarity-min', '0.01',
                   '-k', num_neighbours,
                   '--measure', sim_function,
                   '--stages', 'allpairs,knn,unenumerate']
    conf_path = os.path.join(out_dir, 'conf.txt')
    with open(conf_path, 'w') as outf:
        outf.writelines('{}\n'.format(token) for token in conf_tokens)

    # go baby go
    with temp_chdir(byblo_base_dir):
        reindex_all_byblo_vectors(outf_basename)
        run_byblo(conf_path, touch_input_file=True)
        unindex_all_byblo_vectors(outf_basename)
# Example #4
# 0
def run_socher_code():
    """
    Feeds the phrases to compose to Socher's code and runs its phrase2Vector.sh script.

    Relies on the module-level phrases_to_compose, socher_input_file and socher_base_dir settings.
    """
    # Socher's code expects its input at socher_input_file, so link that to our list of phrases
    force_symlink(phrases_to_compose, socher_input_file)

    composer_script = './phrase2Vector.sh'
    with temp_chdir(socher_base_dir):
        # this takes a while
        run_and_log_output(composer_script)