Example No. 1
def main(_):
    """Train a word2vec model."""
    if not FLAGS.train_data or not FLAGS.save_path:
        print("--train_data and --save_path must be specified.")
        sys.exit(1)
    opts = Options()
    bf = None
    with tf.Graph().as_default(), tf.Session() as session:
        if opts.plk_table:
            bf = bloomfilter()
            bf.load(opts.plk_table)

        if FLAGS.interactive and opts.plk_table and opts.restore_model:
            with tf.device("/cpu:0"):
                model = Word2Vec(opts, session, False, bf)
                model.saver.restore(session,
                                    tf.train.latest_checkpoint(opts.save_path))

            _start_shell(locals())
        else:
            with tf.device("/cpu:0"):
                model = Word2Vec(opts, session, True, bf)
            if opts.restore_model:
                model.saver.restore(session,
                                    tf.train.latest_checkpoint(opts.save_path))
            for _ in range(opts.epochs_to_train):
                model.train()  # Process one epoch
                # model.eval()  # Eval analogies.
            # Perform a final save.
            model.saver.save(session,
                             os.path.join(opts.save_path, "model.ckpt"),
                             global_step=model.global_step)
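
Example 1 calls `_start_shell(locals())`, which the excerpt does not define. In the stock TensorFlow word2vec example this code follows, the helper drops into an IPython shell so the restored model can be inspected by hand; a minimal sketch along those lines:

def _start_shell(local_ns=None):
    """Open an interactive IPython shell with the caller's locals in scope."""
    import IPython
    user_ns = {}
    if local_ns:
        user_ns.update(local_ns)
    user_ns.update(globals())
    IPython.start_ipython(argv=[], user_ns=user_ns)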
Example No. 2
    def __init__(self, name, size, k):
        """Wrap a bloomfilter of the given size using k hash functions."""
        self.bloomfilter = bloomfilter(name=name, size=size, k=k)
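
None of the examples show the `bloomfilter` class itself. The following is a minimal sketch of the interface they appear to rely on; the hashing scheme and the pickle-based `save`/`load` are assumptions inferred from the call sites, not the repository's actual code:

import hashlib
import pickle


class bloomfilter(object):
    """Bloom-filter-style lookup: k hash tables mapping slot index -> words."""

    def __init__(self, name='bf', size=1024, k=3):
        self.name = name
        self.size = size  # number of slots per table
        self.k = k        # number of hash functions/tables
        self.tables = [dict() for _ in range(k)]

    def get_indice(self, word):
        """Return the k slot indices that encode `word`."""
        digest = hashlib.md5(word.encode('utf-8')).hexdigest()
        step = len(digest) // self.k
        return [int(digest[i * step:(i + 1) * step], 16) % self.size
                for i in range(self.k)]

    def add(self, word):
        """Insert `word`, remembering which words hit each slot."""
        for table, idx in zip(self.tables, self.get_indice(word)):
            table.setdefault(idx, set()).add(word)

    def get_opcode_in_table(self, table_idx, slot_idx):
        """Return the set of words whose table_idx-th hash landed in slot_idx."""
        return self.tables[table_idx].get(slot_idx, set())

    def save(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, path):
        with open(path, 'rb') as f:
            self.__dict__.update(pickle.load(f))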
Example No. 3
    # Free the large intermediates before returning.
    del words
    del count
    del filter_set
    del most_common_words

    return unsorted_res


n_words = 50000

print('Read vocabulary from {}...'.format(sys.argv[1]), end='')
most_common_words = read_data(sys.argv[1], n_words)
print('Done')

print('Loading bloomfilter...', end='')
bloomfilter = bf.bloomfilter()  # `bf` is the bloomfilter module, imported above this excerpt
bloomfilter.load(sys.argv[2])
print('Done')

output_file = open(sys.argv[3], 'w')

for word_tuple, word_count in most_common_words.items():
    # Each element of word_tuple selects a candidate set from one lookup
    # table; intersecting all of them leaves only the words consistent
    # with every index.
    possible_words = set()
    for idx, val in enumerate(word_tuple):
        if idx == 0:
            possible_words = bloomfilter.get_opcode_in_table(idx, val)
        else:
            possible_words = possible_words & bloomfilter.get_opcode_in_table(
                idx, val)

    if not possible_words:
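
A hypothetical round trip with the sketch from Example No. 2 mirrors what the loop above does: encode an opcode as its slot indices, then intersect the per-table candidate sets to recover it. (The real pipeline stores the indices sorted, as Example No. 4 shows; raw table order is kept here for simplicity.)

bf = bloomfilter(name='demo', size=1024, k=3)
for opcode in ('mov', 'push', 'call'):
    bf.add(opcode)

indices = bf.get_indice('push')
candidates = set.intersection(
    *(bf.get_opcode_in_table(i, v) for i, v in enumerate(indices)))
print(candidates)  # {'push'}, unless another opcode collides in all k tables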
Example No. 4
            res.append(tuple(sorted(word_idx_list)))
            cnt += len(line)
            line_num += 1
            if line_num % 10000 == 0:
                progress(cnt, total)
    return res


# Step 1: Read hash lists
vocabulary = read_data(in_hash_path)
log.debug('Data size: %d', len(vocabulary))

# Step 1.5: construct the 'UNK' tuple
# [FIXIT] we do not check the UNK filter's max_bf_size and k;
#         the result will be wrong if they do not match the input's
bloom_filter = bloomfilter(name="UNK", size=args.max_bf_size, k=args.k)
unknow_indice = tuple(sorted(bloom_filter.get_indice('UNK')))
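
# A possible guard for the FIXIT above (hypothetical: it assumes the
# preprocessing step saved its parameters in a sidecar file next to the
# hash lists, a layout this excerpt does not show):
import json
import os

meta_path = in_hash_path + '.meta.json'  # hypothetical sidecar file
if os.path.exists(meta_path):
    with open(meta_path) as f:
        meta = json.load(f)
    if meta.get('max_bf_size') != args.max_bf_size or meta.get('k') != args.k:
        raise ValueError('UNK filter parameters do not match the input data')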


# Step 2: Build the dictionary and replace rare words with UNK token.
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""

    count = []
    rank_matrix = []
    words_counter = collections.Counter(words)
    count.extend(words_counter.most_common(n_words))
    count.extend([(unknow_indice, -1)])
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
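    # Hypothetical continuation: the excerpt stops above. Assuming the
    # function finishes the way the stock word2vec build_dataset does, but
    # with the UNK entry appended last (as here) rather than first, it
    # might read:
    unk_index = dictionary[unknow_indice]
    data = [dictionary.get(word, unk_index) for word in words]
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary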