def main(_):
  """Train a word2vec model."""
  if not FLAGS.train_data or not FLAGS.save_path:
    print("--train_data and --save_path must be specified.")
    sys.exit(1)
  opts = Options()
  bf = None
  with tf.Graph().as_default(), tf.Session() as session:
    if opts.plk_table:
      bf = bloomfilter()
      bf.load(opts.plk_table)
    if FLAGS.interactive and opts.plk_table and opts.restore_model:
      with tf.device("/cpu:0"):
        model = Word2Vec(opts, session, False, bf)
        model.saver.restore(session,
                            tf.train.latest_checkpoint(opts.save_path))
        _start_shell(locals())
    else:
      with tf.device("/cpu:0"):
        model = Word2Vec(opts, session, True, bf)
        if opts.restore_model:
          model.saver.restore(session,
                              tf.train.latest_checkpoint(opts.save_path))
        for _ in xrange(opts.epochs_to_train):
          model.train()  # Process one epoch.
        # model.eval()  # Eval analogies.
        # Perform a final save.
        model.saver.save(session,
                         os.path.join(opts.save_path, "model.ckpt"),
                         global_step=model.global_step)
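
# The _start_shell helper invoked above is not shown in this excerpt. A
# minimal sketch follows, assuming the IPython-based implementation from the
# original TensorFlow word2vec example that this script appears to extend.
def _start_shell(local_ns=None):
  """Drop into an interactive IPython shell seeded with the given namespace."""
  import IPython
  user_ns = {}
  if local_ns:
    user_ns.update(local_ns)
  user_ns.update(globals())
  IPython.start_ipython(argv=[], user_ns=user_ns)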
def __init__(self, name, size, k):
    """Wrap a bloomfilter with the given name, a bit array of `size` bits,
    and `k` hash functions."""
    self.bloomfilter = bloomfilter(name=name, size=size, k=k)
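
# Usage sketch for the bloom filter above, using only calls that appear
# elsewhere in this codebase (load, get_indice, get_opcode_in_table). The
# module name `bf` is taken from the lookup script; the size, k, and path
# values are illustrative, and whether these methods live on the wrapper or
# the underlying filter is not shown in this excerpt.
import bf

filt = bf.bloomfilter(name='mov', size=256, k=3)
indices = tuple(sorted(filt.get_indice('mov')))  # bit indices set for 'mov'
# A previously pickled lookup table can be restored and queried per slot:
# filt.load('opcode_table.plk')
# candidates = filt.get_opcode_in_table(0, indices[0])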
    del words
    del count
    del filter_set
    del most_common_words
    return unsorted_res


n_words = 50000
print('Reading vocabulary from {}...'.format(sys.argv[1]), end='')
most_common_words = read_data(sys.argv[1], n_words)
print('Done')
print('Loading bloomfilter...', end='')
bloomfilter = bf.bloomfilter()
bloomfilter.load(sys.argv[2])
print('Done')
output_file = open(sys.argv[3], 'w')
for word_tuple, word_count in most_common_words.items():
    # Decode the index tuple back to opcodes: intersect, slot by slot, the
    # sets of words whose hashes hit the recorded bit.
    possible_words = set()
    for idx, val in enumerate(word_tuple):
        if idx == 0:
            possible_words = bloomfilter.get_opcode_in_table(idx, val)
        else:
            possible_words = possible_words & bloomfilter.get_opcode_in_table(idx, val)
    if len(possible_words) == 0:
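
# The decoding loop above can be read as a set intersection: each position of
# the index tuple selects the set of opcodes whose hash hit that bit, and only
# words present in every slot's set survive. A self-contained toy version of
# that logic (the tables and words below are invented for illustration):
tables = [
    {3: {'mov', 'add'}, 7: {'xor'}},  # slot 0: bit index -> candidate opcodes
    {1: {'mov'}, 4: {'add', 'xor'}},  # slot 1
]
word_tuple = (3, 1)  # sorted bit indices for one hashed word
possible = set()
for idx, val in enumerate(word_tuple):
    slot_words = tables[idx].get(val, set())
    possible = slot_words if idx == 0 else possible & slot_words
print(possible)  # {'mov'}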
        res.append(tuple(sorted(word_idx_list)))
        cnt += len(line)
        line_num += 1
        if line_num % 10000 == 0:
            progress(cnt, total)
    return res


# Step 1: Read hash lists.
vocabulary = read_data(in_hash_path)
log.debug('Data size: ' + str(len(vocabulary)))

# Step 1.5: Construct the 'UNK' tuple.
# [FIXME] We do not check that max_bf_size and k match those used to build
# the input; the 'UNK' indices will be wrong if they differ.
bloom_filter = bloomfilter(name="UNK", size=args.max_bf_size, k=args.k)
unknow_indice = tuple(sorted(bloom_filter.get_indice('UNK')))


# Step 2: Build the dictionary and replace rare words with the UNK token.
def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
    count = []
    rank_matrix = []
    words_counter = collections.Counter(words)
    count.extend(words_counter.most_common(n_words))
    count.append((unknow_indice, -1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
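
# To make the id assignment above concrete: most_common() orders entries by
# frequency, the UNK tuple is appended with a sentinel count of -1, and each
# entry's id is simply its insertion order. A toy run (the index tuples and
# the (0, 0) UNK tuple below are made up for demonstration):
import collections

words = [(1, 4), (2, 5), (1, 4), (1, 4), (2, 5), (3, 6)]
count = collections.Counter(words).most_common(2)  # [((1, 4), 3), ((2, 5), 2)]
count.append(((0, 0), -1))                         # hypothetical UNK tuple
dictionary = {word: i for i, (word, _) in enumerate(count)}
print(dictionary)  # {(1, 4): 0, (2, 5): 1, (0, 0): 2}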