import os

from gensim import models

# save_dir, plan_data_path, _plan_model_path, check_uptodate and
# gen_train_data are assumed to come from the project's path-config
# and data-utility modules.


def train_planner():
    """Vectorize the extracted keywords with gensim (Word2Vec planner)."""
    # TODO: try other keyword-expansion models, e.g.
    # model = models.FastText(keyword_lists, size=512, min_count=5)
    print("Training Word2Vec-based planner ...")
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    if not check_uptodate(plan_data_path):
        gen_train_data()
    # Each line of the plan data holds one tab-separated keyword list,
    # e.g. ['keyword1', 'keyword2', 'keyword3', 'keyword4'].
    keyword_lists = []
    with open(plan_data_path, 'r') as fin:
        for line in fin:
            keyword_lists.append(line.strip().split('\t'))
    # 512-dim embeddings; keywords seen fewer than 5 times are dropped.
    model = models.Word2Vec(keyword_lists, size=512, window=4, min_count=5)
    model.save(_plan_model_path)
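# A minimal usage sketch (an assumption, not part of the original code):
# load the saved planner model and expand a seed keyword by embedding
# similarity. `expand_keywords` is a hypothetical helper; the calls below
# are the gensim < 4.0 API matching the `size=` argument used above
# (gensim >= 4.0 renames it to `vector_size`).
def expand_keywords(seed, topn=3):
    model = models.Word2Vec.load(_plan_model_path)
    if seed not in model.wv.vocab:  # out-of-vocabulary seed: nothing to expand
        return []
    return [word for word, _ in model.wv.most_similar(seed, topn=topn)]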
def main(_):
    print("\nParameters: ")
    for k, v in sorted(FLAGS.__flags.items()):
        print("{} = {}".format(k, v))

    if not os.path.exists("./prepro/"):
        os.makedirs("./prepro/")

    if FLAGS.eval:
        # Inference path: restore the vocabulary and checkpoint, decode the
        # test features, and dump the generated captions as JSON.
        print("Evaluation...")
        feats, test_id = data_utils.load_test_data(FLAGS.test_id, FLAGS.test_dir)
        vocab_processor = VocabularyProcessor.restore(FLAGS.vocab)
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            model = load_model(sess, FLAGS.checkpoint_file, vocab_processor)
            sentences = greedy_inference(sess, model, feats, vocab_processor)
            # sentences = beam_search(sess, model, feats, vocab_processor)
            ans = []
            for idx, sentence in enumerate(sentences):
                ans.append({"caption": sentence, "id": test_id[idx]})
            with open(FLAGS.output, 'w') as fout:
                json.dump(ans, fout)
    else:
        # Training path: either preprocess the captions from scratch or
        # reload the cached preprocessing artifacts.
        if FLAGS.prepro:
            print("Start preprocessing data...")
            vocab_processor, train_dict = data_utils.load_text_data(
                train_lab=FLAGS.train_lab,
                prepro_train_p=FLAGS.prepro_train,
                vocab_path=FLAGS.vocab)
            print("Vocabulary size: {}".format(
                len(vocab_processor._reverse_mapping)))
            print("Start dumping word2vec matrix...")
            w2v_W = data_utils.build_w2v_matrix(vocab_processor,
                                                FLAGS.w2v_data,
                                                FLAGS.vector_file,
                                                FLAGS.embedding_dim)
        else:
            train_dict = cPickle.load(open(FLAGS.prepro_train, 'rb'))
            vocab_processor = VocabularyProcessor.restore(FLAGS.vocab)
            w2v_W = cPickle.load(open(FLAGS.w2v_data, 'rb'))

        print("Start generating training data...")
        feats, encoder_in_idx, decoder_in = data_utils.gen_train_data(
            FLAGS.train_dir, FLAGS.train_lab, train_dict)

        print("Start generating validation data...")
        v_encoder_in, truth_captions = data_utils.load_valid(
            FLAGS.valid_dir, FLAGS.valid_lab)

        # Optional extra task data (e.g. a peer-review/test set).
        t_encoder_in = None
        files = None
        if FLAGS.task_dir is not None:
            t_encoder_in, files = data_utils.load_task(FLAGS.task_dir)

        print('feats size: {}, training size: {}'.format(
            len(feats), len(encoder_in_idx)))
        print(encoder_in_idx.shape, decoder_in.shape)
        print(v_encoder_in.shape, len(truth_captions))

        data = Data(feats, encoder_in_idx, decoder_in,
                    v_encoder_in, truth_captions, t_encoder_in, files)
        model = CapGenModel(data, w2v_W, vocab_processor)
        model.build_model()
        model.train()
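# Sketch of the flag definitions and entry point that main(_) relies on,
# assuming the TF 1.x tf.app.flags API. The flag names match the FLAGS.*
# references above; the defaults and types shown here are illustrative
# assumptions, not the original values.
flags = tf.app.flags
flags.DEFINE_boolean("eval", False, "run inference instead of training")
flags.DEFINE_boolean("prepro", True, "preprocess captions before training")
flags.DEFINE_string("vocab", "./prepro/vocab", "saved VocabularyProcessor path")
flags.DEFINE_string("checkpoint_file", "", "checkpoint to restore for evaluation")
flags.DEFINE_string("output", "./output.json", "where generated captions are dumped")
flags.DEFINE_integer("embedding_dim", 300, "word-embedding dimension")
# ...the remaining path flags (train_dir, test_dir, test_id, train_lab,
# valid_dir, valid_lab, prepro_train, w2v_data, vector_file, task_dir)
# follow the same DEFINE_string pattern.
FLAGS = flags.FLAGS

if __name__ == "__main__":
    tf.app.run()  # parses the flags, then dispatches to main(_)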