phrase_pairs, emb_dict = [], {}
TEST_QUESTION_PATH = '../data/auto_QA_data/nomask_test/' + str(args.pred).upper() + '_test.question'
log.info("Open: %s", TEST_QUESTION_PATH)
TEST_ACTION_PATH = '../data/auto_QA_data/nomask_test/' + str(args.pred).upper() + '_test.action'
log.info("Open: %s", TEST_ACTION_PATH)
# Load the test pairs with the loader matching the prediction mode.
if args.pred == 'pt' or 'final' in args.pred:
    phrase_pairs, emb_dict = data.load_data_from_existing_data(TEST_QUESTION_PATH, TEST_ACTION_PATH, DIC_PATH)
elif args.pred == 'rl':
    phrase_pairs, emb_dict = data.load_RL_data(TEST_QUESTION_PATH, TEST_ACTION_PATH, DIC_PATH)
log.info("Obtained %d phrase pairs with %d uniq words", len(phrase_pairs), len(emb_dict))
train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict)
if args.pred == 'rl':
    # RL data keeps the 1:N grouping: one question mapped to several candidate actions.
    train_data = data.group_train_data(train_data)
else:
    train_data = data.group_train_data_one_to_one(train_data)
rev_emb_dict = {idx: word for word, idx in emb_dict.items()}
net = model.PhraseModel(emb_size=model.EMBEDDING_DIM, dict_size=len(emb_dict),
                        hid_size=model.HIDDEN_STATE_SIZE)
net = net.cuda()
model_path = '../data/saves/' + str(args.name) + '/' + str(args.model)
net.load_state_dict(torch.load(model_path))
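# A minimal sketch of how the restored network could decode one grouped test
# sample, assuming this fork keeps the upstream PhraseModel helpers
# (model.pack_input, net.encode, net.decode_chain_argmax) and data.MAX_TOKENS;
# those names are assumptions, not confirmed by this script.
net.eval()
with torch.no_grad():
    question_tokens, _ = train_data[0]  # one (question, [actions]) group
    input_seq = model.pack_input(question_tokens, net.emb, device="cuda")
    enc = net.encode(input_seq)
    # Greedy decoding until the END token or the length limit is reached.
    _, out_tokens = net.decode_chain_argmax(enc, input_seq.data[0:1],
                                            seq_len=data.MAX_TOKENS,
                                            stop_at_token=emb_dict[data.END_TOKEN])
    log.info("Sample prediction: %s",
             " ".join(rev_emb_dict.get(tok, '?') for tok in out_tokens))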
help="Using attention mechanism in seq2seq") parser.add_argument("--lstm", type=lambda x: (str(x).lower() in ['true', '1', 'yes']), help="Using LSTM mechanism in seq2seq") args = parser.parse_args() device = torch.device("cuda" if args.cuda else "cpu") log.info("Device info: %s", str(device)) saves_path = os.path.join(SAVES_DIR, args.name) os.makedirs(saves_path, exist_ok=True) # phrase_pairs, emb_dict = data.load_data('comedy') # List of (seq1, [seq*]) pairs, the training pairs are in format of 1:N. phrase_pairs, emb_dict = data.load_RL_data(TRAIN_QUESTION_PATH, TRAIN_ACTION_PATH, DIC_PATH, MAX_TOKENS) log.info("Obtained %d phrase pairs with %d uniq words from %s and %s.", len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_PATH, TRAIN_ACTION_PATH) data.save_emb_dict(saves_path, emb_dict) end_token = emb_dict[data.END_TOKEN] train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict) # list of (seq1, [seq*]) pairs,把训练对做成1:N的形式; train_data = data.group_train_data(train_data) rand = np.random.RandomState(data.SHUFFLE_SEED) rand.shuffle(train_data) train_data, test_data = data.split_train_test(train_data, TRAIN_RATIO) log.info("Training data converted, got %d samples", len(train_data)) log.info("Train set has %d phrases, test %d", len(train_data), len(test_data))