args.pred) + '_refer.actions'
fwRefer = open(REFER_PATH, 'w', encoding="UTF-8")
phrase_pairs, emb_dict = [], dict()
TEST_QUESTION_PATH = '../data/auto_QA_data/nomask_test/' + str(args.pred).upper() + '_test.question'
log.info("Open: %s", TEST_QUESTION_PATH)
TEST_ACTION_PATH = '../data/auto_QA_data/nomask_test/' + str(args.pred).upper() + '_test.action'
log.info("Open: %s", TEST_ACTION_PATH)
if args.pred == 'pt' or 'final' in args.pred:
    phrase_pairs, emb_dict = data.load_data_from_existing_data(TEST_QUESTION_PATH, TEST_ACTION_PATH, DIC_PATH)
elif args.pred == 'rl':
    phrase_pairs, emb_dict = data.load_RL_data(TEST_QUESTION_PATH, TEST_ACTION_PATH, DIC_PATH)
log.info("Obtained %d phrase pairs with %d uniq words", len(phrase_pairs), len(emb_dict))
train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict)
# RL data may pair one question with several candidate actions; group accordingly.
if args.pred == 'rl':
    train_data = data.group_train_data(train_data)
else:
    train_data = data.group_train_data_one_to_one(train_data)
rev_emb_dict = {idx: word for word, idx in emb_dict.items()}
net = model.PhraseModel(emb_size=model.EMBEDDING_DIM, dict_size=len(emb_dict), hid_size=model.HIDDEN_STATE_SIZE)
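# ---------------------------------------------------------------------------
# For context: rev_emb_dict (built above) inverts the token->index vocabulary
# so that decoded index sequences can be turned back into action strings
# before being written out. A minimal sketch of that step; the demo vocabulary
# and the "#END" sentinel are illustrative assumptions, not the project's
# actual dictionary contents.
emb_dict_demo = {"#BEG": 0, "#END": 1, "A1": 2, "(": 3, "E1": 4, ")": 5}
rev_emb_dict_demo = {idx: word for word, idx in emb_dict_demo.items()}

decoded = [2, 3, 4, 5, 1]                           # indices emitted by the model
words = [rev_emb_dict_demo[t] for t in decoded]     # map back to surface tokens
action = " ".join(w for w in words if w != "#END")  # strip the end sentinel
print(action)                                       # -> A1 ( E1 )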
parser = argparse.ArgumentParser()
# parser.add_argument("--data", required=True, help="Category to use for training. "
#                     "Empty string to train on full processDataset")
parser.add_argument("--cuda", action='store_true', default=False, help="Enable cuda")
parser.add_argument("-n", "--name", required=True, help="Name of the run")
args = parser.parse_args()

device = torch.device("cuda" if args.cuda else "cpu")
log.info("Device info: %s", str(device))

saves_path = os.path.join(SAVES_DIR, args.name)
os.makedirs(saves_path, exist_ok=True)

# Load the paired input-output phrases and the shared vocabulary; these can be
# replaced with your own pairs and dictionary.
# phrase_pairs, emb_dict = data.load_data(genre_filter=args.data)
phrase_pairs, emb_dict = data.load_data_from_existing_data(TRAIN_QUESTION_PATH, TRAIN_ACTION_PATH, DIC_PATH, MAX_TOKENS)
# Index -> word.
rev_emb_dict = {idx: word for word, idx in emb_dict.items()}
log.info("Obtained %d phrase pairs with %d uniq words", len(phrase_pairs), len(emb_dict))
data.save_emb_dict(saves_path, emb_dict)
end_token = emb_dict[data.END_TOKEN]
# Convert tokens into their indices in emb_dict.
train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict)
rand = np.random.RandomState(data.SHUFFLE_SEED)
rand.shuffle(train_data)
log.info("Training data converted, got %d samples", len(train_data))
train_data, test_data = data.split_train_test(train_data)
log.info("Train set has %d phrases, test %d", len(train_data), len(test_data))
net = attention_model.PhraseModel(emb_size=attention_model.EMBEDDING_DIM, dict_size=len(emb_dict),
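# ---------------------------------------------------------------------------
# The fixed data.SHUFFLE_SEED above makes the shuffle, and hence the
# train/test split, reproducible across runs. A minimal sketch of the same
# pattern; the seed value and the 95/5 ratio are illustrative assumptions,
# not the project's actual constants.
import numpy as np

def split_train_test_demo(samples, seed=5871, train_ratio=0.95):
    samples = list(samples)
    rand = np.random.RandomState(seed)  # same seed -> same permutation
    rand.shuffle(samples)
    split = int(len(samples) * train_ratio)
    return samples[:split], samples[split:]

train_demo, test_demo = split_train_test_demo(range(100))
print(len(train_demo), len(test_demo))  # -> 95 5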
device = torch.device("cuda" if args.cuda else "cpu")
log.info("Device info: %s", str(device))

saves_path = os.path.join(SAVES_DIR, args.name)
os.makedirs(saves_path, exist_ok=True)

# Load the input-output pairs and the corresponding dictionary.
if not args.int:
    log.info("Training model without INT mask information...")
    if args.dataset == "csqa":
        phrase_pairs, emb_dict = data.load_data_from_existing_data(
            TRAIN_QUESTION_PATH, TRAIN_ACTION_PATH, DIC_PATH, MAX_TOKENS)
    else:
        phrase_pairs, emb_dict = data.load_data_from_existing_data(
            TRAIN_QUESTION_PATH_WEBQSP, TRAIN_ACTION_PATH_WEBQSP, DIC_PATH_WEBQSP, MAX_TOKENS)
else:
    log.info("Training model with INT mask information...")
    if args.dataset == "csqa":
        phrase_pairs, emb_dict = data.load_data_from_existing_data(
            TRAIN_QUESTION_PATH_INT, TRAIN_ACTION_PATH_INT, DIC_PATH_INT, MAX_TOKENS_INT)
    else:
        phrase_pairs, emb_dict = data.load_data_from_existing_data(
            TRAIN_QUESTION_PATH_INT_WEBQSP, TRAIN_ACTION_PATH_INT_WEBQSP, DIC_PATH_INT_WEBQSP, MAX_TOKENS_INT)
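# ---------------------------------------------------------------------------
# The branches above key off args.int and args.dataset, whose definitions fall
# outside this excerpt. A plausible parser setup consistent with that usage
# (a sketch, not the project's actual argument list; the "webqsp" choice is
# inferred from the *_WEBQSP path constants):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--cuda", action='store_true', default=False, help="Enable cuda")
parser.add_argument("-n", "--name", required=True, help="Name of the run")
parser.add_argument("--int", action='store_true', default=False,
                    help="Train with INT mask information")
parser.add_argument("--dataset", choices=["csqa", "webqsp"], default="csqa",
                    help="Which dataset's question/action files to load")
args = parser.parse_args()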