示例#1
0
    # Build the test-set file paths for the selected prediction mode. The
    # mode name is upper-cased to form the file stem ('<MODE>_test.*').
    test_file_stem = '../data/auto_QA_data/nomask_test/' + str(
        args.pred).upper() + '_test'
    TEST_QUESTION_PATH = test_file_stem + '.question'
    # Log the very path that is opened, so the message cannot drift from it.
    log.info("Open: %s", TEST_QUESTION_PATH)
    TEST_ACTION_PATH = test_file_stem + '.action'
    log.info("Open: %s", TEST_ACTION_PATH)

    # Pick the loader that matches the prediction mode.
    if args.pred == 'pt' or 'final' in args.pred:
        phrase_pairs, emb_dict = data.load_data_from_existing_data(
            TEST_QUESTION_PATH, TEST_ACTION_PATH, DIC_PATH)
    elif args.pred == 'rl':
        phrase_pairs, emb_dict = data.load_RL_data(TEST_QUESTION_PATH,
                                                   TEST_ACTION_PATH, DIC_PATH)
    else:
        # Without this branch an unsupported mode carried on with a list
        # placeholder for emb_dict and only failed later, at
        # emb_dict.items(), with an unrelated AttributeError.
        raise ValueError(
            "Unsupported args.pred value %r: expected 'pt', 'rl' or a value "
            "containing 'final'" % (args.pred,))
    log.info("Obtained %d phrase pairs with %d uniq words", len(phrase_pairs),
             len(emb_dict))

    # Encode the pairs with the dictionary, then group them: the 'rl' mode
    # uses group_train_data, every other mode the one-to-one grouping.
    train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict)
    if args.pred == 'rl':
        train_data = data.group_train_data(train_data)
    else:
        train_data = data.group_train_data_one_to_one(train_data)
    # Inverse of emb_dict: maps each index back to its word.
    rev_emb_dict = {idx: word for word, idx in emb_dict.items()}

    # Build the model sized to the loaded dictionary, move it to the GPU and
    # restore the saved weights from ../data/saves/<name>/<model>.
    net = model.PhraseModel(emb_size=model.EMBEDDING_DIM,
                            dict_size=len(emb_dict),
                            hid_size=model.HIDDEN_STATE_SIZE)
    net = net.cuda()
    model_path = '../data/saves/' + str(args.name) + '/' + str(args.model)
    net.load_state_dict(torch.load(model_path))
示例#2
0
                        help="Using attention mechanism in seq2seq")
    # --lstm takes an explicit value: 'true', '1' or 'yes' (case-insensitive)
    # parse to True, any other supplied value parses to False.
    parser.add_argument("--lstm",
                        type=lambda x:
                        (str(x).lower() in ['true', '1', 'yes']),
                        help="Using LSTM mechanism in seq2seq")
    args = parser.parse_args()
    # Select the CUDA device only when args.cuda is truthy; otherwise the CPU.
    device = torch.device("cuda" if args.cuda else "cpu")
    log.info("Device info: %s", str(device))

    # Output directory for this run, named after args.name; created if it
    # does not exist yet.
    saves_path = os.path.join(SAVES_DIR, args.name)
    os.makedirs(saves_path, exist_ok=True)

    # phrase_pairs, emb_dict = data.load_data('comedy')
    # List of (seq1, [seq*]) pairs, the training pairs are in format of 1:N.
    phrase_pairs, emb_dict = data.load_RL_data(TRAIN_QUESTION_PATH,
                                               TRAIN_ACTION_PATH, DIC_PATH,
                                               MAX_TOKENS)
    log.info("Obtained %d phrase pairs with %d uniq words from %s and %s.",
             len(phrase_pairs), len(emb_dict), TRAIN_QUESTION_PATH,
             TRAIN_ACTION_PATH)
    # Save the dictionary into the run's output directory.
    data.save_emb_dict(saves_path, emb_dict)
    # Dictionary entry stored under data.END_TOKEN.
    end_token = emb_dict[data.END_TOKEN]
    train_data = data.encode_phrase_pairs(phrase_pairs, emb_dict)
    # List of (seq1, [seq*]) pairs: group the training pairs into 1:N form.
    train_data = data.group_train_data(train_data)
    # Shuffle in place with a RandomState seeded by data.SHUFFLE_SEED, then
    # split into train and test parts according to TRAIN_RATIO.
    rand = np.random.RandomState(data.SHUFFLE_SEED)
    rand.shuffle(train_data)
    train_data, test_data = data.split_train_test(train_data, TRAIN_RATIO)
    log.info("Training data converted, got %d samples", len(train_data))
    log.info("Train set has %d phrases, test %d", len(train_data),
             len(test_data))