Example #1
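Inference entry point for headline summarization: loads (or builds) an 80k-word vocabulary shared by source and target, batches the test articles, constructs a one-layer Transformer with tied embeddings, restores weights from args.ckpt_file, and runs my_test.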
def main():
    if not os.path.exists(args.ckpt_file):
        raise FileNotFoundError("model file not found")

    data_dir = '/home/tiankeke/workspace/datas/sumdata/'
    TRAIN_X = os.path.join(data_dir, 'train/train.article.txt')
    TRAIN_Y = os.path.join(data_dir, 'train/train.title.txt')
    TEST_X = args.input_file

    small_vocab_file = 'sumdata/small_vocab.json'
    if os.path.exists(small_vocab_file):
        small_vocab = json.load(open(small_vocab_file))
    else:
        small_vocab = build_vocab([TRAIN_X, TRAIN_Y],
                                  small_vocab_file,
                                  vocab_size=80000)

    max_src_len = 101
    max_tgt_len = 47

    test_x = BatchManager(load_data(TEST_X, max_src_len, args.n_test),
                          args.batch_size, small_vocab)

    model = Transformer(len(small_vocab),
                        len(small_vocab),
                        max_src_len,
                        d_word_vec=300,
                        d_model=300,
                        d_inner=1200,
                        n_layers=1,
                        n_head=6,
                        d_k=50,
                        d_v=50,
                        dropout=0.1,
                        tgt_emb_prj_weight_sharing=True,
                        emb_src_tgt_weight_sharing=True).cuda()
    # print(model)
    model.eval()

    saved_state = torch.load(args.ckpt_file)
    model.load_state_dict(saved_state['state_dict'])
    print('Loaded model parameters from %s' % args.ckpt_file)

    my_test(test_x, model, small_vocab)
Example #2
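Validation loop for a couplet model: feeds the first line (shang_lian) and the shifted second line (xia_lian) through the Transformer with teacher forcing and returns the average cross-entropy loss over the validation iterator.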
def evaluate(model: Transformer, criterion, device):
    model.eval()
    epoch_loss = 0
    print('evaluate')
    with torch.no_grad():
        for index, batch in enumerate(dataset_pro.valid_iter):
            shang_lian, shang_lian_length = batch.shang_lian
            shang_lian = shang_lian.permute(1, 0).to(device)
            # shang_lian_length = shang_lian_length.permute(1, 0).to(device)
            # shang_lian_length = shang_lian_length.numpy()
            # shang_lian_pos = torch.LongTensor(get_pos_ids(shang_lian_length, shang_lian.shape[1])).to(device)
            xia_lian, xia_lian_length = batch.xia_lian
            xia_lian = xia_lian.permute(1, 0).to(device)
            # xia_lian_length = xia_lian_length.numpy()
            # xia_lian_pos = torch.LongTensor(get_pos_ids(xia_lian_length, xia_lian.shape[1])).to(device)

            outputs = model(shang_lian, xia_lian[:, :-1])
            outputs = outputs.contiguous().view(-1, outputs.shape[-1])
            xia_lian = xia_lian[:, 1:].contiguous().view(-1)
            loss = criterion(outputs, xia_lian)
            epoch_loss += loss.item()
    return epoch_loss / len(dataset_pro.valid_iter)
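A minimal usage sketch (not part of the original project), assuming a PAD_IDX constant and a trained model are defined elsewhere:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = torch.nn.CrossEntropyLoss(ignore_index=PAD_IDX)  # ignore padded positions
valid_loss = evaluate(model, criterion, device)
print('validation loss: %.4f' % valid_loss)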
Example #3
File: play.py  Project: settinghead/rl-chat
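REINFORCE-style policy-gradient training of a dialogue agent: the Transformer decoder samples a reply token by token from the environment state, finished episodes are stored in a history buffer, and the policy is updated batch-wise with discounted (and exponentially rescaled) returns.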
def main():

    device = torch.device("cuda:0" if USE_CUDA else "cpu")

    env = Environment()

    END_TAG_IDX = env.lang.word2idx[END_TAG]

    SAY_HI = "hello"

    targ_lang = env.lang

    vocab_inp_size = len(env.lang.word2idx)
    vocab_tar_size = len(targ_lang.word2idx)

    print("vocab_inp_size", vocab_inp_size)
    print("vocab_tar_size", vocab_tar_size)

    model = Transformer(
        vocab_inp_size,
        vocab_tar_size,
        MAX_TARGET_LEN,
        d_word_vec=32,
        d_model=32,
        d_inner=32,
        n_layers=3,
        n_head=4,
        d_k=32,
        d_v=32,
        dropout=0.1,
    ).to(device)

    # baseline = Baseline(UNITS)

    history = []

    l_optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    batch = None

    def maybe_pad_sentence(s):
        return tf.keras.preprocessing.sequence.pad_sequences(
            s, maxlen=MAX_TARGET_LEN, padding='post')

    def get_returns(r: float, seq_len: int):
        return list(reversed([r * (GAMMA**t) for t in range(seq_len)]))
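        # e.g. with GAMMA = 0.9 (value assumed), get_returns(1.0, 3) -> [0.81, 0.9, 1.0]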

    def sentence_to_idxs(sentence: str):
        return [
            env.lang.word2idx[token] for token in tokenize_sentence(sentence)
        ]

    for episode in range(EPISODES):

        # Start of Episode
        env.reset()
        model.eval()

        # get first state from the env
        state, _, done = env.step(SAY_HI)

        while not done:

            src_seq = [
                env.lang.word2idx[token] for token in tokenize_sentence(state)
            ]
            src_seq, src_pos = collate_fn([src_seq])
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output, *_ = model.encoder(src_seq, src_pos)
            actions_t = []
            actions = []
            actions_idx = []

            # keep sampling until the end tag is produced or the length limit is hit
            while len(actions) == 0 or (actions_idx[-1] != END_TAG_IDX
                                        and len(actions) < MAX_TARGET_LEN):
                # construct new tgt_seq based on what's outputed so far
                if len(actions_t) == 0:
                    tgt_seq = [env.lang.word2idx[Constants.UNK_WORD]]
                else:
                    tgt_seq = actions_idx
                tgt_seq, tgt_pos = collate_fn([tgt_seq])
                tgt_seq, tgt_pos = tgt_seq.to(device), tgt_pos.to(device)
                # dec_output dims: [1, pos, hidden]
                dec_output, *_ = model.decoder(tgt_seq, tgt_pos, src_seq,
                                               enc_output)
                # pick last step
                dec_output = dec_output[:, -1, :]
                # w_logits dims: [1, vocab_size]
                w_logits = model.tgt_word_prj(dec_output)
                # w_probs dims: [1, vocab_size]
                w_probs = torch.nn.functional.softmax(w_logits, dim=1)
                w_dist = torch.distributions.categorical.Categorical(
                    probs=w_probs)
                w_idx_t = w_dist.sample()
                w_idx = w_idx_t.cpu().numpy()[0]
                actions_t.append(w_idx_t)
                actions_idx.append(w_idx)
                actions.append(env.lang.idx2word[w_idx])

            # action is a sentence (string)
            action_str = ' '.join(actions)
            next_state, reward, done = env.step(action_str)
            # print(reward)
            history.append((state, actions_t, action_str, reward))
            state = next_state

            # record history (to be used for gradient updating after the episode is done)
        # End of Episode
        # Update policy
        model.train()
        while len(history) >= BATCH_SIZE:
            batch = history[:BATCH_SIZE]
            state_inp_b, action_inp_b, reward_b, ret_seq_b = zip(*[[
                sentence_to_idxs(state), actions_b, reward,
                get_returns(reward, MAX_TARGET_LEN)
            ] for state, actions_b, _, reward in batch])
            action_inp_b = [torch.stack(sent) for sent in action_inp_b]
            action_inp_b = torch.stack(action_inp_b)

            ret_seq_b = np.asarray(ret_seq_b)

            # ret_mean = np.mean(ret_seq_b)
            # ret_std = np.std(ret_seq_b)
            # ret_seq_b = (ret_seq_b - ret_mean) / ret_std
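            # heuristic reward shaping: exp((ret - 0.5) * 5) maps returns in [0, 1]
            # to roughly [0.08, 12.2], amplifying above-average returns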
            ret_seq_b = np.exp((ret_seq_b - 0.5) * 5)

            ret_seq_b = torch.tensor(ret_seq_b, dtype=torch.float32).to(device)

            loss = 0
            # loss_bl=0
            l_optimizer.zero_grad()
            # accumulate gradient with GradientTape
            src_seq, src_pos = collate_fn(list(state_inp_b))
            src_seq, src_pos = src_seq.to(device), src_pos.to(device)
            enc_output_b, *_ = model.encoder(src_seq, src_pos)
            max_sentence_len = action_inp_b.shape[1]
            tgt_seq = [[Constants.BOS] for i in range(BATCH_SIZE)]
            for t in range(max_sentence_len):
                # _b stands for batch
                prev_w_idx_b, tgt_pos = collate_fn(tgt_seq)
                prev_w_idx_b, tgt_pos = prev_w_idx_b.to(device), tgt_pos.to(
                    device)
                # dec_output_b dims: [batch, pos, hidden]
                dec_output_b, *_ = \
                    model.decoder(prev_w_idx_b, tgt_pos, src_seq, enc_output_b)
                # pick last step
                dec_output_b = dec_output_b[:, -1, :]
                # w_logits_b dims: [batch, vocab_size]
                w_logits_b = model.tgt_word_prj(dec_output_b)
                # w_probs dims: [batch, vocab_size]
                w_probs_b = torch.nn.functional.softmax(w_logits_b, dim=1)

                dist_b = torch.distributions.categorical.Categorical(
                    probs=w_probs_b)
                curr_w_idx_b = action_inp_b[:, t, :]
                log_probs_b = torch.transpose(
                    dist_b.log_prob(torch.transpose(curr_w_idx_b, 0, 1)), 0, 1)

                # bl_val_b = baseline(tf.cast(dec_hidden_b, 'float32'))
                # delta_b = ret_b - bl_val_b

                # cost_b = -tf.math.multiply(log_probs_b, delta_b)
                # cost_b = -tf.math.multiply(log_probs_b, ret_b)
                ret_b = torch.reshape(ret_seq_b[:, t],
                                      (BATCH_SIZE, 1)).to(device)
                # alternatively, use torch.mul() but it is overloaded. Might need to try log_probs_b*vec.expand_as(A)
                cost_b = -torch.mul(log_probs_b, ret_b)
                #  log_probs_b*vec.expand_as(A)
                # cost_b = -torch.bmm()   #if we are doing batch multiplication

                loss += cost_b
                # loss_bl += -tf.math.multiply(delta_b, bl_val_b)

                prev_w_idx_b = curr_w_idx_b
                tgt_seq = np.append(tgt_seq,
                                    prev_w_idx_b.data.cpu().numpy(),
                                    axis=1).tolist()

            # calculate cumulative gradients

            # model_vars = encoder.variables + decoder.variables
            loss = loss.mean()
            loss.backward()
            # loss_bl.backward()

            # finally, apply gradient

            l_optimizer.step()
            # bl_optimizer.step()

            # Reset everything for the next episode
            history = history[BATCH_SIZE:]

        if episode % max(BATCH_SIZE, 32) == 0 and batch is not None:
            print(">>>>>>>>>>>>>>>>>>>>>>>>>>")
            print("Episode # ", episode)
            print("Samples from episode with rewards > 0: ")
            good_rewards = [(s, a_str, r) for s, _, a_str, r in batch if r > 0]
            for s, a, r in random.sample(good_rewards,
                                         min(len(good_rewards), 3)):
                print("prev_state: ", s)
                print("actions: ", a)
                print("reward: ", r)
                # print("return: ", get_returns(r, MAX_TARGET_LEN))
            ret_seq_b_np = ret_seq_b.cpu().numpy()
            print("all returns: min=%f, max=%f, median=%f" %
                  (np.min(ret_seq_b_np), np.max(ret_seq_b_np),
                   np.median(ret_seq_b_np)))
            print("avg reward: ", sum(reward_b) / len(reward_b))
            print("avg loss: ", np.mean(loss.cpu().detach().numpy()))
Example #4
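Tail of the predict_xl helper plus the __main__ block: builds a couplet Transformer, loads a saved checkpoint, and predicts the second line for a given first line, optionally with beam search.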
        # output_tensor = torch.argmax(output.squeeze(1), 1)
        output_str = get_output_char(result)
        return output_str
    else:
        target = beam_search.beam_decode(input_tensor, model, beam_with=5)
        print(target)
        print(len(target[0][0]))
        output_str = get_output_char(target[0][0][1:])
        return output_str


if __name__ == '__main__':
    args = get_args()
    # pad index

    device = torch.device('cpu' if args.no_cuda else 'cuda')
    transformer_model = Transformer(args.sl_vocab_size, args.xl_vocab_size, hid_dim=args.embedding_dim,
                                    pf_dim=args.fp_inner_dim, n_layers=args.n_layers, n_heads=args.n_head,
                                    dropout=args.dropout, device=device, SOS_IDX=SOS_IDX, PAD_IDX=PAD_IDX,
                                    EOS_IDX=EOS_IDX).to(device)
    # transformer_model.load_state_dict(torch.load('./models-bak/transformer/1121/transformer-model_11.pt', map_location='cpu'))
    transformer_model.load_state_dict(torch.load('./models-bak/transformer/1122/transformer-model_500.pt', map_location='cpu'))
    transformer_model.eval()
    text = '欲出烦恼须无我'
    print(predict_xl(text, transformer_model, device, is_beam_search=True))
    # df = pd.read_excel('./couplet/result-test.xlsx')
    # df['transformer'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=False))
    # df['transformer_beam'] = df['上联'].apply(lambda x: predict_xl(x, transformer_model, device, is_beam_search=True))
    # df.to_excel('./couplet/result-test.xlsx',index=False)
Example #5
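One iteration of the epoch loop: prints a greedy decoding example from the validation set, then runs the train and val phases while accumulating loss and word-accuracy counters (the snippet ends just before the backward pass).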
for i in range(config["num_epochs"]):
    start = time.time()
    epoch_metrics = dict()
    # output an example
    greedy_output_example(model, val_dataset, device, vocab)
    # run each phase per epoch
    for phase in ["train", "val"]:
        if phase == "train":
            # set model to training mode
            model.train()
            dataloader = data_loader_train

            batch_size = config["train_batch_size"]
        else:
            # set model to evaluation mode
            model.eval()
            dataloader = data_loader_val
            batch_size = config["val_batch_size"]

        # initialize metrics
        phase_metrics = dict()
        epoch_loss = list()
        average_epoch_loss = None
        n_word_total = 0
        n_correct = 0
        n_word_correct = 0
        for i, batch in enumerate(
                tqdm(dataloader, mininterval=2, desc=phase, leave=False)):
            # forward
            pred, gold = forward(phase, batch, model, optimizer)
            # backward
Example #6
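Inference script for a Transformer with a configurable keyword module built on a GPT-2 tokenizer vocabulary: parses decoding and model hyperparameters, loads a checkpoint, samples an output for every test example, and writes the decoded text to a timestamped file.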
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path",
                        type=str,
                        default="",
                        help="Path or url of the dataset.")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--gpt2_model_name",
                        type=str,
                        default="gpt2",
                        help="name of the model ex)openai-gpt")
    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=30,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=4,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--keyword_module",
                        type=str,
                        default="new",
                        help="add, attention, ")
    parser.add_argument("--temperature",
                        type=int,
                        default=0.8,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=30,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")

    parser.add_argument('-d_model', type=int, default=512)
    parser.add_argument('-d_inner_hid', type=int, default=2048)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_layers', type=int, default=6)
    parser.add_argument('-warmup', '--n_warmup_steps', type=int, default=4000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-label_smoothing', action='store_true')
    args = parser.parse_args()
    args.d_word_vec = args.d_model

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.gpt2_model_name else OpenAIGPTTokenizer  # can't use AutoTokenizer because the checkpoint could be a Path
    tokenizer = tokenizer_class.from_pretrained(args.gpt2_model_name)

    num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(
        ATTR_TO_SPECIAL_TOKEN)  # doesn't add if they are already there

    model = Transformer(
        num_tokens + num_added_tokens,
        num_tokens + num_added_tokens,
        src_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_pad_idx=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]),
        trg_emb_prj_weight_sharing=args.proj_share_weight,
        emb_src_trg_weight_sharing=args.embs_share_weight,
        d_k=args.d_k,
        d_v=args.d_v,
        d_model=args.d_model,
        d_word_vec=args.d_word_vec,
        d_inner=args.d_inner_hid,
        n_layers=args.n_layers,
        n_head=args.n_head,
        dropout=args.dropout,
        n_position=512,
        keyword_module=args.keyword_module).to(args.device)

    model.load_state_dict(torch.load(args.model_checkpoint), strict=False)
    model.eval()

    sourceList, targetList, scoreList = get_test_datasetEN(
        tokenizer, tokenizer, args.dataset_path)
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    f1 = open((args.model_checkpoint + current_time + "_output.txt"), 'w')
    for line in tqdm(zip(sourceList, targetList, scoreList),
                     total=len(sourceList)):
        out_ids = sample_sequence(line[0], line[2], tokenizer, model,
                                  tokenizer, args)
        out_texts = tokenizer.decode(out_ids)
        for text in out_texts:
            f1.write(text.replace('▁', ' ').replace('</s>', ' '))
        """
        for id in out_ids:
            f1.write(str(id))
            f1.write(' ')
        """
        f1.write("\n")
    f1.close()
Example #7
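Rebuilds the Transformer from saved options, restores a checkpoint from opt.results_dir, and dumps the decoder's target word-embedding matrix to disk with numpy.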
def get_embedding():

    import transformer.Constants as Constants
    from transformer.Models import Transformer
    from transformer.Optim import ScheduledOptim
    from transformer.Modules import LabelSmoothing
    from transformer.Beam import Beam
    from transformer.Translator import translate
    from preprocess import read_instances_from_file, convert_instance_to_idx_seq
    import evals
    from evals import Logger
    from DataLoader import DataLoader

    data = torch.load(opt.data)

    opt.max_token_seq_len_e = data['settings'].max_seq_len
    opt.max_token_seq_len_d = 30
    opt.proj_share_weight = True
    opt.d_word_vec = opt.d_model

    # the DataLoader is needed below for the vocabulary sizes
    training_data = DataLoader(
        data['dict']['src'],
        data['dict']['tgt'],
        src_insts=data['train']['src'],
        tgt_insts=data['train']['tgt'],
        batch_size=opt.batch_size,
        shuffle=True,
        cuda=opt.cuda)

    opt.src_vocab_size = training_data.src_vocab_size
    opt.tgt_vocab_size = training_data.tgt_vocab_size
    opt.tgt_vocab_size = opt.tgt_vocab_size - 4

    opt.d_v = int(opt.d_model / opt.n_head)
    opt.d_k = int(opt.d_model / opt.n_head)

    model = Transformer(opt.src_vocab_size,
                        opt.tgt_vocab_size,
                        opt.max_token_seq_len_e,
                        opt.max_token_seq_len_d,
                        proj_share_weight=opt.proj_share_weight,
                        embs_share_weight=False,
                        d_k=opt.d_k,
                        d_v=opt.d_v,
                        d_model=opt.d_model,
                        d_word_vec=opt.d_word_vec,
                        d_inner_hid=opt.d_inner_hid,
                        n_layers_enc=opt.n_layers_enc,
                        n_layers_dec=opt.n_layers_dec,
                        n_head=opt.n_head,
                        dropout=opt.dropout,
                        dec_dropout=opt.dec_dropout,
                        encoder=opt.encoder,
                        decoder=opt.decoder,
                        enc_transform=opt.enc_transform,
                        onehot=opt.onehot,
                        no_enc_pos_embedding=opt.no_enc_pos_embedding,
                        dec_reverse=opt.dec_reverse,
                        no_residual=opt.no_residual)

    state_dict = torch.load(opt.results_dir + '/' + opt.mname + '/model.chkpt')

    model.load_state_dict(state_dict['model'])

    model = model.cuda()
    model.eval()

    W = model.decoder.tgt_word_emb.weight.data.cpu().numpy()

    numpy.save('Embedding', W)  # numpy.save(file, arr)