Example #1
    def set_transformer_model(self):
        '''
        Loads the base transformer model.

        Reads from self.args:
            transformer_config_path : YAML config path of the transformer
            transformer_weights_path : optional; if given, also loads the weights

        Returns: None
        '''

        # load base transformer model from config
        with open(self.args.transformer_config_path, 'r') as file:
            config = yaml.load(file, Loader=yaml.FullLoader)

        model_config = TransformerConfig(config)
        input_dim = config['transformer']['input_dim']

        # read for reference; not used further in this snippet
        downsample_rate = model_config.downsample_rate
        hidden_size = model_config.hidden_size
        output_attention = False

        base_transformer_model = TransformerModel(
            model_config, input_dim,
            output_attentions=output_attention).to('cpu')

        #load weights
        if self.args.transformer_weights_path:
            ckpt = torch.load(self.args.transformer_weights_path, map_location='cpu')
            base_transformer_model.load_state_dict(ckpt['Transformer'])

        self.base_transformer_model = base_transformer_model
Example #2
def main():
    print("Generating data...", end="")
    voc_size = args.vocab_sz
    inp = np.arange(2, voc_size, 2)
    tgt = np.arange(3, voc_size, 2)
    data_x, data_y = get_numbers(inp, tgt)
    train_len = int(len(data_x) * 0.9)
    train_x, val_x = data_x[:train_len], data_x[train_len:]
    train_y, val_y = data_y[:train_len], data_y[train_len:]
    print("Done")

    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    if args.load_dir != ".":
        model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    criterion = to_cuda(nn.CrossEntropyLoss())

    optimizer = flow.optim.Adam(model.parameters(), lr=args.lr)
    print("Done")

    print("Training...")

    min_loss = float("inf")  # best training loss seen so far
    for i in range(1, args.n_epochs + 1):
        epoch_loss = train(model, criterion, optimizer, train_x, train_y)
        epoch_loss_val = validation(model, criterion, val_x, val_y)
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss < min_loss:
            min_loss = epoch_loss  # the original never updated min_loss, so it saved every epoch
            # recreate the save directory so flow.save writes a clean snapshot
            if os.path.exists(args.save_dir):
                shutil.rmtree(args.save_dir)
            os.mkdir(args.save_dir)
            flow.save(model.state_dict(), args.save_dir)
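        # run a quick generation sanity check every third epoch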
        if i % 3 == 2:
            print(test(model, test_times=10))
Example #3
def main(model_name=None, hidden=64, nlayers=1):
    voc_size = 10000
    inp = arange(2, voc_size, 2)
    tgt = arange(3, voc_size, 2)
    batch_size = 128
    epochs = 30
    dataset = NumberLoader(inp, tgt)
    train_len = int(len(dataset) * 0.9)
    val_len = len(dataset) - train_len
    train_set, val_set = random_split(dataset, [train_len, val_len])
    train_loader = DataLoader(train_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=1)
    val_loader = DataLoader(val_set,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=1)
    model = TransformerModel(voc_size,
                             voc_size,
                             hidden=hidden,
                             nlayers=nlayers)
    if model_name is not None:
        model.load_state_dict(load(model_name))
    model = model.cuda()
    # optimizer = optim.SGD(model.parameters(), lr=0.5)
    optimizer = optim.Adam(model.parameters())
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
    criterion = nn.CrossEntropyLoss()
    best_loss = float("inf")  # best validation loss seen so far
    for i in range(epochs):
        epoch_loss = train(model, criterion, optimizer, train_loader)
        epoch_loss_val = validation(model, criterion, val_loader)
        # scheduler.step()
        print("epoch: {} train loss: {}".format(i, epoch_loss))
        print("epoch: {} val loss: {}".format(i, epoch_loss_val))
        if epoch_loss_val < best_loss:
            best_loss = epoch_loss_val
            model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val)
            save(model.state_dict(), model_name)
    return model_name
Example #4
def main():

    voc_size = args.vocab_sz
    print("Setting model...", end="")
    model = TransformerModel(
        input_sz=voc_size,
        output_sz=voc_size,
        d_model=args.d_model,
        nhead=args.n_head,
        num_encoder_layers=args.n_encoder_layers,
        num_decoder_layers=args.n_decoder_layers,
        dim_feedforward=args.dim_feedforward,
        dropout=args.dropout,
    )
    model.load_state_dict(flow.load(args.load_dir))
    model = to_cuda(model)
    print("Done")

    print("Inference:")
    num = args.input_start
    if num % 2 != 0:
        print("The input number must be an even number.")
        return
    if num > args.vocab_sz - MAX_LEN * 2:
        print("The input sequence may be out of range.")
        return

    input_nums = [num + i * 2 for i in range(MAX_LEN)]
    src = to_cuda(flow.tensor(input_nums)).unsqueeze(1)
    pred = [0]
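    # greedy autoregressive decoding: seed with token id 0 (presumably the
    # start token) and append the argmax of the last position at each step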
    for i in range(MAX_LEN):
        inp = to_cuda(flow.tensor(pred)).unsqueeze(1)
        output = model(src, inp)
        out_num = output.argmax(2)[-1].numpy()[0]
        pred.append(out_num)
    print("input:", input_nums)
    print("pred:", pred)
Example #5
vocab_to_int = vocab["vocab_to_int"]
int_to_vocab = vocab["int_to_vocab"]

ntokens = len(vocab_to_int)
emsize = 512
nhid = 512
nlayers = 4
nhead = 4
dropout = 0.2

model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers,
                         dropout).to(device)

model_save_path = "./models/transformer/lm-siamzone-v4-space-342.pkl"
model.load_state_dict(
    torch.load(model_save_path, map_location=torch.device("cpu")))
model.eval()

print("Model initialized")


def top_k_top_p_filtering(logits,
                          top_k,
                          top_p,
                          temperature,
                          filter_value=-float("Inf")):
    # Hugging Face script to apply top k and nucleus sampling
    logits = logits / temperature

    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # keep only the top-k logits; everything else gets filter_value
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # nucleus sampling: keep the smallest set of tokens whose cumulative
        # probability exceeds top_p (the snippet was cut off here; this body
        # restores the standard Hugging Face filtering logic)
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # shift right so the first token above the threshold is kept too
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits
Example #6
def main(args):
    random_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.cuda else "cpu")

    corpus = data.Corpus(args.data)
    ntokens = len(corpus.dictionary)
    print('loaded dictionary')
    if args.model == 'Transformer':
        model = TransformerModel(
            ntokens,
            args.emsize,
            args.nhead,
            args.nhid,
            args.nlayers,
            args.dropout).to(device)
    else:
        model = RNNModel(
            args.model,
            ntokens,
            args.emsize,
            args.nhid,
            args.nlayers,
            args.dropout,
            args.tied).to(device)

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    print('loaded model')

    is_transformer_model = hasattr(
        model, 'model_type') and model.model_type == 'Transformer'
    if not is_transformer_model:
        hidden = model.init_hidden(1)
    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)
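    # seed generation with one random token; the Transformer path re-feeds the
    # whole growing sequence, while the RNN path feeds only the newest token
    # along with its hidden state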
    with open(args.outf, 'w') as outf:
        with torch.no_grad():  # no tracking history
            for i in range(args.words):
                if is_transformer_model:
                    output = model(input, False)
                    word_weights = output[-1].squeeze().div(
                        args.temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    word_tensor = torch.Tensor([[word_idx]]).long().to(device)
                    input = torch.cat([input, word_tensor], 0)
                else:
                    output, hidden = model(input, hidden)
                    word_weights = output.squeeze().div(args.temperature).exp().cpu()
                    word_idx = torch.multinomial(word_weights, 1)[0]
                    input.fill_(word_idx)

                word = corpus.dictionary.idx2word[word_idx]

                outf.write(word + ('\n' if i % 20 == 19 else ' '))

                if i % args.log_interval == 0:
                    print('| Generated {}/{} words'.format(i, args.words))
Example #7
def main():
    # (the top of this snippet was cut off; judging from the torchtext
    #  WikiText2 file names below, it builds the dataset splits roughly as:)
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(
        TEXT,
        train='wiki.train.tokens',
        validation='wiki.valid.tokens',
        test='wiki.test.tokens')

    # build the vocabulary from the training set
    TEXT.build_vocab(train_txt)

    model = TransformerModel(len(TEXT.vocab.stoi),
                             ninp=200,
                             nhead=2,
                             nhid=200,
                             nlayers=2,
                             dropout=0.2).to(device)
    # load the trained parameters into the model
    # checkpoint = torch.load('datasets/models/best_model.pth.tar')
    checkpoint = torch.load('temp/models/best_model.pth.tar')
    model.load_state_dict(checkpoint['state_dict'])

    # known seed sequence
    history = 'it seems'
    h = []
    for w in history.split():
        h.append([TEXT.vocab.stoi[w]])

    while True:
        # convert the id list to a tensor and run it through the model
        output = model(torch.tensor(h).to(device))
        # take the ids of the 10 most probable next words
        idxs = output[-1].argsort(descending=True).view(-1)[:10]
        # pick one of them at random (the original appended the random index
        # itself, and randint(0, 10) could also index past the 10 candidates)
        r = random.randint(0, 9)
        h.append([idxs[r].item()])
Example #8
def main(args):
    random_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda")
    device = torch.device("cuda" if args.cuda else "cpu")

    corpus = data.Corpus(args.data)
    ntokens = len(corpus.dictionary)
    word2idx = corpus.dictionary.word2idx
    idx2word = corpus.dictionary.idx2word
    args.vocab_size = len(word2idx)
    print('loaded dictionary')

    if args.model == 'Transformer':
        model = TransformerModel(
            ntokens,
            args.emsize,
            args.nhead,
            args.nhid,
            args.nlayers,
            args.dropout).to(device)
    else:
        model = RNNModel(
            args.model,
            ntokens,
            args.emsize,
            args.nhid,
            args.nlayers,
            args.dropout,
            args.tied).to(device)

    checkpoint = torch.load(args.checkpoint)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    is_transformer_model = hasattr(
        model, 'model_type') and model.model_type == 'Transformer'
    print('loaded model')

    input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

    # get as starting words only most common starting word
    # from data corpus(heuristics from baseline)
    most_common_first_words_ids = [i[0] for i in Counter(corpus.train.tolist()).most_common()
                                   if idx2word[i[0]][0].isupper()][:200]
#     most_common_first_words = [corpus.dictionary.idx2word[i]
#                                for i in most_common_first_words_ids]

    # private message(binary code)
    bit_stream = open(args.bit_stream_path, 'r').readline()
    outfile = open(args.save_path + 'generated' +
                   str(args.bit_num) + '_bit.txt', 'w')
    bitfile = open(args.save_path + 'bitfile_' +
                   str(args.bit_num) + '_bit.txt', 'w')
    bit_index = random.randint(0, len(word2idx))
    soft = torch.nn.Softmax(0)
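    # each utterance starts from a random common sentence-initial word; every
    # following word is chosen so that it encodes bits of the secret stream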

    for uter_id, uter in tqdm.tqdm(
            enumerate(range(args.utterances_to_generate))):
        #         with torch.no_grad():  # no tracking history
        input_ = torch.LongTensor([random.choice(
            most_common_first_words_ids)]).unsqueeze(0).to(device)
        if is_transformer_model:
            # `hidden` would be undefined here; generation below is RNN-only
            raise NotImplementedError
        hidden = model.init_hidden(1)

        output, hidden = model(input_, hidden)
        gen = np.random.choice(len(corpus.dictionary), 1,
                               p=np.array(soft(output.reshape(-1)).tolist()) /
                               sum(soft(output.reshape(-1)).tolist()))[0]
        gen_res = list()
        gen_res.append(idx2word[gen])
        bit = ""
        for word_id, word in enumerate(range(args.len_of_generation - 2)):
            if is_transformer_model:
                raise NotImplementedError  # was `assert NotImplementedError`, which never fails
            else:
                output, hidden = model(input_, hidden)
            p = output.reshape(-1)
            sorted_, indices = torch.sort(p, descending=True)
            words_prob = [(j, i) for i, j in
                          zip(sorted_[:2**int(args.bit_num)].tolist(),
                              indices[:2**int(args.bit_num)].tolist())]

            nodes = createNodes([item[1] for item in words_prob])
            root = createHuffmanTree(nodes)
            codes = huffmanEncoding(nodes, root)
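            # try successively longer prefixes of the remaining bit stream;
            # the first prefix that matches a Huffman code selects the next
            # word, so that word losslessly embeds those bits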

            for i in range(2**int(args.bit_num)):
                if bit_stream[bit_index:bit_index + i + 1] in codes:
                    code_index = codes.index(
                        bit_stream[bit_index:bit_index + i + 1])
                    gen = words_prob[code_index][0]
                    test_data = np.int32(gen)
                    gen_res.append(idx2word[gen])
                    if idx2word[gen] in ['\n', '', "<eos>"]:
                        break
                    bit += bit_stream[bit_index: bit_index + i + 1]
                    bit_index = bit_index + i + 1
                    break

        gen_sen = ' '.join(
            [word for word in gen_res if word not in ["\n", "", "<eos>"]])
        outfile.write(gen_sen + "\n")
        bitfile.write(bit)
Example #9
#               (1.0, 0.0),
#               (0.0, -1.0),
#               (0.0, 1.0),
#               (0.0, 2.0),
#               (0.0, 5.0),
#               ]
a = args.alpha
b = args.beta

encoder = TransformerModel(unidirectional=False)
decoder = TransformerLMHeadModel()

logger.info(f"Start training of alpha={a} beta={b}")

states = torch.load("../TSP/TSP-best.th")
encoder.load_state_dict(states["encoder"])
decoder.load_state_dict(states["decoder"])

device = torch.device("cuda")

encoder = encoder.to(device)
decoder = decoder.to(device)

num_epochs = 10
num_gradients_accumulation = 1
num_train_optimization_steps = len(
    train_dataset) * num_epochs // batch_size // num_gradients_accumulation

param_optimizer = list(encoder.named_parameters()) + list(
    decoder.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
Example #10
from dataset import TedDataset
from tqdm import tqdm
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load tokenizer
de_tokenizer = WordpieceTokenizer('de').load_model()
en_tokenizer = WordpieceTokenizer('en').load_model()

model = TransformerModel(d_model=512,
                         num_heads=8,
                         num_encoders=6,
                         num_decoders=6,
                         in_vocab_size=len(de_tokenizer),
                         out_vocab_size=len(en_tokenizer)).to(device)

model.load_state_dict(torch.load("./outputs/model-epoch10.pt"))
model.eval()


def translate(inputs):
    input_len = len(inputs)
    inputs = torch.tensor([
        de_tokenizer.transform(input, max_length=50) for input in inputs
    ]).cuda()
    outputs = torch.tensor([[2]] * input_len).cuda()  # 2 is the <sos> token id
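    # decode greedily for up to 50 steps: run (inputs, generated-so-far)
    # through the model and append the argmax token at the last position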
    for i in range(50):
        prediction = model(inputs, outputs)
        prediction = torch.argmax(prediction, dim=-1)[:, -1]  # get final token
        outputs = torch.cat((outputs, prediction.view(-1, 1)), dim=-1)
    outputs = outputs.tolist()
    cleanoutput = []
Example #11
def main():
    # from pathlib import Path
    # print("File      Path:", Path(__file__).absolute())
    # print("Directory Path:", Path().absolute())

    args = get_args()
    args.n_gpu = 1

    # noisy_sents_1 = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
    # clean_sents = read_strings(os.path.join(args.data_dir, "train_label"))
    # noisy_sents_2 = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus"))
    #
    # noisy_sents = noisy_sents_1 + noisy_sents_2
    # noise_space_ratio = []
    #
    # for sentence in noisy_sents:
    #     noise_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # clean_space_ratio = []
    # for sentence in clean_sents:
    #     clean_space_ratio.append(sentence.count(' ') / len(sentence))
    #
    # print("noise_space_ratio: {}, clean_space_ratio: {}".format(sum(noise_space_ratio) / len(noise_space_ratio),
    #                                                             sum(clean_space_ratio) / len(clean_space_ratio)))

    # ##########
    # ##for local
    # args.num_workers=0
    # args.train_batch_size = 4
    # args.eval_batch_size = 4
    # args.eval_interval = 10
    # ##########

    set_seed(args)

    if args.tokenizer == 'char':
        tokenizer = CharTokenizer([])
    if args.tokenizer == 'kobert':
        print("koBERT tokenizer")
        tokenizer = KoBertTokenizer.from_pretrained('monologg/kobert')
        args.vocab_size = tokenizer.vocab_size
        print(args.vocab_size)

    if args.load_vocab != "":
        tokenizer.load(args.load_vocab)
        args.vocab_size = tokenizer.__len__()

    logger.info(f"args: {json.dumps(args.__dict__, indent=2, sort_keys=True)}")

    os.environ["CUDA_VISIBLE_DEVICES"] = args.cuda
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = TransformerModel(
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        num_encoder_layers=args.num_encoder_layers,
        num_decoder_layers=args.num_decoder_layers,
        intermediate_size=args.intermediate_size,
        dropout=args.dropout,
    ).to(args.device)
    logger.info(
        f"# of model parameters: {sum(p.numel() for p in model.parameters()) * 1e-6:.2f}M"
    )

    eos_setting = args.eos_setting

    bind_nsml(model, tokenizer, args, eos=eos_setting)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode != 'test' and args.averaging != "":
        sess = 't0005/rush1-3/37'
        checkpoints = ["4500", "6500", "7500", "8000"]
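        # average the weights of these checkpoints from one NSML session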

        nsml.load(checkpoint=checkpoints[0], session=sess)
        args.vocab_size = tokenizer.__len__()
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        # accumulate a uniform average of the listed checkpoints
        # (the original consumed the named_parameters() generator with dict(),
        # so its inner loop never ran and nothing was averaged)
        new_dict_params = {
            name: torch.zeros_like(param)
            for name, param in model.named_parameters()
        }

        for checkpoint in checkpoints:
            bind_nsml(model, tokenizer, args, eos=eos_setting)
            nsml.load(checkpoint=checkpoint, session=sess)
            for name, param in model.named_parameters():
                new_dict_params[name] += param / len(checkpoints)

        model.load_state_dict(new_dict_params, strict=False)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.save('best')

    elif args.mode == 'eval':
        print("I'm in EVAL")

        checkpoint = 'best'
        sess = 't0005/rush1-3/507'
        nsml.load(checkpoint=checkpoint, session=sess)
        args.vocab_size = tokenizer.__len__()

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        model.eval()
        #noisy_sents = open("./naver_data_clean.txt", "r", encoding='utf-8').read().splitlines()
        noisy_sents = read_strings(
            os.path.join(args.data_dir, "train_data", "train_corpus"))
        valid_noisy = noisy_sents[:1000]

        prediction = correct_beam(model,
                                  tokenizer,
                                  valid_noisy,
                                  args,
                                  eos=True,
                                  length_limit=0.15)

        for i, pred in enumerate(prediction[:1000]):
            print("noisy_input: {}, pred: {}".format(valid_noisy[i], pred))

        # bind_txt(prediction)
        # nsml.save('prediction')

        # with open('naver_data_clean_again.txt', 'w',encoding='utf-8') as f:
        #     for i, pred in enumerate(prediction):
        #         if i%500==0: print(i)
        #         f.write("%s\n" % pred)

    ## only works with the char tokenizer
    ## TODO: kobert tokenizer, different vocab size if needed
    elif args.mode != 'test' and args.resubmit != "":
        checkpoint = 'best'
        sess = 't0005/rush1-3/' + args.resubmit
        print(sess)

        model = None
        tokenizer = CharTokenizer([])
        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        args.vocab_size = len(tokenizer)
        print(args.vocab_size)

        model = TransformerModel(
            vocab_size=args.vocab_size,
            hidden_size=args.hidden_size,
            num_attention_heads=args.num_attention_heads,
            num_encoder_layers=args.num_encoder_layers,
            num_decoder_layers=args.num_decoder_layers,
            intermediate_size=args.intermediate_size,
            dropout=args.dropout,
        ).to(args.device)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        nsml.load(checkpoint=checkpoint, session=sess)

        bind_nsml(model, tokenizer, args, eos=eos_setting)
        ########## testing loaded model & tokenizer ###############

        # model.eval()
        # noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data"))
        # valid_noisy = noisy_sents[-10:]
        #
        # prediction = correct(model, tokenizer, valid_noisy, args, eos=True, length_limit=0.1)
        #
        # for pred in prediction:
        #     print(pred)

        ##################

        nsml.save("best")

    else:
        #train_data, valid_data = None, None
        if args.mode == "train" or args.mode == "pretrain" or args.mode == "semi-train":
            if args.mode == "train":
                # noisy_sents = open("./noisy_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # clean_sents = open("./clean_sejong_500k.txt", "r", encoding='utf-8').read().splitlines()[:20000]
                # sents_annotation = ['None'] * len(noisy_sents)
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

            if args.mode == "semi-train":
                noisy_sents = read_strings(
                    os.path.join(args.data_dir, "train_data", "train_data"))
                sents_annotation = read_strings(
                    os.path.join(args.data_dir, "train_data",
                                 "train_annotation"))
                clean_sents = read_strings(
                    os.path.join(args.data_dir, "train_label"))

                checkpoint = 'generated_data'
                sess = 't0005/rush1-1/' + str(args.semi_dataset)
                # five copy
                #sess = 't0005/rush1-1/209'
                # one copy
                #sess = 't0005/rush1-1/224'
                semi_noisy_sents, semi_clean_sents = load_generated_data(
                    checkpoint=checkpoint, session=sess)
                semi_sents_annotation = ['None'] * len(semi_noisy_sents)

            if args.mode == "pretrain":
                print("PRETRAIN MODE ON!!")
                noisy_sents = read_strings(
                    os.path.join('sejong_corpus', args.noisy_file))
                clean_sents = read_strings(
                    os.path.join('sejong_corpus', args.clean_file))
                # checkpoint = 'generated_data'
                # sess = 't0005/rush1-1/113'
                # noisy_sents, clean_sents = load_generated_data(checkpoint=checkpoint, session=sess)
                sents_annotation = ['None'] * len(noisy_sents)

            error_type_counter = Counter()

            for annotation in sents_annotation:
                error_type_counter += Counter(annotation.split(','))

            print(error_type_counter)

            # "cleaning noise" variant:
            # pairs = [{"noisy": preprocess_sentence(noisy), "clean": clean} for noisy, clean in zip(noisy_sents, clean_sents)]
            # original variant:

            if args.mode == "semi-train":
                pairs = [
                    {"noisy": noisy, "clean": clean, "annotation": annot}
                    for noisy, clean, annot in zip(
                        noisy_sents, clean_sents, sents_annotation)
                ]
                semi_pairs = [
                    {"noisy": noisy, "clean": clean, "annotation": annot}
                    for noisy, clean, annot in zip(
                        semi_noisy_sents, semi_clean_sents,
                        semi_sents_annotation)
                ]

                train_data = pairs[:-args.num_val_data] + semi_pairs
                valid_data = pairs[-args.num_val_data:]
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                tokenizer = CharTokenizer.from_strings(train_sents,
                                                       args.vocab_size)
                bind_nsml(model, tokenizer, args, eos=eos_setting)

            else:
                pairs = [
                    {"noisy": noisy, "clean": clean, "annotation": annot}
                    for noisy, clean, annot in zip(
                        noisy_sents, clean_sents, sents_annotation)
                ]

                train_data, valid_data = train_test_split(
                    pairs, test_size=args.val_ratio,
                    random_state=args.seed)  # test: about 1000
                logger.info(f"# of train data: {len(train_data)}")
                logger.info(f"# of valid data: {len(valid_data)}")

                # print("validation: ", valid_data)

                train_sents = [x['noisy'] for x in train_data
                               ] + [x['clean'] for x in train_data]
                # train_sents = [x['clean'] for x in train_data]

                if args.load_model != "" and args.mode == "train":  # Load pretrained model
                    print("load pretrained model")
                    model.load_state_dict(
                        torch.load(args.load_model, map_location=args.device))

                    if args.freeze:
                        model.token_embeddings.weight.requires_grad = False
                        model.decoder_embeddings.weight.requires_grad = False

                if args.tokenizer == 'char' and args.load_vocab == "":
                    tokenizer = CharTokenizer.from_strings(
                        train_sents, args.vocab_size)
                    print(
                        f'tokenizer loaded from strings. len={len(tokenizer)}.'
                    )

                bind_nsml(model, tokenizer, args, eos=eos_setting)

                if args.tokenizer == 'char' and tokenizer is not None:
                    tokenizer.save('vocab.txt')

        if args.n_gpu > 1:
            model = torch.nn.DataParallel(model, dim=1)

        if args.mode == "train" or args.mode == "pretrain" or args.mode == 'semi-train':
            train(model,
                  tokenizer,
                  train_data,
                  valid_data,
                  args,
                  eos=eos_setting)
Example #12
def predict(dn, rn):
    dir_name_format = "../data/{dn}-{rn}-raw"
    dir_name = dir_name_format.format(dn=dn, rn=rn)
    input_path = os.path.join(dir_name, "src-test.txt")
    if not os.path.isfile(input_path):
        print(f"File: {input_path} not exist.")
        return

    output_filename = f"prediction-{dn}-{rn}.txt"
    output_path = os.path.join(outputDir, output_filename)
    if os.path.isfile(output_path):
        print(f"File {output_path} already exists.")
        return

    # index the source text into token ids
    preprocess = IndexedInputTargetTranslationDataset.preprocess(source_dictionary)

    # map output indexes back into a sentence, dropping special tokens
    def postprocess(x):
        return ''.join(
            token for token in target_dictionary.tokenize_indexes(x)
            if token not in (END_TOKEN, START_TOKEN, PAD_TOKEN))
    device = torch.device(f'cuda:{args.device}' if torch.cuda.is_available() and not args.no_cuda else 'cpu')

    print('Building model...')
    model = TransformerModel(source_dictionary.vocabulary_size, target_dictionary.vocabulary_size,
                             config['d_model'],
                             config['nhead'],
                             config['nhid'],
                             config['nlayers'])
    model.eval()
    checkpoint_filepath = checkpoint_path
    checkpoint = torch.load(checkpoint_filepath, map_location='cpu')
    model.load_state_dict(checkpoint)
    translator = Translator(
        model=model,
        beam_size=args.beam_size,
        max_seq_len=args.max_seq_len,
        trg_bos_idx=target_dictionary.token_to_index(START_TOKEN),
        trg_eos_idx=target_dictionary.token_to_index(END_TOKEN)
    ).to(device)

    from utils.pipe import PAD_INDEX
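    # right-pad every source sequence in the batch to the longest length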
    def pad_src(batch):
        sources_lengths = [len(sources) for sources in batch]
        sources_max_length = max(sources_lengths)
        sources_padded = [sources + [PAD_INDEX] * (sources_max_length - len(sources)) for sources in batch]
        sources_tensor = torch.tensor(sources_padded)
        return sources_tensor
    def process(seq):
        seq = seq.strip()
        def is_proof(name):
            return name.count("balance") > 0 or name.count("one") > 0
        if is_proof(data_name) and not is_proof(dn):
            seq += ",$,1"
            global is_proof_process
            if is_proof_process:
                print("processing")
                is_proof_process = False
        return seq

    batch_size = args.bs
    print(f"Output to {output_path}:")
    with open(output_path, 'w', encoding='utf-8') as outFile:
        with open(input_path, 'r', encoding='utf-8') as inFile:
            seqs = []
            for seq in tqdm(inFile):
                seq = process(seq)
                src_seq = preprocess(seq)
                seqs.append(src_seq)
                if len(seqs) >= batch_size:
                    pred_seq = translator.translate_sentence(pad_src(seqs).to(device))
                    pred_line = [postprocess(pred) for pred in pred_seq]
                    # print(pred_line)
                    outFile.writelines([p.strip() + '\n' for p in pred_line])
                    seqs.clear()
                # endif
            # endfor
            if seqs:    # last batch
                pred_seq = translator.translate_sentence(pad_src(seqs).to(device))
                pred_line = [postprocess(pred).replace(START_TOKEN, '').replace(END_TOKEN, '') for pred in pred_seq]
                # print(pred_line)
                outFile.writelines([p.strip() + '\n' for p in pred_line])
                seqs.clear()
        # endwith
    # endwith
    print(f'[Info] {input_path} Finished.')
Example #13
        if epoch_loss_val < best_loss:
            best_loss = epoch_loss_val
            model_name = "model/model_{0:.5f}.pt".format(epoch_loss_val)
            save(model.state_dict(), model_name)
    return model_name


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        'A PyTorch Transformer Language Model for Predicting Odd Numbers')
    parser.add_argument('--test_model',
                        type=str,
                        help='model file to load for testing only')
    parser.add_argument('--train_model',
                        type=str,
                        help='model file to load before resuming training')
    args = parser.parse_args()
    hidden = 128
    nlayers = 2
    if args.test_model is None:
        if args.train_model is not None:
            model_name = main(args.train_model, hidden=hidden, nlayers=nlayers)
        else:
            model_name = main(hidden=hidden, nlayers=nlayers)
    else:
        model_name = args.test_model
    model = TransformerModel(10000, 10000, hidden=hidden, nlayers=nlayers)
    model.load_state_dict(load(model_name))
    test(model, test_times=10)
Example #14
def main():
    ### settings
    args = set_args()
    save_path = args.save_path
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    logger.info(args)

    ### prepare for data
    train_dataset = COCOMultiLabel(args,
                                   train=True,
                                   image_path=args.image_path)
    test_dataset = COCOMultiLabel(args,
                                  train=False,
                                  image_path=args.image_path)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batch_size,
                              num_workers=args.num_workers,
                              pin_memory=True,
                              shuffle=True,
                              drop_last=True,
                              collate_fn=my_collate)
    test_loader = DataLoader(test_dataset,
                             batch_size=args.batch_size,
                             num_workers=args.num_workers,
                             pin_memory=True,
                             shuffle=False,
                             drop_last=False,
                             collate_fn=my_collate)

    ## prepare for models
    encoder = CNN_Encoder().cuda()
    decoder = TransformerModel(args).cuda()
    ## set different parameters for training vs. evaluation-only
    if args.use_eval:
        weights_dic = torch.load(args.use_model)
        encoder.load_state_dict(
            convert_weights(weights_dic['encoder_state_dict']))
        decoder.load_state_dict(
            convert_weights(weights_dic['decoder_state_dict']))
    else:
        encoder.load_state_dict(
            convert_weights(torch.load(args.encoder_weights)))
        encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                             lr=args.encoder_lr)
        decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                             lr=args.decoder_lr)

    ## wrap in DataParallel when multiple GPUs are available
    if torch.cuda.device_count() > 1:
        encoder = nn.DataParallel(encoder)
        decoder = nn.DataParallel(decoder)

    ## set hinge loss function
    loss_hinge = torch.nn.HingeEmbeddingLoss(margin=args.C, reduction='mean')

    ## if only evaluating, return after the test pass
    if args.use_eval:
        f1 = test(args, encoder, decoder, test_loader, args.threshold, 1)
        return

    ##  training stage
    highest_f1 = 0
    epochs_without_improve = 0
    for epoch in range(args.epochs):
        ## train and test
        train(args, encoder, decoder, train_loader, encoder_optimizer,
              decoder_optimizer, epoch, loss_hinge)
        f1 = test(args, encoder, decoder, test_loader, args.threshold, epoch)

        ### save parameter
        save_dict = {
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'epoch': epoch,
            'f1': f1,
            'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
            'epochs_without_improve': epochs_without_improve
        }

        ### save models
        torch.save(save_dict,
                   args.save_path + "/checkpoint_" + timestr + '.pt.tar')
        if f1 > highest_f1:
            torch.save(
                save_dict,
                args.save_path + "/BEST_checkpoint_" + timestr + '.pt.tar')
            logger.info("Now the highest f1 is {}, it was {}".format(
                100 * f1, 100 * highest_f1))
            highest_f1 = f1
            epochs_without_improve = 0
        else:
            epochs_without_improve += 1
            if epochs_without_improve == 3:
                # no F1 improvement for 3 epochs: decay both learning rates
                adjust_learning_rate(decoder_optimizer, args.coeff)
                adjust_learning_rate(encoder_optimizer, args.coeff)
                epochs_without_improve = 0  # fixed: the original wrote to `epochs_without_imp`
Example #15
    parser.add_argument('--pretrain_emb_path', type=str, default=hp.pretrain_emb_path)
    parser.add_argument('--pretrain_cnn_path', type=str, default=hp.pretrain_cnn_path)
    parser.add_argument('--pretrain_model_path', type=str, default=hp.pretrain_model_path)
    args = parser.parse_args()
    # copy every parsed argument onto the shared hyperparameter object
    # (the original called parser.parse_args() a second time here, redundantly)
    for k, v in vars(args).items():
        setattr(hp, k, v)

    pretrain_emb = align_word_embedding(hp.word_dict_pickle_path, hp.pretrain_emb_path, hp.ntoken,
                                        hp.nhid) if hp.load_pretrain_emb else None
    pretrain_cnn = torch.load(hp.pretrain_cnn_path) if hp.load_pretrain_cnn else None

    model = TransformerModel(hp.ntoken, hp.ninp, hp.nhead, hp.nhid, hp.nlayers, hp.batch_size, dropout=0.2,
                             pretrain_cnn=pretrain_cnn, pretrain_emb=pretrain_emb, freeze_cnn=hp.freeze_cnn).to(device)
    if hp.load_pretrain_model:
        model.load_state_dict(torch.load(hp.pretrain_model_path))

    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=hp.lr, weight_decay=1e-6)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, hp.scheduler_decay)
    if hp.label_smoothing:
        criterion = LabelSmoothingLoss(hp.ntoken, smoothing=0.1)
    else:
        criterion = nn.CrossEntropyLoss(ignore_index=hp.ntoken - 1)
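        # note: ignore_index=hp.ntoken - 1 assumes the last vocabulary id is padding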

    now_time = str(time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time())))
    log_dir = 'models/{name}'.format(name=hp.name)

    writer = SummaryWriter(log_dir=log_dir)

    log_path = os.path.join(log_dir, 'train.log')