Example #1
def load_data(train_src, train_tgt, val_src, val_tgt, batch_size=64, save_path="checkpoint"):
    # prepare dataset
    print("Reading data...")
    train = Seq2SeqDataset.from_file(train_src, train_tgt)

    print("Building vocab...")
    train.build_vocab(max_size=300)

    val = Seq2SeqDataset.from_file(val_src, val_tgt, share_fields_from=train)

    src_vocab = train.src_field.vocab
    tgt_vocab = train.tgt_field.vocab

    # save vocab
    with open(os.path.join(save_path, "vocab.src"), "wb") as f:
        dill.dump(src_vocab, f)
    with open(os.path.join(save_path, "vocab.tgt"), "wb") as f:
        dill.dump(tgt_vocab, f)

    print("Source vocab size:", len(src_vocab))
    print("Target vocab size:", len(tgt_vocab))

    # data iterator
    # keep sort=False and shuffle=False to speed up training and reduce memory usage
    train_iterator = BucketIterator(dataset=train, batch_size=batch_size,
                                    sort=False, sort_within_batch=True,
                                    sort_key=lambda x: len(x.src),
                                    shuffle=False, device=device)
    val_iterator = BucketIterator(dataset=val, batch_size=batch_size, train=False,
                                  sort=False, sort_within_batch=True,
                                  sort_key=lambda x: len(x.src),
                                  shuffle=False, device=device)

    return src_vocab, tgt_vocab, train_iterator, val_iterator
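A minimal companion sketch (not from the original project) showing how the two vocab files that load_data() writes with dill could be reloaded later, for example before inference; the "checkpoint" directory name simply mirrors the default above.

import os
import dill

def load_vocab(save_path="checkpoint"):
    # Reload the source/target vocabularies serialized by load_data() above.
    with open(os.path.join(save_path, "vocab.src"), "rb") as f:
        src_vocab = dill.load(f)
    with open(os.path.join(save_path, "vocab.tgt"), "rb") as f:
        tgt_vocab = dill.load(f)
    return src_vocab, tgt_vocab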
Example #2
 def make_data(self):
     train_dataset = Seq2SeqDataset(self._config.train_path)
     dev_dataset = Seq2SeqDataset(self._config.dev_path)
     train_loader = DataLoader(dataset=train_dataset,
                               batch_size=self._config.batch_size,
                               shuffle=True,
                               pin_memory=True)
     dev_loader = DataLoader(dataset=dev_dataset,
                             batch_size=self._config.batch_size,
                             shuffle=False,
                             pin_memory=True)
     return train_loader, dev_loader
Example #3
 def _make_data(self):
     train_dataset = Seq2SeqDataset(self._config.train_path)
     dev_dataset = Seq2SeqDataset(self._config.dev_path)
     train_loader = DataLoader(train_dataset,
                               self._config.batch_size,
                               shuffle=True,
                               num_workers=2)
     dev_loader = DataLoader(dev_dataset,
                             self._config.batch_size,
                             shuffle=False,
                             num_workers=2)
     return train_loader, dev_loader
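The two helpers above differ only in the DataLoader options they pass; both assume a self._config object exposing train_path, dev_path, and batch_size. A hypothetical minimal stand-in for that config is sketched below (the real projects presumably build it from a config file or argparse):

from dataclasses import dataclass

@dataclass
class Config:
    # Attribute names match what make_data()/_make_data() read from self._config.
    train_path: str = "data/train.txt"   # hypothetical paths, not from the original repos
    dev_path: str = "data/dev.txt"
    batch_size: int = 64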
Example #4
    def __init__(self, input_size, hidden_size, batch_size, learning_rate,
                 num_epoch, method):
        dataset = Seq2SeqDataset()

        self.vocab = sorted(set(dataset.full_text))
        self.vocab_size = len(self.vocab)
        self.char2ind, self.ind2char = self.get_vocab()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = self.vocab_size
        self.method = method
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_epoch = num_epoch
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"

        self.dataloader = DataLoader(dataset=dataset,
                                     batch_size=batch_size,
                                     shuffle=True)

        self.encoder = Encoder(input_size, hidden_size, self.vocab_size)
        self.decoder = Decoder(hidden_size, self.output_size, method)

        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)

        self.loss_function = NLLLoss()

        self.encoder_optim = optim.Adam(self.encoder.parameters(),
                                        lr=self.learning_rate)
        self.decoder_optim = optim.Adam(self.decoder.parameters(),
                                        lr=self.learning_rate)
Example #5
def create_seq2seq_dataset(samples, save_path, padding=0):
    dataset = Seq2SeqDataset(samples,
                             padding=padding,
                             max_text_len=300,
                             max_summary_len=80)
    with open(save_path, 'wb') as f:
        pickle.dump(dataset, f)
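A small counterpart sketch (not part of the original code) for reading the dataset back: since create_seq2seq_dataset() pickles the whole Seq2SeqDataset object, unpickling requires the same class to be importable in the loading process.

import pickle

def load_seq2seq_dataset(save_path):
    # Reload a dataset written by create_seq2seq_dataset() above.
    with open(save_path, 'rb') as f:
        return pickle.load(f)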
Example #6
    def train_in_parts(self, train_parts, val, val_iterator, batch_size, start_epoch=0, print_every=100):
        for epoch in range(start_epoch, self.n_epochs):
            # shuffle data each epoch
            random.shuffle(train_parts)

            for train_src_, train_tgt_ in train_parts:
                # create train dataset
                print("Training part [{}] with target [{}]...".format(train_src_, train_tgt_))
                train_ = Seq2SeqDataset.from_file(train_src_, train_tgt_, share_fields_from=val)

                # create iterator
                train_iterator_ = BucketIterator(dataset=train_, batch_size=batch_size,
                                                 sort=False, sort_within_batch=True,
                                                 sort_key=lambda x: len(x.src),
                                                 shuffle=True, device=device)
                # train
                self._train_epoch(epoch, train_iterator_, train=True, print_every=print_every)

                # clean
                del train_
                del train_iterator_
                gc.collect()

            # save
            self.save(epoch)

            # evaluate on validation set after each epoch
            with torch.no_grad():
                self._train_epoch(epoch, val_iterator, train=False, print_every=print_every)
Example #7
def create_seq2seq_dataset_without_save(samples, config, padding=0):
    dataset = Seq2SeqDataset(
        samples, padding=padding,
        max_text_len=config.get('max_text_len') or 300,
        max_summary_len=config.get('max_summary_len') or 80,
        train=False
    )
    return dataset
Example #8
    def infer(self, test_sentence):
        # read raw data to list
        test_sentence = self.convert(test_sentence)
        print(test_sentence)
        lines_raw = [test_sentence]
        lines_prep = [self.preprocess(test_sentence)]

        # prepare dataset
        print("Reading test data...")
        test = Seq2SeqDataset.from_list(lines_prep)
        test.src_field.vocab = self.src_vocab

        # prepare iterator
        test_iterator = BucketIterator(dataset=test,
                                       batch_size=1,
                                       train=False,
                                       sort=False,
                                       sort_within_batch=False,
                                       shuffle=False,
                                       device=device)
        # predict
        with torch.no_grad():
            for i, batch in enumerate(test_iterator):
                # forward through model
                _, _, output = self.model(batch,
                                          has_targets=False,
                                          mask_softmax=1.0,
                                          teacher_forcing=1.0)
                # get top-1
                predicted_values, predicted_indices = torch.max(output, dim=-1)

                # convert predicted vocab indices to an actual sentence
                predicted_seq = [
                    self.tgt_vocab.itos[c]
                    for c in predicted_indices.squeeze(0).tolist()
                ]

                # output is log_softmax so do exp()
                predicted_values = predicted_values.exp()

                # convert to list
                predicted_values_ = predicted_values.squeeze(0).tolist()

                # beam search
                predicted_seq = self.beam_lm(''.join(predicted_seq[1:-1]),
                                             predicted_values_[1:-1],
                                             lines_raw[i])

                # match case and punctuations
                predicted_seq = self.match_case(predicted_seq, lines_raw[i])

                # do some post-processing to match submission output
                print("{} {}".format(i, predicted_seq))
        return predicted_seq
Example #9
def train(args):
    args.save_dir += "_" + args.model_type + ("_lm" if not args.seq2seq else "_seq2seq")
    os.makedirs(args.save_dir, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel

    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    train_loader = DataLoader(train_set, batch_size=args.batch_size, collate_fn=train_set.collate_fn, shuffle=True)

    evaluate(model, valid_set)
    for epoch in range(args.num_epoch):
        model.train()
        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:

                optimizer.zero_grad()
                loss = model.get_loss(**samples)
                loss.backward()
                optimizer.step()
                losses.append(loss.item())
                pbar.set_description("Epoch: %d, Loss: %0.8f, lr: %0.6f" % (epoch + 1, np.mean(losses), optimizer.param_groups[0]['lr']))

        if epoch % args.save_interval == 0:
            torch.save(model, args.save_dir + "/{}_{}.pt".format(args.model_type, epoch + 1))
        evaluate(model, valid_set)
Example #10
 def val_dataloader(self):
     dataset = Seq2SeqDataset(data_files=Path(
         self.test_folder).glob("*.npz"),
                              previous_poses=self.previous_poses,
                              predicted_poses=self.predicted_poses,
                              stride=self.stride,
                              with_context=True,
                              text_folder=self.text_folder,
                              vocab=self.vocab)
     loader = DataLoader(dataset,
                         batch_size=self.batch_size,
                         shuffle=True,
                         collate_fn=dataset.collate_fn)
     return loader
Example #11
 def train_dataloader(self):
     dataset = Seq2SeqDataset(
         Path(self.train_folder).glob("*.npz"),
         self.previous_poses,
         self.predicted_poses,
         self.stride,
         self.with_context,
         text_folder=self.text_folder,
         vocab=self.vocab
     )
     loader = DataLoader(
         dataset, batch_size=self.batch_size, shuffle=True, collate_fn=dataset.collate_fn
     )
     return loader
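train_dataloader and val_dataloader are the hook names PyTorch Lightning looks for, so these methods presumably live on a LightningModule (Example #13 loads a Seq2SeqSystem checkpoint). A hedged sketch of how such a system would typically be driven; the Seq2SeqSystem constructor arguments are assumptions, not documented here:

import pytorch_lightning as pl

# Hypothetical driver: Trainer.fit() calls the train_dataloader()/val_dataloader() hooks above.
system = Seq2SeqSystem(train_folder="data/dataset/train", test_folder="data/dataset/test")
trainer = pl.Trainer(max_epochs=100)
trainer.fit(system)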
Example #12
    def __init__(self,
                 input_size,
                 hidden_size,
                 batch_size,
                 learning_rate,
                 method,
                 num_layers=1):
        dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=dataset,
                                      batch_size=batch_size,
                                      shuffle=True)
        self.vocab = dataset.vocab
        self.output_size = len(self.vocab)
        self.char2index, self.index2char = self.data_index()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_layers = num_layers
        self.method = method

        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.attn = Attn(method, hidden_size)
        self.encoder = Encoder(input_size, hidden_size, self.output_size,
                               self.num_layers)
        self.decoder = Decoder(hidden_size, self.output_size, method,
                               self.num_layers)

        self.attn = self.attn.to(self.device)
        self.encoder = self.encoder.to(self.device)
        self.decoder = self.decoder.to(self.device)

        self.loss_function = NLLLoss()
        self.encoder_optim = torch.optim.Adam(self.encoder.parameters(),
                                              lr=self.learning_rate)
        self.decoder_optim = torch.optim.Adam(self.decoder.parameters(),
                                              lr=self.learning_rate)
Example #13
from dataset import Seq2SeqDataset

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--checkpoint", type=str, required=True)
    parser.add_argument("--dest", type=str, required=True)
    parser.add_argument("--src", type=str)
    parser.add_argument("--text_folder", type=str)
    args = parser.parse_args()
    system = Seq2SeqSystem.load_from_checkpoint(
        args.checkpoint, train_folder=None, test_folder="data/dataset/test")
    system = system.eval().cuda()
    dataset = Seq2SeqDataset([Path(args.src)],
                             previous_poses=system.previous_poses,
                             predicted_poses=system.predicted_poses,
                             stride=system.predicted_poses,
                             with_context=system.with_context,
                             text_folder=args.text_folder,
                             vocab=system.vocab)
    prev_poses = system.previous_poses
    pred_poses = system.predicted_poses

    all_predictions = []
    dataset_iter = iter(dataset)

    x, y, p, w = next(dataset_iter)
    x = x.unsqueeze(1).cuda()
    p = p.unsqueeze(1).cuda()
    w = w.unsqueeze(1).cuda()
    pose = system(x, p, w)
    all_predictions.append(pose.squeeze(1).detach().cpu().numpy())
Example #14
    with open(trainingName, "rb") as FileTraining:
        #print(sys.argv[1])
        trainingData = pickle.load(FileTraining)
    """
    with open(validName,"rb") as FileValidating:
        #print(sys.argv[1])
        validData = pickle.load(FileValidating)
    """
    """
    with open("data/valid.jsonl","r") as f:
        answers = [json.loads(line) for line in f]
        answers = {a['id']: a for a in answers}    
    """

    trainingData = Seq2SeqDataset(trainingData)
    #validData = Seq2SeqDataset(validData)

    with open(embeddingName, 'rb') as f:
        embedding = pickle.load(f)

    encoder = EncoderRNN(len(embedding.vocab), hidden_size, embedding.vectors,
                         BATCH_SIZE).to(device)
    decoder = DecoderRNN(hidden_size, len(embedding.vocab), embedding.vectors,
                         BATCH_SIZE).to(device)

    loader = Data.DataLoader(
        dataset=trainingData,  # torch TensorDataset format
        batch_size=BATCH_SIZE,  # mini batch size
        shuffle=True,  # whether to shuffle the data (shuffling is recommended)
        #num_workers=1,              # use multiple worker processes to read data
Example #15
def generate(args):

    model_class, tokenizer_class = register(args.model_class)

    if args.score_reference:
        args.batch_size = 1
        test_dataset = Seq2SeqDataset(
            tokenizer_class=tokenizer_class,
            tokenizer_path=args.save_dir,
            source_data_path=args.test_source_data_path,
            target_data_path=args.test_target_data_path
        )
    else:
        test_dataset = Seq2SeqDataset(
            tokenizer_class=tokenizer_class,
            tokenizer_path=args.save_dir,
            source_data_path=args.test_source_data_path
        )
    test_dataloader = test_dataset.get_dataloader(batch_size=args.batch_size, shuffle=False)

    model = model_class.from_pretrained(args.save_dir)
    model.to(DEVICE)
    model.eval()

    if not args.debug:
        num_batches = math.ceil(len(test_dataset) / args.batch_size)
        widgets = [
            progressbar.Percentage(), ' | ',
            progressbar.SimpleProgress(), ' ',
            progressbar.Bar('▇'), ' ',
            progressbar.Timer(), ' | ',
            progressbar.ETA()
        ]

        progress = progressbar.ProgressBar(
            max_value=num_batches,
            widgets=widgets,
            redirect_stdout=True
        ).start()

    output_file = open(args.output_path, 'w')

    for itr, data in enumerate(test_dataloader):

        if args.score_reference:
            src_input_ids, src_attn_mask, tgt_input_ids, tgt_attn_mask = (x.to(DEVICE) for x in data)
        else:
            src_input_ids, src_attn_mask = (x.to(DEVICE) for x in data)

        if args.score_reference:
            labels = shift_target_inputs_to_labels(tgt_input_ids, test_dataset.tokenizer.pad_token_id)
            with torch.no_grad():
                output = model(
                    src_input_ids,
                    attention_mask=src_attn_mask,
                    decoder_input_ids=tgt_input_ids,
                    decoder_attention_mask=tgt_attn_mask,
                    labels=labels
                )
            score = output[0].item()
            output_file.write(str(score) + '\n')
        else:
            with torch.no_grad():
                tgt_output_ids = model.generate(
                    src_input_ids,
                    attention_mask=src_attn_mask,
                    num_beams=args.beam_size,
                    num_return_sequences=args.num_return_sequences,
                    max_length=args.max_length
                )
            for seq_ids in tgt_output_ids.to('cpu').numpy().tolist():
                seq_toks = test_dataset.tokenizer.decode(
                    seq_ids,
                    skip_special_tokens=True,
                    clean_up_tokenization_spaces=args.clean_up_tokenization_spaces
                )
                output_file.write(seq_toks + '\n')

        if not args.debug:
            progress.update(itr+1)

    if not args.debug:
        progress.finish()

    output_file.close()
Example #16
                              embedding.vectors,
                              BATCH_SIZE,
                              maxLength,
                              dropout_p=0.1).to(device)
    decoder1.load_state_dict(torch.load(decoderName))

    encoder1 = encoder1.to(device)
    encoder1.eval()
    decoder1 = decoder1.to(device)
    decoder1.eval()

    with open(testDataName, "rb") as FileTesting:
        #print(sys.argv[1])
        testingData = pickle.load(FileTesting)

    testingData = Seq2SeqDataset(testingData)

    def pad_to_len(seqs, to_len, padding=0):
        paddeds = []
        for seq in seqs:
            paddeds.append(seq[:to_len] +
                           [padding] * max(0, to_len - len(seq)))

        return paddeds

    def attention_collate_fn(samples):
        batch = {}
        for key in ['id', 'len_text', 'len_summary']:
            batch[key] = [sample[key] for sample in samples]

        for key in ['text', 'summary', 'attention_mask']:
Example #17
    modelName = sys.argv[4]

    with open(trainingName, "rb") as FileTraining:
        #print(sys.argv[1])
        trainingData = pickle.load(FileTraining)

    with open(validName, "rb") as FileValidating:
        #print(sys.argv[1])
        validData = pickle.load(FileValidating)
    """
    with open("data/valid.jsonl","r") as f:
        answers = [json.loads(line) for line in f]
        answers = {a['id']: a for a in answers}    
    """

    trainingData = Seq2SeqDataset(trainingData)
    validData = Seq2SeqDataset(validData)

    with open(embeddingName, 'rb') as f:
        embedding = pickle.load(f)

    with open('../datasets/seq2seq/config.json', 'r') as f:
        config = json.load(f)

    maxTextLen = config.get('max_text_len')
    maxSummaryLen = config.get('max_summary_len')
    #print(maxTextLen, maxSummaryLen)
    maxLength = max(maxTextLen, maxSummaryLen)

    encoder = AttnEncoderRNN(len(embedding.vocab), hidden_size,
                             embedding.vectors, BATCH_SIZE).to(device)
Example #18
                     help='Whether to enable reconstruction model or not',
                     action='store_true')
    arg.add_argument('--evaluate',
                     help='Evaluate the model using the pretrained model',
                     action='store_true')
    args = arg.parse_args()

    evaluate = True if args.evaluate else False
    copy = True if args.copy else False
    recons = True if args.recons else False

    print('------------ Loading Datasets ------------\n')
    train_descs, train_slogans, valid_descs, valid_slogans, test_descs, test_slogans = load_csv(
        args.dataset_path)

    train_data = Seq2SeqDataset(train_descs, train_slogans, (SRC, TRG))
    test_data = Seq2SeqDataset(test_descs, test_slogans, (SRC, TRG))
    valid_data = Seq2SeqDataset(valid_descs, valid_slogans, (SRC, TRG))

    print('------------ Building Vocab ------------\n')

    SRC.build_vocab(train_data, max_size=args.vocab_size)
    TRG.build_vocab(train_data, max_size=args.vocab_size)

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=args.bs,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device)
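Example #18 relies on SRC and TRG field objects created elsewhere; they behave like torchtext Fields (they expose build_vocab and are consumed by BucketIterator.splits). A hedged sketch of plausible definitions using the legacy torchtext API follows; the special tokens and options are assumptions, not the original project's code:

from torchtext.legacy.data import Field  # in torchtext < 0.9 this lives at torchtext.data

# Plausible field definitions; the original tokenizer and special tokens may differ.
SRC = Field(init_token='<sos>', eos_token='<eos>', lower=True)
TRG = Field(init_token='<sos>', eos_token='<eos>', lower=True)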
Example #19
def train(args):

    logfile = logging.FileHandler(args.save_dir + '/log.txt', mode='w')
    logfile.setFormatter(fmt)
    logger.addHandler(logfile)

    model_class, tokenizer_class = register(args.pretrained_model_path)

    train_dataset = Seq2SeqDataset(
        tokenizer_class=tokenizer_class,
        tokenizer_path=args.pretrained_model_path,
        source_data_path=args.train_source_data_path,
        target_data_path=args.train_target_data_path,
        indivisible_tokens_path=args.indivisible_tokens_path,
        cache_dir=args.cache_dir,
        save_tokenizer=args.save_dir
    )
    train_dataloader = train_dataset.get_dataloader(batch_size=args.batch_size, shuffle=True)
    valid_dataset = Seq2SeqDataset(
        tokenizer_class=tokenizer_class,
        tokenizer_path=args.save_dir,
        source_data_path=args.valid_source_data_path,
        target_data_path=args.valid_target_data_path
    )
    valid_dataloader = valid_dataset.get_dataloader(batch_size=args.valid_batch_size, shuffle=False)

    model = model_class.from_pretrained(args.pretrained_model_path, cache_dir=args.cache_dir)
    if args.indivisible_tokens_path is not None:
        model.resize_token_embeddings(len(train_dataset.tokenizer))
    model.to(DEVICE)
    model.train()
    logger.info(f'model\n{model}')
    num_total_params = sum(p.numel() for p in model.parameters())
    logger.info(f'total parameters: {num_total_params}')

    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    optimizer.zero_grad()
    logger.info(f'optimizer\n{optimizer}')

    if not args.debug:
        train_num_batchs_per_epoch = math.ceil(len(train_dataset) / args.batch_size)
        train_progress_widgets = [
            progressbar.Percentage(), ' | ',
            progressbar.SimpleProgress(), ' | ',
            progressbar.Variable('step', width=0), ' | ',
            progressbar.Variable('loss', width=0, precision=6), ' ',
            progressbar.Bar('▇'), ' ',
            progressbar.Timer(), ' | ',
            progressbar.ETA()
        ]
        valid_num_batchs_per_epoch = math.ceil(len(valid_dataset) / args.valid_batch_size)
        valid_progress_widgets = [
            progressbar.Percentage(), ' | ',
            progressbar.SimpleProgress(), ' ',
            progressbar.Bar('▇'), ' ',
            progressbar.Timer(), ' | ',
            progressbar.ETA()
        ]

    global_step = 1
    best_valid_measure = math.inf
    best_epoch_itr = 0

    for epoch_itr in range(args.max_epoch):

        train_epoch_sum_loss = 0
        train_epoch_average_loss = 0

        logger.info(f'begin training epoch {epoch_itr+1}')
        if not args.debug:
            train_progress = progressbar.ProgressBar(
                max_value=train_num_batchs_per_epoch,
                widgets=train_progress_widgets,
                redirect_stdout=True
            ).start()

        for itr, data in enumerate(train_dataloader):

            src_input_ids, src_attn_mask, tgt_input_ids, tgt_attn_mask = (x.to(DEVICE) for x in data)

            labels = shift_target_inputs_to_labels(tgt_input_ids, train_dataset.tokenizer.pad_token_id)

            output = model(
                input_ids=src_input_ids,
                attention_mask=src_attn_mask,
                decoder_input_ids=tgt_input_ids,
                decoder_attention_mask=tgt_attn_mask,
                labels=labels
            )

            loss = output[0]
            train_epoch_sum_loss += loss * src_input_ids.shape[0]

            normalized_loss = loss / args.update_frequency
            normalized_loss.backward()

            global_step += 1
            if not args.debug:
                train_progress.update(itr+1, step=global_step, loss=loss)

            if (itr + 1) % args.update_frequency == 0:
                optimizer.step()
                optimizer.zero_grad()

        if not args.debug:
            train_progress.finish()

        train_epoch_average_loss = train_epoch_sum_loss.item() / len(train_dataset)
        logger.info(f'average training loss: {train_epoch_average_loss}')

        logger.info(f'begin validation for epoch {epoch_itr+1}')
        model.eval()

        if not args.debug:
            valid_progress = progressbar.ProgressBar(
                max_value=valid_num_batchs_per_epoch,
                widgets=valid_progress_widgets,
                redirect_stdout=True
            ).start()

        valid_measure = 0
        if args.valid_bleu:
            hypotheses = []
            references = []
        else:
            valid_epoch_sum_loss = 0

        for itr, data in enumerate(valid_dataloader):

            src_input_ids, src_attn_mask, tgt_input_ids, tgt_attn_mask = (x.to(DEVICE) for x in data)

            if args.valid_bleu:
                with torch.no_grad():
                    tgt_output_ids = model.generate(
                        src_input_ids,
                        attention_mask=src_attn_mask,
                        num_beams=args.valid_beam_size,
                        max_length=args.valid_max_length
                    )
                for seq_ids in tgt_output_ids.to('cpu').numpy().tolist():
                    seq_toks = valid_dataset.tokenizer.decode(
                        seq_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False
                    )
                    hypotheses.append(seq_toks)
                for seq_ids in tgt_input_ids.to('cpu').numpy().tolist():
                    seq_toks = valid_dataset.tokenizer.decode(
                        seq_ids,
                        skip_special_tokens=True,
                        clean_up_tokenization_spaces=False
                    )
                    references.append(seq_toks)
            else:
                labels = shift_target_inputs_to_labels(tgt_input_ids, valid_dataset.tokenizer.pad_token_id)
                with torch.no_grad():
                    output = model(
                        input_ids=src_input_ids,
                        attention_mask=src_attn_mask,
                        decoder_input_ids=tgt_input_ids,
                        decoder_attention_mask=tgt_attn_mask,
                        labels=labels
                    )
                valid_loss = output[0]
                valid_epoch_sum_loss += valid_loss * src_input_ids.shape[0]

            if not args.debug:
                valid_progress.update(itr+1)

        model.train()
        if not args.debug:
            valid_progress.finish()

        if args.valid_bleu:
            bleu = sacrebleu.corpus_bleu(hypotheses, [references], force=True)
            valid_measure = -bleu.score
            logger.info(f'validation BLEU: {bleu.score}')
        else:
            valid_measure = valid_epoch_sum_loss.item() / len(valid_dataset)
            logger.info(f'validation loss: {valid_measure}')

        if valid_measure < best_valid_measure:
            logger.info('saving new best checkpoints')
            best_valid_measure = valid_measure
            best_epoch_itr = epoch_itr + 1
            model.save_pretrained(args.save_dir)

        if (epoch_itr + 1 - best_epoch_itr) > args.patience:
            logger.info(f'early stop since validation performance hasn\'t improved for the last {args.patience} epochs')
            break
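Both Example #15 and Example #19 call a shift_target_inputs_to_labels helper whose definition is not shown. One plausible, purely hypothetical implementation, assuming the usual Hugging Face convention that labels are the next-token-shifted targets with pad positions masked as -100:

def shift_target_inputs_to_labels(tgt_input_ids, pad_token_id):
    # Shift target ids left by one step so the decoder at position t predicts token t+1,
    # fill the trailing slot with the pad id, then mask all pad positions with -100
    # so they are ignored by the cross-entropy loss computed inside the model.
    labels = tgt_input_ids.new_full(tgt_input_ids.shape, pad_token_id)
    labels[:, :-1] = tgt_input_ids[:, 1:]
    labels[labels == pad_token_id] = -100
    return labels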
Example #20
                        sentense_ints = vocab_to_int["<UNK>"]
            else:
                sentense_ints = vocab_to_int["<UNK>"]
                # print(word)
#         if eos:
#             sentense_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentense_ints)
    ints.append(vocab_to_int["<EOS>"])
    return ints


# seq2seq_preprocess(data_path_text,data_path_motion)
print("preprocess finished")
extro_data_test_path = 'head.npz'
word2idx = dic
test_set = Seq2SeqDataset(extro_data_test_path, word2idx)

test_dataloader = DataLoader(test_set,
                             batch_size=1,
                             collate_fn=seq2seq_collate_fn)

#%%
# building model

dof_num = 4
embed_dim = 100
learning_rate = 1e-3
encoder_hidden_dim = 32
decoder_hidden_dim = 32
model_save_folder = './saved_models/'
Example #21
    def predict(self, test_path, test_cleaned_path, out_path):
        # read raw data to list
        lines_id = []
        lines_raw = []
        lines_cleaned = []
        lines_prep = []
        with open(test_path, 'r') as f, open(test_cleaned_path, 'r') as fc:
            for line in f:
                line_id = line[:3]
                line_seq = line[4:]
                lines_id.append(line_id)
                lines_raw.append(line_seq)
                lines_prep.append(self.preprocess(line_seq))
            for line in fc:
                lines_cleaned.append(line[4:])

        # prepare dataset
        print("Reading test data...")
        test = Seq2SeqDataset.from_list(lines_prep)
        test.src_field.vocab = self.src_vocab

        # prepare iterator
        test_iterator = BucketIterator(dataset=test,
                                       batch_size=1,
                                       train=False,
                                       sort=False,
                                       sort_within_batch=False,
                                       shuffle=False,
                                       device=device)

        # predict
        with open(out_path, 'w') as writer:
            with torch.no_grad():
                for i, batch in enumerate(test_iterator):
                    # forward through model
                    _, _, output = self.model(batch,
                                              has_targets=False,
                                              mask_softmax=1.0,
                                              teacher_forcing=1.0)
                    print(output.shape)
                    # get top-1
                    predicted_values, predicted_indices = torch.max(output,
                                                                    dim=-1)
                    print(predicted_values.shape)
                    print(predicted_indices.shape)

                    # convert predicted vocab indices to an actual sentence
                    predicted_seq = [
                        self.tgt_vocab.itos[c]
                        for c in predicted_indices.squeeze(0).tolist()
                    ]
                    # print('predicted_seq')
                    # print(predicted_seq)

                    # output is log_softmax so do exp()
                    predicted_values = predicted_values.exp()
                    # print('predicted_values')
                    # print(predicted_values)

                    # convert to list
                    predicted_values_ = predicted_values.squeeze(0).tolist()

                    # beam search
                    predicted_seq = self.beam_lm(''.join(predicted_seq[1:-1]),
                                                 predicted_values_[1:-1],
                                                 lines_raw[i])

                    # match case and punctuations
                    predicted_seq = self.match_case(predicted_seq,
                                                    lines_raw[i])

                    # do some post-processing to match submission output
                    predicted_seq = self.match_output(predicted_seq,
                                                      lines_cleaned[i])
                    print("{} {}".format(i, predicted_seq))

                    # write to file with line_id
                    writer.write(lines_id[i] + ',' + predicted_seq + '\n')
Example #22
def train(args):
    if args.logdir is None:
        args.logdir = "Models-{}".format(time.strftime("%Y%m%d-%H%M%S"))
    task = "lm" if not args.seq2seq else "seq2seq"
    args.logdir += "_" + args.model_type + "_" + task
    os.makedirs(args.logdir, exist_ok=True)
    os.makedirs(os.path.join(args.logdir, "models"), exist_ok=True)
    print("Experiment dir : {}".format(args.logdir))

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(args.logdir, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    device = "cuda:" + str(args.gpuid) if torch.cuda.is_available() else "cpu"

    mem_crammer = []

    if args.model_type == "lstm":
        from lstm import LMModel, Seq2SeqModel
    elif args.model_type == "transformer":
        from transformer import LMModel, Seq2SeqModel

    if args.seq2seq:
        train_set = Seq2SeqDataset(device=device)
        valid_set = Seq2SeqDataset(split="valid", device=device)
        model = Seq2SeqModel(args, train_set.dictionary).to(device)
    else:
        train_set = LMDataset(device=device)
        valid_set = LMDataset(split="valid", device=device)
        model = LMModel(args, train_set.dictionary).to(device)

    optimizer = optim.Adam(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.weight_decay)

    warmup_epoch = args.num_epoch * 0.1
    scheduler = ExponentialLR(optimizer,
                              0.1**(1 / (args.num_epoch - warmup_epoch)))
    iter_per_epoch = (len(train_set) + args.batch_size - 1) // args.batch_size
    warmup_scheduler = WarmUpLR(optimizer, iter_per_epoch * warmup_epoch)

    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              collate_fn=train_set.collate_fn,
                              shuffle=True)

    bestppl = 1e9
    for epoch in range(args.num_epoch):
        model.train()

        if args.cram:
            while True:
                try:
                    junk = torch.rand((9999, 9999), dtype=float, device=device)
                except RuntimeError:  # typically a CUDA out-of-memory error
                    with torch.cuda.device(device):
                        torch.cuda.empty_cache()
                    break
                mem_crammer.append(junk)

        with tqdm(train_loader, desc="training") as pbar:
            losses = []
            for samples in pbar:
                if epoch < warmup_epoch:
                    warmup_scheduler.step()
                optimizer.zero_grad()

                while True:
                    success = True
                    try:
                        loss = model.get_loss(**samples)
                        loss.backward()
                        optimizer.step()
                    except RuntimeError:  # typically CUDA OOM: free a cram block and retry
                        del mem_crammer[-1]
                        with torch.cuda.device(device):
                            torch.cuda.empty_cache()
                        success = False
                        optimizer.zero_grad()
                    if success:
                        break

                losses.append(loss.item())
                pbar.set_description("Epoch: %d, Loss: %0.8f, lr: %0.6f" %
                                     (epoch + 1, np.mean(losses),
                                      optimizer.param_groups[0]['lr']))

            logging.info(
                "Epoch: %d, Loss: %0.8f, lr: %0.6f" %
                (epoch + 1, np.mean(losses), optimizer.param_groups[0]['lr']))

        if epoch % args.save_interval == 0:
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type,
                                                      epoch + 1))
            torch.save(model, savepath)
            logging.info("Saving to {}".format(savepath))

        if task == "lm":
            print("好    -->", model.generate("好", beam_size=3, device=device))
            print("秋水  -->", model.generate("秋水", beam_size=3, device=device))
            print("寒烟翠-->", model.generate("寒烟翠", beam_size=3, device=device))
        elif task == "seq2seq":
            print("改革春风吹满地-->",
                  model.generate("改革春风吹满地", beam_size=2, device=device))
            print("牛津大学聪明人不及蟾蜍一半-->",
                  model.generate("牛津大学聪明人不及蟾蜍一半", beam_size=2, device=device))
            print("一支穿云箭,青天白日重新现-->",
                  model.generate("一支穿云箭,青天白日重新现", beam_size=2, device=device))

        loss, ppl = evaluate(model, valid_set, False)
        logging.info("Valid, Loss: %0.8f, ppl: %0.8f" % (loss, ppl))

        if ppl < bestppl:
            bestppl = ppl
            savepath = os.path.join(
                args.logdir, "models/{}_{}.pt".format(args.model_type, task))
            torch.save(model, savepath)
            logging.info("Best ppl! Saving to {}".format(savepath))

        if epoch >= warmup_epoch:
            scheduler.step()
Example #23
# coding=utf-8

from tool.config import loadConfig
from model import Seq2SeqModel
from dataset import Seq2SeqDataset

args    = loadConfig('config.ini')
dataset = Seq2SeqDataset(args)
model   = Seq2SeqModel(args)

if args.mode == 'train':
    print('training')
    train_set = dataset.getDatas('train')
    eval_set = dataset.getDatas('eval')
    model.train(train_set, eval_set)
elif args.mode == 'eval':
    print('evaluation')
    eval_set = dataset.getDatas('eval')
    model.eval(eval_set)
elif args.mode == 'predict':
    print('prediction')
    eval_set = dataset.getDatas('eval')
    print(dataset.ftk.convert_ids_to_tokens(eval_set[0][0]))
    pred_id = model.predict(eval_set[0][0], eval_set[2][0])
    print(dataset.ftk.convert_ids_to_tokens(pred_id))
elif args.mode == 'freeze':
    print('freezing')
    model.freeze()
elif args.mode == 'infer':
    print('infer')
    eval_set = dataset.getDatas('eval')
Example #24
    intro_data_train_path = './data/intro_seq2seq_dataset_train.npz'
    intro_data_valid_path = './data/intro_seq2seq_dataset_valid.npz'
    intro_data_test_path = './data/intro_seq2seq_dataset_test.npz'

    natural_data_train_path = './data/natural_seq2seq_dataset_train.npz'
    natural_data_valid_path = './data/natural_seq2seq_dataset_valid.npz'
    natural_data_test_path = './data/natural_seq2seq_dataset_test.npz'
    # data_path = './data/extro_seq2seq_dataset.npz'

    dic = json.load(open("vocab_to_int.txt"))
    word2idx = dic  # load word map

    # torch.cuda.empty_cache()

    train_set = Seq2SeqDataset(extro_data_train_path, word2idx)
    valid_set = Seq2SeqDataset(extro_data_valid_path, word2idx)
    test_set = Seq2SeqDataset(extro_data_test_path, word2idx)

    # training_set = EmotionDataLoaderStart(X_train, y_train, tag_train, pad_len, word2id)
    train_loader = DataLoader(train_set,
                              batch_size,
                              shuffle=True,
                              collate_fn=seq2seq_collate_fn)

    valid_loader = DataLoader(valid_set,
                              batch_size,
                              shuffle=True,
                              collate_fn=seq2seq_collate_fn)

    test_dataloader = DataLoader(test_set,
Example #25
# test_data_path = './train_valid_test_data/'+ personality +'_seq2seq_dataset_test.npz'

extro_data_train_sample_path = './data/extro_seq2seq_dataset_train_sample.npz'

extro_data_test_path = './data/extro_seq2seq_dataset_test.npz'

intro_data_test_path = './data/intro_seq2seq_dataset_test.npz'

natural_data_test_path = './data/natural_seq2seq_dataset_test.npz'

dic = json.load(open("vocab_to_int.txt"))
word2idx = dic  # load word map

idx2word = {v: k for k, v in word2idx.items()}

train_set = Seq2SeqDataset(extro_data_train_sample_path, word2idx)

test_set = Seq2SeqDataset(extro_data_test_path, word2idx)

test_dataloader = DataLoader(test_set,
                             batch_size=1,
                             collate_fn=seq2seq_collate_fn)

#%%
# building model

dof_num = 4
embed_dim = 100
learning_rate = 1e-3
encoder_hidden_dim = 32
decoder_hidden_dim = 32