Example #1
    def _get_train_iter(self,
                        shard_dataset: textdata.Dataset,
                        batch_size: int,
                        world_size: int = 1) -> BatchIterator:
        """
        Generate data batch iterator for training data. If distributed training
        is enabled, the dataset will be partitioned first. We use BucketIterator
        here to pool together examples with a similar size length to reduce the
        padding required for each batch.

        Args:
            shard_dataset (str): sharded training or evaluation dataset
            batch_size (int): batch size
            rank (int): used for distributed training, the rank of current Gpu,
                don't set it to anything but 0 for non-distributed training
            world_size (int): used for distributed training, total number of Gpu
        """
        # Compute the per-worker batch size; if the integer division would give 0
        # (batch_size < world_size), fall back to the full batch size.
        batch_size = batch_size // world_size or batch_size

        return BatchIterator(
            textdata.BucketIterator(
                shard_dataset,
                batch_size=batch_size,
                device="cuda:{}".format(torch.cuda.current_device())
                if cuda.CUDA_ENABLED else "cpu",
                sort_within_batch=self.sort_within_batch,
                repeat=False,
                sort_key=self.sort_key,
                shuffle=self.shuffle,
            ),
            self._postprocess_batch,
            num_batches=math.ceil(len(shard_dataset) / float(batch_size)),
        )
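
For reference, here is a minimal, self-contained sketch (legacy torchtext API; the field and file names are hypothetical) of a BucketIterator configured like the one above, without the surrounding wrapper class:

# Minimal sketch (hypothetical field/file names) of a standalone BucketIterator.
from torchtext import data as textdata

TEXT = textdata.Field(tokenize=str.split, batch_first=True)
dataset = textdata.TabularDataset(path="train.tsv", format="tsv",
                                  fields=[("text", TEXT)])
TEXT.build_vocab(dataset)

iterator = textdata.BucketIterator(
    dataset,
    batch_size=32,
    device="cpu",
    sort_key=lambda x: len(x.text),  # bucket examples of similar length together
    sort_within_batch=True,
    repeat=False,
    shuffle=True,
)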
Example #2
    def __init__(self, config, w2v_stoi, w2v_vectors, device):
        self.config = config
        self.w2v_stoi = w2v_stoi
        self.w2v_vectors = w2v_vectors
        print(self.w2v_vectors.shape)
        self.device = device
        self.SRC = data.Field(tokenize=lambda x: x.split(),
                              unk_token='<unk>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              include_lengths=True)
        self.TRG = data.Field(
            tokenize=lambda x: x.split(),
            unk_token='<unk>',
            pad_token='<pad>',
            lower=True,
            batch_first=True,
        )
        self.train_data = TranslationDataset(
            path='dataset/klue-ner-v1_train_cleaned_tokenized',
            exts=('.src', '.trg'),
            fields=(self.SRC, self.TRG))
        self.test_data = TranslationDataset(
            path='dataset/klue-ner-v1_dev_cleaned_tokenized',
            exts=('.src', '.trg'),
            fields=(self.SRC, self.TRG))

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator = data.BucketIterator(
            self.train_data,
            batch_size=self.config['batch_size'],
            device=device,
            sort_key=lambda x: len(x.src),
            sort_within_batch=True)
        self.test_iterator = data.BucketIterator(
            self.test_data,
            batch_size=self.config['batch_size'],
            device=device,
            sort_key=lambda x: len(x.src),
            sort_within_batch=True)
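
Because SRC is built with include_lengths=True while TRG is not, batches from these iterators expose src as a (token_ids, lengths) pair and trg as a plain tensor. A minimal sketch (hypothetical loop, no model) of consuming them:

# Minimal sketch (hypothetical loop): consuming the iterators built above.
# With include_lengths=True, batch.src is a (token_ids, lengths) pair;
# batch.trg is a single LongTensor because TRG omits include_lengths.
def iterate_one_epoch(iterator):
    for batch in iterator:
        src_ids, src_lengths = batch.src      # (B, T_src), (B,)
        trg_ids = batch.trg                   # (B, T_trg)
        # a model forward/backward step would go here
        print(src_ids.shape, src_lengths.shape, trg_ids.shape)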
Example #3
def main():
    global WORD
    WORD = data.Field(include_lengths=True,
                      batch_first=True,
                      eos_token=None,
                      init_token=None)
    LABEL = data.Field(sequential=False, batch_first=True)
    TREE = data.RawField(postprocessing=ListOpsDataset.tree_field(WORD))
    TREE.is_target = False
    train = ListOpsDataset(
        "data/train_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < config["train_len"],
    )
    WORD.build_vocab(train)
    LABEL.build_vocab(train)
    valid = ListOpsDataset(
        "data/test_d20s.tsv",
        (("word", WORD), ("label", LABEL), ("tree", TREE)),
        filter_pred=lambda x: 5 < len(x.word) < 150,
    )

    train_iter = TokenBucket(train,
                             batch_size=1500,
                             device="cuda:0",
                             key=lambda x: len(x.word))
    train_iter.repeat = False
    valid_iter = data.BucketIterator(valid,  # batch the validation split
                                     batch_size=50,
                                     train=False,
                                     sort=False,
                                     device="cuda:0")

    NT = 1
    T = len(WORD.vocab)
    V = T

    if True:
        tree_lstm = TreeLSTM(config["H"],
                             len(WORD.vocab) + 100, len(LABEL.vocab)).cuda()
        for p in tree_lstm.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        model = SpanLSTM(NT, len(WORD.vocab), config["H"]).cuda()
        for p in model.parameters():
            if p.dim() > 1:
                torch.nn.init.xavier_uniform_(p)

        wandb.watch((model, tree_lstm))
        print(wandb.config)
        tree = run_train(train_iter, valid_iter, model, tree_lstm, V)
    else:
        print("loading")
        model, tree_lstm = torch.load("cp.yoyo.model")
        print(valid_sup(valid_iter, model, tree_lstm, V))
Example #4
def load_dataset(config,
                 train_pos='train.hh',
                 train_neg='train.fb',
                 dev_pos='dev.hh',
                 dev_neg='dev.fb',
                 test_pos='test.hh',
                 test_neg='test.fb'):
    logger = logging.getLogger(__name__)
    root = config.data_path
    TEXT = data.Field(batch_first=True, eos_token='<eos>')

    dataset_fn = lambda name: data.TabularDataset(
        path=root + name, format='tsv', fields=[('text', TEXT)])

    train_pos_set, train_neg_set = map(dataset_fn, [train_pos, train_neg])
    dev_pos_set, dev_neg_set = map(dataset_fn, [dev_pos, dev_neg])
    test_pos_set, test_neg_set = map(dataset_fn, [test_pos, test_neg])

    TEXT.build_vocab(train_pos_set, train_neg_set, min_freq=config.min_freq)

    if config.load_pretrained_embed:
        start = time.time()

        vectors = torchtext.vocab.GloVe('6B',
                                        dim=config.embed_size,
                                        cache=config.pretrained_embed_path)
        TEXT.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        logger.info('vectors %s', TEXT.vocab.vectors.size())

        logger.info('load embedding took {:.2f} s.'.format(time.time() -
                                                           start))

    vocab = TEXT.vocab

    dataiter_fn = lambda dataset, train: data.BucketIterator(
        dataset=dataset,
        batch_size=config.batch_size,
        shuffle=train,
        repeat=train,
        sort_key=lambda x: len(x.text),
        sort_within_batch=False,
        device=config.device)

    train_pos_iter, train_neg_iter = map(lambda x: dataiter_fn(x, True),
                                         [train_pos_set, train_neg_set])
    dev_pos_iter, dev_neg_iter = map(lambda x: dataiter_fn(x, False),
                                     [dev_pos_set, dev_neg_set])
    test_pos_iter, test_neg_iter = map(lambda x: dataiter_fn(x, False),
                                       [test_pos_set, test_neg_set])

    train_iters = DatasetIterator(train_pos_iter, train_neg_iter)
    dev_iters = DatasetIterator(dev_pos_iter, dev_neg_iter)
    test_iters = DatasetIterator(test_pos_iter, test_neg_iter)

    return train_iters, dev_iters, test_iters, vocab
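
load_dataset only touches a handful of config attributes. A hedged sketch of calling it with a bare namespace (all values are placeholders, and the data files are assumed to exist under data_path):

# Hedged sketch (hypothetical values): the config attributes load_dataset reads.
from types import SimpleNamespace

config = SimpleNamespace(
    data_path='data/',                 # directory containing train.hh, train.fb, ...
    min_freq=2,                        # vocabulary frequency cutoff
    load_pretrained_embed=False,       # set True to load GloVe vectors
    embed_size=100,
    pretrained_embed_path='.vector_cache/',
    batch_size=64,
    device='cpu',
)
train_iters, dev_iters, test_iters, vocab = load_dataset(config)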
Example #5
def make_data_iter(dataset: Dataset,
                   batch_size: int,
                   batch_type: str = "sentence",
                   train: bool = False,
                   shuffle: bool = False) -> Iterator:
    """
    Returns a torchtext iterator for a torchtext dataset.

    :param dataset: torchtext dataset containing src and optionally trg
    :param batch_size: size of the batches the iterator prepares
    :param batch_type: measure batch size by sentence count or by token count
    :param train: whether it is training time; when False, bucketing,
        sorting within batches and shuffling are disabled
    :param shuffle: whether to shuffle the data before each epoch
        (has no effect when train is False)
    :return: torchtext iterator
    """

    batch_size_fn = token_batch_size_fn if batch_type == "token" else None

    if train:
        # optionally shuffle and sort during training
        data_iter = data.BucketIterator(repeat=False,
                                        sort=False,
                                        dataset=dataset,
                                        batch_size=batch_size,
                                        batch_size_fn=batch_size_fn,
                                        train=True,
                                        sort_within_batch=True,
                                        sort_key=lambda x: len(x.src),
                                        shuffle=shuffle)
    else:
        # don't sort/shuffle for validation/inference
        data_iter = data.BucketIterator(repeat=False,
                                        dataset=dataset,
                                        batch_size=batch_size,
                                        batch_size_fn=batch_size_fn,
                                        train=False,
                                        sort=False)

    return data_iter
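
When batch_type == "token", the iterator above sizes batches with a batch_size_fn instead of counting sentences. The actual token_batch_size_fn is not shown here; the following is only a hedged sketch of the general shape torchtext expects, where the function is called as fn(new_example, count_in_batch, size_so_far) and its return value is compared against batch_size:

# Hedged sketch, not the token_batch_size_fn referenced above.
def simple_token_batch_size_fn(new, count, size_so_far):
    # grow the batch until the running total of source tokens reaches batch_size
    return size_so_far + len(new.src)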
Example #6
    def train(self):
        """Reads the .csv files for the train and test sets and trains a Transformer
           architecture. Performs testing by calling the private _test() method and
           saves the results and metrics in the runs folder."""
        data_path = "Datasets"
        data_file_names = ["train_dataset.csv.gz", "test_dataset.csv.gz"]
        for i, data_file_name in enumerate(data_file_names):
            data_file_name = os.path.join(data_path, data_file_name)
            uncompressed_data_file_name = ".".join(
                data_file_name.split(".")[:-1]
            )
            if data_file_name.split(".")[-1] == "gz" and not os.path.exists(
                uncompressed_data_file_name
            ):
                print("Uncompressing data")
                with gzip.open(data_file_name, "rb") as f_in:
                    with open(uncompressed_data_file_name, "wb") as f_out:
                        shutil.copyfileobj(f_in, f_out)
            data_file_names[i] = uncompressed_data_file_name.split("/")[1]
        print("Extracting datasets")
        train_val_dataset, test_dataset = data.TabularDataset.splits(
            data_path,
            train="train_dataset.csv",
            test="test_dataset.csv",
            fields=(("label", self.label_field), ("tweet", self.text_field)),
            format="csv",
            skip_header=True,
        )
        train_dataset, val_dataset = train_val_dataset.split(0.9)
        optimizer = torch.optim.Adam(
            lr=self.lr, params=self.model.parameters()
        )
        scheduler = torch.optim.lr_scheduler.LambdaLR(
            optimizer,
            lambda i: min(i / (self.lr_warmup / self.batch_size), 1.0),
        )

        print("Creating batch iterators")
        val_data_iter = data.BucketIterator(
            val_dataset,
            batch_size=self.test_batch_size,
            device=self.device,
            shuffle=True,
        )
        train_data_iter = data.BucketIterator(
            train_dataset,
            batch_size=self.batch_size,
            device=self.device,
            shuffle=True,
        )
        test_data_iter = data.BucketIterator(
            test_dataset,
            batch_size=self.test_batch_size,
            device=self.device,
            shuffle=True,
        )
        avg_val_loss, avg_val_rec = self._test(0, val_data_iter)
        print(
            f"AvgRec: {round(avg_val_rec, 4)},\tavg loss: {round(avg_val_loss, 6)},\tepoch: 0\n"
        )
        log_count = 0
        step_loss = 0
        tot_loss = 0
        for epoch in range(self.epochs):
            self.model.train()
            for batch in tqdm(train_data_iter):
                optimizer.zero_grad()
                label = batch.label - 2
                output = self.model(batch.tweet)
                loss = F.nll_loss(output, label[0])
                step_loss += loss.item()
                loss.backward()
                if self.gradient_clipping > 0.0:
                    nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.gradient_clipping
                    )
                optimizer.step()
                scheduler.step()
                log_count += len(batch)
                if log_count >= self.log_step:
                    step_avg_loss = step_loss / log_count
                    tot_loss += step_loss
                    print(
                        f"Train loss: {round(step_avg_loss, 6)}\tEpoch: {epoch}"
                    )
                    log_count = 0
                    step_loss = 0.0

            torch.save(self.model.state_dict(), self.model_name)
            avg_train_loss = tot_loss / len(train_dataset)
            self.writer.add_scalar(
                "train_avg_loss", avg_train_loss, global_step=epoch
            )
            print(f"Train avg loss: {round(avg_train_loss, 6)}")
            avg_val_loss, avg_val_rec = self._test(epoch + 1, val_data_iter)
            self.writer.add_scalar(
                "avg_val_rec", avg_val_rec, global_step=epoch
            )
            self.writer.add_scalar(
                "avg_val_loss", avg_val_loss, global_step=epoch
            )
            print(
                f"AvgRec: {round(avg_val_rec, 4)},\tavg loss: {round(avg_val_loss, 6)},\tepoch: {epoch}\n"
            )
            tot_loss = 0
            step_avg_loss = 0
            log_count = 0
        print("Performance on test dataset")
        test_avg_loss, test_avg_rec = self._test(-1, test_data_iter)
        test_avg_loss = round(test_avg_loss, 6)
        self.writer.add_text("results", f"test_avg_loss {test_avg_loss}")
        self.writer.add_text("results", f"test_avg_rec {test_avg_rec}")
        print(
            f"AvgRec: {round(test_avg_rec, 4)},\tavg loss: {test_avg_loss},\tepoch: {epoch}\n"
        )
        print(
            f"Best val AvgRec:{round(self.avg_rec[1], 3)} at epoch:{self.avg_rec[0]}"
        )
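
The LambdaLR schedule above multiplies the base learning rate by min(i / (lr_warmup / batch_size), 1.0), i.e. a linear warm-up over lr_warmup / batch_size optimizer steps. A small self-contained sketch with placeholder numbers:

# Self-contained sketch (placeholder numbers) of the linear warm-up pattern above.
import torch

lr, lr_warmup, batch_size = 1e-4, 10_000, 16
param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=lr)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda i: min(i / (lr_warmup / batch_size), 1.0))

for step in range(3):
    optimizer.step()
    scheduler.step()
    print(step, scheduler.get_last_lr())  # learning rate ramps linearly towards 1e-4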
Example #7
def caption_iterator(cfg, batch_size, phase):
    print(f'Constructing caption_iterator for "{phase}" phase')
    spacy_en = spacy.load('en')

    def tokenize_en(txt):
        return [token.text for token in spacy_en.tokenizer(txt)]

    CAPTION = data.ReversibleField(tokenize='spacy',
                                   init_token=cfg.start_token,
                                   eos_token=cfg.end_token,
                                   pad_token=cfg.pad_token,
                                   lower=True,
                                   batch_first=True,
                                   is_target=True)
    INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)

    # the order has to be the same as in the table
    fields = [
        ('video_id', None),
        ('caption', CAPTION),
        ('start', None),
        ('end', None),
        ('duration', None),
        ('phase', None),
        ('idx', INDEX),
    ]

    dataset = data.TabularDataset(
        path=cfg.train_meta_path,
        format='tsv',
        skip_header=True,
        fields=fields,
    )
    CAPTION.build_vocab(dataset.caption,
                        min_freq=cfg.min_freq_caps,
                        vectors=cfg.word_emb_caps)
    train_vocab = CAPTION.vocab

    if phase == 'val_1':
        dataset = data.TabularDataset(path=cfg.val_1_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)
    elif phase == 'val_2':
        dataset = data.TabularDataset(path=cfg.val_2_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)
    elif phase == 'learned_props':
        dataset = data.TabularDataset(path=cfg.val_prop_meta_path,
                                      format='tsv',
                                      skip_header=True,
                                      fields=fields)

    # sort_key = lambda x: data.interleave_keys(len(x.caption), len(y.caption))
    datasetloader = data.BucketIterator(dataset,
                                        batch_size,
                                        sort_key=lambda x: 0,
                                        device=torch.device(cfg.device),
                                        repeat=False,
                                        shuffle=True)
    return train_vocab, datasetloader
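
The (name, None) entries in the fields list above tell TabularDataset to parse those TSV columns but drop them from the resulting examples. A tiny self-contained demonstration (toy file, hypothetical path):

# Sketch (toy TSV, hypothetical path): columns mapped to None in the fields list
# are read from the file but not attached to the parsed examples.
from torchtext import data

TEXT = data.Field(batch_first=True, lower=True)
IDX = data.Field(sequential=False, use_vocab=False, batch_first=True)
fields = [('video_id', None), ('caption', TEXT), ('idx', IDX)]

with open('toy_meta.tsv', 'w') as f:
    f.write('video_id\tcaption\tidx\n')
    f.write('v_001\ta man is cooking\t0\n')

ds = data.TabularDataset('toy_meta.tsv', format='tsv', skip_header=True, fields=fields)
print(vars(ds.examples[0]))  # only 'caption' and 'idx' are kept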
Example #8
    def __init__(
        self, root, 
        train_path, test_path, predict_path,
        batch_size=64,
        valid_ratio=.2,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True,
        rm = re.compile('[:;\'\"\[\]\(\)\.,@]') # special characters to strip
    ):
        super().__init__()
        # Preprocessing is handled here.
        # Define the data Fields.
        self.id = data.Field( # column not used for training
            sequential=False, 
            use_vocab=False,
            unk_token=None
        )
        self.text = data.Field( 
            use_vocab=True,
            tokenize=word_tokenize,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None
        )
        self.label = data.Field(
            sequential=False, # 0 or 1
            use_vocab=False,
            unk_token=None,
            is_target=True
        )
        
        # Load the data
        # ratings_train.txt : train+valid
        train, valid = data.TabularDataset(
            path = root + train_path,
            format ='tsv',
            fields = [
                ('id', self.id),
                ('text', self.text),
                ('label', self.label)],
            skip_header=True
        ).split(split_ratio=(1 - valid_ratio))

        # ratings_test.txt : test
        test = data.TabularDataset(
            path = root + test_path,
            format='tsv',
            fields=[
                ('id', self.id),
                ('text', self.text),
                ('label', self.label)],
            skip_header=True
        )

        # ko_data.csv : Kaggle submission
        predict = data.TabularDataset(
            path = root + predict_path,
            format='csv',
            fields=[
                ('id', self.id),
                ('text', self.text)],
            skip_header=True
        )

        # Batchify (load onto DataLoaders)
        # train+valid loader
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            shuffle=shuffle,
            sort_key=lambda x: len(x.text), # sort by length, then split into batches
            sort_within_batch=True, # sort within each mini-batch
        )

        # test_loader
        self.test_loader = data.BucketIterator(
            test,
            batch_size=batch_size,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            shuffle=False,
            sort_key=lambda x: len(x.text),
            sort_within_batch=False,
        )

        # predict_loader
        self.predict_loader = data.BucketIterator(
            predict,
            batch_size=batch_size,
            device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
            shuffle=False
        )

        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq) # build the vocabulary
Example #9
def main():
    print("Using device: {}" "\n".format(str(device)))

    # Load the training dataset, and create a dataloader to generate a batch.
    textField = data.Field(lower=True,
                           include_lengths=True,
                           batch_first=True,
                           tokenize=student.tokenise,
                           preprocessing=student.preprocessing,
                           postprocessing=student.postprocessing,
                           stop_words=student.stopWords)
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset(
        'train.json', 'json', {
            'reviewText': ('reviewText', textField),
            'rating': ('rating', labelField),
            'businessCategory': ('businessCategory', labelField)
        })

    textField.build_vocab(dataset, vectors=student.wordVectors)

    # Allow training on the entire dataset, or split it for training and validation.
    if student.trainValSplit == 1:
        trainLoader = data.BucketIterator(dataset,
                                          shuffle=True,
                                          batch_size=student.batchSize,
                                          sort_key=lambda x: len(x.reviewText),
                                          sort_within_batch=True)
    else:
        train, validate = dataset.split(split_ratio=student.trainValSplit)

        trainLoader, valLoader = data.BucketIterator.splits(
            (train, validate),
            shuffle=True,
            batch_size=student.batchSize,
            sort_key=lambda x: len(x.reviewText),
            sort_within_batch=True)

    # Get model and optimiser from student.
    net = student.net.to(device)
    lossFunc = student.lossFunc
    optimiser = student.optimiser

    # Train.
    for epoch in range(student.epochs):
        runningLoss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
            length = batch.reviewText[1].to(device)
            rating = batch.rating.to(device)
            businessCategory = batch.businessCategory.to(device)

            # PyTorch calculates gradients by accumulating contributions to them
            # (useful for RNNs).  Hence we must manually set them to zero before
            # calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            ratingOutput, categoryOutput = net(inputs, length)
            loss = lossFunc(ratingOutput, categoryOutput, rating,
                            businessCategory)

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            runningLoss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, runningLoss / 32))
                runningLoss = 0

    # Save model.
    torch.save(net.state_dict(), 'savedModel.pth')
    print("\n" "Model saved to savedModel.pth")

    # Test on validation data if it exists.
    if student.trainValSplit != 1:
        net.eval()

        correctRatingOnlySum = 0
        correctCategoryOnlySum = 0
        bothCorrectSum = 0
        with torch.no_grad():
            for batch in valLoader:
                # Get a batch and potentially send it to GPU memory.
                inputs = textField.vocab.vectors[batch.reviewText[0]].to(
                    device)
                length = batch.reviewText[1].to(device)
                rating = batch.rating.to(device)
                businessCategory = batch.businessCategory.to(device)

                # Convert network output to integer values.
                ratingOutputs, categoryOutputs = student.convertNetOutput(
                    *net(inputs, length))

                # Calculate performance
                correctRating = rating == ratingOutputs.flatten()
                correctCategory = businessCategory == categoryOutputs.flatten()

                correctRatingOnlySum += torch.sum(correctRating
                                                  & ~correctCategory).item()
                correctCategoryOnlySum += torch.sum(correctCategory
                                                    & ~correctRating).item()
                bothCorrectSum += torch.sum(correctRating
                                            & correctCategory).item()

        correctRatingOnlyPercent = correctRatingOnlySum / len(validate)
        correctCategoryOnlyPercent = correctCategoryOnlySum / len(validate)
        bothCorrectPercent = bothCorrectSum / len(validate)
        neitherCorrectPer = 1 - correctRatingOnlyPercent \
                              - correctCategoryOnlyPercent \
                              - bothCorrectPercent

        score = 100 * (bothCorrectPercent + 0.5 * correctCategoryOnlyPercent +
                       0.1 * correctRatingOnlyPercent)

        print("\n"
              "Rating incorrect, business category incorrect: {:.2%}\n"
              "Rating correct, business category incorrect: {:.2%}\n"
              "Rating incorrect, business category correct: {:.2%}\n"
              "Rating correct, business category correct: {:.2%}\n"
              "\n"
              "Weighted score: {:.2f}".format(neitherCorrectPer,
                                              correctRatingOnlyPercent,
                                              correctCategoryOnlyPercent,
                                              bothCorrectPercent, score))
Example #10
train_fields = [("text", TEXT), ("label", LABEL)]
train_data = data.TabularDataset(path=r"./imdb_data.csv",
                                 format='csv',
                                 skip_header=True,
                                 fields=train_fields)
train_data_real, val_data_real = train_data.split(split_ratio=0.7)
vec = Vectors("glove.6B.100d.txt", "./Emotion")
# Build the vocabulary from the training set and attach the pretrained word vectors
TEXT.build_vocab(train_data_real, max_size=20000, vectors=vec)
LABEL.build_vocab(train_data_real)
# print(TEXT.vocab.freqs.most_common(n=10))
# print("Label distribution: ", LABEL.vocab.freqs)
# print("Vocabulary size: ", len(TEXT.vocab.itos))

# Define the data loaders (no sort_key is given here, so length-based bucketing is not applied)
train_iter = data.BucketIterator(train_data_real, batch_size=BATCH_SIZE)
val_iter = data.BucketIterator(val_data_real, batch_size=BATCH_SIZE)

INPUT_DIM = len(TEXT.vocab)  # vocabulary size
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

model = TextCNN(INPUT_DIM, EMBEDDING_DIM, N_FILITERS, FILTER_SIZES, OUTPUT_DIM,
                DROPOUT, PAD_IDX)

# Use the loaded word vectors as the initial values of embedding.weight
pretrained_embedding = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embedding)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)
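
The last few lines follow a common pattern: copy the pretrained vectors into the embedding matrix, then zero out the rows for <unk> and <pad>. A toy-sized sketch of the same pattern in isolation (all sizes and indices are placeholders):

# Toy-sized sketch of the embedding-initialization pattern used above.
import torch
import torch.nn as nn

vocab_size, emb_dim, unk_idx, pad_idx = 10, 4, 0, 1
embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
pretrained = torch.randn(vocab_size, emb_dim)   # stands in for TEXT.vocab.vectors
embedding.weight.data.copy_(pretrained)
embedding.weight.data[unk_idx] = torch.zeros(emb_dim)
embedding.weight.data[pad_idx] = torch.zeros(emb_dim)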