def get_dataset(self, dataset, n_workers=4, dataset_args={}):
        """ Load data and return Dataset objects for training and validating.

        Args:
            data_path (str): Path to the data.
        """
        self.logging.info('preprocessing data...')

        results = [None] * n_workers
        with Pool(processes=n_workers) as pool:
            for i in range(n_workers):
                batch_start = (len(dataset) // n_workers) * i
                if i == n_workers - 1:
                    batch_end = len(dataset)
                else:
                    batch_end = (len(dataset) // n_workers) * (i + 1)

                batch = dataset[batch_start: batch_end]
                results[i] = pool.apply_async(self.preprocess_samples, [batch])

            pool.close()
            pool.join()

        processed = []
        for result in results:
            processed += result.get()

        padding = self.words_dict["<PAD>"]
        sp_tag = [self.words_dict["<SOS>"], self.words_dict["<EOS>"]]
        return CorpusDataset(processed, padding=padding, sp_tag=sp_tag, **dataset_args)
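
A hedged usage sketch follows; the preprocessor instance, the samples list, and the DataLoader settings are illustrative assumptions, not part of the original example.

from torch.utils.data import DataLoader

# Hypothetical caller: preprocessor is assumed to be an instance of the class
# defining get_dataset() above, with a words_dict containing <PAD>, <SOS>, <EOS>.
train_set = preprocessor.get_dataset(samples, n_workers=4)
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
for batch in train_loader:
    pass  # feed each padded batch to the model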
Example No. 2
def main():
    random.seed(420)
    parser = argparse.ArgumentParser(
        description='Evaluate accuracy of trained model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--model',
                        default='checkpoint.pt',
                        help='model to use')
    parser.add_argument('--data',
                        default='corpus.pt',
                        help='preprocessed data file')
    parser.add_argument(
        '--device',
        default='cuda:0' if torch.cuda.is_available() else 'cpu',
        help='device to use')
    parser.add_argument('--batch', default=64, type=int, help='batch size')
    args = parser.parse_args()

    cp = CorpusPreprocessor()
    cp.load(args.data)

    net = Net(len(cp.alphabet), cp.max_sentence_length, cp.max_word_length)
    net.load_state_dict(torch.load(args.model, map_location=args.device))
    net.to(args.device)

    _, testset = CorpusDataset.split(cp, 0.8)
    testloader = DataLoader(testset, batch_size=args.batch, num_workers=4)

    accuracy = evaluate(net, args.device, testloader)
    print('Model accuracy: {}'.format(accuracy))
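
A plausible sketch of the evaluate() helper used above, assuming Net produces class logits and the test loader yields (inputs, labels) pairs; the actual implementation in the source may differ.

import torch

def evaluate(net, device, loader):
    # Count correct argmax predictions over the whole test split (no gradients needed).
    net.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            preds = net(inputs).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total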
Example No. 3
def get_dataloader(data_path: str, transform: Callable[[List, List], Tuple],
                   batch_size: int) -> DataLoader:
    """dataloader 생성

    Args:
        data_path: dataset 경로
        transform: input feature로 변환해주는 funciton
        batch_size: dataloader batch size

    Returns:
        dataloader
    """
    dataset = CorpusDataset(data_path, transform)
    print(dataset[0])  # sanity check: show the first transformed sample
    dataloader = DataLoader(dataset, batch_size=batch_size)
    return dataloader
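
A minimal calling sketch; the file path and the identity transform are placeholders, since the excerpt does not show what sample format CorpusDataset passes to the transform.

# Hypothetical usage: an identity transform matching the
# Callable[[List, List], Tuple] annotation above.
def identity_transform(first, second):
    return first, second

loader = get_dataloader("data/corpus.txt", identity_transform, batch_size=32)
for batch in loader:
    break  # inspect only the first batch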
Example No. 4
print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Datasets ###########################

if "dataset" not in config:
    config["dataset"] = None

tokenizer = RobertaTokenizerFast.from_pretrained(
    '/code/roberta-base', model_max_length=model_config["max_seq_len"])
tokenizer.model_max_length = model_config["max_seq_len"]
tokenizer.init_kwargs['model_max_length'] = model_config["max_seq_len"]
model_config["vocab_size"] = len(tokenizer.get_vocab())

if not args.use_data:
    dataset = CorpusDataset(folder_path=data_folder,
                            file_json="train.json",
                            option=config["dataset"])
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    data_loader = DataLoader(dataset,
                             batch_size=pretraining_config["batch_size"],
                             shuffle=True,
                             collate_fn=data_collator)
    pretrain_dataloader_iter = enumerate(data_loader)

########################### Loading Model ###########################

model = ModelForMaskedLM(model_config)

print(model)
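
A hedged sketch of one masked-LM pretraining step driven by pretrain_dataloader_iter; the AdamW optimizer, the learning rate, and the assumption that ModelForMaskedLM accepts input_ids/labels and returns a HuggingFace-style output with a .loss attribute are illustrative, not taken from the source.

import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)  # placeholder hyperparameters
step, batch = next(pretrain_dataloader_iter)
# The collator yields input_ids with 15% of tokens masked and labels for the masked positions.
outputs = model(input_ids=batch["input_ids"], labels=batch["labels"])  # assumed interface
outputs.loss.backward()
optimizer.step()
optimizer.zero_grad()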
Example No. 5
def get_dataloader(data_path, transform, batch_size):
    dataset = CorpusDataset(data_path, transform)
    dataloader = DataLoader(dataset, batch_size=batch_size)

    return dataloader
Example No. 6
device_ids = list(range(torch.cuda.device_count()))
print(f"GPU list: {device_ids}")

print(json.dumps([model_config, pretraining_config], indent=4))

########################### Loading Dataset ###########################

tokenizer = utils.get_tokenizer(model_config["max_seq_len"])
model_config["vocab_size"] = len(tokenizer.get_vocab())

if "dataset" not in config:
    config["dataset"] = None

dataset = CorpusDataset(folder_path=data_folder,
                        file_json="dev.json",
                        files_per_batch=128,
                        option=config["dataset"])
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)
data_loader = DataLoader(dataset,
                         batch_size=pretraining_config["batch_size"],
                         collate_fn=data_collator)
pretrain_dataloader_iter = enumerate(data_loader)

########################### Loading Model ###########################

model = ModelForMaskedLM(model_config)
print(model)

model = model.cuda()
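
device_ids is computed above but never used in this excerpt; a common follow-up is to replicate the model across all visible GPUs. The DataParallel wrapping below is an assumption, not shown in the source.

# Hypothetical multi-GPU step, using the device_ids list gathered earlier.
if len(device_ids) > 1:
    model = torch.nn.DataParallel(model, device_ids=device_ids)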