Example #1
def main(path_to_data: str,
         cache_dir: str,
         texts_col: str,
         labels_col: str,
         n_classes: int,
         batch_size: int,
         batch_size_eval: int,
         min_lr: float,
         max_lr: float,
         n_epochs: int,
         cuda: int = 0):
    '''
    Train an LSTM text classifier on a CSV dataset and cache the trained
    weights and model config in `cache_dir`.
    '''
    df = pd.read_csv(path_to_data)

    if os.path.isdir(cache_dir):
        logger.info('Cache dir found at {}'.format(cache_dir))
    else:
        logger.info('Creating cache dir {}'.format(cache_dir))
        os.mkdir(cache_dir)

    # Preprocess
    optimal_length = get_length(df, texts_col)
    X, vocab_size = encode_texts(df,
                                 texts_col,
                                 max_seq_length=optimal_length,
                                 return_vocab_size=True)

    y = get_labels(df, labels_col, n_classes)

    train_loader, test_loader = create_TorchLoaders(
        X,
        y,
        test_size=0.10,
        batch_size=batch_size,
        batch_size_eval=batch_size_eval)

    Model = LSTMModel(vocab_size=vocab_size, n_classes=n_classes)

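    # settings needed to rebuild the model later; saved as JSON at the end of training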
    config_dict = {
        "vocab_size": vocab_size,
        "n_classes": n_classes,
        "max_length": optimal_length
    }

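    # multi-class -> CrossEntropyLoss; binary -> BCEWithLogitsLoss (takes raw logits, no sigmoid needed)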
    if n_classes > 2:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        criterion = torch.nn.BCEWithLogitsLoss()

    optim = torch.optim.Adam(Model.parameters())

    ## Heuristic: half-cycle step size = 1/8 of the total training steps,
    ## i.e. roughly four full LR cycles over the whole run
    opt_cycle = int(((((len(X) * (1 - 0.10)) / batch_size) * n_epochs) * 0.25) / 2)

    schedul = torch.optim.lr_scheduler.CyclicLR(optim,
                                                min_lr,
                                                max_lr,
                                                step_size_up=opt_cycle,
                                                step_size_down=opt_cycle,
                                                mode="exp_range",
                                                cycle_momentum=False,
                                                gamma=0.999)

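    # optionally move the model to the GPU; batch tensors follow via .to(device) in the loops below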
    if cuda == 1:
        Model.cuda()
        device = "cuda"
    else:
        device = "cpu"

    metrics = {
        "training_loss": [],
        "eval_loss": [],
        "training_f1": [],
        "eval_f1": []
    }

    logger.info("Starting training for {} epochs".format(n_epochs))

    for epoch in range(n_epochs):
        Model.train()
        progress = progressbar.ProgressBar()
        for batch in progress(train_loader):
            batch = tuple(t for t in batch)

            inputs, labels = batch  #unpacking
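            # move the batch to the device: token ids as int64, targets as float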
            inputs = inputs.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)

            preds = Model(inputs)
            loss = criterion(preds, labels)

            ## Metrics computation
            metrics["training_loss"].append(loss.item())

            preds = preds.to("cpu").detach().numpy()
            preds = flat_pred(preds, 0.5)
            tmp_f1 = f1_score(labels.to("cpu").detach().numpy(),
                              preds,
                              average='macro')

            metrics["training_f1"].append(tmp_f1)

            ## Backward pass ##
            loss.backward()

            optim.step()  #Gradient descent
            schedul.step()
            Model.zero_grad()

        logger.info(
            "Epoch {} done with: training loss: {}\n training f1: {}".format(
                epoch, loss.item(), tmp_f1))

        ## Eval
        progress = progressbar.ProgressBar()
        Model.eval()
        for batch in progress(test_loader):
            with torch.no_grad():  # no gradient tracking needed during evaluation
                batch = tuple(t for t in batch)

                inputs, labels = batch
                inputs = inputs.to(device, dtype=torch.long)
                labels = labels.to(device, dtype=torch.float)

                preds = Model(inputs)
                eval_loss = criterion(preds, labels)

                ## Eval metrics
                metrics["eval_loss"].append(eval_loss.item())

                preds = preds.to("cpu").detach().numpy()
                preds = flat_pred(preds, 0.5)
                tmp_f1 = f1_score(labels.to("cpu").detach().numpy(),
                                  preds,
                                  average='macro')

                metrics["eval_f1"].append(tmp_f1)

        logger.info(
            "Evaluation at epoch {} done: eval loss: {}\n eval f1: {}".format(
                epoch, eval_loss.item(), tmp_f1))

    ## Bring back model to cpu
    Model.cpu()

    ## Get/Save param dict
    logger.info('Saving model in cache dir {}'.format(cache_dir))
    torch.save(Model.state_dict(), os.path.join(cache_dir, 'state_dict.pt'))
    with open(os.path.join(cache_dir, 'config_model.json'), 'w') as file:
        json.dump(config_dict, file)
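
A minimal reload sketch (not part of the original example), assuming the same
LSTMModel class and the cache layout written above:

with open(os.path.join(cache_dir, 'config_model.json')) as f:
    cfg = json.load(f)
model = LSTMModel(vocab_size=cfg['vocab_size'], n_classes=cfg['n_classes'])
model.load_state_dict(torch.load(os.path.join(cache_dir, 'state_dict.pt')))
model.eval()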

Example #2

def main(args):
    use_cuda = (len(args.gpuid) >= 1)
    if args.gpuid:
        cuda.set_device(args.gpuid[0])

    # Load dataset
    splits = ['train', 'valid']
    if data.has_binary_files(args.data, splits):
        dataset = data.load_dataset(args.data, splits, args.src_lang,
                                    args.trg_lang)
    else:
        dataset = data.load_raw_text_dataset(args.data, splits, args.src_lang,
                                             args.trg_lang)
    if args.src_lang is None or args.trg_lang is None:
        # record inferred languages in args, so that it's saved in checkpoints
        args.src_lang, args.trg_lang = dataset.src, dataset.dst
    print('| [{}] dictionary: {} types'.format(dataset.src,
                                               len(dataset.src_dict)))
    print('| [{}] dictionary: {} types'.format(dataset.dst,
                                               len(dataset.dst_dict)))
    for split in splits:
        print('| {} {} {} examples'.format(args.data, split,
                                           len(dataset.splits[split])))

    # Set model parameters
    args.encoder_embed_dim = 1000
    args.encoder_layers = 4
    args.encoder_dropout_out = 0
    args.decoder_embed_dim = 1000
    args.decoder_layers = 4
    args.decoder_out_embed_dim = 1000
    args.decoder_dropout_out = 0
    args.bidirectional = False
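    # (these assignments override any values parsed from the command line)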

    logging_meters = OrderedDict()
    logging_meters['train_loss'] = AverageMeter()
    logging_meters['valid_loss'] = AverageMeter()
    logging_meters['bsz'] = AverageMeter()  # sentences per batch

    # Build model
    generator = LSTMModel(args,
                          dataset.src_dict,
                          dataset.dst_dict,
                          use_cuda=use_cuda)

    if use_cuda:
        generator.cuda()
    else:
        generator.cpu()

    optimizer = getattr(torch.optim, args.optimizer)(generator.parameters(),
                                                     args.learning_rate)

    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=0, factor=args.lr_shrink)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf

    epoch_i = 1
    best_dev_loss = math.inf
    lr = optimizer.param_groups[0]['lr']
    # main training loop

    # log the running training loss to a file
    train_loss_file = open("train_loss", "a")

    while lr > args.min_lr and epoch_i <= max_epoch:
        logging.info("At {0}-th epoch.".format(epoch_i))

        seed = args.seed + epoch_i
        torch.manual_seed(seed)

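        # cap source/target lengths at what the encoder/decoder can handle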
        max_positions_train = (min(args.max_source_positions,
                                   generator.encoder.max_positions()),
                               min(args.max_target_positions,
                                   generator.decoder.max_positions()))

        # Initialize dataloader, starting at batch_offset
        itr = dataset.train_dataloader(
            'train',
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences,
            max_positions=max_positions_train,
            seed=seed,
            epoch=epoch_i,
            sample_without_replacement=args.sample_without_replacement,
            sort_by_source_size=(epoch_i <= args.curriculum),
            shard_id=args.distributed_rank,
            num_shards=args.distributed_world_size,
        )
        # set training mode
        generator.train()

        # reset meters
        for key, val in logging_meters.items():
            if val is not None:
                val.reset()

        for i, sample in enumerate(itr):

            if use_cuda:
                # wrap input tensors in cuda tensors
                sample = utils.make_variable(sample, cuda=cuda)

            loss = generator(sample)
            sample_size = sample['target'].size(
                0) if args.sentence_avg else sample['ntokens']
            nsentences = sample['target'].size(0)
            logging_loss = loss.item() / sample_size / math.log(2)
            logging_meters['bsz'].update(nsentences)
            logging_meters['train_loss'].update(logging_loss, sample_size)
            train_loss_file.write("{0}\n".format(logging_meters['train_loss'].avg))
            logging.debug(
                "loss at batch {0}: {1:.3f}, batch size: {2}, lr={3}".format(
                    i, logging_meters['train_loss'].avg,
                    round(logging_meters['bsz'].avg),
                    optimizer.param_groups[0]['lr']))
            optimizer.zero_grad()
            loss.backward()

            # rescale grads by the sample size (tokens, or sentences if args.sentence_avg)
            for p in generator.parameters():
                if p.requires_grad:
                    p.grad.data.div_(sample_size)

            torch.nn.utils.clip_grad_norm_(generator.parameters(),
                                           args.clip_norm)
            optimizer.step()

        # validation -- a crude estimate, since there may be some padding at the end
        max_positions_valid = (
            generator.encoder.max_positions(),
            generator.decoder.max_positions(),
        )

        # Initialize dataloader
        itr = dataset.eval_dataloader(
            'valid',
            max_tokens=args.max_tokens,
            max_sentences=args.max_sentences,
            max_positions=max_positions_valid,
            skip_invalid_size_inputs_valid_test=args.skip_invalid_size_inputs_valid_test,
            descending=True,  # largest batch first to warm the caching allocator
            shard_id=args.distributed_rank,
            num_shards=args.distributed_world_size,
        )
        # set validation mode
        generator.eval()

        # reset meters
        for key, val in logging_meters.items():
            if val is not None:
                val.reset()

        for i, sample in enumerate(itr):
            with torch.no_grad():
                if use_cuda:
                    # wrap input tensors in cuda tensors
                    sample = utils.make_variable(sample, cuda=cuda)
                loss = generator(sample)
                sample_size = sample['target'].size(
                    0) if args.sentence_avg else sample['ntokens']
                loss = loss.item() / sample_size / math.log(2)
                logging_meters['valid_loss'].update(loss, sample_size)
                logging.debug("dev loss at batch {0}: {1:.3f}".format(
                    i, logging_meters['valid_loss'].avg))

        # update learning rate
        lr_scheduler.step(logging_meters['valid_loss'].avg)
        lr = optimizer.param_groups[0]['lr']
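        # the while-loop condition ends training once ReduceLROnPlateau shrinks the LR below args.min_lr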

        logging.info(
            "Average loss value per instance is {0} at the end of epoch {1}".
            format(logging_meters['valid_loss'].avg, epoch_i))
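        # checkpoint after every epoch, tagged with the validation loss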
        torch.save(
            generator.state_dict(),
            args.model_file + "data.nll_{0:.3f}.epoch_{1}.pt".format(
                logging_meters['valid_loss'].avg, epoch_i))

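        # also keep a copy of the best model so far by validation loss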
        if logging_meters['valid_loss'].avg < best_dev_loss:
            best_dev_loss = logging_meters['valid_loss'].avg
            torch.save(generator.state_dict(),
                       args.model_file + "best_gmodel.pt")

        epoch_i += 1

    train_loss_file.close()
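
The second example relies on an AverageMeter helper that is not shown; a
minimal count-weighted running-average class in the usual PyTorch-example
style (an assumption, not necessarily the repository's own implementation)
would look like:

class AverageMeter(object):
    """Keep a count-weighted running average of a scalar metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count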