Example #1
def train_model(model, lr, epochs, train_loader, val_loader, patience):
    optimizer = Adagrad(model.parameters(), lr)
    criterion = nn.MSELoss()

    best_rmse = float('inf')
    rounds_no_improve = 0
    for epoch in range(epochs):
        for users, items, x, y in train_loader:
            y_pred = model(users, items, x)
            loss = criterion(y_pred.reshape(-1), y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        logging.info('Last train loss: {0:.3f}'.format(
            loss.detach().cpu().numpy().tolist()))
        with torch.no_grad():
            errors = np.array([])
            for users, items, x, y in val_loader:
                y_pred = model(users, items, x)
                group_errors = (y_pred - y).reshape(-1).cpu().numpy()
                errors = np.concatenate([errors, group_errors])
            rmse = (errors**2).mean()**0.5
            logging.info('Validation RMSE: {0:.3f}'.format(rmse))
            if rmse < best_rmse:
                best_rmse = rmse
                rounds_no_improve = 0
            else:
                rounds_no_improve += 1
            if rounds_no_improve >= patience:
                return model
    return model
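A hedged usage sketch for train_model above: the toy DotModel, the tensor sizes and the hyperparameters are all illustrative (not from the original project), and the snippet's own imports (torch, torch.nn, numpy, logging, torch.optim.Adagrad) are assumed to be in scope.

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

class DotModel(nn.Module):
    """Toy regressor matching the assumed signature model(users, items, x)."""
    def __init__(self, n_users=100, n_items=50, n_feats=4, dim=8):
        super().__init__()
        self.u = nn.Embedding(n_users, dim)
        self.i = nn.Embedding(n_items, dim)
        self.lin = nn.Linear(n_feats, 1)

    def forward(self, users, items, x):
        # dot product of user/item embeddings plus a linear term on the features
        return (self.u(users) * self.i(items)).sum(dim=1) + self.lin(x).squeeze(1)

users = torch.randint(0, 100, (512,))
items = torch.randint(0, 50, (512,))
feats = torch.randn(512, 4)
ratings = torch.rand(512) * 5
loader = DataLoader(TensorDataset(users, items, feats, ratings), batch_size=64, shuffle=True)

model = train_model(DotModel(), lr=0.05, epochs=5,
                    train_loader=loader, val_loader=loader, patience=2)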
Example #2
    def fit(self, data_loader, print_freq=1000, num_epochs=10):
        ''' fit to the data

        Parameters
        ----------
        data_loader : DataLoader
            if enumerated, it returns array-like object of shape (batch_size, length),
            where each element corresponds to word index.
        print_freq : int
            how frequent to print loss
        num_epochs : int
            the number of epochs
        '''

        def repackage_hidden(h):
            """Wraps hidden states in new Variables, to detach them from their history."""
            if type(h) == Variable:
                return Variable(h.data)
            else:
                return tuple(repackage_hidden(v) for v in h)

        if self.padding_idx is None:
            criterion = nn.CrossEntropyLoss()
        else:
            criterion = nn.CrossEntropyLoss(ignore_index=self.padding_idx)
        optimizer = Adagrad(self.parameters())
        i = 0
        running_loss = 0
        for epoch in range(num_epochs):
            for each_idx, each_batch in enumerate(data_loader):
                batch_var = Variable(each_batch, requires_grad=False)
                if self.use_gpu:
                    batch_var = batch_var.cuda()

                optimizer.zero_grad()
                pred_batch = self.forward(batch_var[:, :-1])

                # next-token objective: targets are the inputs shifted left by one
                pred_batch = pred_batch.contiguous()
                tgt = batch_var[:, 1:].contiguous()
                loss = criterion(pred_batch.view(-1, self.vocab_size),
                                 tgt.view(-1))
                loss.backward()
                optimizer.step()
                self.init_hidden()

                # print statistics
                running_loss += loss.item()
                i += 1
                if i % print_freq == print_freq-1:
                    print('epoch: {}\t total examples: {}\t loss: {}'.format(
                        epoch + 1, (i + 1) * self.batch_size, running_loss / print_freq))
                    running_loss = 0.0

        print('Finished Training')
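The docstring above only says that each enumerated batch must be an index matrix of shape (batch_size, length). A minimal sketch of such a loader, assuming the sentences have already been mapped to integer ids; PAD_IDX, the toy sequences and the lm instance are illustrative.

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

PAD_IDX = 0
sequences = [torch.tensor([5, 8, 2, 9]),          # toy, already-indexed sentences
             torch.tensor([3, 7, 1]),
             torch.tensor([4, 6, 2, 2, 8])]

def collate(batch):
    # pad to the longest sequence in the batch -> shape (batch_size, length)
    return pad_sequence(batch, batch_first=True, padding_value=PAD_IDX)

data_loader = DataLoader(sequences, batch_size=2, shuffle=True, collate_fn=collate)
# lm.fit(data_loader, print_freq=10, num_epochs=2)   # lm: an instance of the class above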
Example #3
def demo_pytorch_vae_mnist(hidden_sizes=[200, 200],
                           latent_dim=5,
                           distribution_type='bernoulli',
                           minibatch_size=20,
                           checkpoints=100,
                           n_epochs=20):

    cp = Checkpoints(checkpoints)

    model = VAEModel(
        encoder=make_mlp_encoder(visible_dim=784,
                                 hidden_sizes=hidden_sizes,
                                 latent_dim=latent_dim),
        decoder=make_mlp_decoder(latent_dim=latent_dim,
                                 hidden_sizes=hidden_sizes,
                                 visible_dim=784,
                                 dist_type=distribution_type),
        latent_dim=latent_dim,
    )
    # optimizer = Adam(params = model.parameters())
    # optimizer = RMSprop(params = model.parameters())
    # optimizer = Adamax(params = model.parameters())
    optimizer = Adagrad(params=model.parameters())
    # optimizer = SGD(lr=0.001, params = model.parameters())

    train_loader = torch.utils.data.DataLoader(datasets.MNIST(
        '../data',
        train=True,
        download=True,
        transform=transforms.Compose([transforms.ToTensor()])),
                                               batch_size=minibatch_size,
                                               shuffle=True)

    for epoch in range(n_epochs):
        for batch_idx, (x, y) in enumerate(train_loader):

            epoch_pt = epoch + batch_idx / len(train_loader)

            optimizer.zero_grad()
            loss = -model.elbo(x.flatten(1)).sum()
            loss.backward()
            optimizer.step()

            rate = measure_global_rate('training')

            if cp():

                print(f'Mean Rate at Epoch {epoch_pt:.2g}: {rate:.3g}iter/s')
                z_samples = model.prior().sample((64, ))
                x_dist = model.decode(z_samples)
                dbplot(x_dist.mean.reshape(-1, 28, 28),
                       'Sample Means',
                       title=f'Sample Means at epoch {epoch_pt:.2g}')
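make_mlp_encoder, make_mlp_decoder, VAEModel, Checkpoints, measure_global_rate and dbplot come from the surrounding project, not from torch. As a rough sketch only, under the assumption of a Gaussian latent, a factory like make_mlp_encoder might build something along these lines:

import torch.nn as nn

def sketch_mlp_encoder(visible_dim, hidden_sizes, latent_dim):
    # Plain MLP mapping pixels to concatenated (mean, log-variance) of the latent Gaussian.
    layers, in_dim = [], visible_dim
    for h in hidden_sizes:
        layers += [nn.Linear(in_dim, h), nn.ReLU()]
        in_dim = h
    layers.append(nn.Linear(in_dim, 2 * latent_dim))
    return nn.Sequential(*layers)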
Example #4
    def fit(self, seq_list: List, objective='cross_entropy',
            print_freq=1000, num_epochs=10, sgd_kwargs={}):
        ''' train LSTM using DataLoader

        Parameters
        ----------
        seq_list : list
            each element corresponds to a sequence
        objective : str
            objective function
        print_freq : int
            how frequently loss is printed
        num_epochs : int
            the number of training epochs
        sgd_kwargs : dict
            keyword arguments passed to the optimizer
        '''
        if objective == 'cross_entropy':
            criterion = nn.CrossEntropyLoss()
        elif objective == 'mse':
            criterion = nn.MSELoss()
        elif objective == 'nll': # nll stands for negative log-likelihood
            criterion = nn.NLLLoss()
        else:
            raise NotImplementedError

        optimizer = Adagrad(self.parameters(), **sgd_kwargs)
        i = 0
        running_loss = 0
        for epoch in range(num_epochs):
            for each_idx in range(0, len(seq_list), self.batch_size):
                each_seq = torch.stack(
                    seq_list[each_idx:each_idx + self.batch_size], dim=1)
                seq = Variable(each_seq, requires_grad=False)

                optimizer.zero_grad()
                pred_seq = self.forward(seq[:-1])
                loss = criterion(pred_seq, seq[1:])
                loss.backward()
                optimizer.step()
                self.init_hidden()

                # print statistics
                running_loss += loss.item()
                i += 1
                if i % print_freq == print_freq-1:
                    print('epoch: {}\t total examples: {}\t loss: {}'.format(
                        epoch + 1, i + 1, running_loss / print_freq))
                    running_loss = 0.0

        print('Finished Training')
Example #5
def train(inputs, outputs, model, l1, l2, lr=1e-5, epochs=10000):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = Adagrad(model.parameters(), lr=lr)

    log = []
    for _ in range(epochs):
        prediction = model(inputs)
        acc = tn((prediction.max(1)[1] == outputs).float().mean())
        original_loss = criterion(prediction, outputs)
        penalty = model.penalty(l1, l2)
        error = original_loss + penalty
        optimizer.zero_grad()
        error.backward()
        log.append(
            (tn(original_loss), tn(penalty), tn(get_sparsity(model)), acc))
        optimizer.step()

    return np.array(log)
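The train function above leans on project helpers that are not shown: tn (tensor to number), model.penalty(l1, l2) and get_sparsity(model). Plausible stand-ins for the first and last are sketched below; these are assumptions, not the original definitions.

import torch

def tn(t):
    # detach a scalar tensor to a plain Python number
    return t.detach().cpu().item()

def get_sparsity(model, eps=1e-6):
    # fraction of parameters that are numerically zero, returned as a tensor so tn() applies
    total = sum(p.numel() for p in model.parameters())
    zeros = sum(int((p.abs() < eps).sum()) for p in model.parameters())
    return torch.tensor(zeros / max(total, 1))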
Example #6
    def train(self, device):
        set_random_seed()
        self.loaded_data.negative_sample()
        # Compose Graph NN
        gnn_channel = GNNChannel(self.sr_ent_num, self.tg_ent_num, self.dim,
                                 self.layer_num, self.drop_out, self.channels)
        self.gnn_channel = gnn_channel
        gnn_channel.to(device)
        gnn_channel.train()

        # Prepare optimizer
        optimizer = Adagrad(filter(lambda p: p.requires_grad,
                                   gnn_channel.parameters()),
                            lr=self.learning_rate,
                            weight_decay=self.l2_regularization)
        criterion = AlignLoss(self.margin_gamma)

        best_hit_at_1 = 0
        best_epoch_num = 0

        for epoch_num in range(1, self.epoch_num + 1):
            gnn_channel.train()
            optimizer.zero_grad()
            sr_seed_hid, tg_seed_hid, _, _ = gnn_channel.forward(
                self.loaded_data.train_sr_ent_seeds,
                self.loaded_data.train_tg_ent_seeds)
            loss = criterion(sr_seed_hid, tg_seed_hid)
            loss.backward()
            optimizer.step()
            if epoch_num % self.nega_sample_freq == 0:
                if str(self.directory).find('DWY100k') >= 0:
                    self.loaded_data.negative_sample()
                else:
                    self.negative_sample()
                hit_at_1 = self.evaluate(epoch_num,
                                         gnn_channel,
                                         print_info=False,
                                         device=device)
                if hit_at_1 > best_hit_at_1:
                    best_hit_at_1 = hit_at_1
                    best_epoch_num = epoch_num
        print('Model best Hit@1 on valid set is %.2f at %d epoch.' %
              (best_hit_at_1, best_epoch_num))
        return best_hit_at_1, best_epoch_num
Example #7
def train(inputs, outputs, model, l1, l2, lr=1e-5, epochs=10000):
    criterion = torch.nn.MSELoss()
    optimizer = Adagrad(model.parameters(), lr=lr)

    # Everyone starts the same way
    for p in model.parameters():
        # p.data.fill_(1.0)
        pass

    log = []
    for _ in range(epochs):
        original_loss = criterion(model(inputs), outputs)
        penalty = model.penalty(l1, l2)
        error = original_loss + penalty
        optimizer.zero_grad()
        error.backward()
        log.append((tn(original_loss), tn(penalty), tn(get_sparsity(model))))
        optimizer.step()

    return np.array(log)
Example #8
def run(dim, ds, epochs, attempts, lrs, reg_coef):
    losses = pd.DataFrame(columns=['lr', 'epoch', 'attempt', 'loss'])
    total_epochs = len(lrs) * len(attempts) * len(epochs)
    with tqdm(total=total_epochs, desc='lr = NA, attempt = NA, epoch = NA, loss = NA', unit='epochs',
              ncols=140) as pbar:
        for lr in lrs:
            for attempt in attempts:
                x = torch.empty(dim, requires_grad=True, dtype=torch.double)
                torch.nn.init.normal_(x)
                opt = Adagrad([x], lr=lr)

                for epoch in epochs:
                    train_loss = 0
                    for X, y in DataLoader(ds, shuffle=True, batch_size=1):
                        opt.zero_grad()

                        if y.item() == 0:
                            score = -torch.dot(X[0, :], x)
                        else:
                            score = torch.dot(X[0, :], x)

                        loss = torch.log1p(torch.exp(score)) + (reg_coef / 2) * torch.dot(x, x)
                        loss.backward()

                        train_loss += loss.item()
                        opt.step()

                    train_loss /= len(ds)
                    losses = pd.concat([losses, pd.DataFrame.from_dict(
                        {'loss': [train_loss],
                         'epoch': [epoch],
                         'lr': [lr],
                         'attempt': [attempt]})], ignore_index=True, sort=True)
                    pbar.update()
                    pbar.set_description(desc=f'lr = {lr}, attempt = {attempt}, epoch = {epoch}, loss = {train_loss}')
    return losses
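A hedged driver for run above, on a small synthetic logistic-regression problem; the grids and sizes are illustrative, and run's own imports (pandas as pd, tqdm, Adagrad, DataLoader) are assumed to be in scope.

import torch
from torch.utils.data import TensorDataset

dim = 5
X = torch.randn(200, dim, dtype=torch.double)
w_true = torch.randn(dim, dtype=torch.double)
y = (X @ w_true > 0).long()   # synthetic binary labels

losses = run(dim=dim, ds=TensorDataset(X, y), epochs=range(3),
             attempts=range(2), lrs=[0.1, 0.5], reg_coef=1e-3)
print(losses.groupby('lr')['loss'].min())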
Example #9
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--build_pre_train",
                        action='store_true',
                        help="Whether to build Pre-Train data.")
    parser.add_argument("--train_path",
                        type=str,
                        default="../data/tacred_train.json",
                        help="Path to the training data.")
    parser.add_argument("--dev_path",
                        type=str,
                        default="../data/tacred_dev.json",
                        help="Path to the dev data.")
    parser.add_argument("--test_path",
                        type=str,
                        default="../data/tacred_test.json",
                        help="Path to the test data.")
    parser.add_argument("--explanation_data_path",
                        type=str,
                        default="../data/tacred_explanations.json",
                        help="Path to explanation data.")
    parser.add_argument("--train_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for train.")
    parser.add_argument("--eval_batch_size",
                        default=128,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=0.001,
                        type=float,
                        help="The initial learning rate for the optimizer.")
    parser.add_argument(
        "--epochs",
        default=25,  # will train for 24, stopping criterion of 0.9 F1
        type=int,
        help="Number of Epochs for training")
    parser.add_argument('--embeddings',
                        type=str,
                        default="glove.840B.300d",
                        help="initial embeddings to use")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gamma',
                        type=float,
                        default=0.5,
                        help="weight of sim_loss")
    parser.add_argument('--emb_dim',
                        type=int,
                        default=300,
                        help="embedding vector size")
    parser.add_argument(
        '--hidden_dim',
        type=int,
        default=300,
        help="hidden vector size of lstm (really 2*hidden_dim, due to bilstm)")
    parser.add_argument('--model_save_dir',
                        type=str,
                        default="",
                        help="where to save the model")
    parser.add_argument('--experiment_name',
                        type=str,
                        default="official",
                        help="what to save the model file as")
    parser.add_argument('--load_model',
                        action='store_true',
                        help="Whether to load a model")
    parser.add_argument('--start_epoch',
                        type=int,
                        default=0,
                        help="start_epoch")
    parser.add_argument('--use_adagrad',
                        action='store_true',
                        help="use adagrad optimizer")

    args = parser.parse_args()

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    sample_rate = 0.6
    lower_bound = -20.0
    dataset = "tacred"

    if args.build_pre_train:
        build_pre_train_find_datasets_from_splits(
            args.train_path,
            args.dev_path,
            args.test_path,
            args.explanation_data_path,
            embedding_name=args.embeddings,
            sample_rate=sample_rate,
            dataset=dataset)

    save_string = generate_save_string(dataset,
                                       args.embeddings,
                                       sample=sample_rate)

    with open("../data/pre_train_data/train_data_{}.p".format(save_string),
              "rb") as f:
        train_dataset = pickle.load(f)

    primary_eval_path = "../data/pre_train_data/rq_data_{}.p".format(
        save_string)

    # optional secondary eval, can set this to the empty string
    secondary_eval_path = "../data/pre_train_data/dev_data_{}.p".format(
        save_string)

    with open("../data/vocabs/vocab_{}.p".format(save_string), "rb") as f:
        vocab = pickle.load(f)

    with open("../data/pre_train_data/sim_data_{}.p".format(save_string),
              "rb") as f:
        sim_data = pickle.load(f)

    pad_idx = vocab["<pad>"]

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    custom_vocab = build_custom_vocab(dataset, len(vocab))
    custom_vocab_length = len(custom_vocab)

    model = Find_Module.Find_Module(emb_weight=vocab.vectors,
                                    padding_idx=pad_idx,
                                    emb_dim=args.emb_dim,
                                    hidden_dim=args.hidden_dim,
                                    cuda=torch.cuda.is_available(),
                                    custom_token_count=custom_vocab_length)
    del vocab

    # prepping variables for storing training progress
    epochs = args.epochs
    epoch_string = str(epochs)
    epoch_losses = []
    dev_2_epoch_losses = []
    best_f1_score = -1
    best_dev_2_f1_score = -1
    best_dev_loss = float('inf')

    if args.load_model:
        model.load_state_dict(
            torch.load("../data/saved_models/Find-Module-pt_{}.p".format(
                args.experiment_name)))
        print("loaded model")

        with open("../data/result_data/loss_per_epoch_Find-Module-pt_{}.csv".
                  format(args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                epoch_losses.append(row)
                if float(row[-1]) > best_f1_score:
                    best_f1_score = float(row[-1])
                if float(row[3]) < best_dev_loss:
                    best_dev_loss = float(row[3])

        with open(
                "../data/result_data/dev_2_loss_per_epoch_Find-Module-pt_{}.csv"
                .format(args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                dev_2_epoch_losses.append(row)
                if float(row[-1]) > best_dev_2_f1_score:
                    best_dev_2_f1_score = float(row[-1])

        print("loaded past results")

    model = model.to(device)

    # Get L_sim Data ready
    real_query_tokens, _ = BaseVariableLengthDataset.variable_length_batch_as_tensors(
        sim_data["queries"], pad_idx)
    real_query_tokens = real_query_tokens.to(device)
    query_labels = sim_data["labels"]

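    # One row per query: mark which queries share its relation label (query_index_matrix)
    # and which do not (neg_query_index_matrix, with the diagonal forced to 1).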
    queries_by_label = {}
    for i, label in enumerate(query_labels):
        if label in queries_by_label:
            queries_by_label[label][i] = 1
        else:
            queries_by_label[label] = [0] * len(query_labels)
            queries_by_label[label][i] = 1

    query_index_matrix = []
    for i, label in enumerate(query_labels):
        query_index_matrix.append(queries_by_label[label][:])

    query_index_matrix = torch.tensor(query_index_matrix)
    neg_query_index_matrix = 1 - query_index_matrix
    for i, row in enumerate(neg_query_index_matrix):
        neg_query_index_matrix[i][i] = 1

    query_index_matrix = query_index_matrix.to(device)
    neg_query_index_matrix = neg_query_index_matrix.to(device)

    # define the optimizer
    if args.use_adagrad:
        optimizer = Adagrad(model.parameters(), lr=args.learning_rate)
    else:
        optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    # define loss functions
    find_loss_function = nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor([20.0]).to(device))
    sim_loss_function = similarity_loss_function

    for epoch in range(args.start_epoch, args.start_epoch + epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1,
                                          args.start_epoch + epochs))

        total_loss, find_total_loss, sim_total_loss = 0, 0, 0
        batch_count = 0
        model.train()
        # iterate over batches
        for step, batch in enumerate(
                tqdm(
                    train_dataset.as_batches(batch_size=args.train_batch_size,
                                             seed=epoch))):
            # push the batch to gpu
            batch = [r.to(device) for r in batch]

            tokens, queries, labels = batch

            # clear previously calculated gradients
            model.zero_grad()

            # get model predictions for the current batch
            token_scores = model.find_forward(tokens, queries, lower_bound)
            pos_scores, neg_scores = model.sim_forward(real_query_tokens,
                                                       query_index_matrix,
                                                       neg_query_index_matrix)

            # compute the loss between actual and predicted values
            find_loss = find_loss_function(token_scores, labels)
            sim_loss = sim_loss_function(pos_scores, neg_scores)
            string_loss = find_loss + args.gamma * sim_loss

            # add on to the total loss
            find_total_loss = find_total_loss + find_loss.item()
            sim_total_loss = sim_total_loss + sim_loss.item()
            total_loss = total_loss + string_loss.item()
            batch_count += 1

            if batch_count % 100 == 0 and batch_count > 0:
                print(
                    (find_total_loss, sim_total_loss, total_loss, batch_count))

            # backward pass to calculate the gradients
            string_loss.backward()

            # clip the gradients to 1.0 to help prevent the exploding-gradient problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update parameters
            optimizer.step()

        # compute the training loss of the epoch
        train_avg_loss = total_loss / batch_count
        train_avg_find_loss = find_total_loss / batch_count
        train_avg_sim_loss = sim_total_loss / batch_count

        print("Starting Primary Evaluation")
        eval_results = evaluate_find_module(
            primary_eval_path, real_query_tokens, query_index_matrix,
            neg_query_index_matrix, lower_bound, model, find_loss_function,
            sim_loss_function, args.eval_batch_size, args.gamma)
        dev_avg_loss, dev_avg_find_loss, dev_avg_sim_loss, dev_f1_score, total_og_scores, total_new_scores = eval_results
        print("Finished Primary Evaluation")

        if dev_f1_score > best_f1_score or (dev_f1_score == best_f1_score
                                            and dev_avg_loss < best_dev_loss):
            print("Saving Model")
            if len(args.model_save_dir) > 0:
                dir_name = args.model_save_dir
            else:
                dir_name = "../data/saved_models/"
            torch.save(
                model.state_dict(),
                "{}Find-Module-pt_{}.p".format(dir_name, args.experiment_name))
            with open(
                    "../data/result_data/best_dev_total_og_scores_{}.p".format(
                        args.experiment_name), "wb") as f:
                pickle.dump(total_og_scores, f)
            with open(
                    "../data/result_data/best_dev_total_new_scores_{}.p".
                    format(args.experiment_name), "wb") as f:
                pickle.dump(total_new_scores, f)
            best_f1_score = dev_f1_score
            best_dev_loss = dev_avg_loss

        epoch_losses.append(
            (train_avg_loss, train_avg_find_loss, train_avg_sim_loss,
             dev_avg_loss, dev_avg_find_loss, dev_avg_sim_loss, dev_f1_score))
        print("Best Primary F1: {}".format(str(best_f1_score)))
        print(epoch_losses[-3:])

        if len(secondary_eval_path) > 0:
            print("Starting Secondary Evaluation")
            eval_results = evaluate_find_module(
                secondary_eval_path, real_query_tokens, query_index_matrix,
                neg_query_index_matrix, lower_bound, model, find_loss_function,
                sim_loss_function, args.eval_batch_size, args.gamma)
            dev_2_avg_loss, dev_2_avg_find_loss, dev_2_avg_sim_loss, dev_2_f1_score, total_og_scores, total_new_scores = eval_results
            print("Finished Secondary Evaluation")

            if dev_2_f1_score > best_dev_2_f1_score:
                best_dev_2_f1_score = dev_2_f1_score
                with open(
                        "../data/result_data/best_dev_2_total_og_scores_{}.p".
                        format(args.experiment_name), "wb") as f:
                    pickle.dump(total_og_scores, f)
                with open(
                        "../data/result_data/best_dev_2_total_new_scores_{}.p".
                        format(args.experiment_name), "wb") as f:
                    pickle.dump(total_new_scores, f)

            dev_2_epoch_losses.append((dev_2_avg_loss, dev_2_avg_find_loss,
                                       dev_2_avg_sim_loss, dev_2_f1_score))
            print("Best Secondary F1: {}".format(str(best_dev_2_f1_score)))
            print(dev_2_epoch_losses[-3:])

        if best_f1_score > 0.9:
            break

    with open(
            "../data/result_data/loss_per_epoch_Find-Module-pt_{}.csv".format(
                args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow([
            'train_loss', 'train_find_loss', 'train_sim_loss', 'dev_loss',
            'dev_find_loss', 'dev_sim_loss', 'dev_f1_score'
        ])
        for row in epoch_losses:
            writer.writerow(row)

    if len(secondary_eval_path) > 0:
        with open(
                "../data/result_data/dev_2_loss_per_epoch_Find-Module-pt_{}.csv"
                .format(args.experiment_name), "w") as f:
            writer = csv.writer(f)
            writer.writerow([
                "dev_2_avg_loss", "dev_2_avg_find_loss", "dev_2_avg_sim_loss",
                "dev_2_f1_score"
            ])
            for row in dev_2_epoch_losses:
                writer.writerow(row)
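The L_sim index-matrix bookkeeping in the script above is easy to sanity-check in isolation. Below is a standalone rerun of the same logic on three made-up relation labels.

import torch

query_labels = ['per:title', 'org:founded', 'per:title']   # illustrative labels
queries_by_label = {}
for i, label in enumerate(query_labels):
    if label not in queries_by_label:
        queries_by_label[label] = [0] * len(query_labels)
    queries_by_label[label][i] = 1

query_index_matrix = torch.tensor([queries_by_label[l][:] for l in query_labels])
neg_query_index_matrix = 1 - query_index_matrix
for i in range(len(query_labels)):
    neg_query_index_matrix[i][i] = 1

print(query_index_matrix)      # [[1,0,1],[0,1,0],[1,0,1]]  queries sharing a label
print(neg_query_index_matrix)  # [[1,1,0],[1,1,1],[0,1,1]]  different-label queries plus self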
Example #10
class VAE(BaseEstimator, TransformerMixin):
    """
    :param decoder: Type of decoder network to use. Possible values are
                    'bernoulli' and 'gaussian'.
    """
    def __init__(self,
                 nohiddens: int = 400,
                 nolatents: int = 20,
                 nosamples: int = 1,
                 noepochs: int = 15,
                 batch_size: int = 100,
                 show_every: int = 100,
                 decoder: str = 'bernoulli',
                 outdir: str = 'output/'):

        super().__init__()

        makedirs(outdir, exist_ok=True)

        self.noinputs = None
        self.nohiddens = nohiddens
        self.nolatents = nolatents
        self.nosamples = nosamples
        self.noepochs = noepochs
        self.batch_size = batch_size
        self.show_every = show_every
        self.outdir = outdir

        if decoder == 'bernoulli':
            self.fit = self.__fit_bernoulli
        elif decoder == 'gaussian':
            self.fit = self.__fit_gaussian
        else:
            raise ValueError(f'Unknown decoder type: "{decoder}".')

        self.logger = getLogger(__name__)
        self.model = None
        self.opt = None

    def fit(self, X):
        """Method fit is overloaded during construction of VAE estimator. See
        constructor for details.
        """

    def transform(self, X: Tensor) -> LatentVariable:
        latvar = LatentVariable(self.model, *self.model.encode(X))
        return latvar

    def inverse_transform(self, X: Tensor) -> Tensor:
        origin = self.model.decode(X)

        if isinstance(origin, tuple):
            return origin[0]
        else:
            return origin

    def __fit(self, dataset: Tensor, model: VAEBase):
        it = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        dur = self.noepochs * ceil(len(dataset) / self.batch_size)
        history = History(zeros(dur), zeros(dur), zeros(dur), zeros(dur),
                          zeros(dur))

        hooks = CombinedHook()
        hooks.add(LossHook)
        hooks.add(RekonstruktHook, dataset[:10, :])
        hooks.add(LatentSamplerHook, self.nolatents)
        hooks.prehook(self, history)

        self.model = model
        self.noinputs = model.noinputs
        self.opt = Adagrad(self.model.parameters(), lr=0.01)  # See Section 5.

        for epoch in range(self.noepochs):
            for i, x in enumerate(it):
                self.opt.zero_grad()

                # Apply model in the following steps:
                # (a) encode datapoint into latent space;
                # (b) sample points from latent space;
                # (c) decode sampled points from latent space.
                mu, logsigma2 = self.model.encode(x)
                z = self.model.sample(mu, logsigma2)
                X = self.model.decode(z)

                # Estimate KL-divergence and reconstruction error (RE).
                kl = self.model.kl(mu, logsigma2)
                re = self.model.re(x, X)

                # Do error backpropagation.
                loss = kl + re
                loss.backward()
                self.opt.step()

                # Aggregate runtime statistics.
                history.append(epoch=epoch,
                               batch=i,
                               kl=float(kl / self.batch_size),
                               re=float(re / self.batch_size))

                if i % self.show_every == 0:
                    hooks.hook(self, history)

        # Print status before exit.
        hooks.posthook(self, history)

        # Return itself for calls chaining.
        return self

    def __fit_bernoulli(self, dataset: Tensor):
        params = self.get_params()
        params['noinputs'] = dataset.shape[1]
        model = VAEBernoulliDecoder(**params)
        return self.__fit(dataset, model)

    def __fit_gaussian(self, dataset: Tensor):
        params = self.get_params()
        params['noinputs'] = dataset.shape[1]
        model = VAEGaussianDecoder(**params)
        return self.__fit(dataset, model)
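A hedged usage sketch for the estimator above, assuming the project's hook and decoder classes are importable and that the input is a 2-D float tensor of flattened binary images; the sizes below are illustrative.

import torch

dataset = (torch.rand(1000, 784) > 0.5).float()   # toy binarised "images"

vae = VAE(nohiddens=256, nolatents=10, noepochs=2, decoder='bernoulli')
vae.fit(dataset)                        # dispatches to __fit_bernoulli -> __fit
latents = vae.transform(dataset[:16])   # LatentVariable over the encoder output
recon = vae.inverse_transform(vae.model.sample(*vae.model.encode(dataset[:16])))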
Example #11
class trainW2V:

    """ To train a word2vec model on a text obtained from pubmed scraping. """

    def __init__(self, text, windowSize=5, negWords=15, embedDim=200, vocabSize=None, 
                 nOccur=10, phMinCount=5, phThresh=10, phDepth=2,
                 wInit='scaled-uniform', epochs=50, batchSize= 1024, 
                 optimizer='SGD', lr=0.01, patience=5, epsilon=1e-5, raw=False, 
                 tShuff=False, saveFreq=-1, restoreBest=True, outPath='./'):

        """ Args:
                text (nested list): input text as list of sentences.
                windowSize (int): size of the context window.
                negWords (int): number of negative words used in training.
                embedDim (int): dimensionality of the embedded space (default 200).
                vocabSize (int): size of the vocabulary (default None).
                nOccur (int): minimum number of occurrences to keep a word in the dictionary,
                          can be overridden by vocabSize (default 10).
                phMinCount (int): minimum number of occurrences to keep a phrase (default 5).
                phThresh (float): minimum score to keep a phrase (default 10).
                phDepth (int): number of recursions during phrase search (1 = bi-grams, default 2).
                wInit (string): distribution from which to draw initial node weights (only 'scaled-uniform'
                        and 'xavier' are currently available, default 'scaled-uniform').
                epochs (int): number of epochs  (default 50).
                batchSize (int): size of batches (default 1024).
                optimizer (str): optimizer choice, 'SGD' and 'Adagrad' only
                        (default 'SGD').
                lr (float): learning rate (default .01).
                patience (int): early stop patience (default 5).
                epsilon (float): early stop epsilon (default 1e-5).
                raw (bool): if True, clean the input text (default False).
                tShuff (bool): shuffle training set at each epoch (default false).
                saveFreq (int): frequency of model checkpoints, if < 0 don't save checkpoints (default -1).
                restoreBest (bool): restore and save best model by early stopping.
                outPath (string): path to directory where to save the trained models.
            """

        """ Set up training dataset and batches. """

        self.trainDs = textDataset(text, windowSize, negWords, vocabSize=vocabSize, nOccur=nOccur,
                                    phMinCount=phMinCount, phThresh=phThresh, phDepth=phDepth,  raw=raw)
        self.trainBatch = DataLoader(self.trainDs, batch_size = batchSize, shuffle = tShuff)
        
        """ Set up model """

        self.model = skipGram(int(self.trainDs.wDict.shape[0]), embedDim, wInit)

        """ Send model to GPU if available. """

        if torch.cuda.is_available():
            self.model.cuda()

        self.epochs = epochs
        

        if optimizer == 'SGD':
             # no momentum allowed with sparse matrices :(
            self.optimizer = SGD(self.model.parameters(), lr=lr)

        elif optimizer == 'Adagrad':
            self.optimizer = Adagrad(self.model.parameters(), lr=lr)

        else:
            print ('ERROR: '+optimizer+' is not available, please select SGD or Adagrad.')
            sys.exit(1)


        self.losses = []

        """ Set up early stopping. """

        self.earlStop = EarlyStopping(patience=patience, epsilon=epsilon, keepBest=True)
        self.restoreBest = restoreBest

        self.saveFreq = saveFreq
        if self.saveFreq < 0:
            self.saveFreq = self.epochs + 1 


        self.outPath = outPath
        if not os.path.exists(self.outPath):
            os.makedirs(self.outPath)


    def train(self):

        """ Run the training of the model. """    
            
        for epoch in tqdm(range(self.epochs), desc='Epoch'):
      
            pBarB = tqdm(enumerate(self.trainBatch), total=len(self.trainBatch),  desc='Batch')
            for batchNum, batch in pBarB:
        
                wordBatch = batch[0]
                contBatch = batch[1]
                negaBatch = batch[2]

                """ Move batches to GPU if available. """

                if torch.cuda.is_available():
                    wordBatch = wordBatch.cuda()
                    contBatch = contBatch.cuda()
                    negaBatch = negaBatch.cuda()

                """ Core of training. """

                self.optimizer.zero_grad()
                loss = self.model(wordBatch, contBatch, negaBatch)
                loss.backward()
                self.optimizer.step()    


                pBarB.set_postfix({'loss' : '{:.5f}'.format(loss.item())})
        
            """ Store loss. """

            self.losses.append(loss.item())

            """ Save checkpoint model every n-th epoch. """ 
            
            if epoch > 0 and epoch%self.saveFreq == 0:

                self.saveModel(name='_{:d}_{:.5f}'.format(epoch, loss.item()))

            """ Early stop check. """

            self.earlStop(loss, self.model)

            if self.earlStop.earlyStop:

                print('Limit loss improvement reached, stopping the training.')

                break

        """ Restore and save best model. """

        if self.restoreBest:

            self.model = self.earlStop.bestModel        


    def saveModel(self, name):

        """ Saves any model and its dictionary. 

        Args:
            name (string): file name.
        """

        torch.save({'model_state_dict': self.model.state_dict(), 
                    'word_to_ix': self.trainDs.wDict['word'].to_dict()
                    },                  
                    os.path.join(self.outPath, 'model_'+name+'.pt'))


    def getEmbedded(self):

        """ Returns the embedding layer weights, equivalent to the word vectors in 
            the embedded space.

        Returns:
            (numpy array): the embedding layer weights.
        """

        return self.model.getEmbedded()
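A hedged usage sketch for trainW2V; the corpus is a toy stand-in for scraped PubMed sentences and only exercises the API (textDataset, skipGram and EarlyStopping come from the same project).

corpus = [['the', 'protein', 'binds', 'the', 'receptor'],
          ['the', 'receptor', 'activates', 'the', 'pathway']] * 200

w2v = trainW2V(corpus, windowSize=3, negWords=5, embedDim=50,
               epochs=5, batchSize=64, optimizer='Adagrad', lr=0.05,
               outPath='./w2v_out/')
w2v.train()
vectors = w2v.getEmbedded()   # numpy array of word vectors
w2v.saveModel('final')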
Example #12
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)
        s_t_1_origin = s_t_1

        batch_size = batch.batch_size
        step_losses = []

        sample_idx = []
        sample_log_probs = Variable(torch.zeros(batch_size))
        baseline_idx = []

        for di in range(min(max_dec_len, config.max_dec_steps)):

            y_t_1 = dec_batch[:, di]  # Teacher forcing, shape [batch_size]
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, s_t_1,
                                                                                           encoder_outputs,
                                                                                           encoder_feature,
                                                                                           enc_padding_mask, c_t_1,
                                                                                           extra_zeros,
                                                                                           enc_batch_extend_vocab,
                                                                                           coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

            # sample
            if di == 0:  # use decoder input[0], which is <BOS>
                sample_t_1 = dec_batch[:, di]
                s_t_sample = s_t_1_origin
                c_t_sample = Variable(torch.zeros((batch_size, 2 * config.hidden_dim)))

            final_dist, s_t_sample, c_t_sample, attn_dist, p_gen, next_coverage = self.model.decoder(sample_t_1,
                                                                                                     s_t_sample,
                                                                                                     encoder_outputs,
                                                                                                     encoder_feature,
                                                                                                     enc_padding_mask,
                                                                                                     c_t_sample,
                                                                                                     extra_zeros,
                                                                                                     enc_batch_extend_vocab,
                                                                                                     coverage, di)
            # according to final_dist to sample
            # change sample_t_1
            dist = torch.distributions.Categorical(final_dist)
            sample_t_1 = Variable(dist.sample())
            # record sample idx
            sample_idx.append(sample_t_1)  # tensor list
            # compute sample probability
            sample_log_probs += torch.log(
                final_dist.gather(1, sample_t_1.view(-1, 1)).squeeze(1))  # gather value along axis=1, given index

            # baseline
            if di == 0:  # use decoder input[0], which is <BOS>
                baseline_t_1 = dec_batch[:, di]
                s_t_baseline = s_t_1_origin
                c_t_baseline = Variable(torch.zeros((batch_size, 2 * config.hidden_dim)))

            final_dist, s_t_baseline, c_t_baseline, attn_dist, p_gen, next_coverage = self.model.decoder(baseline_t_1,
                                                                                                         s_t_baseline,
                                                                                                         encoder_outputs,
                                                                                                         encoder_feature,
                                                                                                         enc_padding_mask,
                                                                                                         c_t_baseline,
                                                                                                         extra_zeros,
                                                                                                         enc_batch_extend_vocab,
                                                                                                         coverage, di)
            # according to final_dist to get baseline
            # change baseline_t_1
            baseline_t_1 = final_dist.max(1)[1].detach()  # greedy baseline: argmax token along axis=1
            # record baseline probability
            baseline_idx.append(baseline_t_1)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        # according to sample_idx and baseline_idx to compute RL loss
        # map sample/baseline_idx to string
        # compute rouge score
        # compute loss
        sample_idx = torch.stack(sample_idx, dim=1).squeeze()  # expect shape (batch_size, seq_len)
        baseline_idx = torch.stack(baseline_idx, dim=1).squeeze()
        rl_loss = torch.zeros(batch_size)
        for i in range(sample_idx.shape[0]):  # each example in a batch
            sample_y = data.outputids2words(sample_idx[i], self.vocab,
                                            (batch.art_oovs[i] if config.pointer_gen else None))
            baseline_y = data.outputids2words(baseline_idx[i], self.vocab,
                                              (batch.art_oovs[i] if config.pointer_gen else None))
            true_y = batch.original_abstracts[i]

            sample_score = rouge_l_f(sample_y, true_y)
            baseline_score = rouge_l_f(baseline_y, true_y)

            sample_score = Variable(sample_score)
            baseline_score = Variable(baseline_score)

            rl_loss[i] = baseline_score - sample_score
        rl_loss = (rl_loss * sample_log_probs).mean()

        gamma = 0.9984
        loss = (1 - gamma) * loss + gamma * rl_loss

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' % (iter, print_interval,
                                                                           time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
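The reinforcement-learning half of train_one_batch is the self-critical baseline trick: the sampled summary is rewarded relative to the greedy one, and only the sample log-probabilities carry gradient. Stripped of the decoder plumbing, the loss behaves roughly as in this toy sketch (the numbers are made up).

import torch

sample_log_probs = torch.tensor([-12.3, -9.8, -15.1], requires_grad=True)
sample_score = torch.tensor([0.31, 0.45, 0.22])     # ROUGE-L F of sampled summaries
baseline_score = torch.tensor([0.35, 0.40, 0.25])   # ROUGE-L F of greedy summaries

rl_loss = ((baseline_score - sample_score) * sample_log_probs).mean()
rl_loss.backward()   # pushes probability toward samples that beat the baseline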
Example #13
class Train(object):
    def __init__(self, model_file_path=None):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        if not model_file_path:
            train_dir = os.path.join(config.log_root,
                                     'train_%d' % (int(time.time())))
            if not os.path.exists(train_dir):
                os.mkdir(train_dir)
        else:
            train_dir = re.sub('/model/model.*', '', model_file_path)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.create_file_writer(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)
        # self.optimizer = Adam(params)
        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def f(self, x, alpha):
        # # 1 - x ** alpha
        # k = utils.EPOCH / (utils.MAX_EPOCH / 2) - 1
        # return k * x + (1 - k)/2
        return 1 - x**alpha

    def get_loss_mask(self, src, tgt, absts, alpha=config.alpha):
        loss_mask = []
        for i in range(len(src)):

            # debug('src[i]',src[i])
            # debug('tgt[i]',src[i])
            # cnt = 0
            # tgt_i = [t for t in tgt[i] if t != 1]
            # src_i = set([s for s in src[i] if s != 1])
            # debug('src_i',src_i)
            # m = [t for t in tgt_i if t not in src_i ]
            # # for token in tgt_i:
            # #     if token not in src_i:
            # #         cnt += 1
            # cnt = len(m)
            # abst = round(cnt / len(tgt_i),4)
            abst = absts[i]
            loss_factor = self.f(abst, alpha)
            loss_mask.append(loss_factor)
        return torch.Tensor(loss_mask).cuda()

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        # debug(batch.original_articles[0])
        # debug(batch.original_abstracts[0])
        loss_mask = self.get_loss_mask(enc_batch, dec_batch, batch.absts)
        # debug('loss_mask',loss_mask)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage, tau = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)

            # debug('enc_batch',enc_batch.size())
            # debug('dec_batch',dec_batch.size())
            # debug('final_dist', final_dist.size())
            # debug('target',target)
            # debug('gold_probs',gold_probs)

            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            # debug('step_loss_before',step_loss)
            # debug('config.loss_mask',config.loss_mask)
            if config.loss_mask:
                step_loss = step_loss * loss_mask
                # pass
            # debug('step_loss_after',step_loss)
            step_losses.append(step_loss)

            if config.DEBUG:
                # break
                pass

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        if not config.DEBUG:
            loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item(), tau

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()

        start_iter = iter
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss, tau = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if config.DEBUG:
                debug('iter', iter)
                if iter - start_iter > config.BREAK_POINT:
                    break

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                if config.adaptive_sparsemax:
                    print('tau + eps', [
                        round(e[0], 4)
                        for e in (tau +
                                  config.eps).detach().cpu().numpy().tolist()
                    ])
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
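The loss mask above is just f(abst, alpha) = 1 - abst ** alpha, so more abstractive targets get their token losses scaled down. A quick numeric check with an illustrative alpha:

alpha = 2.0
for abst in (0.0, 0.25, 0.5, 0.75, 1.0):
    print(abst, 1 - abst ** alpha)
# 0.0 -> 1.0   0.25 -> 0.9375   0.5 -> 0.75   0.75 -> 0.4375   1.0 -> 0.0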
Example #14
class Train:
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, moving_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': moving_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.do_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            # At some point in training we switch to the coverage objective, which needs a fresh optimizer state; this branch controls when that switch happens.
            if not config.do_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, context_v, coverage = \
            get_encoder_variables(batch, use_cuda)
        # dec_lens_var: length of the decoder target sequence for each example in the batch
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_decoder_variables(batch, use_cuda)

        self.optimizer.zero_grad()

        if 0 in enc_lens:
            print('=================')
            print(enc_batch.shape)
            print(enc_lens)
            print(enc_batch)
            print('=================')
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        d_hc = self.model.reduce_state(encoder_hidden)  # initial (h, c) for the decoder

        step_losses = []
        # for step in tqdm.tqdm(range(min(max_dec_len, config.max_dec_steps))):
        for step in range(min(max_dec_len, config.max_dec_steps)):
            d_inp = dec_batch[:, step]  # Teacher forcing
            final_dist, d_hc, context_v, attn_dist, p_gen, next_coverage = self.model.decoder(
                d_inp, d_hc, encoder_outputs, encoder_feature,
                enc_padding_mask, context_v, extra_zeros,
                enc_batch_extend_vocab, coverage, step)
            target = target_batch[:, step]
            # gather the predicted probability of the target id at this step
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.do_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)  # overlap with the accumulated encoder attention is penalized, see the original paper
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, step]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, moving_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        pbar = tqdm.tqdm(total=n_iters)
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            moving_avg_loss = calc_moving_avg_loss(loss, moving_avg_loss,
                                                   self.summary_writer, iter)
            iter += 1
            pbar.update(1)

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 100
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(moving_avg_loss, iter)
        pbar.close()
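
setup_train above rebuilds the Adagrad state when switching to the coverage objective and otherwise restores it from the checkpoint and moves it onto the GPU. The stand-alone sketch below mirrors just that restore-and-move step; the tiny nn.Linear stand-in and the checkpoint path are placeholders, not part of the original code.

import torch
import torch.nn as nn
from torch.optim import Adagrad

model = nn.Linear(4, 2)                         # placeholder for the encoder/decoder/reduce_state params
optimizer = Adagrad(model.parameters(), lr=0.15, initial_accumulator_value=0.1)

# save in the same layout as save_model() above (model weights omitted for brevity)
torch.save({'iter': 0, 'optimizer': optimizer.state_dict(), 'current_loss': 0.0},
           'ckpt_sketch.pt')                    # hypothetical path

state = torch.load('ckpt_sketch.pt',
                   map_location=lambda storage, location: storage)
optimizer.load_state_dict(state['optimizer'])

# Adagrad keeps a per-parameter accumulator ('sum'); after a CPU load it has to
# live on the same device as the parameters before optimizer.step() is called.
if torch.cuda.is_available():
    model.cuda()
    for param_state in optimizer.state.values():
        for k, v in param_state.items():
            if torch.is_tensor(v):
                param_state[k] = v.cuda()
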
Example #15
class Trainer:
    def __init__(self, config):

        self.config = config
        self.device = config['device']
        self.step = 0
        if os.path.exists('../vocab.pt'):
            self.vocab = torch.load('../vocab.pt')
        else:
            self.vocab = Vocab(config['vocab_file'], config['vocab_size'])
            torch.save(self.vocab, '../vocab.pt')
        self.train_data = CNNDMDataset('train', config['data_path'], config,
                                       self.vocab)
        self.validate_data = CNNDMDataset('val', config['data_path'], config,
                                          self.vocab)

        self.setup(config)

    def setup(self, config):

        self.model = Model(config).to(config['device'])
        self.optimizer = Adagrad(self.model.parameters(),
                                 lr=config['learning_rate'],
                                 initial_accumulator_value=0.1)
        # self.optimizer = Adam(self.model.parameters(),lr = config['learning_rate'],betas = config['betas'])
        checkpoint = None

        if config['train_from'] != '':  # note: between two most_common() calls, Counter may output equal-frequency elements in a different order!
            logging('Train from %s' % config['train_from'])
            checkpoint = torch.load(config['train_from'], map_location='cpu')
            self.model.load_state_dict(checkpoint['model'])
            self.step = checkpoint['step']
            self.vocab = checkpoint['vocab']
            self.optimizer.load_state_dict(checkpoint['optimizer'])
            # print('State dict parameters:')
            # for n in model.state_dict().keys():
            #     print(n)
        #self.optimizer = Adam(self.model.parameters(),lr = config['learning_rate'],betas = config['betas'])

    def train_one(self, batch):
        """ coverage not implemented """
        config = self.config
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros = \
            get_input_from_batch(batch, config, self.device)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, self.device)
        pred = self.model(enc_batch, dec_batch, enc_padding_mask,
                          dec_padding_mask, enc_batch_extend_vocab,
                          extra_zeros)
        # >>>>>>>> DEBUG Session <<<<<<<<<
        # print("ENC\n")
        # print(enc_batch)
        # print("DEC\n")
        # print(dec_batch)
        # print("TGT\n")
        # print(target_batch)
        # print("ENCP\n")
        # print(enc_padding_mask)
        # print("DECP\n")
        # print(dec_padding_mask)
        # encs = [self.vocab.id2word(int(v)) for v in enc_batch[:, 0]]
        # decs = [self.vocab.id2word(int(v)) for v in dec_batch[:, 0]]
        # print(' '.join(encs))
        # print(' '.join(decs))
        #print(pred.max(dim=-1)[1][:,0])    #
        #loss = self.model.nll_loss(pred, target_batch, dec_lens_var)
        loss = self.model.label_smoothing_loss(pred, target_batch)
        return loss

    def train(self):

        config = self.config
        train_loader = DataLoader(self.train_data,
                                  batch_size=config['batch_size'],
                                  shuffle=True,
                                  collate_fn=Collate())

        running_avg_loss = 0
        self.model.train()

        for _ in range(config['train_epoch']):
            for batch in train_loader:
                self.step += 1

                loss = self.train_one(batch)
                running_avg_loss = calc_running_avg_loss(
                    loss.item(), running_avg_loss)
                loss.div(float(config['gradient_accum'])).backward()

                if self.step % config['gradient_accum'] == 0:  # gradient accumulation
                    clip_grad_norm_(self.model.parameters(),
                                    config['max_grad_norm'])
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                if self.step % config['report_every'] == 0:
                    logging("Step %d Train loss %.3f" %
                            (self.step, running_avg_loss))
                if self.step % config['save_every'] == 0:
                    self.save()
                if self.step % config['validate_every'] == 0:
                    self.validate()

    @torch.no_grad()
    def validate(self):
        self.model.eval()
        validate_loader = DataLoader(self.validate_data,
                                     batch_size=self.config['batch_size'],
                                     shuffle=False,
                                     collate_fn=Collate())
        losses = []
        for batch in tqdm(validate_loader):
            loss = self.train_one(batch)
            losses.append(loss.item())
        self.model.train()
        ave_loss = sum(losses) / len(losses)
        logging('Validate loss : %f' % ave_loss)

    def save(self):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'step': self.step,
            'vocab': self.vocab
        }
        save_path = os.path.join(self.config['model_path'],
                                 'model_s%d.pt' % self.step)
        logging('Saving model step %d to %s...' % (self.step, save_path))
        torch.save(state, save_path)
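
Trainer.train() above accumulates gradients over config['gradient_accum'] batches before clipping and stepping the Adagrad optimizer. A minimal, self-contained sketch of that pattern (the linear model, random data, and constants are placeholders):

import torch
import torch.nn as nn
from torch.optim import Adagrad
from torch.nn.utils import clip_grad_norm_

model = nn.Linear(8, 1)
optimizer = Adagrad(model.parameters(), lr=0.1, initial_accumulator_value=0.1)
criterion = nn.MSELoss()
gradient_accum, max_grad_norm = 4, 2.0          # same roles as the config entries above

for step in range(1, 17):
    x, y = torch.randn(16, 8), torch.randn(16, 1)
    loss = criterion(model(x), y)
    loss.div(float(gradient_accum)).backward()  # gradients add up across batches
    if step % gradient_accum == 0:              # update once every gradient_accum batches
        clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()
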
Example #16
class TrainSeq2Seq(object):
    def __init__(self, is_word_level=False, is_combined=False, alpha=0.3):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        # self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
        #                        batch_size=config.batch_size, single_pass=False)
        self.dataset = DailyMailDataset("train", self.vocab)
        #time.sleep(15)

        self.is_word_level = is_word_level
        self.is_combined = is_combined
        self.alpha = alpha

        if is_word_level:
            print("Using Word Level Policy Gradient")
        elif is_combined:
            print("Using Combined Policy Gradient w/ alpha = ", alpha)
        else:
            print("Using Sentence Level Policy Gradient")

        train_dir = './train_dumps'
        # train_dir = './train_dumps'
        if not os.path.exists(train_dir):
            #print('create dict')
            os.mkdir(train_dir)

        self.model_dir = os.path.join(
            train_dir, 'dumps_model_{:%m_%d_%H_%M}'.format(datetime.now()))
        if not os.path.exists(self.model_dir):
            #print('create folder')
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)
        return model_save_path

    def setup(self, seqseq_model, model_file_path):
        self.model = seqseq_model

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)
        #self.optimizer = Adam(params, lr=initial_lr)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            print("Loading checkpoint .... ")
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if config.use_gpu:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch_nll(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, config.use_gpu)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, config.use_gpu)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def train_nll(self, n_iters, iter, running_avg_loss):
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch_nll(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("Iteration:", iter, "  loss:", loss, "  Running avg loss:",
                  running_avg_loss)
            iter += 1

            print_interval = 1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 1000 == 0:
                self.save_model(running_avg_loss, iter)

    def train_pg(self,
                 n_iters,
                 start_iter,
                 start_running_avg_loss,
                 start_pg_losses,
                 start_run_avg_losses,
                 num_epochs=50):
        """
        The generator is trained using policy gradients, using the reward from the discriminator.
        Training is done for num_batches batches.
        """

        dataloader = DataLoader(self.dataset,
                                batch_size=config.batch_size,
                                shuffle=True,
                                num_workers=1,
                                collate_fn=create_batch_collate(
                                    self.vocab, config.batch_size))
        # pg_batcher = Batcher(config.train_data_path, self.vocab, mode='train',
        #     batch_size=config.batch_size, single_pass=False)
        #
        # time.sleep(15)

        start = time.time()
        running_avg_loss = start_running_avg_loss
        pg_losses = start_pg_losses
        run_avg_losses = start_run_avg_losses
        iteration = start_iter

        for epoch in range(num_epochs):
            print("Epoch :", epoch + 1)
            for batch in dataloader:
                iteration += 1

                loss = self.train_one_batch_pg(batch)

                running_avg_loss = calc_running_avg_loss(
                    loss, running_avg_loss, iteration)
                print("Iteration:", iteration, "  PG loss:", loss,
                      "  Running avg loss:", running_avg_loss)
                pg_losses.append(loss)
                run_avg_losses.append(running_avg_loss)

                print_interval = 10
                if iteration % print_interval == 0:
                    print(
                        'steps %d, seconds for %d batch: %.2f , loss: %f' %
                        (iteration, print_interval, time.time() - start, loss))

                    start = time.time()

                if iteration % 10 == 0:
                    # Dump model and losses
                    model_file_path = self.save_model(running_avg_loss,
                                                      iteration)
                    pickle.dump(
                        pg_losses,
                        open(
                            os.path.join(
                                self.model_dir,
                                'train_pg_losses_{}.p'.format(iteration)),
                            'wb'))
                    pickle.dump(
                        run_avg_losses,
                        open(
                            os.path.join(
                                self.model_dir,
                                'train_run_avg_losses_{}.p'.format(iteration)),
                            'wb'))
                    # Run eval
                    eval_processor = Evaluate_pg(
                        model_file_path,
                        is_word_level=self.is_word_level,
                        is_combined=self.is_combined,
                        alpha=self.alpha)
                    eval_losses = eval_processor.run_eval(
                        self.model_dir, iteration)

                    # Check if we should stop
                    avg_eval_loss = np.mean(eval_losses)
                    if running_avg_loss < avg_eval_loss:
                        print("Stopping at iteration {}".format(iteration))
                        break

    def compute_policy_grads_using_rewards(self, sentence_rewards,
                                           word_rewards, sentence_losses,
                                           word_losses, word_to_sent_ind):
        if self.is_combined:
            pg_losses = [[(self.alpha * word_reward + (1 - self.alpha) *
                           sentence_rewards[i][word_to_sent_ind[i][j]]) *
                          word_losses[i][j]
                          for j, word_reward in enumerate(abstract_rewards)
                          if j < len(word_to_sent_ind[i])]
                         for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        elif self.is_word_level:
            pg_losses = [[
                word_reward * word_losses[i][j]
                for j, word_reward in enumerate(abstract_rewards)
                if j < len(word_to_sent_ind[i])
            ] for i, abstract_rewards in enumerate(word_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        else:
            pg_losses = [[
                rs * sentence_losses[ri][rsi] for rsi, rs in enumerate(r)
            ] for ri, r in enumerate(sentence_rewards)]
            pg_losses = [sum(pg) for pg in pg_losses]
        return pg_losses

    def compute_pg_loss(self, orig, pred, sentence_losses, split_predictions,
                        word_losses, word_to_sent_ind):
        sentence_rewards = None
        word_rewards = None
        # First compute the rewards
        if not self.is_word_level or self.is_combined:
            sentence_rewards = get_sentence_rewards(orig, pred)

        if self.is_word_level or self.is_combined:
            word_rewards = get_word_level_rewards(orig, split_predictions)

        pg_losses = self.compute_policy_grads_using_rewards(
            sentence_rewards=sentence_rewards,
            word_rewards=word_rewards,
            sentence_losses=sentence_losses,
            word_losses=word_losses,
            word_to_sent_ind=word_to_sent_ind)

        return pg_losses

    def compute_batched_sentence_loss(self, word_losses, orig, pred):
        orig_sum = []
        new_pred = []
        pred_sum = []
        sentence_losses = []

        # Convert the original sum as one single string per article
        for i in range(len(orig)):
            orig_sum.append(' '.join(map(str, orig[i])))
            new_pred.append([])
            pred_sum.append([])
            sentence_losses.append([])

        batch_sent_indices = []
        for i in range(len(pred)):
            sentence = []
            sentence = pred[i]
            losses = word_losses[i]
            sentence_indices = []
            count = 0
            while len(sentence) > 0:
                try:
                    idx = sentence.index(".")
                except ValueError:
                    idx = len(sentence)

                sentence_indices.extend([count for _ in range(idx)])

                if count > 0:
                    new_pred[i].append(new_pred[i][count - 1] +
                                       sentence[:idx + 1])
                else:
                    new_pred[i].append(sentence[:idx + 1])

                sentence_losses[i].append(sum(losses[:idx + 1]))

                sentence = sentence[idx + 1:]
                losses = losses[idx + 1:]
                count += 1
            batch_sent_indices.append(sentence_indices)

        for i in range(len(pred)):
            for j in range(len(new_pred[i])):
                pred_sum[i].append(' '.join(map(str, new_pred[i][j])))

        pg_losses = self.compute_pg_loss(orig_sum,
                                         pred_sum,
                                         sentence_losses,
                                         split_predictions=pred,
                                         word_losses=word_losses,
                                         word_to_sent_ind=batch_sent_indices)

        return pg_losses

    def train_one_batch_pg(self, batch):
        batch_size = batch.batch_size

        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, config.use_gpu)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, config.use_gpu)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        output_ids = []
        # Begin with START symbol
        y_t_1 = torch.ones(batch_size, dtype=torch.long) * self.vocab.word2id(
            data.START_DECODING)
        if config.use_gpu:
            y_t_1 = y_t_1.cuda()

        for _ in range(batch_size):
            output_ids.append([])
            step_losses.append([])

        for di in range(min(max_dec_len, config.max_dec_steps)):
            #y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)  # NLL

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask

            # Move on to next token
            _, idx = torch.max(final_dist, 1)
            idx = idx.reshape(batch_size, -1).squeeze()
            y_t_1 = idx

            for i, pred in enumerate(y_t_1):
                if not pred.item() == data.PAD_TOKEN:
                    output_ids[i].append(pred.item())

            for i, loss in enumerate(step_loss):
                step_losses[i].append(step_loss[i])

        # Obtain the original and predicted summaries
        original_abstracts = batch.original_abstracts_sents
        predicted_abstracts = [
            data.outputids2words(ids, self.vocab, None) for ids in output_ids
        ]

        # Compute the batched loss
        batched_losses = self.compute_batched_sentence_loss(
            step_losses, original_abstracts, predicted_abstracts)
        #batched_losses = Variable(batched_losses, requires_grad=True)
        losses = torch.stack(batched_losses)
        losses = losses / dec_lens_var

        loss = torch.mean(losses)
        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()
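
train_one_batch_pg above sums the per-word NLL into per-sentence losses and then weights them by rewards in compute_policy_grads_using_rewards. The toy numbers below walk through just the sentence-level branch for a single example with two generated sentences; the reward values are invented and would come from get_sentence_rewards in the real code.

sentence_losses = [[1.2, 0.8]]     # summed word NLL per sentence, one inner list per example
sentence_rewards = [[0.9, 0.1]]    # reward per sentence (e.g. ROUGE against the reference)

pg_losses = [[r * sentence_losses[i][j] for j, r in enumerate(rewards)]
             for i, rewards in enumerate(sentence_rewards)]
pg_losses = [sum(pg) for pg in pg_losses]
# 1.2 * 0.9 + 0.8 * 0.1 = 1.16: high-reward sentences keep most of their loss weight
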
Example #17
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        time.sleep(15)
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        train_dir = os.path.join(config.log_root, 'train_{}'.format(stamp))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter_step):
        """保存模型"""
        state = {
            'iter': iter_step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        stamp = time.strftime("%Y%m%d_%H%M%S", time.localtime())
        model_save_path = os.path.join(self.model_dir, 'model_{}_{}'.format(iter_step, stamp))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        """模型初始化或加载、初始化迭代次数、损失、优化器"""
        # 初始化模型
        self.model = Model(model_file_path)

        # 模型参数的列表
        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr  # lr_coverage和lr二选一
        # 定义优化器
        self.optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)
        # 初始化迭代次数和损失
        start_iter, start_loss = 0, 0
        # 如果传入的已存在的模型路径,加载模型继续训练
        if model_file_path is not None:
            state = torch.load(model_file_path, map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if USE_CUDA:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.to(DEVICE)

        return start_iter, start_loss

    def train_one_batch(self, batch):
        """
        训练一个batch,返回该batch的loss。
        enc_batch:             torch.Size([16, 400]), 16篇文章的编码,不足400词的用pad的编码补足, oov词汇用0编码;
        enc_padding_mask:      torch.Size([16, 400]), 对应pad的位置为0,其余为1;
        enc_lens:              numpy.ndarray, 列表内每个元素表示每篇article的单词数;
        enc_batch_extend_vocab:torch.Size([16, 400]), 16篇文章的编码;oov词汇用超过词汇表的编码;
        extra_zeros:           torch.Size([16, 文章oov词汇数量]) zero tensor;
        c_t_1:                 torch.Size([16, 512]) zero tensor;
        coverage:              Variable(torch.zeros(batch_size, max_enc_seq_len)) if is_coverage==True else None;coverage模式时后续有值
        ----------------------------------------
        dec_batch:             torch.Size([16, 100]) 摘要编码含有开始符号编码以及PAD;
        dec_padding_mask:      torch.Size([16, 100]) 对应pad的位置为0,其余为1;
        max_dec_len:           标量,摘要词语数量,不包含pad
        dec_lens_var:          torch.Size([16] 摘要词汇数量
        target_batch:          torch.Size([16, 100]) 目标摘要编码含有STOP符号编码以及PAD
        """
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch)
        # note: extra_zeros provides zero-initialized probability slots for in-article OOV words (the original comment flagged its purpose as unclear)

        self.optimizer.zero_grad()
        """
        # 记得修改Batch类添加vocab属性

        print("模型输入文章编码:", "*"*100)
        print("enc_batch:", enc_batch, enc_batch.size())
        print("enc_batch[-1]:", enc_batch[-1])
        # print("batch._id_to_word:", batch.vocab._id_to_word)
        print("enc_batch[-1]原文:", [batch.vocab.id2word(idx) for idx in enc_batch[-1].cpu().numpy()])
        print("-"*50)
        print("enc_padding_mask:", enc_padding_mask, enc_padding_mask.size())
        print("-"*50)
        print("enc_lens:", enc_lens, enc_lens.shape)
        print("-"*50)
        print("enc_batch_extend_vocab", enc_batch_extend_vocab, enc_batch_extend_vocab.size())
        print("enc_batch_extend_vocab[-1]:", enc_batch_extend_vocab[-1])
        print("enc_batch_extend_vocab[-1]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in enc_batch_extend_vocab[-1].cpu().numpy()])
        print("-"*50)
        print("extra_zeros:", extra_zeros, extra_zeros.size())
        print("-"*50)
        print("c_t_1:", c_t_1, c_t_1.size())
        print("-"*50)
        print("coverage:", coverage)
        print("*"*100)

        print("模型输入摘要编码,包括源和目标:", "*"*100)
        print("dec_batch:", dec_batch, dec_batch.size())
        print("dec_batch[0]:", dec_batch[0])
        # print("batch._id_to_word:", batch.vocab._id_to_word)
        print("dec_batch[0]原文:", [batch.vocab.id2word(idx) for idx in dec_batch[0].cpu().numpy()])
        print("-"*50)
        print("dec_padding_mask:", dec_padding_mask, dec_padding_mask.size())
        print("-"*50)
        print("max_dec_len:", max_dec_len)
        print("-"*50)
        print("dec_lens_var", dec_lens_var, dec_lens_var.size())
        print("-"*50)
        print("target_batch:", target_batch, target_batch.size())
        print("-"*50)
        print("target_batch[0]:", target_batch[0], target_batch[0].size())
        print("target_batch[0]的原文:", [batch.vocab.id2word(idx) if idx<50000 else '[UNK]+{}'.format(idx-50000) for idx in target_batch[0].cpu().numpy()])
        print("*"*100)
        input("任意键继续>>>")
        """
        # [B, max(seq_lens), 2*hid_dim], [B*max(seq_lens), 2*hid_dim], tuple([2, B, hid_dim], [2, B, hid_dim])
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)  # (h,c) = ([3, B, hid_dim], [3, B, hid_dim])
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # one summary word per sequence: the token id at the same position in every example of the batch (teacher forcing)
            # print("y_t_1:", y_t_1, y_t_1.size())
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, s_t_1,
                                                                                           encoder_outputs,
                                                                                           encoder_feature,
                                                                                           enc_padding_mask, c_t_1,
                                                                                           extra_zeros,
                                                                                           enc_batch_extend_vocab,
                                                                                           coverage, di)
            target = target_batch[:, di]  # id of the next (gold) summary word
            # print("target-iter:", target, target.size())
            # print("final_dist:", final_dist, final_dist.size())
            # input("go on>>")
            # final_dist holds a probability for every word in the extended vocabulary, i.e. it can be larger than the base 50_000 entries
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()  # pick out the probability of each target word (gold_probs)
            step_loss = -torch.log(gold_probs + config.eps)  # maximizing gold_probs == minimizing step_loss (hence the minus sign)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        # set up training: model, optimizer, starting iteration and running loss
        iter_step, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter_step < n_iters:
            # fetch the next batch
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter_step)
            iter_step += 1

            if iter_step % 100 == 0:
                self.summary_writer.flush()

            # print_interval = 1000
            if iter_step % 100 == 0:
                # lr = self.optimizer.state_dict()['param_groups'][0]['lr']
                logging.info('steps %d, seconds for %d steps: %.2f, loss: %f' %
                             (iter_step, 100, time.time() - start, loss))
                start = time.time()
            # save the model every 50000 iterations
            if iter_step % 50000 == 0:
                logging.info("model saved = {}/{}".format(int(iter_step / 50000) + 1, int(config.max_iterations/50000) + 1))
                self.save_model(running_avg_loss, iter_step)
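
When is_coverage is enabled, train_one_batch above adds a coverage penalty at every decoder step: the element-wise minimum of the current attention distribution and the coverage vector accumulated so far, summed over source positions and scaled by cov_loss_wt. A toy illustration with made-up attention values:

import torch

attn_dist = torch.tensor([[0.10, 0.60, 0.30],
                          [0.50, 0.25, 0.25]])   # attention at the current step (batch of 2, 3 source tokens)
coverage = torch.tensor([[0.05, 0.90, 0.05],
                         [0.40, 0.10, 0.50]])    # sum of the attention from earlier steps
cov_loss_wt = 1.0

step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
# row 0: 0.05 + 0.60 + 0.05 = 0.70,  row 1: 0.40 + 0.10 + 0.25 = 0.75
penalty = cov_loss_wt * step_coverage_loss       # added to the step NLL above
coverage = coverage + attn_dist                  # coverage handed to the next step
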
Example #18
class Trainer():
    def __init__(self,
                 model,
                 args,
                 train_dataset,
                 eval_dataset,
                 test_dataset,
                 vocab,
                 is_train=True):
        self.model = model  #.to(args.device)
        self.args = args
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = test_dataset
        self.is_train = is_train
        self.vocab = vocab

        self.params = list(model.encoder.parameters()) + \
            list(model.decoder.parameters()) + list(model.reduce_state.parameters())
        initial_lr = args.lr_coverage if args.is_coverage else args.lr
        self.optimizer = Adagrad(
            self.params,
            lr=initial_lr,
            initial_accumulator_value=args.adagrad_init_acc)

    def get_train_dataloader(self):
        if self.train_dataset is None:
            raise ValueError('Trainer: training requires a train_dataset.')
        return BucketIterator(dataset=self.train_dataset,
                              batch_size=self.args.batch_size,
                              device=self.args.device,
                              sort_key=lambda x: len(x.source),
                              sort_within_batch=True)

    def get_eval_dataloader(self):
        if self.eval_dataset is None:
            raise ValueError('Trainer: eval requires a eval_dataset.')
        return BucketIterator(dataset=self.eval_dataset,
                              batch_size=self.args.batch_size,
                              device=self.args.device,
                              sort_key=lambda x: len(x.source),
                              sort_within_batch=True)

    def get_test_dataloader(self):
        if self.test_dataset is None:
            raise ValueError('Trainer: testing requires a test_dataset.')
        return BucketIterator(dataset=self.test_dataset,
                              batch_size=self.args.batch_size,
                              device=self.args.device,
                              sort_key=lambda x: len(x.source),
                              sort_within_batch=True)

    def get_mask(self, batch):
        # print('each batch', batch[0].size())
        maxlen = batch[0].size()[1]
        max_enc_seq_len = batch[1]
        mask = torch.arange(maxlen).to(self.args.device)
        mask = mask[None, :] < max_enc_seq_len[:, None]
        # print(batch.source[0]*mask)
        return mask

    def get_extra_features(self, batch):
        unk_index = self.vocab.stoi[UNKNOWN_TOKEN]
        batch = batch.cpu().detach().numpy()
        batch_size = batch.shape[0]
        max_art_oovs = max([Counter(sample)[unk_index] for sample in batch])
        extra_zeros = None

        enc_batch_extend_vocab = np.full_like(
            batch, fill_value=self.vocab.stoi[PAD_TOKEN])
        max_art_oovs = 0
        for i, sample_index in enumerate(batch):
            oov_word_count = len(self.vocab)
            for j, word_index in enumerate(sample_index):
                if word_index == unk_index:
                    enc_batch_extend_vocab[i, j] = oov_word_count
                    oov_word_count += 1
            max_art_oovs = max(max_art_oovs, oov_word_count)
        max_art_oovs -= len(self.vocab)
        enc_batch_extend_vocab = Variable(
            torch.from_numpy(enc_batch_extend_vocab).long())

        extra_zeros = Variable(torch.zeros((batch_size, max_art_oovs)))
        return extra_zeros, enc_batch_extend_vocab, max_art_oovs

    def save_model(self, running_avg_loss, iter, model_dir):
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss,
            'vocab': self.vocab
        }
        model_save_path = os.path.join(
            model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def evaluate(self, eval_dataset=None, iter=0, is_test=False):
        if is_test:
            eval_iter = self.get_test_dataloader()
        else:
            eval_iter = self.get_eval_dataloader()
        self.model.eval()

        running_avg_loss = 0
        with torch.no_grad():
            for i, batch in tqdm(enumerate(eval_iter), total=len(eval_iter)):
                # print(batch.source[0].size())
                # exit()
                batch_size = batch.batch_size
                # encoder part
                enc_padding_mask = self.get_mask(batch.source)
                enc_batch = batch.source[0]
                enc_lens = batch.source[1]
                encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
                    enc_batch, enc_lens)
                s_t_1 = self.model.reduce_state(encoder_hidden)
                coverage = Variable(torch.zeros(batch.source[0].size())).to(
                    self.args.device)
                c_t_1 = Variable(
                    torch.zeros(
                        (batch_size,
                         2 * self.args.hidden_dim))).to(self.args.device)
                extra_zeros, enc_batch_extend_vocab, max_art_oovs = self.get_extra_features(
                    batch.source[0])
                extra_zeros = extra_zeros.to(self.args.device)
                enc_batch_extend_vocab = enc_batch_extend_vocab.to(
                    self.args.device)
                # decoder part
                dec_batch = batch.target[0][:, :-1]
                # print(dec_batch.size())
                target_batch = batch.target[0][:, 1:]  # shifted by one: each target is the token after the decoder input
                dec_lens_var = batch.target[1]
                dec_padding_mask = self.get_mask(batch.target)
                max_dec_len = max(dec_lens_var)

                step_losses = []
                for di in range(min(max_dec_len, self.args.max_dec_steps) - 1):
                    y_t_1 = dec_batch[:, di]  # Teacher forcing
                    final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                        y_t_1, s_t_1, encoder_outputs, encoder_feature,
                        enc_padding_mask, c_t_1, extra_zeros,
                        enc_batch_extend_vocab, coverage, di)
                    target = target_batch[:, di]
                    gold_probs = torch.gather(final_dist, 1,
                                              target.unsqueeze(1)).squeeze()
                    step_loss = -torch.log(gold_probs + self.args.eps)
                    if self.args.is_coverage:
                        step_coverage_loss = torch.sum(
                            torch.min(attn_dist, coverage), 1)
                        step_loss = step_loss + self.args.cov_loss_wt * step_coverage_loss
                        coverage = next_coverage

                    step_mask = dec_padding_mask[:, di]
                    step_loss = step_loss * step_mask
                    step_losses.append(step_loss)
                sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
                batch_avg_loss = sum_losses / dec_lens_var
                loss = torch.mean(batch_avg_loss)

                # evaluation runs under torch.no_grad(); gradient clipping and
                # the optimizer step from the training loop are not needed here

                # running_avg_loss = loss if running_avg_loss == 0 else running_avg_loss * decay + (1 - decay) * loss
                # running_avg_loss = min(running_avg_loss, 12)
            name = 'Test' if is_test else 'Evaluation'
            calc_running_avg_loss(loss.item(), running_avg_loss,
                                  summary_writer, iter, name)
            # iter += 1

    # def predict(self, source_sentence):

    def train(self, model_path=None):

        train_iter = self.get_train_dataloader()
        iter, running_avg_loss = 0, 0
        start = time.time()
        for epoch in range(self.args.epoches):
            print(f"Epoch: {epoch+1}")
            self.model.train()
            for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
                # print(batch.source[0].size())
                # exit()
                batch_size = batch.batch_size
                # encoder part
                enc_padding_mask = self.get_mask(batch.source)
                enc_batch = batch.source[0]
                enc_lens = batch.source[1]
                encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
                    enc_batch, enc_lens)
                s_t_1 = self.model.reduce_state(encoder_hidden)
                coverage = Variable(torch.zeros(batch.source[0].size())).to(
                    self.args.device)
                c_t_1 = Variable(
                    torch.zeros(
                        (batch_size,
                         2 * self.args.hidden_dim))).to(self.args.device)
                extra_zeros, enc_batch_extend_vocab, max_art_oovs = self.get_extra_features(
                    batch.source[0])
                extra_zeros = extra_zeros.to(self.args.device)
                enc_batch_extend_vocab = enc_batch_extend_vocab.to(
                    self.args.device)
                # decoder part
                dec_batch = batch.target[0][:, :-1]
                # print(dec_batch.size())
                target_batch = batch.target[0][:, 1:]  # shifted by one: each target is the token after the decoder input
                dec_lens_var = batch.target[1]
                dec_padding_mask = self.get_mask(batch.target)
                max_dec_len = max(dec_lens_var)

                step_losses = []
                for di in range(min(max_dec_len, self.args.max_dec_steps) - 1):
                    y_t_1 = dec_batch[:, di]  # Teacher forcing
                    final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                        y_t_1, s_t_1, encoder_outputs, encoder_feature,
                        enc_padding_mask, c_t_1, extra_zeros,
                        enc_batch_extend_vocab, coverage, di)
                    target = target_batch[:, di]
                    gold_probs = torch.gather(final_dist, 1,
                                              target.unsqueeze(1)).squeeze()
                    step_loss = -torch.log(gold_probs + self.args.eps)
                    if self.args.is_coverage:
                        step_coverage_loss = torch.sum(
                            torch.min(attn_dist, coverage), 1)
                        step_loss = step_loss + self.args.cov_loss_wt * step_coverage_loss
                        coverage = next_coverage

                    step_mask = dec_padding_mask[:, di]
                    step_loss = step_loss * step_mask
                    step_losses.append(step_loss)
                sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
                batch_avg_loss = sum_losses / dec_lens_var
                loss = torch.mean(batch_avg_loss)

                loss.backward()

                norm = clip_grad_norm_(self.model.encoder.parameters(),
                                       self.args.max_grad_norm)
                clip_grad_norm_(self.model.decoder.parameters(),
                                self.args.max_grad_norm)
                clip_grad_norm_(self.model.reduce_state.parameters(),
                                self.args.max_grad_norm)

                self.optimizer.step()

                running_avg_loss = calc_running_avg_loss(
                    loss.item(), running_avg_loss, summary_writer, iter,
                    'Train')
                iter += 1
                if iter % self.args.flush == 0:  # flush the summary writer periodically
                    # print('flush')
                    summary_writer.flush()
                # print_interval = 10
                # if iter % print_interval == 0:
                #     print(f'steps {iter}, batch number: {i} with {time.time() - start} seconds, loss: {loss}')
                #     start = time.time()
                # if iter % 300 == 0:
            self.save_model(running_avg_loss, iter, model_dir)
            self.evaluate(self.eval_dataset, epoch)
            self.evaluate(self.test_dataset, epoch, True)
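
get_mask above turns per-sequence lengths into a boolean padding mask with a single broadcast comparison. The same trick in isolation, on toy lengths and without a device:

import torch

lengths = torch.tensor([5, 3, 1])        # e.g. enc_lens for a batch of three sources
maxlen = int(lengths.max())

mask = torch.arange(maxlen)              # positions [0, 1, 2, 3, 4]
mask = mask[None, :] < lengths[:, None]  # broadcast to shape [batch, maxlen]
# tensor([[ True,  True,  True,  True,  True],
#         [ True,  True,  True, False, False],
#         [ True, False, False, False, False]])
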
Example #19
class WeightedHolE(nn.Module):
    def __init__(self, *args, **kwargs):
        super(WeightedHolE, self).__init__()
        # self.add_hyperparam('rparam', kwargs.pop('rparam', 0.0))

        self.learning_rate = kwargs.get('lr', _DEF_LEARNING_RATE)
        entity_dim, _, relation_dim = args[0]
        embed_dim = args[1]
        self._max_epochs = kwargs.get('max_epochs', _DEF_MAX_EPOCHS)
        
        init_relations = kwargs.get('init_relations')
        if init_relations is not None:
            self.R = nn.Parameter(init_relations)
        else:
            self.R = nn.Parameter(torch.FloatTensor(relation_dim, embed_dim).uniform_(-.1,.1))
        self.R.my_name = 'R'
        self.R.grad = torch.zeros_like(self.R)
        
        pretrained_ent = kwargs.get('pretrained_entities')
        if pretrained_ent is not None:
            self.E = nn.Parameter(pretrained_ent)
        else:
            self.E = nn.Parameter(torch.FloatTensor(entity_dim, embed_dim).uniform_(-.1,.1))
        self.E.my_name = 'E'
        self.E.grad = torch.zeros_like(self.E)
        
        self.loss_function = nn.SoftMarginLoss(reduction='sum')
        self.optim = Adagrad(list(self.parameters()), lr=self.learning_rate)
        
    def forward(self, xs, ys, minibatch_size):
        for loss, grads in self._optim(list(zip(xs, ys)), minibatch_size):
            yield loss, grads
        
    def _optim(self, xys, minibatch_size):
        for self._epoch in range(1, self._max_epochs+1):
            self.loss = 0
            self.optim.zero_grad()
            self.train()
            
            # shuffle training examples
            indices = list(range(len(xys)))
            shuffle(indices)
            
            # store epoch for callback
            self.epoch_start = timeit.default_timer()
            
            # process mini-batches
            lower_iter, upper_iter = count(0, minibatch_size), count(minibatch_size, minibatch_size) 
            for lower, upper in zip(lower_iter, upper_iter):
                # select indices for current batch
                if lower >= len(indices):
                    break

                batch_examples = [xys[idx] for idx in indices[lower:upper]]
                triples, ys = zip(*batch_examples)
                ss, ps, os = zip(*triples)
                ss, ps, os, ys = (torch.LongTensor(ss), torch.LongTensor(ps),
                                  torch.LongTensor(os), torch.FloatTensor(ys))
                        
                yscores = self._scores(ss, ps, os) # see Holographic Embeddings, eq. 2
                self.loss = self.loss_function(yscores, ys)
                print('loss', self.loss)

                fs = -(ys * torch.sigmoid(-yscores)).unsqueeze(1)
                entity_grad, entity_idxs = self._fn_Entity_Grad(yscores, ss, os, ps, fs)
                relation_grad, relation_idxs = self._fn_Relation_Grad(yscores, ss, os, ps, fs)
                #print('grad rel', relation_grads.shape, torch.sum(relation_grads))
                
                for param in self.parameters():
                    if param.my_name == 'R':
                        self.R.grad = relation_grad
                    
                    if param.my_name == 'E':
                        for col,row_grads in zip(entity_idxs, entity_grad): # FIXME use index_put_
                            self.E.grad[col] = row_grads

                self.optim.step()

                # yield the per-batch loss and gradients so that forward() has
                # something to iterate over (the original yield here was
                # commented out, which left _optim returning None)
                yield self.loss.item(), (entity_grad, relation_grad)


    def _fn_Entity_Grad(self, yscores, ss, os, ps, fs):
        sparse_indices, Sm, n = grad_sum_matrix(torch.cat((ss, os)))
        combined = torch.cat((fs * ccorr(self.R[ps], self.E[os]),
                              fs * cconv(self.E[ss], self.R[ps])),
                             dim=0)
        grads = torch.mm(Sm, combined) / n.unsqueeze(1)
        return grads, sparse_indices

    def _fn_Relation_Grad(self, yscores, ss, os, ps, fs):
        sparse_indices, Sm, n = grad_sum_matrix(ps)
        grads = torch.mm(Sm, fs * ccorr(self.E[ss], self.E[os])) / n
        return grads, sparse_indices
        
    def _scores(self, ss, ps, os):
        return torch.sum(self.R[ps] * ccorr(self.E[ss], self.E[os]), dim=1)

    # note: legacy numpy-style Adagrad update kept from the original
    # implementation; self.p2 and self.param are not defined in this torch
    # port, so this method appears to be unused.
    def _update(self, g, idx=None):
        self.p2[idx] += g * g
        H = np.maximum(np.sqrt(self.p2[idx]), 1e-7)
        self.param[idx] -= self.learning_rate * g / H
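
The _scores method above implements eq. 2 of the holographic-embeddings paper with a ccorr helper that is not shown here. Assuming ccorr is the usual FFT-based circular correlation, a self-contained sketch of the triple score looks like this (the ccorr definition and the random embeddings are assumptions, not the original helper):

import torch

def ccorr(a, b):
    # circular correlation via FFT: irfft(conj(rfft(a)) * rfft(b))
    return torch.fft.irfft(torch.conj(torch.fft.rfft(a)) * torch.fft.rfft(b),
                           n=a.shape[-1])

def hole_scores(E, R, ss, ps, os_):
    # score(s, p, o) = <r_p, ccorr(e_s, e_o)>, as in _scores() above
    return torch.sum(R[ps] * ccorr(E[ss], E[os_]), dim=1)

E = torch.randn(10, 8)                  # 10 entity embeddings of dimension 8
R = torch.randn(4, 8)                   # 4 relation embeddings
ss, ps, os_ = torch.tensor([0, 1]), torch.tensor([2, 3]), torch.tensor([5, 6])
print(hole_scores(E, R, ss, ps, os_))   # one real-valued score per triple
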
Example #20
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()

            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 100 == 0:
                self.summary_writer.flush()
            print_interval = 50
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 5000 == 0:
                self.save_model(running_avg_loss, iter)
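Example #20's inner decoding loop shows a pattern shared by most of the snippets below: gather the gold-token probability at each step, take its negative log, zero out padded positions with the decoder mask, and normalize by the true decoder lengths. A stripped-down sketch of just that loss computation on dummy tensors (the shapes and names here are illustrative, not the repository's API):

import torch

def masked_nll_loss(step_dists, targets, dec_mask, dec_lens, eps=1e-12):
    """step_dists: (steps, batch, vocab) per-step output distributions;
    targets, dec_mask: (batch, steps); dec_lens: (batch,)."""
    step_losses = []
    for di in range(step_dists.size(0)):
        gold_probs = torch.gather(
            step_dists[di], 1, targets[:, di].unsqueeze(1)).squeeze(1)
        step_losses.append(-torch.log(gold_probs + eps) * dec_mask[:, di])
    sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
    return torch.mean(sum_losses / dec_lens)

# Dummy batch: 4 decoding steps, batch size 2, vocab of 10.
dists = torch.softmax(torch.randn(4, 2, 10), dim=-1)
targets = torch.randint(0, 10, (2, 4))
loss = masked_nll_loss(dists, targets, torch.ones(2, 4), torch.full((2,), 4.0))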
Example #21
0
class Trainer:
    def __init__(self, config):
        self.config = config
        self.step = 0
        self.vocab = Vocab(config.vocab_file, config.vocab_size)
        self.train_data = CNNDMDataset('train', config.data_path, config,
                                       self.vocab)
        self.validate_data = CNNDMDataset('val', config.data_path, config,
                                          self.vocab)
        # self.model = Model(config).to(device)
        # self.optimizer = None
        self.setup(config)

    def setup(self, config):

        model = Model(config)
        checkpoint = None
        if config.train_from != '':
            logging('Train from %s' % config.train_from)
            checkpoint = torch.load(config.train_from, map_location='cpu')
            model.load_state_dict(checkpoint['model'])
            self.step = checkpoint['step']

        self.model = model.to(device)
        self.optimizer = Adagrad(model.parameters(),
                                 lr=config.learning_rate,
                                 initial_accumulator_value=config.initial_acc)
        if checkpoint is not None:
            self.optimizer.load_state_dict(checkpoint['optimizer'])

    def train_one(self, batch):

        config = self.config
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, config, device)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, device)

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        for di in range(max_dec_len):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)
        return loss

    def train(self):

        config = self.config
        train_loader = DataLoader(self.train_data,
                                  batch_size=config.batch_size,
                                  shuffle=True,
                                  collate_fn=Collate())

        running_avg_loss = 0
        self.model.train()

        for e in range(config.train_epoch):
            for batch in train_loader:
                self.step += 1
                self.optimizer.zero_grad()
                loss = self.train_one(batch)
                loss.backward()
                clip_grad_norm_(self.model.parameters(), config.max_grad_norm)
                self.optimizer.step()
                #print(loss.item())
                running_avg_loss = calc_running_avg_loss(
                    loss.item(), running_avg_loss)

                if self.step % config.report_every == 0:
                    logging("Step %d Train loss %.3f" %
                            (self.step, running_avg_loss))
                if self.step % config.validate_every == 0:
                    self.validate()
                if self.step % config.save_every == 0:
                    self.save(self.step)
                if self.step % config.test_every == 0:
                    pass

    @torch.no_grad()
    def validate(self):
        self.model.eval()
        validate_loader = DataLoader(self.validate_data,
                                     batch_size=self.config.batch_size,
                                     shuffle=False,
                                     collate_fn=Collate())
        losses = []
        for batch in validate_loader:
            loss = self.train_one(batch)
            losses.append(loss.item())
        self.model.train()
        ave_loss = sum(losses) / len(losses)
        logging('Validate loss : %f' % ave_loss)

    def save(self, step):
        state = {
            'model': self.model.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'step': step
        }
        save_path = os.path.join(self.config.model_path, 'model_s%d.pt' % step)
        logging('Saving model step %d to %s...' % (step, save_path))
        torch.save(state, save_path)
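Both trainers above track progress through calc_running_avg_loss, whose body is not shown in these snippets. A plausible minimal version, assuming it is the usual exponentially decayed running average (the decay constant is a guess, not a value from the source):

def calc_running_avg_loss(loss, running_avg_loss, decay=0.99):
    """Exponentially decayed running average of the batch loss;
    a running value of 0 is treated as 'not yet initialized'."""
    if running_avg_loss == 0:
        return loss
    return running_avg_loss * decay + (1 - decay) * loss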
Example #22
0
def train():
    target_field = Field(sequential=True,
                         init_token=START_DECODING,
                         eos_token=STOP_DECODING,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)

    source_field = Field(sequential=True,
                         init_token=SENTENCE_START,
                         eos_token=SENTENCE_END,
                         pad_token=PAD_TOKEN,
                         batch_first=True,
                         include_lengths=True,
                         unk_token=UNKNOWN_TOKEN,
                         lower=True)
    train_path = '../data/incar_alexa/train_public.pickle'
    dev_path = '../data/incar_alexa/dev_public.pickle'
    test_path = '../data/incar_alexa/test_public.pickle'
    path = '../data/cnn_stories_tokenized'
    summary_writer = SummaryWriter(config.summary_path)

    train_src, train_tgt, train_id = load_data(train_path)
    dev_src, dev_tgt, dev_id = load_data(dev_path)
    test_src, test_tgt, test_id = load_data(test_path)
    # train_data = prepare_data_cnn(path)
    # # print(train_data[0])
    # train_src = [dt['src'] for dt in train_data]
    # train_tgt = [dt['tgt'] for dt in train_data]
    # train_id = [dt['id'] for dt in train_data]
    # train_src, test_src, train_tgt, test_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, test_id = train_test_split(
    #     train_id, test_size=0.15, random_state=123)
    # # print(f"{len(train_src)}, {len(train_tgt)}")
    # train_src, dev_src, train_tgt, dev_tgt = train_test_split(
    #     train_src, train_tgt, test_size=0.15, random_state=123)
    # train_id, dev_id = train_test_split(
    #     train_id, test_size=0.15, random_state=123)

    # print(source_field.preprocess(train_src[0]))
    # exit()
    train_src_preprocessed = [source_field.preprocess(x) for x in train_src]
    dev_src_preprocessed = [source_field.preprocess(x) for x in dev_src]
    test_src_preprocessed = [source_field.preprocess(x) for x in test_src]

    train_tgt_preprocessed = [target_field.preprocess(x) for x in train_tgt]
    dev_tgt_preprocessed = [target_field.preprocess(x) for x in dev_tgt]
    test_tgt_preprocessed = [target_field.preprocess(x) for x in test_tgt]
    # train_src_preprocessed = source_field.apply(lambda x: source_field.preprocess(x))

    vectors = Vectors(
        name='/home/binhna/Downloads/shared_resources/cc.en.300.vec',
        cache='/home/binhna/Downloads/shared_resources/')

    source_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed, train_tgt_preprocessed,
        dev_tgt_preprocessed
    ],
                             vectors=vectors)
    target_field.build_vocab([
        train_src_preprocessed, dev_src_preprocessed, train_tgt_preprocessed,
        dev_tgt_preprocessed
    ],
                             vectors=vectors)

    train_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(train_src, train_tgt, train_id)]
    train_data = Mydataset(data=train_data,
                           fields=(('source', source_field), ('target',
                                                              target_field)))
    dev_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(dev_src, dev_tgt, dev_id)]
    # print(dev_data[0])
    dev_data = Mydataset(data=dev_data,
                         fields=(('source', source_field), ('target',
                                                            target_field)))

    test_data = [{
        'src': src,
        'tgt': tgt,
        'id': id
    } for src, tgt, id in zip(test_src, test_tgt, test_id)]
    test_data = Mydataset(data=test_data,
                          fields=(('source', source_field), ('target',
                                                             target_field)))
    # print(train_data[10].source)
    # print(train_data[10].target)
    # print(len(target_field.vocab))
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    train_iter, test_iter, dev_iter = BucketIterator.splits(
        datasets=(train_data, test_data, dev_data),
        batch_sizes=(config.batch_size, config.batch_size, config.batch_size),
        device=device,
        sort_key=lambda x: len(x.source),
        sort_within_batch=True)

    args = ARGS()
    setattr(args, 'vectors', source_field.vocab.vectors)
    setattr(args, 'vocab_size', len(source_field.vocab.itos))
    setattr(args, 'emb_dim', vectors.dim)
    model = Model(args)

    params = list(model.encoder.parameters()) + list(
        model.decoder.parameters()) + list(model.reduce_state.parameters())
    initial_lr = config.lr_coverage if config.is_coverage else config.lr
    optimizer = Adagrad(params,
                        lr=initial_lr,
                        initial_accumulator_value=config.adagrad_init_acc)

    iter, running_avg_loss = 0, 0
    start = time.time()
    for epoch in range(500):
        print(f"Epoch: {epoch+1}")
        for i, batch in tqdm(enumerate(train_iter), total=len(train_iter)):
            # print(batch.source[0].size())
            # exit()
            batch_size = batch.batch_size
            # encoder part
            enc_padding_mask = get_mask(batch.source, device)
            enc_batch = batch.source[0]
            enc_lens = batch.source[1]
            encoder_outputs, encoder_feature, encoder_hidden = model.encoder(
                enc_batch, enc_lens)
            s_t_1 = model.reduce_state(encoder_hidden)
            coverage = Variable(torch.zeros(batch.source[0].size())).to(device)
            c_t_1 = Variable(torch.zeros(
                (batch_size, 2 * config.hidden_dim))).to(device)
            extra_zeros, enc_batch_extend_vocab, max_art_oovs = get_extra_features(
                batch.source[0], source_field.vocab)
            extra_zeros = extra_zeros.to(device)
            enc_batch_extend_vocab = enc_batch_extend_vocab.to(device)
            # decoder part
            dec_batch = batch.target[0][:, :-1]
            # print(dec_batch.size())
            target_batch = batch.target[0][:, 1:]  # targets are the decoder inputs shifted by one step
            dec_lens_var = batch.target[1]
            dec_padding_mask = get_mask(batch.target, device)
            max_dec_len = max(dec_lens_var)

            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps) - 1):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1,
                                          target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(
                        torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)
            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            loss = torch.mean(batch_avg_loss)

            optimizer.zero_grad()  # reset gradients accumulated from the previous batch
            loss.backward()

            norm = clip_grad_norm_(model.encoder.parameters(),
                                   config.max_grad_norm)
            clip_grad_norm_(model.decoder.parameters(), config.max_grad_norm)
            clip_grad_norm_(model.reduce_state.parameters(),
                            config.max_grad_norm)

            optimizer.step()

            running_avg_loss = calc_running_avg_loss(loss.item(),
                                                     running_avg_loss,
                                                     summary_writer, iter)
            iter += 1
            summary_writer.flush()
            # print_interval = 10
            # if iter % print_interval == 0:
            #     print(f'steps {iter}, batch number: {i} with {time.time() - start} seconds, loss: {loss}')
            #     start = time.time()
            if iter % 300 == 0:
                save_model(model, optimizer, running_avg_loss, iter,
                           config.model_dir)
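Like the other pointer-generator trainers here, Example #22 can add See et al.'s coverage penalty: at each decoder step the loss gains the element-wise minimum of the attention distribution and the running coverage vector, summed over source positions. A small sketch of just that term on dummy tensors, assuming coverage is the running sum of past attention (the helper name is illustrative):

import torch

def coverage_step(attn_dist, coverage, cov_loss_wt=1.0):
    """Per-example coverage penalty for one decoder step, plus the
    updated coverage vector (coverage accumulates attention)."""
    step_cov_loss = cov_loss_wt * torch.sum(torch.min(attn_dist, coverage), dim=1)
    return step_cov_loss, coverage + attn_dist

# Dummy batch of 2 over a 5-token source sequence.
attn = torch.softmax(torch.randn(2, 5), dim=1)
coverage = torch.zeros(2, 5)
penalty, coverage = coverage_step(attn, coverage)  # penalty: shape (2,)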
Example #23
0
class Train(object):
    def __init__(self):
        if config.is_hierarchical:
            raise Exception("Hierarchical PGN-AMI not supported!")

        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.pad_id = self.vocab.word2id(PAD_TOKEN)
        self.start_id = self.vocab.word2id(START_DECODING)
        self.stop_id = self.vocab.word2id(STOP_DECODING)

        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir,
                                       'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, ami_data, idx):
        # enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
        #     get_ami_input_from_batch(batch, use_cuda)
        # dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
        #     get_ami_output_from_batch(batch, use_cuda)

        enc_pack, dec_pack = get_a_batch(ami_data,
                                         idx,
                                         self.vocab,
                                         config.batch_size,
                                         config.max_enc_steps,
                                         config.max_dec_steps,
                                         self.start_id,
                                         self.stop_id,
                                         self.pad_id,
                                         sum_type='short',
                                         use_cuda=use_cuda)
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing

            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)

            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        ami_data = load_ami_data('train')
        valid_data = load_ami_data('valid')
        # move 6 shuffled validation examples into the training set (bringing it to 100) and validate on the rest
        random.shuffle(valid_data)
        ami_data.extend(valid_data[:6])
        valid_data = valid_data[6:]

        num_batches = len(ami_data)
        idx = 0

        # validation & stopping
        best_valid_loss = 1000000000
        stop_counter = 0

        while iter < n_iters:
            if idx == 0:
                print("shuffle training data")
                random.shuffle(ami_data)

            loss = self.train_one_batch(ami_data, idx)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)

            iter += 1
            idx += config.batch_size
            if idx >= num_batches:  # wrap around (>= in case batch_size does not divide the data size)
                idx = 0

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(
                    str(datetime.now()), iter, loss))
                sys.stdout.flush()

            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)

            if iter % config.eval_every == 0:
                valid_loss = self.run_eval(valid_data)
                print("valid_loss = {:.5f}".format(valid_loss))
                if valid_loss < best_valid_loss:
                    stop_counter = 0
                    best_valid_loss = valid_loss
                    print("VALID better")
                else:
                    stop_counter += 1
                    print(
                        "VALID NOT better, counter = {}".format(stop_counter))
                    if stop_counter == config.stop_after:
                        print("Stop training")
                        return

        print("Finished training!")

    def eval_one_batch(self, eval_data, idx):

        enc_pack, dec_pack = get_a_batch(eval_data,
                                         idx,
                                         self.vocab,
                                         1,
                                         config.max_enc_steps,
                                         config.max_dec_steps,
                                         self.start_id,
                                         self.stop_id,
                                         self.pad_id,
                                         sum_type='short',
                                         use_cuda=use_cuda)

        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = enc_pack
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = dec_pack

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)

            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist,
                                      dim=1,
                                      index=target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.data.item()

    def run_eval(self, eval_data):
        running_avg_loss, iter = 0, 0
        batch_losses = []
        num_batches = len(eval_data)
        print("valid data size = {}".format(num_batches))
        for idx in range(num_batches):
            loss = self.eval_one_batch(eval_data, idx)
            batch_losses.append(loss)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     iter)
            print("#", end="")
            sys.stdout.flush()
        print()

        avg_loss = sum(batch_losses) / len(batch_losses)
        return avg_loss
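Several of these setup_train methods reload a checkpointed Adagrad state dict and then walk optimizer.state to push the accumulator tensors back onto the GPU. A condensed sketch of that restore pattern, assuming a checkpoint saved with the same keys used above (the helper itself is illustrative):

import torch
from torch.optim import Adagrad

def restore_optimizer(optimizer, checkpoint_path, device='cuda'):
    """Load an optimizer state dict from disk and move its tensors
    (e.g. Adagrad's per-parameter accumulators) onto the target device."""
    state = torch.load(checkpoint_path, map_location='cpu')
    optimizer.load_state_dict(state['optimizer'])
    for param_state in optimizer.state.values():
        for k, v in param_state.items():
            if torch.is_tensor(v):
                param_state[k] = v.to(device)
    return state.get('iter', 0), state.get('current_loss', 0.0)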
Example #24
0
class Train(object):
    def __init__(self, opt):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)
        self.opt = opt
        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):

        # Training setup: resolve which checkpoint (if any) to resume from
        if self.opt.load_model is not None:
            model_file_path = os.path.join(self.model_dir, self.opt.load_model)
        else:
            model_file_path = None

        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        if self.opt.train_mle == "yes":
            step_losses = []
            for di in range(min(max_dec_len, config.max_dec_steps)):
                y_t_1 = dec_batch[:, di]  # Teacher forcing
                final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                    y_t_1, s_t_1, encoder_outputs, encoder_feature,
                    enc_padding_mask, c_t_1, extra_zeros,
                    enc_batch_extend_vocab, coverage, di)
                target = target_batch[:, di]
                gold_probs = torch.gather(final_dist, 1,
                                          target.unsqueeze(1)).squeeze()
                step_loss = -torch.log(gold_probs + config.eps)
                if config.is_coverage:
                    step_coverage_loss = torch.sum(
                        torch.min(attn_dist, coverage), 1)
                    step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                    coverage = next_coverage

                step_mask = dec_padding_mask[:, di]
                step_loss = step_loss * step_mask
                step_losses.append(step_loss)

            sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
            batch_avg_loss = sum_losses / dec_lens_var
            mle_loss = torch.mean(batch_avg_loss)
        else:
            mle_loss = get_cuda(torch.FloatTensor([0]))

        # -------------------------------- RL training --------------------------------
        if self.opt.train_rl == "yes":  # perform reinforcement learning training
            # multinomial sampling
            sample_sents, RL_log_probs = self.train_batch_RL(
                encoder_outputs,
                encoder_hidden,
                enc_padding_mask,
                encoder_feature,
                enc_batch_extend_vocab,
                extra_zeros,
                c_t_1,
                batch.art_oovs,
                coverage,
                greedy=False)
            with torch.autograd.no_grad():
                # greedy sampling
                greedy_sents, _ = self.train_batch_RL(encoder_outputs,
                                                      encoder_hidden,
                                                      enc_padding_mask,
                                                      encoder_feature,
                                                      enc_batch_extend_vocab,
                                                      extra_zeros,
                                                      c_t_1,
                                                      batch.art_oovs,
                                                      coverage,
                                                      greedy=True)

            sample_reward = self.reward_function(sample_sents,
                                                 batch.original_abstracts)
            baseline_reward = self.reward_function(greedy_sents,
                                                   batch.original_abstracts)
            # if iter%200 == 0:
            #     self.write_to_file(sample_sents, greedy_sents, batch.original_abstracts, sample_reward, baseline_reward, iter)
            rl_loss = -(
                sample_reward - baseline_reward
            ) * RL_log_probs  # Self-critic policy gradient training (eq 15 in https://arxiv.org/pdf/1705.04304.pdf)
            rl_loss = torch.mean(rl_loss)

            batch_reward = torch.mean(sample_reward).item()
        else:
            rl_loss = get_cuda(torch.FloatTensor([0]))
            batch_reward = 0
        #loss.backward()
        (self.opt.mle_weight * mle_loss +
         self.opt.rl_weight * rl_loss).backward()
        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return mle_loss.item(), batch_reward

    def train_batch_RL(self, encoder_outputs, encoder_hidden, enc_padding_mask,
                       encoder_feature, enc_batch_extend_vocab, extra_zeros,
                       c_t_1, article_oovs, coverage, greedy):
        '''Generate sentences from the decoder entirely using sampled tokens as input;
        these sentences are used for ROUGE evaluation.

        Args:
        :param encoder_outputs: Encoder outputs for all time steps (batch_size, length_input_sequence, 2*hidden_size)
        :param encoder_hidden: Tuple containing the final hidden and cell state of the encoder; shape of h & c: (batch_size, hidden_size)
        :param enc_padding_mask: Mask for the encoder input; tensor of size (batch_size, length_input_sequence) with 0 for pad tokens and 1 for others
        :param c_t_1: Encoder context vector for time_step=0 (eq 5 in https://arxiv.org/pdf/1705.04304.pdf)
        :param extra_zeros: Tensor used to extend the vocab distribution for the pointer mechanism
        :param enc_batch_extend_vocab: Input batch that stores OOV ids
        :param article_oovs: Batch containing the list of OOVs in each example
        :param greedy: If True, performs greedy sampling; otherwise multinomial sampling

        Returns:
        :decoded_strs: List of decoded sentences
        :log_probs: Log probabilities of sampled words
        '''
        s_t_1 = self.model.reduce_state(
            encoder_hidden)  # Decoder hidden states
        y_t_1 = get_cuda(
            torch.LongTensor(len(encoder_outputs)).fill_(
                self.vocab.word2id(data.START_DECODING))
        )  # Input to the decoder (cf. intra-temporal attention, section 2.1 in https://arxiv.org/pdf/1705.04304.pdf)
        inds = []  # Stores sampled indices for each time step
        decoder_padding_mask = []  # Stores padding masks of generated samples
        log_probs = []  # Stores log probabilities of generated samples
        mask = get_cuda(
            torch.LongTensor(len(encoder_outputs)).fill_(1)
        )  # Values that indicate whether [STOP] token has already been encountered; 1 => Not encountered, 0 otherwise

        for t in range(config.max_dec_steps):
            probs, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, t)
            if greedy is False:
                multi_dist = Categorical(probs)  # sample according to the predicted distribution
                y_t_1 = multi_dist.sample()  # perform multinomial sampling
                log_prob = multi_dist.log_prob(y_t_1)
                log_probs.append(log_prob)
            else:
                _, y_t_1 = torch.max(probs, dim=1)  # greedy sampling: take the highest-probability word
            y_t_1 = y_t_1.detach()
            inds.append(y_t_1)
            # Padding mask of the batch for the current time step
            mask_t = get_cuda(torch.zeros(len(encoder_outputs)))
            # mask_t = 1 if [STOP] has not been encountered up to the previous step, else 0
            mask_t[mask == 1] = 1
            # If [STOP] was not encountered before and the current word is [STOP], set mask = 0
            mask[(mask == 1) + (y_t_1 == self.vocab.word2id(data.STOP_DECODING)) == 2] = 0
            decoder_padding_mask.append(mask_t)
            # Mask indicating whether the sampled word is OOV; replace OOVs with the [UNK] token
            is_oov = (y_t_1 >= config.vocab_size).long()
            y_t_1 = (1 - is_oov) * y_t_1 + is_oov * self.vocab.word2id(data.UNKNOWN_TOKEN)

        inds = torch.stack(inds, dim=1)
        decoder_padding_mask = torch.stack(decoder_padding_mask, dim=1)
        if greedy is False:  # If multinomial sampling, compute log probabilities of the sampled words
            log_probs = torch.stack(log_probs, dim=1)
            log_probs = log_probs * decoder_padding_mask  # Ignore sampled words whose padding mask is 0
            lens = torch.sum(decoder_padding_mask, dim=1)  # Length of each sampled sentence
            log_probs = torch.sum(log_probs, dim=1) / lens  # (bs,) normalized log probability of each sentence
        decoded_strs = []
        for i in range(len(encoder_outputs)):
            id_list = inds[i].cpu().numpy()
            oovs = article_oovs[i]
            S = data.outputids2words(
                id_list, self.vocab,
                oovs)  # Generate sentence corresponding to sampled words
            try:
                end_idx = S.index(data.STOP_DECODING)
                S = S[:end_idx]
            except ValueError:
                pass  # no [STOP] token was generated; keep the full sequence
            if len(S) < 2:
                # If the sentence has fewer than 2 words, replace it with "xxx";
                # avoids sentences like "." which make ROUGE throw an error
                S = ["xxx"]
            S = " ".join(S)
            decoded_strs.append(S)

        return decoded_strs, log_probs

    def reward_function(self, decoded_sents, original_sents):
        rouge = Rouge()
        try:
            scores = rouge.get_scores(decoded_sents, original_sents)
        except Exception:
            print("ROUGE failed for multi-sentence evaluation; scoring pairs individually")
            scores = []
            for i in range(len(decoded_sents)):
                try:
                    score = rouge.get_scores(decoded_sents[i],
                                             original_sents[i])
                except Exception:
                    print("Error occured at:")
                    print("decoded_sents:", decoded_sents[i])
                    print("original_sents:", original_sents[i])
                    score = [{"rouge-1": {"p": 0.0}}]
                scores.append(score[0])
        rouge_l_p1 = [score["rouge-1"]["p"] for score in scores]
        rouge_l_p1 = get_cuda(torch.FloatTensor(rouge_l_p1))
        return rouge_l_p1

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        while iter < n_iters:
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch)

            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            iter += 1

            if iter % 50 == 0:
                self.summary_writer.flush()
            print_interval = 50
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batch: %.2f , loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 100 == 0:
                self.save_model(running_avg_loss, iter)
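Example #24 blends maximum-likelihood training with the self-critical policy-gradient objective of Paulus et al. (eq. 15 of the linked paper): the reward of a multinomially sampled summary minus the reward of the greedy baseline scales the sampled sequence's log-probability. A minimal sketch of that loss combination, taking the rewards and log-probabilities as given tensors (the mixing weights are placeholders, not the repository's defaults):

import torch

def self_critical_loss(sample_reward, baseline_reward, sample_log_probs,
                       mle_loss, mle_weight=0.25, rl_weight=0.75):
    """Self-critical policy-gradient loss blended with an MLE term.
    sample_reward, baseline_reward: (batch,) ROUGE-style rewards;
    sample_log_probs: (batch,) normalized log-probs of sampled sequences."""
    rl_loss = torch.mean(-(sample_reward - baseline_reward) * sample_log_probs)
    return mle_weight * mle_loss + rl_weight * rl_loss

# Dummy values for a batch of 3.
loss = self_critical_loss(torch.tensor([0.4, 0.2, 0.5]),
                          torch.tensor([0.3, 0.3, 0.5]),
                          torch.tensor([-12.0, -9.5, -11.0]),
                          mle_loss=torch.tensor(4.2))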
Example #25
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)

        self.batcher = Batcher(config.train_data_path, self.vocab, mode='train',
                               batch_size=config.batch_size, single_pass=False)
        # print("MODE MUST BE train")
        # time.sleep(15)
        self.print_interval = config.print_interval

        train_dir = config.train_dir
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = train_dir
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        # self.summary_writer = tf.compat.v1.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(self.model_dir, 'iter{}.pt'.format(iter))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(params, lr=initial_lr, initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path, map_location= lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()

        return start_iter, start_loss

    def train_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        self.optimizer.zero_grad()

        if not config.is_hierarchical:
            encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(enc_batch, enc_lens)
            s_t_1 = self.model.reduce_state.forward1(encoder_hidden)

        else:
            stop_id = self.vocab.word2id('.')
            pad_id  = self.vocab.word2id('[PAD]')
            enc_sent_pos = get_sent_position(enc_batch, stop_id, pad_id)
            dec_sent_pos = get_sent_position(dec_batch, stop_id, pad_id)

            encoder_outputs, encoder_feature, encoder_hidden, sent_enc_outputs, sent_enc_feature, sent_enc_hidden, sent_enc_padding_mask, sent_lens, seq_lens2 = \
                                                                    self.model.encoder(enc_batch, enc_lens, enc_sent_pos)

            s_t_1, sent_s_t_1 = self.model.reduce_state(encoder_hidden, sent_enc_hidden)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            if not config.is_hierarchical:
                # start = datetime.now()

                final_dist, s_t_1,  c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder.forward1(y_t_1, s_t_1,
                                                            encoder_outputs, encoder_feature, enc_padding_mask, c_t_1,
                                                            extra_zeros, enc_batch_extend_vocab,
                                                                               coverage, di)
                # print('NO HIER Time: ',datetime.now() - start)
                # import pdb; pdb.set_trace()
            else:
                # start = datetime.now()
                max_doc_len = enc_batch.size(1)
                final_dist, sent_s_t_1,  c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(y_t_1, sent_s_t_1,
                                                            encoder_outputs, encoder_feature, enc_padding_mask, seq_lens2,
                                                            sent_s_t_1, sent_enc_outputs, sent_enc_feature, sent_enc_padding_mask,
                                                            sent_lens, max_doc_len,
                                                            c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
                # print('DO HIER Time: ',datetime.now() - start)
                # import pdb; pdb.set_trace()


            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses/dec_lens_var
        loss = torch.mean(batch_avg_loss)

        # start = datatime.now()
        loss.backward()
        # print('{} HIER Time: {}'.format(config.is_hierarchical ,datetime.now() - start))
        # import pdb; pdb.set_trace()

        clip_grad_norm_(self.model.encoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(), config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        iter, running_avg_loss = self.setup_train(model_file_path)
        sys.stdout.flush()

        # data_path = "lib/data/batches_train.vocab50000.batch16.pk.bin"
        # with open(data_path, 'rb') as f:
        #     stored_batches = pickle.load(f, encoding="bytes")
        # print("loaded data: {}".format(data_path))
        # num_batches = len(stored_batches)

        while iter < n_iters:
            batch = self.batcher.next_batch()
            # batch_id = iter%num_batches
            # batch = stored_batches[batch_id]

            loss = self.train_one_batch(batch)

            # running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, self.summary_writer, iter)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss, iter)

            iter += 1

            # if iter % 100 == 0:
            #     self.summary_writer.flush()

            if iter % self.print_interval == 0:
                print("[{}] iter {}, loss: {:.5f}".format(str(datetime.now()), iter, loss))
                sys.stdout.flush()

            if iter % config.save_every == 0:
                self.save_model(running_avg_loss, iter)

        print("Finished training!")
Example #26
0
class Train(object):
    def __init__(self):
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.ouput_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)

        self.checkpoint_dir = os.path.join(train_dir, 'checkpoints')
        if not os.path.exists(self.checkpoint_dir):
            os.makedirs(self.checkpoint_dir)

        self.train_summary_writer = tf.summary.create_file_writer(
            os.path.join(train_dir, 'log', 'train'))
        self.eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(train_dir, 'log', 'eval'))

    def save_model(self, model_path, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        torch.save(state, model_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(device, model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if torch.is_tensor(v):
                            state[k] = v.to(device)

        return start_iter, start_loss

    def train_one_batch(self, batch, forcing_ratio=1):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, device)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, device)

        self.optimizer.zero_grad()

        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        step_losses = []
        y_t_1_hat = None
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]
            # decide the next input
            if di == 0 or random.random() < forcing_ratio:
                x_t = y_t_1  # teacher forcing, use label from last time step as input
            else:
                # use the embedding of UNK for every OOV word (ids at or beyond the vocab size)
                y_t_1_hat[y_t_1_hat >= self.vocab.size()] = self.vocab.word2id(
                    UNKNOWN_TOKEN)
                x_t = y_t_1_hat.flatten()  # use the prediction from the last time step as input
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                x_t, s_t_1, encoder_outputs, encoder_feature, enc_padding_mask,
                c_t_1, extra_zeros, enc_batch_extend_vocab, coverage, di)
            _, y_t_1_hat = final_dist.data.topk(1)
            target = target_batch[:, di].unsqueeze(1)
            step_loss = cal_NLLLoss(target, final_dist)
            if config.is_coverage:  # if not using coverage, keep coverage=None
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage),
                                               1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]  # padding in the target should not count toward the loss
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def train(self, n_iters, init_model_path=None):
        iter, avg_loss = self.setup_train(init_model_path)
        start = time.time()
        cnt = 0
        best_model_path = None
        min_eval_loss = float('inf')
        while iter < n_iters:
            s = config.forcing_ratio
            k = config.decay_to_0_iter
            x = iter
            near_zero = 0.0001
            if config.forcing_decay_type:
                if x >= config.decay_to_0_iter:
                    forcing_ratio = 0
                elif config.forcing_decay_type == 'linear':
                    forcing_ratio = s * (k - x) / k
                elif config.forcing_decay_type == 'exp':
                    p = pow(near_zero, 1 / k)
                    forcing_ratio = s * (p**x)
                elif config.forcing_decay_type == 'sig':
                    r = math.log((1 / near_zero) - 1) / k
                    forcing_ratio = s / (1 + pow(math.e, r * (x - k / 2)))
                else:
                    raise ValueError('Unrecognized forcing_decay_type: ' +
                                     config.forcing_decay_type)
            else:
                forcing_ratio = config.forcing_ratio
            batch = self.batcher.next_batch()
            loss = self.train_one_batch(batch, forcing_ratio=forcing_ratio)
            model_path = os.path.join(self.checkpoint_dir,
                                      'model_step_%d' % (iter + 1))
            avg_loss = calc_avg_loss(loss, avg_loss)

            if (iter + 1) % config.print_interval == 0:
                with self.train_summary_writer.as_default():
                    tf.summary.scalar(name='loss', data=loss, step=iter)
                self.train_summary_writer.flush()
                logger.info('steps %d, took %.2f seconds, train avg loss: %f' %
                            (iter + 1, time.time() - start, avg_loss))
                start = time.time()
            if config.eval_interval is not None and (
                    iter + 1) % config.eval_interval == 0:
                start = time.time()
                logger.info("Start Evaluation on model %s" % model_path)
                eval_processor = Evaluate(self.model, self.vocab)
                eval_loss = eval_processor.run_eval()
                logger.info(
                    "Evaluation finished, took %.2f seconds, eval loss: %f" %
                    (time.time() - start, eval_loss))
                with self.eval_summary_writer.as_default():
                    tf.summary.scalar(name='eval_loss',
                                      data=eval_loss,
                                      step=iter)
                self.eval_summary_writer.flush()
                if eval_loss < min_eval_loss:
                    logger.info(
                        "This is the best model so far, saving it to disk.")
                    min_eval_loss = eval_loss
                    best_model_path = model_path
                    self.save_model(model_path, eval_loss, iter)
                    cnt = 0
                else:
                    cnt += 1
                    if cnt > config.patience:
                        logger.info(
                            "Eval loss hasn't dropped for %d straight evaluations, early stopping.\n"
                            "Best model: %s (eval loss %f)" %
                            (config.patience, best_model_path, min_eval_loss))
                        break
                start = time.time()
            elif (iter + 1) % config.save_interval == 0:
                self.save_model(model_path, avg_loss, iter)
            iter += 1
        else:
            logger.info(
                "Training finished, best model: %s, with eval loss: %f" %
                (best_model_path, min_eval_loss))
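
# A minimal, self-contained sketch of the three teacher-forcing decay schedules used in
# train() above (linear, exponential and inverse-sigmoid decay of config.forcing_ratio
# towards 0 at config.decay_to_0_iter). Illustrative only: the config fields are passed
# as plain arguments, and the name forcing_ratio_at is not part of the original code.
import math

def forcing_ratio_at(step, start_ratio, decay_to_0_iter, decay_type='linear', near_zero=1e-4):
    """Teacher-forcing ratio at a given training step."""
    if step >= decay_to_0_iter:
        return 0.0
    if decay_type == 'linear':
        return start_ratio * (decay_to_0_iter - step) / decay_to_0_iter
    if decay_type == 'exp':
        p = near_zero ** (1.0 / decay_to_0_iter)   # per-step multiplicative factor
        return start_ratio * (p ** step)
    if decay_type == 'sig':
        r = math.log(1.0 / near_zero - 1.0) / decay_to_0_iter
        return start_ratio / (1.0 + math.exp(r * (step - decay_to_0_iter / 2.0)))
    raise ValueError('Unrecognized decay_type: ' + decay_type)

# e.g. forcing_ratio_at(0, 1.0, 10000, 'exp') == 1.0; the ratio approaches 1e-4 just
# before step 10000 and is clamped to 0 from then on.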
示例#27
0
        train_loss = 0

        for _iter in tqdm(range(cfg.EPOCH_ITERS)):
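            # NOTE: a fresh iterator is built every step, so next() yields the DataLoader's
            # first batch each time; with shuffle=True that is a freshly shuffled batch,
            # otherwise it is always the same one.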
            batch_iterator = iter(dataloader)
            # zero the gradient buffers
            optimizer.zero_grad()

            (gt, patch_2, patch_3) = next(batch_iterator)

            # Move tensors to the GPU if one is available
            if torch.cuda.device_count():
                gt = gt.cuda()
                patch_2 = patch_2.cuda()
                patch_3 = patch_3.cuda()

            softmax_scores = model.forward(patch_2=Variable(patch_2),
                                           patch_3=Variable(patch_3))
            loss = px3_loss(softmax_scores, gt, loss_weights)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        print('Epoch: {} Loss: {}'.format(
            epoch, train_loss / cfg.EPOCH_ITERS / cfg.BATCH_SIZE))
        torch.save(model.state_dict(), cfg.SAVE_PATH.format(epoch))
    print('Done.')
示例#28
0
optim = Adagrad(model.parameters(), lr=0.01, weight_decay=0.003)
batch_size = 1000
n_batch = (n_train + batch_size - 1) // batch_size  # ceiling division; avoids an empty final batch

for epoch in range(n_epoch):
    loss_t = 0
    for i in range(n_batch):
        start = i*batch_size
        end = min(n_train, (i+1)*batch_size)
        x_batch = x_train[start:end]
        y_batch = y_train[start:end]
        model.zero_grad()
        y_p = model(x_batch)
        loss = l1(y_p, y_batch)
        loss.backward()
        optim.step()
        loss_t += loss.cpu().data.numpy()
    with torch.no_grad():  # evaluation passes don't need gradient tracking
        y_pred_v = model(x_valid)
        loss_v = l1(y_pred_v, y_valid)
        y_pred_a = model(x_all)
        loss_a = l1(y_pred_a, y_all)
    print('Epoch: {}, Loss: {}, Loss_valid: {}, Loss_all: {}'.format(
        epoch, loss_t / n_batch, loss_v.cpu().numpy(), loss_a.cpu().numpy()))
print(y_valid.topk(10))
print(y_pred_v.topk(10))
print(y_all.topk(20))
print(y_pred_a.topk(20))
print('\ntesting.........................B5')
y_p_B5 = model(x_B5)
loss_b5 = l1(y_p_B5, y_B5)
print(loss_b5)
print(y_p_B5)
示例#29
0
class Train(object):
    def __init__(self):
        #print("config.vocab_path: ", config.vocab_path)
        self.vocab = Vocab(config.vocab_path, config.vocab_size)
        self.batcher = Batcher(config.train_data_path,
                               self.vocab,
                               mode='train',
                               batch_size=config.batch_size,
                               single_pass=False)
        time.sleep(15)

        train_dir = os.path.join(config.log_root,
                                 'train_%d' % (int(time.time())))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)

        self.model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(self.model_dir):
            os.mkdir(self.model_dir)

        self.summary_writer = tf.summary.FileWriter(train_dir)

    def save_model(self, running_avg_loss, iter):
        state = {
            'iter': iter,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': running_avg_loss
        }
        model_save_path = os.path.join(
            self.model_dir, 'model_%d_%d' % (iter, int(time.time())))
        torch.save(state, model_save_path)

    def setup_train(self, model_file_path=None):
        self.model = Model(model_file_path)

        params = list(self.model.encoder.parameters()) + list(self.model.decoder.parameters()) + \
                 list(self.model.reduce_state.parameters())
        #print("params : ",params)
        #print("params collection is completed....")
        initial_lr = config.lr_coverage if config.is_coverage else config.lr
        self.optimizer = Adagrad(
            params,
            lr=initial_lr,
            initial_accumulator_value=config.adagrad_init_acc)

        start_iter, start_loss = 0, 0

        #### If training stopped earlier, load that state and resume from it for the remaining iterations ####
        if model_file_path is not None:
            state = torch.load(model_file_path,
                               map_location=lambda storage, location: storage)
            start_iter = state['iter']
            start_loss = state['current_loss']

            if not config.is_coverage:
                self.optimizer.load_state_dict(state['optimizer'])
                ###### Move the optimizer state tensors onto the GPU #####
                if use_cuda:
                    for state in self.optimizer.state.values():
                        for k, v in state.items():
                            if torch.is_tensor(v):
                                state[k] = v.cuda()
        return start_iter, start_loss

    def train_one_batch(self, batch):

        ########### The two calls below unpack the encoder and decoder inputs (ids, masks, lengths, vocab extensions, etc.) from the batch ###########
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)
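        # enc_* hold the padded source ids, mask and lengths; enc_batch_extend_vocab and
        # extra_zeros support the pointer/copy mechanism's extended vocabulary; c_t_1 is the
        # initial context vector and coverage the running attention sum (used when is_coverage).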

        self.optimizer.zero_grad()
        #print("train_one_batch function ......")
        encoder_outputs, encoder_feature, encoder_hidden = self.model.encoder(
            enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(
            encoder_hidden
        )  ### the reduced encoder final hidden state initialises the decoder state at step 0
        #print("s_t_1 : ",len(s_t_1),s_t_1[0].shape,s_t_1[1].shape)

        #print("steps.....")
        #print("max_dec_len = ",max_dec_len)
        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            ############ Training [ Teacher Forcing ] ###########
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            #print("y_t_1 : ",len(y_t_1))
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, next_coverage = self.model.decoder(
                y_t_1, s_t_1, encoder_outputs, encoder_feature,
                enc_padding_mask, c_t_1, extra_zeros, enc_batch_extend_vocab,
                coverage, di)
            #print("attn_dist : ",len(attn_dist),attn_dist[0].shape)
            #print("final_dist : ",len(final_dist),final_dist[0].shape) ############## vocab_Size
            target = target_batch[:, di]
            #print("target = ",len(target))

            gold_probs = torch.gather(final_dist, 1,
                                      target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)  # Eqn_6
            if config.is_coverage:
                step_coverage_loss = torch.sum(
                    torch.min(attn_dist, coverage), 1)  # Eqn_13a
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss  # Eqn_13b
                coverage = next_coverage

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)
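        # length-normalised NLL: sum the per-step losses, divide by each target's length, then average over the batch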

        sum_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        loss.backward()

        self.norm = clip_grad_norm_(self.model.encoder.parameters(),
                                    config.max_grad_norm)
        clip_grad_norm_(self.model.decoder.parameters(), config.max_grad_norm)
        clip_grad_norm_(self.model.reduce_state.parameters(),
                        config.max_grad_norm)

        self.optimizer.step()

        return loss.item()

    def trainIters(self, n_iters, model_file_path=None):
        print("trainIters__Started___model_file_path is : ", model_file_path)
        iter, running_avg_loss = self.setup_train(model_file_path)
        start = time.time()
        print("Max iteration : n_iters = ", n_iters)
        print("going to start running iter NO : ", iter)
        print("\n******************************\n")
        while iter < n_iters:
            print("\n###################################\n")
            print("iter : ", iter)
            batch = self.batcher.next_batch()
            print("batch data loading : ", batch)
            loss = self.train_one_batch(batch)
            running_avg_loss = calc_running_avg_loss(loss, running_avg_loss,
                                                     self.summary_writer, iter)
            print("running_avg_loss : ", running_avg_loss)
            iter += 1
            if iter % 100 == 0:  ##100
                self.summary_writer.flush()
            print_interval = 100  #1000
            if iter % print_interval == 0:
                print('steps %d, seconds for %d batches: %.2f, loss: %f' %
                      (iter, print_interval, time.time() - start, loss))
                start = time.time()
            if iter % 500 == 0:  ##5000
                self.save_model(running_avg_loss, iter)
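
# A compact restatement of the coverage terms computed in train_one_batch() above
# (pointer-generator coverage, Eqn_13a/13b). Tensor shapes are assumed to be
# (batch, src_len) for both the attention distribution and the coverage vector;
# the helper name coverage_step is illustrative, not part of the original code.
import torch

def coverage_step(attn_dist, coverage, cov_loss_wt):
    """Return the weighted coverage penalty for one decoder step and the updated coverage."""
    step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)  # Eqn_13a
    new_coverage = coverage + attn_dist  # coverage accumulates past attention
    return cov_loss_wt * step_coverage_loss, new_coverage  # weighted term added in Eqn_13b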
示例#30
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--build_data",
                        action="store_true",
                        help="Whether to build data.")
    parser.add_argument("--train_path",
                        type=str,
                        default="../data/tacred_train.json",
                        help="Path to unlabled data.")
    parser.add_argument("--dev_path",
                        type=str,
                        default="../data/tacred_dev.json",
                        help="Path to dev data.")
    parser.add_argument("--test_path",
                        type=str,
                        default="../data/tacred_test.json",
                        help="Path to train data.")
    parser.add_argument("--explanation_data_path",
                        type=str,
                        default="../data/tacred_explanations.json",
                        help="Path to explanation data.")
    parser.add_argument(
        "--vocab_path",
        type=str,
        default="../data/vocabs/vocab_glove.840B.300d_-1_0.6.p",
        help="Path to vocab created in Pre-training")
    parser.add_argument("--match_batch_size",
                        default=50,
                        type=int,
                        help="Match batch size for train.")
    parser.add_argument("--unlabeled_batch_size",
                        default=100,
                        type=int,
                        help="Unlabeled batch size for train.")
    parser.add_argument("--eval_batch_size",
                        default=50,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=0.1,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--epochs",
                        default=60,
                        type=int,
                        help="Number of Epochs for training")
    parser.add_argument('--embeddings',
                        type=str,
                        default="glove.840B.300d",
                        help="initial embeddings to use")
    parser.add_argument('--seed',
                        type=int,
                        default=7698,
                        help="random seed for initialization")
    parser.add_argument('--emb_dim',
                        type=int,
                        default=300,
                        help="embedding vector size")
    parser.add_argument(
        '--hidden_dim',
        type=int,
        default=100,
        help="hidden vector size of lstm (really 2*hidden_dim, due to bilstm)")
    parser.add_argument('--model_save_dir',
                        type=str,
                        default="",
                        help="where to save the model")
    parser.add_argument('--experiment_name',
                        type=str,
                        help="what to save the model file as")
    parser.add_argument('--load_clf_model',
                        action='store_true',
                        help="Whether to load a trained classifier model")
    parser.add_argument('--start_epoch',
                        type=int,
                        default=0,
                        help="start_epoch")
    parser.add_argument('--use_adagrad',
                        action='store_true',
                        help="use adagrad optimizer")

    args = parser.parse_args()

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    lower_bound = -20.0
    dataset = "tacred"
    save_string = generate_save_string(dataset, args.embeddings)
    number_of_classes = len(TACRED_LABEL_MAP)
    none_label_id = TACRED_LABEL_MAP["no_relation"]
    set_re_dataset_ner_label_space(dataset)
    task = "re"

    if args.build_data:
        build_datasets_from_splits(args.train_path,
                                   args.dev_path,
                                   args.test_path,
                                   args.vocab_path,
                                   TACRED_LABEL_MAP,
                                   args.explanation_data_path,
                                   save_string,
                                   task=task,
                                   dataset=dataset)

    with open(
            "../data/training_data/{}_data_{}.p".format(
                "matched", save_string), "rb") as f:
        strict_match_data = pickle.load(f)

    with open(args.vocab_path, "rb") as f:
        vocab = pickle.load(f)

    dev_path = "../data/training_data/dev_data_{}.p".format(save_string)
    test_path = "../data/training_data/test_data_{}.p".format(save_string)

    pad_idx = vocab["<pad>"]

    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    tacred_vocab = build_custom_vocab("tacred", len(vocab))
    custom_vocab_length = len(tacred_vocab)

    clf = BiLSTM_Att_Clf.BiLSTM_Att_Clf(vocab.vectors,
                                        pad_idx,
                                        args.emb_dim,
                                        args.hidden_dim,
                                        torch.cuda.is_available(),
                                        number_of_classes,
                                        custom_token_count=custom_vocab_length)

    del vocab

    epochs = args.epochs
    epoch_string = str(epochs)
    test_epoch_f1_scores = []
    dev_epoch_f1_scores = []
    best_test_f1_score = -1
    best_dev_f1_score = -1

    strict_loss_epoch = []

    if args.load_clf_model:
        clf.load_state_dict(
            torch.load("../data/saved_models/Clf_{}.p".format(
                args.experiment_name)))
        print("loaded model")

        with open("../data/result_data/test_f1_per_epoch_Clf_{}.csv".format(
                args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                test_epoch_f1_scores.append(row)
                if float(row[-1]) > best_test_f1_score:
                    best_test_f1_score = float(row[-1])

        with open("../data/result_data/dev_f1_per_epoch_Clf_{}.csv".format(
                args.experiment_name)) as f:
            reader = csv.reader(f)
            next(reader)
            for row in reader:
                dev_epoch_f1_scores.append(row)
                if float(row[-1]) > best_dev_f1_score:
                    best_dev_f1_score = float(row[-1])

        print("loaded past results")

    clf = clf.to(device)

    if args.use_adagrad:
        optimizer = Adagrad(clf.parameters(), lr=args.learning_rate)
    else:
        optimizer = SGD(clf.parameters(), lr=args.learning_rate)
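    # h0/c0 below have shape (num_layers * num_directions, batch, hidden_dim); the leading 4
    # presumably corresponds to a 2-layer bidirectional LSTM inside BiLSTM_Att_Clf.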

    h0 = torch.empty(4, args.match_batch_size, args.hidden_dim).to(device)
    c0 = torch.empty(4, args.match_batch_size, args.hidden_dim).to(device)
    nn.init.xavier_normal_(h0)
    nn.init.xavier_normal_(c0)

    # define loss functions
    strict_match_loss_function = nn.CrossEntropyLoss()

    for epoch in range(args.start_epoch, args.start_epoch + epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1,
                                          args.start_epoch + epochs))

        total_loss, strict_total_loss, soft_total_loss, sim_total_loss = 0, 0, 0, 0
        batch_count = 0
        clf.train()

        for step, batch in enumerate(
                tqdm(
                    strict_match_data.as_batches(
                        batch_size=args.match_batch_size, seed=epoch))):

            # prepping batch data
            strict_match_tokens, strict_match_lengths, strict_match_labels = batch

            strict_match_tokens = strict_match_tokens.to(device)
            strict_match_labels = strict_match_labels.to(device)

            strict_match_predictions = clf.forward(strict_match_tokens,
                                                   strict_match_lengths, h0,
                                                   c0)

            strict_match_loss = strict_match_loss_function(
                strict_match_predictions, strict_match_labels)

            strict_total_loss = strict_total_loss + strict_match_loss.item()
            batch_count += 1

            if batch_count % 50 == 0 and batch_count > 0:
                print((total_loss, strict_total_loss, soft_total_loss,
                       sim_total_loss, batch_count))

            optimizer.zero_grad()  # clear gradients accumulated from the previous batch
            strict_match_loss.backward()
            torch.nn.utils.clip_grad_norm_(clf.parameters(), 5.0)

            optimizer.step()

        # compute the training loss of the epoch
        train_avg_loss = total_loss / batch_count
        train_avg_strict_loss = strict_total_loss / batch_count
        train_avg_soft_loss = soft_total_loss / batch_count
        train_avg_sim_loss = sim_total_loss / batch_count

        print("Train Losses")
        loss_tuples = ("%.5f" % train_avg_loss, "%.5f" % train_avg_strict_loss,
                       "%.5f" % train_avg_soft_loss,
                       "%.5f" % train_avg_sim_loss)
        print(
            "Avg Train Total Loss: {}, Avg Train Strict Loss: {}, Avg Train Soft Loss: {}, Avg Train Sim Loss: {}"
            .format(*loss_tuples))

        strict_loss_epoch.append(train_avg_strict_loss)

        train_path = "../data/training_data/{}_data_{}.p".format(
            "matched", save_string)
        train_results = evaluate_next_clf(train_path,
                                          clf,
                                          strict_match_loss_function,
                                          number_of_classes,
                                          batch_size=args.eval_batch_size,
                                          none_label_id=none_label_id)
        avg_loss, avg_train_ent_f1_score, avg_train_val_f1_score, total_train_class_probs, no_relation_thresholds = train_results
        print("Train Results")
        train_tuple = ("%.5f" % avg_loss, "%.5f" % avg_train_ent_f1_score,
                       "%.5f" % avg_train_val_f1_score,
                       str(no_relation_thresholds))
        print(
            "Avg Train Loss: {}, Avg Train Entropy F1 Score: {}, Avg Train Max Value F1 Score: {}, Thresholds: {}"
            .format(*train_tuple))

        dev_results = evaluate_next_clf(dev_path,
                                        clf,
                                        strict_match_loss_function,
                                        number_of_classes,
                                        batch_size=args.eval_batch_size,
                                        none_label_id=none_label_id)

        avg_loss, avg_dev_ent_f1_score, avg_dev_val_f1_score, total_dev_class_probs, no_relation_thresholds = dev_results

        print("Dev Results")
        dev_tuple = ("%.5f" % avg_loss, "%.5f" % avg_dev_ent_f1_score,
                     "%.5f" % avg_dev_val_f1_score,
                     str(no_relation_thresholds))
        print(
            "Avg Dev Loss: {}, Avg Dev Entropy F1 Score: {}, Avg Dev Max Value F1 Score: {}, Thresholds: {}"
            .format(*dev_tuple))

        dev_epoch_f1_scores.append(
            (avg_loss, avg_dev_ent_f1_score, avg_dev_val_f1_score,
             max(avg_dev_ent_f1_score, avg_dev_val_f1_score)))

        if max(avg_dev_ent_f1_score, avg_dev_val_f1_score) > best_dev_f1_score:
            best_dev_f1_score = max(avg_dev_ent_f1_score, avg_dev_val_f1_score)
            print("Updated Dev F1 Score")

        test_results = evaluate_next_clf(test_path, clf, strict_match_loss_function, number_of_classes,\
                                         no_relation_thresholds=no_relation_thresholds,\
                                         batch_size=args.eval_batch_size, none_label_id=none_label_id)

        avg_loss, avg_test_ent_f1_score, avg_test_val_f1_score, total_test_class_probs, _ = test_results

        print("Test Results")
        test_tuple = ("%.5f" % avg_loss, "%.5f" % avg_test_ent_f1_score,
                      "%.5f" % avg_test_val_f1_score,
                      str(no_relation_thresholds))
        print(
            "Avg Test Loss: {}, Avg Test Entropy F1 Score: {}, Avg Test Max Value F1 Score: {}, Thresholds: {}"
            .format(*test_tuple))

        test_epoch_f1_scores.append(
            (avg_loss, avg_test_ent_f1_score, avg_test_val_f1_score,
             max(avg_test_ent_f1_score, avg_test_val_f1_score)))

        if best_test_f1_score < max(avg_test_ent_f1_score,
                                    avg_test_val_f1_score):
            print("Saving Model")
            if len(args.model_save_dir) > 0:
                dir_name = args.model_save_dir
            else:
                dir_name = "../data/saved_models/"
            torch.save(clf.state_dict(),
                       "{}Clf_{}.p".format(dir_name, args.experiment_name))
            with open(
                    "../data/result_data/test_predictions_Clf_{}.csv".format(
                        args.experiment_name), "wb") as f:
                pickle.dump(total_test_class_probs, f)
            with open(
                    "../data/result_data/dev_predictions_Clf_{}.csv".format(
                        args.experiment_name), "wb") as f:
                pickle.dump(total_dev_class_probs, f)
            with open("../data/result_data/thresholds.p", "wb") as f:
                pickle.dump({"thresholds": no_relation_thresholds}, f)

            best_test_f1_score = max(avg_test_ent_f1_score,
                                     avg_test_val_f1_score)

        print("Best Test F1: {}".format("%.5f" % best_test_f1_score))
        print(test_epoch_f1_scores[-3:])

    with open(
            "../data/result_data/train_strict_loss_per_epoch_Clf_{}.csv".
            format(args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow(['train loss'])
        for row in strict_loss_epoch:
            writer.writerow([row])

    with open(
            "../data/result_data/dev_f1_per_epoch_Clf_{}.csv".format(
                args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow(
            ['avg_loss', 'entropy_f1_score', 'max_value_f1_score', 'max'])
        for row in dev_epoch_f1_scores:
            writer.writerow(row)

    with open(
            "../data/result_data/test_f1_per_epoch_Clf_{}.csv".format(
                args.experiment_name), "w") as f:
        writer = csv.writer(f)
        writer.writerow(
            ['avg_loss', 'entropy_f1_score', 'max_value_f1_score', 'max'])
        for row in test_epoch_f1_scores:
            writer.writerow(row)