def __init__(self, args):

        # set up output directory
        self.output_dir = os.path.join(args.experiment_dir, args.run_name)
        os.makedirs(self.output_dir, exist_ok=True)

        # initialize tensorboard writer
        self.runs_dir = os.path.join(args.experiment_dir, "runs", args.run_name)
        os.makedirs(self.runs_dir, exist_ok=True)
        self.writer = SummaryWriter(self.runs_dir)

        # initialize global steps
        self.train_gs = 0
        self.val_gs = 0

        # initialize model config
        self.config = ModelConfig(args)

        # check if there is a model to load
        if args.old_model_dir is not None:
            self.use_old_model = True
            self.load_dir = args.old_model_dir
            self.config.load_from_file(
                os.path.join(self.load_dir, "config.json"))

            # create vocab
            self.vocab = Vocab()
            self.vocab.load_from_dict(os.path.join(self.load_dir, "vocab.json"))
            self.update_vocab = False
            self.config.min_count = 1
        else:
            self.use_old_model = False

            self.vocab = None
            self.update_vocab = True

        # create data sets
        self.dataset_filename = args.dataset_filename

        # train
        self.train_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "train_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_train = torch.utils.data.DataLoader(
            self.train_dataset, self.config.train_batch_size, shuffle=True)
        self.config.train_len = len(self.train_dataset)

        self.vocab = self.train_dataset.vocab

        # eval
        self.val_dataset = DialogueDataset(
            os.path.join(self.dataset_filename, "val_data.json"),
            self.config.sentence_len,
            self.vocab,
            self.update_vocab)
        self.data_loader_val = torch.utils.data.DataLoader(
            self.val_dataset, self.config.val_batch_size, shuffle=True)
        self.config.val_len = len(self.val_dataset)

        # update and save the vocab
        self.vocab = self.val_dataset.vocab
        self.train_dataset.vocab = self.vocab
        if self.config.min_count > 1:
            self.config.old_vocab_size = len(self.vocab)
            self.vocab.prune_vocab(self.config.min_count)
        self.vocab.save_to_dict(os.path.join(self.output_dir, "vocab.json"))
        self.vocab_size = len(self.vocab)
        self.config.vocab_size = self.vocab_size

        # load pretrained embeddings if a directory is provided
        if self.config.pretrained_embeddings_dir is not None:
            pretrained_embeddings = get_pretrained_embeddings(
                self.config.pretrained_embeddings_dir, self.vocab)
        else:
            pretrained_embeddings = None

        # print and save the config file
        self.config.print_config(self.writer)
        self.config.save_config(os.path.join(self.output_dir, "config.json"))

        # set device (fall back to CPU when no GPU is available)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # create model
        self.model = Transformer(
            self.config.vocab_size,
            self.config.label_len,
            self.config.sentence_len,
            d_word_vec=self.config.embedding_dim,
            d_model=self.config.model_dim,
            d_inner=self.config.inner_dim,
            n_layers=self.config.num_layers,
            n_head=self.config.num_heads,
            d_k=self.config.dim_k,
            d_v=self.config.dim_v,
            dropout=self.config.dropout,
            pretrained_embeddings=pretrained_embeddings
        ).to(self.device)

        # create optimizer
        self.optimizer = torch.optim.Adam(
            filter(lambda x: x.requires_grad, self.model.parameters()),
            betas=(0.9, 0.98), eps=1e-09)

        # load old model, optimizer if there is one
        if self.use_old_model:
            self.model, self.optimizer = load_checkpoint(
                os.path.join(self.load_dir, "model.bin"),
                self.model, self.optimizer, self.device)


        # create a scheduled optimizer object
        self.optimizer = ScheduledOptim(
            self.optimizer, self.config.model_dim, self.config.warmup_steps)
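# --- Note: get_pretrained_embeddings is not defined in any of these snippets.
# A minimal sketch of the PyTorch-style variant used above, assuming the
# embeddings were saved with np.save as a matrix whose rows already line up
# with the vocabulary indices (the optional vocab check is an assumption; the
# TensorFlow example further down uses a different four-argument variant).
import numpy as np
import torch

def get_pretrained_embeddings(embeddings_path, vocab=None):
    # Load a pre-computed embedding matrix from a .npy file.
    embeddings = np.load(embeddings_path)  # shape: (vocab_size, embedding_dim)
    if vocab is not None:
        # Sanity check: rows must correspond to the vocabulary indices.
        assert embeddings.shape[0] == len(vocab), \
            "embedding matrix does not match the vocabulary size"
    return torch.from_numpy(embeddings).float()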
Example #2
def main():
    nb_epochs = 30
    batch_size = 200
    hidden_size = 256
    embedding_dim = 300
    max_len = 20
    teacher_forcing = 0.6
    min_count = 2
    max_grad_norm = 5
    val_len = 5000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/yelp/models' \
                     '/baseline_frozen_pretrained'

    eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    print('Dataset: {}'.format(len(dataset)))

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(
        dataset, [train_len, val_len])
    print('Train {}, val: {}'.format(len(dataset_train), len(dataset_val)))

    embeddings_dir = '/home/mattd/pycharm/yelp/embeddings.npy'
    embeddings = cuda(get_pretrained_embeddings(embeddings_dir, dataset))

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    batch_size,
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  batch_size,
                                                  shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]
    model = Seq2SeqModel(embeddings, hidden_size, padding_idx, init_idx,
                         max_len, teacher_forcing)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(
        ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    lowest_loss = 500

    for epoch in range(nb_epochs):
        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            for i, (inputs, targets) in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(targets)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if epoch_loss < lowest_loss:
                save_checkpoint(model, epoch_loss, optimizer, model_filename)
                lowest_loss = epoch_loss

            if phase == 'train':
                print('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss),
                      end='')
            else:
                print(', {} loss: {:.3f}'.format(phase, epoch_loss), end='\n')

            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs, targets = dataset_val[random_idx]
                inputs_var = variable(inputs)

                # unsqueeze to add the batch dimension
                outputs_var = model(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                print(u'> {}'.format(
                    get_sentence_from_indices(inputs, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print(u'= {}'.format(
                    get_sentence_from_indices(targets, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print(u'< {}'.format(
                    get_sentence_from_indices(outputs, dataset.vocab,
                                              SentenceDataset.EOS_TOKEN)))
                print()
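# --- Note: the helpers cuda, variable, and argmax are imported from elsewhere
# in the project and are not shown here. A rough sketch of what they might
# look like in plain PyTorch (names and behaviour are assumptions, not the
# project's actual implementation):
import torch

def cuda(obj):
    # Move a tensor or module to the GPU when one is available.
    return obj.cuda() if torch.cuda.is_available() else obj

def variable(x):
    # Wrap raw indices (list / numpy array / tensor) as a tensor on the device.
    return cuda(torch.as_tensor(x))

def argmax(outputs):
    # Greedy decoding: pick the highest-scoring token at each position.
    return outputs.argmax(dim=-1)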
Example #3
    def build_graph(self):
        # Reset previous graph.
        reset_graph()

        # Placeholders.
        x_source = tf.placeholder(tf.int32,
                                  shape=[None, None],
                                  name="x_source")

        source_seq_length = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name="source_seq_length")

        x_target = tf.placeholder(tf.int32,
                                  shape=[None, None],
                                  name="x_target")

        target_seq_length = tf.placeholder(tf.int32,
                                           shape=[None],
                                           name="target_seq_length")

        labels = tf.placeholder(tf.float32,
                                shape=[None],
                                name="labels")

        input_dropout = tf.placeholder_with_default(1.0,
                                                    shape=[],
                                                    name="input_dropout")

        output_dropout = tf.placeholder_with_default(1.0,
                                                     shape=[],
                                                     name="output_dropout")

        decision_threshold = tf.placeholder_with_default(0.5,
                                                         shape=[],
                                                         name="decision_threshold")

        # Embedding layer.
        with tf.variable_scope("embeddings"):
            if self.config.source_embeddings_path is not None and self.config.target_embeddings_path is not None:
                source_pretrained_embeddings, target_pretrained_embeddings = \
                    get_pretrained_embeddings(
                        self.config.source_embeddings_path,
                        self.config.target_embeddings_path,
                        source_vocab,
                        target_vocab)
                assert source_pretrained_embeddings.shape[1] == target_pretrained_embeddings.shape[1]
                self.config.embedding_size = source_pretrained_embeddings.shape[1]
                if self.config.fix_pretrained:
                    source_embeddings = tf.get_variable(
                        name="source_embeddings_matrix",
                        shape=[self.config.source_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(source_pretrained_embeddings),
                        trainable=False)
                    target_embeddings = tf.get_variable(
                        name="target_embeddings_matrix",
                        shape=[self.config.target_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(target_pretrained_embeddings),
                        trainable=False)
                else:
                    source_embeddings = tf.get_variable(
                        name="source_embeddings_matrix",
                        shape=[self.config.source_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(source_pretrained_embeddings))
                    target_embeddings = tf.get_variable(
                        name="target_embeddings_matrix",
                        shape=[self.config.target_vocab_size, self.config.embedding_size],
                        initializer=tf.constant_initializer(target_pretrained_embeddings))
            else:
                source_embeddings = tf.get_variable(
                    name="source_embeddings_matrix",
                    shape=[self.config.source_vocab_size, self.config.embedding_size])
                target_embeddings = tf.get_variable(
                    name="target_embeddings_matrix",
                    shape=[self.config.target_vocab_size, self.config.embedding_size])

            source_rnn_inputs = tf.nn.embedding_lookup(source_embeddings, x_source)
            target_rnn_inputs = tf.nn.embedding_lookup(target_embeddings, x_target)
            source_rnn_inputs = tf.nn.dropout(source_rnn_inputs,
                                              keep_prob=input_dropout,
                                              name="source_seq_embeddings")
            target_rnn_inputs = tf.nn.dropout(target_rnn_inputs,
                                              keep_prob=input_dropout,
                                              name="target_seq_embeddings")

        # BiRNN encoder.
        with tf.variable_scope("birnn") as scope:
            if self.config.use_lstm:
                cell_fw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True)
                cell_bw = tf.nn.rnn_cell.LSTMCell(self.config.state_size, use_peepholes=True)
            else:
                cell_fw = tf.nn.rnn_cell.GRUCell(self.config.state_size)
                cell_bw = tf.nn.rnn_cell.GRUCell(self.config.state_size)

            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, output_keep_prob=output_dropout)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, output_keep_prob=output_dropout)

            if self.config.num_layers > 1:
                if self.config.use_lstm:
                    cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size,
                                                                                   use_peepholes=True)
                                                           for _ in range(self.config.num_layers)])
                    cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.LSTMCell(self.config.state_size,
                                                                                   use_peepholes=True)
                                                           for _ in range(self.config.num_layers)])
                else:
                    cell_fw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size)
                                                           for _ in range(self.config.num_layers)])
                    cell_bw = tf.nn.rnn_cell.MultiRNNCell([tf.nn.rnn_cell.GRUCell(self.config.state_size)
                                                           for _ in range(self.config.num_layers)])

            with tf.variable_scope(scope):
                source_rnn_outputs, source_final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=source_rnn_inputs,
                    sequence_length=source_seq_length,
                    dtype=tf.float32)

            with tf.variable_scope(scope, reuse=True):
                target_rnn_outputs, target_final_state = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=target_rnn_inputs,
                    sequence_length=target_seq_length,
                    dtype=tf.float32)

            self.config.state_size *= 2
            # Mean and max pooling only work for 1 layer BiRNN.
            if self.config.use_mean_pooling:
                source_final_state = self.average_pooling(source_rnn_outputs, source_seq_length)
                target_final_state = self.average_pooling(target_rnn_outputs, target_seq_length)
            elif self.config.use_max_pooling:
                source_final_state = self.max_pooling(source_rnn_outputs)
                target_final_state = self.max_pooling(target_rnn_outputs)
            else:
                source_final_state_fw, source_final_state_bw = source_final_state
                target_final_state_fw, target_final_state_bw = target_final_state
                if self.config.num_layers > 1:
                    source_final_state_fw = source_final_state_fw[-1]
                    source_final_state_bw = source_final_state_bw[-1]
                    target_final_state_fw = target_final_state_fw[-1]
                    target_final_state_bw = target_final_state_bw[-1]
                if self.config.use_lstm:
                    source_final_state_fw = source_final_state_fw.h
                    source_final_state_bw = source_final_state_bw.h
                    target_final_state_fw = target_final_state_fw.h
                    target_final_state_bw = target_final_state_bw.h
                source_final_state = tf.concat([source_final_state_fw, source_final_state_bw],
                                               axis=1, name="source_final_state_ph")
                target_final_state = tf.concat([target_final_state_fw, target_final_state_bw],
                                               axis=1)

        # Feed-forward neural network.
        with tf.variable_scope("feed_forward"):
            h_multiply = tf.multiply(source_final_state, target_final_state)
            h_abs_diff = tf.abs(tf.subtract(source_final_state, target_final_state))

            W_1 = tf.get_variable(name="W_1",
                                  shape=[self.config.state_size, self.config.hidden_size])
            W_2 = tf.get_variable(name="W_2",
                                  shape=[self.config.state_size, self.config.hidden_size])
            b_1 = tf.get_variable(name="b_1",
                                  shape=[self.config.hidden_size],
                                  initializer=tf.constant_initializer(0.0))

            h_semantic = tf.tanh(tf.matmul(h_multiply, W_1) + tf.matmul(h_abs_diff, W_2) + b_1)

            W_3 = tf.get_variable(name="W_3",
                                  shape=[self.config.hidden_size, 1])
            b_2 = tf.get_variable(name="b_2",
                                  shape=[1],
                                  initializer=tf.constant_initializer(0.0))

            logits = tf.matmul(h_semantic, W_3) + b_2
            logits = tf.squeeze(logits,
                                name="logits")

            # Sigmoid output layer.
            with tf.name_scope("output"):
                probs = tf.sigmoid(logits,
                                   name="probs")
                predicted_class = tf.cast(tf.greater(probs, decision_threshold),
                                          tf.float32,
                                          name="predicted_class")

        # Loss.
        with tf.name_scope("cross_entropy"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits,
                labels=labels,
                name="cross_entropy_per_sequence")
            mean_loss = tf.reduce_mean(losses,
                                       name="cross_entropy_loss")

        # Optimization.
        with tf.name_scope("optimization"):
            global_step = tf.Variable(initial_value=0,
                                      trainable=False,
                                      name="global_step")
            optimizer = tf.train.AdamOptimizer(self.config.learning_rate)
            trainable_variables = tf.trainable_variables()
            gradients = tf.gradients(mean_loss, trainable_variables,
                                     name="gradients")
            clipped_gradients, global_norm = tf.clip_by_global_norm(
                gradients,
                clip_norm=self.config.max_gradient_norm,
                name="clipped_gradients")
            train_op = optimizer.apply_gradients(zip(clipped_gradients, trainable_variables),
                                                 global_step=global_step)

        # Evaluation metrics.
        accuracy = tf.metrics.accuracy(labels, predicted_class,
                                       name="accuracy")
        precision = tf.metrics.precision(labels, predicted_class,
                                         name="precision")
        recall = tf.metrics.recall(labels, predicted_class,
                                   name="recall")

        # Add summaries.
        tf.summary.scalar("loss", mean_loss)
        tf.summary.scalar("global_norm", global_norm)
        tf.summary.scalar("accuracy", accuracy[0])
        tf.summary.scalar("precision", precision[0])
        tf.summary.scalar("recall", recall[0])
        tf.summary.scalar("logits" + "/sparsity", tf.nn.zero_fraction(logits))
        tf.summary.histogram("logits" + "/activations", logits)
        tf.summary.histogram("probs", probs)

        # Add histogram for trainable variables.
        for var in trainable_variables:
            tf.summary.histogram(var.op.name, var)

        # Add histogram for gradients.
        for grad, var in zip(clipped_gradients, trainable_variables):
            if grad is not None:
                tf.summary.histogram(var.op.name + "/gradients", grad)

        # Assign placeholders and operations.
        self.x_source = x_source
        self.x_target = x_target
        self.source_seq_length = source_seq_length
        self.target_seq_length = target_seq_length
        self.labels = labels
        self.input_dropout = input_dropout
        self.output_dropout = output_dropout
        self.decision_threshold = decision_threshold
        self.train_op = train_op
        self.probs = probs
        self.predicted_class = predicted_class
        self.mean_loss = mean_loss
        self.accuracy = accuracy
        self.precision = precision
        self.recall = recall
        self.summaries = tf.summary.merge_all()
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
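# --- Note: build_graph only assembles the TF1 graph. A minimal sketch of how
# one training step could be fed through it, assuming `model` is an instance
# of this class after build_graph() and the batch_* arguments are pre-encoded
# NumPy arrays (those names are placeholders, not part of the original code).
# Before the first step, run tf.global_variables_initializer() and
# tf.local_variables_initializer() in the session; the local variables back
# the tf.metrics.* ops.
def run_train_step(sess, model, batch_source, batch_source_len,
                   batch_target, batch_target_len, batch_labels):
    # One optimization step; keep probabilities below 1.0 enable dropout.
    feed_dict = {
        model.x_source: batch_source,               # [batch, src_len] int32
        model.source_seq_length: batch_source_len,  # [batch] int32
        model.x_target: batch_target,               # [batch, tgt_len] int32
        model.target_seq_length: batch_target_len,  # [batch] int32
        model.labels: batch_labels,                 # [batch] float32 in {0., 1.}
        model.input_dropout: 0.8,
        model.output_dropout: 0.7,
    }
    _, loss = sess.run([model.train_op, model.mean_loss], feed_dict=feed_dict)
    return loss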
Example #4
def main():
    nb_epochs = 50
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = "/embeddings_min2_max15.npy"
    max_grad_norm = 5
    max_len = 15
    min_count = 2
    weight_decay = 0.00001
    learning_rate = 0.001
    model_group = "/auto_encoder"
    autoencoder_name = "/auto_encoder_3"
    autoencoder_version = 1
    project_file = "/home/mattd/PycharmProjects/reddit"
    dataset_path = "/home/mattd/PycharmProjects/reddit/data/"

    string = 'nb_epochs: {}\nbatch_size: {}\nhidden_size: {}\nembedding_dim: ' \
             '{}\npretrained_embeddings: {}\nmax_len: {}\nmin_count: '\
             '{}\nweight_decay: {}\nlearning_rate: {}\nmodel_group: ' \
             '{}\nautoencoder_name: {}\nautoencoder_version: {}\n'.format(
                nb_epochs, batch_size, hidden_size, embedding_dim,
                pretrained_embeddings, max_len, min_count, weight_decay,
                learning_rate, model_group, autoencoder_name, autoencoder_version)
    print(string)
    output = string + '\n'

    # embedding_filename = 'embeddings_20_1.npy's'

    model_filename = '{}{}s{}'.format(
        project_file, model_group, autoencoder_name)

    new_model_filename = '{}_{}'.format(model_filename, autoencoder_version)

    output_file = '{}{}_outputs{}_{}'.format(
        project_file, model_group, autoencoder_name, autoencoder_version)

    description_filename = \
        '{}/description/description_1.txt'.format(project_file)

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(dataset_path)
    dataset_val_filename = "{}validation.csv".format(dataset_path)

    dataset_train = SentenceDataset(dataset_train_filename, max_len, min_count)
    dataset_val = SentenceDataset(dataset_val_filename, max_len, min_count,
                                  dataset_train.vocab)

    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    output += string + '\n'

    # getting pretrained embeddings
    if pretrained_embeddings is not None:
        embeddings_dir = '{}{}'.format(project_file, pretrained_embeddings)
        pretrained_embeddings = cuda(
            get_pretrained_embeddings(embeddings_dir))
        embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset_val.vocab)
    padding_idx = dataset_val.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset_val.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(hidden_size, padding_idx, init_idx,
        max_len, vocab_size, embedding_dim, pretrained_embeddings)

    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(
        parameters, amsgrad=True, weight_decay=weight_decay, lr=learning_rate)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=dataset_val.vocab[
        SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss, val_loss, found_model = load_checkpoint(model_filename, model, optimizer)

    if found_model:
        string = 'Loaded Model:\nlowest_validation_loss: {}\ndescription: {}' \
                 '\nlast_epoch:{}\n'.format(lowest_loss, description,
                                            last_epoch)
    else:
        string = 'No model found at {}\n'.format(model_filename)

    print(string)
    output = output + string + '\n'

    outfile = open(output_file, 'w')
    outfile.write(output)
    outfile.close()

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]

    intervals = 6

    for epoch in range(last_epoch, last_epoch+nb_epochs):
        start = time.clock()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []
            j = 1

            for i, inputs in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model.auto_encoder(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()
                    if (len(data_loader) / intervals) * j <= i + 1:
                        train_loss.append(average_epoch_loss)
                        string = (
                            'Epoch {:03d} Example {:03d} | {} loss: {:.3f}'.format(
                             epoch, i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1

                else:
                    predicted = torch.argmax(
                        outputs.view(-1, max_len, vocab_size), -1)
                    batch_sentence_accuracy, batch_token_accuracy = encoder_accuracy(
                        targets.view(-1, max_len), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)

            if phase == 'val':
                average_epoch_sentence_accuracy = np.mean(epoch_sentence_accuracy)
                average_epoch_token_accuracy = np.mean(epoch_token_accuracy)

                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                print(string, end='')
                output = output + '\n' + string + '\n'

                string = '| sentence accuracy: {:.3f} | token accuracy: {:.3f}'.format(
                    average_epoch_sentence_accuracy, average_epoch_token_accuracy)
                print(string, end='\n')
                output = output + string + '\n'

                if average_epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, average_epoch_loss, optimizer, new_model_filename,
                        description_filename, epoch, train_loss, val_loss)
                    lowest_loss = average_epoch_loss

                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                # unsqueeze to add the batch dimension
                outputs_var = model.auto_encoder(inputs_var.unsqueeze(0))
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(get_sentence_from_indices(
                    inputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))

                string = string + u'= {}\n'.format(get_sentence_from_indices(
                    targets, dataset_val.vocab, SentenceDataset.EOS_TOKEN))

                string = string + u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset_val.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                output = output + string + '\n' + '\n'
        outfile = open(output_file, 'w')
        outfile.write(output)
        outfile.close()
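# --- Note: encoder_accuracy is imported without being defined in these
# snippets. A simple sketch under the assumption that it compares targets with
# greedy predictions token by token (the real helper may also mask padding):
def encoder_accuracy(targets, predicted):
    # targets, predicted: LongTensors of shape (batch, max_len).
    token_correct = (targets == predicted).float()
    # A sentence only counts as correct when every token in it matches.
    sentence_accuracy = token_correct.prod(dim=1).mean().item()
    token_accuracy = token_correct.mean().item()
    return sentence_accuracy, token_accuracy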
Example #5
def main():
    nb_epochs = 100
    batch_size = 500
    hidden_size = 256
    embedding_dim = 300
    pretrained_embeddings = None
    max_len = 20
    min_count = 2
    max_grad_norm = 5
    val_len = 10000
    weight_decay = 0.00001
    model_filename = '/home/mattd/pycharm/encoder/models3' \
                     '/Baseline'
    description_filename = \
        '/home/mattd/pycharm/encoder/description/description2.txt'
    output_file = '/home/mattd/pycharm/encoder/model_outputs_3/baseline'

    outfile = open(output_file, 'w')

    eng_fr_filename = '/home/okovaleva/projects/forced_apart/autoencoder/data' \
                      '/train_1M.txt'
    dataset = SentenceDataset(eng_fr_filename, max_len, min_count)
    string = 'Dataset: {}'.format(len(dataset))
    print(string)
    outfile.write(string+'\n')

    train_len = len(dataset) - val_len
    dataset_train, dataset_val = torch.utils.data.dataset.random_split(dataset, [train_len, val_len])
    string = 'Train {}, val: {}'.format(len(dataset_train), len(dataset_val))
    print(string)
    outfile.write(string+'\n')

    embeddings_dir = '/home/mattd/pycharm/encoder' \
                     '/embeddings_3min.npy'
    pretrained_embeddings = cuda(
        get_pretrained_embeddings(embeddings_dir, dataset))
    embedding_dim = pretrained_embeddings.shape[1]

    data_loader_train = torch.utils.data.DataLoader(dataset_train, batch_size, shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val, batch_size, shuffle=False)

    vocab_size = len(dataset.vocab)
    padding_idx = dataset.vocab[SentenceDataset.PAD_TOKEN]
    init_idx = dataset.vocab[SentenceDataset.INIT_TOKEN]

    model = Seq2SeqModel(
        pretrained_embeddings, hidden_size, padding_idx, init_idx,
                         max_len, vocab_size, embedding_dim)
    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters, amsgrad=True, weight_decay=weight_decay)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=dataset.vocab[SentenceDataset.PAD_TOKEN])

    model, optimizer, lowest_loss, description, last_epoch, \
    train_loss, val_loss = load_checkpoint(model_filename, model, optimizer)

    print(description)

    phases = ['train', 'val', ]
    data_loaders = [data_loader_train, data_loader_val, ]


    for epoch in range(last_epoch, last_epoch+nb_epochs):
        start = time.clock()

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = []
            epoch_sentence_accuracy = []
            epoch_token_accuracy = []

            for i, inputs in enumerate(data_loader):
                optimizer.zero_grad()

                inputs = variable(inputs)
                targets = variable(inputs)

                outputs = model(inputs, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)

                loss = criterion(outputs, targets)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters, max_grad_norm)
                    optimizer.step()

                if phase == 'val':
                    predicted = torch.argmax(outputs.view(batch_size, max_len,
                                                          -1), -1)
                    batch_sentence_accuracy, batch_token_accuracy = accuracy(
                        targets.view(batch_size, -1), predicted)
                    epoch_sentence_accuracy.append(batch_sentence_accuracy)
                    epoch_token_accuracy.append(batch_token_accuracy)
                epoch_loss.append(float(loss))

            epoch_loss = np.mean(epoch_loss)

            if phase == 'train':
                train_loss.append(epoch_loss)
                string = ('Epoch {:03d} | {} loss: {:.3f}'.format(
                    epoch, phase, epoch_loss))

                print(string, end='\n')
                outfile.write(string+'\n')
            else:
                average_epoch_sentence_accuracy = np.mean(epoch_sentence_accuracy)
                average_epoch_token_accuracy = np.mean(epoch_token_accuracy)
                time_taken = time.clock() - start

                val_loss.append(epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, epoch_loss, time_taken)
                print(string, end='')

                string = '| sentence accuracy: {:.3f} | token accuracy: {:.3f}'.format(
                    average_epoch_sentence_accuracy, average_epoch_token_accuracy)
                print(string, end='\n')
                outfile.write(string+'\n')
                if epoch_loss < lowest_loss:
                    save_checkpoint(
                        model, epoch_loss, optimizer, model_filename,
                        description_filename, epoch, train_loss, val_loss)
                    lowest_loss = epoch_loss



            # print random sentence
            if phase == 'val':
                random_idx = np.random.randint(len(dataset_val))
                inputs = dataset_val[random_idx]
                targets = inputs
                inputs_var = variable(inputs)

                outputs_var = model(inputs_var.unsqueeze(0)) # unsqueeze to get the batch dimension
                outputs = argmax(outputs_var).squeeze(0).data.cpu().numpy()

                string = '> {}'.format(get_sentence_from_indices(
                    inputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')

                string = u'= {}'.format(get_sentence_from_indices(
                    targets, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')

                string = u'< {}'.format(get_sentence_from_indices(
                    outputs, dataset.vocab, SentenceDataset.EOS_TOKEN))
                print(string, end='\n')
                outfile.write(string+'\n')
                print()
    outfile.close()
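# --- Note: the checkpoint helpers are likewise not shown. A sketch matching
# the call signatures used in this snippet, assuming everything is bundled
# into a single torch.save dictionary; the key names and the defaults returned
# when no checkpoint exists yet are guesses, not the project's actual format.
import os
import torch

def save_checkpoint(model, loss, optimizer, filename,
                    description_filename, epoch, train_loss, val_loss):
    # Read the experiment description and store it next to the weights.
    with open(description_filename) as f:
        description = f.read()
    torch.save({
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'lowest_loss': loss,
        'description': description,
        'epoch': epoch,
        'train_loss': train_loss,
        'val_loss': val_loss,
    }, filename)

def load_checkpoint(filename, model, optimizer):
    # Fall back to a fresh training state when no checkpoint exists yet.
    if not os.path.isfile(filename):
        return model, optimizer, float('inf'), 'new model', 0, [], []
    checkpoint = torch.load(filename, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return (model, optimizer, checkpoint['lowest_loss'],
            checkpoint['description'], checkpoint['epoch'],
            checkpoint['train_loss'], checkpoint['val_loss'])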
Example #6
def main():
    file = {
        "model_group": "/seq_len_exp",
        "model_name": "/generation_6",
        "old_model_name": None,
        "model_version": 0,
        "project_file": "/home/mattd/PycharmProjects/reddit/generation"
    }

    file["dataset_path"] = "{}/data/".format(file["project_file"])

    file["model_filename"] = '{}{}s{}_{}'.format(file["project_file"],
                                                 file["model_group"],
                                                 file["model_name"],
                                                 file["model_version"])

    file["output_file"] = '{}{}_outputs{}_{}'.format(file["project_file"],
                                                     file["model_group"],
                                                     file["model_name"],
                                                     file["model_version"])

    # check_files(file)

    use_old_model = file["old_model_name"] is not None
    params = {}

    if use_old_model:
        file["old_model_filename"] = '{}{}s{}'.format(file["project_file"],
                                                      file["model_group"],
                                                      file["old_model_name"])
        params, old_files = load_params(file["old_model_filename"])
        use_old_model = old_files != {}

    if not use_old_model:
        params = {
            "batch_size": 1000,
            "hidden_size": 256,
            "embedding_dim": 300,
            "pretrained_embeddings": True,
            "max_grad_norm": 5,
            "max_len": 30,
            "min_count": 2,
            "weight_decay": 0.00001,
            "learning_rate": 0.005,
        }

    params["num_training_examples"] = 78260
    params["num_val_examples"] = -1
    params["nb_epochs"] = 40

    if params["pretrained_embeddings"]:
        file["pretrained_embeddings_file"] = \
            "/embeddings/embeddings_min{}_max{}.npy".format(
            params["min_count"], params["max_len"])

    string = ""
    for k, v in file.items():
        string += "{}: {}\n".format(k, v)
    for k, v in params.items():
        string += "{}: {}\n".format(k, v)

    print(string)
    output = string + '\n'

    # eng_fr_filename = '/mnt/data1/datasets/yelp/merged/train'
    dataset_train_filename = "{}train.csv".format(file["dataset_path"])
    dataset_val_filename = "{}validation.csv".format(file["dataset_path"])

    dataset_train = PairsDataset(dataset_train_filename, params["max_len"],
                                 params["min_count"])
    dataset_val = PairsDataset(dataset_val_filename, params["max_len"],
                               params["min_count"], dataset_train.vocab)

    string = 'Vocab size {}\n'.format(len(dataset_train.vocab))
    string += 'Train {} '.format(len(dataset_train))

    if params["num_training_examples"] != -1:
        dataset_train.prune_examples(params["num_training_examples"])
        string += '> {}'.format(len(dataset_train))

    string += '\nVal: {}'.format(len(dataset_val))

    if params["num_val_examples"] != -1:
        dataset_val.prune_examples(params["num_val_examples"])
        string += '-> {}'.format(len(dataset_val))

    print(string)
    output += string + '\n'

    if params["pretrained_embeddings"]:
        embeddings_dir = '{}{}'.format(file["project_file"],
                                       file["pretrained_embeddings_file"])
        pretrained_embeddings = cuda(get_pretrained_embeddings(embeddings_dir))
        params["embedding_dim"] = pretrained_embeddings.shape[1]
    else:
        pretrained_embeddings = None

    data_loader_train = torch.utils.data.DataLoader(dataset_train,
                                                    params["batch_size"],
                                                    shuffle=True)
    data_loader_val = torch.utils.data.DataLoader(dataset_val,
                                                  params["batch_size"],
                                                  shuffle=False)

    vocab_size = len(dataset_train.vocab)
    padding_idx = dataset_train.vocab[PairsDataset.PAD_TOKEN]
    init_idx = dataset_train.vocab[PairsDataset.INIT_TOKEN]

    model = Seq2SeqModel(params["hidden_size"], padding_idx, init_idx,
                         params["max_len"], vocab_size,
                         params["embedding_dim"], pretrained_embeddings)

    model = cuda(model)

    parameters = list(model.parameters())
    optimizer = torch.optim.Adam(parameters,
                                 amsgrad=True,
                                 weight_decay=params["weight_decay"],
                                 lr=params["learning_rate"])
    criterion = torch.nn.CrossEntropyLoss()

    if use_old_model:
        model, optimizer = load_checkpoint(file["old_model_filename"], model,
                                           optimizer)

    lowest_loss = 100
    train_loss = []
    val_loss = []
    best_model = model
    best_optimizer = optimizer
    best_epoch = 0
    average_epoch_loss = 0

    metrics = {"accuracy": [], "precision": [], "recall": [], "f1": []}

    outfile = open(file["output_file"], 'w')
    outfile.write(output)
    outfile.close()

    phases = [
        'train',
        'val',
    ]
    data_loaders = [
        data_loader_train,
        data_loader_val,
    ]

    intervals = 2
    highest_acc = 0

    for epoch in range(0, params["nb_epochs"]):
        start = time.clock()
        string = 'Epoch: {}\n'.format(epoch)
        print(string, end='')
        output = output + '\n' + string

        #if epoch == 6:
        #    model.unfreeze_embeddings()
        #    parameters = list(model.parameters())
        #    optimizer = torch.optim.Adam(
        #        parameters, amsgrad=True, weight_decay=weight_decay)

        for phase, data_loader in zip(phases, data_loaders):
            if phase == 'train':
                model.train()
                string = 'Train: \n'
            else:
                model.eval()
                string = 'Validation \n'

            print(string, end='')
            output = output + '\n' + string

            epoch_loss = []
            epoch_accuracy = []
            epoch_precision = []
            epoch_recall = []
            epoch_f1 = []
            j = 1

            for i, (sentence_1, sentence_2,
                    labels) in tqdm(enumerate(data_loader)):
                optimizer.zero_grad()

                sentence_1 = variable(sentence_1)
                sentence_2 = variable(sentence_2)
                targets = variable(labels)

                outputs = model(sentence_1, sentence_2, targets)

                targets = targets.view(-1)
                outputs = outputs.view(targets.size(0), -1)
                loss = criterion(outputs, targets)

                epoch_loss.append(float(loss))
                average_epoch_loss = np.mean(epoch_loss)

                if phase == 'train':
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(parameters,
                                                   params["max_grad_norm"])
                    optimizer.step()
                    if (len(data_loader) / intervals) * j <= i + 1:
                        string = ('Example {:03d} | {} loss: {:.3f}'.format(
                            i, phase, average_epoch_loss))
                        print(string, end='\n')
                        output = output + string + '\n'
                        j += 1
                else:
                    # get result metrics
                    accuracy, precision, recall, f1 = classifier_accuracy(
                        targets.cpu().numpy(),
                        torch.argmax(outputs.view(-1, 2), -1).cpu().numpy())
                    #print('{},{},{},{}'.format(accuracy, precision, recall,
                    # f1))
                    epoch_accuracy.append(accuracy)
                    epoch_precision.append(precision)
                    epoch_recall.append(recall)
                    epoch_f1.append(f1)

            # print random sentence
            if phase == 'val':
                time_taken = time.clock() - start

                val_loss.append(average_epoch_loss)
                string = ' {} loss: {:.3f} | time: {:.3f}'.format(
                    phase, average_epoch_loss, time_taken)
                string += ' | lowest loss: {:.3f} highest accuracy:' \
                    ' {:.3f}'.format(lowest_loss, highest_acc)
                print(string, end='\n')
                output = output + '\n' + string + '\n'

                average_epoch_accuracy = np.mean(epoch_accuracy)
                average_epoch_precision = np.mean(epoch_precision)
                average_epoch_recall = np.mean(epoch_recall)
                average_epoch_f1 = np.mean(epoch_f1)
                metrics["accuracy"].append(average_epoch_accuracy),
                metrics["precision"].append(average_epoch_precision)
                metrics["recall"].append(average_epoch_recall)
                metrics["f1"].append(average_epoch_f1)

                if average_epoch_loss < lowest_loss:
                    best_model = model
                    best_optimizer = optimizer
                    best_epoch = epoch
                    lowest_loss = average_epoch_loss

                save_checkpoint(best_epoch, best_model, best_optimizer, epoch,
                                model, optimizer, train_loss, val_loss,
                                metrics, params, file)

                if average_epoch_accuracy > highest_acc:
                    highest_acc = average_epoch_accuracy

                string = "Accuracy: {:.3f}\nPrecision: {:.3f}\nRecall: {:.3f}\n" \
                         "F1: {:.3f}\n".format(
                    average_epoch_accuracy, average_epoch_precision,
                    average_epoch_recall, average_epoch_f1)
                print(string, end='\n')
                output = output + string + '\n'

                random_idx = np.random.randint(len(dataset_val))
                sentence_1, sentence_2, labels = dataset_val[random_idx]
                targets = labels
                sentence_1_var = variable(sentence_1)
                sentence_2_var = variable(sentence_2)

                # unsqueeze to add the batch dimension
                outputs_var = model(sentence_1_var.unsqueeze(0),
                                    sentence_2_var.unsqueeze(0))
                outputs = outputs_var.squeeze(0).data.cpu().numpy()

                string = '> {}\n'.format(
                    get_sentence_from_indices(sentence_1, dataset_val.vocab,
                                              PairsDataset.EOS_TOKEN))

                string = string + u'> {}\n'.format(
                    get_sentence_from_indices(sentence_2, dataset_val.vocab,
                                              PairsDataset.EOS_TOKEN))

                string = string + u'target:{}|  P false:{:.3f}, P true:' \
                    u' {:.3f}'.format(targets, float(outputs[0]), float(outputs[1]))
                print(string, end='\n\n')
                output = output + string + '\n' + '\n'
            else:
                train_loss.append(average_epoch_loss)
        outfile = open(file["output_file"], 'w')
        outfile.write(output)
        outfile.close()
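# --- Note: classifier_accuracy is also imported rather than defined. One way
# to implement it with scikit-learn, assuming binary 0/1 labels per batch
# (zero_division=0 just avoids warnings on batches without positive
# predictions):
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def classifier_accuracy(targets, predictions):
    # targets, predictions: 1-D arrays of 0/1 labels for a single batch.
    return (accuracy_score(targets, predictions),
            precision_score(targets, predictions, zero_division=0),
            recall_score(targets, predictions, zero_division=0),
            f1_score(targets, predictions, zero_division=0))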