Example No. 1
def data_iter(train_file, validate_file, config, batch_size, num_embed, vocab_path, max_length=100):
    logger.info('Loading data...')

    x_train, y_train, vocab, vocab_inv, n_class = data_helpers.load_data(train_file, config, max_length, None)
    embed_size = num_embed
    sentence_size = x_train.shape[1]
    vocab_size = len(vocab)
    util.save_to_pickle(vocab_path, vocab)
    x_dev, y_dev, _, _, _ = data_helpers.load_data(validate_file, config, max_length, vocab)

    # randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_train = x_train[shuffle_indices]
    y_train = y_train[shuffle_indices]

    logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
    logger.info('train shape: %(shape)s', {'shape': x_train.shape})
    logger.info('valid shape: %(shape)s', {'shape': x_dev.shape})
    logger.info('sentence max words: %(shape)s', {'shape': sentence_size})
    logger.info('embedding size: %(msg)s', {'msg': embed_size})
    logger.info('vocab size: %(msg)s', {'msg': vocab_size})

    train = mx.io.NDArrayIter(
        x_train, y_train, batch_size, shuffle=True)
    valid = mx.io.NDArrayIter(
        x_dev, y_dev, batch_size)
    return train, valid, sentence_size, embed_size, vocab_size, n_class
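A minimal usage sketch of the iterators this function returns, using dummy arrays as a stand-in for the real token matrices (shapes and class count below are illustrative, not taken from the original code):

import mxnet as mx
import numpy as np

# stand-ins for x_train / y_train as produced by data_helpers.load_data
x = np.random.randint(0, 100, size=(256, 50))   # (num_examples, sentence_size) token ids
y = np.random.randint(0, 3, size=(256,))        # labels for n_class = 3
train_iter = mx.io.NDArrayIter(x, y, batch_size=64, shuffle=True)

for epoch in range(2):
    train_iter.reset()                # rewind the iterator at the start of each epoch
    for batch in train_iter:          # each batch is an mx.io.DataBatch
        tokens = batch.data[0]        # NDArray of shape (64, 50)
        labels = batch.label[0]       # NDArray of shape (64,)
        # forward/backward with an mx.mod.Module or Gluon block goes here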
Example No. 2
def preprocess_purpose_data(max_len=60, portion=0.85, augmentation=False, deduplicate=True):
    """
    Only augment training data, not test data.
    """
    sentences, _, purposes, augment_sentences = _preprocess_dataset_small(max_len, augmentation, deduplicate=deduplicate)
    # shuffle all data
    if augmentation:
        data = list(zip(sentences, purposes, augment_sentences))
        word_to_idx = compute_word_to_idx(sentences+augment_sentences)
    else:
        data = list(zip(sentences, purposes))
        word_to_idx = compute_word_to_idx(sentences)

    random.shuffle(data)
    random.shuffle(data)
    end = int(len(data) * portion)
    train_data, test_data = data[:end], data[end:]

    if augmentation:
        # only augment train_data: append the augmented sentences and duplicate
        # the labels so they stay aligned with the appended sentences
        train_sentences, train_purposes, train_augment_sentences = zip(*train_data)
        train_sentences += train_augment_sentences
        train_purposes += train_purposes
        train_data = list(zip(train_sentences, train_purposes))
        # rebuild test_data without augmentation
        test_sentences, test_purposes, _ = zip(*test_data)
        test_data = list(zip(test_sentences, test_purposes))

    # save as pickle for later use
    save_to_pickle('processed_data/purpose.train.pkl', [train_data, word_to_idx])
    save_to_pickle('processed_data/purpose.test.pkl', [test_data, word_to_idx])
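A hedged sketch of reading the saved pickles back for training; the plain pickle module is used here on the assumption that save_to_pickle simply pickle-dumps its second argument to the given path:

import pickle

with open('processed_data/purpose.train.pkl', 'rb') as f:
    train_data, word_to_idx = pickle.load(f)
with open('processed_data/purpose.test.pkl', 'rb') as f:
    test_data, _ = pickle.load(f)

# each entry is a (tokenized_sentence, purpose_label) pair
sentences, purposes = zip(*train_data)
print(len(sentences), 'training pairs,', len(word_to_idx), 'vocabulary entries')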
Example No. 3
def preprocess_polarity_data(max_len=60, portion=0.85, deduplicate=True):
    small_sentences, small_polarities, _, _ = _preprocess_dataset_small(max_len, deduplicate=deduplicate)
    large_sentences, large_polarities, polarity_to_idx = _preprocess_dataset_large(max_len, deduplicate=deduplicate)
    combined_sentences = small_sentences + large_sentences
    combined_polarities = small_polarities + large_polarities
    data = []
    if deduplicate:
        seen = {}
        for sent, polarity in zip(combined_sentences, combined_polarities):
            key = ''.join(sent)
            if key not in seen:
                seen[key] = True
                data.append((sent, polarity))
        print('unique sentences:', len(seen), 'duplicate:', len(combined_sentences)-len(seen))
    else:
        data = list(zip(combined_sentences, combined_polarities))
    
    # shuffle all data
    random.shuffle(data)
    word_to_idx = compute_word_to_idx(combined_sentences)
    end = int(len(data) * portion)
    train_data, test_data = data[:end], data[end:]

    # save as pickle for later use
    save_to_pickle('processed_data/polarity.train.pkl', [train_data, word_to_idx, polarity_to_idx])
    save_to_pickle('processed_data/polarity.test.pkl', [test_data, word_to_idx, polarity_to_idx])
Example No. 4
    def load_glove_model(self, path_to_glove, word_to_idx,
                         saved_embedding='processed_data/glove_embedding.pkl',
                         regenerate=True):
        """
        Overwrite nn.Embedding.weight by pre-trained GloVe vectors.

        First load pre-trained GloVe model, i.e., a word-vector lookup table
        Then filter the words appeared in our dataset based on word_to_idx
        Then overwrite initial nn.Embedding.weight
        Credit: https://github.com/pytorch/text/issues/30
        """
        if regenerate:
            count = 0
            with open(path_to_glove, 'r') as f:
                for line in f.readlines():
                    # print(line)
                    row = line.split()
                    word, vector = row[0], row[1:]
                    vector = torch.FloatTensor(list(map(float, vector)))
                    # only update words that appear in both word_to_idx and GloVe;
                    # words missing from the GloVe model keep their initial weights
                    if word in word_to_idx:
                        count += 1
                        # overwrite initial embedding.weight
                        self.embeddings.weight.data[word_to_idx[word]] = vector
                print('num of words in both word_to_idx and glove', count)
                save_to_pickle(saved_embedding, self.embeddings.weight.data)
        else:
            self.embeddings.weight.data.copy_(load_pickle(saved_embedding))
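A minimal sketch of the kind of module this method is assumed to live on: a classifier whose self.embeddings is a standard nn.Embedding sized from word_to_idx (the class name, dimensions, and GloVe path in the commented usage are hypothetical):

import torch.nn as nn

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=300):
        super().__init__()
        # weights start random; load_glove_model overwrites the rows whose
        # words appear in the GloVe file
        self.embeddings = nn.Embedding(vocab_size, embed_dim)

# hypothetical usage, where each GloVe line is "<word> <embed_dim floats>":
# model = TextClassifier(len(word_to_idx))
# model.load_glove_model('glove.6B.300d.txt', word_to_idx, regenerate=True)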
Example No. 5
    def __init__(self, train_file, validate_file, config, vocab_path, max_length):

        logger.info('Loading data...')

        x_train, x_train_len, y_train, vocab, vocab_inv, self.n_class = \
            self.load_data(train_file, config, max_length, None)
        self.sentence_size = x_train.shape[1]
        self.vocab_size = len(vocab)
        util.save_to_pickle(vocab_path, vocab)

        x_dev, x_dev_len, y_dev, _, _, _ = self.load_data(validate_file, config, max_length, vocab)

        # randomly shuffle data
        np.random.seed(10)
        shuffle_indices = np.random.permutation(np.arange(len(y_train)))
        x_train = x_train[shuffle_indices]
        x_train_len = x_train_len[shuffle_indices]
        y_train = y_train[shuffle_indices]

        # (disabled) pad out the last batch by replicating randomly chosen training examples
        # rest = batch_size - len(x_train) % batch_size
        # random_indices = np.random.randint(x_train.shape[0], size=rest)
        #
        # x_train = np.concatenate((x_train, x_train[random_indices, :]), axis=0)
        # x_train_len = np.concatenate((x_train_len, x_train_len[random_indices]), axis=0)
        # y_train = np.concatenate((y_train, y_train[random_indices]), axis=0)

        self.x_train = mx.nd.array(x_train)
        self.x_train_len = mx.nd.array(x_train_len)
        self.y_train = mx.nd.array(y_train)

        self.x_dev = mx.nd.array(x_dev)
        self.x_dev_len = mx.nd.array(x_dev_len)
        self.y_dev = mx.nd.array(y_dev)

        logger.info('Train/Valid split: %d/%d' % (len(y_train), len(y_dev)))
        logger.info('train shape: %(shape)s', {'shape': x_train.shape})
        logger.info('valid shape: %(shape)s', {'shape': x_dev.shape})
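Because this loader keeps the data as plain NDArrays rather than wrapping them in an NDArrayIter, mini-batches are presumably cut by slicing; a minimal sketch, assuming loader is an instance of the class above (the function name and batch size are illustrative):

def iterate_minibatches(loader, batch_size=64):
    """Yield (tokens, lengths, labels) slices from the stored training NDArrays."""
    n = loader.x_train.shape[0]
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        yield (loader.x_train[start:end],
               loader.x_train_len[start:end],
               loader.y_train[start:end])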
Example No. 6
def train_epochs(resume=False, use_glove=True):
    """Train multiple opochs"""

    print('total epochs: ', cfg.EPOCHS, '; use_glove: ', use_glove)

    training_data, word_to_idx, label_to_idx = data_loader()
    model, best_acc, start_epoch = get_model(word_to_idx, label_to_idx, resume,
                                             use_glove)

    losses = []
    loss_function = nn.NLLLoss()
    if cfg.RUN_MODE == 'CNN':
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0)
        # optimizer = optim.SGD(model.parameters(), lr=0.1)
        # optimizer = optim.Adagrad(model.parameters(), lr=0.01, weight_decay=0.01)
    else:
        # optimizer = optim.Adam(model.parameters(), lr=0.001)
        optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=0.1)
    # optimizers below are not working
    # optimizer = optim.Adagrad(model.parameters(), lr=0.001)

    since = time.time()
    training_error_rates = []
    test_error_rates = []
    for epoch in range(1 + start_epoch, start_epoch + cfg.EPOCHS + 1):
        train_error, train_loss = train(model, loss_function, optimizer,
                                        training_data, word_to_idx)
        losses.append(train_loss)
        training_error_rates.append(train_error)
        test_error_rate = get_error_rate(model, training=False)
        test_error_rates.append(test_error_rate)
        acc = 1 - test_error_rate
        print('epoch: {}, time: {:.2f}s, cost so far: {}, accuracy: {:.3f}'.
              format(epoch, (time.time() - since), train_loss.numpy(), acc))
        if acc > best_acc:
            save_checkpoint(model, acc, epoch)
            best_acc = acc

    # save all_losses
    save_to_pickle('checkpoint/all_losses.p', losses)
    save_to_pickle('checkpoint/training_error_rates.p', training_error_rates)
    save_to_pickle('checkpoint/test_error_rates.p', test_error_rates)
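save_checkpoint and get_model are not shown in this example; a minimal sketch of a compatible save_checkpoint, assuming a torch.save-based checkpoint layout (the path and dictionary keys are assumptions):

import os
import torch

def save_checkpoint(model, acc, epoch, path='checkpoint/ckpt.t7'):
    """Persist the model weights together with the accuracy and epoch reached."""
    # hypothetical layout; the original implementation is not shown here
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save({'state_dict': model.state_dict(), 'acc': acc, 'epoch': epoch}, path)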
Example No. 7
def main():
    #batch_size = 128  # For mini-batch gradient descent
    #epochs = 10
    args = read_args()
    x_train, x_test, y_train, y_test = load_dataset()

    #toresults
    #toresults=get_time_str()

    ## TODO 3: Build the Keras model
    args.input_num = x_train.shape[1]
    #
    model = create_model(args=args)

    # data augmentation (disabled)
    # datagen = ImageDataGenerator(
    #     rotation_range=10,
    #     width_shift_range=0.1,
    #     height_shift_range=0.1,
    #     horizontal_flip=True,
    # )
    # datagen.fit(x_train)

    # fit with data augmentation (disabled)
    # hist = model.fit_generator(datagen.flow(x_train, y_train, batch_size=32),
    #                            steps_per_epoch=x_train.shape[0] // args.batch_size,
    #                            epochs=args.epochs,
    #                            verbose=1,
    #                            validation_data=(x_test, y_test),
    #                            workers=4)

    # TODO 4: Fit the model
    hist = model.fit(x_train,
                     y_train,
                     batch_size=args.batch_size,
                     epochs=args.epochs,
                     verbose=1,
                     validation_data=(x_test, y_test))
    score = model.evaluate(x_test, y_test, verbose=0)
    print(score)
    y_test_pred_mat = model.predict(x_test)
    y_test_norm = np.argmax(y_test, axis=1)
    predictions = np.argmax(y_test_pred_mat, axis=1)

    # TODO 5: Evaluate the model, calculating the metrics.
    # Option 1: Use the model.evaluate() method. For this, the model must be
    # already compiled with the metrics.
    # performance = model.evaluate(X_test, y_test)

    # Option 2: Use the model.predict() method and calculate the metrics using
    # sklearn. We recommend this, because you can store the predictions if
    # you need more analysis later. Also, if you calculate the metrics on a
    # notebook, then you can compare multiple classifiers.
    # predictions = ...
    # performance = ...

    # TODO 6: Save the results.
    # build a pandas DataFrame with the true and predicted labels
    results = pandas.DataFrame(y_test_norm, columns=['true_label'])

    results.loc[:, 'predicted'] = predictions
    results.to_csv('predictions_{}.csv'.format(args.experiment_name),
                   index=False)
    # save run parameters to pickle
    params_dict = get_keras_model_history_params(model,
                                                 [('args', args.__dict__)])
    save_to_pickle(params_dict, 'params_{}.pick'.format(args.experiment_name))

    # save the accuracy and loss plots
    save_fig(hist)
    print(model.summary())
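Following the Option 2 suggestion in the TODO 5 comments above, a hedged sketch of computing metrics from the stored predictions with sklearn (the helper name and metric choice are illustrative):

from sklearn import metrics

def report_metrics(y_true, y_pred):
    """Print accuracy and a per-class report for argmax-decoded label arrays."""
    print('accuracy: {:.3f}'.format(metrics.accuracy_score(y_true, y_pred)))
    print(metrics.classification_report(y_true, y_pred))

# e.g., called from main() as: report_metrics(y_test_norm, predictions)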