Example #1
def main():
    args = getArguments()
    print('[DEBUG]', args)

    x, y = make_circles(n_samples=args.n_samples,
                        noise=args.noise,
                        factor=0.3,
                        random_state=42)

    x1Squared = x[:, 0]**2
    x2Squared = x[:, 1]**2

    x = np.concatenate((x, x1Squared.reshape(-1, 1)), axis=1)
    x = np.concatenate((x, x2Squared.reshape(-1, 1)), axis=1)
    y = y.reshape(-1, 1)

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    lr = LogisticRegression(x=x,
                            y=y,
                            alpha=3e-2,
                            max_epochs=1000,
                            epsilon=1e-3,
                            batch_size=100)
    lr.runGradientDescent()

    print(f'[DEBUG] Optimized Cost: {lr.history["cost"][-1]}')
    print(f'[DEBUG] Optimized Theta: {lr.history["theta"][-1]}')

    plotAndSaveGraphs(lr, args, scaler)
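
A note on the feature engineering in Example #1: make_circles yields two concentric rings, which no linear boundary in (x1, x2) can separate; appending x1**2 and x2**2 makes the classes (approximately) linearly separable, since x1**2 + x2**2 is nearly constant within each ring. A minimal sketch, using scikit-learn's own LogisticRegression as a stand-in for the custom class above:

import numpy as np
from sklearn.datasets import make_circles
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

x, y = make_circles(n_samples=500, noise=0.05, factor=0.3, random_state=42)
x_aug = np.c_[x, x[:, 0]**2, x[:, 1]**2]  # same augmentation as above

print(SkLogisticRegression().fit(x, y).score(x, y))          # near chance
print(SkLogisticRegression().fit(x_aug, y).score(x_aug, y))  # near 1.0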
Example #2

def main():
    args = getArguments()
    print('[DEBUG]', args)

    x, y = make_blobs(n_samples=args.n_samples,
                      centers=2,
                      n_features=2,
                      cluster_std=args.noise,
                      random_state=42)

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    lr = LogisticRegression(x=x,
                            y=y.reshape(-1, 1),
                            alpha=args.lr,
                            max_epochs=args.max_epochs,
                            epsilon=args.epsilon,
                            batch_size=args.batch_size)

    lr.runGradientDescent()

    print(f'[DEBUG] Optimized Theta: {lr.history["theta"][-1]}')
    print(f'[DEBUG] Optimized Cost: {lr.history["cost"][-1]}')

    plotAndSaveGraphs(lr, args, scaler)
Example #3

def main():
    dataset = datasets.load_breast_cancer()

    features = dataset.data

    features = StandardScaler().fit_transform(features)

    num_features = features.shape[1]

    labels = dataset.target

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, stratify=labels)

    train_size = train_features.shape[0]
    test_size = test_features.shape[0]

    # slice the dataset to be exact as per the batch size
    # e.g. train_size = 1898322, batch_size = 256
    # [:1898322-(1898322%256)] = [:1898240]
    # 1898322 // 256 = 7415; 7415 * 256 = 1898240
    train_features = train_features[:train_size - (train_size % BATCH_SIZE)]
    train_labels = train_labels[:train_size - (train_size % BATCH_SIZE)]

    # modify the size of the dataset to be passed on model.train()
    train_size = train_features.shape[0]

    # slice the dataset to be exact as per the batch size
    test_features = test_features[:test_size - (test_size % BATCH_SIZE)]
    test_labels = test_labels[:test_size - (test_size % BATCH_SIZE)]

    test_size = test_features.shape[0]

    model = LogisticRegression(
        alpha=LEARNING_RATE,
        batch_size=BATCH_SIZE,
        num_classes=NUM_CLASSES,
        sequence_length=num_features,
    )

    model.train(
        checkpoint_path="./checkpoint_path/logistic_regression/",
        log_path="./log_path/logistic_regression/",
        model_name="logistic_regression",
        epochs=3000,
        train_data=[train_features, train_labels],
        train_size=train_size,
        validation_data=[test_features, test_labels],
        validation_size=test_size,
        result_path="./results/logistic_regression/",
    )
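
The slicing trick above (dropping the last train_size % BATCH_SIZE samples so that every minibatch is full) can be isolated into a small helper; this sketch is illustrative and not part of the original project:

def truncate_to_batch_multiple(features, labels, batch_size):
    """Drop trailing samples so the dataset size is an exact multiple of batch_size."""
    n = len(features) - (len(features) % batch_size)
    return features[:n], labels[:n]

# e.g. 1000 samples with batch_size=256 keeps the first 768 (= 3 * 256)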
Example #4

def logistic_regression(train_data, train_labels, test_data, test_labels):

    print(f'{LogisticRegression.__name__}:')

    # Create and train model
    lr_model = LogisticRegression(train_data.shape[1], eta=0.001, epochs=50)
    model = OneVersusRest(lr_model)

    model.train(train_data, train_labels)

    # Predict 2000 validation set samples and calculate accuracy
    test_data_2k = test_data[:len(test_labels)]
    test_pred = model.predict(test_data_2k)

    # Print metrics
    print('\nTest Accuracy: {:.02f}%\n'.format(
        100 * accuracy(test_pred, test_labels)))
    mat, classes = confusion_matrix(test_pred, test_labels)
    print('Precision:\n{}\n'.format(
        np.round(precision(test_pred, test_labels), 2)))
    print('Recall:\n{}\n'.format(np.round(recall(test_pred, test_labels), 2)))
    print('F1:\n{}\n'.format(np.round(f1_score(test_pred, test_labels), 2)))
    print('Confusion Matrix:')
    print(mat)

    # Predict 10000 test set samples and save predictions
    print('Predicting 10k samples...')
    test_pred = model.predict(test_data)
    save_predictions(logistic_regression.__name__, test_pred)
    print('Saved 10k predictions.\n')
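
OneVersusRest above reduces a multi-class problem to one binary LogisticRegression per class. Roughly, and under an assumed interface (the project's actual wrapper may differ), it works like this:

import copy
import numpy as np

class OneVersusRestSketch:
    def __init__(self, base_model):
        self.base_model = base_model
        self.models = []
        self.classes = None

    def train(self, data, labels):
        self.classes = np.unique(labels)
        for c in self.classes:
            m = copy.deepcopy(self.base_model)
            m.train(data, (labels == c).astype(int))  # class c vs. the rest
            self.models.append(m)

    def predict(self, data):
        # pick the class whose binary model is most confident; this assumes
        # each model's predict returns a score (use its probability method
        # instead if it returns hard 0/1 labels)
        scores = np.stack([m.predict(data) for m in self.models], axis=1)
        return self.classes[np.argmax(scores, axis=1)]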
Example #5
def task_3_logistic(x, y, x_test, y_test, args):
    accuracies = []
    sizes = np.linspace(10, 200, num=20)
    N = y.shape[0]
    for size in sizes:
        acc = 0
        for i in range(50):

            rand = np.random.randint(int(N), size=int(size))
            m = LogisticRegression(x[rand], y[rand])
            m.fit(lr=args[0], eps=args[1], regularization=args[2])
            pred = m.predict(x_test)
            cm = evaluation.confusion_matrix(y_test, pred)
            acc += evaluation.accuracy(cm)

        accuracies.append(acc/50)

    return accuracies, sizes
Example #6
def generate_classification_predictions():
  X, Y = get_classification_training_data()
  test_X = get_classification_testing_data()

  class_models = [LogisticRegression(), NaiveBayes()]
  predictions = []
  for model in class_models:
    model.fit(X, Y)
    predictions.append(model.predict(test_X))
  
  return predictions
Example #7
def test_logistic_regression():
    from models.logistic_regression import LogisticRegression

    x, y = np.random.randn(2, 500, 2), np.zeros([2, 500])
    x[0] += np.array([1, -1])  # shift class 0 toward the lower right
    x[1] += np.array([-1, 1])  # shift class 1 toward the upper left
    y[1] = 1
    plot_scatter(x[0], x[1], 'Real')

    x = x.reshape(-1, 2)
    y = y.flatten()

    logistic = LogisticRegression(2, lr=1e-3)
    train_logistic_regression(logistic, x, y, batch_size=32, epochs=100)

    pred = logistic.predict(x)
    plot_scatter_with_line(x[pred == 0], x[pred == 1], logistic.weights,
                           'Pred')

    acc = np.sum(pred == y) / len(pred)
    print(f'Acc = {100 * acc:.2f}%')
Example #8

    def fit(self, model, X, label_vector, weights, smoothing=False):
        n_classes = model.get_n_classes()
        if n_classes > 2:
            sys.exit("Platt scaling not yet implemented for more than 2 classes.")

        self._base_model = model

        if smoothing:
            X, label_vector, weights = self.reweight_data(X, label_vector, weights)

        scores = np.reshape(model.score(X), (len(label_vector), 1))

        bincount = np.bincount(label_vector, minlength=n_classes)
        most_common = np.argmax(bincount)

        # check to see if there is only one label in the training data:
        if bincount[most_common] == len(label_vector):
            print("Only label %d found in dev data; skipping Platt" % most_common)
        else:
            self._platt_model = LogisticRegression(n_classes, alpha=self._alpha, penalty=self._penalty, objective='acc')
            self._platt_model.fit(scores, label_vector, weights)
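
Platt scaling, as implemented in the fit above, amounts to fitting p(y=1 | s) = sigmoid(A*s + B) with a one-feature logistic regression over the base model's raw scores. A compact sketch, with scikit-learn standing in for the project's own LogisticRegression:

import numpy as np
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

def platt_calibrate(scores, labels):
    """Fit p(y=1 | s) = sigmoid(A*s + B) on raw classifier scores."""
    lr = SkLogisticRegression()
    lr.fit(np.asarray(scores).reshape(-1, 1), labels)
    return lr  # lr.predict_proba(s.reshape(-1, 1))[:, 1] gives calibrated probabilities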
Example #9
    def update_plot(self, plot_state):
        X, y = plot_state_to_model_data(plot_state)

        if X.shape[0] != 0:
            classifier = LogisticRegression()
            classifier.fit(X, y, 1000)

            b, w1, w2 = classifier.weights

            origin = np.array([0, -b / w2])
            angle = angle_from_tangent(-w2, w1)

            delta = np.array([0, self.ALPHA / w2])

            x_red, y_red = calculate_line_endpoints(angle, origin - delta)
            x_decision, y_decision = calculate_line_endpoints(angle, origin)
            x_blue, y_blue = calculate_line_endpoints(angle, origin + delta)

            self.decision_boundary.data_source.data = dict(x=x_decision,
                                                           y=y_decision)
            self.red_side.data_source.data = dict(x=x_red, y=y_red)
            self.blue_side.data_source.data = dict(x=x_blue, y=y_blue)
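
A note on the geometry above: with weights (b, w1, w2), the decision boundary is the line w1*x1 + w2*x2 + b = 0, which passes through (0, -b/w2); a tangent direction of that line is (-w2, w1), which is what angle_from_tangent receives. Shifting the anchor point by delta = (0, ALPHA/w2) changes the logit by exactly ALPHA, so the red and blue lines are the level sets where the predicted probability equals sigmoid(-ALPHA) and sigmoid(+ALPHA).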
Example #10
def train_lr(x, y):
    train_size = min(400, int(len(x) * 0.8))
    X_train, X_test, y_train, y_test = train_test_split(x,
                                                        y,
                                                        train_size=train_size,
                                                        random_state=2333,
                                                        stratify=y)
    model = LogisticRegression()
    model.init_model(None)
    X = model.preprocess_data(X_train)
    model.fit(X, y_train)
    return model
Example #11
def main(_):
    """High level pipeline.
    This script performs the training, evaluation, and testing stages of the model.
    """
    #    learning_rate = FLAGS.learning_rate
    #    feature_type = FLAGS.feature_type
    #    model_type = FLAGS.model_type
    #    num_steps = FLAGS.num_steps

    feature_type = 'default'
    model_type = 'svm'
    # Load dataset.
    data = read_dataset('data/train_lab.txt', 'data/image_data')

    # Data Processing.
    data = preprocess_data(data, 'default')
    print("Finish preprocessing...")

    # Initialize model.
    ndim = data['image'].shape[1]
    if model_type == 'linear':
        model = LinearRegression(ndim, 'uniform')
    elif model_type == 'logistic':
        model = LogisticRegression(ndim, 'uniform')
    elif model_type == 'svm':
        model = SupportVectorMachine(ndim, 'uniform')

    # Train Model.
    print("Start to train the model...")
    model = train_model(data, model)

    # Eval Model.
    print("Start to evaluate the model...")
    data_val = read_dataset('data/val_lab.txt', 'data/image_data')
    data_val = preprocess_data(data_val, feature_type)
    loss, acc = eval_model(data_val, model)
    print(loss, acc)

    # Test Model.
    print("Start doing the test")
    data_test = read_dataset('data/test_lab.txt', 'data/image_data')
    print("Start preprocess testing data")
    data_test = preprocess_data(data_test, feature_type)
    print("Making predictions")
    data_test['label'] = model.predict(model.forward(data_test['image']))
    print("Output the results to csv file")
    write_dataset('data/test_lab.txt', data_test)
    # Generate Kaggle output.
    print("Finished!")
Example #12
def main():
    parser = argparse.ArgumentParser(description='Logistic Regression test')
    parser.add_argument('-n',
                        '--n_iter',
                        type=int,
                        default=50,
                        help='number of iterations for grad_descent')
    parser.add_argument('-f',
                        '--n_features',
                        type=int,
                        default=2,
                        help='number of features')
    args = parser.parse_args()
    n_iter = args.n_iter
    n_features = args.n_features

    X, y, centers = generate_classification_data(n_features=n_features)
    X_train, X_test, y_train, y_test = split_dataset(X, y)
    print("Training size: %s, Test size %s" % (len(X_train), len(X_test)))
    print("-" * 20)

    # Plotting dataset
    plot_points_and_cluster(X, centers)

    # Fit and predict
    model = LogisticRegression(n_iter=n_iter)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("-" * 20)

    # Scoring
    model.score(y_test, y_pred)
    print("-" * 20)

    # Plot decision boundary
    if n_features == 2:
        plot_logistic_regression_decision_boundary(X, y, model)

    # Plot iteration vs cost
    plot_iteration_vs_cost(n_iter, model.cost_h)
Example #13
def main(_):
    """High level pipeline.
    This script performs the training, evaluation, and testing stages of the model.
    """
    learning_rate = FLAGS.learning_rate
    feature_type = FLAGS.feature_type
    model_type = FLAGS.model_type
    num_steps = FLAGS.num_steps

    # Load dataset.
    data = read_dataset('data/val_lab.txt', 'data/image_data')


    # Data Processing.
    data = preprocess_data(data, feature_type)

    # Initialize model.
    ndim = data['image'].shape[1]

    if model_type == 'linear':
        model = LinearRegression(ndim, 'ones')
    elif model_type == 'logistic':
        model = LogisticRegression(ndim, 'zeros')
    elif model_type == 'svm':
        model = SupportVectorMachine(ndim, 'zeros')

    # Train Model.
    model = train_model(data, model, learning_rate, num_steps=num_steps)

    # Eval Model.
    data_test = read_dataset('data/test_lab.txt', 'data/image_data')
    data_test = preprocess_data(data_test, feature_type)
    acc, loss = eval_model(data_test, model)

    # Test Model.
    data_test = read_dataset('data/test_lab.txt', 'data/image_data')
    data_test = preprocess_data(data_test, feature_type)
Example #14
def sgd_optimization(learning_rate=0.13,
                     n_epochs=1000,
                     dataset='../../data/mnist.pkl.gz',
                     batch_size=600):
    """
    Demonstrate stochastic gradient descent optimization of a log-linear
    model.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
                          gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: size of each minibatch
    """
    
    datasets = load_data(dataset)
    train_x, train_y = datasets[0]
    valid_x, valid_y = datasets[1]
    test_x, test_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_x.get_value(borrow=True).shape[0] // batch_size
    n_valid_batches = valid_x.get_value(borrow=True).shape[0] // batch_size
    n_test_batches = test_x.get_value(borrow=True).shape[0] // batch_size


    # build the model
    print("... building the model")
    # allocate symbolic variables for the data
    # index to a minibatch
    index = T.lscalar()

    # generate symbolic variables for input (x and y represent a
    # minibatch)
    x = T.matrix('x')  # data, presented as rasterized images
    y = T.ivector('y')  # labels, presented as 1D vector of [int] labels

    # construct the logistic regression class
    # each MNIST image has size 28*28
    classifier = LogisticRegression(input=x,
                                    n_in=28*28,
                                    n_out=10)
    # the cost we minimize during training is the negative log likelihood
    # of the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made
    # by the model on a minibatch
    test_givens = {
        x: test_x[index * batch_size : (index+1) * batch_size],
        y: test_y[index * batch_size : (index+1) * batch_size]
    }
    test_model = theano.function(inputs=[index],
                                 outputs=classifier.errors(y),
                                 givens=test_givens)
    valid_givens = {
        x: valid_x[index * batch_size : (index+1) * batch_size],
        y: valid_y[index * batch_size : (index+1) * batch_size]
    }
    valid_model = theano.function(inputs=[index],
                                  outputs=classifier.errors(y),
                                  givens=valid_givens)

    # compute the gradient of cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # specify how to update the parameters of the model as a list of
    # (variable, update expression) pairs.
    updates = [
        (classifier.W, classifier.W - learning_rate * g_W),
        (classifier.b, classifier.b - learning_rate * g_b)
    ]
    
    # compiling a Theano function train_model that returns the cost,
    # but in the same time updates the parameter of the model based on
    # the rules defined in `updates`.
    train_givens = {
        x: train_x[index * batch_size : (index+1) * batch_size],
        y: train_y[index * batch_size : (index+1) * batch_size]
    }
    train_model = theano.function(inputs=[index],
                                  outputs=cost,
                                  updates=updates,
                                  givens=train_givens)


    # train model
    print('... training the model')
    # early-stopping parameters
    # look as this many examples regardless
    patience = 5000
    # wait this much longer when a new best is found
    patience_increase = 2
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995
    # go through this many
    # minibatch before checking the network
    # on the validation set
    # in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in range(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [valid_model(i) for i in range(n_valid_batches)]
                this_validation_loss = np.mean(validation_losses)
                print(
                    "epoch %i, minibatch %i/%i, validation error %f %%" %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        this_validation_loss * 100.
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    
                    # test it on the test set
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    test_score = np.mean(test_losses)
                    print(
                        ('epoch %i, minibatch %i/%i, test error of best model %f %%') %
                        (
                            epoch,
                            minibatch_index + 1,
                            n_train_batches,
                            test_score * 100.
                        )
                    )

                    # save the best model
                    with open('outputs/best_model.pkl', 'wb') as f:
                        pickle.dump(classifier, f)

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print(
        ('Optimization complete with best validation score of %f %%, '
         'with test performance %f %%') %
        (best_validation_loss * 100., test_score * 100.)
    )
    print('The code run for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' +
           os.path.split(__file__)[1] +
           ' ran for %.1fs' % ((end_time - start_time))), file=sys.stderr)
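
The early-stopping policy above can be distilled into a few lines; the names are illustrative, with toy values standing in for the stream of validation losses:

patience = 5000             # examine at least this many minibatch updates
patience_increase = 2       # extend the budget when a clearly better model appears
improvement_threshold = 0.995

validation_losses = [0.90, 0.50, 0.49, 0.48, 0.20]  # toy data
best = float('inf')
for it, loss in enumerate(validation_losses):
    if loss < best * improvement_threshold:   # significant improvement
        patience = max(patience, it * patience_increase)
    best = min(best, loss)
    if patience <= it:                        # budget exhausted
        break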
Example #15
eps = [0.01, 0.05, 0.1, 0.5]

# Task 3. Experiments
# 1. Compare accuracy of naive bayes and logistic regression

# Get cross validation accuracy for 5-fold cv
print("Ionosphere validation accuracy (default parameters):")
evaluation.cross_validation(5, ionosphere_train_features, ionosphere_train_labels, model=LogisticRegression)

# Grid search for optimal hyperparameters
print("Ionosphere grid search hyperparameters:")
ionosphere_max_val_acc, ionosphere_arg_max = evaluation.grid_search(learning_rates=lrs, epsilons=eps, lambdas=lamdas, x=ionosphere_train_features, y=ionosphere_train_labels, model=LogisticRegression)

# Accuracy on test split - train with best hyperparameters
print("Ionosphere test accuracy:")
logistic_ionosphere = LogisticRegression(ionosphere_train_features, ionosphere_train_labels)
logistic_ionosphere.fit(lr=ionosphere_arg_max[0], eps=ionosphere_arg_max[1], regularization=ionosphere_arg_max[2])
ionosphere_prediction = logistic_ionosphere.predict(ionosphere_test_features)
cm_ionosphere = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_prediction)
print("Accuracy:", evaluation.accuracy(cm_ionosphere), "Precision:", evaluation.precision(cm_ionosphere), "Recall:", evaluation.true_positive(cm_ionosphere), "F1:", evaluation.f_score(cm_ionosphere))

# 5-fold CV for naive bayes
print("Ionosphere validation accuracy (naive bayes):")
evaluation.cross_validation_naive(5, ionosphere_dataset.train_data, NaiveBayes, ionosphere_dataset.label_column, ionosphere_dataset.feature_columns)

naive_ionosphere = NaiveBayes(ionosphere_dataset.train_data, ionosphere_dataset.label_column, continuous=ionosphere_dataset.feature_columns)

print("Ionosphere test accuracy (naive bayes):")

ionosphere_pred_naive = ionosphere_dataset.test_data.apply(naive_ionosphere.predict, axis=1)
cm_ionosphere_naive = evaluation.confusion_matrix(ionosphere_test_labels, ionosphere_pred_naive.to_numpy())
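
The grid search used above exhaustively scores every (learning rate, epsilon, lambda) combination and keeps the best. A rough stand-alone sketch (the project's evaluation.grid_search may differ in details):

import itertools

def grid_search_sketch(learning_rates, epsilons, lambdas, evaluate):
    """Return the best validation accuracy and the (lr, eps, lam) that achieves it."""
    best_acc, best_args = -1.0, None
    for lr, eps, lam in itertools.product(learning_rates, epsilons, lambdas):
        acc = evaluate(lr, eps, lam)  # e.g. mean k-fold cross-validation accuracy
        if acc > best_acc:
            best_acc, best_args = acc, (lr, eps, lam)
    return best_acc, best_args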
Example #16
def main(model, dataset, learning_rate, stopping_condition, threshold=None):
    """
    Main program
    :param model: Str
    :param dataset: Str
    :param learning_rate: Float
    :param stopping_condition: Float
    :param threshold: Float
    :return: None
    """
    # Import, process, and normalize data
    if dataset == 'breast':
        df = preprocessing.process_breast_cancer_data()
    elif dataset == 'glass':
        df = preprocessing.process_glass_data()
    elif dataset == 'iris':
        df = preprocessing.process_iris_data()
    elif dataset == 'soybean':
        df = preprocessing.process_soybean_data()
    elif dataset == 'voter':
        df = preprocessing.process_voter_data()

    # Set up stratified 5-fold cross-validation; only necessary for classification
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
    training_sets, test_sets = [], []
    for fold, (train, test) in enumerate(
            skf.split(X=np.zeros(len(df)), y=df.iloc[:, -1:])):
        training_sets.append(df.iloc[train])
        test_sets.append(df.iloc[test])

    # Train; run 5 experiments in total
    training_errors, trained_models = [], []
    for training_set in training_sets:
        print("\nTraining:")
        training_data = training_set.iloc[:, 1:-1].to_numpy().T
        training_labels = training_set.iloc[:, -1:].to_numpy().T
        classes = df['class'].unique()
        if model == 'adaline':
            my_model = \
                Adaline(training_data, training_labels, classes, learning_rate, threshold, stopping_condition, raw_data=training_set)
        elif model == 'logistic_regression':
            my_model = \
                LogisticRegression(training_data, training_labels, classes, learning_rate, threshold, stopping_condition, raw_data=training_set)
        if dataset == 'breast' or dataset == 'voter':
            my_model.train()
        elif dataset == 'glass' or dataset == 'iris' or dataset == 'soybean':
            my_model.multi_train()
        trained_models.append(my_model)
        training_errors.append(my_model.get_training_error())
        my_model.plot_error()
        my_model.report_classifications()

    # Test; run 5 experiments in total
    testing_errors = []
    for model, test_set in zip(trained_models, test_sets):
        print("\nTesting: ")
        testing_data = test_set.iloc[:, 1:-1].to_numpy().T
        testing_labels = test_set.iloc[:, -1:].to_numpy().T
        if dataset == 'breast' or dataset == 'voter':
            model.test(testing_data, testing_labels)
        elif dataset == 'glass' or dataset == 'iris' or dataset == 'soybean':
            model.multi_test(testing_data, testing_labels)
        testing_errors.append(model.get_testing_error())
        model.report_classifications()

    # Report average results
    average_training_error = sum(training_errors) / len(training_errors)
    average_testing_error = sum(testing_errors) / len(testing_errors)
    print("\nSummary:")
    print(f"Average training error: {average_training_error}")
    print(f"Average testing error: {average_testing_error}")
Example #17
def run(config):
    input_dir = config['input_dir']
    prefix = config['prefix']
    field = config['field']
    label_name = config['label_name']
    random_test_prop = config['random_test_prop']
    metadata_name = config['metadata_name']
    train_start = config['train_start']
    train_end = config['train_end']
    test_start = config['test_start']
    test_end = config['test_end']
    max_n_train = config['max_n_train']
    sample_labels = config['sample_labels']
    penalty = config['penalty']
    objective = config['objective']
    average = config['average']
    cshift = config['cshift']

    # make the output directory and save the config file
    output_dir = make_output_dir(config)
    fh.makedirs(output_dir)
    fh.write_to_json(config, os.path.join(output_dir, 'config.json'))

    # load features
    print(input_dir, label_name, train_end, test_start, penalty, objective,
          cshift)

    print("Loading features")
    all_X, all_ids, all_vocab = load_all_features(input_dir, prefix, config)
    all_ids_index = dict(zip(all_ids, range(len(all_ids))))
    n_items, n_features = all_X.shape
    print("Full feature matrix shape = ", all_X.shape)

    # if desired, do a random split into test and nontest data
    if random_test_prop is not None:
        print("Doing random train/test split")
        test_prop = float(random_test_prop)
        n_test_all = int(n_items * test_prop)
        test_indices = np.random.choice(np.arange(n_items),
                                        size=n_test_all,
                                        replace=False)
        test_items_all = [all_ids[i] for i in test_indices]
        nontest_items_all = list(set(all_ids) - set(test_items_all))
        n_nontest_all = len(nontest_items_all)

    # alternatively, if metadata exists, use it to split into test and nontest
    elif metadata_name is not None:
        metadata_file = os.path.join(input_dir,
                                     prefix + '.' + metadata_name + '.csv')
        metadata_df = pd.read_csv(metadata_file, header=0, index_col=0)
        metadata_df.index = [str(i) for i in metadata_df.index]
        field_vals = list(set(metadata_df[field].values))
        field_vals.sort()
        print("Splitting data according to %s" % field)
        print("Values:", field_vals)

        print("Testing on %s to %s" % (test_start, test_end))
        # first, split into training and non-train data based on the field of interest
        test_selector_all = (metadata_df[field] >=
                             test_start) & (metadata_df[field] <= test_end)
        metadata_test_df = metadata_df[test_selector_all]
        test_items_all = list(metadata_test_df.index)
        n_test_all = len(test_items_all)

        nontest_selector_all = (metadata_df[field] >= train_start) & (
            metadata_df[field] <= train_end)
        metadata_nontest_df = metadata_df[nontest_selector_all]
        nontest_items_all = list(metadata_nontest_df.index)
        n_nontest_all = len(nontest_items_all)

    # otherwise, there is not test data; just train a model
    else:
        nontest_items_all = list(all_ids)
        n_nontest_all = len(nontest_items_all)
        test_items_all = []
        n_test_all = 0

    # if there is test data, learn a model to distinguish train from test (if desired):
    weights_df = pd.DataFrame(np.ones(len(all_ids)),
                              index=all_ids,
                              columns=['weight'])
    if n_test_all > 0 and cshift:
        print("Training models for covariates shift")
        # split test and nontest to get balanced subsets
        test_items_1 = list(
            np.random.choice(test_items_all,
                             size=int(n_test_all / 2),
                             replace=False))
        test_items_2 = list(set(test_items_all) - set(test_items_1))
        nontest_items_1 = list(
            np.random.choice(nontest_items_all,
                             size=int(n_nontest_all / 2),
                             replace=False))
        nontest_items_2 = list(set(nontest_items_all) - set(nontest_items_1))

        # combine the test and nontest data into two balanced sets
        cset1_items = nontest_items_1 + test_items_1
        cset2_items = nontest_items_2 + test_items_2
        y1 = [0] * len(nontest_items_1) + [1] * len(test_items_1)
        y2 = [0] * len(nontest_items_2) + [1] * len(test_items_2)

        cset1_indices = [all_ids_index[i] for i in cset1_items]
        cset2_indices = [all_ids_index[i] for i in cset2_items]
        X1 = all_X[cset1_indices, :]
        X2 = all_X[cset2_indices, :]

        # train two models, one on each half of the data, using the other as a dev set
        cshift_model1 = LogisticRegression(n_classes=2,
                                           penalty='l2',
                                           objective='acc')
        cshift_model1.create_alpha_grid(config['n_alphas'],
                                        config['alpha_min'],
                                        config['alpha_max'])
        cshift_model1.fit(X1, y1, None, X2, y2, None, 1)

        cshift_model2 = LogisticRegression(n_classes=2,
                                           penalty='l2',
                                           objective='acc')
        cshift_model2.create_alpha_grid(config['n_alphas'],
                                        config['alpha_min'],
                                        config['alpha_max'])
        cshift_model2.fit(X2, y2, None, X1, y1, None, 1)

        # now get the models' predictions on the dev data, which will inform future weighting
        y1_pred_probs = cshift_model2.predict_proba(X1)
        for i, item in enumerate(nontest_items_1):
            weights_df.loc[item] = n_nontest_all / float(n_test_all) * (
                1.0 / y1_pred_probs[i, 0] - 1)

        y2_pred_probs = cshift_model1.predict_proba(X2)
        for i, item in enumerate(nontest_items_2):
            weights_df.loc[item] = n_nontest_all / float(n_test_all) * (
                1.0 / y2_pred_probs[i, 0] - 1)

    # reset random seed for consistency with/without using cshift
    if config['seed'] is not None:
        np.random.seed(int(config['seed']))

    print("Weights mean/min/max:", np.mean(weights_df.values),
          np.min(weights_df.values), np.max(weights_df.values))

    # only keep the items in the train and test sets
    all_items = nontest_items_all + test_items_all
    print("Train: %d, Test: %d (labeled and unlabeled)" %
          (n_nontest_all, n_test_all))

    # load labels
    label_file = os.path.join(input_dir, prefix + '.' + label_name + '.csv')
    labels_df = pd.read_csv(label_file, index_col=0, header=0)
    labels_df.index = [str(i) for i in labels_df.index]
    labels_df = labels_df.loc[all_items]
    class_names = labels_df.columns

    # find the labeled items
    print("Subsetting items with labels")
    label_sums_df = labels_df.sum(axis=1)
    labeled_item_selector = label_sums_df > 0
    labels_df = labels_df[labeled_item_selector]
    n_labeled_items, n_classes = labels_df.shape
    print("%d labeled items and %d classes" % (n_labeled_items, n_classes))
    labeled_items = set(labels_df.index)

    if n_classes > 2 and config['objective'] == 'calibration':
        sys.exit(
            "*ERROR*: Calibration objective has not been implemented for more than 2 classes"
        )

    nontest_items = [i for i in nontest_items_all if i in labeled_items]
    test_items = [i for i in test_items_all if i in labeled_items]
    n_nontest = len(nontest_items)
    n_test = len(test_items)

    # take a subset of the nontest items up to a max size, if desired.
    if max_n_train is not None and n_nontest > max_n_train:
        print("Sampling a set of %d labels" % max_n_train)
        nontest_indices = np.random.choice(np.arange(n_nontest),
                                           size=max_n_train,
                                           replace=False)
        nontest_items = [nontest_items[i] for i in nontest_indices]
        n_nontest = len(nontest_items)

    # split the training set into train and dev
    print("Splitting nontest into train and dev")
    np.random.shuffle(nontest_items)
    n_dev = int(n_nontest / config['dev_folds'])
    dev_fold = int(config['dev_fold'])
    dev_items = nontest_items[n_dev * dev_fold:n_dev * (dev_fold + 1)]
    train_items = list(set(nontest_items) - set(dev_items))
    train_items.sort()
    dev_items.sort()
    n_train = len(train_items)
    n_dev = len(dev_items)

    print("Train: %d, dev: %d, test: %d" % (n_train, n_dev, n_test))
    fh.write_list_to_text([str(n_train)],
                          os.path.join(output_dir, 'train.n.txt'))
    fh.write_list_to_text([str(n_test)], os.path.join(output_dir,
                                                      'test.n.txt'))
    fh.write_list_to_text([str(n_dev)], os.path.join(output_dir, 'dev.n.txt'))

    test_labels_df = labels_df.loc[test_items]
    nontest_labels_df = labels_df.loc[nontest_items]
    train_labels_df = labels_df.loc[train_items]
    dev_labels_df = labels_df.loc[dev_items]

    test_weights_df = weights_df.loc[test_items]
    nontest_weights_df = weights_df.loc[nontest_items]
    train_weights_df = weights_df.loc[train_items]
    dev_weights_df = weights_df.loc[dev_items]

    # Convert (possibly multiply-annotated) labels to one label per instance, either by duplicating or sampling
    test_labels_df, test_weights_df = prepare_labels(
        test_labels_df, sample=False, weights_df=test_weights_df)
    nontest_labels_df, nontest_weights_df = prepare_labels(
        nontest_labels_df, sample=sample_labels, weights_df=nontest_weights_df)
    train_labels_df, train_weights_df = prepare_labels(
        train_labels_df, sample=sample_labels, weights_df=train_weights_df)
    dev_labels_df, dev_weights_df = prepare_labels(dev_labels_df,
                                                   sample=sample_labels,
                                                   weights_df=dev_weights_df)

    test_labels_df.to_csv(os.path.join(output_dir, 'test_labels.csv'))
    nontest_labels_df.to_csv(os.path.join(output_dir, 'nontest_labels.csv'))
    train_labels_df.to_csv(os.path.join(output_dir, 'train_labels.csv'))
    dev_labels_df.to_csv(os.path.join(output_dir, 'dev_labels.csv'))

    test_weights_df.to_csv(os.path.join(output_dir, 'test_weights.csv'))
    nontest_weights_df.to_csv(os.path.join(output_dir, 'nontest_weights.csv'))
    train_weights_df.to_csv(os.path.join(output_dir, 'train_weights.csv'))
    dev_weights_df.to_csv(os.path.join(output_dir, 'dev_weights.csv'))

    # get one-hot label matrices (one row per instance) for each subset
    train_labels = train_labels_df.values
    dev_labels = dev_labels_df.values
    test_labels = test_labels_df.values
    nontest_labels = nontest_labels_df.values

    # get weight vectors for each subset
    train_weights = train_weights_df.values[:, 0]
    dev_weights = dev_weights_df.values[:, 0]
    test_weights = test_weights_df.values[:, 0]
    nontest_weights = nontest_weights_df.values[:, 0]

    # get new item lists which correspond to the label data frames
    test_items = list(test_labels_df.index)
    dev_items = list(dev_labels_df.index)
    train_items = list(train_labels_df.index)

    n_test = len(test_items)

    # gather training features
    feature_index = dict(zip(all_ids, range(len(all_ids))))
    train_indices = [feature_index[i] for i in train_items]
    dev_indices = [feature_index[i] for i in dev_items]
    test_indices = [feature_index[i] for i in test_items]

    train_X = all_X[train_indices, :]
    dev_X = all_X[dev_indices, :]
    test_X = all_X[test_indices, :]

    print(train_X.shape, dev_X.shape, test_X.shape)

    nontest_prop = np.dot(nontest_weights,
                          nontest_labels) / nontest_weights.sum()
    print("Non-test label proportions:", nontest_prop)
    fh.write_list_to_text([str(nontest_prop[1])],
                          os.path.join(output_dir, 'nontest.prop.txt'))

    if n_test > 0:
        test_prop = np.dot(test_weights, test_labels) / test_weights.sum()
        print("Test label proportions:", test_prop)
        fh.write_list_to_text([str(test_prop[1])],
                              os.path.join(output_dir, 'test.prop.true.txt'))
        fh.write_list_to_text([str(np.abs(test_prop[1] - nontest_prop[1]))],
                              os.path.join(output_dir,
                                           'test.prop.ae.nontest.txt'))
    else:
        test_prop = None

    pos_label = 1
    # use zero as the positive label if it is the minority class
    if n_classes == 2:
        if nontest_prop[1] > 0.5:
            pos_label = 0
            print("Using %d as the positive label" % pos_label)

    # convert the label matrices into a categorical label vector
    train_label_vector = np.argmax(train_labels, axis=1)
    test_label_vector = np.argmax(test_labels, axis=1)
    dev_label_vector = np.argmax(dev_labels, axis=1)

    # train a model
    model = LogisticRegression(n_classes=n_classes,
                               penalty=penalty,
                               objective=objective)
    model.create_alpha_grid(config['n_alphas'], config['alpha_min'],
                            config['alpha_max'])
    model.fit(train_X, train_label_vector, train_weights, dev_X,
              dev_label_vector, dev_weights, pos_label, average)

    print("Number of non-zero weights = %d" % model.get_model_size())

    # predict on train, dev, and test data
    train_f1, train_acc, train_cal = predict_evaluate_and_save(
        model,
        train_X,
        train_items,
        class_names,
        train_label_vector,
        pos_label=pos_label,
        average=average,
        weights=train_weights,
        output_dir=output_dir,
        output_prefix='train')
    dev_f1, dev_acc, dev_cal = predict_evaluate_and_save(model,
                                                         dev_X,
                                                         dev_items,
                                                         class_names,
                                                         dev_label_vector,
                                                         pos_label=pos_label,
                                                         average=average,
                                                         weights=dev_weights,
                                                         output_dir=output_dir,
                                                         output_prefix='dev')
    if n_test > 0:
        test_f1, test_acc, test_cal = predict_evaluate_and_save(
            model,
            test_X,
            test_items,
            class_names,
            test_label_vector,
            pos_label=pos_label,
            average=average,
            weights=test_weights,
            output_dir=output_dir,
            output_prefix='test')

    else:
        test_f1 = np.nan
        test_acc = np.nan
        test_cal = np.nan
    print("Accuracy values: train %0.4f; dev %0.4f; test %0.4f" %
          (train_acc, dev_acc, test_acc))
    print("F1 values: train %0.4f; dev %0.4f; test %0.4f" %
          (train_f1, dev_f1, test_f1))
    #print("Cal values: train %0.4f; dev %0.4f; test %0.4f" % (train_cal, dev_cal, test_cal))

    if n_test > 0:
        test_pred = model.predict(test_X)
        cc_prop = compute_proportions_from_predicted_labels(test_pred,
                                                            test_weights,
                                                            n_classes=2)
        print("Predicted proportions on test:")
        print("CC :", cc_prop)
        fh.write_list_to_text([str(cc_prop[1])],
                              os.path.join(output_dir, 'test.prop.cc.txt'))
        fh.write_list_to_text([str(np.abs(test_prop[1] - cc_prop[1]))],
                              os.path.join(output_dir, 'test.prop.ae.cc.txt'))
        test_pred_probs = model.predict_proba(test_X)
        pcc_prop = np.dot(test_weights, test_pred_probs) / np.sum(test_weights)
        print("PCC:", pcc_prop)
        fh.write_list_to_text([str(pcc_prop[1])],
                              os.path.join(output_dir, 'test.prop.pcc.txt'))
        fh.write_list_to_text([str(np.abs(test_prop[1] - pcc_prop[1]))],
                              os.path.join(output_dir, 'test.prop.ae.pcc.txt'))

    if n_test > 0:
        # create a secondary ACC model
        print("Fitting ACC")
        acc_model = ACC()
        acc_model.fit(model, dev_X, dev_label_vector, dev_weights)
        acc_proportions = acc_model.predict_proportions(test_X, test_weights)
        print("ACC proportions:", acc_proportions)
        fh.write_list_to_text([str(acc_proportions[1])],
                              os.path.join(output_dir, 'test.prop.acc.txt'))
        fh.write_list_to_text([str(np.abs(test_prop[1] - acc_proportions[1]))],
                              os.path.join(output_dir, 'test.prop.ae.acc.txt'))

        # create a secondary calibration model
        print("Fitting Platt")
        platt_model = Platt()
        platt_model.fit(model,
                        dev_X,
                        dev_label_vector,
                        dev_weights,
                        smoothing=True)
        platt_proportions = platt_model.predict_proportions(
            test_X, test_weights)
        print("Platt proportions:", platt_proportions)
        fh.write_list_to_text([str(platt_proportions[1])],
                              os.path.join(output_dir, 'test.prop.platt.txt'))
        fh.write_list_to_text(
            [str(np.abs(test_prop[1] - platt_proportions[1]))],
            os.path.join(output_dir, 'test.prop.ae.platt.txt'))

    print_top_words(model,
                    dev_X,
                    all_vocab,
                    n_classes=n_classes,
                    n_words=40,
                    output_dir=output_dir)
    joblib.dump(model, os.path.join(output_dir, 'model.pkl'))
    #fh.write_list_to_text(all_vocab, os.path.join(output_dir, 'model.vocab.txt.gz'), do_gzip=True)
    fh.write_to_json(all_vocab,
                     os.path.join(output_dir, 'model.vocab.json.test.gz'),
                     sort_keys=False,
                     do_gzip=True)
    #fh.write_to_json(all_vocab, os.path.join(output_dir, 'model.vocab.json'), sort_keys=False)

    print("")
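
The covariate-shift block in this example trains classifiers to distinguish training items from test items and converts their predictions into importance weights: with p = P(train | x), the weight (n_train / n_test) * (1/p - 1) equals (n_train / n_test) * P(test | x) / P(train | x). A minimal single-model sketch, with scikit-learn standing in for the project's LogisticRegression (the original uses two cross-fitted models so that no item is scored by a model it was trained on):

import numpy as np
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

def covariate_shift_weights(X_train, X_test):
    """Weight each training point by approximately P(test | x) / P(train | x)."""
    X = np.vstack([X_train, X_test])
    y = np.r_[np.zeros(len(X_train)), np.ones(len(X_test))]  # 0 = train, 1 = test
    clf = SkLogisticRegression().fit(X, y)
    p_train = clf.predict_proba(X_train)[:, 0]                # P(train | x)
    return (len(X_train) / len(X_test)) * (1.0 / p_train - 1.0)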
Example #18
def main(args):
    if not os.path.exists(args.config):
        raise FileNotFoundError('provided config file does not exist: %s' % args.config)
    config_yaml = yaml.load(open(args.config, "r"), Loader=yaml.FullLoader)

    if 'restart_log_dir_path' not in config_yaml['simclr']['train'].keys():
        config_yaml['simclr']['train']['restart_log_dir_path'] = None

    if args.data_dir_path is not None:
        config_yaml['simclr']['train']['data_dir_path'] = args.data_dir_path
        print('overriding data_dir_path:', args.data_dir_path)

    config_yaml['logger_name'] = 'logreg'
    config = SimCLRConfig(config_yaml)

    if not os.path.exists(config.base.output_dir_path):
        os.mkdir(config.base.output_dir_path)

    if not os.path.exists(config.base.log_dir_path):
        os.makedirs(config.base.log_dir_path)

    logger = setup_logger(config.base.logger_name, config.base.log_file_path)
    logger.info('using config: %s' % config)

    config_copy_file_path = os.path.join(config.base.log_dir_path, 'config.yaml')
    shutil.copy(args.config, config_copy_file_path)

    writer = SummaryWriter(log_dir=config.base.log_dir_path)

    if not os.path.exists(args.model):
        raise FileNotFoundError('provided model directory does not exist: %s' % args.model)
    else:
        logger.info('using model directory: %s' % args.model)

    config.logistic_regression.model_path = args.model
    logger.info('using model_path: {}'.format(config.logistic_regression.model_path))

    config.logistic_regression.epoch_num = args.epoch_num
    logger.info('using epoch_num: {}'.format(config.logistic_regression.epoch_num))

    model_file_path = Path(config.logistic_regression.model_path).joinpath(
        'checkpoint_' + config.logistic_regression.epoch_num + '.pth')
    if not os.path.exists(model_file_path):
        raise FileNotFoundError('model file does not exist: %s' % model_file_path)
    else:
        logger.info('using model file: %s' % model_file_path)

    train_dataset, val_dataset, test_dataset, classes = Datasets.get_datasets(config,
                                                                              img_size=config.logistic_regression.img_size)
    num_classes = len(classes)

    train_loader, val_loader, test_loader = Datasets.get_loaders(config, train_dataset, val_dataset, test_dataset)

    simclr_model = load_simclr_model(config)
    simclr_model = simclr_model.to(config.base.device)
    simclr_model.eval()

    model = LogisticRegression(simclr_model.num_features, num_classes)
    model = model.to(config.base.device)

    learning_rate = config.logistic_regression.learning_rate
    momentum = config.logistic_regression.momentum
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum, nesterov=True)
    criterion = torch.nn.CrossEntropyLoss()

    logger.info("creating features from pre-trained context model")
    (train_x, train_y, test_x, test_y) = get_features(
        config, simclr_model, train_loader, test_loader
    )

    feature_train_loader, feature_test_loader = get_data_loaders(
        config, train_x, train_y, test_x, test_y
    )

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    best_epoch = 0
    best_loss = 0

    for epoch in range(config.logistic_regression.epochs):
        loss_epoch, accuracy_epoch = train(
            config, feature_train_loader, model, criterion, optimizer
        )

        loss = loss_epoch / len(feature_train_loader)
        accuracy = accuracy_epoch / len(feature_train_loader)

        writer.add_scalar("Loss/train_epoch", loss, epoch)
        writer.add_scalar("Accuracy/train_epoch", accuracy, epoch)
        logger.info(
            "epoch [%3.i|%i] -> train loss: %f, accuracy: %f" % (
                epoch + 1, config.logistic_regression.epochs, loss, accuracy)
        )

        if accuracy > best_acc:
            best_loss = loss
            best_epoch = epoch + 1
            best_acc = accuracy
            best_model_wts = copy.deepcopy(model.state_dict())

    model.load_state_dict(best_model_wts)
    logger.info(
        "train dataset performance -> best epoch: {}, loss: {}, accuracy: {}".format(best_epoch, best_loss, best_acc, )
    )

    loss_epoch, accuracy_epoch = test(
        config, feature_test_loader, model, criterion
    )

    loss = loss_epoch / len(feature_test_loader)
    accuracy = accuracy_epoch / len(feature_test_loader)
    logger.info(
        "test dataset performance -> best epoch: {}, loss: {}, accuracy: {}".format(best_epoch, loss, accuracy)
    )
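
This example follows the standard linear-evaluation protocol for SimCLR: the encoder is frozen (simclr_model.eval(), features extracted once) and only a linear classifier is trained on top. Assuming the LogisticRegression class here is a single linear layer trained with cross-entropy, a sketch of it might be:

import torch

class LogisticRegressionProbe(torch.nn.Module):
    """A single linear layer; CrossEntropyLoss applies the softmax."""

    def __init__(self, num_features, num_classes):
        super().__init__()
        self.fc = torch.nn.Linear(num_features, num_classes)

    def forward(self, x):
        return self.fc(x)  # raw logits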
Example #19
# Retrieve args
args_parser = Parser()
args, data_args, algo_args, model_args, solvers = args_parser.get_args()

# Identify node
comm = mpi4py.MPI.COMM_WORLD
rank = comm.Get_rank()

# Set up loggers / plotters
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(f"Main {rank}")

plotter = Plotter(filename=args.plotter_path)

# Load model and dataset
model = LogisticRegression(**model_args)

error_model = model.get_global(comm.size)
global_dataset = LIBSVM_Loader(**data_args, seed=args.seed,
                               rank=rank).load(**data_args,
                                               comm_size=comm.size)
local_dataset = global_dataset.get_truncated(rank, comm.size)
if rank > 0:
    global_dataset = None

# Build graph
graph = get_graph_class(args.graph)(comm.size, seed=args.seed, logger=log)
log.info(graph)

# Name the run
filename = str(time.time()).split(".")[0]
Example #20
    ap = AveragedPerceptron()
    ap.train(learning_rates)
    ap.report()
    ap.evaluate()

    ############################################
    ###### Part II                   ###########
    ############################################

    svm = SVM(verbose=True)
    svm.train(epochs=20)
    hm.report(svm)
    hm.evaluate(svm)

    lr = LogisticRegression(verbose=True)
    lr.train(epochs=20)
    hm.report(lr)
    hm.evaluate(lr)

    nb = NaiveBayes()
    nb.train(epochs=1)
    hm.report(nb)
    hm.evaluate(nb)

    # Logistic regression using sklearn
    import data as dt
    from sklearn.linear_model import LogisticRegression

    train_data = dt.load_data(dt.TRAIN, matrix=True)
    test_data = dt.load_data(dt.TEST, matrix=True)
Example #21

# Data from:
# https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study
X = np.array([
    0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00,
    3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50
],
             dtype='float32')
y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1],
             dtype='float32')

X, y = np.reshape(X, (20, 1)), np.reshape(y, (20, 1))
X = np.concatenate((np.ones((20, 1), dtype='float32'), X), axis=1)

# Fit model to data
model = LogisticRegression(data=X, labels=y)
weights = model.fit(alpha=0.1, verbose=True)

# Generate line of best fit
x_bf = np.linspace(0, 6, dtype='float32')
y_bf = np.array([sigmoid(weights[0][0] + x * weights[1][0]) for x in x_bf],
                dtype='float32')

plt.scatter(X[:, 1], y, color='b', s=75, label='Samples')
plt.plot(x_bf, y_bf, color='r', label='Fitted Model')
plt.xlabel('$x$')
plt.ylabel('$y$')
plt.title('Logistic Regression')
plt.legend()
plt.show()
Example #22

class Platt:
    """
    Apply Platt scaling to a score classifier
    """

    def __init__(self, penalty='l2', alpha=100000.0):
        self._penalty = penalty
        self._alpha = alpha
        self._p_pred_given_true = None
        self._base_model = None
        self._platt_model = None

    def fit(self, model, X, label_vector, weights, smoothing=False):
        n_classes = model.get_n_classes()
        if n_classes > 2:
            sys.exit("Platt scaling not yet implemented for more than 2 classes.")

        self._base_model = model

        if smoothing:
            X, label_vector, weights = self.reweight_data(X, label_vector, weights)

        scores = np.reshape(model.score(X), (len(label_vector), 1))

        bincount = np.bincount(label_vector, minlength=n_classes)
        most_common = np.argmax(bincount)

        # check to see if there is only one label in the training data:
        if bincount[most_common] == len(label_vector):
            print("Only label %d found in dev data; skipping Platt" % most_common)
        else:
            self._platt_model = LogisticRegression(n_classes, alpha=self._alpha, penalty=self._penalty, objective='acc')
            self._platt_model.fit(scores, label_vector, weights)

    def predict_proba(self, X):
        if self._platt_model is None:
            return self._base_model.predict_proba(X)
        else:
            scores = self._base_model.score(X)
            scores = scores.reshape((len(scores), 1))
            return self._platt_model.predict_proba(scores)

    def predict(self, X):
        pred_probs = self.predict_proba(X)
        predictions = np.argmax(pred_probs, axis=1)
        return predictions

    def predict_proportions(self, X, weights):
        pred_probs = self.predict_proba(X)
        return np.dot(weights, pred_probs) / np.sum(weights)

    def reweight_data(self, X, label_vector, instance_weights):
        n_classes = self._base_model.get_n_classes()
        cl_sums = np.zeros(n_classes)
        for cl in range(n_classes):
            sel = np.array(label_vector == cl, dtype=bool)
            cl_sums[cl] = np.sum(instance_weights[sel])

        pos_weight = (cl_sums[1] + 1) / float(cl_sums[1] + 2)
        neg_weight = (cl_sums[0] + 1) / float(cl_sums[0] + 2)
        weight_vector = (label_vector * pos_weight + (1-label_vector) * neg_weight) * instance_weights

        if type(X) == list:
            X = X + X
        else:
            X = sparse.vstack([X, X])
        y = np.r_[label_vector, 1-label_vector]
        w = np.r_[weight_vector, 1-weight_vector]

        return X, y, w
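
The reweight_data method implements the target smoothing from Platt's original recipe: with N+ positives and N- negatives, the positive target becomes (N+ + 1)/(N+ + 2) and the negative target 1/(N- + 2). Assuming unit instance weights, this is realized above by stacking every example twice with complementary labels weighted t and 1 - t: with N+ = 3, for instance, each positive appears once with label 1 at weight 4/5 and once with label 0 at weight 1/5.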
Example #23
File: lenet.py, Project: alpesis-ai/mnist
def classifier_lenet5(learning_rate=0.1,
                      n_epochs=200,
                      dataset='../../data/mnist.pkl.gz',
                      nkerns=[20, 50],
                      batch_size=500):
    """ Demonstrates lenet on MNIST dataset.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type dataset: string
    :param dataset: path to the dataset used for training/testing (MNIST)

    :type nkerns: list of ints
    :param nkerns: number of kernels on each layer
    """

    rng = np.random.RandomState(23455)

    datasets = load_data(dataset)
    train_x, train_y = datasets[0]
    valid_x, valid_y = datasets[1]
    test_x, test_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_x.get_value(borrow=True).shape[0]
    n_test_batches = test_x.get_value(borrow=True).shape[0]

    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    # allocate symbolic variables for the data
    # index to a minibatch
    index = T.lscalar()

    # the data is presented as rasterized images
    x = T.matrix('x')
    # the labels are presented as 1D vector of int labels
    y = T.ivector('y')

    # build the model
    print('... building the model')

    # reshape matrix of rasterized images of shape (batch_size, 28*28)
    # to a 4D tensor, compatible with our ConvLayer (28, 28)
    # is the size of MNIST images.
    layer0_input = x.reshape((batch_size, 1, 28, 28))

    # construct the first convolutional pooling layer:
    # filtering reduces the image size to (28-5+1, 28-5+1) = (24, 24)
    # maxpooling reduces this further to (24/2, 24/2) = (12, 12)
    # 4D output tensor is thus of shape (batch_size, nkerns[0], 12, 12)
    layer0 = Conv(rng,
                  input=layer0_input,
                  image_shape=(batch_size, 1, 28, 28),
                  filter_shape=(nkerns[0], 1, 5, 5),
                  poolsize=(2, 2))

    # construct the second convolutional pooling layer
    # filtering reduces the image size to (12-5+1, 12-5+1) = (8, 8)
    # maxpooling reduces this further to (8/2, 8/2) = (4, 4)
    # 4D output tensor is thus of shape (batch_size, nkerns[1], 4, 4)
    layer1 = Conv(rng,
                  input=layer0.output,
                  image_shape=(batch_size, nkerns[0], 12, 12),
                  filter_shape=(nkerns[1], nkerns[0], 5, 5),
                  poolsize=(2, 2))

    # the HiddenLayer being fully-connected, it operates on 2D matrices
    # of shape (batch_size, num_pixels) (i.e. matrices of rasterized images).
    # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4),
    # or (500, 50 * 4 * 4) = (500, 800) with the default values.
    layer2_input = layer1.output.flatten(2)

    # construct a fully-connected sigmoidal layer
    layer2 = HiddenLayer(rng,
                         input=layer2_input,
                         n_in=nkerns[1] * 4 * 4,
                         n_out=500,
                         activation=T.tanh)

    # classify the values of the fully-connected sigmoidal layer
    layer3 = LogisticRegression(input=layer2.output, n_in=500, n_out=10)

    # the cost we minimize during training is the NLL of the model
    cost = layer3.negative_log_likelihood(y)

    # create a function to compute the mistakes that are made by the model
    givens = {
        x: test_x[index * batch_size:(index + 1) * batch_size],
        y: test_y[index * batch_size:(index + 1) * batch_size]
    }
    test_model = theano.function([index], layer3.errors(y), givens=givens)

    givens = {
        x: valid_x[index * batch_size:(index + 1) * batch_size],
        y: valid_y[index * batch_size:(index + 1) * batch_size]
    }
    valid_model = theano.function([index], layer3.errors(y), givens=givens)

    # create a list of all model parameters to be fit by gradient descent
    params = layer3.params + layer2.params + layer1.params + layer0.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # train_model is a function that updates the model parameters by
    # SGD since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter.
    # We thus create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    givens = {
        x: train_x[index * batch_size:(index + 1) * batch_size],
        y: train_y[index * batch_size:(index + 1) * batch_size]
    }
    train_model = theano.function([index],
                                  cost,
                                  updates=updates,
                                  givens=givens)

    # train the model
    print('... training')
    # early-stopping parameters
    # look as this many examples regardless
    patience = 10000
    # wait this much longer when a new best is found
    patience_increase = 2
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995
    # go through this many minibatches before checking the network
    # on the validation set, in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False

    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        for minibatch_index in range(n_train_batches):
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if iter % 100 == 0:
                print("training @ iter = ", iter)
            cost_ij = train_model(minibatch_index)

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    valid_model(i) for i in range(n_valid_batches)
                ]
                this_validation_loss = np.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    test_score = np.mean(test_losses)
                    print((
                        "epoch %i, minibatch %i/%i, test error of best model %f %%"
                    ) % (epoch, minibatch_index + 1, n_train_batches,
                         test_score * 100.))

            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    print("Optimization complete.")
    print("Best validation score of %f %% obtained at iteration %i, "
          "with test performance %f %%" %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))