Example #1
def _test_regression(dataset='mauna_loa', k=1, dist_metric='l2', d=2):
    """
    compute test loss on regression dataset

    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours to test on
        dist_metric: (str) 'l1' or 'l2'
        d: (int, optional) if dataset='rosenbrock', specifies the dataset dimensionality

    Outputs:
        RMSE on test set of the dataset
    """

    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=d)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)

    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])
    return _eval_knn([k, k + 1],
                     x_train,
                     y_train,
                     x_test,
                     y_test,
                     dist_metric,
                     compute_loss=True)
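Several of these examples call a private helper _eval_knn that is not part of the listing. The sketch below is only an assumption about its behaviour: brute-force k-NN regression for every k in [k_range[0], k_range[1]), returning per-k RMSE when compute_loss is true and a dict of predictions keyed by 'k=<i>' otherwise. The real helper may differ.

import numpy as np

def _eval_knn_sketch(k_range, x_train, y_train, x_query, y_query,
                     dist_metric='l2', compute_loss=True):
    """Hypothetical stand-in for _eval_knn (not the original implementation)."""
    if dist_metric == 'l2':
        dists = np.sqrt(((x_query[:, None, :] - x_train[None, :, :]) ** 2).sum(-1))
    else:  # 'l1'
        dists = np.abs(x_query[:, None, :] - x_train[None, :, :]).sum(-1)
    order = np.argsort(dists, axis=1)        # nearest neighbours first
    predictions, losses = {}, []
    for k in range(k_range[0], k_range[1]):
        neighbour_y = y_train[order[:, :k]]  # (n_query, k, d_out)
        pred = neighbour_y.mean(axis=1)      # average the k nearest targets
        predictions['k=' + str(k)] = pred
        losses.append(np.sqrt(np.mean((pred - y_query) ** 2)))
    return losses if compute_loss else predictions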
Example #2
def convert_h5(data_dir,
               label_dir,
               data_split,
               train_volumes,
               test_volumes,
               f,
               data_id,
               remap_config='Neo',
               orientation=preprocessor.ORIENTATION['coronal']):
    # Data splitting
    if data_split:
        train_file_paths, test_file_paths = apply_split(
            data_split, data_dir, label_dir)
    elif train_volumes and test_volumes:
        train_file_paths = du.load_file_paths(data_dir, label_dir, data_id,
                                              train_volumes)
        test_file_paths = du.load_file_paths(data_dir, label_dir, data_id,
                                             test_volumes)
    else:
        raise ValueError(
            'You must either provide the split ratio or a train and test dataset list'
        )

    reduce_slices = False  #True  #BORIS

    print("Train dataset size: %d, Test dataset size: %d" %
          (len(train_file_paths), len(test_file_paths)))
    # loading, pre-processing and writing train data
    print("===Train data===")
    data_train, label_train, class_weights_train, weights_train, _ = du.load_dataset(
        train_file_paths,
        orientation,
        remap_config=remap_config,
        return_weights=True,
        reduce_slices=reduce_slices,  #BORIS
        remove_black=True)

    _write_h5(data_train,
              label_train,
              class_weights_train,
              weights_train,
              f,
              mode='train')

    # loading, pre-processing and writing test data
    print("===Test data===")
    data_test, label_test, class_weights_test, weights_test, _ = du.load_dataset(
        test_file_paths,
        orientation,
        remap_config=remap_config,
        return_weights=True,
        reduce_slices=reduce_slices,  #BORIS
        remove_black=True)

    _write_h5(data_test,
              label_test,
              class_weights_test,
              weights_test,
              f,
              mode='test')
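For orientation, a hypothetical invocation of this converter could look like the call below. The directories, volume lists and data_id are made-up placeholders; only the h5py.File handle is a real API object.

import h5py

# Hypothetical call; directories, volume IDs and data_id are placeholders.
with h5py.File('converted_dataset.h5', 'w') as f:
    convert_h5(data_dir='/data/volumes',
               label_dir='/data/labels',
               data_split=None,                  # use explicit volume lists instead of a ratio
               train_volumes=['vol01', 'vol02'],
               test_volumes=['vol03'],
               f=f,
               data_id='SOME_DATASET_ID',
               remap_config='Neo')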
Example #3
def main():
    print("\nParameters:")
    for attr, value in args.__dict__.items():
        print("\t{}={}".format(attr.upper(), value))

    # load data
    strain_data, sd_train_data, sdev_data, stest_data, embeddings =\
                   data_utils.load_dataset(args, 'askubuntu-master', dtrain=True)
    dtrain_data, ddev_data, dtest_data, _ =\
                   data_utils.load_dataset(args, 'Android-master')

    # initialize necessary parameters
    args.embed_num = embeddings.shape[0]
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

    # load model
    if args.snapshot is None:
        # initialize model
        task_model = None
        if args.model == 'lstm':
            if args.bidirectional and (args.hidden_layer > 1):
                args.hidden_layer = 1
                print('\nMultilayer bidirectional LSTM not supported yet, '
                      'layer set to 1.\n')
            task_model = model.LSTM(args, embeddings)
        elif args.model == 'cnn':
            task_model = model.CNN(args, embeddings)

        domain_model = model.DomainClassifier(args, embeddings)

        # train models
        res = train2.train_model(strain_data, sd_train_data, sdev_data,
                                 stest_data, dtrain_data, ddev_data,
                                 dtest_data, task_model, domain_model, args)
    else:
        print('\nLoading model from [%s]...' % args.snapshot)
        try:
            mod = torch.load(args.snapshot)
        except Exception:
            print("Sorry, this snapshot doesn't exist.")
            exit()
        print(mod)

        # evaluate

        print('\nEvaluating on target dev')
        evaluate.q_evaluate(mod, ddev_data, args)

        print('Evaluating on target test')
        evaluate.q_evaluate(mod, dtest_data, args)
Example #4
def _cross_val(dataset='mauna_loa', k=[1, 10], dist_metric='l1', v=5):
    """
    cross validation technique on knn

    Inputs:
        dataset: (str) name of dataset
        k: (list) k[0]:lower bound of number of nearest neighbours; k[1]:upper bound of number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        v: (int) cross validation parameter, number of cross folds

    Outputs:
        averaged validation loss
    """
    print('------Processing Dataset ' + dataset + ' ------')
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)

    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])

    np.random.seed(42)
    np.random.shuffle(x_train)
    np.random.seed(42)
    np.random.shuffle(y_train)

    data_partition = _partition_fold(v=v, data=x_train)
    loss = np.empty((0, k[1] - k[0]))
    for fold in range(v):
        print('------Processing Fold ' + str(fold + 1) + ' ------')
        train_x = np.delete(x_train, list(data_partition[fold]), axis=0)
        train_y = np.delete(y_train, list(data_partition[fold]), axis=0)

        query_x = np.take(x_train, list(data_partition[fold]), axis=0)
        query_y = np.take(y_train, list(data_partition[fold]), axis=0)

        curr_loss = _eval_knn(k,
                              train_x,
                              train_y,
                              query_x,
                              query_y,
                              dist_metric=dist_metric)
        loss = np.append(loss, [curr_loss], axis=0)

    loss = loss.mean(axis=0)
    return loss
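_partition_fold is not shown in these excerpts. A minimal sketch of what it could look like, assuming it simply splits the row indices of data into v roughly equal folds (an assumption, not the original helper):

import numpy as np

def _partition_fold_sketch(v, data):
    """Hypothetical stand-in: return a list of v index arrays, one per fold."""
    indices = np.arange(data.shape[0])
    return np.array_split(indices, v)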
Example #5
def main(_):
    data_path = 'data/new-dataset-cornell-length10-filter1-vocabSize40000.pkl'
    word2id, id2word, trainingSamples = load_dataset(data_path)
    hparam = Config()
    hparam.is_training = False

    with tf.Session() as sess:

        model = Seq2SeqModel(hparam, word2id)
        ckpt = tf.train.get_checkpoint_state(hparam.save_path)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print("Restoring model parameters from %s." %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(model.init)

        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            batch = sentence_preprocess(sentence, word2id)
            outputs = model.infer_session(sess, batch)

            predicted_ids = outputs["predicted_ids"]
            out_sents = [id2word[idx] for idx in predicted_ids[0][0].tolist()]
            print(" ".join(out_sents))
            print("> ", "")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Example #6
def evaluate_model(model_path, dataset_path='emnist/emnist-balanced-test.csv'):
    raw_test_x, raw_test_y, class_map = data_utils.load_dataset(dataset_path)
    test_x, test_y, _ = data_utils.prepare_data(raw_test_x, raw_test_y,
                                                class_map)
    best_model = load_model(model_path)
    print(best_model.evaluate(test_x, test_y))
    data_utils.print_confusion_matrix(test_x, test_y, model_path, class_map)
Example #7
def _test_predict(l=0):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mauna_loa')
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])

    phi_train = _construct_phi(x_total)
    phi_test = _construct_phi(x_test)

    U, S, Vh = np.linalg.svd(phi_train)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([phi_train.shape[0] - len(S), len(S)])
    sig = np.vstack([sig, filler])

    inv = np.linalg.inv(sig.T @ sig + l * np.eye(sig.shape[1]))
    w = Vh.T @ inv @ sig.T @ (U.T @ y_total)

    prediction = phi_test @ w
    plot(xlabel='x',
         ylabel='y',
         name='mauna_loa_predict',
         x=x_test,
         y=[prediction, y_test],
         legend=['Predicted', 'GroundTruth'])
    return _RMSE(prediction, y_test)
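The SVD manipulation above is one way of solving the regularised least-squares problem w = (Phi^T Phi + l*I)^(-1) Phi^T y. Purely as a cross-check (not part of the original code), the same weights can be computed directly:

# Direct ridge solution for comparison with the SVD route above;
# phi_train, y_total and l are the objects defined in _test_predict.
A = phi_train.T @ phi_train + l * np.eye(phi_train.shape[1])
w_direct = np.linalg.solve(A, phi_train.T @ y_total)  # should match w up to numerical error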
Example #8
def predict_test(dataset='mauna_loa', k=2, dist_metric='l2'):
    """
    run knn and output predicted values on regression test data

    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'

    Outputs:
        [predict_x,GroundTruth_y,predicted_y]
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)

    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])

    predicted_y = np.empty((0, y_train.shape[-1]))
    curr_predict = _eval_knn([k, k + 1],
                             x_train,
                             y_train,
                             x_test,
                             y_test,
                             dist_metric=dist_metric,
                             compute_loss=False)
    predicted_y = np.append(predicted_y, curr_predict['k=' + str(k)], axis=0)

    rval = []
    for idx in range(x_test.shape[0]):
        rval.append((x_test[idx], y_test[idx], predicted_y[idx]))

    rval.sort(key=lambda tup: tup[0])
    return [i[0] for i in rval], [i[1] for i in rval], [i[2] for i in rval]
Example #9
def run_Q5():
    theta_list, test_loss = [0.01, 0.1, 1.0], []
    #theta_list, test_loss = [1.0], []
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'rosenbrock', n_train=200, d=2)
    for theta in theta_list:
        print('----- Processing Theta = ' + str(theta) + '-----')
        I_selected, w = _greedy_alg(x_train, y_train, theta=theta)
        #print(I_selected)
        #print(w)
        loss_total = 0
        big_K = np.empty((0, len(I_selected)))

        for i in range(x_test.shape[0]):
            build_kernel = _test_kernel(basis=I_selected,
                                        x_train=x_train,
                                        test_pt=x_test[i],
                                        theta=theta)
            #print(build_kernel)
            #break
            big_K = np.append(big_K, [build_kernel], axis=0)
        #print(big_K)
        predicted_y = np.dot(big_K, w)
        #print(predicted_y)
        loss = _RMSE(predicted_y, y_test)
        #     loss_total += loss
        # l = loss_total/x_test.shape[0]
        # test_loss.append(l)
        #break
        print('Test Loss: ' + str(loss))
    return loss
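Neither _greedy_alg nor _test_kernel appears in these excerpts. Purely as an assumption, _test_kernel could be an RBF-style kernel row evaluated between the test point and the selected basis points, along the lines of:

import numpy as np

def _test_kernel_sketch(basis, x_train, test_pt, theta):
    """Hypothetical RBF kernel row k(test_pt, x_i) over the selected basis indices."""
    centers = x_train[list(basis)]                    # (m, d)
    sq_dist = ((centers - test_pt) ** 2).sum(axis=1)  # (m,)
    return np.exp(-sq_dist / theta)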
Example #10
def run_example():
    """
    This example demonstrates computation of the negative log likelihood (nll) as
    well as the gradient of the nll with respect to all weights and biases of the
    neural network. We will use 50 neurons per hidden layer and will initialize all 
    weights and biases to zero.
    """
    # load the MNIST_small dataset
    from data_utils import load_dataset
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mnist_small')

    # initialize the weights and biases of the network
    M = 50  # 50 neurons per hidden layer
    W1 = np.zeros((M, 784))  # weights of first (hidden) layer
    W2 = np.zeros((M, M))  # weights of second (hidden) layer
    W3 = np.zeros((10, M))  # weights of third (output) layer
    b1 = np.zeros((M, 1))  # biases of first (hidden) layer
    b2 = np.zeros((M, 1))  # biases of second (hidden) layer
    b3 = np.zeros((10, 1))  # biases of third (output) layer

    # considering the first 250 points in the training set,
    # compute the negative log likelihood and its gradients
    (nll, (W1_grad, W2_grad, W3_grad, b1_grad, b2_grad, b3_grad)) = \
        nll_gradients(W1, W2, W3, b1, b2, b3, x_train[:250], y_train[:250])
    print("negative log likelihood: %.5f" % nll)
Example #11
def run_Q3(l=0.1):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'mauna_loa')
    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])

    K = _Q3_construct_K(x_total)
    R = np.linalg.cholesky((K + l * np.eye(len(K))))
    #print(K)
    #print(K.shape)

    R_inv = np.linalg.inv(R)

    alpha = R_inv.T @ R_inv @ y_total
    K_test = _Q3_construct_test_K(x_total, x_test)
    prediction = K_test @ alpha
    plot(xlabel='x',
         ylabel='y',
         name='mauna_loa_predict_CH',
         x=x_test,
         y=[prediction, y_test],
         legend=['Predicted', 'GroundTruth'])
    z = np.linspace(-0.1, 0.1, 100)
    x = [0] * len(z)
    _visualize_kernel(x, z, 'k(0,z)')
    z = np.linspace(-0.1 + 1, 0.1 + 1, 100)
    x = [1] * len(z)
    _visualize_kernel(x, z, 'k(1,z+1)')
    return _RMSE(prediction, y_test)
Example #12
def model_06():
    # load the dataset
    X_train, Y_train, X_test, Y_test = load_dataset()  # data

    # set the parameters
    layers_dims = [X_train.shape[0], 1]
    num_iter = 2000
    learning_rate = 0.5
    print_cost = False
    initialization = "he"

    parameters, costs = basic_model(X_train,
                                    Y_train,
                                    layers_dims=layers_dims,
                                    num_iter=num_iter,
                                    lr=learning_rate,
                                    print_cost=print_cost,
                                    initialization=initialization)

    # prediction and evaluation
    prediction_train = predict(parameters, X_train)
    prediction_test = predict(parameters, X_test)

    print("Train准确率: {}".format(evaluate(prediction_train, Y_train)))
    print("test准确率: {}".format(evaluate(prediction_test, Y_test)))

    plt.title("Model with He initialization")
    axes = plt.gca()
    axes.set_xlim([-1.5, 1.5])
    axes.set_ylim([-1.5, 1.5])
    plot_decision_boundary(lambda x: predict(parameters, x.T), X_train,
                           Y_train)
    plt.show()
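The example requests initialization="he", but the initializer itself is not shown. A minimal sketch of He initialization for a list of layer sizes (the standard formula, not necessarily this project's exact helper):

import numpy as np

def initialize_parameters_he_sketch(layers_dims, seed=3):
    """He initialization: weights scaled by sqrt(2 / fan_in), biases at zero."""
    np.random.seed(seed)
    parameters = {}
    for l in range(1, len(layers_dims)):
        parameters['W' + str(l)] = (np.random.randn(layers_dims[l], layers_dims[l - 1])
                                    * np.sqrt(2.0 / layers_dims[l - 1]))
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters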
Example #13
    def train(self,
              epoch=25,
              batch_size=1,
              learning_rate=0.0002,
              momentum=0.9,
              decay=0.95,
              data_dir="data",
              dataset_name="cnn",
              vocab_size=1000000):
        if not self.vocab:
            self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name,
                                                    vocab_size)

        self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                             decay=decay,
                                             momentum=momentum)

        for epoch_idx in xrange(epoch):
            data_loader = load_dataset(data_dir, dataset_name, vocab_size)

            contexts, questions, answers = [], [], []
            for batch_idx in xrange(batch_size):
                _, context, question, answer, _ = data_loader.next()
                contexts.append(context)
                questions.append(question)
                answers.append(answer)
Example #14
def main():
    config = get_config()
    config = init_env(config)
    datasets = data_utils.load_dataset(config)

    eval_metric = FewShotMetrics(config, datasets)
    if config.eval:
        model = Model.load(config, config.load_checkpoint)
    else:
        if config.load_checkpoint:
            model = Model.load(config, config.load_checkpoint)
        else:
            word_dict = datasets['train'].word_dict
            classes = datasets['train'].classes
            model = Model(config, word_dict, classes)
        model.train(datasets['train'], datasets['dev'], eval_metric)
        model.load_best()
    test_loader = data_utils.get_dataset_loader(config,
                                                datasets['test'],
                                                train=False)
    evaluate(config,
             model,
             test_loader,
             eval_metric,
             split='test',
             dump=not config.eval)
Example #15
def main(_):

    data_path = 'data/new-dataset-cornell-length10-filter1-vocabSize40000.pkl'
    word2id, id2word, trainingSamples = load_dataset(data_path)
    hparam = Config()

    with tf.Session() as sess:

        model = Seq2SeqModel(hparam, word2id)
        ckpt = tf.train.get_checkpoint_state(hparam.save_path)

        if FLAGS.resume and ckpt and tf.train.checkpoint_exists(
                ckpt.model_checkpoint_path):
            print("Restoring model parameters from %s." %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("Creating model with fresh parameters.")
            sess.run(model.init)

        train_writer = tf.summary.FileWriter(hparam.save_path,
                                             graph=sess.graph)

        for epoch in range(hparam.num_epoch):
            print("Starting Epoch {}/{}:".format(epoch, hparam.num_epoch))

            batches = get_batches(trainingSamples, hparam.batch_size)
            total_loss = 0.0
            total_count = 0

            for nextBatch in tqdm(batches, desc="training"):

                outputs = model.train_session(sess, nextBatch)

                loss = outputs["loss"]
                summary = outputs["summary"]
                step = outputs["step"]
                train_writer.add_summary(summary, step)
                total_loss += loss
                total_count += 1

                if step % hparam.display_per_step == 0:

                    perplexity = math.exp(
                        float(total_loss / total_count)
                    ) if total_loss / total_count < 300 else float('inf')
                    tqdm.write(
                        " Step %d | Per-word Loss %.4f | Perplexity %.4f" %
                        (step, total_loss / total_count, perplexity))

                    checkpoint_path = os.path.join(hparam.save_path,
                                                   hparam.model_name)
                    model.saver.save(sess, checkpoint_path)

            tqdm.write("\n")
            tqdm.write(" Epoch %d | Per-word Loss %.4f | Perplexity %.4f" %
                       (epoch, total_loss / total_count, perplexity))
            tqdm.write("\n")
Example #16
def log_reg_GD(dataset='iris', lr_rates=[0.1], method='SGD', total_iter=2000):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    y_train, y_valid, y_test = y_train[:, (1,)], y_valid[:, (1,)], y_test[:, (1,)]

    y_train, y_valid, y_test = _cast_TF(y_train), _cast_TF(y_valid), _cast_TF(y_test)

    x_train = np.vstack([x_train, x_valid])
    y_train = np.vstack([y_train, y_valid])

    X = np.ones((len(x_train), len(x_train[0]) + 1))
    X[:, 1:] = x_train

    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test

    test_accuracies = []
    test_logs = []
    neg_log = {}

    for rate in lr_rates:

        w = np.zeros(np.shape(X[0, :]))
        neg_log[rate] = []
        bar = tqdm.tqdm(total=total_iter, desc='Iter', position=0)
        for iteration in range(total_iter):
            bar.update(1)

            estimates = X @ w
            estimates = estimates.reshape(np.shape(y_train))

            if method == 'SGD':
                i = random.randint(0, len(y_train) - 1)
                grad_L = (y_train[i] - _sigmoid(estimates[i])) * X[i, :]

            elif method == 'GD':
                grad_L = np.zeros(np.shape(w))
                for i in range(len(y_train)):
                    grad_L += (y_train[i] - _sigmoid(estimates[i])) * X[i, :]

            w = w + (rate * grad_L)
            L = _log_likelihood(estimates, y_train)
            neg_log[rate].append(-L)

        test_estimates = np.dot(X_test, w)
        test_estimates = test_estimates.reshape(np.shape(y_test))
        predictions = np.zeros(np.shape(y_test))
        for i in range(len(predictions)):
            p = _sigmoid(test_estimates[i])
            predictions[i] = (p >= 1 / 2)

        test_accuracies.append(_Q1_compute_acc(y_test, predictions))
        test_logs.append(_log_likelihood(test_estimates, y_test))

    return neg_log, test_accuracies, test_logs
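The helpers _sigmoid and _log_likelihood are referenced but not listed. Minimal versions consistent with how they are used above (Bernoulli log likelihood of the labels under the logistic model) could look like this; treat them as an assumption:

import numpy as np

def _sigmoid_sketch(z):
    return 1.0 / (1.0 + np.exp(-z))

def _log_likelihood_sketch(estimates, y):
    """Sum over points of y*log(p) + (1 - y)*log(1 - p), with p = sigmoid(Xw)."""
    p = np.clip(_sigmoid_sketch(estimates), 1e-12, 1 - 1e-12)  # avoid log(0)
    return np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))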
Example #17
def test():
    parser = argparse.ArgumentParser()
    parser.add_argument('--target', choices=['vitB1', 'vitB12', 'folate'])
    parser.add_argument('--modelType', choices=['lr', 'svc', 'rf', 'knn'])
    parser.add_argument('--reverse', action='store_true')
    opt = parser.parse_args()

    # threshold
    th_dict = dict()
    th_dict['vitB1'] = 30
    th_dict['vitB12'] = 180
    th_dict['folate'] = 4

    # load the dataset
    x_df, y_df, date = data_utils.load_dataset(target=opt.target)

    # preprocess the dataset
    x_data, y_data, weight = data_utils.preprocess_dataset(x_df, y_df, th=th_dict[opt.target])

    # split into train and test
    n_train = np.sum(date < 20170000)
    if opt.reverse:
        x_data, y_data = x_data[::-1], y_data[::-1]
    x_data, x_test, y_data, y_test = train_test_split(x_data, y_data,
                                                      train_size=n_train,
                                                      shuffle=False)

    # model
    if opt.modelType == 'lr':
        model = LogisticRegression(C=1e1, random_state=42, class_weight={1: weight})
    elif opt.modelType == 'svc':
        model = SVC(kernel='rbf', C=1e6, gamma=1e-9, class_weight={1: weight},
                    probability=True, random_state=42)
    elif opt.modelType == 'rf':
        model = RandomForestClassifier(n_estimators=50,
                                       min_samples_split=2,
                                       max_depth=10,
                                       class_weight={1: weight},
                                       random_state=42)
    elif opt.modelType == 'knn':
        model = KNeighborsClassifier(algorithm='auto',
                                     leaf_size=1,
                                     metric='minkowski',
                                     metric_params=None,
                                     n_jobs=1,
                                     n_neighbors=37,
                                     p=1,
                                     weights='uniform')

    # fit and predict
    model.fit(x_data, y_data)
    prob_test = model.predict_proba(x_test)[:, 1]

    # evaluation
    auc_value = roc_auc_score(y_test, prob_test)
    print('AUC: {:.4f}'.format(auc_value))
    draw_roc(y_test, prob_test, opt.modelType)
Example #18
def TimeTaken(d):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        'rosenbrock', n_train=5000, d=d)
    print(np.shape(x_test[0]))
    print(np.shape(x_train[0]))
    time_init = time.time()
    Test_Error_Tree(x_train, y_train, x_test, y_test)
    time_end = time.time()
    return time_end - time_init
Example #19
def main(_):
    pp.pprint(flags.FLAGS.__flags)

    if not FLAGS.model_dir:
        print(" [-] Error: Model dir is not set!")
        exit(-1)

    if not os.path.exists(FLAGS.model_dir):
        print(" [*] Creating model directory...")
        os.makedirs(FLAGS.model_dir)

    with open(os.path.join(FLAGS.model_dir, "config.json"),
              'w') as config_file:
        config_file.write("%s" % (pp.pformat(flags.FLAGS.__flags)))

    # build model
    model = model_dict[FLAGS.model](vocab_size=FLAGS.vocab_size,
                                    size=FLAGS.cell_size,
                                    cell_type=FLAGS.cell)
    # load data
    print(" [*] Loading dataset...")
    train_data = data_utils.load_dataset(FLAGS.data_dir,
                                         FLAGS.dataset,
                                         FLAGS.vocab_size,
                                         FLAGS.max_nsteps,
                                         part="training")
    dev_data = data_utils.load_dataset(FLAGS.data_dir,
                                       FLAGS.dataset,
                                       FLAGS.vocab_size,
                                       FLAGS.max_nsteps,
                                       part="validation")
    print(" [+] Finish loading. Train set: %d, Dev set: %d" %
          (len(train_data), len(dev_data)))

    #model.train(train_data, dev_data, nb_epoch=FLAGS.epoch, batch_size=FLAGS.batch_size, model_dir=FLAGS.model_dir)
    model.batch_train(train_data,
                      dev_data,
                      nb_epoch=FLAGS.epoch,
                      batch_size=FLAGS.batch_size,
                      model_dir=FLAGS.model_dir,
                      evaluate_every=FLAGS.evaluate_every,
                      checkpoint_every=FLAGS.checkpoint_every)
Example #20
def convert_h5(data_dir, label_dir, data_split, f):

    if data_split:
        train_file_paths, test_file_paths = apply_split(
            data_split, data_dir, label_dir)
    else:
        raise ValueError('Please provide the split ratio')

    print("Training dataset size: ", len(train_file_paths))
    print("Testing dataset size: ", len(test_file_paths))

    # data_train = list of 3D numpy array of training volumes
    # label_train = list of 3D numpy array of training labels
    # _ = list of header of training volumes
    print("Loading and pre-processing Training data...")
    data_train, label_train, _ = du.load_dataset(train_file_paths)
    _write_h5(data_train, label_train, f, mode="train")

    print("Loading and pre-processing Testing data...")
    data_test, label_test, _ = du.load_dataset(test_file_paths)
    _write_h5(data_test, label_test, f, mode="test")
Example #21
def _svd_classification(dataset='mnist_small'):
    """
    svd on classification dataset

    Inputs:
        dataset: (str) name of dataset

    Outputs:
        accuracy on predicted values
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)

    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])

    X = np.ones((len(x_total), len(x_total[0]) + 1))
    X[:, 1:] = x_total

    U, S, Vh = np.linalg.svd(X)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([len(x_total) - len(S), len(S)])
    sig_inv = np.linalg.pinv(np.vstack([sig, filler]))

    # Compute weights
    w = Vh.T @ (sig_inv @ (U.T @ y_total))

    # Make test predictions
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test
    predictions = np.argmax(X_test @ w, axis=1)
    y_test = np.argmax(1 * y_test, axis=1)

    return (predictions == y_test).sum() / len(y_test)
Example #22
def _svd_regression(dataset='mauna_loa'):
    """
    svd on regression dataset

    Inputs:
        dataset: (str) name of dataset

    Outputs:
        RMSE on predicted values
    """
    if dataset == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            'rosenbrock', n_train=5000, d=2)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            dataset)

    x_total = np.vstack([x_train, x_valid])
    y_total = np.vstack([y_train, y_valid])

    X = np.ones((len(x_total), len(x_total[0]) + 1))
    X[:, 1:] = x_total

    U, S, Vh = np.linalg.svd(X)

    # Invert Sigma
    sig = np.diag(S)
    filler = np.zeros([len(x_total) - len(S), len(S)])
    sig_inv = np.linalg.pinv(np.vstack([sig, filler]))

    # Compute weights
    w = Vh.T @ (sig_inv @ (U.T @ y_total))

    # Make test predictions
    X_test = np.ones((len(x_test), len(x_test[0]) + 1))
    X_test[:, 1:] = x_test
    predictions = X_test @ w

    return _RMSE(y_test, predictions)
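Since no regularisation is applied here, the pseudo-inverse route is just the ordinary least-squares solution, so the weights could equally be obtained with NumPy's built-ins (shown only as a cross-check, not as the original code):

# Equivalent least-squares weights, up to numerical precision;
# X and y_total are the arrays assembled in _svd_regression.
w_lstsq, *_ = np.linalg.lstsq(X, y_total, rcond=None)
w_pinv = np.linalg.pinv(X) @ y_total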
Example #23
def predict_cross_val(dataset='mauna_loa', k=2, dist_metric='l2', v=5):
    """
    cross validation technique on knn and output predicted values

    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        v: (int) cross validation parameter, number of cross folds

    Outputs:
        [predict_x,GroundTruth_y,predicted_y]
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)

    x_train = np.vstack([x_valid, x_train])
    y_train = np.vstack([y_valid, y_train])

    np.random.seed(42)
    np.random.shuffle(x_train)
    np.random.seed(42)
    np.random.shuffle(y_train)

    data_partition = _partition_fold(v=v, data=x_train)
    predicted_y = np.empty((0, y_train.shape[-1]))
    for fold in range(v):
        print('------Processing Fold ' + str(fold + 1) + ' ------')
        train_x = np.delete(x_train, data_partition[fold], axis=0)
        train_y = np.delete(y_train, data_partition[fold], axis=0)
        query_x = np.take(x_train, data_partition[fold], axis=0)
        query_y = np.take(y_train, data_partition[fold], axis=0)

        curr_predict = _eval_knn([k, k + 1],
                                 train_x,
                                 train_y,
                                 query_x,
                                 query_y,
                                 dist_metric=dist_metric,
                                 compute_loss=False)
        #print(curr_predict.shape)
        predicted_y = np.append(predicted_y,
                                curr_predict['k=' + str(k)],
                                axis=0)

    rval = []
    for idx in range(x_train.shape[0]):
        rval.append((x_train[idx], y_train[idx], predicted_y[idx]))

    rval.sort(key=lambda tup: tup[0])
    return [i[0] for i in rval], [i[1] for i in rval], [i[2] for i in rval]
Example #24
def question1a():
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('iris')
    y_train, y_valid, y_test = y_train[:, (1,)], y_valid[:, (1,)], y_test[:, (1,)]
    learningRate = 0.0001
    maxIterations = 1000
    x_train, x_test = np.vstack((x_train, x_valid)), x_test
    y_train, y_test = np.vstack((y_train, y_valid)), y_test
    varianceList = [0.5, 1, 2]
    print("\nResults for question 1:\n")
    for variance in varianceList:
        margLikelihood, iterations, w, H = laplaceApproximation(x_train, x_test, y_train, y_test, learningRate, variance, maxIterations)
        print("For a variance of {}:".format(variance))
        print("Iterations = {}".format(iterations))
        print("Marginal log likelihood = {}\n".format(margLikelihood))
Example #25
def load_initial_dataset():
    dataset_folder = Path("../datasets/")
    try:
        # Try to load a cached version of the dataframe
        print("Trying to load the cached dataframe...")
        df = pd.read_pickle(dataset_folder / 'cached_dataframe.pkl2')
        print("Done")
    except Exception:
        print("No cached dataframe, loading the dataset from disk")
        path_file = dataset_folder / 'Cell_Phones_and_Accessories_5.json'
        df = load_dataset(path_file)
        # Store the dataframe on disk
        print("Caching the dataframe")
        df.to_pickle(dataset_folder / 'cached_dataframe.pkl2')
    return df
Example #26
def loadData(datasetName, d=2):
    '''
    Loads the dataset and normalizes the x sets
    INPUT: datasetName: a string naming the dataset to be loaded. Note that this file must be in the same directory as this script
    OUTPUT: 6 datasets in array form, 3 of which are normalized x data
    '''
    if datasetName == 'rosenbrock':
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            datasetName, n_train=1000, d=d)
    else:
        x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
            datasetName)

    x_all = np.concatenate([x_train, x_valid])
    y_all = np.concatenate([y_train, y_valid])
    index_all = list(range(np.shape(x_all)[0]))
    random.shuffle(index_all)

    # Normalization of the x data
    mean = x_all.mean(axis=0, keepdims=True)
    stddev = x_all.std(axis=0, keepdims=True)
    x_all = normalization(x_all, mean, stddev)
    x_test = normalization(x_test, mean, stddev)
    return index_all, x_all, x_test, y_all, y_test
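The normalization helper used above is not included in the excerpt; given how it is called, it is presumably per-feature standardisation, roughly:

def normalization_sketch(x, mean, stddev):
    """Assumed behaviour: zero-mean, unit-variance scaling per feature."""
    return (x - mean) / stddev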
Example #27
def _kd_tree(dataset='rosenbrock', dist_metric='l2', k=5, d=2):
    """
    knn using kd_tree

    Inputs:
        dataset: (str) name of dataset
        k: (int) number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'
        d: (int) data dimensionality

    Outputs:
        RMSE on predicted values
    """
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(
        dataset, n_train=5000, d=d)
    kdt = neighbors.KDTree(x_train)
    _, index = kdt.query(x_test, k=k)
    predictions = np.sum(y_train[index], axis=1) / k
    return _RMSE(y_test, predictions)
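For comparison, the same prediction (uniform averaging of the k nearest targets under Euclidean distance) can be obtained with scikit-learn's regressor directly; this is an alternative phrasing, not the original code:

from sklearn.neighbors import KNeighborsRegressor

# Equivalent to the KDTree query plus mean above.
knn = KNeighborsRegressor(n_neighbors=k, weights='uniform', algorithm='kd_tree')
predictions_sklearn = knn.fit(x_train, y_train).predict(x_test)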
Example #28
def _test_classification(dataset='iris', k_range=[1, 2], dist_metric='l1'):
    """
    run knn and output predicted values on classification test data

    Inputs:
        dataset: (str) name of dataset
        k_range: (list) k[0]:lower bound of number of nearest neighbours; k[1]:upper bound of number of nearest neighbours
        dist_metric: (str) 'l1' or 'l2'


    Outputs:
        accuracy of predicted values referred to GroundTruth
    """

    print('------Processing Dataset ' + dataset + ' ------')

    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)

    if y_train.dtype == np.dtype('bool'):
        y_train = _cast_TF(y_train)
        y_valid = _cast_TF(y_valid)
        y_test = _cast_TF(y_test)
    acc = []
    predicted = _eval_knn(k_range,
                          x_train,
                          y_train,
                          x_test,
                          y_test,
                          dist_metric,
                          compute_loss=False)
    for k in range(k_range[0], k_range[1]):
        curr_predict = predicted['k=' + str(k)]
        result = np.argmax(curr_predict, axis=1)
        gt = np.where(y_test == True, 1, 0)
        gt = np.argmax(gt, axis=1)
        #print(result-gt)
        #break

        unique, counts = np.unique(result - gt, return_counts=True)
        correct = dict(zip(unique, counts))[0]
        acc.append(correct / y_test.shape[0])

    return acc
Example #29
def create_test_train_fold(fold_num):
    """Splits the dataset into training and held-out test set."""
    data_x, data_y, _ = data_utils.load_dataset(FLAGS.dataset_name)
    tf.logging.info('Dataset: %s, Size: %d', FLAGS.dataset_name,
                    data_x.shape[0])
    tf.logging.info('Cross-val fold: %d/%d', FLAGS.fold_num, _N_FOLDS)
    # Get the training and test set based on the StratifiedKFold split
    (x_train_all, y_train_all), test_dataset = data_utils.get_train_test_fold(
        data_x,
        data_y,
        fold_num=fold_num,
        num_folds=_N_FOLDS,
        stratified=not FLAGS.regression)
    data_gen = data_utils.split_training_dataset(
        x_train_all,
        y_train_all,
        FLAGS.num_splits,
        stratified=not FLAGS.regression)
    return data_gen, test_dataset
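data_utils.get_train_test_fold is not part of the excerpt. A rough sketch of how a stratified fold selection could be implemented with scikit-learn (an assumption about the helper, not its actual code):

from sklearn.model_selection import KFold, StratifiedKFold

def get_train_test_fold_sketch(data_x, data_y, fold_num, num_folds, stratified=True):
    """Hypothetical helper: return ((x_train, y_train), (x_test, y_test)) for one fold."""
    splitter = StratifiedKFold(n_splits=num_folds) if stratified else KFold(n_splits=num_folds)
    train_idx, test_idx = list(splitter.split(data_x, data_y))[fold_num]
    return (data_x[train_idx], data_y[train_idx]), (data_x[test_idx], data_y[test_idx])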
Example #30
  def train(self, epoch=25, batch_size=1,
            learning_rate=0.0002, momentum=0.9, decay=0.95,
            data_dir="data", dataset_name="cnn", vocab_size=1000000):
    if not self.vocab:
      self.vocab, self.rev_vocab = load_vocab(data_dir, dataset_name, vocab_size)

    self.opt = tf.train.RMSPropOptimizer(learning_rate,
                                         decay=decay,
                                         momentum=momentum)

    for epoch_idx in xrange(epoch):
      data_loader = load_dataset(data_dir, dataset_name, vocab_size)

      contexts, questions, answers = [], [], []
      for batch_idx in xrange(batch_size):
        _, context, question, answer, _ = data_loader.next()
        contexts.append(context)
        questions.append(question)
        answers.append(answer)
Example #31
def run_Q1a(dataset='iris', lr=0.001):
    x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset(dataset)
    y_train, y_valid, y_test = y_train[:, (1,)], y_valid[:, (1,)], y_test[:, (1,)]

    x_train, x_test = np.vstack((x_train, x_valid)), x_test
    y_train, y_test = np.vstack((y_train, y_valid)), y_test
    var_list = [0.5, 1, 2]

    X_train = _generate_X(x_train)
    X_test = _generate_X(x_test)

    marginal_likelihoods, rval_w = {}, None

    for variance in var_list:

        w = np.zeros(np.shape(X_train[0]))

        x_prod = np.reshape(X_train @ w, np.shape(y_train))
        posterior_grad = _likelihood_grad(X_train, x_prod,
                                          y_train) + _prior_grad(w, variance)

        while 1:
            if max(posterior_grad) < 10**(-2): break
            x_prod = X_train @ w
            posterior_grad = _likelihood_grad(
                X_train, x_prod, y_train) + _prior_grad(w, variance)
            w = w + (lr * posterior_grad)
        hessian = _likelihood_hess(X_train, x_prod) + _prior_hess(w, variance)

        marginal_likelihoods[variance] = _log_likelihood(
            x_prod, y_train) + _log_prior(w, variance) - _log_g(hessian)
        if variance == 1: rval_w = w

    print(marginal_likelihoods)
    print(rval_w)
    return marginal_likelihoods, rval_w
Example #32
    def train(
        self,
        sess,
        vocab_size,
        epoch=25,
        learning_rate=0.0002,
        momentum=0.9,
        decay=0.95,
        data_dir="data",
        dataset_name="cnn",
    ):
        self.prepare_model(data_dir, dataset_name, vocab_size)

        start = time.clock()
        print(" [*] Calculating gradient and loss...")
        self.optim = tf.train.AdamOptimizer(learning_rate, 0.9).minimize(self.loss)
        print(" [*] Calculating gradient and loss finished. Take %.2fs" % (time.clock() - start))

        # Could not use RMSPropOptimizer because the sparse update of RMSPropOptimizer
        # is not implemented yet (2016.01.24).
        # self.optim = tf.train.RMSPropOptimizer(learning_rate,
        #                                        decay=decay,
        #                                        momentum=momentum).minimize(self.loss)

        sess.run(tf.initialize_all_variables())

        if self.load(sess, self.checkpoint_dir, dataset_name):
            print(" [*] Deep LSTM checkpoint is loaded.")
        else:
            print(" [*] There is no checkpoint for this model.")

        y = np.zeros([self.batch_size, self.vocab_size])

        merged = tf.merge_all_summaries()
        writer = tf.train.SummaryWriter("/tmp/deep", sess.graph_def)

        counter = 0
        start_time = time.time()
        for epoch_idx in xrange(epoch):
            data_loader = load_dataset(data_dir, dataset_name, vocab_size)

            batch_stop = False
            while True:
                y.fill(0)
                inputs, nstarts, answers = [], [], []
                batch_idx = 0
                while True:
                    try:
                        (_, document, question, answer, _), data_idx, data_max_idx = data_loader.next()
                    except StopIteration:
                        batch_stop = True
                        break

                    # [0] means splitter between d and q
                    data = (
                        [int(d) for d in document.split()]
                        + [0]
                        + [int(q) for q in question.split()]
                    )

                    if len(data) > self.max_nsteps:
                        continue

                    inputs.append(data)
                    nstarts.append(len(inputs[-1]) - 1)
                    y[batch_idx][int(answer)] = 1

                    batch_idx += 1
                    if batch_idx == self.batch_size:
                        break
                if batch_stop:
                    break

                FORCE = False
                if FORCE:
                    inputs = array_pad(inputs, self.max_nsteps, pad=-1, force=FORCE)
                    nstarts = np.where(inputs == -1)[1]
                    inputs[inputs == -1] = 0
                else:
                    inputs = array_pad(inputs, self.max_nsteps, pad=0)
                nstarts = [[nstart, idx, 0] for idx, nstart in enumerate(nstarts)]

                _, summary_str, cost, accuracy = sess.run(
                    [self.optim, merged, self.loss, self.accuracy],
                    feed_dict={self.inputs: inputs, self.nstarts: nstarts, self.y: y},
                )
                if counter % 10 == 0:
                    writer.add_summary(summary_str, counter)
                    print(
                        "Epoch: [%2d] [%4d/%4d] time: %4.4f, loss: %.8f, accuracy: %.8f"
                        % (epoch_idx, data_idx, data_max_idx, time.time() - start_time, np.mean(cost), accuracy)
                    )
                counter += 1
            self.save(sess, self.checkpoint_dir, dataset_name)