Example #1
File: solvers.py  Project: 121onto/noaa
def fit_random_msgd_early_stopping(x, datasets, outpath, n_batches, batch_size,
                                   models, classifier, n_epochs=1000,
                                   patience=5000, patience_increase=2,
                                   improvement_threshold=0.995):
    """Run minibatch SGD via `tn_model` with patience-based early stopping,
    evaluating `v_model` periodically; returns (best_validation_loss,
    best_iter, epoch, elapsed seconds)."""

    # unpack parameters
    [(tn_x, tn_y), (v_x, v_y), (tt_x, tt_y)] = datasets
    n_tn_batches, n_v_batches, n_tt_batches = n_batches
    tn_model, v_model = models

    validation_frequency = min(n_tn_batches, patience//20)  # validate at least once per epoch

    # initialize some variables
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    # main loop
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_tn_batches):
            finished = prompt_for_quit_or_timeout(msg='Stop learning?', timeout=5)
            if finished:
                end_time = timeit.default_timer()
                return best_validation_loss, best_iter, epoch, (end_time - start_time)

            minibatch = tn_x.get_value(borrow=True)[
                minibatch_index * batch_size:(minibatch_index + 1) * batch_size
            ]
            minibatch = transform_images(minibatch)
            minibatch_avg_cost = tn_model(minibatch, minibatch_index)

            iter = (epoch - 1) * n_tn_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [v_model(i) for i in xrange(n_v_batches)]
                this_validation_loss = np.mean(validation_losses)
                print(
                    'epoch %i, minibatch %i/%i, validation error %f, minibatch average cost %f' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_tn_batches,
                        this_validation_loss,
                        minibatch_avg_cost
                    )
                )

                if this_validation_loss < best_validation_loss:
                    # a significant improvement extends the patience window
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    #best_params = copy.deepcopy(params)
                    best_validation_loss = this_validation_loss
                    best_iter = iter

            else:
                print(
                    'epoch %i, minibatch %i/%i, minibatch average cost %f' %
                    (
                        epoch,
                        minibatch_index + 1,
                        n_tn_batches,
                        minibatch_avg_cost
                    )
                )


            if patience <= iter:
                done_looping = True
                break

        if outpath is not None:
            classifier.save_params(path=outpath)

    end_time = timeit.default_timer()
    return best_validation_loss, best_iter, epoch, (end_time - start_time)
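
The patience logic above can be hard to follow inside the full training loop. The snippet below is a minimal, self-contained sketch of the same rule driven by a synthetic validation-loss sequence; all values are made up for illustration and nothing here comes from the project itself.

import numpy as np

# Illustration only: patience-based early stopping with synthetic losses.
patience = 50
patience_increase = 2
improvement_threshold = 0.995
best_loss = np.inf

# losses improve for 30 iterations, then plateau
losses = np.concatenate([np.linspace(1.0, 0.2, 30), np.full(200, 0.2)])
for it, loss in enumerate(losses):
    if loss < best_loss:
        if loss < best_loss * improvement_threshold:
            # significant improvement: allow training to run longer
            patience = max(patience, it * patience_increase)
        best_loss = loss
    if patience <= it:
        print('stopping at iteration %d, best loss %f' % (it, best_loss))
        break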
Example #2
File: preproc.py  Project: 121onto/noaa
def build_memmap_arrays(
        csv_path=os.path.join(BASE_DIR, 'data/train.csv'),
        img_path=os.path.join(BASE_DIR, 'data/imgs-proc/'),
        out_path=os.path.join(BASE_DIR, 'data/memmap/'),
        image_size=3*300*300):
    """Write preprocessed images into memory-mapped train/validation/test
    arrays; labelled images are split 4044/500 between train and validation,
    unlabelled images go to the test array."""

    train = pd.read_csv(csv_path)
    train['whaleID'] = train['whaleID'].astype('category')
    train['whaleID'] = train['whaleID'].cat.codes
    labels_dict = dict(zip(train.Image, train.whaleID))

    tn_x_path = os.path.join(out_path, 'tn_x.dat')
    tn_y_path = os.path.join(out_path, 'tn_y.dat')
    v_x_path = os.path.join(out_path, 'v_x.dat')
    v_y_path = os.path.join(out_path, 'v_y.dat')
    tt_x_path = os.path.join(out_path, 'tt_x.dat')
    tt_y_path = os.path.join(out_path, 'tt_y.dat')

    tn_x = np.memmap(tn_x_path, dtype=theano.config.floatX, mode='w+', shape=(4044,image_size))
    tn_y = np.memmap(tn_y_path, dtype=theano.config.floatX, mode='w+', shape=(4044,))
    v_x = np.memmap(v_x_path, dtype=theano.config.floatX, mode='w+', shape=(500,image_size))
    v_y = np.memmap(v_y_path, dtype=theano.config.floatX, mode='w+', shape=(500,))
    tt_x = np.memmap(tt_x_path, dtype=theano.config.floatX, mode='w+', shape=(6925,image_size))
    tt_y = np.memmap(tt_y_path, dtype=theano.config.floatX, mode='w+', shape=(6925,))

    # randomly allocate 500 samples to the validation dataset
    v_batch = np.random.choice(range(4544), size=500, replace=False)
    a_idx = 0
    tn_idx = 0
    v_idx = 0
    tt_idx = 0

    terminate = False
    for idx, file in enumerate(sorted(os.listdir(img_path))):
        if not file.startswith('w_'):
            continue
        if idx % 1000 == 0:
            print(file)
            np.memmap.flush(tn_x)
            np.memmap.flush(v_x)
            np.memmap.flush(tt_x)
            terminate = prompt_for_quit_or_timeout()
            if terminate:
                print('Exiting gracefully...')
                break
            else:
                print('Program will continue...')
        if file.endswith('.jpg'):
            with open(os.path.join(img_path,file), 'rb') as f:
                im = Image.open(f)
                im = np.asarray(im).T.flatten()

                if file in labels_dict:
                    if a_idx in v_batch:
                        v_x[v_idx,:] = im[:]
                        v_y[v_idx] = labels_dict[file]
                        v_idx += 1
                        a_idx += 1
                    else:
                        tn_x[tn_idx,:] = im[:]
                        tn_y[tn_idx] = labels_dict[file]
                        tn_idx += 1
                        a_idx += 1
                else:
                    tt_x[tt_idx,:] = im[:]
                    tt_idx += 1

    if terminate:
        sys.exit('')
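
To consume the arrays written above, they can be re-opened as read-only memmaps with the same shapes. The snippet below is a minimal sketch under two assumptions: the hard-coded sizes used in build_memmap_arrays, and theano.config.floatX being float32; out_path here is a placeholder for wherever the .dat files were written.

import os
import numpy as np

out_path = 'data/memmap/'          # hypothetical location, mirrors out_path above
image_size = 3 * 300 * 300

# open the training arrays without loading them into RAM
tn_x = np.memmap(os.path.join(out_path, 'tn_x.dat'), dtype='float32', mode='r',
                 shape=(4044, image_size))
tn_y = np.memmap(os.path.join(out_path, 'tn_y.dat'), dtype='float32', mode='r',
                 shape=(4044,))

print(tn_x.shape, tn_y.shape)      # (4044, 270000) (4044,)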