Example #1
import os

# DataGenerator, get_higgs_data, and _convert_higgs_data are provided by the
# surrounding project.
def get_higgs_generator(data_path,
                        train_size=1000000,
                        store="h5",
                        seed=0,
                        part="train",
                        patterns=True,
                        target=True):

    if store == "h5":

        if part == "train":
            fname = os.path.join(data_path,
                                 "higgs/HIGGS.train_%s.h5pd" % str(train_size))
        elif part == "test":
            fname = os.path.join(data_path, "higgs/HIGGS.test.h5pd")

        if not os.path.exists(fname):
            print(
                "Store for higgs data does not exist. Generating all stores ..."
            )
            _convert_higgs_data(data_path, train_size)

        if part == "test":
            chunksize = 250000
        else:
            if train_size <= 2000000:
                chunksize = 500000
            else:
                chunksize = 2000000

        return DataGenerator(fname=fname,
                             seed=seed,
                             patterns=patterns,
                             target=target,
                             chunksize=chunksize)

    elif store == "mem":

        X_train, y_train, X_test, y_test = get_higgs_data(
            data_path,
            train_size=train_size,
            shuffle_train=False,
            shuffle_test=False)

        data = {}
        if part == "train":
            data['X'] = X_train
            data['y'] = y_train
        else:
            data['X'] = X_test
            data['y'] = y_test

        return DataGenerator(data=data,
                             seed=seed,
                             patterns=patterns,
                             target=target,
                             chunksize=10000000)
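A minimal usage sketch, assuming the higgs stores live under a hypothetical
/path/to/data directory (the first call generates the missing h5pd stores via
_convert_higgs_data):

train_gen = get_higgs_generator("/path/to/data", train_size=1000000,
                                store="h5", part="train")
X_train, y_train = train_gen.get_all()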
Example #2
import os

# DataGenerator, get_covtype_data, and _convert_datasets are provided by the
# surrounding project.
def get_covtype_generator(data_path,
                          train_size=100000,
                          store="h5",
                          seed=0,
                          part="train",
                          patterns=True,
                          target=True):

    if store == "h5":

        if part == "train":
            fname = os.path.join(
                data_path,
                "covtype/covtype-train-1_%s.csv.h5pd" % str(train_size))
        elif part == "test":
            fname = os.path.join(data_path, "covtype/covtype-test-1.csv.h5pd")

        if not os.path.exists(fname):
            print(
                "Store for covtype data does not exist. Generating all stores ..."
            )
            _convert_datasets(data_path, train_size)

        return DataGenerator(fname=fname,
                             seed=seed,
                             patterns=patterns,
                             target=target,
                             chunksize=200000)

    elif store == "mem":

        X_train, y_train, X_test, y_test = get_covtype_data(
            data_path,
            train_size=train_size,
            shuffle_train=False,
            shuffle_test=False)

        data = {}
        if part == "train":
            data['X'] = X_train
            data['y'] = y_train
        else:
            data['X'] = X_test
            data['y'] = y_test

        return DataGenerator(data=data,
                             seed=seed,
                             patterns=patterns,
                             target=target,
                             chunksize=200000)
Example #3
import os
import shutil

# DataGenerator and _convert_datasets are provided by the surrounding project.
def get_artificial_generator(data_path,
                             size=1000,
                             seed=0,
                             part="train",
                             store="h5",
                             patterns=True,
                             target=True):

    if part == "train":
        fname = os.path.join(data_path,
                             "artificial/train_" + str(size) + ".h5pd")
    elif part == "test":
        fname = os.path.join(data_path,
                             "artificial/test_" + str(size) + ".h5pd")

    # Remove any stale store so the data are regenerated below.
    try:
        shutil.rmtree(fname)
    except OSError:
        pass

    if not os.path.exists(fname):
        print(
            "Store for artificial data does not exist. Generating all stores ..."
        )
        _convert_datasets(data_path, size=size, seed=seed)

    return DataGenerator(fname=fname,
                         seed=seed,
                         patterns=patterns,
                         target=target,
                         chunksize=200000)
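Note that, unlike the other loaders, this one removes any existing store
first, so the artificial data are regenerated on every call. A minimal usage
sketch (hypothetical path):

gen = get_artificial_generator("/path/to/data", size=1000, seed=0, part="train")
X, y = gen.get_all()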
Example #4
import os

# DataGenerator and check_and_download are provided by the surrounding project.
def get_landsat_generator(data_path,
                          train_size=10000000,
                          data_set="LC81950212016133LGN00",
                          version="1_1",
                          seed=0,
                          part="train",
                          store=None,
                          patterns=True,
                          target=True,
                          chunksize=5000000):

    assert version in ["1_1", "3_3", "pan_1_1", "pan_3_3"]

    if part == "train":
        fname = os.path.join(data_path, "landsat",
                             str(data_set) + "_" + version + ".train.h5pd")
    elif part == "test":
        fname = os.path.join(data_path, "landsat",
                             str(data_set) + "_" + version + ".test.h5pd")
    check_and_download(fname)

    return DataGenerator(fname=fname,
                         seed=seed,
                         patterns=patterns,
                         target=target,
                         chunksize=chunksize)
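A minimal usage sketch (hypothetical path; check_and_download is expected to
fetch the missing h5pd store):

gen = get_landsat_generator("/path/to/data", version="1_1", part="test")
X_test, y_test = gen.get_all()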
Example #5
import os
import json
import time

# DataGenerator, evaluate, ensure_dir_for_file, and the params module are
# provided by the surrounding project.
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":

        # TODO: Download the files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)

    else:
        raise Exception("Unknown data set!")

    Xtrain, ytrain = traingen.get_all()
    Xtest, ytest = testgen.get_all()

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
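The param dictionary must provide the keys read above; a minimal,
hypothetical invocation could look as follows:

param = {
    'n_estimators': 24,
    'max_features': "sqrt",
    'n_jobs': 4,
    'bootstrap': True,
    'tree_type': "standard",  # "randomized" selects ExtraTreesClassifier
}
single_run("landsat", train_size=1000000, param=param, seed=0)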
Example #6
import os
import json
import time

import numpy

# DataGenerator, SubsetWoodClassifier, DiskStore, evaluate,
# ensure_dir_for_file, and the params module are provided by the
# surrounding project.
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/subsetwood"

    if dkey == "landsat":

        # TODO: Download the files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        
        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)
    
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    
    # subset size used for the top trees
    n_subset = 500000

    model = SubsetWoodClassifier(
                n_estimators=param['n_estimators'],
                criterion="gini",
                max_features=param['max_features'],
                min_samples_split=2,
                n_jobs=param['n_jobs'],
                seed=seed,
                bootstrap=param['bootstrap'],
                tree_traversal_mode="dfs",
                tree_type=param['tree_type'],
                min_samples_leaf=1,
                float_type="double",
                max_depth=None,
                verbose=1,
                odir=tmp_dir,
                store=DiskStore())

    # training
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    
    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['subset'] = model.get_training_times()['subset']
    
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    
    print("Evaluating test error ...")

    ytest = testgen.get_all_target()            
    ytrain = traingen.get_all_target()            
    ytrain = ytrain.astype(numpy.int64)
    ytest = ytest.astype(numpy.int64)
    ypred_test = ypred_test.astype(numpy.int64)
    evaluate(ypred_test, ytest, results, "testing")

    print("Training distribution")
    print(numpy.bincount(ytrain))

    print("Test distribution")
    print(numpy.bincount(ytest))

    print("Predict distribution")
    print(numpy.bincount(ypred_test))
    
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         "subsetwood_" + str(n_subset), fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
Example #7
import os
import json
import math
import time

import numpy

# DataGenerator, evaluate, ensure_dir_for_file, and the params module are
# provided by the surrounding project.
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":

        # TODO: Download the files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)

        # TODO: Adapt paths accordingly
        fname_train_csv = "tmp/landsat_train_small_%d.csv" % train_size
        fname_test_csv = "tmp/landsat_test.csv"

    traingen.to_csv(fname_train_csv, cache=False, remove=True)
    testgen.to_csv(fname_test_csv, cache=False, remove=True)

    import h2o
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()

    if dkey == "landsat_small" or dkey == "landsat":
        train_df = h2o.import_file(fname_train_csv)
        test_df = h2o.import_file(fname_test_csv)
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))

    if not param['bootstrap']:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        # nbins=1000,  # disabled: caused crashes
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
Example #8
import os
import json
import time

# DataGenerator, WoodClassifier, HugeWoodClassifier, DiskStore, evaluate,
# ensure_dir_for_file, and the params module are provided by the surrounding
# project.
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":

        # TODO: Download the files manually if needed (255 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=10000000)

    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=params.seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']

    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
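Here param nests the parameters of the wrapped trees under 'param_wood'; a
minimal, hypothetical invocation could look as follows:

param = {
    'n_estimators': 4,             # top-level HugeWood estimators
    'n_estimators_bottom': 6,      # estimators per bottom forest
    'param_wood': {
        'n_estimators': 24,        # only used for the output filename above
        'max_features': None,
        'n_jobs': 4,
        'bootstrap': True,
        'tree_type': "standard",
    },
}
single_run("landsat", train_size=10000000, param=param, seed=0)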