Example #1
    def create_dataset(self, container_key, dkey, data):

        ensure_dir_for_file(container_key)
        s = h5py.File(container_key, 'a', driver="sec2", libver='latest')

        # Create a 2D dataset that can later be resized along axis 0.
        dset = s.create_dataset(dkey, data.shape,
                                maxshape=(None, data.shape[1]),
                                compression="lzf")
        dset[:, :] = data

        s.close()
Example #2
    def append_to_dataset(self, container_key, dkey, data):

        ensure_dir_for_file(container_key)
        s = h5py.File(container_key, 'a', driver="sec2", libver='latest')

        offset = 0

        if dkey not in s:
            # First chunk: create a dataset that is resizable along axis 0.
            dset = s.create_dataset(dkey, data.shape,
                                    maxshape=(None, data.shape[1]),
                                    compression="lzf")
        else:
            # Subsequent chunks: grow the dataset and write past the old rows.
            dset = s[dkey]
            offset += dset.shape[0]
            dset.resize(dset.shape[0] + data.shape[0], axis=0)

        dset[offset:, :] = data

        s.close()
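Taken together, the two methods implement a create-then-grow pattern for chunked HDF5 storage. A minimal usage sketch, assuming a hypothetical owning class Store that exposes both methods:

import numpy

store = Store()
chunk = numpy.random.rand(1000, 10)
store.create_dataset("out/data.h5", "features", chunk)     # initial 1000 x 10 dataset
store.append_to_dataset("out/data.h5", "features", chunk)  # grows it to 2000 rows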
Example #3
def single_run(dkey, train_size, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        Xtrain, ytrain, Xtest, ytest = covtype(train_size=train_size,
                                               seed=seed)
    elif dkey == "higgs":
        Xtrain, ytrain, Xtest, ytest = higgs(train_size=train_size, seed=seed)
    elif dkey == "susy":
        Xtrain, ytrain, Xtest, ytest = susy(train_size=train_size, seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF
    else:
        raise Exception("Unknown tree type!")

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
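An invocation sketch for this variant; the param keys mirror exactly those read inside the function, while the values are illustrative only:

param = {
    "n_estimators": 24,
    "max_features": "sqrt",
    "n_jobs": 1,
    "bootstrap": False,
    "tree_type": "standard",
}
single_run("covtype", train_size=100000, param=param, seed=0)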
Example #4
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":

        # TODO: Download the files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)

    else:
        raise Exception("Unknown data set!")

    Xtrain, ytrain = traingen.get_all()
    Xtest, ytest = testgen.get_all()

    print("")
    print("Number of training patterns:\t%i" % Xtrain.shape[0])
    print("Number of test patterns:\t%i" % Xtest.shape[0])
    print("Dimensionality of the data:\t%i\n" % Xtrain.shape[1])

    if param['tree_type'] == "randomized":
        from sklearn.ensemble import ExtraTreesClassifier as RF
    elif param['tree_type'] == "standard":
        from sklearn.ensemble import RandomForestClassifier as RF
    else:
        raise Exception("Unknown tree type!")

    model = RF(n_estimators=param['n_estimators'],
               criterion="gini",
               max_features=param['max_features'],
               min_samples_split=2,
               n_jobs=param['n_jobs'],
               random_state=seed,
               bootstrap=param['bootstrap'],
               min_samples_leaf=1,
               max_depth=None,
               verbose=0)

    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    # training
    fit_start_time = time.time()
    model.fit(Xtrain, ytrain)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(Xtrain)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(Xtest)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(ypreds_train, ytrain, results, "training")
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "sk", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
Example #5
def single_run(dkey, train_size, n_bottom, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, n_bottom %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(n_bottom), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem",
                                               seed=seed)
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem",
                                             seed=seed)
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem",
                                            seed=seed)
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=int(24 / n_bottom),
        n_estimators_bottom=int(n_bottom),
        n_top="auto",
        n_patterns_leaf=75000,
        balanced_top_tree=True,
        top_tree_lambda=1.0,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        store=MemoryStore(),
    )

    # training
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         str(n_bottom), "hugewood_75K", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()

    time.sleep(1)
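In this variant the forest parameters are nested under param['param_wood'], and n_bottom should divide 24, since the ensemble is split into int(24 / n_bottom) top-level estimators. A matching call might look like this (values illustrative):

param = {
    "param_wood": {
        "n_estimators": 24,
        "max_features": "sqrt",
        "n_jobs": 1,
        "bootstrap": False,
        "tree_type": "standard",
    }
}
single_run("covtype", train_size=100000, n_bottom=4, param=param, seed=0)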
Example #6
def single_run(dkey, train_size, param, seed, profile=False):
                
    print("Processing data set %s with train_size %s and parameters %s ..." % (str(dkey), str(train_size), str(param)))
    
    tmp_dir = "tmp/subsetwood"
    
    if dkey == "landsat":

        # TODO: Download the files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        
        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)
    
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])
    
    # set to the top tree size
    n_subset = 500000

    model = SubsetWoodClassifier(
                n_estimators=param['n_estimators'],
                criterion="gini",
                max_features=param['max_features'],
                min_samples_split=2,
                n_jobs=param['n_jobs'],
                seed=seed,
                bootstrap=param['bootstrap'],
                tree_traversal_mode="dfs",
                tree_type=param['tree_type'],
                min_samples_leaf=1,
                float_type="double",
                max_depth=None,
                verbose=1,
                odir=tmp_dir,
                store=DiskStore())

    # training
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")
                
    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    
    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()
    
    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['subset'] = model.get_training_times()['subset']
    
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])
    
    print("Evaluating test error ...")

    ytest = testgen.get_all_target()
    ytrain = traingen.get_all_target()
    ytrain = ytrain.astype(numpy.int64)
    ytest = ytest.astype(numpy.int64)
    ypred_test = ypred_test.astype(numpy.int64)
    evaluate(ypred_test, ytest, results, "testing")

    print("Training distribution")
    print(numpy.bincount(ytrain))

    print("Test distribution")
    print(numpy.bincount(ytest))

    print("Predict distribution")
    print(numpy.bincount(ypred_test))
    
    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size),
                         "subsetwood_" + str(n_subset), fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    
    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
Example #7
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    if dkey == "landsat":

        # TODO: Download the files manually if needed (9.7 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        # TODO: Adapt paths accordingly
        fname_train = "data/landsat_train_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=20000000)

        # TODO: Adapt paths accordingly
        fname_train_csv = "tmp/landsat_train_small_%d.csv" % train_size
        fname_test_csv = "tmp/landsat_test.csv"

    else:
        raise Exception("Unknown data set!")

    traingen.to_csv(fname_train_csv, cache=False, remove=True)
    testgen.to_csv(fname_test_csv, cache=False, remove=True)

    import h2o
    from skutil.h2o import h2o_col_to_numpy
    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    if dkey == "landsat_small" or dkey == "landsat":
        train_df = h2o.import_file(fname_train_csv)
        test_df = h2o.import_file(fname_test_csv)
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))
    else:
        raise Exception("Unknown max_features setting!")

    if not param['bootstrap']:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        # nbins=1000,  # crash
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)
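For reference, the mtries branch above maps scikit-learn's max_features convention onto H2O's mtries argument; a standalone sketch of that arithmetic, with an assumed frame width:

import math

n_cols = 12                               # assumed frame width, including the label column
mtries_all = n_cols - 2                   # max_features=None: (nearly) all predictors
mtries_sqrt = int(math.sqrt(n_cols - 2))  # max_features="sqrt"
print(mtries_all, mtries_sqrt)            # -> 10 3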
Example #8
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    tmp_dir = "tmp/hugewood"

    if dkey == "landsat":

        # TODO: Download the files manually if needed (255 GB and 524 MB):
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_train.h5pd
        # wget https://sid.erda.dk/share_redirect/GsVMKksFSk/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd

        fname_train = "data/landsat_train.h5pd"
        fname_test = "data/landsat_test_LC08_L1TP_196022_20150415_20170409_01_T1_test_random_row_0.050000.h5pd"

        traingen = DataGenerator(fname=fname_train,
                                 seed=seed,
                                 patterns=True,
                                 target=True,
                                 chunksize=1000000,
                                 n_lines_max=train_size)
        testgen = DataGenerator(fname=fname_test,
                                seed=seed,
                                patterns=True,
                                target=True,
                                chunksize=1000000,
                                n_lines_max=10000000)

    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    param_wood = param['param_wood']

    wood = WoodClassifier(n_estimators=1,
                          criterion="gini",
                          max_features=param_wood['max_features'],
                          min_samples_split=2,
                          n_jobs=param_wood['n_jobs'],
                          seed=params.seed,
                          bootstrap=param_wood['bootstrap'],
                          tree_traversal_mode="dfs",
                          tree_type=param_wood['tree_type'],
                          min_samples_leaf=1,
                          float_type="double",
                          max_depth=None,
                          verbose=0)

    model = HugeWoodClassifier(
        n_estimators=param['n_estimators'],
        n_estimators_bottom=param['n_estimators_bottom'],
        n_top="auto",
        n_patterns_leaf="auto",
        balanced_top_tree=True,
        top_tree_max_depth=None,
        top_tree_type="standard",
        top_tree_leaf_stopping_mode="ignore_impurity",
        n_jobs=param_wood['n_jobs'],
        seed=params.seed,
        verbose=1,
        plot_intermediate={},
        chunk_max_megabytes=2048,
        wrapped_instance=wood,
        odir=tmp_dir,
        store=DiskStore(),
    )

    # training
    if profile:
        import yep
        assert param_wood['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen)
    fit_end_time = time.time()
    if profile:
        yep.stop()

    # testing
    print("Computing predictions ...")
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    results['total'] = model.get_training_times()['total']
    results['retrieve'] = model.get_training_times()['retrieve']
    results['top'] = model.get_training_times()['top']
    results['distribute'] = model.get_training_times()['distribute']
    results['bottom'] = model.get_training_times()['bottom']

    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    print("Evaluating test error ...")
    ytest = testgen.get_all_target()
    evaluate(ypred_test, ytest, results, "testing")

    fname = '%s_%s_%s_%s_%s.json' % (
        str(param_wood['n_estimators']),
        str(param_wood['max_features']),
        str(param_wood['n_jobs']),
        str(param_wood['bootstrap']),
        str(param_wood['tree_type']),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "hugewood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()
    time.sleep(1)
Example #9
def single_run(dkey, train_size, param, seed, profile=False):

    print(
        "Processing data set %s with train_size %s, seed %s, and parameters %s ..."
        % (str(dkey), str(train_size), str(seed), str(param)))

    if dkey == "covtype":
        traingen, testgen = covtype_generators(train_size=train_size,
                                               store="mem",
                                               seed=seed)
        n_subset = 50000
    elif dkey == "higgs":
        traingen, testgen = higgs_generators(train_size=train_size,
                                             store="mem",
                                             seed=seed)
        n_subset = 500000
    elif dkey == "susy":
        traingen, testgen = susy_generators(train_size=train_size,
                                            store="mem",
                                            seed=seed)
        n_subset = 500000
    else:
        raise Exception("Unknown data set!")

    print("")
    print("Number of training patterns:\t%i" % traingen.get_shapes()[0][0])
    print("Number of test patterns:\t%i" % testgen.get_shapes()[0][0])
    print("Dimensionality of the data:\t%i\n" % traingen.get_shapes()[0][1])

    model = SubsetWoodClassifier(n_estimators=param['n_estimators'],
                                 criterion="gini",
                                 max_features=param['max_features'],
                                 min_samples_split=2,
                                 n_jobs=param['n_jobs'],
                                 seed=seed,
                                 bootstrap=param['bootstrap'],
                                 tree_traversal_mode="dfs",
                                 tree_type=param['tree_type'],
                                 min_samples_leaf=1,
                                 float_type="double",
                                 max_depth=None,
                                 verbose=1,
                                 store=MemoryStore())

    # training
    if profile:
        import yep
        assert param['n_jobs'] == 1
        yep.start("train.prof")

    fit_start_time = time.time()
    model.fit(traingen, n_subset=n_subset)
    fit_end_time = time.time()
    if profile:
        yep.stop()
    ypreds_train = model.predict(generator=traingen)

    # testing
    test_start_time = time.time()
    ypred_test = model.predict(generator=testgen)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:\t\t%f" % results['training_time'])
    print("Testing time:\t\t%f" % results['testing_time'])

    evaluate(ypreds_train, traingen.get_all_target(), results, "training")
    evaluate(ypred_test, testgen.get_all_target(), results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )
    fname = os.path.join(params.odir, str(dkey), str(train_size), "subsetwood",
                         fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)

    del testgen
    del traingen
    model.cleanup()

    time.sleep(1)
Example #10
def single_run(dkey, train_size, param, seed, profile=False):

    print("Processing data set %s with train_size %s and parameters %s ..." %
          (str(dkey), str(train_size), str(param)))

    import h2o
    from skutil.h2o import h2o_col_to_numpy
    h2o.init(max_mem_size="12G", nthreads=param['n_jobs'])
    h2o.remove_all()
    from h2o.estimators.random_forest import H2ORandomForestEstimator

    # get and convert data
    if dkey == "covtype":
        fname_train, fname_test = covtype_files(train_size=train_size)
        train_df = h2o.import_file(fname_train)
        test_df = h2o.import_file(fname_test)
        Xcols, ycol = train_df.col_names[:-1], train_df.col_names[-1]
    elif dkey == "higgs":
        fname_train, fname_test = higgs_files(train_size=train_size)
        train_df = h2o.import_file(fname_train)
        test_df = h2o.import_file(fname_test)
        Xcols, ycol = train_df.col_names[1:], train_df.col_names[0]
    elif dkey == "susy":
        fname_train, fname_test = susy_files(train_size=train_size)
        train_df = h2o.import_file(fname_train)
        test_df = h2o.import_file(fname_test)
        Xcols, ycol = train_df.col_names[1:], train_df.col_names[0]

    print("")
    print("Number of training patterns:\t%i" % train_df.shape[0])
    print("Number of test patterns:\t%i" % test_df.shape[0])
    print("Dimensionality of the data:\t%i\n" % train_df.shape[1])

    if param['max_features'] is None:
        mtries = train_df.shape[1] - 2
    elif param['max_features'] == "sqrt":
        mtries = int(math.sqrt(train_df.shape[1] - 2))
    else:
        raise Exception("Unknown max_features setting!")

    if not param['bootstrap']:
        sample_rate = 1.0
    else:
        sample_rate = 0.632

    model = H2ORandomForestEstimator(
        mtries=mtries,
        sample_rate=sample_rate,
        # nbins=1000,  # crash
        min_rows=1,
        build_tree_one_node=True,
        max_depth=20,
        balance_classes=False,
        ntrees=param['n_estimators'],
        seed=seed)

    # training
    fit_start_time = time.time()
    model.train(Xcols, ycol, training_frame=train_df)
    fit_end_time = time.time()
    ypreds_train = model.predict(train_df)

    # testing
    test_start_time = time.time()
    ypreds_test = model.predict(test_df)
    test_end_time = time.time()

    results = {}
    results['dataset'] = dkey
    results['param'] = param
    results['training_time'] = fit_end_time - fit_start_time
    results['testing_time'] = test_end_time - test_start_time
    print("Training time:     %f" % results['training_time'])
    print("Testing time:      %f" % results['testing_time'])

    evaluate(numpy.rint(ypreds_train.as_data_frame().values),
             train_df[ycol].as_data_frame().values, results, "training")
    evaluate(numpy.rint(ypreds_test.as_data_frame().values),
             test_df[ycol].as_data_frame().values, results, "testing")

    fname = '%s_%s_%s_%s_%s_%s.json' % (
        str(param['n_estimators']),
        str(param['max_features']),
        str(param['n_jobs']),
        str(param['bootstrap']),
        str(param['tree_type']),
        str(seed),
    )

    fname = os.path.join(params.odir, str(dkey), str(train_size), "h2", fname)
    ensure_dir_for_file(fname)
    with open(fname, 'w') as fp:
        json.dump(results, fp)