예제 #1
0
def main():
    """Run the GPU ETL over the configured quarters, then train XGBoost.

    For each running quarter number from 1 to ``count_quarter_processing``
    (inclusive) the matching ``Performance_<year>Q<quarter>.txt`` file under
    ``perf_data_path`` is passed to ``run_gpu_workflow``.  Running quarter 1
    is 2000Q1, quarter 4 is 2000Q4, quarter 5 is 2001Q1, and so on.

    Every ETL result is converted with ``DataFrame.from_arrow``; only the
    first one is turned into a pandas frame and used to train the model,
    with ``delinquency_12`` as the label column.

    Elapsed times for the ETL stage and for the training stage are printed.
    """
    gpu_dfs = []
    perf_format_path = perf_data_path + "/Performance_%sQ%s.txt"

    # ETL stage ##############################################################
    time_ETL = time.time()
    for quarter_number in range(1, count_quarter_processing + 1):
        # Shift to a 0-based index before dividing so that every fourth
        # quarter maps to Q4 of the current year rather than to a
        # non-existent "Q0" of the following year.
        year_offset, quarter_index = divmod(quarter_number - 1, 4)
        year = 2000 + year_offset
        quarter = quarter_index + 1
        perf_file = perf_format_path % (str(year), str(quarter))
        gpu_dfs.append(
            run_gpu_workflow(year=year, quarter=quarter, perf_file=perf_file))

    time_ETL_end = time.time()
    print("ETL time: ", time_ETL_end - time_ETL)
    ##########################################################################
    dxgb_gpu_params = {
        'nround': 100,
        'max_depth': 8,
        'max_leaves': 2**8,
        'alpha': 0.9,
        'eta': 0.1,
        'gamma': 0.1,
        'learning_rate': 0.1,
        'subsample': 1,
        'reg_lambda': 1,
        'scale_pos_weight': 2,
        'min_child_weight': 30,
        'tree_method': 'gpu_hist',
        'n_gpus': 1,
        # 'distributed_dask':  True,
        'loss': 'ls',
        'objective': 'gpu:reg:linear',
        'max_features': 'auto',
        'criterion': 'friedman_mse',
        'grow_policy': 'lossguide',
        'verbose': True
    }

    gpu_dfs = [DataFrame.from_arrow(gpu_df) for gpu_df in gpu_dfs]

    # Only the first processed quarter is used as training data.
    pd_df = gpu_dfs[0].to_pandas()

    y = pd_df["delinquency_12"]
    x = pd_df.drop(["delinquency_12"], axis=1)

    dtrain = xgb.DMatrix(x, y)

    bst = xgb.train(dxgb_gpu_params,
                    dtrain,
                    num_boost_round=dxgb_gpu_params['nround'])
    time_ML_train_end = time.time()
    # The reported interval starts at the end of the ETL stage, so it also
    # covers the Arrow/pandas conversions and DMatrix construction above.
    print("Machine learning - train: ", time_ML_train_end - time_ETL_end)
예제 #2
0
def test_datetime_to_arrow(dtype):
    """Check that a timestamp column cast to ``dtype`` survives an Arrow round trip."""
    # Build the sample series from 2000-01-01 to 2000-01-02 at 3600s frequency.
    series_frame = cudf.datasets.timeseries(
        start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={})
    # Pull the index out as a regular "timestamp" column, then drop the
    # leftover index so the column is positionally indexed.
    timestamp = series_frame.reset_index()["timestamp"]
    timestamp = timestamp.reset_index(drop=True)

    gdf = DataFrame({"timestamp": timestamp.astype(dtype)})
    arrow_table = gdf.to_arrow(preserve_index=False)
    round_tripped = DataFrame.from_arrow(arrow_table)
    assert_eq(gdf, round_tripped)