Example #1
def main():
    global args, DEBUG
    args = parser.parse_args()
    DEBUG = args.debug
    print_debug(DEBUG)
    for dataset in ['test', 'train']:
        do_dataset(dataset)
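Every main() in these examples references a module-level argparse parser and a print_debug helper defined elsewhere in the repository. A minimal sketch of what they might look like, assuming the flag names used across the examples (the types and defaults here are hypothetical reconstructions):

import argparse

# Hypothetical reconstruction of the shared module-level parser; the real
# types and defaults are not shown in these examples.
parser = argparse.ArgumentParser()
parser.add_argument('--debug', type=int, default=0)
parser.add_argument('--frac', type=float, default=1.0)
parser.add_argument('--trainmode', type=str, default='')
parser.add_argument('--option', type=int, default=0)
parser.add_argument('--dataset', type=str, default='train')

def print_debug(debug):
    # Hypothetical helper: announce debug mode and its level.
    if debug:
        print('DEBUG MODE, LEVEL {}'.format(debug))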
Example #2
def main():
    global args, DEBUG, FRAC, PREDICTORS
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    print_debug(DEBUG)

    if DEBUG:
        storename = '../processed_features_debug{0}/train_debug{0}.h5'.format(DEBUG)
        mat_filename = '../processed_features_debug{}/text_feature_kernel.pickle'.format(DEBUG)
    else:
        storename = '../processed_features/train.h5'
        mat_filename = '../processed_features/text_feature_kernel.pickle'


    PREDICTORS = get_predictors(storename)
    
    boosting_type_list = ['gbdt', 'dart']
    num_leaves_list = [7, 9, 31, 63]
    max_depth_list = [3, 4, 7, 9]
    for boosting_type in boosting_type_list:
        # num_leaves and max_depth are tuned as matched pairs.
        for num_leaves, max_depth in zip(num_leaves_list, max_depth_list):
            print('==============================================================')
            print('num leaves:', num_leaves)
            print('max depth:', max_depth)
            DO(mat_filename, storename, num_leaves, max_depth, 1, boosting_type)
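Example #2 pulls the predictor list out of the HDF5 store via get_predictors, which is not shown. A minimal sketch of one plausible implementation, assuming the store holds a single training DataFrame and a TARGET constant names the label column (both assumptions):

import pandas as pd

def get_predictors(storename):
    # Hypothetical helper: treat every non-target column of the stored
    # training frame as a predictor.
    with pd.HDFStore(storename, mode='r') as store:
        df = store[store.keys()[0]]
    return [c for c in df.columns if c != TARGET]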
Example #3
def main():
    global args, DEBUG
    args = parser.parse_args()
    DEBUG = args.debug
    print_debug(DEBUG)
    for dataset in ['train', 'test']:
        do_dataset(dataset)
    write_all_feature_to_text()
Example #4
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    print_debug(DEBUG)
    DO()
Example #5
def main():
    global args, DEBUG, DATASET
    args = parser.parse_args()
    DATASET = args.dataset
    DEBUG = args.debug
    print_debug(DEBUG)

    if DEBUG:
        todir = '../processed_features_debug{}/'.format(DEBUG)
    else:
        todir = '../processed_features/'
    gen_aggregated_kernel(todir, '.pickle')
Example #6
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_TUNE_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    # boosting_list = ['gbdt', 'dart']
    boosting_list = ['gbdt']
    num_leave_list = [7, 9, 15, 31, 63, 128]
    max_depth_list = [3, 4, 7, 15, 31, 64]

    model_list = []
    for i in range(len(num_leave_list)):
        num_leave = num_leave_list[i]
        max_depth = max_depth_list[i]
        for boosting_type in boosting_list:
            model_list = model_list + [
                '{}_{}_{}'.format(boosting_type, num_leave, max_depth)
            ]

    LOCAL_TUNE_RESULT = pd.DataFrame(
        index=model_list,
        columns=['running_time', 'num_round', 'train', 'val'])
    if DEBUG: print(LOCAL_TUNE_RESULT)

    option = 1
    is_textadded = True
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'
    print_header('Option {}'.format(option))
    print('is_textadded {} \n predictors {} \n mat filename {}'.format(
        is_textadded, PREDICTORS, mat_filename))

    # Sweep the grid from the largest tree size down to the smallest.
    for num_leave, max_depth in zip(reversed(num_leave_list),
                                    reversed(max_depth_list)):
        for boosting_type in boosting_list:
            DO(option, is_textadded, mat_filename, dir_feature, num_leave,
               max_depth, boosting_type)

    print_header('FINAL SUMMARY')
    print(LOCAL_TUNE_RESULT)
    LOCAL_TUNE_RESULT.to_csv('csv/tune_params.csv', index=True)
Example #7
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, OPTION
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    OPTION = args.option
    print_debug(DEBUG)
    feature_train = get_good_local()
    if DEBUG: print(feature_train)
    PREDICTORS = PREDICTORS_BASED + feature_train
    if DEBUG: print(PREDICTORS)
    DO()
Example #8
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    done_feature_df = load_csv('csv/forward_selection.csv')
    print(done_feature_df)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED

    feature_list = ['base']
    files = glob.glob(dir_feature + '*.pickle')
    REMOVED_LIST = [
        'cat_encode', 'len_feature_kernel', 'text_feature_kernel', 'time'
    ]
    for file in files:
        filename = os.path.basename(file)
        feature = re.sub(r'\.pickle$', '', filename)
        if is_added(filename, REMOVED_LIST):
            feature_list = feature_list + [feature]

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=feature_list,
        columns=['running_time', 'num_round', 'train', 'val', 'diff'])
    if DEBUG:
        print(feature_list)
        print(LOCAL_VALIDATION_RESULT)

    for feature in feature_list:
        # 'base' evaluates the base predictors alone; every other feature is
        # tried on top of the base set, then removed again.
        if feature != 'base':
            PREDICTORS = PREDICTORS + [feature]
        # mat_filename is unused when is_textadded is False, so a placeholder
        # string is passed.
        DO(option, is_textadded, 'abc', dir_feature, 1988, feature)
        if feature != 'base':
            PREDICTORS.remove(feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    # Write back to the same location the results are loaded from above.
    LOCAL_VALIDATION_RESULT.to_csv('csv/forward_selection.csv', index=True)
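Example #8 filters the candidate feature pickles through is_added, which is not shown. A minimal sketch, assuming it simply rejects filenames that contain any entry of REMOVED_LIST:

def is_added(filename, removed_list):
    # Hypothetical helper: a feature file qualifies unless its name matches
    # one of the excluded feature groups.
    return not any(removed in filename for removed in removed_list)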
Example #9
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option = 0
    is_textadded = False
    PREDICTORS = PREDICTORS_BASED
    mat_filename = dir_feature + 'text_feature_kernel.pickle'

    # seed_list = np.random.randint(2000, size=1000)
    random.seed(1992)
    seed_array = random.sample(range(0, 10000), 100)

    seed_list = ['seed_' + str(seed) for seed in seed_array]
    LOCAL_VALIDATION_RESULT = pd.DataFrame(index=seed_list,
                                           columns=[
                                               'seed', 'running_time',
                                               'num_round', 'train', 'val',
                                               'local_test', 'diff'
                                           ])
    print(seed_list)
    print(LOCAL_VALIDATION_RESULT)
    for seed in seed_array:
        DO(option, is_textadded, mat_filename, dir_feature, seed)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
    LOCAL_VALIDATION_RESULT.to_csv('seed_select.csv', index=False)
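Assuming each DO call fills in the row for its seed, a quick way to inspect seed stability after the sweep would be something like:

# Sketch: rank seeds by validation score and look at the spread.
summary = LOCAL_VALIDATION_RESULT.sort_values('val')
print(summary.head(10))
print(summary[['train', 'val', 'diff']].describe())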
Example #10
def main():
    global args, DEBUG, FRAC, PREDICTORS, TRAINMODE, LOCAL_VALIDATION_RESULT
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    TRAINMODE = args.trainmode
    # OPTION=args.option
    print_debug(DEBUG)

    if DEBUG:
        dir_feature = '../processed_features_debug2/'
    else:
        dir_feature = '../processed_features/'

    option_list = ['option' + str(option) for option in range(10)]

    LOCAL_VALIDATION_RESULT = pd.DataFrame(
        index=option_list,
        columns=['running_time', 'num_round', 'train', 'val'])

    if DEBUG:
        print(option_list)
        print(LOCAL_VALIDATION_RESULT)

    test_list = [8]
    for option in test_list:
        # option 0: tabular predictors only, no text matrix added
        if option == 0:
            is_textadded = False
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel
        elif option == 1:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel max_feature = 1000
        elif option == 2:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_1000.pickle'
        # kernel max_feature = 30000
        elif option == 3:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        # kernel max_feature = infinite
        elif option == 4:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED
            mat_filename = dir_feature + 'text_feature_kernel_-1.pickle'
        # kernel max_feature = 18000 + 'good' feature
        elif option == 5:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_GOOD
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        # kernel max_feature = 18000 + not-checked feature
        elif option == 6:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_NOTCHECKED
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 7:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_OVERFIT
            mat_filename = dir_feature + 'text_feature_kernel.pickle'
        elif option == 8:
            is_textadded = True
            PREDICTORS = PREDICTORS_BASED + PREDICTORS_TRY
            mat_filename = dir_feature + 'text_feature_kernel_30000.pickle'
        if DEBUG:
            print_header('Option {}'.format(option))
            print('is_textadded {} \n predictors {} \n mat filename {}'.format(
                is_textadded, PREDICTORS, mat_filename))

        DO(option, is_textadded, mat_filename, dir_feature)

    print_header('FINAL SUMMARY')
    print(LOCAL_VALIDATION_RESULT)
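The elif chain above maps each option to an (is_textadded, PREDICTORS, mat_filename) triple. A table-driven sketch of the same dispatch, using only names that already appear above:

# Sketch: the option dispatch rewritten as a lookup table.
OPTION_CONFIG = {
    0: (False, PREDICTORS_BASED, 'text_feature_kernel.pickle'),
    1: (True, PREDICTORS_BASED, 'text_feature_kernel.pickle'),
    2: (True, PREDICTORS_BASED, 'text_feature_kernel_1000.pickle'),
    3: (True, PREDICTORS_BASED, 'text_feature_kernel_30000.pickle'),
    4: (True, PREDICTORS_BASED, 'text_feature_kernel_-1.pickle'),
    5: (True, PREDICTORS_BASED + PREDICTORS_GOOD, 'text_feature_kernel.pickle'),
    6: (True, PREDICTORS_BASED + PREDICTORS_NOTCHECKED, 'text_feature_kernel.pickle'),
    7: (True, PREDICTORS_BASED + PREDICTORS_OVERFIT, 'text_feature_kernel.pickle'),
    8: (True, PREDICTORS_BASED + PREDICTORS_TRY, 'text_feature_kernel_30000.pickle'),
}

for option in test_list:
    is_textadded, PREDICTORS, basename = OPTION_CONFIG[option]
    mat_filename = dir_feature + basename
    DO(option, is_textadded, mat_filename, dir_feature)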
Example #11
def main():
    global args, DEBUG, FRAC, PREDICTORS
    args = parser.parse_args()
    DEBUG = args.debug
    FRAC = args.frac
    print_debug(DEBUG)

    print("\nData Load Stage")

    target = TARGET

    tabular_predictors = get_tabular_predictors()
    # categorical = get_categorical(predictors)

    if DEBUG:
        mat_filename = '../processed_features_debug2/text_feature_kernel.pickle'
        dir_feature = '../processed_features_debug2/'
    else:
        mat_filename = '../processed_features/text_feature_kernel.pickle'
        dir_feature = '../processed_features/'

    X, y, test, full_predictors, predictors = prepare_training(
        mat_filename, dir_feature, tabular_predictors)

    categorical = get_categorical(predictors)
    predictors = get_predictors(predictors)

    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.10,
                                                          random_state=SEED)

    print(X.shape)

    print("Light Gradient Boosting Regressor")
    lgbm_params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'metric': 'rmse',
        'max_depth': 15,
        # 'num_leaves': 31,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        # 'bagging_freq': 5,
        'learning_rate': 0.019,
        'verbose': 0
    }

    print(lgbm_params)

    # LGBM Dataset Formatting
    lgtrain = lgb.Dataset(X_train,
                          y_train,
                          feature_name=full_predictors,
                          categorical_feature=categorical)
    lgvalid = lgb.Dataset(X_valid,
                          y_valid,
                          feature_name=full_predictors,
                          categorical_feature=categorical)

    # Go Go Go
    modelstart = time.time()
    lgb_clf = lgb.train(lgbm_params,
                        lgtrain,
                        num_boost_round=16000,
                        valid_sets=[lgtrain, lgvalid],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=200,
                        verbose_eval=200)
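Note that the early_stopping_rounds and verbose_eval keyword arguments were removed from lgb.train in LightGBM 4.0. On current versions the equivalent call configures both through callbacks:

# Equivalent training call for LightGBM >= 4.0, where early stopping and
# evaluation logging are supplied as callbacks.
lgb_clf = lgb.train(lgbm_params,
                    lgtrain,
                    num_boost_round=16000,
                    valid_sets=[lgtrain, lgvalid],
                    valid_names=['train', 'valid'],
                    callbacks=[lgb.early_stopping(stopping_rounds=200),
                               lgb.log_evaluation(period=200)])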