Пример #1
0
def ridge_submit():
    """Fit a Ridge model on the most recent months and write a submission CSV.

    Steps, in order:

    1. Read ``../result_tmp/scaled_train_DateBlockNum.csv`` and keep only the
       rows with ``30 < date_block_num <= 33``.
    2. Fit ``Ridge()`` (default settings) using every remaining column except
       ``item_cnt_month`` (the target) and ``date_block_num``.
    3. Predict on the data returned by ``load_test_data()`` with its ``ID``
       column removed, cast the predictions to float16 and clip them to the
       range [0, 20].
    4. Store the predictions in the ``item_cnt_month`` column of the frame
       returned by ``load_submission()`` and save it as CSV.

    Progress is reported through the logger returned by ``mylogger()``.
    Nothing is returned.
    """
    log = mylogger()
    log.info('RidgeRegression start')

    log.debug('make_train_data start')
    frame = pd.read_csv('../result_tmp/scaled_train_DateBlockNum.csv')
    # Restrict training to the most recent three months. (Filtering on
    # date_block_num == 33 instead would keep only the latest month.)
    block = frame['date_block_num']
    is_recent = (block > 30) & (block <= 33)
    frame = frame.loc[is_recent]

    target = frame['item_cnt_month']
    # date_block_num is only used for the filter above, not as a feature.
    non_feature_columns = ['item_cnt_month', 'date_block_num']
    features = frame.drop(non_feature_columns, axis=1).values
    log.debug('make_train_data end')

    log.info('Fitting start')
    model = Ridge()
    model.fit(features, target)
    log.debug('Fitting end')

    log.info('Scoring start')
    test_frame = load_test_data()
    test_features = test_frame.drop(['ID'], axis=1).values

    result = load_submission()
    predicted = model.predict(test_features)
    result['item_cnt_month'] = predicted.astype(np.float16).clip(0., 20.)
    result.to_csv(
        '../result_tmp/submit_180902_31-33_ridge.csv',
        encoding='utf-8-sig',
        index=False,
    )
    log.info('submission:\n{}'.format(result.head()))
    log.debug('RidgeRegression end')
    log.debug('====================')
Пример #2
0
def RandomForest():
    """Fit a random forest on the full training table and write a submission.

    Steps, in order:

    1. Read ``./result_tmp/scaled_train.csv``; ``item_cnt_month`` is the
       target and every other column is used as a feature.
    2. Fit ``RandomForestRegressor(n_estimators=50, random_state=1)``.
    3. Predict on the data returned by ``load_test_data()`` with its ``ID``
       column removed, cast the predictions to float16 and clip them to the
       range [0, 20].
    4. Store the predictions in the ``item_cnt_month`` column of the frame
       returned by ``load_submission()`` and save it as CSV.

    The function does not create its own logger: it uses a ``logger`` name
    that must already be defined in the enclosing (module) scope.
    Nothing is returned.
    """
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    train = pd.read_csv('./result_tmp/scaled_train.csv')
    y = train['item_cnt_month']
    X = train.drop(['item_cnt_month'], axis=1).values
    # The end marker must mirror the start marker so the two can be paired
    # when reading the log.
    logger.debug('make_train_data end')

    logger.info('Fitting start')
    forest = RandomForestRegressor(n_estimators=50, random_state=1)
    forest.fit(X, y)
    logger.debug('Fitting end')

    logger.info('Scoring start')
    test_data = load_test_data()
    test = test_data.drop(['ID'], axis=1).values

    submission = load_submission()
    predictions = forest.predict(test)
    submission['item_cnt_month'] = predictions.astype(np.float16).clip(0., 20.)
    submission.to_csv('./result_tmp/submit_180826_1st.csv',
                      encoding='utf-8-sig',
                      index=False)
    logger.info('submission:\n{}'.format(submission.head()))
    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
Пример #3
0
def forest_submit():
    """Fit a random forest on the most recent months and write a submission.

    Steps, in order:

    1. Read ``./result_tmp/scaled_train_DateBlockNum.csv`` and keep only the
       rows with ``30 < date_block_num <= 33``.
    2. Fit ``RandomForestRegressor(n_estimators=50, random_state=1)`` using
       every remaining column except ``item_cnt_month`` (the target) and
       ``date_block_num``.
    3. Predict on the data returned by ``load_test_data()`` with its ``ID``
       column removed, cast the predictions to float16 and clip them to the
       range [0, 20].
    4. Store the predictions in the ``item_cnt_month`` column of the frame
       returned by ``load_submission()`` and save it as CSV.

    Progress is reported through the logger returned by ``mylogger()``.
    Nothing is returned.
    """
    log = mylogger()
    log.info('RandomForestRegressor start')

    log.debug('make_train_data start')
    frame = pd.read_csv('./result_tmp/scaled_train_DateBlockNum.csv')
    # Restrict training to the most recent three months. (Filtering on
    # date_block_num == 33 instead would keep only the latest month.)
    block = frame['date_block_num']
    is_recent = (block > 30) & (block <= 33)
    frame = frame.loc[is_recent]

    target = frame['item_cnt_month']
    # date_block_num is only used for the filter above, not as a feature.
    non_feature_columns = ['item_cnt_month', 'date_block_num']
    features = frame.drop(non_feature_columns, axis=1).values
    log.debug('make_train_data end')

    log.info('Fitting start')
    model = RandomForestRegressor(n_estimators=50, random_state=1)
    model.fit(features, target)
    log.debug('Fitting end')

    # For exploratory analysis, model.feature_importances_ can be inspected
    # here, paired with the feature column names.

    log.info('Scoring start')
    test_frame = load_test_data()
    test_features = test_frame.drop(['ID'], axis=1).values

    result = load_submission()
    predicted = model.predict(test_features)
    result['item_cnt_month'] = predicted.astype(np.float16).clip(0., 20.)
    result.to_csv(
        './result_tmp/submit_180827_31-33.csv',
        encoding='utf-8-sig',
        index=False,
    )
    log.info('submission:\n{}'.format(result.head()))
    log.debug('RandomForestRegressor end')
    log.debug('====================')
Пример #4
0
    forest = RandomForestRegressor(n_estimators=50, random_state=1)
	forest.fit(X, y)
	logger.debug('Fitting end')
	
    #EDAしたいとき
	#fti = forest.feature_importances_
	#print('Feature Importances:')
	#for i, feature in enumerate(train.colunms):
	#	print('\t{0:10s}:{1:>.6f}'.format(feature, fti[i]))
    
	logger.info('Scoring start')
	#logger.info('Accuracy on test set: {:.3f}'.format(.score(X_test, y_test)))
	test_data = load_test_data()
	test = test_data.drop(['ID'], axis=1).values
	
	submission = load_submission()
	submission['item_cnt_month'] = forest.predict(test).astype(np.float16).clip(0., 20.)
	#submission.to_csv('./result_tmp/submit_180826_1st.csv', encoding='utf-8-sig', index=False)
	submission.to_csv('./result_tmp/submit_180827_31-33.csv', encoding='utf-8-sig', index=False)
	logger.info('submission:\n{}'.format(submission.head()))
	logger.debug('RandomForestRegressor end')
	logger.debug('====================')


# CV: KFold, StratifiedKFold, TimeSeriesSplit, GroupKFold
# [http://scikit-learn.org/stable/modules/cross_validation.html]
# Cross-validation with TimeSeriesSplit()
# [http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html]
# The data is split as sketched below: with tscv, each TEST fold is predicted from the TRAIN data that precedes it in time.
# TRAIN: [0] TEST: [1], TRAIN: [0 1] TEST: [2], TRAIN: [0 1 2] TEST: [3]
def forest_cv():