示例#1
0
文件: LoadData.py 项目: ikki02/gcp
def load_test_data():
    """Read the competition test set from the shared input directory."""
    logger = mylogger()
    logger.info('read_test start')
    df_test = pd.read_csv('../../../input/test.csv')
    logger.info('test:\n{}'.format(df_test.head()))
    logger.debug('read_test end')
    return df_test
示例#2
0
文件: LoadData.py 项目: ikki02/gcp
def load_train_data():
    """Read the competition training set from the shared input directory."""
    logger = mylogger()
    logger.info('read_train start')
    df_train = pd.read_csv('../../../input/train.csv')
    logger.info('train:\n{}'.format(df_train.head()))
    logger.debug('read_train end')
    return df_train
示例#3
0
文件: LoadData.py 项目: ikki02/gcp
def load_submission():
    """Read the sample-submission template from the shared input directory."""
    logger = mylogger()
    logger.info('read_submission start')
    df_sub = pd.read_csv('../../../input/sample_submission.csv')
    logger.info('sample_submission:\n{}'.format(df_sub.head()))
    logger.debug('read_submission end')
    return df_sub
示例#4
0
def forest_cv():
    """Cross-validate a RandomForestRegressor on the scaled training data."""
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    train = pd.read_csv('../result_tmp/scaled_train.csv')
    # Alternative inputs / time windows (uncomment to use):
    #train = pd.read_csv('../result_tmp/scaled_train_DateBlockNum.csv')
    #train = train[train['date_block_num']==33]  # most recent month only
    #train = train.loc[(30<train['date_block_num'])&(train['date_block_num']<=33)]  # last 3 months
    target = train['item_cnt_month']
    features = train.drop(['item_cnt_month'], axis=1).values
    #features = train.drop(['item_cnt_month', 'date_block_num'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('Cross-validation start')
    estimator = RandomForestRegressor(n_estimators=50, random_state=1)

    # KFold's shuffle flag picks the split strategy:
    # shuffle=False -> plain k-fold over index order;
    # shuffle=True  -> shuffled splits (vary with random_state).
    splitter = KFold(n_splits=3, shuffle=True, random_state=0)
    #splitter = StratifiedKFold(n_splits=3)  # same signature as KFold
    #splitter = TimeSeriesSplit(n_splits=3)
    scores = cross_val_score(estimator, features, target, cv=splitter)
    # GroupKFold variant:
    #groups = list(train['date_block_num'])
    #scores = cross_val_score(estimator, features, target, groups, cv=GroupKFold(n_splits=3))
    logger.info('Cross-validation scores_forest: {}'.format(scores))
    logger.info('Average Cross-validation score_forest: {}'.format(scores.mean()))
    logger.debug('Cross-validation end')

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
示例#5
0
def forest_gscv():
    """Grid-search RandomForestRegressor hyper-parameters with 5-fold CV."""
    logger = mylogger()
    logger.info('RandomForestRegressor start')

    logger.debug('make_train_data start')
    df = pd.read_csv('../result_tmp/scaled_train.csv')
    target = df['item_cnt_month']
    features = df.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('GridSearchCV start')
    grid = {'n_estimators': [10, 30, 50],
            'random_state': [1, 2, 3]}
    logger.debug('Parameter grid:\n{}'.format(grid))
    searcher = GridSearchCV(RandomForestRegressor(), grid, cv=5, n_jobs=4)
    X_tr, X_te, y_tr, y_te = train_test_split(features, target, random_state=1)
    searcher.fit(X_tr, y_tr)
    logger.info('Best GridSearchCV parameters_forest: {}'.format(searcher.best_params_))
    logger.info('Best GridSearchCV score_forest: {}'.format(searcher.best_score_))
    logger.info('Test set score_forest: {:.2f}'.format(searcher.score(X_te, y_te)))
    # Persist the full CV table for later inspection.
    cv_table = pd.DataFrame(searcher.cv_results_)
    cv_table.to_csv('../result_tmp/GridSearch.csv', encoding='utf-8-sig', index=False)
    logger.debug('GridSearchCV end')

    logger.debug('RandomForestRegressor end')
    logger.debug('====================')
示例#6
0
def ridge_gscv():
    """Grid-search the Ridge alpha with 5-fold CV, then log the fitted coefficients."""
    logger = mylogger()
    logger.info('RidgeRegression start')

    logger.debug('make_train_data start')
    df = pd.read_csv('../result_tmp/scaled_train.csv')
    target = df['item_cnt_month']
    features = df.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('GridSearchCV start')
    grid = {'alpha': [0.1, 1, 10, 50, 100]}
    logger.debug('Parameter grid:\n{}'.format(grid))
    searcher = GridSearchCV(Ridge(), grid, cv=5, n_jobs=-1)
    X_tr, X_te, y_tr, y_te = train_test_split(features, target, random_state=1)
    searcher.fit(X_tr, y_tr)
    logger.info('Best GridSearchCV parameters_ridge: {}'.format(searcher.best_params_))
    logger.info('Best GridSearchCV score_ridge: {}'.format(searcher.best_score_))
    logger.info('Test set score_ridge: {:.2f}'.format(searcher.score(X_te, y_te)))
    cv_table = pd.DataFrame(searcher.cv_results_)
    cv_table.to_csv('../result_tmp/GridSearch_ridge.csv', encoding='utf-8-sig', index=False)
    logger.debug('GridSearchCV end')

    # Refit at the best alpha so the coefficients can be inspected (EDA aid).
    best_ridge = Ridge(**searcher.best_params_)
    best_ridge.fit(X_tr, y_tr)

    logger.info('coefficient:{}'.format(best_ridge.coef_))
    logger.info('intercept:{}'.format(best_ridge.intercept_))

    logger.debug('RidgeRegression end')
    logger.debug('====================')
示例#7
0
def ridge_submit():
    """Fit Ridge on the last three months of data and write a submission CSV."""
    logger = mylogger()
    logger.info('RidgeRegression start')
    logger.debug('make_train_data start')
    #train = pd.read_csv('../result_tmp/scaled_train.csv')
    train = pd.read_csv('../result_tmp/scaled_train_DateBlockNum.csv')
    #train = train[train['date_block_num']==33]  # most recent month only
    # Keep months 31-33 (last 3 months).
    train = train.loc[(30<train['date_block_num'])&(train['date_block_num']<=33)]

    target = train['item_cnt_month']
    features = train.drop(['item_cnt_month', 'date_block_num'], axis=1).values
    #features = train.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('Fitting start')
    model = Ridge()
    model.fit(features, target)
    logger.debug('Fitting end')

    logger.info('Scoring start')
    test_features = load_test_data().drop(['ID'], axis=1).values

    submission = load_submission()
    # Clip to the competition's [0, 20] target range.
    preds = model.predict(test_features).astype(np.float16).clip(0., 20.)
    submission['item_cnt_month'] = preds
    submission.to_csv('../result_tmp/submit_180902_31-33_ridge.csv', encoding='utf-8-sig', index=False)
    logger.info('submission:\n{}'.format(submission.head()))
    logger.debug('RidgeRegression end')
    logger.debug('====================')
示例#8
0
文件: LoadData.py 项目: ikki02/gcp
def load_suppliment():
    """Read the supplementary lookup tables (categories, items, shops).

    Returns:
        (item_categories, items, shops) DataFrames, in that order.
    """
    logger = mylogger()
    logger.info('read_csv start')
    categories = pd.read_csv('../../../input/item_categories.csv')
    items = pd.read_csv('../../../input/items.csv')
    shops = pd.read_csv('../../../input/shops.csv')
    logger.debug('read_csv end')
    logger.info('item_cat:\n{}'.format(categories.head()))
    logger.info('item:\n{}'.format(items.head()))
    logger.info('shop:\n{}'.format(shops.head()))
    return categories, items, shops
示例#9
0
def label_encode(train, test):
    """Label-encode shop_id and item_id consistently across train and test.

    Fits one LabelEncoder per column over the union of train/test values and
    rewrites both frames in place, then snapshots them to CSV.

    Note: tree-based algorithms do not require this encoding.

    Returns:
        (train, test) with the encoded columns.
    """
    logger = mylogger()
    logger.info('label_encode starts')
    for c in ['shop_id', 'item_id']:
        lbl = preprocessing.LabelEncoder()
        # BUG FIX: fit must see the same string representation that transform
        # receives. The original fitted on the raw unique values but
        # transformed str-cast columns, which raises "previously unseen
        # labels" whenever str(value) differs from the raw value (e.g. ints).
        lbl.fit(list(train[c].astype(str).unique()) + list(test[c].astype(str).unique()))
        train[c] = lbl.transform(train[c].astype(str))
        test[c] = lbl.transform(test[c].astype(str))
        logger.debug(c)
    train.to_csv('../result_tmp/train_lbl.csv')
    test.to_csv('../result_tmp/test_lbl.csv')
    logger.debug('label encode ends')
    return train, test
示例#10
0
文件: Scoring.py 项目: ikki02/gcp
def xgboost_gscv():
    """Grid-search XGBRegressor hyper-parameters with 5-fold CV.

    Installation notes (from the original author):
      cd <workspace>
      git clone --recursive https://github.com/dmlc/xgboost
      cd xgboost; make -j4
      pip show setuptools to check setuptools; the PATH must also be set.
      cd python-package; sudo python setup.py install
    """
    logger = mylogger()
    logger.info('xgboostRegressor start')

    logger.debug('make_train_data start')
    df = pd.read_csv('./result_tmp/scaled_train.csv')
    target = df['item_cnt_month']
    features = df.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('GridSearchCV start')
    grid = {'max_depth': [2, 4, 6], 'n_estimators': [50, 100, 500]}
    logger.debug('Parameter grid:\n{}'.format(grid))
    searcher = GridSearchCV(xgb.XGBRegressor(), grid, cv=5, n_jobs=-1)
    X_tr, X_te, y_tr, y_te = train_test_split(features, target, random_state=1)
    searcher.fit(X_tr, y_tr)
    logger.info('Best GridSearchCV parameters_xgb: {}'.format(
        searcher.best_params_))
    logger.info('Best GridSearchCV score_xgb: {}'.format(
        searcher.best_score_))
    logger.info('Test set score_xgb: {:.2f}'.format(
        searcher.score(X_te, y_te)))
    cv_table = pd.DataFrame(searcher.cv_results_)
    cv_table.to_csv('./result_tmp/GridSearch_xgb.csv',
                    encoding='utf-8-sig',
                    index=False)
    logger.debug('GridSearchCV end')

    # Feature-importance EDA (uncomment to use):
    #xgbr = xgb.XGBRegressor(**searcher.best_params_)
    #xgbr.fit(X_tr, y_tr)

    #fti = pd.Series(xgbr.feature_importances_, index=df.columns)
    #fti = fti.sort_values()
    #logger.debug(fti)
    #fti.plot(kind='barh')
    #plt.title('feature importance from xgboost')
    #plt.show()

    logger.debug('xgboostRegressor end')
    logger.debug('====================')
示例#11
0
def make_train_in_test():
    """Restrict the training rows to shop/item ids that appear in the test set.

    Also parses the 'date' column and derives a 'month' feature.

    Returns:
        (filtered train DataFrame, test DataFrame).
    """
    logger = mylogger()
    logger.info('train in test starts')
    train = load_train_data()
    test = load_test_data()
    logger.info('train.org.shape:{}'.format(train.shape))
    wanted_shops = test.shop_id.unique()
    wanted_items = test.item_id.unique()
    mask = train.shop_id.isin(wanted_shops) & train.item_id.isin(wanted_items)
    train = train[mask]
    train['date'] = pd.to_datetime(train['date'], format='%d.%m.%Y')
    train['month'] = train['date'].dt.month
    logger.info('train in test.shape:{}'.format(train.shape))
    logger.debug('train in test ends')
    return train, test
示例#12
0
def make_item_cnt_month(train):
    """Aggregate daily sales into monthly totals per (shop, item).

    Sums 'item_cnt_day' grouped by (date_block_num, shop_id, item_id), names
    the result 'item_cnt_month', drops the month column, and writes the table
    to ../result_tmp/scaled_train.csv.

    NOTE(review): since date_block_num is dropped after aggregation, the
    output can hold several rows per (shop_id, item_id) — one per month.

    Returns:
        DataFrame with columns shop_id, item_id, item_cnt_month.
    """
    logger = mylogger()
    logger.info('MakeItemCntMonth starts')
    # .sum() replaces the original .apply(sum): identical totals, but the
    # aggregation runs vectorized instead of one Python call per group.
    item_cnt_month = train['item_cnt_day'].groupby(
        [train['date_block_num'], train['shop_id'], train['item_id']]).sum()
    item_cnt_month.name = 'item_cnt_month'
    item_cnt_month_df = pd.DataFrame(item_cnt_month)
    item_cnt_month_df = item_cnt_month_df.reset_index()
    item_cnt_month_df.drop(['date_block_num'], axis=1, inplace=True)
    item_cnt_month_df.to_csv('../result_tmp/scaled_train.csv',
                             encoding='utf-8-sig',
                             index=False)
    logger.debug(item_cnt_month_df.shape)
    logger.debug('MakeItemCntMonth ends')
    return item_cnt_month_df
示例#13
0
文件: Scoring.py 项目: ikki02/gcp
def forest_submit():
    """Fit a RandomForestRegressor on the last three months and write a submission CSV."""
    logger = mylogger()
    logger.info('RandomForestRegressor start')
    logger.debug('make_train_data start')
    #train = pd.read_csv('./result_tmp/scaled_train.csv')
    train = pd.read_csv('./result_tmp/scaled_train_DateBlockNum.csv')
    #train = train[train['date_block_num']==33]  # most recent month only
    # Keep months 31-33 (last 3 months).
    window = (30 < train['date_block_num']) & (train['date_block_num'] <= 33)
    train = train.loc[window]

    target = train['item_cnt_month']
    features = train.drop(['item_cnt_month', 'date_block_num'], axis=1).values
    #features = train.drop(['item_cnt_month'], axis=1).values
    logger.debug('make_train_data end')

    logger.info('Fitting start')
    model = RandomForestRegressor(n_estimators=50, random_state=1)
    model.fit(features, target)
    logger.debug('Fitting end')

    # Feature-importance EDA (uncomment to use):
    #fti = model.feature_importances_
    #print('Feature Importances:')
    #for i, feature in enumerate(train.colunms):
    #	print('\t{0:10s}:{1:>.6f}'.format(feature, fti[i]))

    logger.info('Scoring start')
    test_features = load_test_data().drop(['ID'], axis=1).values

    submission = load_submission()
    # Clip to the competition's [0, 20] target range.
    preds = model.predict(test_features).astype(np.float16).clip(0., 20.)
    submission['item_cnt_month'] = preds
    #submission.to_csv('./result_tmp/submit_180826_1st.csv', encoding='utf-8-sig', index=False)
    submission.to_csv('./result_tmp/submit_180827_31-33.csv',
                      encoding='utf-8-sig',
                      index=False)
    logger.info('submission:\n{}'.format(submission.head()))
    logger.debug('RandomForestRegressor end')
    logger.debug('====================')