def merge_xgboost_local_substitute():
    """
    Merge two prediction files and write the result to ``merge_xgboost_local.csv``.

    Both files are loaded with ``get_pred_for_shops``.  Every shop of the
    local-learning predictions that is also contained in the set returned by
    ``get_substitutes`` has its entry replaced by the xgboost entry of the
    same shop.  The merged predictions are then written to
    ``merge_xgboost_local.csv`` in the current working directory: one
    comma-separated row per shop id from 1 to 2000, the shop id first.

    :return: None
    """
    xgboost_file = os.path.join(
        getHome(), "Dropbox", 'dataset', 'Analysis', 'competition',
        'xgBoost_comp_iterLength_7mode_filled_tarLength_14_iterOrC_copy_0.08607781.csv')
    boosted = get_pred_for_shops(pred_file=xgboost_file)

    local_file = os.path.join(
        getHome(), "Dropbox", 'dataset', 'Scripts', 'LocalLearning',
        'neighbours_10000_alg_ETR_estimator_500_mss_2_msl_1_copy_7x2_drop_xiaohaozi_localLearning714_0.08547515.csv')
    merged = get_pred_for_shops(pred_file=local_file)

    substitutes = get_substitutes()
    for shop_id in merged:
        if shop_id not in substitutes:
            continue
        # Only existing keys are reassigned, so iterating while writing is safe.
        merged[shop_id] = boosted[shop_id]

    with open('merge_xgboost_local.csv', 'w') as out:
        for shop_id in xrange(1, 2001):
            fields = [str(value) for value in [shop_id] + merged[shop_id]]
            out.write(','.join(fields) + '\n')
def get_substitutes():
    """
    Read the shop ids stored in ``shops_recorded.txt``.

    The file is looked up under
    ``<home>/Dropbox/dataset/Scripts/VisualizePredictionResult``.  Carriage
    return and line feed characters are stripped from each line before it is
    converted with ``int``.

    :return: a set with one int per line of the file
    """
    record_path = os.path.join(getHome(), "Dropbox", 'dataset', 'Scripts',
                               'VisualizePredictionResult', 'shops_recorded.txt')
    with open(record_path, 'r') as handle:
        rows = handle.readlines()
    return set(int(row.strip('\r\n')) for row in rows)
def get_original_pay_trend(shop_id):
    """
    Load the customer-flow series of one shop from ``CustomerFlow_<shop_id>.csv``.

    The file is read without a header row; its first column is parsed as a
    timestamp and its second column is named 'cnt'.

    :param shop_id: value substituted into the ``CustomerFlow_%s.csv`` file name
    :return: a dataframe with columns 'cnt' and 'day', where 'day' holds the
        ``datetime.date`` of each parsed timestamp
    """
    pay_trend_folder = os.path.join(getHome(), "Dropbox", "dataset", "Analysis", "PayTrend")
    csv_file_name = os.path.join(pay_trend_folder, "CustomerFlow_%s.csv" % shop_id)
    shop_trend = pd.read_csv(csv_file_name, header=None, names=['time', 'cnt'], parse_dates=[0])
    # Vectorised date extraction: same datetime.date values as calling
    # Timestamp.date() row by row, without a Python-level loop over the rows.
    shop_trend['day'] = shop_trend['time'].dt.date
    shop_trend.drop('time', axis=1, inplace=True)
    return shop_trend
def load_data_set(continuous_zero_filled_threshold, consider_anomaly, lag, outputlength, startshop, endshop):
    """
    Concatenate the per-shop feature files of a range of shops into one dataframe.

    Files are read from
    ``<home>/Dropbox/dataset/Analysis/PayTrend_Filled_threshold_<threshold>_<consider|not_consider>_anomaly/NewFeatures``
    and are named ``feature_shop_<shop_id>.csv``.  Files without data rows are
    left out.  Progress is printed for every shop id divisible by 200.

    :param continuous_zero_filled_threshold: value formatted into the source folder name
    :param consider_anomaly: truthy selects the "consider" folder, falsy the "not_consider" one
    :param lag: not used by this function
    :param outputlength: not used by this function
    :param startshop: first shop id to load (inclusive)
    :param endshop: last shop id to load (inclusive)
    :return: one dataframe holding the rows of all loaded shops, re-indexed from 0
    """
    home = getHome()
    anomaly_tag = "consider" if consider_anomaly else "not_consider"
    folder_name = "PayTrend_Filled_threshold_%s_%s_anomaly" % (continuous_zero_filled_threshold, anomaly_tag)
    source_folder = os.path.join(home, "Dropbox", "dataset", "Analysis", folder_name, "NewFeatures")

    frames = []
    # A single parenthesised argument prints exactly like the print statement.
    print("loading data")
    for shop_id in xrange(startshop, endshop + 1):
        if shop_id % 200 == 0:
            print(shop_id)
        shop_file = os.path.join(source_folder, "feature_shop_%s.csv" % shop_id)
        shop_frame = pd.read_csv(shop_file)
        if shop_frame.shape[0] == 0:
            continue
        frames.append(shop_frame)

    combined = pd.concat(frames, ignore_index=True)
    print('finish loading')
    return combined
sys_name = "Lin" HOME_modelFolder = os.path.expanduser('~') elif system.startswith("Win"): HOME = r"C:\Users\SI30YD" if not os.path.exists(HOME): HOME = r"C:\Users\KH44IM" sys_name = "Win" HOME_modelFolder = r"H:\Model" else: print "Unknown platform" sys_name = "No" sys.exit(0) return HOME, HOME_modelFolder HOME, HOME_modelFolder = getHome() sys.path.append(os.path.join(HOME, "Dropbox", "dataset", "Scripts")) from tianchi_api.system import getHome from tianchi_api.metrics import loss, loss_reverse from sklearn.model_selection import KFold from tianchi_api.metrics import loss from tianchi_api.competition import CompetitionPredictionModel, IterativePredictionModel, NewCompetitionPredictionModel from tianchi_api.features import * from tianchi_api.models import * def Model_for_competition(algorithm_name, pickle_model_file_name, fgfs, ReportFolder, source,
# -*- coding: utf-8 -*- ''' The improved KNN predictor considers all the features. ''' import pandas as pd import matplotlib.pyplot as plt import psycopg2 import sys import os import numpy as np from tianchi_api.system import getHome #from zero_statistics import get_original_pay_trend import zero_statistics as zt global_anamolies = set() DESFOLDER = os.path.join(getHome(), "Dropbox", "dataset", "Analysis", "AnamolyDetect") FILENAME = "firstpart.csv" def write_anamolies(): try: os.makedirs(DESFOLDER) except: pass with open(os.path.join(DESFOLDER, FILENAME), 'w') as fw: for rec in global_anamolies: fw.write("%s,%s,%s\n" % (rec[0], rec[1], rec[2])) def click_data_show(shop_id):
""" return the home directory according to the platform :return: """ system = platform.system() if system.startswith("Lin"): HOME = os.path.expanduser('~') elif system.startswith("Win"): HOME = r"C:\Users\KH44IM" else: print "Unknown platform" sys.exit(0) return HOME HOME = getHome() sys.path.append(os.path.join(HOME, "Dropbox", "dataset", "Scripts")) from tianchi_api.system import getHome from tianchi_api.metrics import loss, loss_reverse import xgboost as xgb from xgboost.sklearn import XGBRegressor import matplotlib.pylab as plt from matplotlib.pylab import rcParams rcParams['figure.figsize'] = 12, 4 from sklearn.model_selection import KFold def scorer(estimator, X, y): return loss_reverse(estimator.predict(X), y, False)
filename = os.path.join(getHome(),"Dropbox",'dataset','Scripts','VisualizePredictionResult','shops_recorded.txt') shop_set = set() with open(filename,'r') as fp: lines = fp.readlines() for l in lines: shop_set.add(int(l.strip('\r\n'))) return shop_set def merge_xgboost_local_substitute(): xgboost = get_pred_for_shops(pred_file = os.path.join(getHome(),"Dropbox",'dataset','Analysis','competition','xgBoost_comp_iterLength_7mode_filled_tarLength_14_iterOrC_copy_0.08607781.csv')) local = get_pred_for_shops(pred_file = os.path.join(getHome(),"Dropbox",'dataset','Scripts','LocalLearning','neighbours_10000_alg_ETR_estimator_500_mss_2_msl_1_copy_7x2_drop_xiaohaozi_localLearning714_0.08547515.csv')) shop_set = get_substitutes() for shop in local: if shop in shop_set: local[shop] = xgboost[shop] with open('merge_xgboost_local.csv','w') as fw: for s in xrange(1,2001): fw.write(','.join(map(str,[s]+local[s]))+'\n') if __name__=='__main__': pred_file = os.path.join(getHome(),"Dropbox",'dataset','Scripts','LocalLearning','Ext_LocalKernel_weight+114_77copy_0.08390_1.1_0.0832.csv') #print "Here" #pred_file = os.path.join(getHome(),"Dropbox",'dataset','Analysis','competition','features_final','xgBoost_comp_iterLength_7mode_filled_tarLength_14_iterOrC_iterative_77copy.csv') #print pred_file savefolder = os.path.join(getHome(), 'Dropbox', 'dataset', 'Analysis', 'Pred_Visulization','BestScore') concat_pred_to_original(range(1,2001),5,True,pred_file,savefolder) #for s in range(1,2001): #compare_preds(s) #merge_xgboost_local_substitute()
if system.startswith("Lin"): HOME = os.path.expanduser('~') sys_name = "Lin" HOME_modelFolder = os.path.expanduser('~') elif system.startswith("Win"): HOME = r"C:\Users\KH44IM" sys_name = "Win" HOME_modelFolder = r"H:\Model" else: print "Unknown platform" sys_name = "No" sys.exit(0) return HOME, HOME_modelFolder sys.path.append(os.path.join(getHome()[0], "Dropbox", "dataset", "Scripts")) import pandas as pd import matplotlib.pyplot as plt import numpy as np from tianchi_api.system import getHome from tianchi_api.metrics import loss from tianchi_api.system import getHome from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import ExtraTreesRegressor from sklearn.neighbors import NearestNeighbors from tianchi_api.competition import CompetitionPredictionModel, IterativePredictionModel, NewCompetitionPredictionModel from tianchi_api.features import * from tianchi_api.getPredictors import predictors_WeatherAirTem