Example #1
# Imports assumed from the surrounding project; current_time() is a logging
# helper defined elsewhere in the source.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, train_test_split


def grid_search(tuned_parameters, data, train_size, seed):
    '''
    Hyperparameter optimization via grid search.

    :param tuned_parameters: dict of parameter grids to search over
    :param data: the dataset; the last column holds the labels
    :param train_size: size of the training split
    :param seed: seed for the random number generator
    :return:
    '''

    print("----- Begin run grid_search at %s -------" % current_time())
    X = data[:, :-1]
    y = data[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=train_size,
                                                        stratify=data[:, -1],
                                                        random_state=seed)
    clf = GridSearchCV(GradientBoostingClassifier(),
                       tuned_parameters,
                       cv=10,
                       scoring="roc_auc")
    clf.fit(X_train, y_train)
    print("Best parameters set found:", clf.best_params_)
    print("Randomized Grid scores:")
    for params, mean_score, scores in clf.grid_scores_:
        print("\t%0.3f (+/-%0.03f) for %s" %
              (mean_score, scores.std() * 2, params))
        print("Optimized Score:", clf.score(X_test, y_test))
        print("Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        print(classification_report(y_true, y_pred))
    print("----- End run grid_search at %s -------" % current_time())
Example #2
 def _save_data(self):
     print("----- Begin run save_data at %s -------" % current_time())
     with open(self.fname, 'wb') as file:  # save the training sets, test sets, encoders and scalers
         pickle.dump([
             self.train_datas, self.test_datas, self.encoders, self.scalers
         ], file)
     print("----- End run save_data at %s -------" % current_time())
Example #3
    def _scaled(self):
        '''
        Feature scaling, using MaxAbsScaler for the normalization.
        :return:
        '''
        print("----- Begin run scaled at %s -------" % current_time())
        train_scales = {}
        test_scales = {}
        self.scalers = {}
        for _type in self.types:
            if _type == 'type 1':
                train_last_index = 5  # the last 5 columns are group_1/date_act/date_people/char_38/outcome
                test_last_index = 4  # the last 4 columns are group_1/date_act/date_people/char_38
            else:
                train_last_index = 6  # the last 6 columns are group_1/char_10_act/date_act/date_people/char_38/outcome
                test_last_index = 5  # the last 5 columns are group_1/char_10_act/date_act/date_people/char_38

            scaler = MaxAbsScaler()
            train_array = self.train_datas[_type].toarray()
            train_front = train_array[:, :-train_last_index]
            train_mid = scaler.fit_transform(
                train_array[:, -train_last_index:-1])  # outcome does not need scaling
            train_end = train_array[:, -1].reshape((-1, 1))  # outcome
            train_scales[_type] = np.hstack(
                (train_front, train_mid, train_end))

            test_array = self.test_datas[_type].toarray()
            test_front = test_array[:, :-test_last_index]
            test_end = scaler.transform(test_array[:, -test_last_index:])
            test_scales[_type] = np.hstack((test_front, test_end))
            self.scalers[_type] = scaler
        self.train_datas = train_scales
        self.test_datas = test_scales
        print("----- End run scaled at %s -------" % current_time())
 def _load_data(self):

     print('----- Begin run load_data at %s -----' % current_time())

     with open(self.fname, 'rb') as file:
         self.train_datas, self.test_datas = pickle.load(file)

     print('----- End run load_data at %s -----' % current_time())
 def _save_data(self):

     print('----- Begin run save_data at %s -----' % current_time())

     with open(self.fname, 'wb') as file:
         pickle.dump([self.train_datas, self.test_datas], file)

     print('----- End run save_data at %s -----' % current_time())
 def _curve(self):
     print('----- Begin run learning_curve (%s) at %s -----' % (self.curve_name, current_time()))
     abs_trains_sizes, train_scores, test_scores = learning_curve(self.estimator, self.X, self.y, cv=3, scoring='roc_auc', train_sizes=self.train_sizes)
     print('----- End run learning_curve (%s) at %s -----' % (self.curve_name, current_time()))
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
     test_scores_std = np.std(test_scores, axis=1)
     return abs_trains_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
 def _merge_data(self):
     '''
     Merge the people data with the activity data.
     :return:
     '''

     print('----- Begin run merge_data at %s -----' % current_time())

     self.train_data = pd.merge(self.act_train, self.people, how='left', left_index=True, right_index=True, suffixes=('_act', '_people'))
     self.test_data = pd.merge(self.act_test, self.people, how='left', left_index=True, right_index=True, suffixes=('_act', '_people'))

     print('----- End run merge_data at %s -----' % current_time())
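A toy illustration of the index-aligned left join performed above; the frames and values are made up for demonstration:

import pandas as pd

people = pd.DataFrame({'char_1': ['type 1'], 'date': ['2021-01-01']},
                      index=pd.Index(['ppl_1'], name='people_id'))
acts = pd.DataFrame({'char_1': ['type 2'], 'date': ['2021-06-01']},
                    index=pd.Index(['ppl_1'], name='people_id'))
merged = pd.merge(acts, people, how='left', left_index=True,
                  right_index=True, suffixes=('_act', '_people'))
print(merged.columns.tolist())
# ['char_1_act', 'date_act', 'char_1_people', 'date_people']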
    def _typecast_data(self):
        '''
        Perform type casting: convert all of the data to floating point.
        :return:
        '''

        print('----- Begin run typecast_data at %s -----' % current_time())

        str_col_list = ['group_1'] + ['char_%d_act' % i for i in range(1, 11)] + ['char_%d_people' % i for i in range(1, 10)]
        bool_col_list = ['char_10_people'] + ['char_%d' % i for i in range(11, 18)]

        for _type in self.types:
            for data_set in [self.train_datas, self.test_datas]:
                # dates become float days since the Unix epoch
                data_set[_type].date_act = (data_set[_type].date_act - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')
                data_set[_type].date_people = (data_set[_type].date_people - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')
                # 'group NNN' strings become the numeric group id
                data_set[_type].group_1 = data_set[_type].group_1.str.replace('group', '').str.strip().astype(np.float64)
                for col in bool_col_list:
                    if col in data_set[_type]:
                        data_set[_type][col] = data_set[_type][col].astype(np.float64)
                # 'type NNN' strings become the numeric type id
                for col in str_col_list[1:]:
                    if col in data_set[_type]:
                        data_set[_type][col] = data_set[_type][col].str.replace('type', '').str.strip().astype(np.float64)
                data_set[_type] = data_set[_type].astype(np.float64)

        print('----- End run typecast_data at %s -----' % current_time())
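The date conversion above yields a float count of days since the Unix epoch; a one-line illustration with an arbitrary date:

import numpy as np

days = (np.datetime64('1970-01-11') - np.datetime64('1970-01-01')) / np.timedelta64(1, 'D')
print(days)  # 10.0 -- ten days after the epoch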
        
    def _is_ready(self):
        return os.path.exists(self.fname)
    
    def _save_data(self):

        print('----- Begin run save_data at %s -----' % current_time())

        with open(self.fname, 'wb') as file:
            pickle.dump([self.train_datas, self.test_datas], file)

        print('----- End run save_data at %s -----' % current_time())
            
    def _load_data(self):

        print('----- Begin run load_data at %s -----' % current_time())

        with open(self.fname, 'rb') as file:
            self.train_datas, self.test_datas = pickle.load(file)

        print('----- End run load_data at %s -----' % current_time())
 def _scaled(self):
     '''
     Feature scaling, using MaxAbsScaler for the normalization.
     :return:
     '''
     print('----- Begin run scaled at %s -----' % current_time())
     train_scales = {}
     test_scales = {}
     self.scalers = {}
     for _type in self.types:
         if _type == 'type 1':
             train_last_index = 5
             test_last_index = 4
         else:
             train_last_index = 6
             test_last_index = 5
         scaler = MaxAbsScaler()
         train_array = self.train_datas[_type].toarray()
         train_front = train_array[:, :-train_last_index]
         train_middle = scaler.fit_transform(train_array[:, -train_last_index:-1])  # outcome is not scaled
         train_end = train_array[:, -1].reshape((-1, 1))  # outcome
         train_scales[_type] = np.hstack((train_front, train_middle, train_end))

         test_array = self.test_datas[_type].toarray()
         test_front = test_array[:, :-test_last_index]
         test_end = scaler.transform(test_array[:, -test_last_index:])  # reuse the scaler fitted on the training set
         test_scales[_type] = np.hstack((test_front, test_end))
         self.scalers[_type] = scaler

     self.train_datas = train_scales
     self.test_datas = test_scales
     print('----- End run scaled at %s -----' % current_time())
 def _load_csv(self):
     '''
     Load the CSV files.
     :return:
     '''
     print('----- Begin run load_csv at %s -----' % current_time())

     self.people = pd.read_csv(self.p_fname, sep=',', header=0, keep_default_na=True, parse_dates=['date'])
     self.act_train = pd.read_csv(self.train_fname, sep=',', header=0, keep_default_na=True, parse_dates=['date'])
     self.act_test = pd.read_csv(self.test_fname, sep=',', header=0, keep_default_na=True, parse_dates=['date'])

     self.people.set_index(keys=['people_id'], drop=True, append=False, inplace=True)
     self.act_train.set_index(keys=['people_id'], drop=True, append=False, inplace=True)
     self.act_test.set_index(keys=['people_id'], drop=True, append=False, inplace=True)

     print('----- End run load_csv at %s -----' % current_time())
 def _curve(self):
     print("----- Begin run validation_curve(%s) at %s -------" %
           (self.curve_name, current_time()))
     train_scores, test_scores = validation_curve(self.estimator,
                                                  self.X,
                                                  self.y,
                                                  param_name=self.p_name,
                                                  param_range=self.p_range,
                                                  cv=3,
                                                  scoring="roc_auc",
                                                  n_jobs=-1,
                                                  verbose=1)
     print("----- End run validation_curve(%s) at %s -------" %
           (self.curve_name, current_time()))
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
     test_scores_std = np.std(test_scores, axis=1)
     return list(self.p_range), train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
 def _curve(self):
     print("----- Begin run learning_curve(%s) at %s -------" %
           (self.curve_name, current_time()))
     #### Get the learning curve ######
     abs_trains_sizes, train_scores, test_scores = learning_curve(
         self.estimator,
         self.X,
         self.y,
         cv=3,
         scoring="roc_auc",
         train_sizes=self.train_sizes,
         n_jobs=-1,
         verbose=1)
     print("----- End run learning_curve(%s) at %s -------" %
           (self.curve_name, current_time()))
     ###### For each train size, get the mean and std of the scores across the 3 CV folds #####
     train_scores_mean = np.mean(train_scores, axis=1)
     train_scores_std = np.std(train_scores, axis=1)
     test_scores_mean = np.mean(test_scores, axis=1)
     test_scores_std = np.std(test_scores, axis=1)
     return abs_trains_sizes, train_scores_mean, train_scores_std, test_scores_mean, test_scores_std
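Both _curve variants return per-point means and standard deviations that are typically drawn as a line with a +/- one-std band. A minimal matplotlib sketch; obj stands in for whichever curve object the project instantiates, which is an assumption here:

import matplotlib.pyplot as plt

sizes, tr_mean, tr_std, te_mean, te_std = obj._curve()  # hypothetical instance

for mean, std, label in [(tr_mean, tr_std, 'train AUC'), (te_mean, te_std, 'test AUC')]:
    plt.plot(sizes, mean, marker='o', label=label)
    plt.fill_between(sizes, mean - std, mean + std, alpha=0.2)  # +/- one std across the CV folds
plt.xlabel('train size')
plt.ylabel('roc_auc')
plt.legend()
plt.show()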
 def _onehot_encode(self):
     '''
     One-hot encoding.
     :return:
     '''
     print('----- Begin run onehot_encoder at %s -----' % current_time())
     train_results = {}
     test_results = {}
     self.encoders = {}

     for _type in self.types:
         if _type == 'type 1':
             one_hot_cols = ['char_%d_act' % i for i in range(1, 10)] + ['char_%d_people' % i for i in range(1, 10)]
             train_end_cols = ['group_1', 'date_act', 'date_people', 'char_38', 'outcome']
             test_end_cols = ['group_1', 'date_act', 'date_people', 'char_38']
         else:
             one_hot_cols = ['char_%d_people' % i for i in range(1, 10)]
             train_end_cols = ['group_1', 'char_10_act', 'date_act', 'date_people', 'char_38', 'outcome']
             test_end_cols = ['group_1', 'char_10_act', 'date_act', 'date_people', 'char_38']

         train_front_array = self.train_datas[_type][one_hot_cols].values
         train_end_array = self.train_datas[_type][train_end_cols].values
         train_middle_array = self.train_datas[_type].drop(train_end_cols + one_hot_cols, axis=1, inplace=False).values

         test_front_array = self.test_datas[_type][one_hot_cols].values
         test_end_array = self.test_datas[_type][test_end_cols].values
         test_middle_array = self.test_datas[_type].drop(test_end_cols + one_hot_cols, axis=1, inplace=False).values

         encoder = OneHotEncoder(handle_unknown='ignore')  # sparse output; unknown test categories encode to all zeros

         train_result = hstack([encoder.fit_transform(train_front_array), csr_matrix(train_middle_array), csr_matrix(train_end_array)])
         test_result = hstack([encoder.transform(test_front_array), csr_matrix(test_middle_array), csr_matrix(test_end_array)])  # transform only: reuse the categories fitted on the training set

         train_results[_type] = train_result
         test_results[_type] = test_result
         self.encoders[_type] = encoder

     self.train_datas = train_results
     self.test_datas = test_results
     print('----- End run onehot_encoder at %s -----' % current_time())
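Note the fit/transform split above: the encoder learns its category set on the training rows and only transforms the test rows, so both sides end up with the same column layout. A toy sketch with made-up values:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')
train = np.array([[1.], [2.], [3.]])
test = np.array([[2.], [9.]])              # 9.0 was never seen during fit
print(enc.fit_transform(train).toarray())  # 3 columns, one per training category
print(enc.transform(test).toarray())       # the unseen 9.0 encodes to all zeros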
 def _split_data(self):
     '''
     Split the data into type 1 - type 7.
     :return:
     '''

     print('----- Begin run split_data at %s -----' % current_time())

     self.train_datas = {}
     self.test_datas = {}
     for _type in self._types:
         # split by activity category, dropping rows and columns that are all NaN
         self.train_datas[_type] = self.train_data[self.train_data.activity_category == _type].dropna(axis=0, how='all').dropna(axis=1, how='all')
         self.test_datas[_type] = self.test_data[self.test_data.activity_category == _type].dropna(axis=0, how='all').dropna(axis=1, how='all')
         # drop the activity_category column
         self.train_datas[_type].drop('activity_category', axis=1, inplace=True)
         self.test_datas[_type].drop('activity_category', axis=1, inplace=True)
         # make activity_id part of the index
         self.train_datas[_type].set_index(keys=['activity_id'], drop=True, append=True, inplace=True)
         self.test_datas[_type].set_index(keys=['activity_id'], drop=True, append=True, inplace=True)

     print('----- End run split_data at %s -----' % current_time())
Example #15
 def _load_data(self):
     print("----- Begin run _load_data at %s -------" % current_time())
     with open(self.fname, 'rb') as file:  # load the training sets, test sets, encoders and scalers
         self.train_datas, self.test_datas, self.encoders, self.scalers = pickle.load(
             file)
     print("----- End run _load_data at %s -------" % current_time())