################### FEATURE SELECTION if feat_sel: print('FEATURE SELECTION ...') data_frame = pd.concat([X_init, y_init], axis=1) data_frame = feature_importance(data_frame, coeff_threshold, 'class') X_test_then_train = X_test_then_train[data_frame.columns[:-1]] X_init = X_init[data_frame.columns[:-1]] features = features[data_frame.columns[:-1]] #DATA PREPARATION FOR SCIKIT-MULTIFLOW stream.X = features.values stream.y = labels.values ################### HYPERPARAMETER TUNING if hyperparameter_tuning: print('HYPERPARAMETER TUNING ...') for reg in range(len(regressors)): reg_name = regressors[reg].__class__.__name__ if reg_name == 'PassiveAggressiveRegressor': print(reg_name, ' tuning ...') PAR_timer = timer()
def cargaDatos(datasets, data, severity, speed, lim_data):
    """Load and prepare a scikit-multiflow ``FileStream`` for one benchmark dataset.

    Parameters
    ----------
    datasets
        Unused; kept for backward compatibility with existing callers.
    data : int
        Dataset selector: 0=weather, 1=elec, 2=covtype, 3=moving_squares,
        4=sea_stream, 5=usenet2, 6=gmsc, 7=airlines,
        8-11=synthetic (circleG/line/sineH/sine),
        12-19=extended synthetics (one stable concept tiled 50x).
    severity, speed : int
        Drift severity/speed used to build the synthetic-dataset file names.
    lim_data : int
        Row cap applied to the larger datasets (e.g. 50k samples).

    Returns
    -------
    skmultiflow.data.FileStream
        Prepared stream; ``X``/``y`` are overwritten in-place for the
        datasets that need scaling, capping or re-encoding.

    Raises
    ------
    ValueError
        If ``data`` is not a supported selector (previously this fell
        through to a ``NameError`` on the final ``return``).

    NOTE: every ``'your_path'`` literal is a placeholder that must be
    replaced with a real file path before this function can run.
    """

    def _minmax(frame):
        # Scale every column to [0, 1]; returns a fresh DataFrame.
        # Extracted: this block was duplicated four times in the original.
        scaler = preprocessing.MinMaxScaler()
        return pd.DataFrame(scaler.fit_transform(frame.values))

    if data == 0:  # weather
        stream = FileStream('your_path')
        stream.prepare_for_use()
        df = _minmax(pd.DataFrame(stream.X))
        # .values replaces DataFrame.as_matrix(), removed in pandas 1.0.
        stream.X = df.values
    elif data == 1:  # elec
        stream = FileStream('your_path')
        stream.prepare_for_use()
    elif data == 2:  # covtype
        stream = FileStream('your_path')
        stream.prepare_for_use()
        df = _minmax(pd.DataFrame(stream.X))
        df = df[0:5000]  # cap rows: this dataset has many features
        stream.X = df.values
        # Shift labels to the 0-6 range so the OnlineGRF repository size matches.
        stream.y = stream.y - 1
        stream.target_values = list(np.unique(stream.y))
    elif data == 3:  # moving_squares
        stream = FileStream('your_path')
        stream.prepare_for_use()
        df = pd.DataFrame(stream.X)
        df = df[0:lim_data]  # cap data (e.g. 50k samples)
        stream.X = df.values
    elif data == 4:  # sea_stream
        stream = FileStream('your_path')
        stream.prepare_for_use()
    elif data == 5:  # usenet2
        stream = FileStream('your_path')
        stream.prepare_for_use()
    elif data == 6:  # gmsc
        df = pd.read_csv('your_path', sep=',', header=0)
        # Keyword axis=1: the positional form was removed in pandas 2.0.
        df = df.drop('Unnamed: 0', axis=1)  # drop the index column written by to_csv
        df = df.dropna(how='any')  # drop rows containing NaN
        df = df[0:lim_data]  # cap data (e.g. 50k samples)
        feats = df[['RevolvingUtilizationOfUnsecuredLines', 'age',
                    'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
                    'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
                    'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
                    'NumberOfTime60-89DaysPastDueNotWorse',
                    'NumberOfDependents']]
        clas = df[['SeriousDlqin2yrs']]
        df_result = pd.concat([feats, clas], axis=1, sort=False)
        df_result.to_csv('your_path')
        stream = FileStream('your_path')
        stream.prepare_for_use()
        stream.X = feats.values
        stream.y = clas.values
    elif data == 7:  # airlines
        df = pd.read_csv('your_path', sep=',', header=None)
        # Nominal features live in columns 0, 2 and 3.  One-hot encoding
        # produced too many features for GRF (its gamma parameter made it
        # far too slow), so label-encode via pandas categorical codes.
        for col in (0, 2, 3):
            df.iloc[:, col] = df.iloc[:, col].astype('category').cat.codes
        df = df.drop([0], axis=1)  # drop the first column
        df.to_csv('your_path')
        df = df[0:lim_data]  # cap data (e.g. 50k samples)
        # NOTE(review): the CSV is written BEFORE the row cap, so the y read
        # back by FileStream is longer than the X assigned below — confirm
        # this asymmetry is intended.
        stream = FileStream('your_path')
        stream.prepare_for_use()
        stream.X = df.values
    elif data in (8, 9, 10, 11):  # synthetic streams
        names = {8: ('circleG', 'CircleG'), 9: ('line', 'Line'),
                 10: ('sineH', 'SineH'), 11: ('sine', 'Sine')}
        synt_name, synt_name2 = names[data]
        path = 'your_path'
        fil = (synt_name + '//data' + synt_name2 + 'Sev' + str(severity)
               + 'Sp' + str(speed) + 'Train.csv')
        raw_data = pd.read_csv(path + fil, sep=',', header=None)
        caso = raw_data[raw_data.columns[0:3]]  # drop the last, useless column
        caso.iloc[:, 2] = (caso.iloc[:, 2]).astype(int)  # class column must be int
        new_fil = (synt_name + '_' + 'Sev' + str(severity)
                   + '_Sp' + str(speed) + 'Train.csv')
        caso.to_csv(path + synt_name + '//' + new_fil)
        stream = FileStream(path + synt_name + '//' + new_fil)
        stream.prepare_for_use()
        if synt_name == 'sine':
            # Scale so no feature is negative; some algorithms crash otherwise.
            caso = _minmax(pd.DataFrame(stream.X))
            stream.X = caso.iloc[:, 0:2].values
    elif data in (12, 13, 14, 15, 16, 17, 18, 19):  # extended synthetics
        names = {12: ('circleG', 'CircleG'), 13: ('circleG', 'CircleG'),
                 14: ('line', 'Line'), 15: ('line', 'Line'),
                 16: ('sineH', 'SineH'), 17: ('sineH', 'SineH'),
                 18: ('sine', 'Sine'), 19: ('sine', 'Sine')}
        synt_name, synt_name2 = names[data]
        path = 'your_path'
        fil = (synt_name + '//data' + synt_name2 + 'Sev' + str(severity)
               + 'Sp' + str(speed) + 'Train.csv')
        raw_data = pd.read_csv(path + fil, sep=',', header=None)
        caso = raw_data[raw_data.columns[0:3]]  # drop the last, useless column
        caso.iloc[:, 2] = (caso.iloc[:, 2]).astype(int)  # class column must be int
        # Stretch one stable concept by tiling its rows 50 times.
        if data in (12, 14, 16, 18):  # stable concept 1
            # NOTE(review): [0:999] keeps 999 rows (0..998) — confirm the
            # off-by-one vs. a 1000-row concept is intended (same for [1000:1999]).
            caso = caso[0:999]
            new_fil = synt_name + '_' + 'concept1.csv'
        else:  # data in (13, 15, 17, 19): stable concept 2
            caso = caso[1000:1999]
            new_fil = synt_name + '_' + 'concept2.csv'
        caso2 = caso.iloc[np.tile(np.arange(len(caso)), 50)]
        caso2.to_csv(path + synt_name + '//' + new_fil)
        stream = FileStream(path + synt_name + '//' + new_fil)
        stream.prepare_for_use()
        if synt_name == 'sine':
            # Scale so no feature is negative; some algorithms crash otherwise.
            caso2 = _minmax(pd.DataFrame(stream.X))
            stream.X = caso2.iloc[:, 0:2].values
    else:
        raise ValueError('Unsupported dataset selector: data=%r' % (data,))

    return stream