def _get_rf(
    X: np.ndarray,
    Y: np.ndarray,
    Yvar: np.ndarray,
    num_trees: int,
    max_features: Optional[str],
) -> RandomForestRegressor:
    """Build a Random Forest whose trees are fit on resampled targets.

    The forest object itself is never fit; instead its ``estimators_`` list
    is populated by hand with decision trees, each trained on ``X`` against
    a fresh draw of the targets (parametric bootstrap).

    Args:
        X: Feature matrix handed unchanged to every tree's ``fit``.
        Y: Observed means; only column 0 is read.
        Yvar: Observation variances for ``Y``; only column 0 is read.
        num_trees: Number of trees in the forest.
        max_features: Max features specifier forwarded to the forest
            constructor.

    Returns:
        The Random Forest with its trees fitted.
    """
    forest = RandomForestRegressor(
        n_estimators=num_trees,
        max_features=max_features,
        bootstrap=True,
    )
    trees = [DecisionTreeRegressor() for _ in range(forest.n_estimators)]
    # pyre-fixme[16]: `RandomForestRegressor` has no attribute `estimators_`.
    forest.estimators_ = trees
    for tree in trees:
        # Parametric bootstrap: draw each target from a normal distribution
        # centred on its observed mean with its observed variance.
        sampled_targets = np.random.normal(
            loc=Y[:, 0],
            scale=np.sqrt(Yvar[:, 0]),
        )
        tree.fit(X, sampled_targets)
    return forest
def deserialize_random_forest_regressor(model_dict):
    """Rebuild a ``RandomForestRegressor`` from its dictionary form.

    Args:
        model_dict: Mapping holding the constructor arguments under
            ``'params'``, the serialized trees under ``'estimators_'``, the
            scalar attributes copied below, and optionally ``'oob_score_'``
            and ``'oob_prediction_'``.

    Returns:
        The reconstructed regressor.

    Raises:
        KeyError: If any mandatory key is absent from ``model_dict``.
    """
    forest = RandomForestRegressor(**model_dict['params'])

    trees = []
    for tree_dict in model_dict['estimators_']:
        trees.append(deserialize_decision_tree_regressor(tree_dict))
    forest.estimators_ = np.array(trees)

    # Attributes copied verbatim from the dictionary onto the model.
    copied_attributes = (
        'n_features_',
        'n_outputs_',
        'max_depth',
        'min_samples_split',
        'min_samples_leaf',
        'min_weight_fraction_leaf',
        'max_features',
        'max_leaf_nodes',
        'min_impurity_decrease',
        'min_impurity_split',
    )
    for attribute in copied_attributes:
        setattr(forest, attribute, model_dict[attribute])

    # Out-of-bag results are only restored when they were serialized.
    if 'oob_score_' in model_dict:
        forest.oob_score_ = model_dict['oob_score_']
    if 'oob_prediction_' in model_dict:
        forest.oob_prediction_ = np.array(model_dict['oob_prediction_'])

    return forest
def train(self):
    """Fit a random forest and record train/validation metrics.

    A regressor or a classifier is built from ``self.params`` depending on
    ``self.args.regression``.

    * ``self.args.early_stopping <= 0``: the forest is fit once.
    * otherwise: the forest is grown with ``warm_start`` from
      ``self.args.min_n_estimators`` trees in increments of
      ``self.args.step_size`` while its size is below
      ``self.args.n_estimators``.  After each fit the validation score is
      evaluated; once ``early_stopping`` trees have been added past the
      best size without improvement, the forest is truncated back to the
      best size and growth stops.

    Side effects: mutates ``self.params`` (``n_estimators`` and, with early
    stopping, ``warm_start``) and sets ``self.model`` and ``self.result``.
    ``self.eval`` is assumed to return ``(loss, error, rmse, mae)`` — this
    is inferred from how its result is unpacked here.
    """
    forest_cls = (RandomForestRegressor
                  if self.args.regression else RandomForestClassifier)

    if self.args.early_stopping <= 0:
        # No early stopping: a single fit with the configured parameters.
        rf = forest_cls(**self.params)
        rf.fit(self.x_train, self.y_train)
    else:
        # Determine the best number of trees by adding trees iteratively.
        self.params['n_estimators'] = self.args.min_n_estimators
        self.params['warm_start'] = True
        rf = forest_cls(**self.params)
        result = {
            'val_loss': float('inf'),
            'val_error': float('inf'),
            'n_estimators': 0
        }
        while rf.n_estimators < self.args.n_estimators:
            rf.fit(self.x_train, self.y_train)
            vl_loss, vl_error, _, _ = self.eval(rf, self.x_val, self.y_val)
            if self.args.verbose >= 3:
                print(
                    ' n_estimators={} validation_score={:.4f}'.format(
                        rf.n_estimators,
                        vl_loss if self.args.regression else vl_error))

            # Regression is scored on loss, classification on error.
            if self.args.regression:
                improved = vl_loss < result['val_loss']
            else:
                improved = vl_error < result['val_error']

            if improved:
                result = {
                    'val_loss': vl_loss,
                    'val_error': vl_error,
                    'n_estimators': rf.n_estimators
                }
            elif (rf.n_estimators - self.args.early_stopping
                  >= result['n_estimators']):
                # Remember where we stopped before rolling back, so the
                # message below reports the stopping point, not the optimum.
                stopped_at = rf.n_estimators
                # Roll back rf to the optimal n_estimators.
                rf.set_params(n_estimators=result['n_estimators'])
                rf.estimators_ = rf.estimators_[:rf.n_estimators]
                self.params['n_estimators'] = result['n_estimators']
                if self.args.verbose >= 3:
                    print(
                        'Early stopping at n_estimators={} (added {} trees with no improvement).'
                        .format(stopped_at, self.args.early_stopping))
                    print('Rolled back to optimal n_estimators={}'.format(
                        rf.n_estimators))
                break
            rf.set_params(n_estimators=rf.n_estimators + self.args.step_size)

        # When the loop ends without early stopping, n_estimators has
        # already been bumped for a fit that never happened.  Re-align it
        # with the trees that actually exist so the recorded size is right.
        fitted_trees = getattr(rf, 'estimators_', None)
        if fitted_trees is not None:
            rf.set_params(n_estimators=len(fitted_trees))

    tr_loss, tr_error, tr_rmse, tr_mae = self.eval(rf, self.x_train,
                                                   self.y_train)
    vl_loss, vl_error, vl_rmse, vl_mae = self.eval(rf, self.x_val,
                                                   self.y_val)

    self.model = rf
    self.params['n_estimators'] = rf.n_estimators
    self.result = {
        'train_loss': tr_loss,
        'train_error': tr_error,
        'train_rmse': tr_rmse,
        'train_mae': tr_mae,
        'val_loss': vl_loss,
        'val_error': vl_error,
        'val_rmse': vl_rmse,
        'val_mae': vl_mae,
        'n_estimators': rf.n_estimators
    }

    if self.args.verbose >= 3:
        if self.args.regression and not self.args.dataset.endswith("_r"):
            print(
                'TRAIN RESULT: Loss: {:.5f} RMSE: {:.5f} MAE: {:.5f}'
                .format(tr_loss, tr_rmse, tr_mae))
            print(
                'VALDT RESULT: Loss: {:.5f} RMSE: {:.5f} MAE: {:.5f}'
                .format(vl_loss, vl_rmse, vl_mae))
        else:
            print(
                'TRAIN RESULT: Loss: {:.5f} Error: {:.2f}% Accuracy: {:.2f}%'
                .format(tr_loss, 100. * tr_error, 100. * (1 - tr_error)))
            print(
                'VALDT RESULT: Loss: {:.5f} Error: {:.2f}% Accuracy: {:.2f}%'
                .format(vl_loss, 100. * vl_error, 100. * (1 - vl_error)))
def prediction2(param,
                estimators,
                origin='',
                destination='',
                carrier='',
                month=0,
                weekday=0):
    """Estimate the arrival delay of a flight from historical averages.

    The sample flight file is filtered by whichever criteria are supplied,
    the remaining rows are averaged column by column, and that single
    averaged observation is fed to a Random Forest regressor rebuilt from
    already-fitted trees.

    Args:
        param: Estimator parameters for the rebuilt regressor.  They are
            applied with ``set_params`` when given as a dict; any other
            value is ignored.
        estimators: Fitted trees assigned to the regressor's
            ``estimators_``.
        origin: Free-text origin airport, resolved to an airport id by the
            ``AirportCode`` helper.  Empty string disables the filter.
        destination: Free-text destination airport, resolved the same way.
            Empty string disables the filter.
        carrier: Carrier value compared for equality with the dataset's
            ``UNIQUE_CARRIER`` column.  Empty string disables the filter.
            Codes listed by the original documentation: 'FL' AirTran,
            'AS' Alaska, 'AA' American, 'DL' Delta, '9E' Endeavor,
            'MQ' Envoy, 'EV' ExpressJet, 'F9' Frontier, 'HA' Hawaiian,
            'B6' JetBlue, 'YV' Mesa, 'OO' SkyWest, 'WN' Southwest,
            'NK' Spirit, 'UA' United, 'US' US Airways, 'VX' Virgin America.
        month: Value matched against the ``MONTH`` column; 0 disables the
            filter.
        weekday: Value matched against ``DAY_OF_WEEK`` (documented as 1-7,
            1 = Monday ... 7 = Sunday); 0 disables the filter.

    Returns:
        A ``(delay, status)`` tuple: the predicted arrival delay truncated
        to an int (minutes, negative when early) and a status message.
        ``(0, <message>)`` is returned when no flight matches the filters.

    Raises:
        IndexError: If ``carrier`` is given but absent from the dataset.
    """
    from sklearn import preprocessing
    from sklearn.ensemble import RandomForestRegressor

    delay_cols = [
        'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
        'LATE_AIRCRAFT_DELAY'
    ]
    # Columns (in order) making up the single observation to predict on.
    feature_cols = delay_cols + [
        'DISTANCE', 'AIR_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME'
    ]
    col_utiles = [
        'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
        'LATE_AIRCRAFT_DELAY', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
        'UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID',
        'CRS_DEP_TIME', 'DISTANCE', 'AIR_TIME', 'CRS_ELAPSED_TIME',
        'ACTUAL_ELAPSED_TIME', 'ARR_DELAY'
    ]

    def _rows_where(frame, key, value):
        """Return the rows of *frame* whose column *key* equals *value*."""
        return frame[frame[key] == value]

    # Read the flight sample file.
    # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour
    # of `on_bad_lines`; kept as-is to match the pandas version this file
    # targets — confirm before upgrading pandas.
    fly = pd.read_csv('./Dataset_Projet_4/2016_sample_file_09.csv',
                      sep=",",
                      encoding='utf_8',
                      low_memory=False,
                      error_bad_lines=False,
                      usecols=col_utiles)
    # And the airport lookup table.
    airport = pd.read_csv('./Dataset_Projet_4/L_AIRPORT_ID.csv',
                          sep=",",
                          encoding='utf_8',
                          low_memory=False,
                          error_bad_lines=False)

    # Label-encode the airlines.
    Cie = liste_distincte_col(fly, 'UNIQUE_CARRIER', '|')
    Cie.sort()
    le = preprocessing.LabelEncoder()
    le.fit(Cie)
    fly['CIE'] = le.transform(fly['UNIQUE_CARRIER'])

    # Keep one row per (UNIQUE_CARRIER, CIE) pair as a lookup table.
    CIEdf = fly[['UNIQUE_CARRIER', 'CIE']].drop_duplicates()

    # Drop the raw carrier column and any "Unnamed" columns.
    fly.drop(['UNIQUE_CARRIER'], axis=1, inplace=True)
    fly = fly.drop(col_rech_titre(fly, False, "Unnamed"), axis=1)

    # Resolve the airport ids (0 means "no filter").
    id_origin = 0
    id_destination = 0
    if origin != '':
        id_origin = AirportCode(fly, airport, origin)
    if destination != '':
        id_destination = AirportCode(fly, airport, destination)

    # Filter the flights.  A local helper is used instead of overwriting
    # pandas' own DataFrame.mask, which would alter it for the whole process.
    if id_origin != 0:
        fly = _rows_where(fly, 'ORIGIN_AIRPORT_ID', id_origin)
    if id_destination != 0:
        fly = _rows_where(fly, 'DEST_AIRPORT_ID', id_destination)
    if month != 0:
        fly = _rows_where(fly, 'MONTH', month)
    if weekday != 0:
        fly = _rows_where(fly, 'DAY_OF_WEEK', weekday)
    if carrier != '':
        # Translate UNIQUE_CARRIER into its encoded CIE value.
        carrier_num = CIEdf[CIEdf.UNIQUE_CARRIER == carrier]
        carrier_num = carrier_num['CIE'].values[0]
        fly = _rows_where(fly, 'CIE', carrier_num)

    # Replace missing delay values by 0.  Done on the frame itself rather
    # than in place on a column selection, which is not guaranteed to write
    # through to the frame.
    fly = fly.fillna({col: 0 for col in delay_cols})

    if fly.shape[0] == 0:
        return 0, "Insuffisament de données pour prédire ! Soyez moins précis !"

    # Column-wise means of the remaining flights.
    fly_mean = fly.mean()

    # Single-observation DataFrame holding the averaged regression inputs.
    ar = np.array([[fly_mean[col] for col in feature_cols]])
    df = pd.DataFrame(ar, index=[1], columns=feature_cols)

    New_rfr = RandomForestRegressor()
    # Apply the parameters by calling set_params; assigning to it would
    # replace the method and leave the parameters unused.
    if isinstance(param, dict):
        New_rfr.set_params(**param)
    New_rfr.estimators_ = estimators

    # Predict once and reuse the value for both the message and the return.
    delay = int(New_rfr.predict(df)[0])
    print(
        f"Prédiction d'avance/retard: {delay} minutes ")
    return delay, "prédiction sans erreur"