Example #1
from typing import Optional

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor


def _get_rf(
    X: np.ndarray,
    Y: np.ndarray,
    Yvar: np.ndarray,
    num_trees: int,
    max_features: Optional[str],
) -> RandomForestRegressor:
    """Fit a Random Forest model.

    Args:
        X: Training features, shape (n, d)
        Y: Observed outcomes, shape (n, 1)
        Yvar: Variance for Y
        num_trees: Number of trees
        max_features: Max features specifier

    Returns: Fitted Random Forest.
    """
    r = RandomForestRegressor(n_estimators=num_trees,
                              max_features=max_features,
                              bootstrap=True)
    # pyre-fixme[16]: `RandomForestRegressor` has no attribute `estimators_`.
    r.estimators_ = [DecisionTreeRegressor() for _ in range(r.n_estimators)]
    for estimator in r.estimators_:
        # Parametric bootstrap
        y = np.random.normal(loc=Y[:, 0], scale=np.sqrt(Yvar[:, 0]))
        estimator.fit(X, y)
    return r
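A minimal usage sketch for the snippet above, with synthetic data (the inputs, shapes, and the max_features="sqrt" value are assumptions, not from the original source); the (n, 1) shapes for Y and Yvar follow from the Y[:, 0] / Yvar[:, 0] indexing inside _get_rf, and per-tree predictions are aggregated by hand because the forest object itself is never fit:

import numpy as np

# Hypothetical synthetic data: 50 points, 3 features, known observation noise
rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
Y = X.sum(axis=1, keepdims=True) + rng.normal(scale=0.1, size=(50, 1))
Yvar = np.full((50, 1), 0.01)

rf = _get_rf(X=X, Y=Y, Yvar=Yvar, num_trees=100, max_features="sqrt")

# Each tree was fit on its own parametric-bootstrap draw of Y, so the
# spread across trees gives a rough uncertainty estimate.
per_tree = np.stack([tree.predict(X) for tree in rf.estimators_])
mean, std = per_tree.mean(axis=0), per_tree.std(axis=0)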
Example #2
def deserialize_random_forest_regressor(model_dict):
    model = RandomForestRegressor(**model_dict['params'])
    estimators = [
        deserialize_decision_tree_regressor(decision_tree)
        for decision_tree in model_dict['estimators_']
    ]
    model.estimators_ = np.array(estimators)

    model.n_features_ = model_dict['n_features_']
    model.n_outputs_ = model_dict['n_outputs_']
    model.max_depth = model_dict['max_depth']
    model.min_samples_split = model_dict['min_samples_split']
    model.min_samples_leaf = model_dict['min_samples_leaf']
    model.min_weight_fraction_leaf = model_dict['min_weight_fraction_leaf']
    model.max_features = model_dict['max_features']
    model.max_leaf_nodes = model_dict['max_leaf_nodes']
    model.min_impurity_decrease = model_dict['min_impurity_decrease']
    model.min_impurity_split = model_dict['min_impurity_split']

    if 'oob_score_' in model_dict:
        model.oob_score_ = model_dict['oob_score_']
    if 'oob_prediction_' in model_dict:
        model.oob_prediction_ = np.array(model_dict['oob_prediction_'])

    return model
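The function above only covers the read side. Below is a rough sketch of the matching write side, under the assumptions that a companion serialize_decision_tree_regressor helper exists for the per-tree payload and that the model comes from an older scikit-learn release that still exposes n_features_ and min_impurity_split (the same attributes the deserializer restores):

def serialize_random_forest_regressor(model):
    # Keys mirror exactly what deserialize_random_forest_regressor reads back
    model_dict = {
        'params': model.get_params(),
        'estimators_': [
            serialize_decision_tree_regressor(tree)  # assumed companion helper
            for tree in model.estimators_
        ],
        'n_features_': model.n_features_,
        'n_outputs_': model.n_outputs_,
        'max_depth': model.max_depth,
        'min_samples_split': model.min_samples_split,
        'min_samples_leaf': model.min_samples_leaf,
        'min_weight_fraction_leaf': model.min_weight_fraction_leaf,
        'max_features': model.max_features,
        'max_leaf_nodes': model.max_leaf_nodes,
        'min_impurity_decrease': model.min_impurity_decrease,
        'min_impurity_split': model.min_impurity_split,
    }
    # Out-of-bag attributes only exist when the forest was fit with oob_score=True
    if hasattr(model, 'oob_score_'):
        model_dict['oob_score_'] = model.oob_score_
    if hasattr(model, 'oob_prediction_'):
        model_dict['oob_prediction_'] = model.oob_prediction_.tolist()
    return model_dict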
Example #3
    def train(self):

        if self.args.early_stopping <= 0:  # no early stopping
            rf = (RandomForestRegressor(**self.params) if self.args.regression
                  else RandomForestClassifier(**self.params))
            rf.fit(self.x_train, self.y_train)
        else:  # determine best number of trees by adding in trees iteratively
            self.params['n_estimators'] = self.args.min_n_estimators
            self.params['warm_start'] = True
            rf = (RandomForestRegressor(**self.params) if self.args.regression
                  else RandomForestClassifier(**self.params))

            result = {
                'val_loss': float('inf'),
                'val_error': float('inf'),
                'n_estimators': 0
            }

            while rf.n_estimators < self.args.n_estimators:
                rf.fit(self.x_train, self.y_train)
                vl_loss, vl_error, _, _ = self.eval(rf, self.x_val, self.y_val)

                if self.args.verbose >= 3:
                    print(
                        '   n_estimators={}   validation_score={:.4f}'.format(
                            rf.n_estimators,
                            vl_loss if self.args.regression else vl_error))

                if (self.args.regression and vl_loss < result['val_loss']) \
                        or (not self.args.regression and vl_error < result['val_error']):
                    result = {
                        'val_loss': vl_loss,
                        'val_error': vl_error,
                        'n_estimators': rf.n_estimators
                    }
                elif rf.n_estimators - self.args.early_stopping >= result[
                        'n_estimators']:
                    if self.args.verbose >= 3:
                        print(
                            'Early stopping at n_estimators={} (added {} trees with no improvement).'
                            .format(rf.n_estimators, self.args.early_stopping))
                        print('Rolling back to optimal n_estimators={}'.format(
                            result['n_estimators']))
                    # roll back rf to the optimal n_estimators seen so far
                    rf.set_params(n_estimators=result['n_estimators'])
                    rf.estimators_ = rf.estimators_[:rf.n_estimators]
                    self.params['n_estimators'] = result['n_estimators']
                    break

                rf.set_params(n_estimators=rf.n_estimators +
                              self.args.step_size)

        tr_loss, tr_error, tr_rmse, tr_mae = self.eval(rf, self.x_train,
                                                       self.y_train)
        vl_loss, vl_error, vl_rmse, vl_mae = self.eval(rf, self.x_val,
                                                       self.y_val)
        self.model = rf
        self.params['n_estimators'] = rf.n_estimators
        self.result = {
            'train_loss': tr_loss,
            'train_error': tr_error,
            'train_rmse': tr_rmse,
            'train_mae': tr_mae,
            'val_loss': vl_loss,
            'val_error': vl_error,
            'val_rmse': vl_rmse,
            'val_mae': vl_mae,
            'n_estimators': rf.n_estimators
        }

        if self.args.verbose >= 3:
            if self.args.regression and not self.args.dataset.endswith("_r"):
                print(
                    'TRAIN RESULT:   Loss: {:.5f}   RMSE: {:.5f}   MAE: {:.5f}'
                    .format(tr_loss, tr_rmse, tr_mae))
                print(
                    'VALDT RESULT:   Loss: {:.5f}   RMSE: {:.5f}   MAE: {:.5f}'
                    .format(vl_loss, vl_rmse, vl_mae))
            else:
                print(
                    'TRAIN RESULT:   Loss: {:.5f}   Error: {:.2f}%   Accuracy: {:.2f}%'
                    .format(tr_loss, 100. * tr_error, 100. * (1 - tr_error)))
                print(
                    'VALDT RESULT:   Loss: {:.5f}   Error: {:.2f}%   Accuracy: {:.2f}%'
                    .format(vl_loss, 100. * vl_error, 100. * (1 - vl_error)))
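The key mechanism in this example is scikit-learn's warm_start: with warm_start=True, calling fit again after raising n_estimators only trains the newly added trees, which is what keeps the incremental early-stopping loop cheap. Below is a stripped-down sketch of the same pattern, with hypothetical synthetic data and a plain MSE in place of the class's eval helper:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Hypothetical data split
rng = np.random.default_rng(0)
X = rng.normal(size=(400, 5))
y = X[:, 0] - 2 * X[:, 1] + rng.normal(scale=0.5, size=400)
X_train, X_val, y_train, y_val = X[:300], X[300:], y[:300], y[300:]

rf = RandomForestRegressor(n_estimators=10, warm_start=True, random_state=0)
best = {'loss': float('inf'), 'n_estimators': 0}
patience, step, max_trees = 30, 10, 200

while rf.n_estimators <= max_trees:
    rf.fit(X_train, y_train)  # only the newly added trees are trained
    loss = mean_squared_error(y_val, rf.predict(X_val))
    if loss < best['loss']:
        best = {'loss': loss, 'n_estimators': rf.n_estimators}
    elif rf.n_estimators - patience >= best['n_estimators']:
        rf.set_params(n_estimators=best['n_estimators'])  # roll back to the best forest
        rf.estimators_ = rf.estimators_[:best['n_estimators']]
        break
    rf.set_params(n_estimators=rf.n_estimators + step)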
Example #4
def prediction2(param,
                estimators,
                origin='',
                destination='',
                carrier='',
                month=0,
                weekday=0):
    '''
    This function takes all of your flight information (no leaks!) and
    returns how late your flight is expected to arrive, based on the output
    of the Random Forest Regressor.

    Inputs:

            Origin (enter this as a city/state combo, or include the airport
                        name (such as Bush or Hobby); the matching airport is
                        worked out automatically)

            Destination (same as Origin, entered as a string)

            Carrier (which airline, as a string naming the carrier, such as
                        'American' or 'United')

            Month (the month the flight is scheduled for)

            Weekday (a number between 1 and 7: 1 = Monday, 2 = Tuesday,
                        3 = Wednesday, 4 = Thursday, 5 = Friday, 6 = Saturday,
                        7 = Sunday)

            Available Carriers:
            'FL': 'AirTran Airways',
            'AS': 'Alaska Airlines',
            'AA': 'American Airlines',
            'DL': 'Delta Air Lines',
            '9E': 'Endeavor Air',
            'MQ': 'Envoy Air',
            'EV': 'ExpressJet Airlines',
            'F9': 'Frontier Airlines',
            'HA': 'Hawaiian Airlines',
            'B6': 'JetBlue Airways',
            'YV': 'Mesa Airlines',
            'OO': 'SkyWest Airlines',
            'WN': 'Southwest Airlines',
            'NK': 'Spirit Airlines',
            'UA': 'United Airlines',
            'US': 'US Airways',
            'VX': 'Virgin America'

    Outputs:
        int: Estimated arrival delay in minutes (negative if the flight is
             expected to arrive early)
        str: Status of the estimation
    '''
    from sklearn.ensemble import RandomForestRegressor

    col_utiles = [
        'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
        'LATE_AIRCRAFT_DELAY', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK',
        'UNIQUE_CARRIER', 'ORIGIN_AIRPORT_ID', 'DEST_AIRPORT_ID',
        'CRS_DEP_TIME', 'DISTANCE', 'AIR_TIME', 'CRS_ELAPSED_TIME',
        'ACTUAL_ELAPSED_TIME', 'ARR_DELAY'
    ]
    # Read the input data file
    fly = pd.read_csv('./Dataset_Projet_4/2016_sample_file_09.csv',
                      sep=",",
                      encoding='utf_8',
                      low_memory=False,
                      error_bad_lines=False,
                      usecols=col_utiles)
    # And the airport reference file
    airport = pd.read_csv('./Dataset_Projet_4/L_AIRPORT_ID.csv',
                          sep=",",
                          encoding='utf_8',
                          low_memory=False,
                          error_bad_lines=False)

    # Label-encode the airlines
    Cie = liste_distincte_col(fly, 'UNIQUE_CARRIER', '|')
    Cie.sort()
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    le.fit(Cie)
    fly['CIE'] = le.transform(fly['UNIQUE_CARRIER'])

    # Build a df with the UNIQUE_CARRIER <-> CIE mapping
    CIEdf = fly[['UNIQUE_CARRIER',
                 'CIE']].drop_duplicates()  # keep one unique copy of each pair

    # Drop the UNIQUE_CARRIER column
    fly.drop(['UNIQUE_CARRIER'], axis=1, inplace=True)

    # Drop any Unnamed columns
    fly = fly.drop(col_rech_titre(fly, False, "Unnamed"), axis=1)

    # Look up the airport codes
    id_origin = 0
    id_destination = 0
    if origin != '':
        id_origin = AirportCode(fly, airport, origin)
    if destination != '':
        id_destination = AirportCode(fly, airport, destination)

    # Create a mask / filter helper (note: this shadows pandas' built-in DataFrame.mask)
    def mask(df, key, value):
        return df[df[key] == value]

    pd.DataFrame.mask = mask

    # Filter fly:
    if id_origin != 0:
        fly = fly.mask('ORIGIN_AIRPORT_ID', id_origin)

    if id_destination != 0:
        fly = fly.mask('DEST_AIRPORT_ID', id_destination)

    if month != 0:
        fly = fly.mask('MONTH', month)

    if weekday != 0:
        fly = fly.mask('DAY_OF_WEEK', weekday)

    if carrier != '':
        # Determine the CIE code from UNIQUE_CARRIER
        carrier_num = CIEdf[CIEdf.UNIQUE_CARRIER == carrier]
        carrier_num = carrier_num['CIE'].values[0]
        fly = fly.mask('CIE', carrier_num)

    # Replace NaNs with 0
    fly['CARRIER_DELAY'].fillna(0, inplace=True)
    fly['WEATHER_DELAY'].fillna(0, inplace=True)
    fly['NAS_DELAY'].fillna(0, inplace=True)
    fly['SECURITY_DELAY'].fillna(0, inplace=True)
    fly['LATE_AIRCRAFT_DELAY'].fillna(0, inplace=True)

    if fly.shape[0] == 0:
        return 0, "Insuffisament de données pour prédire ! Soyez moins précis !"

    # Compute per-column means
    fly_mean = fly.mean()

    # Build a single-observation DataFrame holding the mean of the features needed for the regression:
    ar = np.array([[
        fly_mean['CARRIER_DELAY'], fly_mean['WEATHER_DELAY'],
        fly_mean['NAS_DELAY'], fly_mean['SECURITY_DELAY'],
        fly_mean['LATE_AIRCRAFT_DELAY'], fly_mean['DISTANCE'],
        fly_mean['AIR_TIME'], fly_mean['CRS_ELAPSED_TIME'],
        fly_mean['ACTUAL_ELAPSED_TIME']
    ]])
    df = pd.DataFrame(ar,
                      index=[1],
                      columns=[
                          'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY',
                          'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'DISTANCE',
                          'AIR_TIME', 'CRS_ELAPSED_TIME', 'ACTUAL_ELAPSED_TIME'
                      ])

    # Rebuild a forest from the hyperparameters and fitted trees passed in
    New_rfr = RandomForestRegressor()
    New_rfr.set_params(**param)  # param is expected to be a dict of RandomForestRegressor kwargs
    New_rfr.estimators_ = estimators

    # Predict once and reuse the value for the message and the return
    delay = int(New_rfr.predict(df)[0])
    print(f"Predicted early arrival/delay: {delay} minutes")

    return delay, "prediction completed without error"
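A hypothetical call to prediction2, assuming the two CSV files it reads are present and the older scikit-learn version the snippet targets; the synthetic training data and the example origin/destination/carrier values are placeholders, and only the hyperparameters and fitted trees of an earlier forest are handed over:

import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Hypothetical earlier training step on the same 9 regression features
rng = np.random.default_rng(0)
X_train = rng.normal(size=(200, 9))
y_train = rng.normal(size=200)
fitted = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)

delay, status = prediction2(param=fitted.get_params(),
                            estimators=fitted.estimators_,
                            origin='Houston',
                            destination='New York',
                            carrier='UA',
                            month=9,
                            weekday=1)
print(f"{status}: {delay} minutes")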