예제 #1
0
def combine_predictions() -> None:
    """
    Merge the duration and direction prediction files into one csv on disk.
    :return: None
    """
    base = get_data_path()

    # Duration predictions form the left part of the combined frame.
    try:
        duration_df = pd.read_csv(
            os.path.join(base, 'output/duration_predictions.csv'))
    except Exception as e:
        print('Data from duration prediction could not be loaded.')
        raise e

    # Direction predictions contribute only their 'direction' column.
    try:
        direction_df = pd.read_csv(
            os.path.join(base, 'output/direction_predictions.csv'))
    except Exception as e:
        print('Data from direction prediction could not be loaded.')
        raise e

    # Column-wise concatenation, then persist the result.
    combined = pd.concat([duration_df, direction_df['direction']], axis=1)
    combined.to_csv(os.path.join(base, 'output/final_predictions.csv'))
    print('Prediction data was combined and saved to disc.')
예제 #2
0
def execute_geo_filtering(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Drop every trip whose geometry lies outside the Mannheim city boundary.
    :param gdf: GeoDataFrame of trips with point geometries
    :return: The filtered GeoDataFrame
    """
    boundary_path = os.path.join(get_data_path(),
                                 'input/mannheim_boundary.geojson')
    boundary = gpd.read_file(boundary_path, crs='EPSG:4326')
    # shapely's within() against the single boundary polygon is faster than
    # geopandas' spatial join
    inside = gdf.within(boundary['geometry'][0])
    return gdf[inside]
예제 #3
0
def train(whatmodel, resolution):
    """
    This command allows for training several machine-learning models for different scopes on preprocessed Nextbike data.
    A trips-indexed Nextbike file for the city of Bremen must exist in data/processed/.

    Duration, direction and demand models are available.
    Models are saved after training as pre-trained models in pickle format under /models/.

    :param whatmodel: Which model family to train ('duration', 'direction' or 'demand')
    :param resolution: Temporal resolution for the demand model (hours); required for 'demand'
    """

    # The models can only be trained on the preprocessed trips file.
    if not os.path.isfile(os.path.join(get_data_path(), 'processed/bremen.csv')):
        click.echo(
            'Could not find /data/processed/bremen.csv - please run preprocessing first using " nextbike transform".', err=True)
        # Exit with a non-zero status so shells/scripts can detect the failure
        # (the original exited with 0 despite printing an error).
        sys.exit(1)

    m = Model('bremen.csv')
    if whatmodel == 'duration':
        m.train_duration()
    elif whatmodel == 'direction':
        m.train_direction_uni()
        m.train_direction_main_station()
    elif whatmodel == 'demand':
        if resolution is None:
            click.echo(
                'No temporal resolution defined, please specify using the -t/--resolution parameter.')
            # Missing required parameter is also an error condition.
            sys.exit(1)
        else:
            # Build a pandas offset alias, e.g. '1' -> '1H' (hourly).
            m.train_demand(resolution + 'H')
예제 #4
0
def train(filename):
    """
    Trains a model based on a given data frame and saves it to disk at {project_dir}/data/output
    :param filename: Path to the data frame which should be used for training
    :return: None
    """
    with yaspin(color='blue') as spinner:
        def _fit(model, status):
            # Load the transformed data into the model, fit it, predict on
            # the training data and report the in-sample score.
            spinner.text = status
            model.load_from_transformer(transformer, training=True)
            model.train()
            model.predict()
            model.training_score()

        spinner.text = 'Conducting Pre-Processing and Transformation steps ...\t'
        preprocessor = Preprocessor()
        preprocessor.load_gdf(filename)
        preprocessor.clean_gdf()
        transformer = Transformer(preprocessor)
        transformer.transform()
        _fit(DurationModel(), 'Training duration model ...\t')
        _fit(DirectionModel(), 'Training direction model ...\t')
        spinner.text = 'Models trained and saved to disk at {}.'.format(
            os.path.join(get_data_path(), 'output'))
        spinner.ok('✅ ')
예제 #5
0
 def __init__(self, filename, refresh=False):
     """
     Load the raw csv file and the Bremen zip-code shapes for later use.
     :param filename: Name of the raw csv file inside data/raw/
     :param refresh: Flag stored on the instance; presumably forces a rebuild
         of cached data elsewhere in the class — TODO confirm against callers
     """
     self._refresh = refresh
     self._filename = filename
     # Human-readable name, e.g. 'bremen.csv' -> 'bremen'
     self._prettyfilename = filename.replace('.csv', '')
     self._datapath = get_data_path()
     self._raw = io.read_file(path=os.path.join(self._datapath, 'raw',
                                                filename),
                              datetime_cols=['datetime'])
     # Use os.path.join here as well, for consistency with the raw-file path
     # above (the original concatenated strings with '/').
     self.plz_df = gpd.read_file(
         os.path.join(self._datapath, 'external/plz_bremen.geojson'))
예제 #6
0
def save_model(model, type: str = 'regressor') -> None:
    """
    Method for saving trained models to disc.
    :param model: A trained model instance
    :param type: A string representing if type of model is related to duration
        ('regressor'), false booking ('booking_filter') or direction
        ('classifier') prediction
    :return: None
    """
    # NOTE: 'type' shadows the builtin but is kept for backward compatibility.
    targets = {
        'regressor': 'output/duration.pkl',
        'booking_filter': 'output/booking_filter.pkl',
        'classifier': 'output/direction.pkl',
    }
    target = targets.get(type)
    if target is None:
        # Unknown type: keep the original silent no-op behaviour.
        return
    # Context manager closes the handle even if pickle.dump raises
    # (the original open() calls leaked the file handles).
    with open(os.path.join(get_data_path(), target), 'wb') as f:
        pickle.dump(model, f)
예제 #7
0
 def _geo_filter_mannheim_trips(self) -> None:
     """
     Removes all trips which are geographically outside of Mannheim
     :return: None
     """
     # Load the GeoJSON boundary of Mannheim
     boundary_file = os.path.join(get_data_path(),
                                  'input/mannheim_boundary.geojson')
     boundary = gpd.read_file(boundary_file, crs='EPSG:4326')
     # shapely's within() on the single boundary polygon is faster than
     # geopandas' spatial join
     mask = self._gdf.within(boundary['geometry'][0])
     self._gdf = self._gdf[mask]
예제 #8
0
def save_encoder(encoder: LabelEncoder, type: str = 'label') -> None:
    """
    Persist a fitted encoder object for later use.
    :param encoder: The encoder object that was fit and used to transform target features in classification
    :param type: Which encoder is being saved ('label', 'season' or 'station')
    :return: None
    """
    path = os.path.join(get_data_path(), 'output')
    # Map encoder kind to its file name; unknown kinds are silently ignored,
    # matching the original if/elif chain.
    targets = {
        'label': 'classes.joblib',
        'season': 'season.joblib',
        'station': 'station.joblib',
    }
    if type in targets:
        joblib.dump(encoder, os.path.join(path, targets[type]))
예제 #9
0
 def save(self, filename: str = 'mannheim_transformed.csv') -> None:
     """
     Writes the transformed GeoDataFrame to disk as a csv file.
     :param filename: Name of the csv file created in the output directory
     :return: None
     :raises: UserWarning
     """
     gdf = self.__gdf
     # Guard against saving before transform() populated the frame.
     if gdf is None:
         raise UserWarning(
             'Attempting to save an empty data set. Did you transform it before?'
         )
     out_dir = os.path.join(get_data_path(), 'output')
     create_dir_if_not_exists(out_dir)
     target = os.path.join(out_dir, filename)
     gdf.to_csv(target, index=False)
예제 #10
0
def save_predictions(predicted_data: pd.DataFrame,
                     type: str = 'regressor') -> None:
    """
    Persist a DataFrame holding the raw data together with its predictions.
    :param predicted_data: A DataFrame containing raw data and predictions
    :param type: A string representing if type of model is related to duration, false booking or direction prediction
    :return: None
    """
    path = os.path.join(get_data_path(), 'output')
    create_dir_if_not_exists(path)
    # Map model kind to its output file; unknown kinds are silently ignored,
    # matching the original if/elif chain.
    filenames = {
        'regressor': 'duration_predictions.csv',
        'classifier': 'direction_predictions.csv',
    }
    if type in filenames:
        predicted_data.to_csv(os.path.join(path, filenames[type]),
                              index=False)
예제 #11
0
 def load_gdf(self, path: str = None) -> None:
     """
     Reads the raw DataFrame, transforms it to a GeoDataFrame and initializes the __gdf property.
     :param path: Optional path to the .csv file; defaults to input/mannheim.csv
     :return: None
     """
     # Both branches of the original used identical read arguments, so only
     # the csv path needs to be chosen conditionally.
     csv_path = path if path else os.path.join(get_data_path(),
                                               'input/mannheim.csv')
     df = read_df(csv_path, index_col=0, parse_dates=['datetime'])
     validate_input(df)
     # Build point geometries from the pickup coordinates (EPSG:4326).
     points = gpd.points_from_xy(df['p_lng'], df['p_lat'])
     self._gdf = gpd.GeoDataFrame(df, crs='EPSG:4326', geometry=points)
예제 #12
0
def predict(filename):
    """
    Predicts the duration of the trips specified in the given data frame and saves them to disk at
    {project_dir}/data/output
    :param filename: Path to the data frame which should be used for prediction
    :return: None
    """
    with yaspin(color='blue') as spinner:
        def _run(model, status):
            # Feed the transformed data into the pre-trained model and
            # persist its predictions.
            spinner.text = status
            model.load_from_transformer(transformer, training=False)
            model.predict(save=True)

        spinner.text = 'Conducting Pre-Processing and Transformation steps ...\t'
        preprocessor = Preprocessor()
        preprocessor.load_gdf(filename)
        preprocessor.clean_gdf()
        transformer = Transformer(preprocessor)
        transformer.transform()
        _run(DurationModel(), 'Performing duration prediction ...\t')
        _run(DirectionModel(), 'Performing direction prediction ...\t')
        spinner.text = 'Predictions performed and saved to disk at {}.'.format(os.path.join(get_data_path(), 'output'))
        spinner.ok('✅ ')
예제 #13
0
 def __init__(self, filename):
     """
     Remember the csv file name and resolve the project data directory.
     :param filename: Name of the csv file to operate on
     """
     self._filename = filename
     self._datapath = get_data_path()