def combine_predictions() -> None:
    """
    Method combining duration and direction prediction and saving the file to disc.

    Loads both prediction csv files from the output directory, appends the
    predicted 'direction' column to the duration predictions and writes the
    combined frame back to the output directory.

    :return: None
    """
    base_path = get_data_path()

    # Load the duration prediction output, failing loudly if it is missing.
    try:
        duration_predictions = pd.read_csv(
            os.path.join(base_path, 'output/duration_predictions.csv'))
    except Exception as e:
        print('Data from duration prediction could not be loaded.')
        raise e

    # Load the direction prediction output, failing loudly if it is missing.
    try:
        direction_predictions = pd.read_csv(
            os.path.join(base_path, 'output/direction_predictions.csv'))
    except Exception as e:
        print('Data from direction prediction could not be loaded.')
        raise e

    # Column-wise concat: attach only the predicted direction column to the
    # duration frame, then persist the result.
    combined = pd.concat(
        [duration_predictions, direction_predictions['direction']], axis=1)
    combined.to_csv(os.path.join(base_path, 'output/final_predictions.csv'))
    print('Prediction data was combined and saved to disc.')
def execute_geo_filtering(gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Removes all trips which are geographically outside of Mannheim.

    :param gdf: GeoDataFrame of trips with point geometries
    :return: GeoDataFrame containing only the trips within the Mannheim boundary
    """
    # Load the GeoJSON boundary of Mannheim
    mannheim_boundary_gdf = gpd.read_file(os.path.join(
        get_data_path(), 'input/mannheim_boundary.geojson'), crs='EPSG:4326')
    # Remove all trips which are not within Mannheim (using native shapely is faster than geopandas' spatial join).
    # Select the boundary polygon positionally via .iloc[0]: the original
    # ['geometry'][0] is a label lookup that raises KeyError if the boundary
    # file is ever re-indexed.
    return gdf[gdf.within(mannheim_boundary_gdf.geometry.iloc[0])]
def train(whatmodel, resolution):
    """
    This command allows for training several machine-learning models for different scopes on preprocessed Nextbike data.
    A trips-indexed Nextbike file for the city of Bremen must exist in data/processed/.
    Duration, direction and demand models are available.
    Models are saved after training as pre-trained models in pickle format under /models/.
    """
    # A preprocessed trips file is a hard prerequisite for any training run.
    if not os.path.isfile(os.path.join(get_data_path(), 'processed/bremen.csv')):
        click.echo(
            'Could not find /data/processed/bremen.csv - please run preprocessing first using " nextbike transform".',
            err=True)
        # Exit non-zero so shell scripts/CI can detect the failure
        # (previously exited 0 despite reporting an error to stderr).
        sys.exit(1)
    m = Model('bremen.csv')
    if whatmodel == 'duration':
        m.train_duration()
    elif whatmodel == 'direction':
        # Direction is covered by two separate classifiers.
        m.train_direction_uni()
        m.train_direction_main_station()
    elif whatmodel == 'demand':
        if resolution is None:
            click.echo(
                'No temporal resolution defined, please specify using the -t/--resolution parameter.')
            # Missing required parameter is also an error condition.
            sys.exit(1)
        else:
            # Resolution is given in hours; build a pandas offset alias like '6H'.
            m.train_demand(resolution + 'H')
def train(filename):
    """
    Trains a model based on a given data frame and saves it to disk at {project_dir}/data/output

    :param filename: Path to the data frame which should be used for training
    :return: None
    """
    with yaspin(color='blue') as spinner:
        spinner.text = 'Conducting Pre-Processing and Transformation steps ...\t'
        # Prepare the data once; both models train on the same transformer.
        pre = Preprocessor()
        pre.load_gdf(filename)
        pre.clean_gdf()
        trans = Transformer(pre)
        trans.transform()

        spinner.text = 'Training duration model ...\t'
        dur_model = DurationModel()
        dur_model.load_from_transformer(trans, training=True)
        dur_model.train()
        dur_model.predict()
        dur_model.training_score()

        spinner.text = 'Training direction model ...\t'
        dir_model = DirectionModel()
        dir_model.load_from_transformer(trans, training=True)
        dir_model.train()
        dir_model.predict()
        dir_model.training_score()

        spinner.text = 'Models trained and saved to disk at {}.'.format(
            os.path.join(get_data_path(), 'output'))
        spinner.ok('✅ ')
def __init__(self, filename, refresh=False):
    """
    Loads the raw trip file and the Bremen zip-code shapes for this instance.

    :param filename: Name of the raw csv file inside data/raw/
    :param refresh: Flag stored on the instance; presumably controls cache
        rebuilding in other methods — confirm against the rest of the class.
    """
    self._refresh = refresh
    self._filename = filename
    # Display name without the csv extension.
    self._prettyfilename = filename.replace('.csv', '')
    self._datapath = get_data_path()
    self._raw = io.read_file(path=os.path.join(self._datapath, 'raw/' + filename),
                             datetime_cols=['datetime'])
    # Build the geojson path with os.path.join for consistency with the raw
    # path above, instead of raw string concatenation.
    self.plz_df = gpd.read_file(
        os.path.join(self._datapath, 'external/plz_bremen.geojson'))
def save_model(model, type: str = 'regressor') -> None:
    """
    Method for saving trained models to disc.

    :param model: A trained model instance
    :param type: A string representing if type of model is related to duration, false booking or direction prediction
    :return: None
    """
    # Map each model type to its pickle file; unknown types are silently
    # ignored, matching the original if/elif behaviour.
    targets = {
        'regressor': 'output/duration.pkl',
        'booking_filter': 'output/booking_filter.pkl',
        'classifier': 'output/direction.pkl',
    }
    target = targets.get(type)
    if target is not None:
        # Context manager guarantees the handle is closed even if pickling
        # fails (the original left files open after pickle.dump).
        with open(os.path.join(get_data_path(), target), 'wb') as f:
            pickle.dump(model, f)
def _geo_filter_mannheim_trips(self) -> None:
    """
    Removes all trips which are geographically outside of Mannheim

    :return: None
    """
    # Load the GeoJSON boundary of Mannheim
    mannheim_boundary_gdf = gpd.read_file(os.path.join(
        get_data_path(), 'input/mannheim_boundary.geojson'), crs='EPSG:4326')
    # Remove all trips which are not within Mannheim (using native shapely is faster than geopandas' spatial join).
    # Select the boundary polygon positionally via .iloc[0]: the original
    # ['geometry'][0] is a label lookup that raises KeyError if the boundary
    # file is ever re-indexed.
    self._gdf = self._gdf[self._gdf.within(
        mannheim_boundary_gdf.geometry.iloc[0])]
def save_encoder(encoder: LabelEncoder, type: str = 'label') -> None:
    """
    Method to save a fitted encoder object for later use.

    Fixes the original docstring ("Mehthod") and documents the `type`
    parameter, which was missing.

    :param encoder: The encoder object that was fit and used to transform target features in classification
    :param type: Which encoder is being saved ('label', 'season' or 'station');
        selects the target joblib file name. Other values are silently ignored.
    :return: None
    """
    path = os.path.join(get_data_path(), 'output')
    if type == 'label':
        joblib.dump(encoder, os.path.join(path, 'classes.joblib'))
    elif type == 'season':
        joblib.dump(encoder, os.path.join(path, 'season.joblib'))
    elif type == 'station':
        joblib.dump(encoder, os.path.join(path, 'station.joblib'))
def save(self, filename: str = 'mannheim_transformed.csv') -> None:
    """
    Saves the transformed GeoDataFrame as csv-file to the disk.

    :param filename: Target file name inside the output directory
    :return: None
    :raises: UserWarning
    """
    # Guard: refuse to persist a data set that was never transformed.
    if self.__gdf is None:
        raise UserWarning(
            'Attempting to save an empty data set. Did you transform it before?'
        )
    out_dir = os.path.join(get_data_path(), 'output')
    create_dir_if_not_exists(out_dir)
    target = os.path.join(out_dir, filename)
    self.__gdf.to_csv(target, index=False)
def save_predictions(predicted_data: pd.DataFrame, type: str = 'regressor') -> None:
    """
    Method that saves DataFrames containing the raw data as well as predictions

    :param predicted_data: A DataFrame containing raw data and predictions
    :param type: A string representing if type of model is related to duration, false booking or direction prediction
    :return: None
    """
    out_dir = os.path.join(get_data_path(), 'output')
    create_dir_if_not_exists(out_dir)
    # Choose the csv file matching the model type; any other type is a no-op,
    # exactly like the original if/elif chain.
    if type == 'regressor':
        target = 'duration_predictions.csv'
    elif type == 'classifier':
        target = 'direction_predictions.csv'
    else:
        return
    predicted_data.to_csv(os.path.join(out_dir, target), index=False)
def load_gdf(self, path: str = None) -> None:
    """
    Reads the raw DataFrame, transforms it to a GeoDataFrame and initializes the _gdf property.

    :param path: A path that points to the .csv file; falls back to the
        default input/mannheim.csv when omitted
    :return: None
    """
    # Resolve the source file first, then read it with a single call.
    source = path if path else os.path.join(get_data_path(), 'input/mannheim.csv')
    df = read_df(source, index_col=0, parse_dates=['datetime'])
    validate_input(df)
    # Build point geometries from the start-position coordinates.
    self._gdf = gpd.GeoDataFrame(df,
                                 crs='EPSG:4326',
                                 geometry=gpd.points_from_xy(df['p_lng'],
                                                             df['p_lat']))
def predict(filename):
    """
    Predicts the duration of the trips specified in the given data frame and saves them to disk at {project_dir}/data/output

    :param filename: Path to the data frame which should be used for prediction
    :return: None
    """
    with yaspin(color='blue') as spinner:
        spinner.text = 'Conducting Pre-Processing and Transformation steps ...\t'
        # Prepare the data once; both predictors consume the same transformer.
        pre = Preprocessor()
        pre.load_gdf(filename)
        pre.clean_gdf()
        trans = Transformer(pre)
        trans.transform()

        spinner.text = 'Performing duration prediction ...\t'
        dur_predictor = DurationModel()
        dur_predictor.load_from_transformer(trans, training=False)
        dur_predictor.predict(save=True)

        spinner.text = 'Performing direction prediction ...\t'
        dir_predictor = DirectionModel()
        dir_predictor.load_from_transformer(trans, training=False)
        dir_predictor.predict(save=True)

        spinner.text = 'Predictions performed and saved to disk at {}.'.format(
            os.path.join(get_data_path(), 'output'))
        spinner.ok('✅ ')
def __init__(self, filename):
    """
    Stores the file name this instance works on and the project data path.

    :param filename: Name of the csv file to operate on
    """
    self._filename = filename
    self._datapath = get_data_path()