def cli(sys_argv: List[str]):
    """Command line interface to merge two datasets

    Both inputs are loaded with `file_utils.pickle2dataframe`, the rows of the
    second dataset are stacked below the rows of the first one (original index
    labels are kept) and the result is written with
    `file_utils.dataframe2pickle`.

    :param sys_argv: list of command line arguments
    """
    # Function-scope import so this block does not depend on a module-level
    # `pd` alias. `DataFrame.append` was removed in pandas 2.0; `pd.concat`
    # is its documented replacement and yields the same result.
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument('path_to_dataset_1',
                        type=str,
                        help='Path to a pickled dataset')
    parser.add_argument('path_to_dataset_2',
                        type=str,
                        help='Path a pickled dataset')
    parser.add_argument('--out_path',
                        type=str,
                        help='Path to save the merged dataset')
    args = parser.parse_args(sys_argv)

    dataset1 = file_utils.pickle2dataframe(args.path_to_dataset_1)
    dataset2 = file_utils.pickle2dataframe(args.path_to_dataset_2)

    merged_dataset = pd.concat([dataset1, dataset2])

    # NOTE(review): `--out_path` is optional and has no default, so
    # `args.out_path` is None when the flag is omitted -- confirm that
    # `dataframe2pickle` copes with that.
    file_utils.dataframe2pickle(merged_dataset, args.out_path)
def load_data(self, path, transform_numerical, transform_categorical):
    """Load a pickled data set, keep the dataset features and transform them

    :param path: Path to the pickled data, read with
        `file_utils.pickle2dataframe`
    :param transform_numerical: optional callable applied to the selected
        data; skipped when falsy
    :param transform_categorical: optional callable applied after the
        numerical transform; skipped when falsy
    :return: the columns listed in `self.all_features`, after the requested
        transforms have been applied
    """
    raw_data = file_utils.pickle2dataframe(path)
    data = raw_data[self.all_features]

    # Each step consumes the result of the previous one. The transforms used
    # to be applied to `raw_data`, which silently discarded the feature
    # selection above and, when both transforms were given, the output of the
    # numerical transform as well.
    # NOTE(review): assumes each transform returns a frame that still holds
    # the columns the next step needs -- confirm against the preprocessors.
    if transform_numerical:
        data = transform_numerical(data)
    if transform_categorical:
        data = transform_categorical(data)
    return data
def get_categorical_processor(data_path: str,
                              features: List[str],
                              save_path: str = None) -> Preprocessor:
    """Fit a CategoricalPreprocessor on a data set stored as a pickle

    :param data_path: Path to the data on which the preprocessor will be fitted
    :param features: List of categorical features to preprocess
    :param save_path: Path where to save the parameter of the preprocessor;
        nothing is saved when it is empty or None
    :return: fitted categorical preprocessor
    """
    full_frame = file_utils.pickle2dataframe(data_path)

    # Only the requested columns are used, and rows with a missing value in
    # any of them are left out of the fit.
    selected = full_frame[features]
    complete_rows = selected.dropna()

    processor = CategoricalPreprocessor()
    processor.fit(complete_rows)

    if not save_path:
        return processor

    processor.save(save_path)
    return processor
def __init__(self,
             path,
             numerical_features=None,
             categorical_features=None,
             output_features=None,
             transform_numerical=None,
             transform_categorical=None):
    """Store the configuration, then load the raw and the processed data

    :param path: Path to the pickled data set
    :param numerical_features: numerical feature names, stored as given
    :param categorical_features: categorical feature names, stored as given
    :param output_features: output feature names, stored as given
    :param transform_numerical: forwarded to `load_data`
    :param transform_categorical: forwarded to `load_data`
    """
    # Transforms
    self.transform_categorical = transform_categorical
    self.transform_numerical = transform_numerical

    # Feature groups
    self.output_features = output_features
    self.categorical_features = categorical_features
    self.numerical_features = numerical_features

    # Must be set before `load_data` runs.
    self.all_features = self.get_all_features(
        numerical_features,
        categorical_features,
        output_features,
    )

    # Untouched copy of the pickled data.
    self.raw_data = file_utils.pickle2dataframe(path)

    # NOTE(review): `path` is handed to `load_data` as well; if that method
    # reads the pickle itself the file is loaded twice -- confirm.
    self.data = self.load_data(
        path,
        transform_numerical,
        transform_categorical,
    )
def cli(sys_argv: List[str]):
    """Split the full data set into train / valid / test pickles

    The data at `FULL_DATA_PATH` is restricted to rows with a strictly
    positive `PX_LAST` and a `ValuationDate` in 2017. The columns
    `LOG_PX_LAST`, `ValuationYear` and `ValuationMonth` are added. Rows without
    any missing value are split by valuation month (1-8 train, 9-10 valid,
    11-12 test). Rows of the training months that have at least one missing
    value are saved separately.

    :param sys_argv: list of command line arguments (not used by this command)
    """
    data = file_utils.pickle2dataframe(FULL_DATA_PATH)

    # Explicit copy: columns are added below, and writing into the result of
    # a filter triggers pandas' SettingWithCopyWarning.
    data = data[data.PX_LAST > 0.].copy()
    data['LOG_PX_LAST'] = np.log(data.PX_LAST)

    valuation_dates = data['ValuationDate']
    data['ValuationYear'] = [date.year for date in valuation_dates]
    data['ValuationMonth'] = [date.month for date in valuation_dates]

    data = data[data.ValuationYear == 2017]

    train_months = list(range(1, 9))
    valid_months = [9, 10]
    test_months = [11, 12]

    # remove line with missing values
    data_complete = data.dropna()

    # split data
    train_data = data_complete[data_complete.ValuationMonth.isin(train_months)]
    valid_data = data_complete[data_complete.ValuationMonth.isin(valid_months)]
    test_data = data_complete[data_complete.ValuationMonth.isin(test_months)]

    # data_missing contains only lines with missing data. A row mask is the
    # exact complement of `dropna()` and, unlike a lookup by index label,
    # stays correct when index labels are duplicated.
    data_missing = data[data.isna().any(axis=1)]
    train_data_missing = data_missing[data_missing.ValuationMonth.isin(
        train_months)]

    # save the data to pickles
    file_utils.dataframe2pickle(train_data, constants.TRAIN_PATH)
    file_utils.dataframe2pickle(valid_data, constants.VALID_PATH)
    file_utils.dataframe2pickle(test_data, constants.TEST_PATH)
    file_utils.dataframe2pickle(train_data_missing,
                                constants.TRAIN_PATH_MISSING)
def get_numerical_processor(data_path: str,
                            features: List[str],
                            scale: Tuple[int, int],
                            apply_log: bool,
                            save_path: str = None) -> Preprocessor:
    """Fit a NumericalPreprocessor on a training set stored as a pickle

    :param data_path: Path to the data on which the preprocessor will be fitted
    :param features: List of numerical features to preprocess
    :param scale: See `preprocessors.NumericalPreprocessor` docstring
    :param apply_log: See `preprocessors.NumericalPreprocessor` docstring
    :param save_path: Path where to save the parameter of the preprocessor;
        nothing is saved when it is empty or None
    :return: fitted numerical preprocessor
    """
    # Only the requested columns take part in the fit.
    feature_frame = file_utils.pickle2dataframe(data_path)[features]

    processor = NumericalPreprocessor(scale, apply_log)
    processor.fit(feature_frame)

    if not save_path:
        return processor

    processor.save(save_path)
    return processor
def cli(sys_argv: List[str]):
    """Command line interface to visualize training losses or predictions

    Exactly one mode is run, `--mse_loss` taking precedence:

    * `--mse_loss` plots the `train_loss` / `valid_loss` series of the pickled
      writer given by `--writer_path`;
    * `--price_predictions` draws a scatter plot of the pickled predictions
      (`--pred`) against the `--target_feature` column of `--target_data`.

    The figure is saved at `<output_dir>/figure_<output_filename>`.

    :param sys_argv: list of command line arguments
    :raises SystemExit: when no mode is selected, or when an option required
        by the selected mode is missing
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--mse_loss', action='store_true')
    parser.add_argument(
        '--writer_path',
        type=str,
        default=None,
        help=
        'path to the pickled writer. Use this option if visualizing losses.')
    parser.add_argument('--price_predictions', action='store_true')
    parser.add_argument(
        '--target_data',
        type=str,
        default=None,
        help=
        "path to the pickled target data set. Use this option if visualizing predictions"
    )
    parser.add_argument(
        '--target_feature',
        type=str,
        default=None,
        help=
        "Name of the target feature. Use this option if visualizing predictions"
    )
    parser.add_argument(
        '--pred',
        type=str,
        default=None,
        help=
        "path to the pickled predictions. Use this option if visualizing predictions"
    )
    parser.add_argument('--output_dir',
                        type=str,
                        default=RESULTS_DIR,
                        help='Path to the directory where to save results')
    parser.add_argument('--output_filename',
                        type=str,
                        help='Name of the figure')
    args = parser.parse_args(sys_argv)

    save_path = '{}/figure_{}'.format(args.output_dir, args.output_filename)

    if args.mse_loss:
        # Fail with a usage message rather than an opaque error raised while
        # trying to load `None`.
        if args.writer_path is None:
            parser.error('`--mse_loss` requires `--writer_path`')

        writer: Dict[str,
                     List[float]] = file_utils.load_pickle(args.writer_path)
        print(' [visualize] Saving MSE figure at `{}`'.format(save_path))
        plot_utils.plot_train_valid_rmse_loss(writer['train_loss'],
                                              writer['valid_loss'], save_path)
        print(' [visualize] Done')
        return

    if args.price_predictions:
        required = (
            ('--target_data', args.target_data),
            ('--target_feature', args.target_feature),
            ('--pred', args.pred),
        )
        missing = [flag for flag, value in required if value is None]
        if missing:
            parser.error('`--price_predictions` requires {}'.format(
                ', '.join(missing)))

        dataframe = file_utils.pickle2dataframe(args.target_data)
        targets = dataframe[args.target_feature]
        predictions = file_utils.load_pickle(args.pred)
        plot_utils.scatter_plot_predictions(predictions, targets, save_path)
        return

    # `exit()` is injected by the `site` module and is not guaranteed to
    # exist (e.g. `python -S`); raising SystemExit directly has the same
    # effect without that dependency.
    raise SystemExit(
        'You need to specify `--mse_loss` or `--price_predictions` in command line arguments'
    )