def get_df(cols, load_from_temp, temp_path):
    """
    Build the train Dataframe (or reload it from a cached CSV) and load the
    weather Dataframe.

    Parameters
    ==========
    cols: list, names of the features to create through FeatureFactory and keep.
        'DATE', 'ASS_ASSIGNMENT' and 'CSPL_RECEIVED_CALLS' are always added to
        the selection. The caller's list is never modified.
    load_from_temp: bool, if true the train Dataframe is read back from
        `temp_path` instead of being recomputed.
    temp_path: str or None, path of the cached CSV. When computing, the result
        is written there unless it is None. Required when `load_from_temp`
        is true.

    Returns
    =======
    train_df: pd.Dataframe, the selected features (FeatureFactory.X), or the
        content of the cached CSV.
    weather_df: pd.Dataframe, the weather data with its index reset.

    Raises
    ======
    ValueError: if `load_from_temp` is true and `temp_path` is None.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`;
    # done first so that nothing is loaded when the arguments are inconsistent.
    if load_from_temp and temp_path is None:
        raise ValueError('temp_path must be provided when load_from_temp is true')

    logger.info('Loading weather Dataframe...')
    weather_df = load_weather_df(CONFIG.preprocessed_meteo_path_complete)

    if not load_from_temp:
        logger.info('Loading train Dataframe...')
        train_df = load_train_df(CONFIG.preprocessed_train_path_means)

        logger.info('Creating features...')
        ff = FeatureFactory(train_df, weather_df)
        # Work on a private copy: the mandatory columns added below must not
        # leak into the list owned by the caller.
        selected = list(cols)
        for col in selected:
            logger.info('Creating %s feature...', col)
            ff(col)

        # Same resulting order as before: DATE, ASS_ASSIGNMENT, <requested
        # columns>, CSPL_RECEIVED_CALLS (each only added if missing).
        if 'ASS_ASSIGNMENT' not in selected:
            selected.insert(0, 'ASS_ASSIGNMENT')
        if 'DATE' not in selected:
            selected.insert(0, 'DATE')
        if 'CSPL_RECEIVED_CALLS' not in selected:
            selected.append('CSPL_RECEIVED_CALLS')

        logger.info('Selecting features...')
        ff.select_features(selected)
        train_df = ff.X
        if temp_path is not None:
            # Cache the computed features so that a later call can use
            # load_from_temp=True.
            train_df.to_csv(temp_path)
    else:
        logger.info('Loading train Dataframe...')
        # index_col=0 reads back the index column written by to_csv above.
        train_df = pd.read_csv(temp_path, encoding='latin-1', index_col=0,
                               parse_dates=['DATE'])

    weather_df.reset_index(inplace=True)
    return train_df, weather_df
def get_cross_validation_parameters(df, columns, weather_df=None, k_fold=5, weights=None, out_path=None, label=None):
    """
    Prepare the data set, the targets and the splitter expected by the sklearn
    cross-validation helpers.

    Parameters
    ==========
    df: pd.Dataframe, the input data. Its index is reset in place (the old
        index is dropped). It must contain the `label` column and a 'MEAN'
        column.
    columns: list, the features to be computed by FeatureFactory and kept.
    weather_df: pd.Dataframe, the weather data handed to FeatureFactory.
    k_fold: number of iterations of the splitter; each test set has size 1/k_fold.
    weights: dict, each column will be multiplied by its corresponding weight
        (e.g. impact on KNN). None means no weights (empty dict).
    out_path: str or None, path to a CSV file in which the feature table is
        saved, to avoid recomputing features. None not to save this file.
    label: str, name of the 'calls' column (CSPL_RECEIVED_CALLS or prediction).
        None means 'CSPL_RECEIVED_CALLS'.

    Returns
    =======
    X: np.array, the selected and weighted features.
    y: the `label` column minus the 'MEAN' column.
        NOTE(review): the subtraction has a Dataframe column on its right-hand
        side, so this is presumably a pandas Series rather than a plain
        np.array -- confirm before relying on either type.
    cv: DateShuffleSplit, to be used as a cross-validation generator.
    dates: np.array, the 'full_date' feature for every row of the data set.
    """
    label = 'CSPL_RECEIVED_CALLS' if label is None else label
    weights = {} if weights is None else weights

    # Drop whatever index the caller had; this modifies `df` itself.
    df.reset_index(inplace=True, drop=True)
    factory = FeatureFactory(df, weather_df)

    # Dates are computed first, before the targets are read from df.
    dates = np.array(factory('full_date'))
    raw_labels = np.array(df[label])
    y = raw_labels - df['MEAN']

    for feature_name in columns:
        logger.debug(feature_name)
        factory(feature_name)
    factory.select_features(columns)
    factory.apply_weights(weights)

    X = np.array(factory.X)
    if out_path is not None:
        factory.X.to_csv(out_path)

    test_fraction = float(1) / k_fold
    cv = DateShuffleSplit(dates, n_iter=k_fold, test_size=test_fraction)
    return X, y, cv, dates