def regex(self, df):
    """Apply regex-based cleanup to the string rows.

    Args:
    -----
    df (pandas.DataFrame): data to clean

    Return:
    -------
    pandas.DataFrame with reformatted columns
    """
    logger.log("Apply regex on the string rows", logging.INFO)
    formatted = formatCols(df)
    print(formatted.shape)
    logger.log("New dtypes is {}".format(formatted.dtypes), logging.DEBUG)
    return formatted
def date_cols(self, df):
    """Create new feature columns derived from the datetime columns.

    Args:
    -----
    df (pandas.DataFrame): data with datetime columns

    Return:
    -------
    pandas.DataFrame enriched with the new columns
    """
    logger.log("Build new columns thanks to the datetime columns", logging.INFO)
    enriched = buildColsFromDateCols(df)
    print(enriched.shape)
    return enriched
def read(self):
    """Load the csv file configured on this instance.

    Return:
    -------
    pandas.DataFrame read from self.filepath
    """
    logger.log("Reading the file {}".format(self.filepath), logging.INFO)
    loaded = readFile(self.filepath, self.encoding, self.sep, self.infer_datetime)
    print(loaded.shape)
    logger.log("Inital dtypes is {}".format(loaded.dtypes), logging.DEBUG)
    return loaded
def transformCols(self):
    """Transform every column listed in self.cols_transform.

    Complexity 3 * n (n = nb_cols).

    Return:
    -------
    pandas.DataFrame with the new columns
    """
    for column in self.cols_transform:
        logger.log("Transform {}".format(column), logging.DEBUG)
        self.transformCol(column)
def derivateCols(self):
    """Derivate every ordered pair of distinct columns.

    Complexity n * (n - 1) (n = nb_cols).

    Return:
    -------
    pandas.DataFrame with all new columns derivate
    """
    # Replaces the product + "col1 != col2" filter hack the original
    # flagged as trash: permutations yields exactly the ordered pairs of
    # two different entries (assumes column names are unique — true for
    # a DataFrame's columns).
    for col1, col2 in itertools.permutations(self.cols_transform, 2):
        logger.log("Derivate {} / {}".format(col1, col2), logging.DEBUG)
        self.derivateCol(col1, col2)
def fill(self):
    """Fill the NaN values of the dataframe using a clustering step.

    Return:
    -------
    pandas.DataFrame filled
    """
    working = self.dataframe.copy()
    optimal = self.computeOptimalCluster(self.wcss(working))
    logger.log("Optimal nb of cluster is: {}".format(optimal), logging.DEBUG)
    working["Cluster"] = self.clustering(working, optimal)
    return self.fillCols(working)
def fillCols(self, dataframe):
    """Fill NaN for every column in self.cols, then drop the helper
    "Cluster" column added by the clustering step.

    Args:
    -----
    dataframe (pandas.DataFrame): data

    Return:
    -------
    pandas.DataFrame with new value instead of NaN
    """
    for column in self.cols:
        logger.log("Filling NaN, column: {}".format(column), logging.DEBUG)
        dataframe[column] = self.fillCol(dataframe, column)
    return dataframe.drop("Cluster", axis=1)
def scaleCols(self, dataframe):
    """Scale every column of the dataframe in place.

    Args:
    -----
    dataframe (pandas.DataFrame): data

    Return:
    -------
    pandas.DataFrame with columns scaled
    """
    for column in dataframe.columns:
        logger.log("Scale column {}".format(column), logging.DEBUG)
        dataframe[column] = self.scaleCol(dataframe[column], column)
    return dataframe
def readFile(filepath, encoding="utf-8-sig", sep=",", infer_datetime=True):
    """Read a csv file.

    Args:
    -----
    filepath (str): the path of the data file
    encoding (str): the encoding type
    sep (char): the delimiter
    infer_datetime (bool): try to parse every column as a datetime

    Return:
    -------
    pandas.DataFrame with data
    """
    def getColumns(dargs):
        """Return the column names, reading only the first 5 rows."""
        dargs.update({"nrows": 5})
        return list(pd.read_csv(**dargs).columns)

    dargs = {
        "encoding": encoding,
        "sep": sep,
        "engine": "python",
        "filepath_or_buffer": filepath,
    }
    # BUG FIX: both log messages were missing the "{}" placeholder, so the
    # formatted value never appeared in the log output.
    logger.log("Read csv file: {}".format(filepath), logging.DEBUG)
    columns = getColumns(dargs)
    del dargs["nrows"]
    if infer_datetime:
        dargs.update({
            "parse_dates": columns,
            "infer_datetime_format": infer_datetime,
        })
    logger.log("args: {}".format(str(dargs)), logging.DEBUG)
    return pd.read_csv(**dargs)
def featurize(self):
    """Build new features.

    Return:
    -------
    pandas.DataFrame with all the new features
    """
    if not self.transformations:
        logger.log("We won't transform features", logging.WARNING)
    else:
        self.transformCols()
    if not self.derivate:
        logger.log("We won't derivate features", logging.WARNING)
    else:
        self.derivateCols()
    return self.dataframe
def scaleCol(self, serie, col):
    """Scale the serie with a StandardScaler, falling back to the raw
    serie when scaling fails (e.g. non-numeric data).

    Args:
    -----
    serie (pandas.Serie): serie to scale
    col (str): column name (kept for interface compatibility)

    Return:
    -------
    pandas.Serie scaled, or the original serie on failure
    """
    # FIX: scope the warning suppression to this call instead of mutating
    # the process-wide warning filters for the rest of the run.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        try:
            return StandardScaler().fit_transform(serie)
        except Exception as e:
            # Deliberate best-effort: log and return the column unscaled.
            logger.log("{}".format(e), logging.ERROR)
            return serie
def dummiefication(self, dataframe):
    """Transform categoric variables into dummies.

    Args:
    -----
    dataframe (pandas.DataFrame): data

    Return:
    -------
    pandas.DataFrame with new columns based on categoric values
    """
    categoric = dataframe.loc[:, dataframe.dtypes == object]
    for col in categoric.columns:
        logger.log("Dumify column {}".format(col), logging.DEBUG)
        # Only low-cardinality columns (< 10 distinct values) are dummified.
        if len(dataframe[col].unique()) < 10:
            dummies = pd.get_dummies(dataframe[col], drop_first=True, prefix=col)
            dataframe = pd.concat([dummies, dataframe], axis=1)
    return dataframe
def GetX_Y(df, col_y, col_to_remove=None):
    """Select X and y dataframes.

    Args:
    -----
    df (pandas.DataFrame): the datas
    col_y (str): col to predict
    col_to_remove (list): columns you don't want to use

    Returns:
    -------
    pandas.DataFrame X and y
    """
    # BUG FIX: the original used a mutable default argument ([]), which is
    # shared across calls; use the None sentinel idiom instead.
    if col_to_remove is None:
        col_to_remove = []
    y = df[[col_y]]
    X = df.drop([col_y], axis=1)
    for col in col_to_remove:
        if col in X:
            X.drop([col], axis=1, inplace=True)
            logger.log("Remove column {}".format(col), logging.DEBUG)
        else:
            logger.log("Col {} not in the dataframe".format(col), logging.WARNING)
    return X, y
def split(self, df):
    """Split the dataframe into X/y and drop the unwanted columns.

    Args:
    -----
    df (pandas.DataFrame): data

    Return:
    -------
    tuple of (X, y) pandas.DataFrames
    """
    logger.log("Split dataframe, and remove useless cols", logging.INFO)
    features, target = GetX_Y(df, self.y_col, self.col_to_drop)
    print(df.shape)
    return features, target
def feature(self, df, orginal_cols):
    """Run the feature-engineering step on the dataframe.

    Args:
    -----
    df (pandas.DataFrame): data
    orginal_cols (list): original column names (param name kept as-is
        for caller compatibility)

    Return:
    -------
    pandas.DataFrame with engineered features
    """
    logger.log("Make some feature engineering", logging.INFO)
    engineered = FeatureEngineering(df, cols=orginal_cols).featurize()
    print(engineered.shape)
    return engineered
def scale(self, df):
    """Scale the dataframe's columns.

    Args:
    -----
    df (pandas.DataFrame): data

    Return:
    -------
    pandas.DataFrame scaled
    """
    logger.log("Scale the data", logging.INFO)
    scaled = ScaleData(df).scale()
    print(scaled.shape)
    return scaled
def dummy(self, df):
    """Turn categoric variables into dummy columns.

    Args:
    -----
    df (pandas.DataFrame): data

    Return:
    -------
    pandas.DataFrame with dummy columns
    """
    logger.log("Dummify categoric variables", logging.INFO)
    dummified = Dummify(df).dummies()
    print(dummified.shape)
    return dummified
def fill(self, df):
    """Fill the NaN values of the dataframe.

    Args:
    -----
    df (pandas.DataFrame): data

    Return:
    -------
    pandas.DataFrame without NaN values
    """
    logger.log("Filling the NaN values", logging.INFO)
    filled = FillNaN(df).fill()
    print(filled.shape)
    return filled