Пример #1
0
    def transformCols(self):
        """Run ``transformCol`` on every entry of ``self.cols_transform``.

        Each column name is logged at DEBUG level right before it is handed
        to ``self.transformCol``.

            Return:
            -------
                None (the method has no return statement; any result is
                whatever ``self.transformCol`` does as a side effect)
        """
        for name in self.cols_transform:
            message = "Transform {}".format(name)
            logger.log(message, logging.DEBUG)
            self.transformCol(name)
Пример #2
0
def readFile(filepath,
             encoding="utf-8-sig",
             sep=",",
             infer_datetime=True,
             decimal=',',
             thousands='.'):
    """Load a csv file into a dataframe using the python parsing engine.

    The file is read twice: a first pass limited to 5 rows only serves to
    collect the column names, the second pass reads the whole file.

        Args:
        -----
            filepath (str): the path of the data file
            encoding (str): the encoding type
            sep (char): the delimiter
            infer_datetime (bool): when truthy, every column is passed to
                ``parse_dates`` and ``infer_datetime_format`` is forwarded
            decimal (char): character recognised as the decimal point
            thousands (char): character recognised as thousands separator

        Return:
        -------
            pandas.DataFrame with data
        """
    # NOTE(review): ``infer_datetime_format`` is reported as deprecated in
    # recent pandas releases - confirm against the pinned pandas version.
    read_kwargs = dict(
        encoding=encoding,
        sep=sep,
        decimal=decimal,
        engine="python",
        filepath_or_buffer=filepath,
        thousands=thousands,
    )
    logger.log("Read csv file: {}".format(filepath), logging.DEBUG)

    # Peek at the first rows only; the column names are all we need here.
    header_sample = pd.read_csv(nrows=5, **read_kwargs)
    columns = list(header_sample.columns)

    if infer_datetime:
        read_kwargs["parse_dates"] = columns
        read_kwargs["infer_datetime_format"] = infer_datetime
        logger.log("args: {}".format(str(read_kwargs)), logging.DEBUG)

    return pd.read_csv(**read_kwargs)
Пример #3
0
    def fill(self):
        """Cluster a copy of ``self.dataframe`` and fill it per cluster.

        Steps: copy the dataframe, pass the copy to ``self.wcss``, derive a
        cluster count with ``self.computeOptimalCluster``, store the result
        of ``self.clustering`` in a ``"Cluster"`` column, then delegate to
        ``self.fillCols``.

            Return:
            -------
                the value returned by ``self.fillCols`` for the working copy
        """
        working = self.dataframe.copy()
        nb_cluster = self.computeOptimalCluster(self.wcss(working))
        message = "Optimal nb of cluster is: {}".format(nb_cluster)
        logger.log(message, logging.DEBUG)
        labels = self.clustering(working, nb_cluster)
        working["Cluster"] = labels

        return self.fillCols(working)
Пример #4
0
    def fillCols(self, dataframe):
        """Replace each column of ``self.cols`` with its filled version.

        Every listed column is overwritten in ``dataframe`` itself with the
        result of ``self.fillCol``; the ``"Cluster"`` column is dropped from
        the returned frame.

            Args:
            -----
                dataframe (pandas.DataFrame): data, must hold a "Cluster"
                    column

            Return:
            -------
                pandas.DataFrame without the "Cluster" column
        """
        for name in self.cols:
            message = "Filling NaN, column: {}".format(name)
            logger.log(message, logging.DEBUG)
            filled = self.fillCol(dataframe, name)
            dataframe[name] = filled

        without_cluster = dataframe.drop("Cluster", axis=1)
        return without_cluster
Пример #5
0
    def derivateCols(self):
        """Call ``derivateCol`` for every ordered pair of distinct columns.

        Pairs are taken from ``self.cols_transform``; for n columns this
        visits n * (n - 1) ordered pairs, so both (a, b) and (b, a) are
        processed. Each pair is logged at DEBUG level first.

            Return:
            -------
                None (the method has no return statement; any result is
                whatever ``self.derivateCol`` does as a side effect)
        """
        # permutations() never pairs a position with itself, which replaces
        # the former full cartesian product followed by a filter.
        ordered_pairs = itertools.permutations(self.cols_transform, 2)
        for col1, col2 in ordered_pairs:
            # Guard kept for lists holding the same name twice: two equal
            # names sitting at different positions must still be skipped.
            if col1 == col2:
                continue
            logger.log("Derivate {} / {}".format(col1, col2),
                       logging.DEBUG)
            self.derivateCol(col1, col2)
Пример #6
0
    def featurize(self):
        """Run the enabled feature-building steps.

        ``self.transformCols`` runs when ``self.transformations`` is truthy
        and ``self.derivateCols`` runs when ``self.derivate`` is truthy; a
        WARNING is logged for each step that is skipped.

            Return:
            -------
                ``self.dataframe``
        """
        if not self.transformations:
            logger.log("We won't transform features", logging.WARNING)
        else:
            self.transformCols()

        if not self.derivate:
            logger.log("We won't derivate features", logging.WARNING)
        else:
            self.derivateCols()

        return self.dataframe
Пример #7
0
def formatCols(df):
    """Format every object column and cast it to float when possible.

    Each object-dtype column is mapped through ``formatStr``; the result is
    then tentatively cast to float. When the cast fails the formatted column
    is kept unchanged. ``df`` is modified in place.

        Arg:
        ----
            df (pandas.DataFrame): datas

        Return:
        -------
            pandas.DataFrame formatted (the same object as ``df``)
    """
    object_cols = df.select_dtypes(include=["object"]).columns
    for col in object_cols:
        logger.log("Format col: {}".format(col), logging.DEBUG)
        df[col] = df[col].map(formatStr)
        try:
            as_float = df[col].astype(float)
        except Exception:
            # Best effort on purpose: a column that cannot be cast simply
            # keeps its formatted values.
            continue
        # Assign only after a successful cast. Doing this in a ``finally``
        # clause would also run when an interrupt escapes the ``try`` and
        # would then write an unbound or stale value into the column.
        logger.log("Col {} has been cast into float".format(col),
                   logging.DEBUG)
        df[col] = as_float

    return df
Пример #8
0
def GetX_Y(df, col_y, col_to_remove=None):
    """Select X and y dataframes.

    ``y`` is the single-column frame ``df[[col_y]]``; ``X`` is ``df``
    without ``col_y`` and without every listed column that it contains.
    ``df`` itself is left untouched.

        Args:
        -----
            df (pandas.DataFrame): the datas
            col_y (str): col to predict
            col_to_remove (list): columns you don't want to use; None
                (the default) means no extra column is removed

        Returns:
        -------
            pandas.DataFrame X and y
    """
    # None replaces a mutable ``[]`` default so that no list object is
    # shared between calls.
    if col_to_remove is None:
        col_to_remove = ()

    y = df[[col_y]]
    X = df.drop([col_y], axis=1)
    for col in col_to_remove:
        if col in X:
            X = X.drop([col], axis=1)
            logger.log("Remove column {}".format(col), logging.DEBUG)
        else:
            # Unknown names (including col_y itself, already dropped) are
            # reported but do not stop the selection.
            logger.log("Col {} not in the dataframe".format(col),
                       logging.WARNING)

    return X, y