示例#1
0
                def pivot(df, columns, values, index=None):
                    """Pivot ``df`` with ``pd.pivot_table`` and flatten the result.

                    Each selector is resolved against ``df.columns`` through
                    ``get_cols``; a selector that resolves to ``None`` is
                    rejected.

                    Args:
                        df: DataFrame to reshape.
                        columns: Selector for the columns whose values become
                            the new column headers.
                        values: Selector for the columns to aggregate.
                        index: Selector for the row keys.
                            NOTE(review): defaults to ``None`` but is rejected
                            when ``get_cols`` yields ``None`` for it -- confirm
                            whether it is effectively mandatory.

                    Returns:
                        The pivoted DataFrame with its index reset to columns.

                    Raises:
                        ValueError: if any selector resolves to ``None``.
                        RuntimeError: if the pandas pivot itself fails.
                    """
                    idx = get_cols(index, df.columns)
                    if idx is None:
                        raise ValueError("No 'index' in pivot " + str(index))
                    cols = get_cols(columns, df.columns)
                    if cols is None:
                        raise ValueError("Bad parameter 'columns' in pivot " +
                                         str(columns))
                    vals = get_cols(values, df.columns)
                    if vals is None:
                        raise ValueError("Bad parameter 'values' in pivot " +
                                         str(values))

                    try:
                        df = pd.pivot_table(df,
                                            values=vals,
                                            index=idx,
                                            columns=cols)
                        df.reset_index(inplace=True)
                    except Exception as e:
                        # str(e) is required: concatenating the exception object
                        # itself raises TypeError and hides the real failure.
                        raise RuntimeError(
                            "Runtime Error processing pivot operation on Dataframe."
                            + str(e))

                    return df
示例#2
0
    def _process(self, _data):
        """Convert one ``(meta, df)`` pair to text lines and print them.

        The frame receives a ``ts`` column chosen by ``self.__ts['method']``,
        is indexed on it, and is passed to
        ``DataFrameClient()._convert_dataframe_to_lines`` together with the
        measurement name, the field columns and the tag columns.  The lines
        that come back are written to stdout.

        Args:
            _data: A ``(meta, df)`` pair.  ``meta`` is queried for the keys
                ``'ts'`` and ``'name'``; ``df`` is the DataFrame to export.

        Yields:
            ``None``, once, after the lines have been printed.

        Raises:
            ValueError: if the field columns, or the timestamp column of the
                ``TS_COLUMN`` method, cannot be resolved by ``get_cols``.
        """
        logging.info('Processing data at %s', self.__class__.__name__)
        meta, df = _data

        # process fields
        field_columns = get_cols(self.__fields, df.columns)
        if field_columns is None:
            message = "Bad parameter 'fields'" + str(self.__fields)
            logging.error(message)
            raise ValueError(message)

        # process tags
        tag_columns = get_cols(self.__tags, df.columns)

        # process timestamp
        if self.__ts['method'] == TS_FILE_MODIFIED and 'ts' in meta:
            df["ts"] = meta['ts']
        elif self.__ts['method'] == TS_FILE_NAME and 'name' in meta:
            # extract date time from filename
            # NOTE(review): .search() is invoked on the method constant
            # itself; presumably a compiled pattern from the ts settings was
            # intended -- confirm against the class constructor.
            s = self.__ts['method'].search(meta['name']).group(1)
            try:
                df["ts"] = datetime.strptime(s, self.__ts['format'])
            except ValueError as e:
                # NOTE(review): nothing assigns 'ts' on this path, so
                # set_index('ts') below fails unless the frame already
                # carries such a column.
                logging.error(e)

        elif self.__ts['method'] == TS_COLUMN:
            ts_column = get_cols(self.__ts['column'], df.columns)
            if ts_column is None:
                # Report the timestamp column setting that failed to resolve
                # (the tags setting was logged here before, by mistake).
                message = ("Bad parameter ts.column" +
                           str(self.__ts['column']))
                logging.error(message)
                raise ValueError(message)
            df.rename(columns={ts_column: 'ts'}, inplace=True)
            if self.__ts['format'] in ['D', 's', 'ms', 'us', 'ns']:
                # The format names an epoch unit rather than a strftime pattern.
                df['ts'] = pd.to_datetime(df['ts'], unit=self.__ts['format'])
            else:
                df['ts'] = pd.to_datetime(df['ts'], format=self.__ts['format'])
        else:
            logging.error(
                "Couldn't extract timestamp of measurement, defaulting to now")
            df['ts'] = datetime.now()

        df.set_index('ts', inplace=True)
        lines = DataFrameClient()._convert_dataframe_to_lines(
            df,
            self.__measurement,
            field_columns,
            tag_columns,
            numeric_precision=6)
        # print all lines
        print("\n".join(lines))
        logging.info("Exported %d records.", len(lines))
        yield None
示例#3
0
                def replace(df,
                            column=None,
                            new_column=None,
                            default=None,
                            regex=False,
                            remap=None):
                    """Remap the values of one column through ``DataFrame.replace``.

                    Args:
                        df: DataFrame to modify in place.
                        column: Selector for the source column, resolved with
                            ``get_cols``.
                        new_column: When given, the source column is copied to
                            this name and the copy is remapped; otherwise the
                            source column itself is remapped.
                        default: Only used with ``new_column``: value written
                            to rows whose remapped value still equals the
                            source value.
                        regex: Passed through to ``DataFrame.replace``.
                        remap: Mapping of old value to new value.  ``None``
                            (the default) is treated as an empty mapping.

                    Returns:
                        ``df``.  It is returned unchanged, after logging an
                        error, when ``column`` cannot be resolved.

                    Raises:
                        RuntimeError: if the pandas operations fail.
                    """
                    # A fresh dict per call instead of a shared mutable default.
                    if remap is None:
                        remap = {}
                    col = get_cols(column, df.columns)
                    if col is None:
                        logging.error(
                            "Bad or missing parameter 'column' = %s in map ",
                            str(column))
                        return df
                    inplace = new_column is None
                    try:
                        if not inplace:
                            df[new_column] = df[col]
                        else:
                            new_column = col
                        df.replace({new_column: remap},
                                   regex=regex,
                                   inplace=True)
                        if not inplace:
                            # Rows the remap left untouched get `default`.
                            df.loc[df[col] == df[new_column],
                                   new_column] = default

                    except Exception as e:
                        raise RuntimeError(
                            "Runtime Error processing replace operation on Dataframe."
                            + str(e))
                    return df
示例#4
0
                def split(df,
                          col,
                          new_cols,
                          sep=',',
                          collapse=None,
                          replace=True):
                    """Split a string column on ``sep`` into ``new_cols``.

                    Args:
                        df: DataFrame to modify in place.
                        col: Selector for the column to split, resolved with
                            ``get_cols``.
                        new_cols: Names for the columns receiving the pieces.
                        sep: Separator passed to ``Series.str.split``.
                        collapse: Required when a row yields more pieces than
                            ``new_cols`` has names.  For such a row the pieces
                            from position ``collapse - 1`` up to, but not
                            including, the last one are re-joined with ``sep``
                            into position ``collapse - 1``, and the last piece
                            is moved to position ``collapse``.
                        replace: When true, the source column is dropped.

                    Returns:
                        ``df``, or ``None`` when there are more pieces than
                        names and no ``collapse`` position is configured.

                    Raises:
                        ValueError: if ``col`` cannot be resolved.
                        RuntimeError: if the pandas operations fail.
                    """
                    # Keep the caller's selector intact so the error message
                    # can show what was actually requested.
                    source_col = get_cols(col, df.columns)
                    if source_col is None:
                        raise ValueError("Bad parameter 'col' in split " +
                                         str(col))

                    try:
                        n_new_cols = len(new_cols)
                        tmp = df[source_col].copy().str.split(sep, expand=True)
                        n_split_cols = len(tmp.columns)
                        if n_split_cols <= n_new_cols:
                            # Warn and truncate only on a real shortfall; an
                            # exact match is the expected case.
                            if n_split_cols < n_new_cols:
                                logging.warning(
                                    "Unexpected format, found only %d/%d columns. "
                                    "Will adjust to lowest",
                                    n_split_cols, n_new_cols)
                                new_cols = new_cols[:n_split_cols]
                            df[new_cols] = tmp
                        else:
                            if collapse is None:
                                logging.warning(
                                    "Unexpected format, found %d columns expected %d."
                                    "Abandoning .. please configure a collapse column",
                                    n_split_cols, n_new_cols)
                                return None

                            # re-collapse all columns starting from collapse to before last
                            tmp['n_cols'] = tmp.notnull().sum(axis=1)
                            cols = tmp.columns.tolist()
                            for n in range(n_new_cols, n_split_cols):
                                # Rows that produced exactly n + 1 pieces.
                                overflow = tmp.n_cols == n + 1
                                tmp.loc[overflow, cols[collapse - 1]] = (
                                    tmp.loc[overflow, cols[collapse - 1:n]]
                                    .apply(sep.join, axis=1))
                                tmp.loc[overflow,
                                        cols[collapse]] = tmp[cols[n]]

                            # remove trailing columns
                            tmp.drop(tmp.columns[n_new_cols:],
                                     axis=1,
                                     inplace=True)
                            df[new_cols] = tmp
                        if replace:
                            df.drop(source_col, axis=1, inplace=True)
                    except Exception as e:
                        raise RuntimeError(
                            "Runtime Error processing split operation on Dataframe."
                            + str(e))

                    return df
示例#5
0
 def drop_col(df, columns=None):
     """Drop the selected columns from ``df`` in place and return ``df``.

     ``columns`` is resolved against ``df.columns`` with ``get_cols``.
     Raises ``ValueError`` when the selector resolves to ``None`` and
     ``RuntimeError`` when the pandas drop fails.
     """
     targets = get_cols(columns, df.columns)
     if targets is None:
         complaint = "Bad parameter 'columns' in drop_col " + str(columns)
         raise ValueError(complaint)
     try:
         df.drop(labels=targets, axis=1, inplace=True)
     except Exception as err:
         prefix = "Runtime Error processing drop_col operation on Dataframe."
         raise RuntimeError(prefix + str(err))
     return df
示例#6
0
 def _join(df, columns=None, new_column=None, separator=','):
     """Concatenate the selected columns row by row into one column.

     ``columns`` is resolved against ``df.columns`` with ``get_cols``.  The
     result is stored under ``new_column``; when that is ``None`` the name
     is the resolved column names joined with ``separator``.  Returns
     ``df``.  Raises ``ValueError`` for an unresolvable selector and
     ``RuntimeError`` when the row-wise join fails.
     """
     selected = get_cols(columns, df.columns)
     if selected is None:
         complaint = "Bad parameter 'columns' in join " + str(columns)
         raise ValueError(complaint)
     target = separator.join(selected) if new_column is None else new_column
     try:
         joined = df[selected].apply(separator.join, axis=1)
         df[target] = joined
     except Exception as err:
         prefix = "Runtime Error processing join operation on Dataframe."
         raise RuntimeError(prefix + str(err))
     return df
示例#7
0
 def group_by(df, columns=None, function='sum'):
     """Group ``df`` by the selected columns and aggregate each group.

     Args:
         df: DataFrame to aggregate.
         columns: Selector for the grouping columns, resolved against
             ``df.columns`` with ``get_cols``.
         function: Aggregation handed to ``GroupBy.agg``.  The default
             ``'sum'`` matches the sum that was previously applied
             regardless of this argument.

     Returns:
         A new DataFrame with the grouping keys kept as ordinary columns
         (``as_index=False``).

     Raises:
         ValueError: if ``columns`` cannot be resolved.
         RuntimeError: if the pandas group-by or aggregation fails.
     """
     cols = get_cols(columns, df.columns)
     if cols is None:
         raise ValueError(
             "Bad parameter 'columns' in group_by " +
             str(columns))
     try:
         # `axis=0` is the default and `squeeze` no longer exists in current
         # pandas, so neither is passed.
         df = df.groupby(cols, as_index=False).agg(function)
     except Exception as e:
         # str(e) is required: concatenating the exception object itself
         # raises TypeError and hides the real failure.
         raise RuntimeError(
             "Runtime Error processing group_by operation on Dataframe."
             + str(e))
     return df
示例#8
0
 def melt(df, **kwargs):
     """Forward to ``pd.melt``, resolving ``id_vars`` through ``get_cols``.

     Any ``id_vars`` keyword is replaced by what ``get_cols`` returns for
     it against ``df.columns``; every other keyword is passed on untouched.
     """
     options = dict(kwargs)
     requested = options.get('id_vars')
     if 'id_vars' in options:
         options['id_vars'] = get_cols(requested, df.columns)
     return pd.melt(df, **options)