def pivot(df, columns, values, index=None):
    """Pivot *df* into a wide table via ``pd.pivot_table``.

    Parameters
    ----------
    df : pandas.DataFrame to pivot.
    columns : column spec whose values become the new columns.
    values : column spec providing the cell values.
    index : column spec used as the pivot index; an unresolvable spec
        (including the ``None`` default) raises ``ValueError``.

    Returns the pivoted DataFrame with its index reset to plain columns.

    Raises
    ------
    ValueError : when any column spec cannot be resolved by ``get_cols``.
    RuntimeError : when the underlying pandas pivot fails.
    """
    idx = get_cols(index, df.columns)
    if idx is None:
        raise ValueError("No 'index' in pivot " + str(index))
    cols = get_cols(columns, df.columns)
    if cols is None:
        raise ValueError("Bad parameter 'columns' in pivot " + str(columns))
    vals = get_cols(values, df.columns)
    if vals is None:
        raise ValueError("Bad parameter 'values' in pivot " + str(values))
    try:
        df = pd.pivot_table(df, values=vals, index=idx, columns=cols)
        # Turn the pivot index back into ordinary columns.
        df.reset_index(inplace=True)
    except Exception as e:
        # BUG FIX: the original concatenated the exception object itself
        # ("..." + e), which raises TypeError and masks the real error.
        raise RuntimeError(
            "Runtime Error processing pivot operation on Dataframe."
            + str(e)) from e
    return df
def _process(self, _data):
    """Convert a (meta, DataFrame) pair into InfluxDB line protocol.

    ``_data`` is unpacked as ``meta, df``.  Field and tag columns are
    resolved from the configured specs, a ``ts`` timestamp column is
    derived according to ``self.__ts['method']``, and the frame is
    serialized with InfluxDB's ``DataFrameClient`` and printed.

    Yields ``None`` once (generator-style stage; nothing is passed on).

    Raises a bare ``ValueError`` when the 'fields' spec or the
    timestamp column spec cannot be resolved.
    """
    logging.info('Processing data at ' + self.__class__.__name__)
    meta, df = _data
    # process fields
    field_columns = get_cols(self.__fields, df.columns)
    if field_columns is None:
        logging.error("Bad parameter 'fields'" + str(self.__fields))
        raise ValueError
    # process tags (optional: a None result is passed straight through)
    tag_columns = get_cols(self.__tags, df.columns)
    # process timestamp — source chosen by the configured method constant
    if self.__ts['method'] == TS_FILE_MODIFIED and 'ts' in meta:
        # use the file's modification time carried in the metadata
        df["ts"] = meta['ts']
    elif self.__ts['method'] == TS_FILE_NAME and 'name' in meta:
        # extract date time from filename
        # NOTE(review): this calls .search() on self.__ts['method'],
        # which the branch just compared to the TS_FILE_NAME constant —
        # it looks like a compiled regex from another key (e.g.
        # self.__ts['pattern']) was intended; confirm before relying on
        # this branch.
        s = self.__ts['method'].search(meta['name']).group(1)
        try:
            df["ts"] = datetime.strptime(s, self.__ts['format'])
        except ValueError as e:
            # parse failure is logged but not fatal; 'ts' stays unset here
            logging.error(e)
    elif self.__ts['method'] == TS_COLUMN:
        # timestamp lives in an existing column of the frame
        ts_column = get_cols(self.__ts['column'], df.columns)
        if ts_column is None:
            # NOTE(review): message prints self.__tags, but the failed
            # lookup was self.__ts['column'] — likely a copy/paste slip.
            logging.error("Bad parameter ts.column" + str(self.__tags))
            raise ValueError
        df.rename(columns={ts_column: 'ts'}, inplace=True)
        if self.__ts['format'] in ['D', 's', 'ms', 'us', 'ns']:
            # numeric epoch values: interpret with the given unit
            df['ts'] = pd.to_datetime(df['ts'], unit=self.__ts['format'])
        else:
            # otherwise treat 'format' as a strptime-style pattern
            df['ts'] = pd.to_datetime(df['ts'], format=self.__ts['format'])
    else:
        logging.error(
            "Couldn't extract timestamp of measurement, defaulting to now")
        df['ts'] = datetime.now()
    # line protocol requires the timestamp as the frame's index
    df.set_index('ts', inplace=True)
    # NOTE(review): relies on a private influxdb-client method; verify it
    # still exists when upgrading the influxdb package.
    lines = DataFrameClient()._convert_dataframe_to_lines(
        df, self.__measurement, field_columns, tag_columns,
        numeric_precision=6)
    # print all lines
    print("\n".join(lines))
    logging.info("Exported %d records." % len(lines))
    yield None
def replace(df, column=None, new_column=None, default=None, regex=False,
            remap=None):
    """Remap values of *column* using the *remap* dict.

    When *new_column* is given, remapped values are written to it and
    entries the mapping left unchanged are overwritten with *default*;
    otherwise the source column is replaced in place.

    Parameters
    ----------
    df : pandas.DataFrame, mutated in place.
    column : spec of the source column, resolved via ``get_cols``.
    new_column : optional name of the output column.
    default : value stored in *new_column* for unchanged entries
        (only when *new_column* is given).
    regex : forwarded to ``DataFrame.replace``.
    remap : mapping old value -> new value (``None`` means no mapping).

    Returns the DataFrame; on a bad *column* spec the error is logged
    and *df* is returned untouched.

    Raises RuntimeError when the pandas replace itself fails.
    """
    # BUG FIX: the original declared a mutable default argument
    # (remap={}); use the None sentinel and normalize here instead.
    if remap is None:
        remap = {}
    col = get_cols(column, df.columns)
    if col is None:
        logging.error(
            "Bad or missing parameter 'column' = %s in map " % str(column))
        return df
    inplace = new_column is None
    try:
        if not inplace:
            # copy the source so the original column keeps its values
            df[new_column] = df[col]
        else:
            new_column = col
        df.replace({new_column: remap}, regex=regex, inplace=True)
        if not inplace:
            # unchanged values will be replaced by NaN
            df.loc[df[col] == df[new_column], new_column] = default
    except Exception as e:
        raise RuntimeError(
            "Runtime Error processing replace operation on Dataframe."
            + str(e)) from e
    return df
def split(df, col, new_cols, sep=',', collapse=None, replace=True):
    """Split string column *col* on *sep* into the columns *new_cols*.

    If the split produces more pieces than expected, rows with surplus
    pieces are "collapsed": the pieces from index *collapse* onward are
    re-joined with *sep* so the result fits into ``len(new_cols)``
    columns.  If it produces fewer, *new_cols* is truncated to match.

    Parameters
    ----------
    df : pandas.DataFrame, mutated in place.
    col : spec of the column to split, resolved via ``get_cols``.
    new_cols : list of names for the resulting columns.
    collapse : 1-based position at which surplus pieces are re-joined;
        ``None`` means over-long rows abort the operation.
    replace : when True the original column is dropped afterwards.

    Returns the DataFrame, or ``None`` when the format is over-long and
    no *collapse* column is configured.

    Raises ValueError for a bad *col* spec and RuntimeError for any
    failure inside the split itself.
    """
    col = get_cols(col, df.columns)
    if col is None:
        raise ValueError("Bad parameter 'col' in split " + str(col))
    try:
        n_new_cols = len(new_cols)
        # expand=True yields one column per split piece
        tmp = df[col].copy().str.split(sep, expand=True)
        n_split_cols = len(tmp.columns)
        # if resulting columns is different
        if n_split_cols <= n_new_cols:
            # fewer (or exactly as many) pieces than expected: keep only
            # the names we can actually fill
            logging.warning(
                "Unexpected format, found only %d/%d columns. "
                "Will adjust to lowest" % (n_split_cols, n_new_cols))
            new_cols = new_cols[:min(n_split_cols, n_new_cols)]
            df[new_cols] = tmp
        else:
            if collapse is None:
                logging.warning(
                    "Unexpected format, found %d columns expected %d."
                    "Abandoning .. please configure a collapse column"
                    % (n_split_cols, n_new_cols))
                return None
            # re-collapse all columns starting from collapse to before last
            # n_cols = how many non-null pieces each row produced
            tmp['n_cols'] = tmp.notnull().sum(axis=1)
            cols = tmp.columns.tolist()
            for n in range(n_new_cols, n_split_cols):
                # rows with exactly n+1 pieces: re-join the surplus slice
                # back into the collapse column, then shift the tail piece
                tmp.loc[tmp.n_cols == n + 1, cols[collapse - 1]] = \
                    tmp[tmp.n_cols == n + 1][cols[collapse - 1:n]].apply(
                        sep.join, axis=1)
                tmp.loc[tmp.n_cols == n + 1, cols[collapse]] = tmp[cols[n]]
            # remove trailing columns
            tmp.drop(tmp.columns[n_new_cols:], axis=1, inplace=True)
            df[new_cols] = tmp
        if replace:
            df.drop(col, axis=1, inplace=True)
    except Exception as e:
        raise RuntimeError(
            "Runtime Error processing split operation on Dataframe."
            + str(e))
        # NOTE(review): unreachable — the raise above always exits first.
        return None
    return df
def drop_col(df, columns=None):
    """Remove the resolved *columns* from *df* in place.

    The spec is resolved through ``get_cols``; an unresolvable spec
    raises ``ValueError``, and any failure of the drop itself is
    re-raised as ``RuntimeError``.  Returns the mutated DataFrame.
    """
    resolved = get_cols(columns, df.columns)
    if resolved is None:
        raise ValueError(
            "Bad parameter 'columns' in drop_col " + str(columns))
    try:
        df.drop(resolved, axis=1, inplace=True)
    except Exception as err:
        raise RuntimeError(
            "Runtime Error processing drop_col operation on Dataframe."
            + str(err))
    return df
def _join(df, columns=None, new_column=None, separator=','):
    """Concatenate several string columns of *df* into one column.

    Source columns are resolved via ``get_cols``.  When *new_column*
    is omitted, the joined source-column names (separated by
    *separator*) become the output column name.  Returns the mutated
    DataFrame; raises ``ValueError`` for a bad spec and
    ``RuntimeError`` when the row-wise join fails.
    """
    source_cols = get_cols(columns, df.columns)
    if source_cols is None:
        raise ValueError("Bad parameter 'columns' in join " + str(columns))
    target = (separator.join(source_cols)
              if new_column is None else new_column)
    try:
        df[target] = df[source_cols].apply(separator.join, axis=1)
    except Exception as err:
        raise RuntimeError(
            "Runtime Error processing join operation on Dataframe."
            + str(err))
    return df
def group_by(df, columns=None, function='sum'):
    """Group *df* by the resolved *columns* and aggregate the remaining
    columns with *function* (any aggregation name pandas understands,
    e.g. 'sum', 'mean', 'max').

    Raises ValueError when the column spec cannot be resolved and
    RuntimeError when the pandas group-by fails.  Returns the
    aggregated DataFrame with the group keys as ordinary columns.
    """
    cols = get_cols(columns, df.columns)
    if cols is None:
        raise ValueError(
            "Bad parameter 'columns' in group_by " + str(columns))
    try:
        # BUG FIX: 'function' was accepted but ignored — the code always
        # called .sum(). Aggregate with the requested function instead.
        # Also dropped axis=0 (the default) and squeeze=True (deprecated
        # in pandas 1.1, removed in 2.0).
        df = df.groupby(cols, as_index=False).agg(function)
    except Exception as e:
        # BUG FIX: "..." + e concatenated str with the exception object,
        # raising TypeError and hiding the real error.
        raise RuntimeError(
            "Runtime Error processing group_by operation on Dataframe."
            + str(e)) from e
    return df
def melt(df, **kwargs):
    """Unpivot *df* via ``pd.melt``, resolving ``id_vars`` specs first.

    Any keyword accepted by ``pd.melt`` is passed through; when
    ``id_vars`` is present it is resolved with ``get_cols`` against the
    frame's columns.

    Raises ValueError when a supplied ``id_vars`` spec cannot be
    resolved (consistent with the other operations in this module,
    which reject bad column specs instead of silently ignoring them).
    """
    if 'id_vars' in kwargs:
        id_vars = get_cols(kwargs['id_vars'], df.columns)
        # BUG FIX: an unresolvable spec used to be forwarded as None,
        # silently melting every column; fail loudly like the siblings.
        if id_vars is None:
            raise ValueError(
                "Bad parameter 'id_vars' in melt " + str(kwargs['id_vars']))
        kwargs['id_vars'] = id_vars
    return pd.melt(df, **kwargs)