def relocate(data, cols, before=None, after=None):
    """Use relocate() to change column positions

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
        Columns to move
    before: str, default=None
        Column to place the moved columns in front of. If both before and
        after are None, the columns are moved to the very left.
    after: str, default=None
        Column to place the moved columns behind. Only one of before or after
        may be specified.

    Returns
    -------
    Our dataframe, but with the columns repositioned
    """
    is_pandas = _check_df_type(data, 'relocate')
    if isinstance(cols, str):
        cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        cols = _get_list_columns(data, cols, is_pandas)
    if is_pandas:
        # Work on a copy so we don't mutate the caller's dataframe in place
        data = data.copy()
        if before is None and after is None:
            move_cols = [cols] if isinstance(cols, str) else list(cols)
            # A list comprehension rather than Index.difference, which would
            # sort the remaining columns instead of preserving their order
            new_cols = move_cols + [c for c in data.columns if c not in move_cols]
            return_data = data[new_cols]
        elif isinstance(before, str):
            if isinstance(after, str):
                raise ValueError("Only one of before or after can be specified")
            if isinstance(cols, str):
                cols_to_move = data.pop(cols).values
                data.insert(data.columns.get_loc(before), cols, cols_to_move)
            else:
                for col in cols:
                    # After the pop, get_loc(before) is the slot directly in
                    # front of before, so each column lands there in turn
                    cols_to_move = data.pop(col).values
                    data.insert(data.columns.get_loc(before), col, cols_to_move)
            return_data = data
        elif isinstance(after, str):
            if isinstance(cols, str):
                cols_to_move = data.pop(cols).values
                data.insert(data.columns.get_loc(after) + 1, cols, cols_to_move)
            else:
                for i, col in enumerate(cols):
                    # The (i + 1) offset keeps the moved columns in their
                    # original order behind after
                    cols_to_move = data.pop(col).values
                    data.insert(data.columns.get_loc(after) + (i + 1), col, cols_to_move)
            return_data = data
        else:
            raise TypeError(
                "One of before or after must be in string format, or both set to None"
            )
        return return_data
    else:
        ...
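# Example usage for relocate() (an illustrative sketch, assuming a plain
# pandas DataFrame and that the private column helpers pass single column
# names through as strings):
#
#     df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
#     relocate(df, 'c')                # columns become c, a, b
#     relocate(df, 'c', before='b')    # columns become a, c, b
#     relocate(df, 'a', after='c')     # columns become b, c, a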
def add_count(data, cols=None, wt=None, sort=False, name=None):
    """Count observations by group and add the result as a new column

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list, default is None
        Optional variables to group by when counting. If None, will use all
        variables.
    wt: str, default is None
        Frequency weights: a single variable whose values are summed within
        each group. If None, rows are counted instead.
    sort: bool, default is False
        If True, will show the largest groups at the top.
    name: str, default is None
        The name of the new column in the output. If omitted, it will default
        to n.

    Returns
    -------
    Our dataframe, but with an additional column containing the count or sum
    """
    is_pandas = _check_df_type(data, "add_count")
    if isinstance(cols, str):
        distinct_cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        distinct_cols = _get_list_columns(data, cols, is_pandas=is_pandas)
    else:
        if is_pandas:
            distinct_cols = data.columns
        else:
            distinct_cols = data.schema.names
    if name is None:
        name = 'n'
    if is_pandas:
        if wt is None:
            # While it would be nice to simply have
            # data[name] = data.groupby(cols)[cols].transform('count'), that
            # only works when cols is a single column. Otherwise we would be
            # trying to set one column from 2+ columns, which throws an error.
            # So we build the Series/DataFrame first and check its type so we
            # only ever assign a single column.
            groupby_data = data.groupby(distinct_cols)[distinct_cols].transform('count')
            if isinstance(groupby_data, pd.Series):
                data[name] = groupby_data
            else:
                data[name] = groupby_data.iloc[:, 0]
        else:
            # wt is a single column, so we can create the new column directly
            # without worrying about setting a column from several.
            data[name] = data.groupby(distinct_cols)[wt].transform('sum')
        if sort:
            data = data.sort_values(by=name, ascending=False)
    else:
        ...
    return data
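# Example usage for add_count() (an illustrative sketch, assuming a pandas
# DataFrame; the pyspark branch above is not yet implemented):
#
#     df = pd.DataFrame({'g': ['x', 'x', 'y'], 'v': [10, 20, 30]})
#     add_count(df, 'g')
#     #    g   v  n
#     # 0  x  10  2
#     # 1  x  20  2
#     # 2  y  30  1
#     add_count(df, 'g', wt='v', name='total')   # sums v within each group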
def arrange(data, cols):
    """Arrange rows by column values

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
        The columns we are sorting the dataframe on

    Returns
    -------
    Our sorted dataframe

    For example, suppose we had a dataframe like

    a b
    1 2
    3 4
    5 6

    Then running arrange(df, 'desc(a)') will return

    a b
    5 6
    3 4
    1 2
    """
    is_pandas = _check_df_type(data, "arrange")
    if isinstance(cols, str):
        columns = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        columns = _get_list_columns(data, cols, is_pandas)
    else:
        raise TypeError("Cannot determine method for determining column types")
    sorting_cols = []
    ascending_cols = []
    for c in columns:
        # Escape the parenthesis so we match a literal desc( rather than an
        # empty group
        if re.search(r'desc\(', c):
            # Strip the desc() wrapper and any whitespace so we are left with
            # just the column name
            sorting_cols.append(re.sub(r'desc|\(|\)|\s+', r'', c))
            ascending_cols.append(False)
        else:
            sorting_cols.append(c)
            ascending_cols.append(True)
    if is_pandas:
        # R's implementation resets the index, so we do the same, dropping the
        # old index rather than keeping it as a column
        return data.sort_values(sorting_cols, ascending=ascending_cols).reset_index(drop=True)
    else:
        return data.orderBy(sorting_cols, ascending=ascending_cols)
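# Example usage for arrange() (an illustrative sketch, assuming a pandas
# DataFrame and that the private column helpers pass desc(...) expressions
# through unchanged):
#
#     df = pd.DataFrame({'a': [1, 5, 3], 'b': [2, 6, 4]})
#     arrange(df, 'desc(a)')            # rows ordered a = 5, 3, 1
#     arrange(df, ['b', 'desc(a)'])     # ascending b, ties broken by descending a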
def distinct(data, cols=None, keep_all=False):
    """Subset distinct/unique rows

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list, default is None
        Optional variables to use when determining uniqueness. If there are
        multiple rows for a given combination of inputs, only the first row
        will be preserved. If None, will use all variables.
    keep_all: bool, default is False
        If True, keep all variables in data. If a combination of cols is not
        distinct, this keeps the first row of values.

    Returns
    -------
    Our dataframe, but with only the unique rows specified
    """
    is_pandas = _check_df_type(data, "distinct")
    if isinstance(cols, str):
        distinct_cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        distinct_cols = _get_list_columns(data, cols, is_pandas=is_pandas)
    else:
        if is_pandas:
            distinct_cols = data.columns
        else:
            distinct_cols = data.schema.names
    if is_pandas:
        if keep_all:
            # Find the distinct rows, then merge the remaining columns back on
            # the index so that all available columns are retained. We then
            # reset the index, as that is what R's implementation does.
            dropped_data = data[distinct_cols].drop_duplicates(keep='first')
            dropped_data = pd.merge(dropped_data, data.drop(distinct_cols, axis=1),
                                    left_index=True, right_index=True, how='left')
            return dropped_data.reset_index(drop=True)
        else:
            return data[distinct_cols].drop_duplicates(keep='first', ignore_index=True)
    else:
        ...
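# Example usage for distinct() (an illustrative sketch, assuming a pandas
# DataFrame):
#
#     df = pd.DataFrame({'g': ['x', 'x', 'y'], 'v': [10, 20, 30]})
#     distinct(df, 'g')                  # one row per unique g: x, y
#     distinct(df, 'g', keep_all=True)   # also keeps v, taking the first row per group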
def select(data, cols):
    """Select variables in a dataframe

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list
        Columns to select

    Returns
    -------
    Our dataframe, but with only the selected columns
    """
    is_pandas = _check_df_type(data, "select")
    if isinstance(cols, str):
        cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        cols = _get_list_columns(data, cols, is_pandas)
    if is_pandas:
        return data.loc[:, cols]
    else:
        return data.select(*cols)
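# Example usage for select() (an illustrative sketch, assuming a pandas
# DataFrame):
#
#     df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
#     select(df, ['a', 'c'])             # keeps only columns a and c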
def count(data, cols=None, wt=None, sort=False, name=None, drop=True):
    """Count observations by group

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        A dataframe
    cols: str or list, default is None
        Optional variables to group by when counting. If None, will use all
        variables.
    wt: str or list, default is None
        Frequency weights. Can be a variable (or combination of variables) or
        None. wt is computed once for each unique combination of the counted
        variables.
    sort: bool, default is False
        If True, will show the largest groups at the top.
    name: str, default is None
        The name of the new column in the output. If omitted, it will default
        to n.
    drop: bool, default is True
        If False, will include counts for empty groups (i.e. for levels of
        factors that don't exist in the data)

    Returns
    -------
    A dataframe with one row per group and a column containing our counts or
    sums
    """
    is_pandas = _check_df_type(data, "count")
    if isinstance(cols, str):
        distinct_cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        distinct_cols = _get_list_columns(data, cols, is_pandas=is_pandas)
    else:
        if is_pandas:
            distinct_cols = data.columns
        else:
            distinct_cols = data.schema.names
    if name is None:
        name = 'n'
    if is_pandas:
        # We treat our groupby columns as factors/categorical data, so we
        # convert them here; working on a copy avoids changing the dtypes of
        # the caller's dataframe. The conversion is also what gives the drop
        # argument a purpose, as empty categories only appear in the groupby
        # result when the columns are categorical.
        data = data.copy()
        data[distinct_cols] = data[distinct_cols].astype('category')
        if wt is None:
            count_df = data.groupby(distinct_cols).count()
        else:
            count_df = data.groupby(distinct_cols)[wt].sum()
        if isinstance(count_df, pd.DataFrame):
            # We don't need multiple columns containing the same counts, so we
            # keep the first one and rename it
            count_df = count_df.rename({count_df.columns[0]: name}, axis=1)
            count_df = count_df.iloc[:, 0]
        else:
            # We have one column/Series, so all we need to do is rename it
            count_df = count_df.rename(name)
        if drop:
            # Drop any group that contains NaN (i.e., categories that don't
            # have any observations)
            count_df = count_df.dropna()
        else:
            # Replace the NaNs (i.e., categories that don't have any
            # observations) with 0
            count_df = count_df.fillna(0)
        # Since count_df is a Series at this point, sort_values needs no axis
        # or column names
        if sort:
            count_df = count_df.sort_values(ascending=False)
        count_df = count_df.reset_index()
    else:
        ...
    return count_df
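# Example usage for count() (an illustrative sketch, assuming a pandas
# DataFrame):
#
#     df = pd.DataFrame({'g': ['x', 'x', 'y'], 'v': [10, 20, 30]})
#     count(df, 'g')
#     #    g  n
#     # 0  x  2
#     # 1  y  1
#     count(df, 'g', wt='v', sort=True)  # sums v per group, largest first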
def filter(data, cols):
    """Filters data based on arguments from cols

    Parameters
    ----------
    data: pandas or pyspark DataFrame
        The dataframe we are filtering
    cols: str or list
        The filter conditions we are applying to our dataframe

    Returns
    -------
    filtered_data: pandas DataFrame
        The dataframe, after we've applied all filtering conditions

    For example, suppose we had a dataframe like

    a b
    1 2
    3 4
    5 6

    Then running filter(df, "a >= median(a)") will return

    a b
    3 4
    5 6
    """
    is_pandas = _check_df_type(data, "filter")
    if isinstance(cols, str):
        cols = _get_str_columns(data, cols, is_pandas=is_pandas)
    elif isinstance(cols, list):
        cols = _get_list_columns(data, cols, is_pandas)
    query_result = []
    for c in cols:
        # mean/median/min/max all follow the same pattern, so we handle them
        # in a single branch rather than four copies of the same logic
        agg = next((a for a in ('mean', 'median', 'min', 'max')
                    if '{}('.format(a) in c.casefold()), None)
        if agg is not None:
            agg_col = re.search(r'(?<={}\()[a-zA-Z]+'.format(agg), c).group(0)
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            if is_pandas:
                # Resolve the aggregate against the data, e.g. data[col].mean()
                val = getattr(data[agg_col], agg)()
            else:
                ...
            result = '{} {} {}'.format(agg_col, comparison, val)
        elif "quantile(" in c.casefold():
            quantile_col = re.search(r'(?<=quantile\()[a-zA-Z]+', c).group(0)
            if re.search('probs=', c):
                quantile_percent = float(re.search(r'(?<=probs=)\s*\d*\.\d+', c).group(0))
            else:
                quantile_percent = float(re.search(r'(?<=,)\s*\d*\.\d+', c).group(0))
            if quantile_percent > 1:
                raise ValueError("Cannot have percentile greater than 1")
            comparison = re.search(r'([<>]=?|==)', c).group(0)
            if is_pandas:
                val = data[quantile_col].quantile(quantile_percent)
            else:
                ...
            result = '{} {} {}'.format(quantile_col, comparison, val)
        else:
            result = c
        query_result.append(result)
    if is_pandas:
        return data.query(' & '.join(query_result)).reset_index(drop=True)
    else:
        return data.filter(' and '.join(query_result))
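# Example usage for filter() (an illustrative sketch, assuming a pandas
# DataFrame; conditions referencing mean/median/min/max/quantile are resolved
# against the data before being handed to DataFrame.query):
#
#     df = pd.DataFrame({'a': [1, 3, 5], 'b': [2, 4, 6]})
#     filter(df, 'a >= median(a)')                # rows where a >= 3
#     filter(df, ['a > min(a)', 'b < max(b)'])    # both conditions must hold
#     filter(df, 'a <= quantile(a, 0.5)')         # rows at or below the median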