def metadata(self):
    """return a DataFrame holding only the metadata columns"""
    # column labels are looked up with this object's own
    # metadata_string / prefix settings
    selected = utils.get_metadata(self, self.metadata_string, self.prefix)
    subset = self[selected]
    # carry the same settings over to the returned object
    return DataFrame(
        subset,
        metadata_string=self.metadata_string,
        prefix=self.prefix,
    )
def test_get_metadata_simple():
    """get_metadata with default arguments should return only 'Metadata_A'"""
    frame = pd.DataFrame(
        {
            "colA": [1, 2, 3, 4],
            "colB": [4, 3, 2, 1],
            "Metadata_A": [1, 2, 3, 4],
        }
    )
    found = utils.get_metadata(frame)
    assert found == ["Metadata_A"]
def test_get_metadata_different_case():
    """a lower-case metadata_string should only pick up the lower-case column"""
    frame = pd.DataFrame(
        {
            "colA": [1, 2, 3, 4],
            "colB": [4, 3, 2, 1],
            "metadata_A": [1, 2, 3, 4],
            "Metadata_A": [4, 3, 5, 1],
        }
    )
    found = utils.get_metadata(frame, metadata_string="metadata")
    # "Metadata_A" differs only by case and must not be returned
    assert found == ["metadata_A"]
def test_get_metadata_middle_prefix():
    """a column merely containing 'Metadata' later in its name is not returned"""
    frame = pd.DataFrame(
        {
            "colA": [1, 2, 3, 4],
            "colB": [4, 3, 2, 1],
            "something_Metadata": [1, 2, 3, 4],
            "Metadata_A": [4, 3, 5, 1],
        }
    )
    found = utils.get_metadata(frame)
    # "something_Metadata" must be excluded from the result
    assert found == ["Metadata_A"]
def aggregate(data, on, method="median", metadata_string="Metadata_", prefix=True):
    """
    Aggregate dataset

    Parameters
    -----------
    data : pandas DataFrame
        DataFrame
    on : string or list of strings
        column(s) with which to group by and aggregate the dataset.
    method : string (default="median")
        method to average each group. options = "median" or "mean"
    metadata_string : string (default="Metadata_")
        passed to utils.get_metadata to identify the metadata columns
    prefix : bool (default=True)
        passed to utils.get_metadata together with metadata_string

    Returns
    -------
    agg_df : pandas DataFrame
        aggregated dataframe, with a row per value of 'on'

    Raises
    ------
    ValueError
        if method is neither "mean" nor "median"
    """
    _check_inputs(data, on, method)
    _check_featuredata(data, on, metadata_string, prefix)
    # fail clearly here instead of hitting an unbound `agg` further down
    if method not in ("mean", "median"):
        raise ValueError(
            "method must be 'mean' or 'median', got {!r}".format(method)
        )
    # keep track of original column order
    df_cols = data.columns.tolist()
    # the string alias selects pandas' own groupby reduction, which is what
    # np.mean / np.median are currently mapped to; passing the numpy
    # callables is deprecated in recent pandas releases
    agg = data.groupby(on, as_index=False).aggregate(method)
    df_metadata = data[utils.get_metadata(data, metadata_string, prefix)].copy()
    # add indexing column to metadata if not already present
    df_metadata[on] = data[on]
    # drop metadata to the same level as aggregated data
    # (one row per value of 'on', keeping the first row seen)
    df_metadata = df_metadata.drop_duplicates(subset=on)
    # merge aggregated values with the metadata; for column names present in
    # both frames the aggregated copy is renamed with the "remove_me" suffix
    # and the metadata copy keeps the original name
    merged_df = pd.merge(
        agg, df_metadata, on=on, how="outer", suffixes=("remove_me", "")
    )
    # re-arrange so columns are in original order; this also discards any
    # "remove_me" columns created by the merge
    merged_df = merged_df[df_cols]
    assert len(merged_df.columns) == len(data.columns)
    return merged_df
def scale_features(data, metadata_string="Metadata_", prefix=True):
    """
    apply a z-score to every feature column, leaving metadata columns as-is

    Parameters
    ----------
    data : pandas DataFrame
        DataFrame
    metadata_string : string (default="Metadata_")
        passed to utils.get_featuredata and utils.get_metadata
    prefix : bool (default=True)
        passed to utils.get_featuredata and utils.get_metadata

    Returns
    -------
    scaled : pandas DataFrame
        dataframe of same dimensions as data, with scaled feature values
    """
    original_order = data.columns.tolist()
    feature_cols = utils.get_featuredata(data, metadata_string, prefix)
    features = data[feature_cols]
    metadata_cols = utils.get_metadata(data, metadata_string, prefix)
    untouched = data[metadata_cols]
    # z_score is applied column by column to the feature data only
    scaled_features = features.apply(z_score)
    recombined = pd.concat([scaled_features, untouched], axis=1)
    # return columns to original order
    return recombined[original_order]
def metacols(self):
    """return the metadata column names found by utils.get_metadata"""
    column_names = utils.get_metadata(
        self,
        self.metadata_string,
        self.prefix,
    )
    return column_names