def main(self, features: List[pd.DataFrame],
         categories: List[pd.DataFrame],
         whiten: bool,
         id_filter: List[T],
         subsets: List[List[T]]) -> dict:
    """Perform a PCA on the given features and return the projected data.

    :param features: Numerical features in long format
        ('id' / 'feature' / 'value' columns), one DataFrame each.
    :param categories: List of DataFrames that categorise the data points.
    :param whiten: Passed to the PCA to decorrelate component variances.
    :param id_filter: If specified use only given ids during the analysis.
    :param subsets: List of lists of subset ids.
    :return: Projected data, component loadings and variance ratios.
    """
    if not features:
        raise ValueError('Must specify at least one numerical feature.')
    # pd.concat is the supported way to merge the inputs;
    # DataFrame.append is deprecated and removed in pandas 2.0
    df = pd.concat(features)
    # apply id filter
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    if not subsets:
        # empty subsets equals all samples in one subset
        subsets = [df['id'].unique().tolist()]
    # make a samples x features matrix of the data
    df = df.pivot(index='feature', columns='id', values='value')
    df = df.T
    feature_labels = list(df)
    # save ids so we can re-assign them after pca
    ids = df.index.tolist()
    # replace missing values with medians; old-sklearn Imputer with axis=0
    # imputes per column, i.e. per feature after the transpose above
    imp = Imputer(missing_values='NaN', strategy='median', axis=0)
    df = imp.fit_transform(df)
    # PCA
    pca = PCA(whiten=whiten)
    reduced_df = pca.fit_transform(df)
    # get explained variance ratios of components
    variance_ratios = pca.explained_variance_ratio_
    # get loadings (feature weight per component, sign-flipped)
    loadings = -1 * pca.components_.T * np.sqrt(pca.explained_variance_)
    loadings = pd.DataFrame(loadings)
    loadings['feature'] = feature_labels
    # re-assign ids
    reduced_df = pd.DataFrame(reduced_df)
    reduced_df['id'] = ids
    # add category and subset column
    reduced_df = utils.apply_subsets(df=reduced_df, subsets=subsets)
    reduced_df = utils.apply_categories(df=reduced_df, categories=categories)
    return {
        'data': reduced_df.to_dict(orient='list'),
        'loadings': loadings.to_dict(orient='list'),
        'variance_ratios': variance_ratios.tolist()
    }
def test_apply_categorys(self):
    """apply_categories merges overlapping categories and NaN-fills the rest."""
    base = pd.DataFrame(
        [[101, 'foo', 1], [102, 'foo', 2], [103, 'foo', 3]],
        columns=['id', 'feature', 'value'])
    first = pd.DataFrame(
        [[101, 'c1', 'a'], [102, 'c1', 'b'], [105, 'c1', 'c']],
        columns=['id', 'feature', 'value'])
    second = pd.DataFrame([[106, 'c2', 'd']],
                          columns=['id', 'feature', 'value'])
    third = pd.DataFrame([[102, 'c3', 'f']],
                         columns=['id', 'feature', 'value'])
    result = utils.apply_categories(
        df=base, categories=[first, second, third])
    observed = result['category'].tolist()
    # id 102 occurs in two category frames -> values joined with ' AND ';
    # id 103 occurs in none -> NaN
    assert observed[:2] == ['a', 'b AND f']
    assert np.isnan(observed[2])
def main(self, x: pd.DataFrame, y: pd.DataFrame, id_filter: List[str],
         method: str, subsets: List[List[str]],
         categories: List[pd.DataFrame]) -> dict:
    """Compute correlation statistics for the given parameters.

    :param x: DataFrame containing x axis values.
    :param y: DataFrame containing y axis values.
    :param id_filter: If specified use only given ids during the analysis.
    :param method: pearson, spearman or kendall.
    :param subsets: List of lists of subset ids.
    :param categories: List of DataFrames that categorise the data points.
    :return: corr. coef., p-value and other useful values.
    """
    # each side of the correlation must consist of exactly one feature
    x_is_single = len(x['feature'].unique().tolist()) == 1
    y_is_single = len(y['feature'].unique().tolist()) == 1
    if not (x_is_single and y_is_single):
        error = "Input is invalid. Please make sure that the two " \
                "variables to compare have exactly one dimension, each."
        logger.error(error)
        raise ValueError(error)
    if method not in ('pearson', 'spearman', 'kendall'):
        raise ValueError("Unknown method '{}'".format(method))
    df = self.merge_x_y(x, y)
    x_label = list(df['feature_x'])[0]
    y_label = list(df['feature_y'])[0]
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    df = utils.apply_subsets(df=df, subsets=subsets)
    df = utils.apply_categories(df=df, categories=categories)
    # global stats form the base of the payload; enrich it with metadata
    output = self.compute_stats(df, method)
    output['method'] = method
    output['data'] = df.to_json(orient='records')
    output['x_label'] = x_label
    output['y_label'] = y_label
    return output
def main(self, bw_factor: float, num_bins: int, id_filter: List[str],
         subsets: List[List[str]], data: pd.DataFrame,
         categories: List[pd.DataFrame]) -> dict:
    """Compute several basic statistics such as bin size and kde.

    :param bw_factor: KDE resolution.
    :param num_bins: Number of bins to use for histogram.
    :param id_filter: If specified use only given ids during the analysis.
    :param subsets: List of lists of subset ids.
    :param data: Numerical values to create histogram of.
    :param categories: The groups to split the values into.
    :return: Per category/subset group: histogram, mean, median, std and kde.
    """
    # non-inplace dropna: work on a copy so the caller's frame is untouched
    df = data.dropna()
    if df.shape[0] == 0:
        error = 'The selected numerical variable must be non-empty.'
        logger.exception(error)
        raise ValueError(error)
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    df = utils.apply_subsets(df=df, subsets=subsets)
    df = utils.apply_categories(df=df, categories=categories)

    # Scott-like bandwidth scaled by bw_factor; loop-invariant, so it is
    # defined once here instead of being re-created for every group
    def bw(obj, fac):
        return np.power(obj.n, -1.0 / (obj.d + 4)) * fac
    bw_method = partial(bw, fac=bw_factor)

    stats = {}
    categories = df['category'].unique().tolist()
    subsets = df['subset'].unique().tolist()
    for category in categories:
        for subset in subsets:
            sub_df = df[(df['category'] == category) &
                        (df['subset'] == subset)]
            values = sub_df['value']
            # too few points for histogram/kde statistics
            if values.shape[0] < 2:
                continue
            hist, bin_edges = np.histogram(values, bins=num_bins)
            kde = scipy.stats.gaussian_kde(values, bw_method=bw_method)
            xs = np.linspace(
                start=np.min(values), stop=np.max(values), num=200)
            stats.setdefault(category, {})[subset] = {
                'hist': hist.tolist(),
                'bin_edges': bin_edges.tolist(),
                'mean': np.mean(values),
                'median': np.median(values),
                'std': np.std(values),
                'dist': kde(xs).tolist()
            }
    return {
        'data': df.to_json(orient='records'),
        'stats': stats,
        'subsets': subsets,
        'categories': categories,
        'label': df['feature'].tolist()[0]
    }
def main(self, durations: List[pd.DataFrame],
         categories: List[pd.DataFrame],
         event_observed: List[pd.DataFrame],
         estimator: str,
         id_filter: List[str],
         subsets: List[List[str]]) -> dict:
    """Estimate the survival / cumulative hazard function per group.

    :param durations: Exactly one DataFrame containing the duration values.
    :param categories: List of DataFrames that categorise the data points.
    :param event_observed: At most one DataFrame marking observed events;
        durations without a matching entry are treated as censored.
    :param estimator: Either 'NelsonAalen' or 'KaplanMeier'.
    :param id_filter: If specified use only given ids during the analysis.
    :param subsets: List of lists of subset ids.
    :return: Timeline, estimate and confidence bounds per group.
    """
    if len(durations) != 1:
        error = 'Analysis requires exactly one array that specifies the ' \
                'duration length.'
        logger.exception(error)
        raise ValueError(error)
    if len(event_observed) > 1:
        error = 'Maximal one variable for "event_observed" allowed'
        logger.exception(error)
        raise ValueError(error)
    # Validate the estimator up front. Previously this was only checked
    # inside the loop, so an unknown estimator passed silently whenever no
    # category/subset combination had more than three samples.
    if estimator not in ('NelsonAalen', 'KaplanMeier'):
        error = 'Unknown estimator: {}'.format(estimator)
        logger.exception(error)
        raise ValueError(error)
    df = durations[0]
    df.dropna(inplace=True)
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    df = utils.apply_subsets(df=df, subsets=subsets)
    df = utils.apply_categories(df=df, categories=categories)
    stats = {}
    categories = df['category'].unique().tolist()
    subsets = df['subset'].unique().tolist()
    # for every category and subset combination estimate the survival fun.
    for category in categories:
        for subset in subsets:
            sub_df = df[(df['category'] == category) &
                        (df['subset'] == subset)]
            T = sub_df['value']
            E = None  # default is nothing is censored
            if len(T) <= 3:
                continue
            if event_observed:
                # find observation boolean value for every duration
                E = event_observed[0].merge(sub_df, how='right', on='id')
                E = [not x for x in pd.isnull(E['value_x'])]
                assert len(E) == len(T)
            if estimator == 'NelsonAalen':
                fitter = NelsonAalenFitter()
                fitter.fit(durations=T, event_observed=E)
                estimate = fitter.cumulative_hazard_[
                    'NA_estimate'].tolist()
                ci_lower = fitter.confidence_interval_[
                    'NA_estimate_lower_0.95'].tolist()
                ci_upper = fitter.confidence_interval_[
                    'NA_estimate_upper_0.95'].tolist()
            else:  # 'KaplanMeier' -- guaranteed by the check above
                fitter = KaplanMeierFitter()
                fitter.fit(durations=T, event_observed=E)
                # noinspection PyUnresolvedReferences
                estimate = fitter.survival_function_[
                    'KM_estimate'].tolist()
                ci_lower = fitter.confidence_interval_[
                    'KM_estimate_lower_0.95'].tolist()
                ci_upper = fitter.confidence_interval_[
                    'KM_estimate_upper_0.95'].tolist()
            timeline = fitter.timeline.tolist()
            if not stats.get(category):
                stats[category] = {}
            stats[category][subset] = {
                'timeline': timeline,
                'estimate': estimate,
                'ci_lower': ci_lower,
                'ci_upper': ci_upper
            }
    return {
        'label': df['feature'].tolist()[0],
        'categories': categories,
        'subsets': subsets,
        'stats': stats
    }
def main(self, features: List[pd.DataFrame],
         categories: List[pd.DataFrame],
         id_filter: List[T],
         transformation: str,
         subsets: List[List[T]]) -> dict:
    """Compute boxplot statistics for the given parameters.

    :param features: List of numerical features.
    :param categories: List of categorical features used to group numerical
        features.
    :param id_filter: List of ids that will be considered for analysis. If
        empty all ids will be used.
    :param transformation: Transformation that will be applied to the data.
    :param subsets: List of subsets used as another way to group the
        numerical features.
    :return: Boxplot stats (plus kde and anova) per group and the raw data.
    """
    if not len(features):
        raise ValueError("Must at least specify one "
                         "non empty numerical feature.")
    # fold the individual feature frames into one long-format frame
    df = reduce(lambda acc, nxt: acc.append(nxt), features)
    df = utils.apply_transformation(df=df, transformation=transformation)
    df.dropna(inplace=True)
    df = utils.apply_id_filter(df=df, id_filter=id_filter)
    df = utils.apply_subsets(df=df, subsets=subsets)
    df = utils.apply_categories(df=df, categories=categories)
    df['outlier'] = None
    results = {
        'statistics': {},
        'features': df['feature'].unique().tolist(),
        'categories': df['category'].unique().tolist(),
        'subsets': df['subset'].unique().tolist()
    }
    group_values = []
    for feature in results['features']:
        for subset in results['subsets']:
            for category in results['categories']:
                # one mask reused for both value extraction and write-back
                mask = ((df['subset'] == subset)
                        & (df['category'] == category)
                        & (df['feature'] == feature))
                values = df[mask]['value'].tolist()
                if len(values) < 2:
                    continue
                # FIXME: v This is ugly. Look at kaplan_meier_survival.py
                label = '{}//{}//s{}'.format(feature, category, subset + 1)
                group_values.append(values)
                stats = self.boxplot_statistics(values)
                arr = np.array(values)
                # values beyond either whisker are flagged as outliers
                outliers = (arr > stats['u_wsk']) | (arr < stats['l_wsk'])
                df.loc[mask, 'outlier'] = outliers
                kde = scipy.stats.gaussian_kde(values)
                xs = np.linspace(start=stats['l_wsk'],
                                 stop=stats['u_wsk'], num=100)
                stats['kde'] = kde(xs).tolist()
                results['statistics'][label] = stats
    results['data'] = df.to_json(orient='records')
    f_value, p_value = scipy.stats.f_oneway(*group_values)
    results['anova'] = {
        'p_value': p_value,
        'f_value': f_value
    }
    return results