def aggregate_pick_high_count(state: PipelineState) -> PipelineState:
    state.df['read_count'] = state.df.sum(axis=1)
    state.df = state.df.sort_values('read_count', ascending=False) \
        .groupby(lambda x: x) \
        .first()
    state.df = state.df.drop(columns=['read_count'])
    return state

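# Hedged usage sketch (illustrative only, never called by the pipeline): shows
# how sort_values + groupby(...).first() keeps the replicate with the most
# total reads when a sample ID appears more than once. The toy data is made up.
def _example_pick_high_count():
    import pandas as pd
    toy = pd.DataFrame({'taxonA': [5, 50], 'taxonB': [1, 10]},
                       index=['sampleX', 'sampleX'])
    toy['read_count'] = toy.sum(axis=1)
    deduped = (toy.sort_values('read_count', ascending=False)
                  .groupby(lambda x: x)
                  .first()
                  .drop(columns=['read_count']))
    return deduped  # single 'sampleX' row: taxonA=50, taxonB=10
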
def split_fixed_set(state: PipelineState, train_households, verbose=False):
    state.df['household'] = state.df.index.map(_parse_household_id)
    train_df = state.df.loc[state.df['household'].isin(train_households)]
    test_df = state.df.loc[~state.df['household'].isin(train_households)]

    train_index = train_df.index
    test_index = test_df.index
    train_meta = state.meta_df[state.meta_df.index.isin(train_index)]
    test_meta = state.meta_df[state.meta_df.index.isin(test_index)]

    if state.target is not None:
        train_target = state.target[state.target.index.isin(train_index)]
        test_target = state.target[state.target.index.isin(test_index)]
    else:
        train_target = None
        test_target = None

    state.df = state.df.drop('household', axis=1)
    train_df = train_df.drop('household', axis=1)
    test_df = test_df.drop('household', axis=1)

    if verbose:
        print(train_df)
        print(test_df)

    return TrainTest(train=PipelineState(train_df, train_meta, train_target),
                     test=PipelineState(test_df, test_meta, test_target))

def _filter_by_metadata(state: PipelineState,
                        meta_col: str,
                        meta_val: set) -> PipelineState:
    values = state.meta_df[meta_col]
    state.df = state.df.join(values)
    state.df = state.df[state.df[meta_col].isin(meta_val)]
    state.df = state.df.drop([meta_col], axis=1)
    state.meta_df = state.meta_df[state.meta_df[meta_col].isin(meta_val)]
    return _filter_to_matched_pairs(state)

def clr_wrapper(state: PipelineState):
    # clr requires strictly positive values (zeros are undefined in log-ratio
    # space), so add a 0.5 pseudocount before transforming.
    clr_data = clr(state.df.to_numpy() + .5)
    new_df = pd.DataFrame(data=clr_data,
                          index=state.df.index,
                          columns=state.df.columns)
    return state.update_df(new_df)

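# Hedged sketch (illustrative only, never called by the pipeline): the centered
# log-ratio of a row x is log(x_i) - mean(log(x)), which is what an
# skbio-style clr() computes once the pseudocount is added. The toy counts
# below are made up.
def _example_clr_by_hand():
    import numpy as np
    counts = np.array([[10., 1., 0.]]) + .5      # pseudocount as above
    logged = np.log(counts)
    manual_clr = logged - logged.mean(axis=1, keepdims=True)
    return manual_clr  # matches clr(counts) row-wise
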
def rarefy_wrapper(state: PipelineState, target_count: int) -> PipelineState:
    table = Artifact.import_data("FeatureTable[Frequency]", state.df) \
        .view(biom.Table)
    table = rarefy(table, target_count)
    df = Artifact.import_data("FeatureTable[Frequency]", table) \
        .view(pd.DataFrame)
    return state.update_df(df)

def _filter_by_shared_ids(state: PipelineState) -> PipelineState:
    df_set = set(state.df.index)
    meta_set = set(state.meta_df.index)
    valid_set = df_set.intersection(meta_set)

    return state.update(
        df=state.df[state.df.index.isin(valid_set)],
        meta_df=state.meta_df[state.meta_df.index.isin(valid_set)])

def _filter_out_sample_ids(state: PipelineState, bad_sample_ids) \
        -> PipelineState:
    if len(bad_sample_ids) == 0:
        return state

    def _filter(df):
        bad_rows = df.index.isin(set(bad_sample_ids))
        return df[~bad_rows]

    return state.update(df=_filter(state.df),
                        meta_df=_filter(state.meta_df))

def fix_input_table(df):
    df = df.set_index("sample")
    df["remainder"] = df["total_reads"] - df["target_reads"]
    df = df.drop("total_reads", axis=1)
    df.index.name = None
    df.columns.name = None

    ps = PipelineState(df, pd.DataFrame(), None)
    ps = sample_filtering.build_prefix_filter(BAD_SAMPLE_PREFIXES)(ps, "filter")
    ps = id_parsing.build()(ps, "filter")
    return ps.df

def __call__(self, state: PipelineState, mode: str):
    new_df = None
    if mode == 'train':
        new_df = self._lda.fit_transform(state.df, state.target)
    elif mode == 'test':
        new_df = self._lda.transform(state.df)

    return state.update_df(
        pd.DataFrame(
            new_df,
            columns=['LDA%i' % i for i in range(self.num_components)],
            index=state.df.index))

def _filter_out_sample_id_prefix(state: PipelineState, bad_prefixes) \
        -> PipelineState:
    if len(bad_prefixes) == 0:
        return state

    def _filter(df):
        bad_rows = df.index.str.startswith(bad_prefixes[0])
        for i in range(1, len(bad_prefixes)):
            bad_rows |= df.index.str.startswith(bad_prefixes[i])
        return df[~bad_rows]

    return state.update(df=_filter(state.df),
                        meta_df=_filter(state.meta_df))

def matched_pair_subtract(state: PipelineState,
                          meta_col_name: str,
                          one_set: set) -> PipelineState:
    (left, right) = _split_left_right(state, meta_col_name, one_set)
    left = left.sort_index()
    right = right.sort_index()
    df = right - left

    # The pairwise difference of the 0/1 targets is -1 or +1; map it back to
    # 0/1 so downstream code sees a binary label.
    # TODO FIXME HACK: Does target's type matter? Do we need to
    # turn these into ints/booleans?
    target = df['target'] / 2 + .5
    df = df.drop('target', axis=1)
    return state.update(target=target, df=df)

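# Hedged sketch (illustrative only, never called by the pipeline): the
# arithmetic used above to recode the paired target. The household labels are
# made up; the point is the -1/+1 -> 0/1 mapping.
def _example_paired_target_recode():
    import pandas as pd
    # right=1, left=0  -> diff +1 -> label 1.0
    # right=0, left=1  -> diff -1 -> label 0.0
    diff = pd.Series([1, -1], index=['householdA', 'householdB'])
    return diff / 2 + .5  # -> 1.0, 0.0
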
def matched_pair_concat(state: PipelineState,
                        meta_col_name: str,
                        one_set: set) -> PipelineState:
    (left, right) = _split_left_right(state, meta_col_name, one_set)
    left = left.rename(columns=lambda name: "L" + name)
    right = right.rename(columns=lambda name: "R" + name)
    left = left.sort_index()
    right = right.sort_index()

    df = pd.concat([left, right], axis=1)
    df = df.drop('Ltarget', axis=1)
    target = df['Rtarget']
    df = df.drop('Rtarget', axis=1)
    return state.update(target=target, df=df)

def _filter_to_matched_pairs(state: PipelineState) -> PipelineState:
    household_map = defaultdict(list)
    for key in state.df.index:
        household_map[_parse_household_id(key)].append(key)

    # Keep only samples from households that contributed exactly two samples.
    good_keys = []
    for household in household_map:
        sample_ids = household_map[household]
        if len(sample_ids) == 2:
            for sample_id in sample_ids:
                good_keys.append(sample_id)
    good_keys = set(good_keys)

    return state.update(
        df=state.df[state.df.index.isin(good_keys)],
        meta_df=state.meta_df[state.meta_df.index.isin(good_keys)])

def __call__(self, state: PipelineState, mode: str):
    new_df = None
    if mode == 'train':
        new_df = self.reducer.fit_transform(state.df)
    elif mode == 'test':
        new_df = self.reducer.transform(state.df)

    # # Plot That Umap!
    # print("MODE:", mode)
    # from matplotlib import pyplot as plt
    # plt.scatter(new_df[:, 0], new_df[:, 1])
    # plt.title("UMAP")
    # plt.xlabel("UMAP0")
    # plt.ylabel("UMAP1")
    # plt.show()
    # plt.close()

    return state.update_df(
        pd.DataFrame(new_df,
                     columns=["UMAP1", "UMAP2"],
                     index=state.df.index))

def _restrict_columns_compositional(state: PipelineState,
                                    chosen_columns: list) \
        -> PipelineState:
    df = state.df
    present_columns = []
    for col in chosen_columns:
        if col in state.df.columns:
            present_columns.append(col)

    if len(present_columns) == 0:
        raise Exception(
            "Cannot restrict columns, no chosen columns are present")
    chosen_columns = present_columns

    # Sum everything outside the chosen columns into a single 'remainder'
    # column so the restricted table is still compositional.
    remainder = df.drop(chosen_columns, axis=1)
    df['remainder'] = remainder.sum(axis=1)
    restricted = df[chosen_columns + ['remainder']]
    return state.update_df(restricted)

def _filter_zero_sum(state: PipelineState) -> PipelineState:
    state.df['rowsum'] = state.df.sum(axis=1)
    state.df = state.df[state.df['rowsum'] > 0]
    state.df = state.df.drop(['rowsum'], axis=1)
    return _filter_to_matched_pairs(state)

def _target(state: PipelineState, meta_col_name: str, one_set: set) \
        -> PipelineState:
    return state.update_target(
        state.meta_df.apply(
            lambda row: 1 if row[meta_col_name] in one_set else 0,
            axis=1))

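# Hedged sketch (illustrative only, never called by the pipeline): how the
# binarization above behaves on a made-up metadata column. The column name
# and values are hypothetical.
def _example_target_binarization():
    import pandas as pd
    meta = pd.DataFrame({'disease': ['PD', 'Control', 'PD']},
                        index=['s1', 's2', 's3'])
    one_set = {'PD'}
    return meta.apply(lambda row: 1 if row['disease'] in one_set else 0,
                      axis=1)  # -> 1, 0, 1
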
def run_preprocessing(analysis_config, callbacks: AnalysisCallbacks):
    callbacks.start_analysis(analysis_config)

    analysis_name = analysis_config.analysis_name
    metadata_filepath = analysis_config.metadata_filepath

    feature_set_index = analysis_config.feature_set_index
    if feature_set_index is not None:
        feature_set_index = feature_set_index.features
    training_set_index = analysis_config.training_set_index
    if training_set_index is None:
        training_set_index = 0
    pair_strategy = analysis_config.pair_strategy
    if pair_strategy is None:
        pair_strategy = "paired_concat"
    metadata_filter = analysis_config.metadata_filter
    feature_filter = analysis_config.feature_filter
    dim_reduction = analysis_config.dimensionality_reduction
    normalization = analysis_config.normalization
    if normalization is None:
        normalization = Normalization.DEFAULT
    feature_transform = analysis_config.feature_transform

    # TODO: Probably need to keep the config for the algorithm next to the
    # algorithm itself.
    df = analysis_config.table_info.load_dataframe()

    # For raw reads, uncomment here
    # _print_read_count_info(df)

    # Load metadata DataFrame
    metadata = Metadata.load(metadata_filepath)
    meta_df = metadata.to_dataframe()

    state = PipelineState(df, meta_df, None)
    print(analysis_name)

    # Run Preprocessing
    train_state, test_state = preprocessing_pipeline.process(
        analysis_config,
        state,
        callbacks,
        restricted_feature_set=feature_set_index,
        training_set_index=training_set_index,
        verbose=False,
        pair_strategy=pair_strategy,
        metadata_filter=metadata_filter,
        feature_filter=feature_filter,
        dim_reduction=dim_reduction,
        normalization=normalization,
        feature_transform=feature_transform,
        meta_encoder=analysis_config.meta_encoder,
        downsample_count=analysis_config.downsample_count)

    df = train_state.df
    meta_df = train_state.meta_df
    target = train_state.target

    # Shuffle the data so the machine learning can't learn anything based on
    # order.
    df = df.sample(frac=1, random_state=110399473805677 % (2**32))

    # Target and df order must match, so reattach the target to the shuffled
    # frame before splitting it back out.
    df['target'] = target
    target = df['target']
    df = df.drop(['target'], axis=1)

    # Convert necessary types for regression-benchmarking
    final_biom = Artifact.import_data("FeatureTable[Frequency]", df) \
        .view(biom.Table)

    return final_biom, target, train_state, test_state

def _sum_columns(state: PipelineState):
    series = state.df.sum(axis=1)
    series.name = "score"
    return state.update_df(series.to_frame())

def _apply_feature_transform(state: PipelineState,
                             transformer: FeatureTransformer):
    return state.update_df(transformer.transform_df(state.df))

def aggregate_mean(state: PipelineState) -> PipelineState:
    return state.update_df(state.df.groupby(lambda x: x).mean())

def matched_pair_subtract_sex_balanced(state: PipelineState,
                                       meta_col_name: str,
                                       one_set: set) -> PipelineState:
    state = _target(state, meta_col_name, one_set)
    state.df["__sex__"] = state.meta_df.apply(
        lambda row: 1 if row["sex"] == "F" else 0, axis=1)
    state.df['__target__'] = state.target
    state.df['__household__'] = state.df.index
    state.df['__household__'] = state.df['__household__'].apply(
        _parse_household_id)

    households = set(state.df['__household__'].tolist())

    # Bucket households by the sexes of the target / non-target pair members.
    MM_hh = set()
    MF_hh = set()
    FM_hh = set()
    FF_hh = set()
    for hh in households:
        hh_frame = state.df[state.df['__household__'] == hh]
        if hh_frame["__sex__"].sum() == 2:
            FF_hh.add(hh)
        elif hh_frame["__sex__"].sum() == 0:
            MM_hh.add(hh)
        elif ((hh_frame["__sex__"] == 1) &
              (hh_frame["__target__"] == 1)).sum() == 1:
            FM_hh.add(hh)
        else:
            MF_hh.add(hh)

    # Pick a subset: downsample FM households to match the number of MF
    # households, then keep all same-sex households.
    r = default_rng(_CHOICE_RANDOM_SEED)
    accept_set = r.choice(sorted(list(FM_hh)),
                          len(MF_hh),
                          replace=False).tolist()
    accept_set = accept_set + list(FF_hh) + list(MM_hh) + list(MF_hh)

    state.df = state.df[state.df['__household__'].isin(accept_set)]
    state.df = state.df.drop(["__household__", "__target__"], axis=1)

    meta_accept_set = [s.replace("-", "") for s in accept_set]
    state.meta_df = state.meta_df[state.meta_df['household'].isin(
        meta_accept_set)]

    (left, right) = _split_left_right(state, meta_col_name, one_set)
    left = left.sort_index()
    right = right.sort_index()
    df = right - left

    # TODO FIXME HACK: Does target's type matter? Do we need to
    # turn these into ints/booleans?
    target = df['target'] / 2 + .5

    # After subtraction, the households fall into
    #   MM  0
    #   FF  0
    #   MF -1
    #   FM  1
    # on the __sex__ column; A-F below count each target/sex combination.
    A = (df['target'] == -1) & (df['__sex__'] == -1)
    B = (df['target'] == -1) & (df['__sex__'] == 0)
    C = (df['target'] == -1) & (df['__sex__'] == 1)
    D = (df['target'] == 1) & (df['__sex__'] == -1)
    E = (df['target'] == 1) & (df['__sex__'] == 0)
    F = (df['target'] == 1) & (df['__sex__'] == 1)
    # print(A.sum(), B.sum(), C.sum())
    # print(D.sum(), E.sum(), F.sum())
    # print((A | B | C | D | E | F).sum())

    df = df.drop('target', axis=1)
    df = df.drop('__sex__', axis=1)
    return state.update(target=target, df=df)

def fix_sample_ids(state: PipelineState) -> PipelineState:
    return state.update_df(state.df.rename(mapper=_parse_sample_id))

def divide_total(state: PipelineState, target_count: int) -> PipelineState:
    df = state.df.div(state.df.sum(axis=1) / target_count, axis=0)
    return state.update_df(df)

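# Hedged sketch (illustrative only, never called by the pipeline): total-sum
# scaling as done above. Each row is divided by its own total and multiplied
# by target_count, so every sample sums to target_count. The toy counts and
# the 10000 target are made up.
def _example_divide_total():
    import pandas as pd
    toy = pd.DataFrame({'taxonA': [2, 30], 'taxonB': [8, 70]},
                       index=['s1', 's2'])
    scaled = toy.div(toy.sum(axis=1) / 10000, axis=0)
    return scaled  # each row now sums to 10000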