def aggregate_pick_high_count(state: PipelineState) -> PipelineState:
    # For each duplicated sample ID, keep only the row with the highest
    # total read count.
    state.df['read_count'] = state.df.sum(axis=1)
    state.df = state.df.sort_values('read_count', ascending=False)\
               .groupby(lambda x: x)\
               .first()
    state.df = state.df.drop(columns=['read_count'])
    return state
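# Hedged usage sketch: the toy table below is hypothetical, and
# PipelineState is assumed to be constructible as elsewhere in this module.
def _example_aggregate_pick_high_count():
    # Two rows share the sample ID 'S1'; the row with the larger read
    # total (20 + 5 = 25 vs. 1 + 2 = 3) survives.
    df = pd.DataFrame([[20, 5], [1, 2], [3, 3]],
                      index=['S1', 'S1', 'S2'],
                      columns=['taxA', 'taxB'])
    state = aggregate_pick_high_count(PipelineState(df, pd.DataFrame(), None))
    assert list(state.df.loc['S1']) == [20, 5]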
def split_fixed_set(state: PipelineState, train_households, verbose=False):
    # Split samples by household so both members of a pair land on the
    # same side of the train/test split.
    state.df['household'] = state.df.index.map(_parse_household_id)
    train_df = state.df.loc[state.df['household'].isin(train_households)]
    test_df = state.df.loc[~state.df['household'].isin(train_households)]

    train_index = train_df.index
    test_index = test_df.index

    train_meta = state.meta_df[state.meta_df.index.isin(train_index)]
    test_meta = state.meta_df[state.meta_df.index.isin(test_index)]

    if state.target is not None:
        train_target = state.target[state.target.index.isin(train_index)]
        test_target = state.target[state.target.index.isin(test_index)]
    else:
        train_target = None
        test_target = None

    state.df = state.df.drop('household', axis=1)
    train_df = train_df.drop('household', axis=1)
    test_df = test_df.drop('household', axis=1)

    if verbose:
        print(train_df)
        print(test_df)

    return TrainTest(train=PipelineState(train_df, train_meta, train_target),
                     test=PipelineState(test_df, test_meta, test_target))
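# Hedged usage sketch (the household IDs are hypothetical, and
# _parse_household_id is assumed to map each sample ID to its household):
def _example_split_fixed_set(state: PipelineState):
    # Samples from H001/H002 form the training set; all other households
    # form the test set. Pairs never straddle the split.
    split = split_fixed_set(state, train_households={'H001', 'H002'})
    return split.train, split.test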
def _filter_by_metadata(state: PipelineState, meta_col: str,
                        meta_val: set) -> PipelineState:
    # Temporarily join the metadata column onto the feature table so both
    # frames can be filtered to samples whose value is in meta_val.
    values = state.meta_df[meta_col]

    state.df = state.df.join(values)
    state.df = state.df[state.df[meta_col].isin(meta_val)]
    state.df = state.df.drop([meta_col], axis=1)

    state.meta_df = state.meta_df[state.meta_df[meta_col].isin(meta_val)]
    return _filter_to_matched_pairs(state)
Example #4
def clr_wrapper(state: PipelineState):
    # clr is undefined on zero counts (log of zero), so add a 0.5
    # pseudocount before transforming.
    clr_data = clr(state.df.to_numpy() + .5)
    new_df = pd.DataFrame(data=clr_data,
                          index=state.df.index,
                          columns=state.df.columns)
    return state.update_df(new_df)
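# For reference, clr on a positive vector x is log(x / g(x)) with g(x)
# the geometric mean; a hedged sketch checking that identity (assumes
# the same clr import used above, e.g. skbio.stats.composition.clr):
def _example_clr_by_hand():
    import numpy as np
    x = np.array([1.5, 2.5, 4.5])           # counts plus the 0.5 pseudocount
    by_hand = np.log(x) - np.log(x).mean()  # log(x) - log(g(x))
    assert np.allclose(clr(x), by_hand)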
Example #5
def rarefy_wrapper(state: PipelineState, target_count: int) -> PipelineState:
    # Round-trip through a biom.Table view so QIIME 2's rarefy can
    # subsample each sample down to target_count reads.
    table = Artifact.import_data("FeatureTable[Frequency]", state.df) \
        .view(biom.Table)
    table = rarefy(table, target_count)
    df = Artifact.import_data("FeatureTable[Frequency]", table) \
        .view(pd.DataFrame)
    return state.update_df(df)
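# Hedged usage sketch (the sampling depth below is hypothetical). Rarefying
# subsamples reads without replacement, and samples with fewer than
# target_count total reads are dropped from the table.
def _example_rarefy(state: PipelineState) -> PipelineState:
    return rarefy_wrapper(state, target_count=10000)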
def _filter_by_shared_ids(state: PipelineState) -> PipelineState:
    df_set = set(state.df.index)
    meta_set = set(state.meta_df.index)
    valid_set = df_set.intersection(meta_set)

    return state.update(
        df=state.df[state.df.index.isin(valid_set)],
        meta_df=state.meta_df[state.meta_df.index.isin(valid_set)])
def _filter_out_sample_ids(state: PipelineState, bad_sample_ids) \
        -> PipelineState:
    if len(bad_sample_ids) == 0:
        return state

    def _filter(df):
        bad_rows = df.index.isin(set(bad_sample_ids))
        return df[~bad_rows]

    return state.update(df=_filter(state.df), meta_df=_filter(state.meta_df))
Example #8
def fix_input_table(df):
    df = df.set_index("sample")
    # Replace the total with the non-target remainder so the two count
    # columns partition each sample's reads.
    df["remainder"] = df["total_reads"] - df["target_reads"]
    df = df.drop("total_reads", axis=1)
    df.index.name = None
    df.columns.name = None

    ps = PipelineState(df, pd.DataFrame(), None)
    ps = sample_filtering.build_prefix_filter(BAD_SAMPLE_PREFIXES)(ps,
                                                                   "filter")
    ps = id_parsing.build()(ps, "filter")
    return ps.df
Example #9
    def __call__(self, state: PipelineState, mode: str):
        if mode == 'train':
            new_df = self._lda.fit_transform(state.df, state.target)
        elif mode == 'test':
            new_df = self._lda.transform(state.df)
        else:
            raise ValueError("Unknown mode: " + mode)

        return state.update_df(
            pd.DataFrame(
                new_df,
                columns=['LDA%i' % i for i in range(self.num_components)],
                index=state.df.index))
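# Hedged sketch of the enclosing class this method assumes: a thin wrapper
# around what appears to be sklearn's LinearDiscriminantAnalysis. Note
# sklearn caps n_components at min(n_classes - 1, n_features), so a binary
# target yields a single LDA column.
#
#   from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
#
#   class LDATransform:
#       def __init__(self, num_components: int):
#           self.num_components = num_components
#           self._lda = LinearDiscriminantAnalysis(
#               n_components=num_components)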
def _filter_out_sample_id_prefix(state: PipelineState, bad_prefixes) \
        -> PipelineState:

    if len(bad_prefixes) == 0:
        return state

    def _filter(df):
        # OR together a mask for each bad prefix, then keep only the rows
        # matching none of them.
        bad_rows = df.index.str.startswith(bad_prefixes[0])
        for i in range(1, len(bad_prefixes)):
            bad_rows |= df.index.str.startswith(bad_prefixes[i])
        return df[~bad_rows]

    return state.update(df=_filter(state.df), meta_df=_filter(state.meta_df))
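# Equivalent sketch: like str.startswith in plain Python, the pandas string
# accessor accepts a tuple of prefixes (in recent pandas versions), which
# collapses the loop above to a single call:
#
#   bad_rows = df.index.str.startswith(tuple(bad_prefixes))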
def matched_pair_subtract(state: PipelineState, meta_col_name: str,
                          one_set: set) -> PipelineState:
    (left, right) = _split_left_right(state, meta_col_name, one_set)

    left = left.sort_index()
    right = right.sort_index()
    df = right - left

    # TODO FIXME HACK:  Does target's type matter?  Do we need to
    #  turn these into ints/booleans?
    target = df['target'] / 2 + .5
    df = df.drop('target', axis=1)
    return state.update(target=target, df=df)
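# Worked note on the remap above: each matched pair holds one case
# (target 1) and one control (target 0), so right - left is +1 or -1,
# and x / 2 + .5 maps that back to {0, 1}: 1 / 2 + .5 = 1 and
# -1 / 2 + .5 = 0.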
def matched_pair_concat(state: PipelineState, meta_col_name: str,
                        one_set: set) -> PipelineState:
    (left, right) = _split_left_right(state, meta_col_name, one_set)

    # Prefix the column names so the two halves of each pair can sit side
    # by side after the concat.
    left = left.rename(columns=lambda name: "L" + name)
    right = right.rename(columns=lambda name: "R" + name)

    left = left.sort_index()
    right = right.sort_index()
    df = pd.concat([left, right], axis=1)
    df = df.drop('Ltarget', axis=1)
    target = df['Rtarget']
    df = df.drop('Rtarget', axis=1)
    return state.update(target=target, df=df)
def _filter_to_matched_pairs(state: PipelineState) -> PipelineState:
    # Keep only samples from households that contributed exactly two
    # samples (i.e., one complete matched pair).
    household_map = defaultdict(list)
    for key in state.df.index:
        household_map[_parse_household_id(key)].append(key)

    good_keys = []
    for household in household_map:
        sample_ids = household_map[household]
        if len(sample_ids) == 2:
            for sample_id in sample_ids:
                good_keys.append(sample_id)

    good_keys = set(good_keys)
    return state.update(
        df=state.df[state.df.index.isin(good_keys)],
        meta_df=state.meta_df[state.meta_df.index.isin(good_keys)])
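# Hedged sketch of the invariant (household IDs hypothetical): if household
# H1 contributed samples {H1-A, H1-B} and H2 only {H2-A}, both H1 samples
# survive and the unpaired H2-A is dropped from df and meta_df alike.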
Example #14
    def __call__(self, state: PipelineState, mode: str):
        if mode == 'train':
            new_df = self.reducer.fit_transform(state.df)
        elif mode == 'test':
            new_df = self.reducer.transform(state.df)
        else:
            raise ValueError("Unknown mode: " + mode)

        return state.update_df(
            pd.DataFrame(new_df,
                         columns=["UMAP1", "UMAP2"],
                         index=state.df.index))
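# Hedged sketch of the reducer this method assumes: a umap-learn UMAP
# instance whose 2-D output matches the two hard-coded column names.
#
#   import umap
#
#   class UMAPTransform:
#       def __init__(self):
#           self.reducer = umap.UMAP(n_components=2)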
Example #15
def _restrict_columns_compositional(state: PipelineState,
                                    chosen_columns: list) \
        -> PipelineState:

    df = state.df
    present_columns = [col for col in chosen_columns
                       if col in df.columns]
    if len(present_columns) == 0:
        raise ValueError(
            "Cannot restrict columns, no chosen columns are present")
    chosen_columns = present_columns

    # Sum every non-chosen column into a single 'remainder' column so each
    # sample keeps its original row total and the table stays
    # compositional.
    remainder = df.drop(chosen_columns, axis=1)
    df['remainder'] = remainder.sum(axis=1)
    restricted = df[chosen_columns + ['remainder']]
    return state.update_df(restricted)
def _filter_zero_sum(state: PipelineState) -> PipelineState:
    # Drop samples whose reads sum to zero, then re-restrict to complete
    # household pairs.
    state.df['rowsum'] = state.df.sum(axis=1)
    state.df = state.df[state.df['rowsum'] > 0]
    state.df = state.df.drop(['rowsum'], axis=1)
    return _filter_to_matched_pairs(state)
def _target(state: PipelineState, meta_col_name: str, one_set: set) \
        -> PipelineState:
    return state.update_target(
        state.meta_df.apply(lambda row: 1
                            if row[meta_col_name] in one_set else 0,
                            axis=1))
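# Hedged usage sketch (the column name and value are hypothetical): mark
# samples whose 'disease' metadata value is in the positive set with 1,
# everything else with 0.
#
#   state = _target(state, 'disease', one_set={'IBD'})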
Example #18
def run_preprocessing(analysis_config, callbacks: AnalysisCallbacks):
    callbacks.start_analysis(analysis_config)

    analysis_name = analysis_config.analysis_name
    metadata_filepath = analysis_config.metadata_filepath
    feature_set_index = analysis_config.feature_set_index
    if feature_set_index is not None:
        feature_set_index = feature_set_index.features
    training_set_index = analysis_config.training_set_index
    if training_set_index is None:
        training_set_index = 0
    pair_strategy = analysis_config.pair_strategy
    if pair_strategy is None:
        pair_strategy = "paired_concat"
    metadata_filter = analysis_config.metadata_filter
    feature_filter = analysis_config.feature_filter
    dim_reduction = analysis_config.dimensionality_reduction
    normalization = analysis_config.normalization
    if normalization is None:
        normalization = Normalization.DEFAULT
    feature_transform = analysis_config.feature_transform
    # TODO: Probably need to keep the config for the algorithm next to the
    #  algorithm itself.

    df = analysis_config.table_info.load_dataframe()
    # For raw reads, uncomment here
    # _print_read_count_info(df)

    # print("RAGE")
    # print(biom_filepath)
    # print(df.columns)
    # print(df['88431'])
    # print(df['411462'])
    # print("END RAGE")

    # # Look up ids for genera
    # genera = biom_table.metadata_to_dataframe(axis="observation")
    # genera = genera[genera["Name"].isin([
    #     "Akkermansia",
    #     "Bifidobacterium",
    #     "Bilophila",
    #     "Blautia",
    #     "Butyricimonas",
    #     "Coprococcus",
    #     "Christensenella",
    #     "Desulfovibrio",
    #     "Faecalibacterium",
    #     "Haemophilus",
    #     "Methanobrevibacter",
    #     "Mycoplana",
    #     "Paraprevotella",
    #     "Pedobacter",
    #     "Pseudomonas",
    #     "Slackia",
    #     # "Streptococcus thermophiles",
    #     # "Streptococcus salivarius",
    #     # "Faecalibacterium prausnitzi",
    #     # "Coprococcus comes",
    #     # "Anaerostipes hadrus",
    #     # "Eubacterium rectale",
    #     # "Acinetobacter calcoaceticus",
    #     # "Akkermansia muciniphila",
    #     # "Eggerthella lenta"
    # ])]

    # pd.set_option('display.max_rows', None)
    # print(genera.sort_values("Name"))

    # Load metadata DataFrame
    metadata = Metadata.load(metadata_filepath)
    meta_df = metadata.to_dataframe()

    state = PipelineState(df, meta_df, None)

    print("Analysis:", analysis_name)
    # Run Preprocessing
    train_state, test_state = preprocessing_pipeline.process(
        analysis_config,
        state,
        callbacks,
        restricted_feature_set=feature_set_index,
        training_set_index=training_set_index,
        verbose=False,
        pair_strategy=pair_strategy,
        metadata_filter=metadata_filter,
        feature_filter=feature_filter,
        dim_reduction=dim_reduction,
        normalization=normalization,
        feature_transform=feature_transform,
        meta_encoder=analysis_config.meta_encoder,
        downsample_count=analysis_config.downsample_count)

    df = train_state.df
    meta_df = train_state.meta_df
    target = train_state.target

    # For normalized reads after preprocessing, uncomment here
    # c_perf = ['G000013285']
    # akkermansias = [
    #     'G000020225','G000436395','G000437075',
    #     'G000723745', 'G000980515', 'G001578645',
    #     'G001580195', 'G001647615', 'G001683795',
    #     'G001917295', 'G001940945', 'G900097105'
    # ]
    # df2 = df[c_perf + akkermansias]
    # df_akk = df[akkermansias]
    # df2['sum'] = df_akk.sum(axis=1)
    # for akk in akkermansias + ['sum']:
    #     max_akk = df2[akk].max().max()
    #     min_akk = max_akk - 1500
    #     df3 = df2[df2[akk] > min_akk]
    #     print(akk, min_akk/10000, "to", max_akk/10000)
    #     print(df3)
    # import code
    # code.interact(local=locals())

    # _print_read_count_info(df)

    # Shuffle the data so the machine learning can't learn anything based on
    # order
    df = df.sample(frac=1, random_state=110399473805677 % (2**32))

    # Reattach and re-extract the target so its row order matches the
    # shuffled df.
    df['target'] = target
    target = df['target']
    df = df.drop(['target'], axis=1)

    # Convert necessary types for regression-benchmarking
    final_biom = Artifact.import_data("FeatureTable[Frequency]", df)\
        .view(biom.Table)

    return final_biom, target, train_state, test_state
Example #19
def _sum_columns(state: PipelineState):
    series = state.df.sum(axis=1)
    series.name = "score"
    return state.update_df(series.to_frame())
Example #20
def _apply_feature_transform(state: PipelineState,
                             transformer: FeatureTransformer):
    return state.update_df(transformer.transform_df(state.df))
def aggregate_mean(state: PipelineState) -> PipelineState:
    # Collapse duplicated sample IDs by averaging their rows.
    return state.update_df(state.df.groupby(lambda x: x).mean())
def matched_pair_subtract_sex_balanced(state: PipelineState,
                                       meta_col_name: str,
                                       one_set: set) -> PipelineState:

    state = _target(state, meta_col_name, one_set)
    state.df["__sex__"] = state.meta_df.apply(lambda row: 1
                                              if row["sex"] == "F" else 0,
                                              axis=1)

    state.df['__target__'] = state.target
    state.df['__household__'] = state.df.index
    state.df['__household__'] = state.df['__household__'].apply(
        _parse_household_id)

    households = state.df['__household__'].tolist()
    households = set(households)

    # Bucket each household by the sexes of its two members; mixed
    # households split on whether the female member is the target-positive
    # one (FM) or not (MF).
    MM_hh = set()
    MF_hh = set()
    FM_hh = set()
    FF_hh = set()
    for hh in households:
        hh_frame = state.df[state.df['__household__'] == hh]
        if hh_frame["__sex__"].sum() == 2:
            FF_hh.add(hh)
        elif hh_frame["__sex__"].sum() == 0:
            MM_hh.add(hh)
        elif ((hh_frame["__sex__"] == 1) &
              (hh_frame["__target__"] == 1)).sum() == 1:
            FM_hh.add(hh)
        else:
            MF_hh.add(hh)

    # print("Lengths:")
    # print(len(MM_hh), len(FF_hh), len(FM_hh), len(MF_hh))
    # print("MM")
    # print(MM_hh)
    # print("FF")
    # print(FF_hh)

    # Downsample the larger mixed-sex group (FM is assumed to be at least
    # as large as MF here) so the two mixed orientations are balanced and
    # sex is not confounded with the target.
    r = default_rng(_CHOICE_RANDOM_SEED)
    accept_set = r.choice(sorted(list(FM_hh)), len(MF_hh),
                          replace=False).tolist()

    accept_set = accept_set + list(FF_hh) + list(MM_hh) + list(MF_hh)
    # print("ACCEPT SET: ", len(accept_set))
    state.df = state.df[state.df['__household__'].isin(accept_set)]
    state.df = state.df.drop(["__household__", "__target__"], axis=1)

    meta_accept_set = [s.replace("-", "") for s in accept_set]
    state.meta_df = state.meta_df[state.meta_df['household'].isin(
        meta_accept_set)]

    (left, right) = _split_left_right(state, meta_col_name, one_set)

    left = left.sort_index()
    right = right.sort_index()
    df = right - left

    # Same +/-1 to {0, 1} remap as in matched_pair_subtract.
    # TODO FIXME HACK:  Does target's type matter?  Do we need to
    #  turn these into ints/booleans?
    target = df['target'] / 2 + .5

    # After subtraction, df['__sex__'] encodes each pair's sex makeup
    # (MM and FF give 0, MF gives -1, FM gives 1); both helper columns
    # have served their purpose and can be dropped.

    df = df.drop('target', axis=1)
    df = df.drop('__sex__', axis=1)

    return state.update(target=target, df=df)
Example #23
def fix_sample_ids(state: PipelineState) -> PipelineState:
    # rename with a bare mapper applies to the index, normalizing each
    # sample ID.
    return state.update_df(state.df.rename(mapper=_parse_sample_id))
Example #24
def divide_total(state: PipelineState, target_count: int) -> PipelineState:
    df = state.df.div(state.df.sum(axis=1) / target_count, axis=0)
    return state.update_df(df)
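# Hedged worked example with toy numbers: the row [2, 3] sums to 5, each
# value is divided by 5 / 10 = 0.5, and the result [4, 6] sums to the
# target count.
def _example_divide_total():
    df = pd.DataFrame([[2, 3]], index=['S1'], columns=['taxA', 'taxB'])
    state = divide_total(PipelineState(df, pd.DataFrame(), None), 10)
    assert list(state.df.loc['S1']) == [4.0, 6.0]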