Example #1
def test_concat():
    df = pl.DataFrame({"a": [2, 1, 3], "b": [1, 2, 3], "c": [1, 2, 3]})

    assert pl.concat([df, df]).shape == (6, 3)

    # check if a remains unchanged
    a = pl.from_rows(((1, 2), (1, 2)))
    _ = pl.concat([a, a, a])
    assert a.shape == (2, 2)
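# A minimal sketch (not from the original suite) of the copy/rechunk
# behaviour the test above relies on: pl.concat builds a new frame and, by
# default, rechunks it; rechunk=False keeps one chunk per input instead.
df_sketch = pl.DataFrame({"a": [1, 2]})
assert pl.concat([df_sketch, df_sketch], rechunk=True).n_chunks() == 1
assert pl.concat([df_sketch, df_sketch], rechunk=False).n_chunks() == 2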
Example #2
def test_lazy_concat(df: pl.DataFrame) -> None:
    shape = df.shape
    shape = (shape[0] * 2, shape[1])

    out = pl.concat([df.lazy(), df.lazy()]).collect()
    assert out.shape == shape
    assert out.frame_equal(df.vstack(df.clone()), null_equal=True)
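# A minimal sketch of the vstack equivalence asserted above: vstack stacks
# exactly two eager frames (without rechunking), so it matches a two-frame
# pl.concat row for row.
base = pl.DataFrame({"x": [1, 2]})
assert base.vstack(base.clone()).frame_equal(pl.concat([base, base]))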
Example #3
def build_gene_annotation_df(pset_dict):
    """
    Build a table mapping each gene in a dataset to its gene annotations.
    @param pset_dict: [`dict`] A nested dictionary containing all tables in the PSet
    @return: [`DataFrame`] A table of all gene annotations, mapped to genes
    """
    # Extract all of the molecular data types for the PSet
    df_list = [
        pl.from_pandas(pset_dict['molecularProfiles'][mDataType]['rowData'])
        for mDataType in pset_dict['molecularProfiles']
    ]
    # Get columns of interest, add columns needed later
    for i in range(len(df_list)):
        df_list[i] = df_list[i].select(['.features'])
        empty_column = [None for _ in range(len(df_list[i]['.features']))]
        df_list[i]['symbol'] = pl.Series('symbol', empty_column, dtype=pl.Utf8)
        df_list[i]['gene_seq_start'] = pl.Series('gene_seq_start',
                                                 empty_column,
                                                 dtype=pl.Int64)
        df_list[i]['gene_seq_end'] = pl.Series('gene_seq_end',
                                               empty_column,
                                               dtype=pl.Int64)
    # Merge to a single DataFrame
    gene_annotation_df = pl.concat(df_list) \
        .rename({'.features': 'gene_id'})
    # Remove Ensembl gene version
    gene_annotation_df['gene_id'] = gene_annotation_df['gene_id'] \
        .apply(lambda x: re.sub(r'\..*$', '', x))
    gene_annotation_df = gene_annotation_df \
        .drop_duplicates() \
        .to_pandas()
    return gene_annotation_df
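# The item assignment above (df_list[i]['symbol'] = ...) is the old polars
# API. A sketch of the same null-column setup in expression style, assuming
# a polars version with with_columns and typed pl.lit (helper name is ours):
def add_empty_annotation_columns(df):
    return df.with_columns([
        pl.lit(None, dtype=pl.Utf8).alias('symbol'),
        pl.lit(None, dtype=pl.Int64).alias('gene_seq_start'),
        pl.lit(None, dtype=pl.Int64).alias('gene_seq_end'),
    ])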
Example #4
def concat_and_sort(blocks: List["pyarrow.Table"], key: "SortKeyT",
                    descending: bool) -> "pyarrow.Table":
    check_polars_installed()
    col, _ = key[0]
    blocks = [pl.from_arrow(block) for block in blocks]
    df = pl.concat(blocks).sort(col, reverse=descending)
    return df.to_arrow()
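# A round-trip sketch of the from_arrow/to_arrow boundary used above
# (assumes pyarrow is installed; sort's reverse= keyword is the old polars
# spelling, newer versions use descending=):
import pyarrow as pa
tbl = pa.table({'x': [3, 1, 2]})
assert pl.from_arrow(tbl).sort('x').to_arrow().equals(pa.table({'x': [1, 2, 3]}))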
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('json_dirs_list_fname')
    parser.add_argument('outfname')
    args = parser.parse_args()

    with open(args.json_dirs_list_fname) as json_dirs_list_file:
        dirs_list = json.loads(json_dirs_list_file.read().strip())

    dfs = []
    for pheno, region, dir_ in dirs_list:
        if pheno == 'urate' and region == '4_8165642_11717761':
            continue
        print(f'Loading region {pheno}: {region}', flush=True)
        dfs.append(load_dir(pheno, region, dir_))
    pl.concat(dfs).with_columns([
        pl.col('susie_pip').round(4),
        pl.col('susie_cs_pip').round(4),
    ]).collect().to_csv(args.outfname, sep='\t')
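# Note: load_dir evidently returns LazyFrames, so pl.concat stays lazy here
# and the with_columns rounding only runs at the .collect() call above.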
Example #6
def test_unset_sorted_on_append() -> None:
    df1 = pl.DataFrame([
        pl.Series("key", ["a", "b", "a", "b"], dtype=pl.Categorical),
        pl.Series("val", [1, 2, 3, 4]),
    ]).sort("key")
    df2 = pl.DataFrame([
        pl.Series("key", ["a", "b", "a", "b"], dtype=pl.Categorical),
        pl.Series("val", [5, 6, 7, 8]),
    ]).sort("key")
    df = pl.concat([df1, df2], rechunk=False)
    assert df.groupby("key").count()["count"].to_list() == [4, 4]
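# A sketch of why the sorted flag must be unset: two frames that are each
# sorted on "key" are not globally sorted once stacked, so a groupby that
# trusted a stale sorted flag could miscount. Categoricals from separate
# frames combine safely under a shared string cache:
with pl.StringCache():
    c1 = pl.DataFrame([pl.Series("key", ["a"], dtype=pl.Categorical)])
    c2 = pl.DataFrame([pl.Series("key", ["b"], dtype=pl.Categorical)])
    assert pl.concat([c1, c2]).shape == (2, 1)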
Example #7
def test_expression_appends() -> None:
    df = pl.DataFrame({"a": [1, 1, 2]})

    assert df.select(pl.repeat(None, 3).append(pl.col("a"))).n_chunks() == 2

    assert df.select(pl.repeat(None, 3).append(pl.col("a")).rechunk()).n_chunks() == 1

    out = df.select(pl.concat([pl.repeat(None, 3), pl.col("a")]))

    assert out.n_chunks() == 1
    assert out.to_series().to_list() == [None, None, None, 1, 1, 2]
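# Note the contrast the asserts above draw: Expr.append keeps one chunk per
# input (two chunks until rechunk()), while pl.concat on expressions yields
# a single contiguous, one-chunk Series.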
Example #8
def test_predicate_count_vstack() -> None:
    l1 = pl.DataFrame({
        "k": ["x", "y"],
        "v": [3, 2],
    }).lazy()
    l2 = pl.DataFrame({
        "k": ["x", "y"],
        "v": [5, 7],
    }).lazy()
    assert pl.concat([l1, l2]).filter(
        pl.count().over("k") == 2).collect()["v"].to_list() == [3, 2, 5, 7]
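# The window count pl.count().over("k") runs after the lazy frames are
# stacked, so each of "x" and "y" counts 2 rows and every row passes the
# filter; the predicate is evaluated against the concatenated result.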
Example #9
def test_concat_horizontal() -> None:
    a = pl.DataFrame({"a": ["a", "b"], "b": [1, 2]})
    b = pl.DataFrame({"c": [5, 7, 8, 9], "d": [1, 2, 1, 2], "e": [1, 2, 1, 2]})

    out = pl.concat([a, b], how="horizontal")
    expected = pl.DataFrame({
        "a": ["a", "b", None, None],
        "b": [1, 2, None, None],
        "c": [5, 7, 8, 9],
        "d": [1, 2, 1, 2],
        "e": [1, 2, 1, 2],
    })
    assert out.frame_equal(expected)
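# how="horizontal" aligns frames by row position: the shorter frame a is
# null-padded out to the 4 rows of b, and column names must be unique
# across the inputs.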
Example #10
def test_diag_concat() -> None:
    a = pl.DataFrame({"a": [1, 2]})
    b = pl.DataFrame({"b": ["a", "b"], "c": [1, 2]})
    c = pl.DataFrame({"a": [5, 7], "c": [1, 2], "d": [1, 2]})

    out = pl.concat([a, b, c], how="diagonal")
    expected = pl.DataFrame({
        "a": [1, 2, None, None, 5, 7],
        "b": [None, None, "a", "b", None, None],
        "c": [None, None, 1, 2, 1, 2],
        "d": [None, None, None, None, 1, 2],
    })

    assert out.frame_equal(expected, null_equal=True)
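# how="diagonal" takes the union of all column names and null-fills wherever
# an input lacks a column, as the expected frame above shows.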
Example #11
def test_categorical_lexical_ordering_after_concat() -> None:
    with pl.StringCache():
        ldf1 = (pl.DataFrame([
            pl.Series("key1", [8, 5]),
            pl.Series("key2", ["fox", "baz"])
        ]).lazy().with_column(
            pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")))
        ldf2 = (pl.DataFrame([
            pl.Series("key1", [6, 8, 6]),
            pl.Series("key2", ["fox", "foo", "bar"])
        ]).lazy().with_column(
            pl.col("key2").cast(pl.Categorical).cat.set_ordering("lexical")))
        df = (pl.concat([ldf1, ldf2]).with_column(
            pl.col("key2").cat.set_ordering("lexical")).collect())

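        # the sort result is discarded: the test only checks that sorting a
        # lexically ordered categorical after concat does not raise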
        df.sort(["key1", "key2"])
Example #12
def test_dtype_concat_3735() -> None:
    for dt in [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
            pl.Float32,
            pl.Float64,
    ]:
        d1 = pl.DataFrame([
            pl.Series("val", [1, 2], dtype=dt),
        ])
        d2 = pl.DataFrame([
            pl.Series("val", [3, 4], dtype=dt),
        ])
        df = pl.concat([d1, d2])
        assert df.shape == (4, 1)
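# A note on dtypes: pl.concat requires matching dtypes per column, which is
# why d1 and d2 above share dtype dt. (Newer polars than this suite targets
# can instead upcast mismatched numeric widths with
# pl.concat([...], how="vertical_relaxed"), stated here as an assumption.)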
Example #13
loci = pl.read_csv(args.str_loci, sep='\t',
                   has_header=False).distinct().rename({
                       'column_1': 'chrom',
                       'column_2': 'pos'
                   })

for chrom in range(1, 23):
    dfs = [
        loci.filter((pl.col('pos') >= start) & (pl.col('pos') <= end)
                    & (pl.col('chrom') == chrom))
        for (region_chrom, start, end) in zip(
            *pl.read_csv(args.finemapping_regions, sep='\t').select(
                ['chrom', 'start', 'end']).to_dict(False).values())
        if region_chrom == chrom
    ]
    if len(dfs) > 0:
        pl.concat(dfs).sort('pos').to_csv(
            f'{args.outdir}/{args.phenotype}_chr{chrom}.tab', sep='\t')
    else:
        pl.DataFrame({
            'chrom': [],
            'pos': []
        }).to_csv(f'{args.outdir}/{args.phenotype}_chr{chrom}.tab', sep='\t')
    '''
    df.filter(
        (pl.col('phenotype') == phenotype) &
        (pl.col('chrom') == chrom)
    ).select(['chrom', 'pos']).sort('pos').to_csv(, sep='\t')
    '''
Example #14
import argparse

import bokeh.plotting
import numpy as np
import polars as pl
import scipy.stats

parser = argparse.ArgumentParser()
parser.add_argument('outdir')
parser.add_argument('chrom_files', nargs='+',
                    help='4 cols: pos, chance of length confusion, avg abs length confusion, normalized avg abs length confusion')
args = parser.parse_args()
outdir = args.outdir
chrom_fnames = args.chrom_files

loci = pl.concat([
    pl.scan_csv(
        chrom_fname,
        sep='\t'
    ) for chrom_fname in chrom_fnames
]).drop('pos').collect()

for col in loci.columns:
    print(f'Plotting column {col} ...', flush=True)
    max_val = loci.select(pl.col(col).max()).to_numpy()
    min_val = loci.select(pl.col(col).min()).to_numpy()
    n_steps = 1000
    step_size = (max_val - min_val)/n_steps
    xs = np.arange(min_val, max_val + step_size, step_size)
    ys = scipy.stats.gaussian_kde(loci[col].to_numpy())(xs)

    if col.startswith('chance'):
        unit = '%'
    elif col.startswith('avg'):
Example #15
def test_concat() -> None:
    s = pl.Series("a", [2, 1, 3])

    assert pl.concat([s, s]).len() == 6
    # check if s remains unchanged
    assert s.len() == 3
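# A related sketch: in the polars versions this suite targets, Series.append
# mutates in place, while pl.concat always builds a new Series.
s1 = pl.Series("a", [1, 2])
s2 = pl.Series("a", [3])
assert pl.concat([s1, s2]).len() == 3 and s1.len() == 2
s1.append(s2)
assert s1.len() == 3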
Example #16
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('outtable')
    parser.add_argument('outreadme')
    parser.add_argument('pos_to_snpstr_pos')
    parser.add_argument('intable')
    parser.add_argument('inreadme')
    parser.add_argument('spot_test_fname_json_dict_fname')

    args = parser.parse_args()

    with open(args.spot_test_fname_json_dict_fname) as json_file:
        spot_test_fname_json_dict = next(json_file)

    with open(args.outreadme, 'w+') as readme:
        with open(args.inreadme) as inreadme:
            readme.write(inreadme.read())
        readme.write(
            'other_ethnic_association_ps - association p-values for the other '
            'ethnicities in the order ' +
            ','.join(other_ethnicities) + '\n'
        )
        readme.write(
            'other_ethnic_effect_directions - direction of association (+/-) '
            'for the other ethnicities in the order ' +
            ','.join(other_ethnicities) +
            " (NaN if that ethnicity's p > 0.05)\n"
        )
        for ethnicity in other_ethnicities:
            readme.write(
                f'{ethnicity}_population_allele_frequencies - frequencies of each allele '
                "(by dosage) among the ethnicity's tested population\n"
            )

    hits = pl.scan_csv(
        args.intable,
        sep='\t',
        # hack: added arguments here will be ignored when reading putatively_causal but not when reading exonic_finemapped
        dtype={'alleles': str}
    )
    cols = hits.columns

    # hack to only clean in one of the two cases this function is running
    if 'white_brit_allele_frequencies' in cols:
        hits = hits.with_column(
            pl.col('white_brit_allele_frequencies').str.replace_all('"', "'")
        )

    hits = hits.join(
        pl.scan_csv(args.pos_to_snpstr_pos, sep='\t'),
        how='left',
        left_on=['chrom', 'start_pos'],
        right_on=['chrom', 'pos']
    )

    spot_tests_fnames = {
        tuple(key.split('__')): fname
        for key, fname in
        json.loads(spot_test_fname_json_dict).items()
    }

    spot_tests = {}
    for outer_ethnicity in other_ethnicities:
        spot_tests[outer_ethnicity] = pl.concat([
            (pl.scan_csv(
                    spot_test_fname,
                    sep='\t',
                    dtype={'alleles': str},
                    null_values=['nan'],
                    # bind phenotype now: polars calls this back at collect time
                    with_column_names=lambda cols, phenotype=phenotype: list(fix_cols(cols, phenotype))
                ).select([
                    pl.lit(phenotype).alias('phenotype'),
                    'chrom',
                    'pos',
                    pl.col('p_phenotype').cast(float).alias(f'{ethnicity}_p'),
                    pl.when(pl.col('p_phenotype') >= 0.05).then(np.nan).when(pl.col('coeff_phenotype') > 0).then(pl.lit('+')).otherwise(pl.lit('-')).alias(f'{ethnicity}_effect_direction'),
                    pl.col('subset_total_per_allele_dosages').apply(reformat_dosage_dict_str).alias(f'{ethnicity}_population_allele_frequencies')
                ]))
            for (phenotype, _, _, ethnicity), spot_test_fname
            in spot_tests_fnames.items()
            if ethnicity == outer_ethnicity
        ])

    for ethnicity in other_ethnicities:
        hits = hits.join(
            spot_tests[ethnicity],
            how='left',
            left_on=['phenotype', 'chrom', 'snpstr_pos'],
            right_on=['phenotype', 'chrom', 'pos']
        )

    hits = hits.with_columns([
        pl.sum([pl.col(f'{ethnicity}_p').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
             .str.replace(', $', '').alias('other_ethnic_association_ps'),
        pl.sum([pl.col(f'{ethnicity}_effect_direction').cast(str) + pl.lit(', ') for ethnicity in other_ethnicities])
             .str.replace(', $', '').alias('other_ethnic_effect_directions')
    ])

    hits = hits.select([
        *cols,
        'other_ethnic_association_ps',
        'other_ethnic_effect_directions',
        *[f'{ethnicity}_population_allele_frequencies' for ethnicity in other_ethnicities]
    ]).collect()
    assert hits.shape[0] == pl.read_csv(
        args.intable,
        sep='\t',
        # same hack as above
        dtype = {'alleles': str}
    ).shape[0]

    hits.to_csv(args.outtable, sep='\t',)
Example #17
parser = argparse.ArgumentParser()
parser.add_argument('outdir')
parser.add_argument('results_table',
                    help='cols: chrom, pos, FINEMAP_pcausal, SuSiE_CS_pcausal')
parser.add_argument('pos_to_snpstr_pos', help='cols: chrom, pos, snpstr_pos')
parser.add_argument(
    'chrom_tables',
    nargs='+',
    help=
    'In chromosome order. 4 cols: pos, chance of length confusion, avg abs length confusion, normalized avg abs length confusion'
)
args = parser.parse_args()
outdir = args.outdir
results_fname = args.results_table
chrom_fnames = args.chrom_tables

loci = pl.concat([
    pl.scan_csv(chrom_fname, sep='\t').with_column(
        pl.lit(chrom_num + 1).cast(int).alias('chrom'))
    for chrom_num, chrom_fname in enumerate(chrom_fnames)
]).collect()

pos_to_snpstr_pos = pl.scan_csv(args.pos_to_snpstr_pos, sep='\t').collect()

cols = ['normalized_avg_abs_length_confusion', 'chance_of_length_confusion']
#loci_cols = [col for col in loci.columns if col != 'pos' and col != 'chrom']

results = pl.scan_csv(results_fname, sep='\t', null_values='NA').with_column(
    pl.when(pl.col('SuSiE_CS_pcausal').is_null()).then(0).otherwise(
        pl.col('SuSiE_CS_pcausal')).alias('SuSiE_pcausal')).with_column(
            (pl.col('FINEMAP_pcausal') -
             pl.col('SuSiE_pcausal')).alias('discrepancy')).with_column(
                 pl.col('discrepancy').abs().alias('abs_discrepancy')).filter(
                     (pl.col('FINEMAP_pcausal') >= .8))
Example #18
    '''
    all_STRs = all_STRs.join(
        dna_structures,
        how='left',
        left_on=['canonical_unit'],
        right_on=['repeat_unit'],
        suffixes=['', '_other']
    )
    '''

    loci_summary_dfs = []
    for chrom in range(1, 23):
        distribution_stats = pl.read_csv(
            f'{ukb}/export_scripts/intermediate_results/chr{chrom}_loci_summary.tab',
            sep='\t',
        )
        loci_summary_dfs.append(distribution_stats)
    loci_summaries = pl.concat(loci_summary_dfs)
    n_before = all_STRs.shape[0]
    all_STRs = all_STRs.join(loci_summaries,
                             how='left',
                             left_on=['chrom', 'SNPSTR_start_pos'],
                             right_on=['chr', 'pos'],
                             suffix='_other')
    assert n_before == all_STRs.shape[0]
    print('Calculating mean lens ... ', flush=True, end='')
    all_STRs = all_STRs.with_column(
        pl.Series([
            sum(key * val
                for (key, val) in ast.literal_eval(allele_dist).items())
            for allele_dist in all_STRs['allele_dist']
        ]).alias('mean_len'))
    print('done', flush=True)
Example #19
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('phenotypes', nargs='+')
    phenotypes = parser.parse_args().phenotypes

    all_dfs = []
    susie_cs_min_abs_corrs = []
    finemap_cs_coverages = []
    unconverged_regions = []
    #underexplored_regions = []
    unfinished_regions = []

    for phenotype in phenotypes:

        pheno_dfs = []
        str_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/my_str/results.tab',
            sep='\t',
        ).select([
            pl.lit(phenotype).alias('phenotype'), 'chrom', 'pos',
            pl.col(f'p_{phenotype}').alias('p_val'),
            pl.lit(True).alias('is_STR'),
            pl.lit(None).cast(int).alias('reflen'),
            pl.lit(None).cast(int).alias('altlen')
        ])

        snp_assocs = pl.scan_csv(
            f'{ukb}/association/results/{phenotype}/plink_snp/results.tab',
            sep='\t',
            null_values='NA',
        ).select([
            pl.col('#CHROM').alias('chrom'),
            pl.col('POS').alias('pos'),
            pl.col('REF').str.lengths().cast(int).alias('reflen'),
            pl.col('ALT').str.lengths().cast(int).alias('altlen'),
            pl.col('P').alias('p_val'),
        ]).groupby(['chrom', 'pos', 'reflen', 'altlen']).agg([
            pl.col('p_val').min().alias('p_val'),
        ]).with_columns([
            pl.lit(phenotype).alias('phenotype'),
            pl.lit(False).alias('is_STR')
        ]).select([
            'phenotype', 'chrom', 'pos', 'p_val', 'is_STR', 'reflen', 'altlen'
        ])

        assocs = pl.concat([str_assocs, snp_assocs
                            ]).filter(pl.col('p_val') <= p_val_thresh)

        regions_df = pl.read_csv(f'{ukb}/signals/regions/{phenotype}.tab',
                                 sep='\t')
        for chrom, start, end, any_strs in zip(regions_df['chrom'],
                                               regions_df['start'],
                                               regions_df['end'],
                                               regions_df['any_strs']):
            if not any_strs:
                continue
            converged_fname = f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/converged.txt'
            if not os.path.exists(converged_fname):
                unfinished_regions.append((phenotype, chrom, start, end))
                continue
            with open(converged_fname) as converged_file:
                if not next(converged_file).strip() == 'TRUE':
                    unconverged_regions.append((phenotype, chrom, start, end))
                    continue
            print(f'Loading {phenotype} region {chrom}:{start}-{end}',
                  flush=True)
            with open(
                    f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/colnames.txt'
            ) as var_file:
                susie_vars = [line.strip() for line in var_file]
            alphas = pl.scan_csv(
                f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/alpha.tab',
                sep='\t',
                has_header=False).collect().to_numpy().T
            n_alphas = alphas.shape[1]
            susie_pips = 1 - np.prod(1 - alphas, axis=1)
            assert susie_pips.shape[0] == len(susie_vars)
            susie_idx = np.arange(len(susie_vars)) + 1
            susie_df = pl.DataFrame({
                'varname': susie_vars,
                'susie_pip': susie_pips,
                'susie_alpha': np.zeros(len(susie_vars)),
                'susie_cs': [-1] * len(susie_vars),
                'susie_idx': susie_idx,
                **{f'alpha_{i}': alphas[:, i]
                   for i in range(n_alphas)}
            }).lazy()
            finemap_df = pl.scan_csv(
                f'{ukb}/finemapping/finemap_results/{phenotype}/{chrom}_{start}_{end}/finemap_output.snp',
                sep=' ').select([
                    pl.col('rsid').alias('varname'),
                    pl.col('prob').alias('finemap_pip')
                ])

            df = susie_df.join(finemap_df, how='inner', on=[
                'varname'
            ]).with_columns([
                pl.col('varname').str.extract('^[^_]*_([^_]*)',
                                              1).cast(int).alias('pos'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_([^_]*)_.*',
                    1).str.lengths().cast(int).alias('reflen'),
                pl.col('varname').str.extract(
                    '^[^_]*_[^_]*_[^_]*_([^_]*)',
                    1).str.lengths().cast(int).alias('altlen'),
                pl.col('varname').str.contains('^STR').alias('is_STR'),
                pl.lit(f'{phenotype}_{chrom}_{start}_{end}').alias('region'),
                pl.lit(chrom).alias('chrom').cast(int),
                pl.lit(phenotype).alias('phenotype')
            ]).sort('susie_idx')

            real_cs_count = 0
            for cs_fname in glob.glob(
                    f'{ukb}/finemapping/susie_results/{phenotype}/{chrom}_{start}_{end}/cs*.txt'
            ):
                cs_id = int(cs_fname.split('cs')[-1].split('.')[0])
                with open(cs_fname) as cs_file:
                    # susie uses 1 based indexing, python uses 0
                    # make sure cs idxs are in increasing order
                    cs_susie_idx = np.array(
                        [int(idx) for idx in next(cs_file).strip().split()])
                    assert np.all(cs_susie_idx[1:] - cs_susie_idx[:-1] > 0)
                    cs_susie_idx = pl.Series('cs_susie_idx', cs_susie_idx)
                    next(cs_file)  # skip cs credibility
                    min_abs_corr, _, _ = [
                        float(idx) for idx in next(cs_file).strip().split()
                    ]
                susie_cs_min_abs_corrs.append(min_abs_corr)
                finemap_cs_coverages.append(
                    df.filter(pl.col('susie_idx').is_in(cs_susie_idx)).select(
                        pl.col('finemap_pip').sum()).collect())
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        pl.when(
                            pl.col(f'alpha_{cs_id-1}') > pl.col('susie_alpha')
                        ).then(pl.col(f'alpha_{cs_id-1}')).otherwise(
                            pl.col('susie_alpha'))).otherwise(
                                pl.col('susie_alpha')).alias('susie_alpha'))
                if min_abs_corr < corr_cutoff:
                    continue
                real_cs_count += 1
                # could worry about variants being in multiple CSes
                df = df.with_column(
                    pl.when(pl.col('susie_idx').is_in(cs_susie_idx)).then(
                        cs_id).otherwise(pl.col('susie_cs')).alias('susie_cs'))
            pheno_dfs.append(df)
            '''
            if real_cs_count >= 10:
                underexplored_regions.append((phenotype, chrom, start, end))
            '''
        pheno_dfs = [
            df.select(pl.col('*').exclude('^alpha.*$')) for df in pheno_dfs
        ]
        pheno_df = pl.concat(pheno_dfs).join(
            assocs,
            how='left',
            on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen',
                'altlen']).collect()
        all_dfs.append(pheno_df)

    del df, susie_df, finemap_df, assocs, pheno_dfs, pheno_df
    susie_cs_min_abs_corrs = np.array(susie_cs_min_abs_corrs)
    finemap_cs_coverages = np.array(finemap_cs_coverages)

    total_df = pl.concat(all_dfs)
    #total_assocs = pl.concat(all_assocs).filter(pl.col('p_val') <= p_val_thresh)
    '''
    start_time = time.time()
    print('Gathering data ... ', flush=True)
    total_df = total_df.join(
        total_assocs,
        how='left',
        on=['phenotype', 'chrom', 'is_STR', 'pos', 'reflen', 'altlen']
    ).collect()
    print(f'Done. Time: {time.time() - start_time:.2}')
    '''

    total_df.filter(
        ~pl.col('p_val').is_null() & (pl.col('p_val') <= p_val_thresh)).to_csv(
            f'{ukb}/post_finemapping/intermediate_results/gathered_data.tab',
            sep='\t')

    print(
        'Any vars with null Ps?',
        total_df.select(pl.col('p_val').is_null().alias('null?')).select(
            pl.any('null?').alias('any_nulls'))['any_nulls'][0])
    print(
        'n regions',
        total_df.select(
            pl.col('region').unique().count().alias('region_count'))
        ['region_count'][0])

    cses_per_region = total_df.filter(
        pl.col('susie_cs') >= 0).filter(~pl.col('p_val').is_null()).groupby([
            'susie_cs', 'region'
        ]).agg(
            pl.col('p_val').min().alias('min_p'),
        ).filter(pl.col('min_p') <= p_val_thresh).groupby('region').agg(
            pl.col('region').count().alias('n_cses')).to_dict(False)['n_cses']
    print(
        f'avg cses (total PIP >= .9, min_p_val of CS members <= {p_val_thresh}) per region {np.mean(cses_per_region)}, ({np.std(cses_per_region)})'
    )

    for filter_, text in ((pl.lit(True), ''), (pl.col('is_STR'), ' STR'),
                          (~pl.col('is_STR'), ' SNP')):
        susie_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('susie_cs') >= 0) & (pl.col('susie_pip') >= pip_threshold)
             & (pl.col('p_val') <= p_val_thresh)
             ).alias('susie_hit')).groupby('region').agg(
                 pl.col('susie_hit').sum().alias('n_susie_hits')).to_dict(
                     False)['n_susie_hits']
        print(
            f'avg susie{text} hits (var is in a CS, PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(susie_hits_per_region)}, ({np.std(susie_hits_per_region)})'
        )

        finemap_hits_per_region = total_df.filter(filter_).with_column(
            ((pl.col('finemap_pip') >= pip_threshold) &
             (pl.col('p_val') <= p_val_thresh)
             ).alias('finemap_hit')).groupby('region').agg(
                 pl.col('finemap_hit').sum().alias('n_finemap_hits')).select(
                     'n_finemap_hits').to_numpy()
        print(
            f'avg finemap{text} hits (PIP >= {pip_threshold}, p_val <= {p_val_thresh}) per region {np.mean(finemap_hits_per_region)}, ({np.std(finemap_hits_per_region)})'
        )

        print('Exporting FINEMAP vs SuSiE PIP plots', flush=True)
        comparison_thresh = 0.3
        title = f'{text} with p-val <= {p_val_thresh} where at least one of SuSiE or FINEMAP PIP >= {comparison_thresh}'
        if text == '':
            title = 'Vars ' + title
        fig = bokeh.plotting.figure(
            width=1200,
            height=1200,
            title=title,
            x_axis_label='FINEMAP PIPs',
            y_axis_label='SuSiE PIPs',
        )
        fig.title.text_font_size = '30px'
        fig.axis.axis_label_text_font_size = '26px'
        fig.axis.major_label_text_font_size = '20px'

        fig.background_fill_color = None
        fig.border_fill_color = None
        fig.ygrid.grid_line_color = None
        fig.xgrid.grid_line_color = None
        fig.toolbar.logo = None
        fig.toolbar_location = None
        print(total_df.filter(filter_))
        print(total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)))
        pips = total_df.filter(filter_ & (pl.col('p_val') <= p_val_thresh)
                               & ((pl.col('finemap_pip') >= comparison_thresh)
                                  | ((pl.col('susie_pip') >= comparison_thresh)
                                     & (pl.col('susie_cs') >= 0)))).select(
                                         ['susie_pip', 'finemap_pip'])
        print(pips)

        bin_size = .05
        bins = bokeh.util.hex.hexbin(
            pips['finemap_pip'].to_numpy().reshape(-1),
            pips['susie_pip'].to_numpy().reshape(-1),
            size=bin_size)

        palette = [
            linear_int_interpolate((134, 204, 195), (9, 41, 46), i / 254)
            for i in range(-1, 255)
        ]
        cmap = bokeh.transform.log_cmap('counts',
                                        palette=palette,
                                        low=1,
                                        high=max(bins.counts),
                                        low_color=(255, 255, 255))
        color_mapper = bokeh.models.LogColorMapper(palette=palette,
                                                   low=1,
                                                   high=max(bins.counts))

        fig.hex_tile(q='q',
                     r='r',
                     size=bin_size,
                     line_color=None,
                     source=bins,
                     fill_color=cmap)
        color_bar = bokeh.models.ColorBar(color_mapper=color_mapper,
                                          width=70,
                                          major_label_text_font_size='20px')
        fig.add_layout(color_bar, 'right')
        ext = text.replace(' ', '_')
        bokeh.io.export_png(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.png')
        bokeh.io.export_svg(
            fig,
            filename=
            f'{ukb}/export_scripts/results/finemap_pip_vs_susie_pip{ext}.svg')

    print(f'unconverged regions: {unconverged_regions}')
    print(f'unfinished regions: {unfinished_regions}')
    #print(f'underexplored regions: {underexplored_regions}')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title='SuSiE credible set min absolute correlations',
        x_axis_label='min absolute correlation',
        y_axis_label='# credible sets',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    step = 0.01
    left_edges = np.arange(0, 1 + step, step)
    ys = [
        np.sum((left_edge <= susie_cs_min_abs_corrs)
               & (susie_cs_min_abs_corrs < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting cs plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cs_min_abs_corrs.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'Number of SuSiE CSes with min absolute corr >= {corr_cutoff} per region',
        x_axis_label='# cses in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(cses_per_region) + 1)
    ys = [
        np.sum((left_edge <= cses_per_region)
               & (cses_per_region < left_edge + 1)) for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting cs per region plots', flush=True)
    bokeh.io.export_png(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.png')
    bokeh.io.export_svg(
        fig, filename=f'{ukb}/export_scripts/results/cses_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=f'Number of FINEMAP vars with PIP >= {pip_threshold} per region',
        x_axis_label='# hits in the region',
        y_axis_label='# regions',
    )
    fig.axis.axis_label_text_font_size = '30px'
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.grid.grid_line_color = None
    fig.toolbar_location = None
    left_edges = np.arange(0, max(finemap_hits_per_region) + 1)
    ys = [
        np.sum((left_edge <= finemap_hits_per_region)
               & (finemap_hits_per_region < left_edge + 1))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + 1)

    print('Exporting finemap hits per region plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.png')
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/finemap_hits_per_region.svg')

    fig = bokeh.plotting.figure(
        width=1200,
        height=1200,
        title=
        f'FINEMAP total PIPs for SuSiE CSes with min_abs_corr >= {corr_cutoff}',
        x_axis_label='FINEMAP PIPs',
        y_axis_label='# credible sets',
    )
    fig.background_fill_color = None
    fig.border_fill_color = None
    fig.ygrid.grid_line_color = None
    fig.xgrid.grid_line_color = None
    fig.toolbar.logo = None
    fig.toolbar_location = None
    include = susie_cs_min_abs_corrs >= corr_cutoff
    max_total_pip = max(1, np.max(finemap_cs_coverages[include]))
    step = 0.01
    left_edges = np.arange(0, max_total_pip + step, step)
    ys = [
        np.sum((left_edge <= finemap_cs_coverages[include])
               & (finemap_cs_coverages[include] < left_edge + step))
        for left_edge in left_edges
    ]
    fig.quad(top=ys, bottom=0, left=left_edges, right=left_edges + step)

    print('Exporting FINEMAP CS PIP plots', flush=True)
    bokeh.io.export_png(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.png'
    )
    bokeh.io.export_svg(
        fig,
        filename=f'{ukb}/export_scripts/results/susie_cs_finemap_total_pips.svg'
    )

    total_cses = np.sum(include)
    total_cses_large_finemap_pip = np.sum(
        finemap_cs_coverages[include] >= pip_threshold)
    print(
        f'SuSiE CSes with min_abs_corr >= {corr_cutoff} with FINEMAP total PIP >= {pip_threshold}: {total_cses_large_finemap_pip} ({total_cses_large_finemap_pip/total_cses:%})'
    )

    susie_pip_threshold_for_finemap = .3
    n_replicates_from_finemap = total_df.filter(
        (pl.col('susie_cs') >= 0)
        & (pl.col('susie_pip') >= susie_pip_threshold_for_finemap)
        & (pl.col('finemap_pip') >= pip_threshold)).shape[0]
    n_finemap_total = total_df.filter(
        pl.col('finemap_pip') >= pip_threshold).shape[0]
    print(
        f'FINEMAP hits with PIP >= {pip_threshold} in a SuSiE CS with abs corr >= {corr_cutoff} and SuSiE PIP >= {susie_pip_threshold_for_finemap}: {n_replicates_from_finemap} ({n_replicates_from_finemap/n_finemap_total:%})'
    )

    for (curr_df, text) in [(total_df, 'all hits no filter'),
                            (total_df.filter(pl.col('p_val') <= 1e-10),
                             'all hits p<=1e-10')]:
        print(text)
        var_thresh1 = .8
        var_thresh2 = .3
        for susie_thresh in (var_thresh1, var_thresh2):
            for finemap_thresh in (var_thresh1, var_thresh2):
                count = curr_df.filter(
                    (pl.col('susie_cs') >= 0)
                    & (pl.col('susie_pip') >= susie_thresh)
                    & (pl.col('finemap_pip') >= finemap_thresh)).shape[0]
                print(
                    f'Vars in a SuSiE CS with SuSiE PIP >= {susie_thresh} and with FINEMAP PIP >= {finemap_thresh}: {count}'
                )

        for susie_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('susie_cs') >= 0)
                & (pl.col('susie_pip') >= susie_thresh)
                & (pl.col('finemap_pip') < var_thresh2)).shape[0]
            print(
                f'Vars in a SuSiE CS with SuSiE PIP >= {susie_thresh} with FINEMAP PIP < {var_thresh2}: {count}'
            )
        for finemap_thresh in (var_thresh1, var_thresh2):
            count = curr_df.filter(
                (pl.col('finemap_pip') >= finemap_thresh)
                & ((pl.col('susie_cs') < 0)
                   | (pl.col('susie_pip') < var_thresh2))).shape[0]
            print(
                f'Vars with FINEMAP PIP >= {finemap_thresh} either not in a SuSiE CS or having SuSiE PIP < {var_thresh2}: {count}'
            )

    # Not going to report susie alphas v pips - just know that they're similar if we look
    # at vars in good credible sets and not otherwise
Example #20

associations_df = pl.concat([
    pl.scan_csv(
        f'{ukb}/post_finemapping/intermediate_results/finemapping_all_concordance_{phenotype}.tab',
        sep='\t',
        dtypes={
            **{f'{ethnicity}_p_val': float
               for ethnicity in other_ethnicities},
            **{f'{ethnicity}_coeff': float
               for ethnicity in other_ethnicities},
            **{f'{ethnicity}_se': float
               for ethnicity in other_ethnicities}
        }).filter('is_STR') for phenotype in phenotypes.phenotypes_in_use
]).select([
    'phenotype',
    'chrom',
    'pos',
    'region',
    'p_val',
    'coeff',
    'se',
    ((pl.col('susie_alpha') >= 0.8) & (pl.col('susie_cs') >= 0) &
     (pl.col('p_val') <= 5e-8)).alias('finemapped_susie'),
    ((pl.col('finemap_pip') >= 0.8) &
     (pl.col('p_val') <= 5e-8)).alias('finemapped_finemap'),
    *[f'{ethnicity}_p_val' for ethnicity in other_ethnicities],
    *[f'{ethnicity}_coeff' for ethnicity in other_ethnicities],
    *[f'{ethnicity}_se' for ethnicity in other_ethnicities],
]).collect()

df = associations_df.join(
Example #21
import os

import polars as pl

import phenotypes

ukb = os.environ['UKB']

dfs = []
for phenotype in phenotypes.phenotypes_in_use:
    dfs.append(
        pl.scan_csv(f'{ukb}/signals/regions/{phenotype}.tab',
                    sep='\t').with_column(
                        pl.lit(phenotype).alias('phenotype')))

pl.concat(dfs).collect().with_column(
    (((pl.col('phenotype') == 'total_bilirubin') & (pl.col('chrom') == 12) &
      (pl.col('start') == 19976272) & (pl.col('end') == 22524428)) |
     ((pl.col('phenotype') == 'urate') & (pl.col('chrom') == 4) &
      (pl.col('start') == 8165642) & (pl.col('end') == 11717761)) |
     ((pl.col('phenotype') == 'alkaline_phosphatase') &
      (pl.col('chrom') == 1) & (pl.col('start') == 19430673) &
      (pl.col('end') == 24309348))
     ).alias('filtered_due_to_computation_burden')).select([
         'phenotype', 'chrom', 'start', 'end',
         'filtered_due_to_computation_burden'
     ]).to_csv(
         f'{ukb}/export_scripts/results/supp_table_2_finemapping_regions.tab',
         sep='\t')
Example #22
        assoc_df = pl.scan_csv(
            fname,
            sep='\t',
            skip_rows=1,
            has_header=False,
            with_column_names=lambda _: header.
            replace('0.05_significance_CI', 'foo', 1).replace(
                '5e-8_significance_CI', 'bar', 1).split(
                    '\t')  # these duplicate column names won't be used anyway
        ).select([
            'chrom', 'pos',
            pl.col('subset_total_per_allele_dosages').alias(
                f'{ethnicity}_allele_dosages')
        ])
        df = df.join(assoc_df, how='left', on=['chrom', 'pos'])
    finemapping_dfs.append(df.collect())
finemapping_results = pl.concat(finemapping_dfs).rename({'pos': 'snpstr_pos'})

finemapping_results = finemapping_results.filter((pl.col('p_val') <= 5e-8) & (
    ((pl.col('susie_alpha') >= 0.8) & (pl.col('susie_cs') >= 0))
    | (pl.col('finemap_pip') >= 0.8)).any().over(['chrom', 'snpstr_pos']))

pos_table = pl.read_csv(f'{ukb}/snpstr/flank_trimmed_vcf/vars.tab', sep='\t')

finemapping_results = finemapping_results.join(pos_table,
                                               how='left',
                                               on=['chrom', 'snpstr_pos'])

repeat_units = pl.read_csv(f'{ukb}/snpstr/repeat_units.tab', sep='\t')

finemapping_results = finemapping_results.join(repeat_units,
                                               how='left',
Example #23
parser = argparse.ArgumentParser()
parser.add_argument('outprefix')
parser.add_argument('results_tables', nargs='+')
args = parser.parse_args()

other_ethnicities = ['black', 'south_asian', 'chinese', 'irish', 'white_other']

df = pl.concat([
    pl.scan_csv(
        table,
        sep='\t',
        dtypes={
                **{f'{ethnicity}_p_val': float for ethnicity in other_ethnicities},
                **{f'{ethnicity}_coeff': float for ethnicity in other_ethnicities},
                **{f'{ethnicity}_se': float for ethnicity in other_ethnicities}
        }
    ) for table in args.results_tables
]).filter(pl.col('p_val') <= 1e-10).with_columns([
    ((pl.col('susie_alpha') >= 0.8) & (pl.col('susie_cs') >= 0)).alias('susie_result'),
    (pl.col('finemap_pip') >= 0.8).alias('finemap_result')
]).filter(
    pl.col('susie_result') | pl.col('finemap_result')
).collect()

for var, condition in (('STR', pl.col('is_STR')), ('SNP', ~pl.col('is_STR'))):
    temp_df = df.filter(condition)
    s_total = temp_df.filter('susie_result').shape[0]
    f_total = temp_df.filter('finemap_result').shape[0]
    shared = temp_df.filter(pl.col('finemap_result') & pl.col('susie_result')).shape[0]

    plt.figure()
Example #24
def main():
    df = pl.scan_csv('post_finemapping/intermediate_results/gathered_data.tab',
                     sep='\t').filter((pl.col('susie_pip') >= 0.3)
                                      | (pl.col('finemap_pip') >= 0.3))
    df = df.with_column(
        (pl.col('susie_pip') -
         pl.col('finemap_pip')).alias('susie_f_pip_diff')).with_column(
             pl.col('susie_f_pip_diff').abs().alias('abs_pip_diff'))
    locus_summary_df = pl.concat([
        pl.scan_csv(
            f'export_scripts/intermediate_results/chr{chrom}_loci_summary.tab',
            sep='\t') for chrom in range(1, 23)
    ]).select(['chr', 'pos', 'multiallelicness', 'allele_dist'])
    allele_threshes = (0.0004, 0.002, 0.01, 0.05)
    #allele_threshes = [0.01]
    df = df.join(
        locus_summary_df,
        how='left',
        #left_on=['chrom', 'snpstr_pos'],
        left_on=['chrom', 'pos'],
        right_on=['chr', 'pos']).collect()

    snp_df = df.filter(~pl.col('is_STR'))
    str_df = df.filter(pl.col('is_STR'))
    assert not str_df.select(
        pl.col('multiallelicness').is_null().any()).to_numpy()[0]

    str_df = str_df.with_columns([
        pl.apply('allele_dist', count_alleles(thresh),
                 pl.UInt32).alias(f'alleles_{thresh}')
        for thresh in allele_threshes
    ])
    confusions = pl.concat([
        pl.scan_csv(f'side_analyses/length_confusion/chr{i}.tab',
                    sep='\t').with_column(pl.lit(i).alias('chrom').cast(int))
        for i in range(1, 23)
    ]).collect()
    merged_df = str_df.join(confusions, how='left', on=['chrom', 'pos'])

    step = 0.05
    fig = bokeh.plotting.figure(title='STR PIP histogram',
                                width=size,
                                height=size,
                                x_axis_label='PIP',
                                y_axis_label='density',
                                tools='',
                                toolbar_location=None)
    xs = np.arange(0, 1 + step, step)
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(str_df['susie_pip'], bins=xs, density=True)[0],
        color='red',
        legend_label='SuSiE STRs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(str_df['finemap_pip'], bins=xs, density=True)[0],
        color='blue',
        legend_label='FINEMAP STRs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(snp_df['susie_pip'], bins=xs, density=True)[0],
        color='green',
        legend_label='SuSiE SNPs')
    fig.line(
        x=xs[:-1],
        #y=scipy.stats.gaussian_kde(arr)(xs),
        y=np.histogram(snp_df['finemap_pip'], bins=xs, density=True)[0],
        color='purple',
        legend_label='FINEMAP SNPs')
    bokeh.io.export_png(fig,
                        filename='post_finemapping/results/pip_histogram.png')

    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    fig.circle(str_df['finemap_pip'], str_df['susie_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/str_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='STR PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['finemap_pip'], str_df['susie_pip'],
             'post_finemapping/results/str_comp_pip_heatmap.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['chance_of_length_confusion'],
        'average chance of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_chance_map.png')

    fig = bokeh.plotting.figure(title='STR PIPs',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    weighted_heat_map(
        fig, merged_df['finemap_pip'], merged_df['susie_pip'],
        merged_df['normalized_avg_abs_length_confusion'],
        'average number of standard deviations of misgenotyping per sample at any such locus',
        'post_finemapping/results/str_comp_pip_sd_map.png')

    fig = bokeh.plotting.figure(title='SNP PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                toolbar_location=None)
    fig.circle(snp_df['finemap_pip'], snp_df['susie_pip'])
    bokeh.io.export_png(
        fig, filename='post_finemapping/results/snp_comp_pip_scatter.png')

    fig = bokeh.plotting.figure(title='SNP PIP heatmap',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                match_aspect=True,
                                tools='',
                                toolbar_location=None)
    heat_map(fig, snp_df['finemap_pip'], snp_df['susie_pip'],
             'post_finemapping/results/snp_comp_pip_heatmap.png')

    color_mapper = bokeh.models.LinearColorMapper(palette=palette,
                                                  low=0,
                                                  high=1)
    color_bar = bokeh.models.ColorBar(color_mapper=color_mapper, width=30)
    cmap = bokeh.transform.linear_cmap('foo', palette=palette, low=0, high=1)

    fig = bokeh.plotting.figure(title='STR PIP scatterplot',
                                width=size,
                                height=size,
                                x_axis_label='FINEMAP PIP',
                                y_axis_label='SuSiE PIP',
                                tools='',
                                match_aspect=True,
                                toolbar_location=None)
    cb_title = bokeh.models.Title(
        text='chance a genotype call at this locus is wrong', align='center')
    fig.add_layout(color_bar, 'right')
    fig.add_layout(cb_title, 'right')
    cds = bokeh.models.ColumnDataSource(
        dict(x=merged_df['finemap_pip'],
             y=merged_df['susie_pip'],
             color=[
                 linear_int_interpolate((134, 204, 195), (9, 41, 46), val)
                 for val in merged_df['chance_of_length_confusion']
             ]))
    fig.circle(x='x', y='y', color='color', source=cds)
    bokeh.io.export_png(
        fig,
        filename='post_finemapping/results/colored_str_comp_pip_scatter.png')

    step = 0.05
    for thresh in allele_threshes:
        for pip_thresh in (0.3, 0.8):
            for xs, x_label, out_loc, title, col in [
                (
                    np.arange(-1, 1 + step, step),
                    'SuSiE PIP - FINEMAP PIP',
                    f'post_finemapping/results/pip_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                    f'PIP diff, STR allele penetrance threshold = {thresh:.4}',
                    'susie_f_pip_diff',
                ),
                (np.arange(0, 1 + step, step), 'absolute PIP difference',
                 f'post_finemapping/results/pip_abs_diff_density_allele_thresh_{thresh}_pip_thresh_{pip_thresh}.png',
                 f'absolute PIP diff, STR allele penetrance threshold = {thresh:.4}',
                 'abs_pip_diff')
            ]:
                filter_exp = (pl.col('susie_pip') >= pip_thresh) | (
                    pl.col('finemap_pip') >= pip_thresh)
                fig = bokeh.plotting.figure(title=title,
                                            width=size,
                                            height=size,
                                            x_axis_label=x_label,
                                            y_axis_label='density',
                                            tools='',
                                            toolbar_location=None)
                fig.line(
                    x=xs[:-1],
                    y=np.histogram(snp_df.filter(filter_exp)[col].to_numpy(),
                                   bins=xs,
                                   density=True)[0],
                    #y=scipy.stats.gaussian_kde(snp_df['susie_f_pip_diff'].to_numpy())(xs),
                    color='black',
                    legend_label=f'SNPs (n={snp_df.shape[0]})')
                for count, color in ((2, 'brown'), (3, 'red'), (4, 'orange')):
                    arr = str_df.filter(filter_exp).filter(
                        pl.col(f'alleles_{thresh}') == count)[col].to_numpy()
                    fig.line(
                        x=xs[:-1],
                        #y=scipy.stats.gaussian_kde(arr)(xs),
                        y=np.histogram(arr, bins=xs, density=True)[0],
                        color=color,
                        legend_label=f'{count}-allele STRs (n={arr.shape[0]})')
                arr = str_df.filter(filter_exp).filter(
                    pl.col(f'alleles_{thresh}') >= 5)[col].to_numpy()
                fig.line(
                    x=xs[:-1],
                    #y=scipy.stats.gaussian_kde(arr)(xs),
                    y=np.histogram(arr, bins=xs, density=True)[0],
                    color='gold',
                    legend_label=
                    f'STRs with at least 5 alleles (n={arr.shape[0]})')
                fig.add_layout(
                    bokeh.models.Title(
                        text=
                        f'Variants with PIP at least {pip_thresh} for SuSiE or FINEMAP'
                    ), 'below')
                bokeh.io.export_png(fig, filename=out_loc)

    fig = bokeh.plotting.figure(title='STR PIP diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='SuSiE PIP - FINEMAP PIP',
                                tools='',
                                toolbar_location=None)
    heat_map(fig,
             str_df['multiallelicness'],
             str_df['susie_f_pip_diff'],
             'post_finemapping/results/str_pip_diff_heatmap.png',
             y_min=-1)
    fig = bokeh.plotting.figure(title='STR PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)
    heat_map(fig, str_df['multiallelicness'], str_df['abs_pip_diff'],
             'post_finemapping/results/str_pip_abs_diff_heatmap.png')

    fig = bokeh.plotting.figure(title='PIP abs diff',
                                width=size,
                                height=size,
                                x_axis_label='multiallelicness',
                                y_axis_label='absolute PIP difference',
                                tools='',
                                toolbar_location=None)
Example #25
print(coords_df.shape)

qtl_strs = []
yang_dir = '/expanse/projects/gymreklab/yal084_storage/share_with_Jonathan'
for fname, col_name in (('eSTR', 'str-gene'), ('STR', 'str-exon'),
                        ('eISOFORM', 'str-isoform')):
    qtl_str = pl.read_csv(f'{yang_dir}/{fname}_GB_650pc_combined_fdr10p.csv',
                          sep='\t').with_column(
                              pl.col(col_name).str.split_exact(
                                  '-',
                                  1).struct.field('field_0').alias('hg38'))
    qtl_str = pl.concat([
        qtl_str.join(coords_df,
                     left_on='hg38',
                     right_on=f'chrom_pos_{offset}_38').drop([
                         f'chrom_pos_{offset2}_38'
                         for offset2 in range(-10, 11) if offset2 != offset
                     ]) for offset in range(-10, 11)
    ])

    qtl_str = qtl_str.distinct().groupby('chrom_pos').agg([
        pl.col('phenotype').first(),
        pl.col('association_p_value').first(),
        pl.col('p_values').list(),
        pl.col('Tissue').list(),
        pl.col('gene_name').list(),
        pl.col(col_name).str.split_exact(
            '-', 1).struct.field('field_1').list().alias('target')
    ])
    print(qtl_str.shape)
Example #26
def write_input_variants(workdir, outdir, gts_dir, readme, phenotype, chrom,
                         start, end, inclusion_threshold, mac, snp_str_ratio,
                         total_prob, use_PACSIN2):
    '''
    write README.txt
    write finemap_input.z
    write finemap_input.master
    '''

    sample_idx = sample_utils.get_samples_idx_phenotype(
        'white_brits', phenotype)
    n_samples = np.sum(sample_idx)

    if mac:
        mac_threshold = int(mac[0])
        snp_mac_fname = mac[1]
        str_mac_fname = mac[2]
        snps_exclude_mac = pl.scan_csv(
            snp_mac_fname,
            sep='\t').filter(pl.col('ALT_CTS') < mac_threshold).select(
                ('SNP_' + pl.col('#POS').cast(str) + '_' + pl.col('REF') +
                 '_' + pl.col('ALT')
                 ).alias('varname')).collect()['varname'].to_list()
        # polars mishandles an empty list of strings here, so append a nonsense sentinel value
        snps_exclude_mac.append('asdf')

        strs_exclude_mac = pl.scan_csv(
            str_mac_fname,
            sep='\t').filter(pl.col('mac') < mac_threshold).select(
                'pos').collect()['pos'].to_list()

    plink_results_fname = f'{ukb}/association/results/{phenotype}/plink_snp/results.tab'
    str_results_fname = f'{ukb}/association/results/{phenotype}/my_str/results.tab'
    filter_set_fname = f'{ukb}/finemapping/str_imp_snp_overlaps/chr{chrom}_to_filter.tab'

    with open(f'{workdir}/finemap_input.master', 'w') as finemap_master:
        finemap_master.write('z;ld;snp;config;cred;log;n_samples\n'
                             f'{outdir}/finemap_input.z;'
                             f'{gts_dir}/all_variants.ld;'
                             f'{outdir}/finemap_output.snp;'
                             f'{outdir}/finemap_output.config;'
                             f'{outdir}/finemap_output.cred;'
                             f'{outdir}/finemap_output.log;'
                             f'{n_samples}')

    today = datetime.datetime.now().strftime("%Y_%m_%d")
    readme.write(
        f'Run date: {today}\n'
        'Manually generating variant-variant LD for each imputed SNP and each STR in the region '
        'where an association was successfully '
        f'performed and had p < {inclusion_threshold} and the SNP was not in the filter set\n'
        f'(Filter set at {filter_set_fname})\n'
        'Correlation is STR length dosage vs SNP dosage.\n'
        'Running FINEMAP with that list of imputed SNPs and STRs.\n')

    # load STRs
    strs = pl.scan_csv(
        str_results_fname, sep='\t', dtypes={
            'locus_filtered': str
        }).filter((pl.col('chrom') == chrom) & (pl.col('pos') >= start)
                  & (pl.col('pos') <= end)
                  & (pl.col('locus_filtered') == 'False')
                  & (pl.col(f'p_{phenotype}') < inclusion_threshold)).select([
                      ('STR_' + pl.col('pos').cast(str)).alias('rsid'),
                      ('0' + pl.col('chrom').cast(str)
                       ).str.slice(-2).alias('chromosome'),
                      pl.col('pos').alias('position'),
                      pl.lit('nan').alias('allele1'),
                      pl.lit('nan').alias('allele2'),
                      pl.lit('nan').alias('maf'),
                      pl.col(f'coeff_{phenotype}').alias('beta'),
                      pl.col(f'se_{phenotype}').alias('se'),
                  ]).collect()

    if mac:
        strs = strs.filter(~pl.col('position').is_in(strs_exclude_mac))

    if use_PACSIN2:
        strs = strs.filter(pl.col('position') != 43385872)
        pacsin2_strs = pl.read_csv(
            f'{ukb}/association/spot_test/white_brits/{phenotype}/PACSIN2.tab',
            sep='\t').filter(
                pl.col('pos').is_in([43385866, 43385875, 43385893])).select([
                    ('PACSIN2_STR_' + pl.col('pos').cast(str)).alias('rsid'),
                    ('0' + pl.col('chrom').cast(str)
                     ).str.slice(-2).alias('chromosome'),
                    pl.col('pos').alias('position'),
                    pl.lit('nan').alias('allele1'),
                    pl.lit('nan').alias('allele2'),
                    pl.lit('nan').alias('maf'),
                    pl.col(f'coeff_{phenotype}').alias('beta'),
                    pl.col(f'se_{phenotype}').alias('se'),
                ])
        strs = pl.concat([strs, pacsin2_strs])

    assert strs.distinct(
        subset=['chromosome', 'position']).shape[0] == strs.shape[0]

    n_strs = strs.shape[0]

    # load SNPs
    snps_to_filter = set()
    with open(filter_set_fname) as filter_file:
        next(filter_file)  # skip header
        for line in filter_file:
            pos, ref, alt = line.strip().split('\t')[3:6]
            snps_to_filter.add(f'{pos}_{ref}_{alt}')

    snps = pl.scan_csv(plink_results_fname, sep='\t', null_values='NA').filter(
        (pl.col('#CHROM') == chrom) & (pl.col('POS') >= start)
        & (pl.col('POS') <= end) & (pl.col('ERRCODE') == '.')
        & (pl.col('P') < inclusion_threshold)
        & ~(pl.col('POS').cast(str) + '_' + pl.col('REF') + '_' +
            pl.col('ALT')).is_in(list(snps_to_filter))).select([
                ('SNP_' + pl.col('POS').cast(str) + '_' + pl.col('REF') + '_' +
                 pl.col('ALT')).alias('rsid'),
                ('0' +
                 pl.col('#CHROM').cast(str)).str.slice(-2).alias('chromosome'),
                pl.col('POS').alias('position'),
                pl.col('REF').alias('allele1'),
                pl.col('ALT').alias('allele2'),
                pl.lit('nan').alias('maf'),
                pl.col('BETA').alias('beta'),
                pl.col('SE').alias('se'),
            ]).collect()

    if mac:
        snps = snps.filter(~pl.col('rsid').is_in(snps_exclude_mac))

    n_snps = snps.shape[0]

    if snp_str_ratio is not None:
        strs = strs.with_column(
            pl.lit(1 / (n_strs + snp_str_ratio * n_snps)).alias('prob'))
        snps = snps.with_column(
            pl.lit(snp_str_ratio /
                   (n_strs + snp_str_ratio * n_snps)).alias('prob'))

    vars_df = pl.concat([strs, snps])

    if total_prob is not None:
        vars_df = vars_df.with_column(
            pl.lit(total_prob / (n_snps + n_strs)).alias('prob'))

    vars_df.to_csv(f'{workdir}/finemap_input.z', sep=' ')
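# Note: when total_prob is set it overwrites the snp_str_ratio priors, since
# the final with_column rewrites the whole 'prob' column. The space-separated
# finemap_input.z written above is the z-file referenced by the
# finemap_input.master file written earlier in this function.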