Example #1
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(bad.locus).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #2
def nullify_nan(value):
    return hl.cond(hl.is_nan(value), hl.null(value.dtype), value)
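
# A minimal usage sketch of nullify_nan (the table and field names below are
# illustrative, not part of the original code): converting NaN to a proper
# missing value lets aggregators such as hl.agg.mean skip it instead of
# propagating NaN.
import hail as hl

ht = hl.utils.range_table(3)
# The row with idx == 1 gets 0.0 / 0.0 == NaN; the other rows get ordinary floats.
ht = ht.annotate(score=(ht.idx - 1) / hl.abs(ht.idx - 1))
ht = ht.annotate(score=nullify_nan(ht.score))
print(ht.aggregate(hl.agg.mean(ht.score)))  # NaN row is now missing and ignored: mean of -1.0 and 1.0 is 0.0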
Example #3
def compute_coverage_stats(
    mt: hl.MatrixTable,
    reference_ht: hl.Table,
    coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100],
) -> hl.Table:
    """
    Computes the following coverage statistics for every base of the `reference_ht` provided:
        - mean
        - median
        - total DP
        - fraction of samples with coverage above X, for each x in `coverage_over_x_bins`

    The `reference_ht` is a table that contains row for each locus coverage should be computed on.
    It needs to be keyed with the same keys as `mt`, typically either `locus` or `locus, alleles`.
    The `reference_ht` can e.g. be created using `get_reference_ht`

    :param mt: Input sparse MT
    :param reference_ht: Input reference HT
    :param coverage_over_x_bins: List of boundaries for computing samples over X
    :return: Table with per-base coverage stats
    """

    n_samples = mt.count_cols()
    print(f"Computing coverage stats on {n_samples} samples.")

    # Create an outer join with the reference Table
    mt = mt.select_entries("END", "DP").select_cols().select_rows()
    col_key_fields = list(mt.col_key)
    t = mt._localize_entries("__entries", "__cols")
    t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True),
               how="outer")
    t = t.annotate(__entries=hl.or_else(
        t.__entries,
        hl.range(n_samples).map(
            lambda x: hl.null(t.__entries.dtype.element_type)),
    ))
    mt = t._unlocalize_entries("__entries", "__cols", col_key_fields)

    # Densify
    mt = hl.experimental.densify(mt)

    # Filter rows where the reference is missing
    mt = mt.filter_rows(mt._in_ref)

    # Unfilter entries so that entries with no ref block overlap aren't null
    mt = mt.unfilter_entries()

    # Compute coverage stats
    coverage_over_x_bins = sorted(coverage_over_x_bins)
    max_coverage_bin = coverage_over_x_bins[-1]
    hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

    # This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
    coverage_counter_expr = hl.agg.counter(
        hl.min(max_coverage_bin, hl.or_else(mt.DP, 0)))

    # This expression aggregates the DP counter in reverse order of coverage_over_x_bins
    # and computes the cumulative sum over them.
    # It needs to be in reverse order because, for each bin X, we want the number of
    # samples with coverage of at least X.
    count_array_expr = hl.cumulative_sum(
        hl.array(
            # The coverage was already floored to max_coverage_bin, so no further
            # aggregation is needed for the top bin.
            [hl.int32(coverage_counter_expr.get(max_coverage_bin, 0))]
        ).extend(
            # For each of the other bins, sum the counts for DP values that fall
            # between the bin boundaries.
            hl.range(hl.len(hl_coverage_over_x_bins) - 1, 0, step=-1).map(
                lambda i: hl.sum(
                    hl.range(hl_coverage_over_x_bins[i - 1],
                             hl_coverage_over_x_bins[i]
                             ).map(lambda j: hl.int32(coverage_counter_expr.get(j, 0)))
                )
            )
        )
    )
    mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

    # Compute the per-locus annotations and return them as a Table
    return mt.select_rows(
        mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
        median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)),
                                 0),
        total_DP=hl.agg.sum(mt.DP),
        **{
            f"over_{x}": count_array_expr[i] / n_samples
            for i, x in zip(
                range(
                    len(coverage_over_x_bins) - 1, -1, -1
                ),  # Reverse the bin index as count_array_expr has the reverse order
                coverage_over_x_bins,
            )
        },
    ).rows()
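
# A usage sketch for compute_coverage_stats, assuming helpers and paths from the
# surrounding module (the gs:// paths and the `get_reference_ht` call below are
# placeholders, not verified API). The returned Table has one row per reference
# locus with fields mean, median_approx, total_DP, and one over_X fraction per
# requested bin.
#
# mt = hl.read_matrix_table('gs://.../sparse.mt')  # sparse MT with END and DP entry fields
# reference_ht = get_reference_ht(hl.get_reference('GRCh38'))
# coverage_ht = compute_coverage_stats(mt, reference_ht)
# coverage_ht.write('gs://.../coverage.ht', overwrite=True)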
Example #4
def export_results(num_pops,
                   trait_types='all',
                   batch_size=256,
                   mt=None,
                   export_path_str=None,
                   skip_binary_eur=True):
    r'''
    `num_pops`: exact number of populations for which the phenotype is defined
    `trait_types`: trait category (options: all, binary, quant)
    `batch_size`: batch size argument for export_entries_by_col
    `mt`: optional pre-loaded sumstats MatrixTable; if None, it is loaded with get_final_sumstats_mt_for_export()
    `export_path_str`: optional subdirectory to insert into the export path
    `skip_binary_eur`: if True, skip EUR-only binary traits (they are exported separately, see export_binary_eur)
    '''
    assert trait_types in {
        'all', 'quant', 'binary'
    }, "trait_types must be one of the following: {'all','quant','binary'}"
    print(f'\n\nExporting {trait_types} trait types for {num_pops} pops\n\n')
    if mt is None:
        mt0 = get_final_sumstats_mt_for_export()
    else:
        mt0 = mt

    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    mt0 = mt0.annotate_cols(pheno_id=get_pheno_id(tb=mt0))
    mt0 = mt0.annotate_rows(chr=mt0.locus.contig,
                            pos=mt0.locus.position,
                            ref=mt0.alleles[0],
                            alt=mt0.alleles[1])

    all_pops = ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']

    if trait_types == 'all':
        trait_types_to_run = [
            'continuous', 'biomarkers', 'categorical', 'phecode', 'icd10',
            'prescriptions'
        ]  # list of which trait_type to run
    elif trait_types == 'quant':
        trait_types_to_run = ['continuous', 'biomarkers']
    elif trait_types == 'binary':
        trait_types_to_run = [
            'categorical', 'phecode', 'icd10', 'prescriptions'
        ]

    # list of exact sets of pops for which the phenotype is defined
    pop_sets = [set(i) for i in combinations(all_pops, num_pops)]

    # fields specific to each category of trait
    quant_meta_fields = ['AF_Allele2']
    quant_fields = ['AF_Allele2']

    binary_meta_fields = ['AF_Cases', 'AF_Controls']
    binary_fields = ['AF.Cases', 'AF.Controls']

    # dictionaries for renaming fields
    quant_meta_field_rename_dict = {
        'AF_Allele2': 'af_meta',
        'BETA': 'beta_meta',
        'SE': 'se_meta',
        'Pvalue': 'pval_meta',
        'Pvalue_het': 'pval_heterogeneity'
    }
    quant_field_rename_dict = {
        'AF_Allele2': 'af',
        'BETA': 'beta',
        'SE': 'se',
        'Pvalue': 'pval',
        'low_confidence': 'low_confidence'
    }  # decided on this implementation to make later code cleaner

    binary_meta_field_rename_dict = {
        'BETA': 'beta_meta',
        'SE': 'se_meta',
        'Pvalue': 'pval_meta',
        'AF_Cases': 'af_cases_meta',
        'AF_Controls': 'af_controls_meta',
        'Pvalue_het': 'pval_heterogeneity'
    }
    binary_field_rename_dict = {
        'AF.Cases': 'af_cases',
        'AF.Controls': 'af_controls',
        'BETA': 'beta',
        'SE': 'se',
        'Pvalue': 'pval',
        'low_confidence': 'low_confidence'
    }  # decided on this implementation to make later code cleaner

    all_quant_trait_types = {'continuous', 'biomarkers'}
    all_binary_trait_types = {
        'categorical', 'phecode', 'icd10', 'prescriptions'
    }

    quant_trait_types = all_quant_trait_types.intersection(
        trait_types_to_run)  # get list of quant trait types to run
    binary_trait_types = all_binary_trait_types.intersection(
        trait_types_to_run)  # get list of binary trait types to run
    error_trait_types = set(trait_types_to_run).difference(
        quant_trait_types.union(binary_trait_types))
    assert len(
        error_trait_types
    ) == 0, f'ERROR: The following trait_types are invalid: {error_trait_types}'

    for trait_category, trait_types in [('binary', binary_trait_types),
                                        ('quant', quant_trait_types)]:
        if len(trait_types) == 0:  #if no traits in trait_types list
            continue

        print(f'{trait_category} trait types to run: {trait_types}')

        if trait_category == 'quant':
            meta_fields = quant_meta_fields
            fields = quant_fields
            meta_field_rename_dict = quant_meta_field_rename_dict
            field_rename_dict = quant_field_rename_dict
        elif trait_category == 'binary':
            meta_fields = binary_meta_fields
            fields = binary_fields
            meta_field_rename_dict = binary_meta_field_rename_dict
            field_rename_dict = binary_field_rename_dict

        meta_fields += ['BETA', 'SE', 'Pvalue', 'Pvalue_het']
        fields += ['BETA', 'SE', 'Pvalue', 'low_confidence']

        for pop_set in pop_sets:
            start = time()

            if skip_binary_eur and pop_set == {'EUR'} and trait_category == 'binary':
                # EUR-only binary traits are exported separately (see export_binary_eur)
                print('\nSkipping EUR-only binary traits\n')
                continue

            mt1 = mt0.filter_cols(
                (hl.literal(trait_types).contains(mt0.trait_type))
                & (hl.set(mt0.pheno_data.pop) == hl.literal(pop_set)))

            col_ct = mt1.count_cols()
            if col_ct == 0:
                print(
                    f'\nSkipping {trait_types},{sorted(pop_set)}, no phenotypes found\n'
                )
                continue

            pop_list = sorted(pop_set)

            annotate_dict = {}
            keyed_mt = meta_mt0[mt1.row_key, mt1.col_key]
            if len(pop_set) > 1:
                for field in meta_fields:  # NOTE: Meta-analysis columns go before per-population columns
                    field_expr = keyed_mt.meta_analysis[field][0]
                    annotate_dict.update({
                        f'{meta_field_rename_dict[field]}':
                        hl.if_else(hl.is_nan(field_expr), hl.str(field_expr),
                                   hl.format('%.3e', field_expr))
                    })

            for field in fields:
                for pop_idx, pop in enumerate(pop_list):
                    field_expr = mt1.summary_stats[field][pop_idx]
                    annotate_dict.update({
                        f'{field_rename_dict[field]}_{pop}':
                        hl.if_else(
                            hl.is_nan(field_expr), hl.str(field_expr),
                            hl.str(field_expr) if field == 'low_confidence'
                            else hl.format('%.3e', field_expr))
                    })

            mt2 = mt1.annotate_entries(**annotate_dict)

            mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
            mt2 = mt2.key_cols_by('pheno_id')
            mt2 = mt2.key_rows_by().drop(
                'locus', 'alleles', 'summary_stats'
            )  # row fields that are no longer included: 'gene','annotation'

            batch_idx = 1
            def get_export_path(batch_idx):
                subdir = '' if export_path_str is None else f'{export_path_str}/'
                return (f'{ldprune_dir}/export_results/{subdir}{trait_category}/'
                        f'{"-".join(pop_list)}_batch{batch_idx}')

            mt2.describe()  # describe() prints its output directly
            while hl.hadoop_is_dir(get_export_path(batch_idx)):
                batch_idx += 1
            print(
                f'\nExporting {col_ct} phenos to: {get_export_path(batch_idx)}\n'
            )
            hl.experimental.export_entries_by_col(
                mt=mt2,
                path=get_export_path(batch_idx),
                bgzip=True,
                batch_size=batch_size,
                use_string_key_as_file_name=True,
                header_json_in_file=False)
            end = time()
            print(
                f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs'
            )
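
# Usage sketch for export_results: the helper get_final_sumstats_mt_for_export and
# the ldprune_dir output root come from the surrounding module, so the calls below
# are illustrative only.
#
# export_results(num_pops=6, trait_types='all')
# export_results(num_pops=1, trait_types='binary', skip_binary_eur=True)  # EUR-only binary traits handled by export_binary_eur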
Example #5
def export_binary_eur(cluster_idx, num_clusters=10, batch_size=256):
    r'''
    Export summary statistics for binary traits defined only for EUR. 
    Given the large number of such traits (4184), it makes sense to batch this 
    across `num_clusters` clusters for reduced wall time and robustness to mid-export errors.
    NOTE: `cluster_idx` is 1-indexed.
    '''
    mt0 = get_final_sumstats_mt_for_export()
    meta_mt0 = hl.read_matrix_table(get_meta_analysis_results_path())

    mt0 = mt0.annotate_cols(pheno_id=get_pheno_id(tb=mt0))
    mt0 = mt0.annotate_rows(chr=mt0.locus.contig,
                            pos=mt0.locus.position,
                            ref=mt0.alleles[0],
                            alt=mt0.alleles[1])

    trait_types_to_run = ['categorical', 'phecode', 'icd10',
                          'prescriptions']  # list of which trait_type to run

    # fields specific to each category of trait
    meta_fields = ['AF_Cases', 'AF_Controls']
    fields = ['AF.Cases', 'AF.Controls']

    # dictionaries for renaming fields
    meta_field_rename_dict = {
        'BETA': 'beta_meta',
        'SE': 'se_meta',
        'Pvalue': 'pval_meta',
        'AF_Cases': 'af_cases_meta',
        'AF_Controls': 'af_controls_meta',
        'Pvalue_het': 'pval_heterogeneity'
    }
    field_rename_dict = {
        'AF.Cases': 'af_cases',
        'AF.Controls': 'af_controls',
        'BETA': 'beta',
        'SE': 'se',
        'Pvalue': 'pval',
        'low_confidence': 'low_confidence'
    }  # decided on this implementation to make later code cleaner

    all_binary_trait_types = {
        'categorical', 'phecode', 'icd10', 'prescriptions'
    }

    meta_fields += ['BETA', 'SE', 'Pvalue', 'Pvalue_het']
    fields += ['BETA', 'SE', 'Pvalue', 'low_confidence']

    trait_category = 'binary'
    trait_types = all_binary_trait_types.intersection(
        trait_types_to_run)  # get list of binary trait types to run
    pop_set = {'EUR'}
    start = time()

    mt1 = mt0.filter_cols((hl.literal(trait_types).contains(mt0.trait_type)) &
                          (hl.set(mt0.pheno_data.pop) == hl.literal(pop_set)))

    pheno_id_list = mt1.pheno_id.collect()

    num_traits = len(pheno_id_list)  # total number of traits to run

    traits_per_cluster = ceil(
        num_traits / num_clusters)  # maximum traits to run per cluster

    cluster_pheno_id_list = pheno_id_list[
        (cluster_idx - 1) * traits_per_cluster:cluster_idx *
        traits_per_cluster]  # list of traits to run in current cluster

    print(len(cluster_pheno_id_list))

    mt1 = mt1.filter_cols(
        hl.literal(cluster_pheno_id_list).contains(mt1.pheno_id))

    pop_list = sorted(pop_set)

    annotate_dict = {}

    keyed_mt = meta_mt0[mt1.row_key, mt1.col_key]
    if len(pop_set) > 1:
        for field in meta_fields:  # NOTE: Meta-analysis columns go before per-population columns
            field_expr = keyed_mt.meta_analysis[field][0]
            annotate_dict.update({
                f'{meta_field_rename_dict[field]}':
                hl.if_else(hl.is_nan(field_expr), hl.str(field_expr),
                           hl.format('%.3e', field_expr))
            })

    for field in fields:
        for pop_idx, pop in enumerate(pop_list):
            field_expr = mt1.summary_stats[field][pop_idx]
            annotate_dict.update({
                f'{field_rename_dict[field]}_{pop}':
                hl.if_else(
                    hl.is_nan(field_expr), hl.str(field_expr),
                    hl.str(field_expr) if field == 'low_confidence' else
                    hl.format('%.3e', field_expr))
            })

    mt2 = mt1.annotate_entries(**annotate_dict)

    mt2 = mt2.filter_cols(mt2.coding != 'zekavat_20200409')
    mt2 = mt2.key_cols_by('pheno_id')
    mt2 = mt2.key_rows_by().drop(
        'locus', 'alleles', 'summary_stats'
    )  # row fields that are no longer included: 'gene','annotation'
    mt2.describe()  # describe() prints its output directly

    batch_idx = 1
    def get_export_path(batch_idx):
        return (f'{ldprune_dir}/release/{trait_category}/'
                f'{"-".join(pop_list)}_batch{batch_idx}/subbatch{cluster_idx}')

    while hl.hadoop_is_dir(get_export_path(batch_idx)):
        batch_idx += 1
    print(
        f'\nExporting {len(cluster_pheno_id_list)} phenos to: {get_export_path(batch_idx)}\n'
    )
    hl.experimental.export_entries_by_col(mt=mt2,
                                          path=get_export_path(batch_idx),
                                          bgzip=True,
                                          batch_size=batch_size,
                                          use_string_key_as_file_name=True,
                                          header_json_in_file=False)
    end = time()
    print(
        f'\nExport complete for:\n{trait_types}\n{pop_list}\ntime: {round((end-start)/3600,2)} hrs'
    )
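
# Usage sketch for export_binary_eur: run one invocation per cluster so the
# EUR-only binary traits are split across clusters (cluster_idx is 1-indexed,
# as noted in the docstring).
#
# export_binary_eur(cluster_idx=3, num_clusters=10)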
Example #6
def prepare_pext_data(base_level_pext_path):
    tmp_dir = os.path.expanduser("~")

    #
    # Step 1: rename fields, extract chrom/pos from locus, convert missing values to 0, export to TSV
    #
    ds = hl.read_table(base_level_pext_path)

    ds = ds.select(
        gene_id=ds.ensg,
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        # Replace NaNs and missing values with 0s
        mean=hl.cond(
            hl.is_missing(ds.mean_proportion) | hl.is_nan(ds.mean_proportion),
            hl.float(0), ds.mean_proportion),
        **{
            renamed: hl.cond(
                hl.is_missing(ds[original]) | hl.is_nan(ds[original]),
                hl.float(0), ds[original])
            for original, renamed in TISSUE_NAME_MAP.items()
        })

    ds = ds.order_by(ds.gene_id, hl.asc(ds.pos)).drop("locus")
    ds.export("file://" + os.path.join(tmp_dir, "bases.tsv"))

    #
    # Step 2: Collect base-level data into regions
    #
    with open(os.path.join(tmp_dir, "regions.tsv"), "w") as output_file:
        writer = csv.writer(output_file, delimiter="\t")
        writer.writerow(["gene_id", "chrom", "start", "stop", "mean"] +
                        TISSUE_FIELDS)

        def output_region(region):
            writer.writerow([
                region.gene, region.chrom, region.start, region.stop,
                region.tissues["mean"]
            ] + [region.tissues[t] for t in TISSUE_FIELDS])

        rows = read_bases_tsv(os.path.join(tmp_dir, "bases.tsv"))
        first_row = next(rows)
        current_region = Region(gene=first_row.gene,
                                chrom=first_row.chrom,
                                start=first_row.pos,
                                stop=None,
                                tissues=first_row.tissues)
        last_pos = first_row.pos

        for row in rows:
            # Start a new region whenever the gene or chromosome changes, the
            # position is not contiguous, or any tissue value differs.
            if (row.gene != current_region.gene
                    or row.chrom != current_region.chrom
                    or row.pos > last_pos + 1
                    or any(row.tissues[t] != current_region.tissues[t]
                           for t in row.tissues)):
                output_region(current_region._replace(stop=last_pos))
                current_region = Region(gene=row.gene,
                                        chrom=row.chrom,
                                        start=row.pos,
                                        stop=None,
                                        tissues=row.tissues)

            last_pos = row.pos

        output_region(current_region._replace(stop=last_pos))

    # Copy regions file to HDFS
    subprocess.run(
        [
            "hdfs", "dfs", "-cp",
            "file://" + os.path.join(tmp_dir, "regions.tsv"),
            os.path.join(tmp_dir, "regions.tsv")
        ],
        check=True,
    )

    #
    # Step 3: Convert regions to a Hail table.
    #
    types = {t: hl.tfloat for t in TISSUE_FIELDS}
    types["gene_id"] = hl.tstr
    types["chrom"] = hl.tstr
    types["start"] = hl.tint
    types["stop"] = hl.tint
    types["mean"] = hl.tfloat

    ds = hl.import_table(os.path.join(tmp_dir, "regions.tsv"),
                         min_partitions=100,
                         missing="",
                         types=types)

    ds = ds.select("gene_id",
                   "chrom",
                   "start",
                   "stop",
                   "mean",
                   tissues=hl.struct(**{t: ds[t]
                                        for t in TISSUE_FIELDS}))

    ds = ds.group_by("gene_id").aggregate(
        regions=hl.agg.collect(ds.row_value.drop("gene_id")))

    return ds
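
# prepare_pext_data relies on `Region` and `read_bases_tsv`, which are defined
# elsewhere in the module. A minimal sketch of what they might look like, inferred
# only from how they are used above (the real implementations may differ):
import csv
from collections import namedtuple

Region = namedtuple("Region", ["gene", "chrom", "start", "stop", "tissues"])
BaseRow = namedtuple("BaseRow", ["gene", "chrom", "pos", "tissues"])

def read_bases_tsv(path):
    # Yield one BaseRow per line of the exported bases.tsv, collecting the
    # "mean" column and every tissue column into a dict keyed by column name.
    with open(path) as f:
        reader = csv.DictReader(f, delimiter="\t")
        for row in reader:
            yield BaseRow(
                gene=row["gene_id"],
                chrom=row["chrom"],
                pos=int(row["pos"]),
                tissues={t: float(row[t]) for t in ["mean"] + TISSUE_FIELDS},
            )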