Example #1
def get_expr_for_vep_transcript_id_to_consequence_map(
        vep_transcript_consequences_root):
    # Manually build string because hl.json encodes a dictionary as [{ key: ..., value: ... }, ...]
    return ("{" + hl.delimit(
        vep_transcript_consequences_root.map(
            lambda c: '"' + c.transcript_id + '": "' + c.major_consequence +
            '"')) + "}")
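
A minimal sanity check of the expression above; the literal structs below stand in for real VEP transcript consequences, and the transcript ids are made up:

import hail as hl

consequences = hl.literal([
    hl.Struct(transcript_id="ENST0001", major_consequence="missense_variant"),
    hl.Struct(transcript_id="ENST0002", major_consequence="synonymous_variant"),
])
# Prints {"ENST0001": "missense_variant","ENST0002": "synonymous_variant"}
print(hl.eval(get_expr_for_vep_transcript_id_to_consequence_map(consequences)))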
Example #2
def get_sample_data(mt: Union[hl.MatrixTable, hl.Table],
                    fields: List[hl.expr.StringExpression],
                    sep: str = '\t',
                    delim: str = '|'):
    """
    Hail devs hate this one simple py4j trick to speed up sample queries

    :param MatrixTable or Table mt: Input MatrixTable or Table
    :param list of StringExpression fields: Per-sample string fields to collect
    :param sep: Separator between samples (a tab is usually fine)
    :param delim: Delimiter between fields within a sample (a pipe is usually fine)
    :return: One list of field values per sample
    :rtype: list of list of str
    """
    field_expr = fields[0]
    for field in fields[1:]:
        field_expr = field_expr + delim + field
    if isinstance(mt, hl.MatrixTable):
        mt_agg = mt.aggregate_cols
    else:
        mt_agg = mt.aggregate
    return [
        x.split(delim)
        for x in mt_agg(hl.delimit(hl.agg.collect(field_expr), sep)).split(sep)
        if x != 'null'
    ]
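
A usage sketch, assuming any MatrixTable with string column fields will do; the demo dataset and field names below are stand-ins:

import hail as hl

mt = hl.balding_nichols_model(n_populations=2, n_samples=4, n_variants=5)
mt = mt.annotate_cols(s=hl.str(mt.sample_idx), pop_str=hl.str(mt.pop))
# A single column aggregation collects both fields, e.g. [['0', '1'], ['1', '0'], ...]
print(get_sample_data(mt, [mt.s, mt.pop_str]))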
Example #3
def annotate_variant_id(
        t: Union[hl.Table, hl.MatrixTable],
        field_name: str = 'vid') -> Union[hl.Table, hl.MatrixTable]:
    """
    Expects an input dataset with bi-allelic variants and the fields `locus` and `alleles`.
    Annotates variant ids as 'chr:position:ref:alt'.

    :param t: dataset
    :param field_name: variant id field name
    :return: Table or MatrixTable with the variant id annotation
    """

    variant_id_ann_exp = {
        field_name:
        hl.delimit([
            hl.str(t.locus.contig),
            hl.str(t.locus.position),
            hl.str(t.alleles[0]),
            hl.str(t.alleles[1])
        ],
                   delimiter=":")
    }

    if isinstance(t, hl.Table):
        return t.annotate(**variant_id_ann_exp)
    else:
        return t.annotate_rows(**variant_id_ann_exp)
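
A usage sketch on a demo dataset (any Table keyed by `locus`/`alleles` works); `vid` comes out like '1:1:A:C':

import hail as hl

ht = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=3).rows()
ht = annotate_variant_id(ht)
ht.select('vid').show()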
Example #4
def make_pheno_manifest(export=True):
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)

    ht = mt0.cols()
    annotate_dict = {}

    annotate_dict.update({
        'pops': hl.delimit(ht.pheno_data.pop),
        'num_pops': hl.len(ht.pheno_data.pop)
    })

    for field in ['n_cases', 'n_controls', 'heritability', 'lambda_gc']:
        for pop in ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']:
            new_field = field if field != 'heritability' else 'saige_heritability'  # new field name (only applicable to saige heritability)
            idx = ht.pheno_data.pop.index(pop)
            field_expr = ht.pheno_data[field]
            annotate_dict.update({
                f'{new_field}_{pop}':
                hl.if_else(hl.is_nan(idx), hl.null(field_expr[0].dtype),
                           field_expr[idx])
            })
    annotate_dict.update({'filename': get_pheno_id(tb=ht) + '.tsv.bgz'})
    ht = ht.annotate(**annotate_dict)

    dropbox_manifest = hl.import_table(
        f'{ldprune_dir}/UKBB_Pan_Populations-Manifest_20200615-manifest_info.tsv',
        impute=True,
        key='File')
    dropbox_manifest = dropbox_manifest.filter(
        dropbox_manifest['is_old_file'] != '1')
    bgz = dropbox_manifest.filter(~dropbox_manifest.File.contains('.tbi'))
    bgz = bgz.rename({'File': 'filename'})
    tbi = dropbox_manifest.filter(dropbox_manifest.File.contains('.tbi'))
    tbi = tbi.annotate(
        filename=tbi.File.replace('.tbi', '')).key_by('filename')

    dropbox_annotate_dict = {}

    rename_dict = {
        'dbox link': 'dropbox_link',
        'size (bytes)': 'size_in_bytes'
    }

    dropbox_annotate_dict.update({'filename_tabix': tbi[ht.filename].File})
    for field in ['dbox link', 'wget', 'size (bytes)', 'md5 hex']:
        for tb, suffix in [(bgz, ''), (tbi, '_tabix')]:
            dropbox_annotate_dict.update({
                (rename_dict[field] if field in rename_dict else field.replace(
                     ' ', '_')) + suffix:
                tb[ht.filename][field]
            })
    ht = ht.annotate(**dropbox_annotate_dict)
    ht = ht.drop('pheno_data')
    ht.describe()
    ht.show()
Example #5
def get_expr_for_vep_gene_id_to_consequence_map(
        vep_sorted_transcript_consequences_root, gene_ids):
    # Manually build string because hl.json encodes a dictionary as [{ key: ..., value: ... }, ...]
    return ("{" + hl.delimit(
        gene_ids.map(lambda gene_id: hl.bind(
            lambda worst_consequence_in_gene: '"' + gene_id + '":"' +
            worst_consequence_in_gene.major_consequence + '"',
            vep_sorted_transcript_consequences_root.find(lambda c: c.gene_id ==
                                                         gene_id)))) + "}")
Example #6
def get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root,
        include_coding_annotations=True):
    """Retrieves the top-ranked transcript annotation based on the ranking computed by
    get_expr_for_vep_sorted_transcript_consequences_array(..)

    Args:
        vep_sorted_transcript_consequences_root (ArrayExpression):
        include_coding_annotations (bool):
    """

    transcript_consequences = {
        "biotype": hl.tstr,
        "canonical": hl.tint,
        "category": hl.tstr,
        "cdna_start": hl.tint,
        "cdna_end": hl.tint,
        "codons": hl.tstr,
        "gene_id": hl.tstr,
        "gene_symbol": hl.tstr,
        "hgvs": hl.tstr,
        "hgvsc": hl.tstr,
        "major_consequence": hl.tstr,
        "major_consequence_rank": hl.tint,
        "transcript_id": hl.tstr,
    }

    if include_coding_annotations:
        transcript_consequences.update({
            "amino_acids": hl.tstr,
            "domains": hl.tstr,
            "hgvsp": hl.tstr,
            "lof": hl.tstr,
            "lof_flags": hl.tstr,
            "lof_filter": hl.tstr,
            "lof_info": hl.tstr,
            "polyphen_prediction": hl.tstr,
            "protein_id": hl.tstr,
            "sift_prediction": hl.tstr,
        })

    return hl.cond(
        vep_sorted_transcript_consequences_root.size() == 0,
        hl.struct(
            **{
                field: hl.null(field_type)
                for field, field_type in transcript_consequences.items()
            }),
        hl.bind(
            lambda worst_transcript_consequence:
            (worst_transcript_consequence.annotate(domains=hl.delimit(
                hl.set(worst_transcript_consequence.domains))).select(
                    *transcript_consequences.keys())),
            vep_sorted_transcript_consequences_root[0],
        ),
    )
Example #7
    def clinvar(self):
        return hl.struct(
            **{
                'allele_id':
                self._clinvar_data[self.mt.row_key].info.ALLELEID,
                'clinical_significance':
                hl.delimit(self._clinvar_data[self.mt.row_key].info.CLNSIG),
                'gold_stars':
                self._clinvar_data[self.mt.row_key].gold_stars
            })
Example #8
def make_pheno_manifest():
    mt0 = load_final_sumstats_mt(filter_sumstats=False,
                                 filter_variants=False,
                                 separate_columns_by_pop=False,
                                 annotate_with_nearest_gene=False)
    ht = mt0.cols()
    annotate_dict = {}

    annotate_dict.update({
        'pops': hl.delimit(ht.pheno_data.pop),
        'num_pops': hl.len(ht.pheno_data.pop)
    })

    for field in ['n_cases', 'n_controls', 'heritability', 'lambda_gc']:
        for pop in ['AFR', 'AMR', 'CSA', 'EAS', 'EUR', 'MID']:
            new_field = field if field != 'heritability' else 'saige_heritability'  # new field name (only applicable to saige heritability)
            idx = ht.pheno_data.pop.index(pop)
            field_expr = ht.pheno_data[field]
            annotate_dict.update({
                f'{new_field}_{pop}':
                hl.if_else(hl.is_nan(idx), hl.null(field_expr[0].dtype),
                           field_expr[idx])
            })
    annotate_dict.update({
        'filename':
        (ht.trait_type + '-' + ht.phenocode + '-' + ht.pheno_sex +
         hl.if_else(hl.len(ht.coding) > 0, '-' + ht.coding, '') +
         hl.if_else(hl.len(ht.modifier) > 0, '-' + ht.modifier, '')).replace(
             ' ', '_').replace('/', '_') + '.tsv.bgz'
    })
    ht = ht.annotate(**annotate_dict)
    aws_bucket = 'https://pan-ukb-us-east-1.s3.amazonaws.com/sumstats_release'
    ht = ht.annotate(aws_link=aws_bucket + '/' + ht.filename,
                     aws_link_tabix=aws_bucket + '_tabix/' + ht.filename +
                     '.tbi')

    other_fields_ht = hl.import_table(
        f'{ldprune_dir}/release/md5_hex_and_file_size.tsv.bgz',
        force_bgz=True,
        key=PHENO_KEY_FIELDS)
    other_fields = [
        'size_in_bytes', 'size_in_bytes_tabix', 'md5_hex', 'md5_hex_tabix'
    ]

    ht = ht.annotate(wget='wget ' + ht.aws_link,
                     wget_tabix='wget ' + ht.aws_link_tabix,
                     **{f: other_fields_ht[ht.key][f]
                        for f in other_fields})

    ht = ht.drop('pheno_data', 'pheno_indices')
    ht.export(f'{bucket}/combined_results/phenotype_manifest.tsv.bgz')
Example #9
def annotate_nearest_gene(t,
                          add_contig: bool = False,
                          add_only_gene_symbols_as_str: bool = False,
                          loc: str = 'nearest_genes'):
    intervals_ht = hl.read_table(get_gene_intervals_path())
    if add_contig:
        intervals_ht = intervals_ht.annotate(
            contig=intervals_ht.interval.start.contig)
    annotation = intervals_ht.index(t.locus, all_matches=True)
    if add_only_gene_symbols_as_str:
        annotation = hl.delimit(annotation.gene_name)
    if loc: annotation = {loc: annotation}
    return t.annotate_rows(**annotation) if isinstance(
        t, hl.MatrixTable) else t.annotate(**annotation)
Example #10
def hgvsp_from_consequence_amino_acids(csq):
    return hl.if_else(
        csq.hgvsp.contains("=") | csq.hgvsp.contains("%3D"),
        hl.bind(
            lambda protein_letters: "p." + protein_letters + hl.str(
                csq.protein_start) + protein_letters,
            hl.delimit(
                csq.amino_acids.split("").filter(lambda l: l != "").map(
                    lambda l: PROTEIN_LETTERS_1TO3.get(l)),  # pylint: disable=unnecessary-lambda
                "",
            ),
        ),
        csq.hgvsp.split(":")[-1],
    )
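
PROTEIN_LETTERS_1TO3 is defined elsewhere; assuming it is a Hail dict from one-letter to three-letter amino acid codes, a two-entry stand-in is enough to exercise the synonymous branch:

import hail as hl

PROTEIN_LETTERS_1TO3 = hl.dict({"A": "Ala", "G": "Gly"})  # stand-in, not the full table
csq = hl.struct(hgvsp="ENSP0001:p.Ala100=", amino_acids="A", protein_start=100)
print(hl.eval(hgvsp_from_consequence_amino_acids(csq)))  # p.Ala100Ala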
Example #11
    def test_export_import_plink_same(self):
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr), pat_id=hl.null(hl.tstr), mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool), is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                                      a2_reference=True, reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
Example #12
def annotate_variant_key(ds: Union[hl.MatrixTable, hl.Table]
                         ) -> Union[hl.MatrixTable, hl.Table]:
    # define key variant expression
    key_expr = hl.delimit([ds.locus.contig,
                           hl.str(ds.locus.position),
                           ds.alleles[0],
                           ds.alleles[1]], ':')

    if isinstance(ds, hl.MatrixTable):
        ds = ds.annotate_rows(variant_key=key_expr)

    if isinstance(ds, hl.Table):
        ds = ds.annotate(variant_key=key_expr)

    return ds
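
A quick usage sketch for the MatrixTable branch (the Table branch is symmetric); rows gain a `variant_key` such as '1:1:A:C':

import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=2, n_variants=3)
mt = annotate_variant_key(mt)
mt.rows().select('variant_key').show()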
Example #13
def get_expr_for_formatted_hgvs(csq):
    return hl.cond(
        hl.is_missing(csq.hgvsp)
        | HGVSC_CONSEQUENCES.contains(csq.major_consequence),
        csq.hgvsc.split(":")[-1],
        hl.cond(
            csq.hgvsp.contains("=") | csq.hgvsp.contains("%3D"),
            hl.bind(
                lambda protein_letters: "p." + protein_letters + hl.str(
                    csq.protein_start) + protein_letters,
                hl.delimit(
                    csq.amino_acids.split("").map(
                        lambda l: PROTEIN_LETTERS_1TO3.get(l)), ""),
            ),
            csq.hgvsp.split(":")[-1],
        ),
    )
Example #14
def _encode_allele(allele: hl.expr.StringExpression) -> hl.expr.StringExpression:
    return hl.delimit(
        _grouped(
            # Convert string to array
            allele.split("")[:-1]
            # Convert letters to numbers
            .map(lambda letter: hl.switch(letter).when("A", 0).when("C", 1).when("G", 2).when("T", 3).or_missing()),
            3,  # Group into sets of 3
        )
        # Ensure each group has 3 elements
        .map(lambda g: g.extend(hl.range(3 - hl.len(g)).map(lambda _: 0)))
        # Bit shift and add group elements
        .map(lambda g: g[0] * 16 + g[1] * 4 + g[2])
        # Convert to letters
        .map(lambda n: _ENCODED_ALLELE_CHARACTERS[n]),
        "",
    )
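
`_grouped` and `_ENCODED_ALLELE_CHARACTERS` are defined elsewhere in the module; the stand-ins below (a chunking helper and a 64-character alphabet, here URL-safe base64) are assumptions that make the 2-bits-per-base packing directly evaluable:

import hail as hl

# Assumed 64-character output alphabet; any 64 distinct characters would work.
_ENCODED_ALLELE_CHARACTERS = hl.literal(
    list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"))

def _grouped(arr, size):
    # Assumed helper: split an array expression into consecutive chunks of `size`.
    return hl.range(0, hl.len(arr), size).map(lambda i: arr[i:i + size])

print(hl.eval(_encode_allele(hl.str("ACGT"))))  # 'Gw' with this alphabet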
Example #15
def prepare_variant_results():
    results_path = pipeline_config.get("SCHEMA", "variant_results_path")
    annotations_path = pipeline_config.get("SCHEMA",
                                           "variant_annotations_path")

    results = hl.read_table(results_path)

    results = results.drop("v", "af_case", "af_ctrl")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))

    results = results.annotate(
        source=hl.delimit(hl.sorted(hl.array(results.source)), ", "))

    results = results.group_by(
        "locus",
        "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result:
                                  (group_result.analysis_group,
                                   group_result.drop("analysis_group")))))

    variants = hl.read_table(annotations_path)
    variants = variants.select(
        gene_id=variants.gene_id,
        consequence=hl.case().when(
            (variants.canonical_term == "missense_variant") &
            (variants.mpc >= 3), "missense_variant_mpc_>=3").when(
                (variants.canonical_term == "missense_variant") &
                (variants.mpc >= 2), "missense_variant_mpc_2-3").when(
                    variants.canonical_term == "missense_variant",
                    "missense_variant_mpc_<2").default(
                        variants.canonical_term),
        hgvsc=variants.hgvsc_canonical.split(":")[-1],
        hgvsp=variants.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=variants.cadd,
                       mpc=variants.mpc,
                       polyphen=variants.polyphen),
    )

    variants = variants.annotate(**results[variants.key])
    variants = variants.filter(hl.is_defined(variants.group_results))

    return variants
Example #16
def prepare_clinvar_variants(clinvar_path, reference_genome):
    ds = hl.read_table(clinvar_path)

    ds = ds.filter(hl.is_defined(ds[f"locus_{reference_genome}"]) & hl.is_defined(ds[f"alleles_{reference_genome}"]))

    ds = ds.select(locus=ds[f"locus_{reference_genome}"], alleles=ds[f"alleles_{reference_genome}"], **ds.variant)

    # Remove any variants with alleles other than ACGT
    ds = ds.filter(
        hl.len(hl.set(hl.delimit(ds.alleles, "").split("")).difference(hl.set(["A", "C", "G", "T", ""]))) == 0
    )

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ds = ds.key_by("locus", "alleles")

    return ds
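
The ACGT filter above works by exploding the concatenated alleles into single characters; a quick check of just that expression (the helper name is mine):

import hail as hl

def only_acgt(alleles):
    # Same expression as the filter above, isolated so it can be evaluated directly.
    return hl.len(
        hl.set(hl.delimit(alleles, "").split("")).difference(
            hl.set(["A", "C", "G", "T", ""]))) == 0

print(hl.eval(only_acgt(hl.literal(["A", "CT"]))))     # True
print(hl.eval(only_acgt(hl.literal(["A", "<DEL>"]))))  # False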
Example #17
    def test_export_import_plink_same(self):
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([
            mt.locus.contig,
            hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]
        ], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr),
                            pat_id=hl.null(hl.tstr),
                            mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool),
                            is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed',
                                      bfile + '.bim',
                                      bfile + '.fam',
                                      a2_reference=True,
                                      reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
Example #18
def ht_to_vcf_mt(
    info_ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.MatrixTable:
    """
    Creates a MatrixTable ready for VCF export from a Table. In particular, the following conversions are done:
    - All int64 are coerced to int32
    - Fields specified by `pipe_delimited_annotations` will be converted from arrays to pipe-delimited strings

    .. note::

        The MT returned has no cols.

    :param info_ht: Input HT
    :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct)
    :return: MatrixTable ready for VCF export
    """
    def get_pipe_expr(
            array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")),
                          "|")

    # Make sure the HT is keyed by locus, alleles
    info_ht = info_ht.key_by("locus", "alleles")

    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for f, ft in info_ht.info.dtype.items():
        if ft == hl.dtype("int64"):
            logger.warning(
                f"Coercing field info.{f} from int64 to int32 for VCF output. Value will be capped at int32 max value."
            )
            info_ht = info_ht.annotate(info=info_ht.info.annotate(
                **{f: hl.int32(hl.min(2**31 - 1, info_ht.info[f]))}))
        elif ft == hl.dtype("array<int64>"):
            logger.warning(
                f"Coercing field info.{f} from array<int64> to array<int32> for VCF output. Array values will be capped at int32 max value."
            )
            info_ht = info_ht.annotate(info=info_ht.info.annotate(
                **{
                    f:
                    info_ht.info[f].map(
                        lambda x: hl.int32(hl.min(2**31 - 1, x)))
                }))

    info_expr = {}

    # Make sure to pipe-delimit fields that need it.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty)
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE, but it is overwritten below
    for f in pipe_delimited_annotations:
        if f in info_ht.info:
            info_expr[f] = "|" + get_pipe_expr(info_ht.info[f])

    # Flatten SB if it is an array of arrays
    if "SB" in info_ht.info and not isinstance(info_ht.info.SB,
                                               hl.expr.ArrayNumericExpression):
        info_expr["SB"] = info_ht.info.SB[0].extend(info_ht.info.SB[1])

    if "AS_SB_TABLE" in info_ht.info:
        info_expr["AS_SB_TABLE"] = get_pipe_expr(
            info_ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")))

    # Annotate with new expression and add 's' empty string field required to cast HT to MT
    info_ht = info_ht.annotate(info=info_ht.info.annotate(**info_expr),
                               s=hl.null(hl.tstr))

    # Create an MT with no cols so that we can export to VCF
    info_mt = info_ht.to_matrix_table_row_major(columns=["s"],
                                                entry_field_name="s")
    return info_mt.filter_cols(False)
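
A hedged usage sketch: a minimal sites Table whose `info` struct carries one array<int64> field, so both the int64 coercion and the pipe-delimiting paths run. The field name is made up, and `logger` stands in for whatever the surrounding module configures:

import hail as hl
import logging

logger = logging.getLogger(__name__)  # ht_to_vcf_mt logs its coercion warnings here

ht = hl.balding_nichols_model(n_populations=1, n_samples=1, n_variants=3).rows()
ht = ht.select(info=hl.struct(AS_QUALapprox=[hl.null(hl.tint64), hl.int64(42)]))
vcf_mt = ht_to_vcf_mt(ht, pipe_delimited_annotations=["AS_QUALapprox"])
hl.export_vcf(vcf_mt, '/tmp/sites.vcf.bgz')  # INFO reads AS_QUALapprox=||42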

Example #19
mnvs = import_mnv_file(replace_quote_char(args.mnv_url), quote="'")

if args.three_bp_mnv_url:
    mnvs_3bp = import_three_bp_mnv_file(replace_quote_char(
        args.three_bp_mnv_url),
                                        quote="'")

    snp12_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2: hl.delimit(
                [
                    snv1.chrom,
                    hl.str(snv1.pos),
                    snv1.ref + snv2.ref,
                    snv1.alt + snv2.alt,
                ],
                "-",
            ),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[2].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    snp23_components = mnvs_3bp.select(
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--results", required=True)
    parser.add_argument("--annotations", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    variants = hl.read_table(args.annotations)
    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        chrom=variants.locus.contig,
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        alt=variants.alleles[1],
        ref=variants.alleles[0],
    )

    variants = variants.transmute(
        transcript_id=hl.delimit(variants.transcript_id, ","),
        hgvsc=hl.delimit(
            variants.hgvsc.keys().map(lambda k: k + ":" + variants.hgvsc[k]),
            ","),
        hgvsp=hl.delimit(
            variants.hgvsp.keys().map(lambda k: k + ":" + variants.hgvsp[k]),
            ","),
    )

    variants = variants.annotate(
        csq_canonical=hl.case().when((variants.csq_canonical == "mis")
                                     & (variants.mpc >= 3), "mis3").
        when((variants.csq_canonical == "mis")
             & (variants.mpc >= 2), "mis2").default(variants.csq_canonical))

    variants = variants.annotate(flags="PASS")
    variants = variants.drop("v")

    results = hl.read_table(args.results)
    results = results.annotate(
        analysis_group=results.analysis_group.lower().replace(
            "[^a-z0-9]+", "_").replace("_+$", ""))
    results = results.drop("v")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))
    results = results.annotate(
        af_case=hl.cond(results.an_case == 0, 0, results.ac_case /
                        results.an_case))

    variants = variants.filter(hl.is_defined(results[variants.key]))

    analysis_groups = results.aggregate(
        hl.agg.collect_as_set(results.analysis_group))

    variants = variants.annotate(groups=hl.struct())
    for group in analysis_groups:
        group_results = results.filter(
            results.analysis_group == group).drop("analysis_group")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.key]}))

    # The latest (2019/04/15) SCHEMA dataset moved the source and in_analysis field from variant level to group level
    # in_analysis is the same for all groups within a variant, but source is not
    variants = variants.annotate(in_analysis=variants.groups.meta.in_analysis,
                                 source=variants.groups.meta.source)

    variants.write(args.output)
Example #21
def infer_families(
    relationship_ht: hl.Table,
    sex: Union[hl.Table, Dict[str, bool]],
    duplicate_samples_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
) -> hl.Pedigree:
    """
    This function takes a hail Table with a row for each pair of related individuals (i, j) in the data (it's OK to have unrelated samples too).
    The `relationship_col` should be a column specifying the relationship between each pair of samples as defined in this module's constants.

    This function returns a pedigree containing trios inferred from the data. Family ID can be the same for multiple
    trios if one or more members of the trios are related (e.g. sibs, multi-generational family). Trios are ordered by family ID.

    .. note::

        This function only returns complete trios defined as: one child, one father and one mother (sex is required for both parents).

    :param relationship_ht: Input relationship table
    :param sex: A Table or dict giving the sex for each sample (`TRUE`=female, `FALSE`=male). If a Table is given, it should have a field `is_female`.
    :param duplicate_samples_ht: All duplicated samples TO REMOVE (if not provided, this function won't work as it assumes that each child has exactly two parents)
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair as defined in this module's constants.
    :return: Pedigree of complete trios
    """
    def group_parent_child_pairs_by_fam(
        parent_child_pairs: Iterable[Tuple[str, str]]
    ) -> List[List[Tuple[str, str]]]:
        """
        Takes all parent-child pairs and groups them by family.
        A family here is defined as a list of sample-pairs which all share at least one sample with at least one other sample-pair in the list.

        :param parent_child_pairs: All the parent-child pairs
        :return: A list of families, where each element of the list is a list of the parent-child pairs
        """
        fam_id = 1  # stores the current family id
        s_fam = dict()  # stores the family id for each sample
        fams = defaultdict(list)  # stores fam_id -> sample-pairs
        for pair in parent_child_pairs:
            if pair[0] in s_fam:
                if pair[1] in s_fam:
                    if (
                            s_fam[pair[0]] != s_fam[pair[1]]
                    ):  # If both samples are in different families, merge the families
                        new_fam_id = s_fam[pair[0]]
                        fam_id_to_merge = s_fam[pair[1]]
                        for s in s_fam:
                            if s_fam[s] == fam_id_to_merge:
                                s_fam[s] = new_fam_id
                        fams[new_fam_id].extend(fams.pop(fam_id_to_merge))
                else:  # If only the 1st sample in the pair is already in a family, assign the 2nd sample in the pair to the same family
                    s_fam[pair[1]] = s_fam[pair[0]]
                fams[s_fam[pair[0]]].append(pair)
            elif (
                    pair[1] in s_fam
            ):  # If only the 2nd sample in the pair is already in a family, assign the 1st sample in the pair to the same family
                s_fam[pair[0]] = s_fam[pair[1]]
                fams[s_fam[pair[1]]].append(pair)
            else:  # If none of the samples in the pair is already in a family, create a new family
                s_fam[pair[0]] = fam_id
                s_fam[pair[1]] = fam_id
                fams[fam_id].append(pair)
                fam_id += 1

        return list(fams.values())

    def get_trios(
        fam_id: str,
        parent_child_pairs: List[Tuple[str, str]],
        related_pairs: Dict[Tuple[str, str], str],
    ) -> List[hl.Trio]:
        """
        Generates trios from the list of parent-child pairs in the family and
        all related pairs in the data. Only complete parent/offspring trios are included in the results.

        The trios are assembled as follows:
        1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs
        2. For each possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)
        3. If there are multiple children for a given parent pair, all children should be siblings with each other
        4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

        :param fam_id: The family ID
        :param parent_child_pairs: The parent-child pairs for this family
        :param related_pairs: All related sample pairs in the data
        :return: List of trios in the family
        """
        def get_possible_parents(samples: List[str]) -> List[Tuple[str, str]]:
            """
            1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs

            :param samples: All samples in the family
            :return: Possible parent pairs
            """
            possible_parents = []
            for i in range(len(samples)):
                for j in range(i + 1, len(samples)):
                    if (related_pairs.get(
                            tuple(sorted([samples[i], samples[j]]))) is None):
                        if sex.get(samples[i]) is False and sex.get(
                                samples[j]) is True:
                            possible_parents.append((samples[i], samples[j]))
                        elif (sex.get(samples[i]) is True
                              and sex.get(samples[j]) is False):
                            possible_parents.append((samples[j], samples[i]))
            return possible_parents

        def get_children(possible_parents: Tuple[str, str]) -> List[str]:
            """
            2. For a given possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)

            :param possible_parents: A pair of possible parents
            :return: The list of all children (if any) corresponding to the possible parents
            """
            possible_offsprings = defaultdict(
                set
            )  # stores sample -> set of parents from possible_parents for which (sample, parent) appears in parent_child_pairs
            for pair in parent_child_pairs:
                if possible_parents[0] == pair[0]:
                    possible_offsprings[pair[1]].add(possible_parents[0])
                elif possible_parents[0] == pair[1]:
                    possible_offsprings[pair[0]].add(possible_parents[0])
                elif possible_parents[1] == pair[0]:
                    possible_offsprings[pair[1]].add(possible_parents[1])
                elif possible_parents[1] == pair[1]:
                    possible_offsprings[pair[0]].add(possible_parents[1])

            return [
                s for s, parents in possible_offsprings.items()
                if len(parents) == 2
            ]

        def check_sibs(children: List[str]) -> bool:
            """
            3. If there are multiple children for a given parent pair, all children should be siblings with each other

            :param children: List of all children for a given parent pair
            :return: Whether all children in the list are siblings
            """
            for i in range(len(children)):
                for j in range(i + 1, len(children)):
                    if (related_pairs[tuple(sorted([children[i], children[j]
                                                    ]))] != SIBLINGS):
                        return False
            return True

        def discard_multi_parents_children(trios: List[hl.Trio]):
            """
            4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

            :param trios: All trios formed for this family
            :return: The list of trios for which each child has a single parent pair.
            """
            children_trios = defaultdict(list)
            for trio in trios:
                children_trios[trio.s].append(trio)

            for s, s_trios in children_trios.items():
                if len(s_trios) > 1:
                    logger.warning(
                        "Discarded duplicated child {0} found in multiple trios: {1}"
                        .format(s, ", ".join([str(trio) for trio in s_trios])))

            return [
                trios[0] for trios in children_trios.values()
                if len(trios) == 1
            ]

        # Get all possible pairs of parents in (father, mother) order
        all_possible_parents = get_possible_parents(
            list({s
                  for pair in parent_child_pairs for s in pair}))

        trios = []
        for possible_parents in all_possible_parents:
            children = get_children(possible_parents)
            if check_sibs(children):
                trios.extend([
                    hl.Trio(
                        s=s,
                        fam_id=fam_id,
                        pat_id=possible_parents[0],
                        mat_id=possible_parents[1],
                        is_female=sex.get(s),
                    ) for s in children
                ])
            else:
                logger.warning(
                    "Discarded family with same parents, and multiple offspring that weren't siblings:"
                    "\nFather: {}\nMother: {}\nChildren: {}".format(
                        possible_parents[0], possible_parents[1],
                        ", ".join(children)))

        return discard_multi_parents_children(trios)

    # Get all the relations we care about:
    # => Remove unrelateds and duplicates
    dups = duplicate_samples_ht.aggregate(
        hl.agg.explode(lambda dup: hl.agg.collect_as_set(dup),
                       duplicate_samples_ht.filtered),
        _localize=False,
    )
    relationship_ht = relationship_ht.filter(
        ~dups.contains(relationship_ht[i_col])
        & ~dups.contains(relationship_ht[j_col])
        & (relationship_ht[relationship_col] != UNRELATED))

    # Check relatedness table format
    if not relationship_ht[i_col].dtype == relationship_ht[j_col].dtype:
        logger.error(
            "i_col and j_col of the relatedness table need to be of the same type."
        )

    # If i_col and j_col aren't str, then convert them
    if not isinstance(relationship_ht[i_col], hl.expr.StringExpression):
        logger.warning(
            f"Pedigrees can only be constructed from string IDs, but your relatedness_ht ID column is of type: {relationship_ht[i_col].dtype}. Expression will be converted to string in Pedigrees."
        )
        if isinstance(relationship_ht[i_col], hl.expr.StructExpression):
            logger.warning(
                f"Struct fields {list(relationship_ht[i_col])} will be joined by underscores to use as sample names in Pedigree."
            )
            relationship_ht = relationship_ht.key_by(
                **{
                    i_col:
                    hl.delimit(
                        hl.array([
                            hl.str(relationship_ht[i_col][x])
                            for x in relationship_ht[i_col]
                        ]),
                        "_",
                    ),
                    j_col:
                    hl.delimit(
                        hl.array([
                            hl.str(relationship_ht[j_col][x])
                            for x in relationship_ht[j_col]
                        ]),
                        "_",
                    ),
                })
        else:
            raise NotImplementedError(
                "The `i_col` and `j_col` columns of the `relationship_ht` argument passed to infer_families are not of type StringExpression or Struct."
            )

    # If sex is a Table, extract sex information as a Dict
    if isinstance(sex, hl.Table):
        sex = dict(hl.tuple([sex.s, sex.is_female]).collect())

    # Collect all related sample pairs and
    # create a dictionary with pairs as keys and relationships as values
    # Sample-pairs are tuples ordered by sample name
    related_pairs = {
        tuple(sorted([i, j])): rel
        for i, j, rel in hl.tuple([
            relationship_ht.i, relationship_ht.j, relationship_ht.relationship
        ]).collect()
    }

    parent_child_pairs_by_fam = group_parent_child_pairs_by_fam(
        [pair for pair, rel in related_pairs.items() if rel == PARENT_CHILD])
    return hl.Pedigree([
        trio for fam_index, parent_child_pairs in enumerate(
            parent_child_pairs_by_fam) for trio in get_trios(
                str(fam_index), parent_child_pairs, related_pairs)
    ])
Example #22
# initialize hail
logging.info('Initialize hail')
hl.init(log=args.hail_log)

# read hail Tables
logging.info('Read GWAS results saved in hail Table')
gwas_out = hl.read_table(args.gwas_ht)

# add variant column
logging.info('Adding `variant` column: chr:pos:ref:alt')
gwas_out = gwas_out.annotate(
    variant=hl.delimit(
        hl.array([
            gwas_out['locus'].contig,
            hl.str(gwas_out['locus'].position),
            gwas_out['alleles'][0],
            gwas_out['alleles'][1]
        ]),
        delimiter=':')
)

# change the key of Table to variant
logging.info('Changing the key of Table to `variant` column')
gwas_out = gwas_out.key_by('variant')
gwas_out = gwas_out.repartition(40)
gwas_out = gwas_out.cache()

# exporting TSV
logging.info('Looping over list of trait lists and output TSVs')
phenotypes = gwas_out['phenotypes'].collect()[0]   # note that this annotation `phenotypes` was added by gwas_on_subset_ht.py!
for i, subset in enumerate(phenotypes):
Example #23
def adjust_vcf_incompatible_types(
    ht: hl.Table,
    pipe_delimited_annotations: List[str] = INFO_VCF_AS_PIPE_DELIMITED_FIELDS,
) -> hl.Table:
    """
    Create a Table ready for vcf export.

    In particular, the following conversions are done:
        - All int64 are coerced to int32
        - Fields specified by `pipe_delimited_annotations` are converted from arrays to pipe-delimited strings

    :param ht: Input Table.
    :param pipe_delimited_annotations: List of info fields (they must be fields of the ht.info Struct).
    :return: Table ready for VCF export.
    """
    def get_pipe_expr(
            array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
        return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")),
                          "|")

    # Make sure the HT is keyed by locus, alleles
    ht = ht.key_by("locus", "alleles")

    info_type_convert_expr = {}
    # Convert int64 fields to int32 (int64 isn't supported by VCF)
    for f, ft in ht.info.dtype.items():
        if ft == hl.dtype("int64"):
            logger.warning(
                "Coercing field info.%s from int64 to int32 for VCF output. Value will be capped at int32 max value.",
                f,
            )
            info_type_convert_expr.update(
                {f: hl.int32(hl.min(2**31 - 1, ht.info[f]))})
        elif ft == hl.dtype("array<int64>"):
            logger.warning(
                "Coercing field info.%s from array<int64> to array<int32> for VCF output. Array values will be capped "
                "at int32 max value.",
                f,
            )
            info_type_convert_expr.update(
                {f: ht.info[f].map(lambda x: hl.int32(hl.min(2**31 - 1, x)))})

    ht = ht.annotate(info=ht.info.annotate(**info_type_convert_expr))

    info_expr = {}

    # Make sure to pipe-delimit fields that need it.
    # Note: the expr needs to be prefixed by "|" because GATK expects one value for the ref (always empty)
    # Note2: this doesn't produce the correct annotation for AS_SB_TABLE, it is handled below
    for f in pipe_delimited_annotations:
        if f in ht.info and f != "AS_SB_TABLE":
            info_expr[f] = "|" + get_pipe_expr(ht.info[f])

    # Flatten SB if it is an array of arrays
    if "SB" in ht.info and not isinstance(ht.info.SB,
                                          hl.expr.ArrayNumericExpression):
        info_expr["SB"] = ht.info.SB[0].extend(ht.info.SB[1])

    if "AS_SB_TABLE" in ht.info:
        info_expr["AS_SB_TABLE"] = get_pipe_expr(
            ht.info.AS_SB_TABLE.map(lambda x: hl.delimit(x, ",")))

    # Annotate with new expression
    ht = ht.annotate(info=ht.info.annotate(**info_expr))

    return ht
Example #24
def get_gold_stars(review_status):
    review_status_str = hl.delimit(hl.sorted(review_status, key=lambda s: s.replace("^_", "z")))
    return CLINVAR_GOLD_STARS[review_status_str]
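
CLINVAR_GOLD_STARS is defined elsewhere; assuming it maps the comma-joined, sorted review-status string to a star count, a one-entry stand-in shows the lookup (the "^_" -> "z" sort key pushes underscore-prefixed statuses last):

import hail as hl

CLINVAR_GOLD_STARS = hl.dict(  # one-entry stand-in for the real lookup table
    {"criteria_provided,_multiple_submitters,_no_conflicts": 2})
status = hl.literal(["_no_conflicts", "criteria_provided", "_multiple_submitters"])
print(hl.eval(get_gold_stars(status)))  # 2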
Example #25
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(),
                                                     'extract_vcf'),
                                        suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1),
                                         _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.gene_ht_interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(
            hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol +
                       '_' + gene_ht.annotation,
                       variant=hl.delimit(gene_ht.variants,
                                          '\t')).key_by().drop('start').export(
                                              args.group_output_file,
                                              header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' +
                          hl.str(mt.locus.position) + '_' + mt.alleles[0] +
                          '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(
            hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(
                mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
Example #26
    def get_csq_from_struct(element: hl.expr.StructExpression,
                            feature_type: str) -> hl.expr.StringExpression:
        # Most fields are 1-1, just lowercase
        fields = dict(element)

        # Add general exceptions
        fields.update({
            "allele":
            element.variant_allele,
            "consequence":
            hl.delimit(element.consequence_terms, delimiter="&"),
            "feature_type":
            feature_type,
            "feature":
            (element.transcript_id if "transcript_id" in element else
             element.regulatory_feature_id if "regulatory_feature_id"
             in element else element.motif_feature_id
             if "motif_feature_id" in element else ""),
            "variant_class":
            vep_expr.variant_class,
        })

        # Add exception for transcripts
        if feature_type == "Transcript":
            fields.update({
                "canonical":
                hl.cond(element.canonical == 1, "YES", ""),
                "ensp":
                element.protein_id,
                "gene":
                element.gene_id,
                "symbol":
                element.gene_symbol,
                "symbol_source":
                element.gene_symbol_source,
                "cdna_position":
                hl.str(element.cdna_start) + hl.cond(
                    element.cdna_start == element.cdna_end,
                    "",
                    "-" + hl.str(element.cdna_end),
                ),
                "cds_position":
                hl.str(element.cds_start) + hl.cond(
                    element.cds_start == element.cds_end,
                    "",
                    "-" + hl.str(element.cds_end),
                ),
                "protein_position":
                hl.str(element.protein_start) + hl.cond(
                    element.protein_start == element.protein_end,
                    "",
                    "-" + hl.str(element.protein_end),
                ),
                "sift":
                element.sift_prediction + "(" +
                hl.format("%.3f", element.sift_score) + ")",
                "polyphen":
                element.polyphen_prediction + "(" +
                hl.format("%.3f", element.polyphen_score) + ")",
                "domains":
                hl.delimit(element.domains.map(lambda d: d.db + ":" + d.name),
                           "&"),
            })
        elif feature_type == "MotifFeature":
            fields["motif_score_change"] = hl.format(
                "%.3f", element.motif_score_change)

        return hl.delimit(
            [hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields],
            "|")
Example #27
def get_pipe_expr(
        array_expr: hl.expr.ArrayExpression) -> hl.expr.StringExpression:
    return hl.delimit(array_expr.map(lambda x: hl.or_else(hl.str(x), "")),
                      "|")
Example #28
    def export(self, path, delimiter='\t', missing='NA', header=True):
        """Export a field to a text file.

        Examples
        --------

        >>> small_mt.GT.export('output/gt.tsv')
        >>> with open('output/gt.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	0	1	2	3
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>

        >>> small_mt.GT.export('output/gt-no-header.tsv', header=False)
        >>> with open('output/gt-no-header.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>

        >>> small_mt.pop.export('output/pops.tsv')
        >>> with open('output/pops.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        sample_idx	pop
        0	2
        1	2
        2	0
        3	2
        <BLANKLINE>

        >>> small_mt.ancestral_af.export('output/ancestral_af.tsv')
        >>> with open('output/ancestral_af.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	ancestral_af
        1:1	["A","C"]	5.3905e-01
        1:2	["A","C"]	8.6768e-01
        1:3	["A","C"]	4.3765e-01
        1:4	["A","C"]	7.6300e-01
        <BLANKLINE>

        >>> mt = small_mt
        >>> small_mt.bn.export('output/bn.tsv')
        >>> with open('output/bn.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        bn
        {"n_populations":3,"n_samples":4,"n_variants":4,"n_partitions":8,"pop_dist":[1,1,1],"fst":[0.1,0.1,0.1],"mixture":false}
        <BLANKLINE>


        Notes
        -----

        For entry-indexed expressions, if there is one column key field, the
        result of calling :func:`~hail.expr.functions.str` on that field is used as
        the column header. Otherwise, each compound column key is converted to
        JSON and used as a column header. For example:

        >>> small_mt = small_mt.key_cols_by(s=small_mt.sample_idx, family='fam1')
        >>> small_mt.GT.export('output/gt-no-header.tsv')
        >>> with open('output/gt-no-header.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	{"s":0,"family":"fam1"}	{"s":1,"family":"fam1"}	{"s":2,"family":"fam1"}	{"s":3,"family":"fam1"}
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>


        Parameters
        ----------
        path : :class:`str`
            The path to which to export.
        delimiter : :class:`str`
            The string for delimiting columns.
        missing : :class:`str`
            The string to output for missing values.
        header : :obj:`bool`
            When ``True`` include a header line.
        """
        uid = Env.get_uid()
        self_name, ds = self._to_relational_preserving_rows_and_cols(uid)
        if isinstance(ds, hl.Table):
            ds.export(output=path, delimiter=delimiter, header=header)
        else:
            assert len(self._indices.axes) == 2
            entries, cols = Env.get_uid(), Env.get_uid()
            t = ds.select_cols().localize_entries(entries, cols)
            t = t.order_by(*t.key)
            output_col_name = Env.get_uid()
            entry_array = t[entries]
            if self_name:
                entry_array = hl.map(lambda x: x[self_name], entry_array)
            entry_array = hl.map(
                lambda x: hl.if_else(hl.is_missing(x), missing, hl.str(x)),
                entry_array)
            file_contents = t.select(
                **{k: hl.str(t[k])
                   for k in ds.row_key},
                **{output_col_name: hl.delimit(entry_array, delimiter)})
            if header:
                col_key = t[cols]
                if len(ds.col_key) == 1:
                    col_key = hl.map(lambda x: x[0], col_key)
                column_names = hl.map(hl.str,
                                      col_key).collect(_localize=False)[0]
                header_table = hl.utils.range_table(1).key_by().select(
                    **{k: k
                       for k in ds.row_key},
                    **{output_col_name: hl.delimit(column_names, delimiter)})
                file_contents = header_table.union(file_contents)
            file_contents.export(path, delimiter=delimiter, header=False)
Example #29
def main(args):
    ht_snp = hl.import_table(args.snp, impute=True)
    ht_snp = ht_snp.annotate(variant=hl.delimit([
        ht_snp.chromosome,
        hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2
    ],
                                                delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe
    ht_snp = ht_snp.annotate(vep=(hl.case().when(
        hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
        ht_snp.vep.worst_csq_for_variant_canonical).when(
            hl.is_defined(ht_snp.vep.worst_csq_for_variant),
            ht_snp.vep.worst_csq_for_variant).or_missing()),
                             is_canonical_vep=hl.is_defined(
                                 ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(most_severe=hl.if_else(
        hl.is_defined(ht_snp.vep), ht_snp.vep.most_severe_consequence,
        'intergenic_variant'),
                             gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant',
                              'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist())**2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        df[f'gnomad_lead_r2_{pop}'] = np.nan
        df[f'gnomad_lead_r2_{pop}'].iloc[idx_snp] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open

    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
Example #30
def format_variants_table(ds):

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ###############
    # Frequencies #
    ###############

    g = hl.eval(ds.globals)

    freq_index_tree = get_freq_index_tree(g.freq_index_dict)

    subsets = list(freq_index_tree.keys())

    ds = ds.annotate(
        **{
            subset: hl.struct(
                # Adjusted frequencies
                AC_adj=freq_expression(ds, "AC", freq_index_tree[subset]),
                AN_adj=freq_expression(ds, "AN", freq_index_tree[subset]),
                AF_adj=freq_expression(ds, "AF", freq_index_tree[subset]),
                nhomalt_adj=freq_expression(ds, "homozygote_count", freq_index_tree[subset]),
                # Raw frequencies
                AC_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                AN_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AN,
                AF_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AF,
                nhomalt_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].homozygote_count,
                # Popmax
                popmax=ds.popmax[g.popmax_index_dict[subset]].pop,
                AC_popmax=ds.popmax[g.popmax_index_dict[subset]].AC,
                AN_popmax=ds.popmax[g.popmax_index_dict[subset]].AN,
                AF_popmax=ds.popmax[g.popmax_index_dict[subset]].AF,
                nhomalt_popmax=ds.popmax[g.popmax_index_dict[subset]].homozygote_count,
            )
            for subset in subsets
        }
    )

    ##############################
    # Filtering allele frequency #
    ##############################

    faf_index_tree = collections.defaultdict(dict)
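    # e.g. a key like "non_topmed_afr" would yield faf_index_tree["non_topmed"]["afr"],
    # while a bare subset key yields faf_index_tree[subset]["total"]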
    for labels_combo, index in g.faf_index_dict.items():
        labels = labels_combo.split("_")
        # Some subset labels contain an underscore, so rebuild those after splitting
        if labels[0] == "non":
            labels = ["_".join(labels[0:2])] + labels[2:]

        if len(labels) == 2:
            [subset, pop] = labels
            faf_index_tree[subset][pop] = index
        else:
            assert len(labels) == 1
            subset = labels[0]
            faf_index_tree[subset]["total"] = index

    ds = ds.annotate(
        **{
            subset: ds[subset].annotate(
                faf95_adj=hl.struct(**{pop: ds.faf[index].faf95 for pop, index in faf_index_tree[subset].items()}),
                faf99_adj=hl.struct(**{pop: ds.faf[index].faf99 for pop, index in faf_index_tree[subset].items()}),
            )
            for subset in subsets
        }
    )

    ds = ds.drop("freq", "popmax", "faf")

    ##############
    # Histograms #
    ##############

    # Extract overall age distribution
    ds = ds.transmute(
        gnomad_age_hist_het=ds.age_hist_het[g.age_index_dict["gnomad"]],
        gnomad_age_hist_hom=ds.age_hist_hom[g.age_index_dict["gnomad"]],
    )

    # Convert the lists of numbers in histograms into pipe-delimited strings
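    # e.g. a bin_freq of [1, 2, 3] becomes the string "1|2|3"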
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"), bin_edges=hl.delimit(ds[field].bin_edges, "|")
            )
            for field in [
                "ab_hist_alt",
                "dp_hist_all",
                "dp_hist_alt",
                "gq_hist_all",
                "gq_hist_alt",
                "gnomad_age_hist_het",
                "gnomad_age_hist_hom",
            ]
        }
    )

    ###########################
    # Quality metrics / flags #
    ###########################

    # Use the same fields as the VCFs
    # Based on https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
    ds = ds.transmute(
        BaseQRankSum=ds.allele_info.BaseQRankSum,
        ClippingRankSum=ds.allele_info.ClippingRankSum,
        DP=ds.allele_info.DP,
        FS=ds.info_FS,
        InbreedingCoeff=ds.info_InbreedingCoeff,
        MQ=ds.info_MQ,
        MQRankSum=ds.info_MQRankSum,
        QD=ds.info_QD,
        ReadPosRankSum=ds.info_ReadPosRankSum,
        rf_negative_label=ds.fail_hard_filters,
        rf_positive_label=ds.tp,
        rf_tp_probability=ds.rf_probability,
        SOR=ds.info_SOR,
        VQSLOD=ds.allele_info.VQSLOD,
        VQSR_culprit=ds.allele_info.culprit,
        VQSR_NEGATIVE_TRAIN_SITE=ds.info_NEGATIVE_TRAIN_SITE,
        VQSR_POSITIVE_TRAIN_SITE=ds.info_POSITIVE_TRAIN_SITE,
    )

    # These fields are left unaltered at the top level
    #
    # allele_type
    # decoy
    # has_star
    # lcr
    # n_alt_alleles
    # nonpar
    # pab_max
    # rf_label
    # rf_train
    # segdup
    # transmitted_singleton
    # variant_type
    # was_mixed

    # TODO: Remove this, leave these at top level
    ds = ds.transmute(
        allele_info=hl.struct(
            BaseQRankSum=ds.BaseQRankSum,
            ClippingRankSum=ds.ClippingRankSum,
            DP=ds.DP,
            FS=ds.FS,
            InbreedingCoeff=ds.InbreedingCoeff,
            MQ=ds.MQ,
            MQRankSum=ds.MQRankSum,
            QD=ds.QD,
            ReadPosRankSum=ds.ReadPosRankSum,
            SOR=ds.SOR,
            VQSLOD=ds.VQSLOD,
            VQSR_culprit=ds.VQSR_culprit,
            VQSR_NEGATIVE_TRAIN_SITE=ds.VQSR_NEGATIVE_TRAIN_SITE,
            VQSR_POSITIVE_TRAIN_SITE=ds.VQSR_POSITIVE_TRAIN_SITE,
        )
    )

    ###################
    # VEP annotations #
    ###################

    ds = ds.annotate(sortedTranscriptConsequences=sorted_transcript_consequences_v2(ds.vep))

    ds = ds.drop("vep")

    #########
    # Flags #
    #########

    # TODO: Leave these at the top level
    ds = ds.transmute(flags=hl.struct(lcr=ds.lcr, segdup=ds.segdup))

    # TODO: Remove this, these flags are calculated on the fly
    ds = ds.annotate(
        flags=ds.flags.annotate(
            lc_lof=get_expr_for_variant_lc_lof_flag(ds.sortedTranscriptConsequences),
            lof_flag=get_expr_for_variant_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
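        # hl.bind evaluates each gene-level flag set once and reuses it for every consequence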
        sortedTranscriptConsequences=hl.bind(
            lambda genes_with_lc_lof_flag, genes_with_loftee_flag_flag: ds.sortedTranscriptConsequences.map(
                lambda csq: csq.annotate(
                    flags=hl.struct(
                        lc_lof=get_expr_for_consequence_lc_lof_flag(csq),
                        lc_lof_in_gene=genes_with_lc_lof_flag.contains(csq.gene_id),
                        lof_flag=get_expr_for_consequence_loftee_flag_flag(csq),
                        lof_flag_in_gene=genes_with_loftee_flag_flag.contains(csq.gene_id),
                        nc_transcript=(csq.category == "lof") & (csq.lof == ""),
                    )
                )
            ),
            get_expr_for_genes_with_lc_lof_flag(ds.sortedTranscriptConsequences),
            get_expr_for_genes_with_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
    )

    #################
    # Unused fields #
    #################

    # These fields were not in the 2.1.1 browser Hail table

    ds = ds.drop(
        "adj_biallelic_rank",
        "adj_biallelic_singleton_rank",
        "adj_rank",
        "adj_singleton_rank",
        "biallelic_rank",
        "biallelic_singleton_rank",
        "info_DP",
        "mills",
        "n_nonref",
        "omni",
        "qd",
        "rank",
        "score",
        "singleton_rank",
        "singleton",
        "was_split",
    )

    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ########
    # Keys #
    ########

    # Drop key fields
    ds = ds.key_by().drop("locus", "alleles")

    return ds
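
For context, `get_freq_index_tree` is called above but not shown. A minimal sketch of what such a helper could look like, mirroring the faf parsing logic in this function (the "<subset>" / "<subset>_<pop>" key format and the skipping of "_raw" entries are assumptions, not the actual gnomAD helper):

import collections

def get_freq_index_tree(freq_index_dict):
    # Hypothetical reconstruction: group freq-array indices by subset,
    # mapping each population (or "total" for the whole subset) to its index.
    tree = collections.defaultdict(dict)
    for key, index in freq_index_dict.items():
        if key.endswith("_raw"):
            continue  # raw entries are looked up directly in the caller
        labels = key.split("_")
        if labels[0] == "non":  # rebuild subset names such as "non_topmed"
            labels = ["_".join(labels[0:2])] + labels[2:]
        if len(labels) == 2:
            subset, pop = labels
            tree[subset][pop] = index
        elif len(labels) == 1:
            tree[labels[0]]["total"] = index
        # other keys (e.g. sex-specific entries) would need extra handling
    return tree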
Example #31
print("\n=== Processing ===")
mt = mt.annotate_rows(
    sortedTranscriptConsequences=get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root=mt.vep))

mt = mt.annotate_rows(
    main_transcript=get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences))

mt = mt.annotate_rows(gene_ids=get_expr_for_vep_gene_ids_set(
    vep_transcript_consequences_root=mt.sortedTranscriptConsequences))

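# sort so that values with a leading underscore come last ("^_" -> "z" rewrites only the sort key)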
review_status_str = hl.delimit(
    hl.sorted(hl.array(hl.set(mt.info.CLNREVSTAT)),
              key=lambda s: s.replace("^_", "z")))

mt = mt.select_rows(
    allele_id=mt.info.ALLELEID,
    alt=get_expr_for_alt_allele(mt),
    chrom=get_expr_for_contig(mt.locus),
    clinical_significance=hl.delimit(
        hl.sorted(hl.array(hl.set(mt.info.CLNSIG)),
                  key=lambda s: s.replace("^_", "z"))),
    domains=get_expr_for_vep_protein_domains_set(
        vep_transcript_consequences_root=mt.vep.transcript_consequences),
    gene_ids=mt.gene_ids,
    gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
        vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,