Пример #1
0
    def test_liftover_strand(self):
        """Verify ``hl.liftover`` output when ``include_strand=True``.

        Registers the chr20 GRCh37 -> GRCh38 chain resource, then checks:

        * a locus whose expected result has ``is_negative_strand=False``;
        * a locus interval whose expected result has
          ``is_negative_strand=True``;
        * an interval for which evaluating the liftover must raise
          ``FatalError``.
        """
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'),
                            'GRCh38')

        # The liftover is registered on the shared GRCh37 reference object, so
        # it must be removed even when an assertion fails; otherwise later
        # tests that expect no liftover to be registered would break.
        try:
            self.assertEqual(
                hl.eval(
                    hl.liftover(hl.locus('20', 60001, 'GRCh37'),
                                'GRCh38',
                                include_strand=True)),
                hl.eval(
                    hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'),
                              is_negative_strand=False)))

            self.assertEqual(
                hl.eval(
                    hl.liftover(hl.locus_interval('20', 37007582, 37007586,
                                                  True, True, 'GRCh37'),
                                'GRCh38',
                                include_strand=True)),
                hl.eval(
                    hl.struct(result=hl.locus_interval('chr12', 32563117,
                                                       32563121, True, True,
                                                       'GRCh38'),
                              is_negative_strand=True)))

            with self.assertRaises(FatalError):
                hl.eval(
                    hl.liftover(
                        hl.parse_locus_interval('1:10000-10000',
                                                reference_genome='GRCh37'),
                        'GRCh38'))
        finally:
            grch37.remove_liftover("GRCh38")
Пример #2
0
    def test_reference_genome_liftover(self):
        """Exercise liftover registration and locus/interval liftover results.

        Registers chr20 chain resources in both directions between GRCh37 and
        GRCh38, then checks:

        * a GRCh37 -> GRCh38 -> GRCh37 round trip on ``sample.vcf`` loci;
        * a table of GRCh37 loci against their expected GRCh38 loci, where a
          missing expectation means the liftover result must be missing;
        * that a lifted-over locus can be used as a table key;
        * a table of GRCh37 intervals against their expected GRCh38 intervals.
        """
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

        # Both liftovers live on shared reference objects; always unregister
        # them so a failing assertion cannot leak state into other tests.
        try:
            grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
            grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
            self.assertTrue(grch37.has_liftover('GRCh38'))
            self.assertTrue(grch38.has_liftover('GRCh37'))

            ds = hl.import_vcf(resource('sample.vcf'))
            t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
            self.assertTrue(t.all(t.locus == t.liftover))

            null_locus = hl.null(hl.tlocus('GRCh38'))

            # Rows whose expected GRCh38 locus is missing must lift over to missing.
            rows = [
                {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
                {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
                {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
                {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
                {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
                {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
                {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
            ]
            schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                          hl.liftover(t.l37, 'GRCh38') == t.l38,
                                          hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

            # Six of the rows above carry a defined expected GRCh38 locus.
            t = t.filter(hl.is_defined(t.l38))
            self.assertEqual(t.count(), 6)

            # Keying by the GRCh38 locus and forcing evaluation must succeed.
            t = t.key_by('l38')
            t.count()
            self.assertEqual(list(t.key), ['l38'])

            null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
            rows = [
                {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
                {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
                 'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
            ]
            schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
        finally:
            # Guarded so that a failure part-way through registration does not
            # trigger a second error while cleaning up.
            if grch37.has_liftover('GRCh38'):
                grch37.remove_liftover("GRCh38")
            if grch38.has_liftover('GRCh37'):
                grch38.remove_liftover("GRCh37")
Пример #3
0
    def test_reference_genome_liftover(self):
        """Exercise liftover registration and locus/interval liftover results.

        Registers chr20 chain resources in both directions between GRCh37 and
        GRCh38, checks a round trip on ``sample.vcf`` loci, compares tables of
        GRCh37 loci and intervals with their expected GRCh38 counterparts, and
        confirms that a lifted-over locus can serve as a table key.
        """
        grch37 = hl.get_reference('GRCh37')
        grch38 = hl.get_reference('GRCh38')

        self.assertTrue(not grch37.has_liftover('GRCh38') and not grch38.has_liftover('GRCh37'))

        # Cleanup runs in ``finally`` so that a failed assertion cannot leave
        # liftovers registered on the shared reference objects.
        try:
            grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')
            grch38.add_liftover(resource('grch38_to_grch37_chr20.over.chain.gz'), 'GRCh37')
            self.assertTrue(grch37.has_liftover('GRCh38') and grch38.has_liftover('GRCh37'))

            ds = hl.import_vcf(resource('sample.vcf'))
            t = ds.annotate_rows(liftover=hl.liftover(hl.liftover(ds.locus, 'GRCh38'), 'GRCh37')).rows()
            self.assertTrue(t.all(t.locus == t.liftover))

            null_locus = hl.null(hl.tlocus('GRCh38'))

            # A missing ``l38`` means the liftover of ``l37`` must be missing.
            rows = [
                {'l37': hl.locus('20', 1, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60000, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 60001, 'GRCh37'), 'l38': hl.locus('chr20', 79360, 'GRCh38')},
                {'l37': hl.locus('20', 278686, 'GRCh37'), 'l38': hl.locus('chr20', 298045, 'GRCh38')},
                {'l37': hl.locus('20', 278687, 'GRCh37'), 'l38': hl.locus('chr20', 298046, 'GRCh38')},
                {'l37': hl.locus('20', 278688, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278689, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278690, 'GRCh37'), 'l38': null_locus},
                {'l37': hl.locus('20', 278691, 'GRCh37'), 'l38': hl.locus('chr20', 298047, 'GRCh38')},
                {'l37': hl.locus('20', 37007586, 'GRCh37'), 'l38': hl.locus('chr12', 32563117, 'GRCh38')},
                {'l37': hl.locus('20', 62965520, 'GRCh37'), 'l38': hl.locus('chr20', 64334167, 'GRCh38')},
                {'l37': hl.locus('20', 62965521, 'GRCh37'), 'l38': null_locus}
            ]
            schema = hl.tstruct(l37=hl.tlocus(grch37), l38=hl.tlocus(grch38))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.cond(hl.is_defined(t.l38),
                                          hl.liftover(t.l37, 'GRCh38') == t.l38,
                                          hl.is_missing(hl.liftover(t.l37, 'GRCh38')))))

            # Six rows above have a defined expected GRCh38 locus.
            t = t.filter(hl.is_defined(t.l38))
            self.assertEqual(t.count(), 6)

            t = t.key_by('l38')
            t.count()  # force evaluation of the re-keyed table
            self.assertEqual(list(t.key), ['l38'])

            null_locus_interval = hl.null(hl.tinterval(hl.tlocus('GRCh38')))
            rows = [
                {'i37': hl.locus_interval('20', 1, 60000, True, False, 'GRCh37'), 'i38': null_locus_interval},
                {'i37': hl.locus_interval('20', 60001, 82456, True, True, 'GRCh37'),
                 'i38': hl.locus_interval('chr20', 79360, 101815, True, True, 'GRCh38')}
            ]
            schema = hl.tstruct(i37=hl.tinterval(hl.tlocus(grch37)), i38=hl.tinterval(hl.tlocus(grch38)))
            t = hl.Table.parallelize(rows, schema)
            self.assertTrue(t.all(hl.liftover(t.i37, 'GRCh38') == t.i38))
        finally:
            # Guarded: registration itself may have failed part-way through.
            if grch37.has_liftover('GRCh38'):
                grch37.remove_liftover("GRCh38")
            if grch38.has_liftover('GRCh37'):
                grch38.remove_liftover("GRCh37")
Пример #4
0
    def test_liftover_strand(self):
        """Verify ``hl.liftover`` output when ``include_strand=True``.

        Checks one locus whose expected result has
        ``is_negative_strand=False`` and one locus interval whose expected
        result has ``is_negative_strand=True``.
        """
        grch37 = hl.get_reference('GRCh37')
        grch37.add_liftover(resource('grch37_to_grch38_chr20.over.chain.gz'), 'GRCh38')

        # Always unregister the liftover from the shared GRCh37 reference, even
        # when an assertion fails, so other tests start from a clean state.
        try:
            self.assertEqual(hl.eval(hl.liftover(hl.locus('20', 60001, 'GRCh37'), 'GRCh38', include_strand=True)),
                             hl.eval(hl.struct(result=hl.locus('chr20', 79360, 'GRCh38'), is_negative_strand=False)))

            self.assertEqual(hl.eval(hl.liftover(hl.locus_interval('20', 37007582, 37007586, True, True, 'GRCh37'),
                                                 'GRCh38', include_strand=True)),
                             hl.eval(hl.struct(result=hl.locus_interval('chr12', 32563117, 32563121, True, True, 'GRCh38'),
                                               is_negative_strand=True)))
        finally:
            grch37.remove_liftover("GRCh38")
Пример #5
0
def _import_purcell_5k(path) -> hl.Table:
    """Import GRCh37 locus intervals and return them keyed by GRCh38 locus.

    Each interval's start and end are lifted over to GRCh38. Intervals are
    kept only when both lifted endpoints land on the ``chr``-prefixed
    equivalent of their original contig.

    :param path: Path passed to ``hl.import_locus_intervals``.
    :return: Table keyed by ``locus`` (the lifted-over interval start), with
        the original GRCh37 start in ``locus_b37``.
    """
    p5k = hl.import_locus_intervals(path, reference_genome='GRCh37')
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    # Registering the same liftover twice is avoided by checking first.
    if not rg37.has_liftover('GRCh38'):
        rg37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    # ``end`` must be derived from the interval end: the filter below compares
    # it against ``interval.end.contig``.
    p5k = p5k.annotate(start=hl.liftover(p5k.interval.start, 'GRCh38'),
                       end=hl.liftover(p5k.interval.end, 'GRCh38'))
    p5k = p5k.filter((p5k.start.contig == 'chr' + p5k.interval.start.contig)
                     & (p5k.end.contig == 'chr' + p5k.interval.end.contig))
    # Drop the interval key before selecting, so only the two loci remain.
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by('locus')
Пример #6
0
def _import_purcell_5k(path) -> hl.Table:
    """Import GRCh37 locus intervals and return them keyed by GRCh38 locus.

    Each interval's start and end are lifted over to GRCh38. Intervals are
    kept only when both lifted endpoints land on the ``chr``-prefixed
    equivalent of their original contig.

    :param path: Path passed to ``hl.import_locus_intervals``.
    :return: Table keyed by ``locus`` (the lifted-over interval start), with
        the original GRCh37 start in ``locus_b37``.
    """
    p5k = hl.import_locus_intervals(path, reference_genome="GRCh37")
    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")
    # Registering the same liftover twice is avoided by checking first.
    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover("references/grch37_to_grch38.over.chain.gz", rg38)
    # ``end`` must be derived from the interval end: the filter below compares
    # it against ``interval.end.contig``.
    p5k = p5k.annotate(
        start=hl.liftover(p5k.interval.start, "GRCh38"),
        end=hl.liftover(p5k.interval.end, "GRCh38"),
    )
    p5k = p5k.filter((p5k.start.contig == "chr" + p5k.interval.start.contig)
                     & (p5k.end.contig == "chr" + p5k.interval.end.contig))
    # Drop the interval key before selecting, so only the two loci remain.
    p5k = p5k.key_by()
    p5k = p5k.select(locus=p5k.start, locus_b37=p5k.interval.start)
    return p5k.key_by("locus")
Пример #7
0
def get_liftover_v2_qc_mt(data_type: str,
                          ld_pruned: bool,
                          release_only: bool = False,
                          overwrite: bool = False) -> hl.MatrixTable:
    """
    Return the sample-QC MatrixTable on build 38, creating it by liftover if needed.

    The GRCh38 table at ``qc_mt_path(data_type, ld_pruned, 'GRCh38')`` is read
    when it exists and ``overwrite`` is false. Otherwise the table at
    ``qc_mt_path(data_type, ld_pruned=ld_pruned)`` is lifted over, re-keyed by
    the new locus and alleles, and checkpointed to the GRCh38 path.

    Note: sites where the locus changes chromosome are discarded.

    :param data_type: Forwarded to ``qc_mt_path`` and ``get_gnomad_meta``.
    :param ld_pruned: Forwarded to ``qc_mt_path``.
    :param release_only: If true, keep only columns whose ``release`` metadata
        field is true.
    :param overwrite: If true, rebuild and overwrite the GRCh38 table.
    :return: MatrixTable keyed by GRCh38 ``locus`` and ``alleles``.
    """
    lifted_path = qc_mt_path(data_type, ld_pruned, 'GRCh38')

    if overwrite or not hl.hadoop_exists(lifted_path):
        mt = hl.read_matrix_table(qc_mt_path(data_type, ld_pruned=ld_pruned))
        # Called for its effect on the source table's reference genome;
        # presumably it registers the liftover chain (helper not shown here).
        get_liftover_genome(mt)
        mt = mt.key_rows_by()
        mt = mt.transmute_rows(locus=hl.liftover(mt.locus, 'GRCh38'),
                               locus37=mt.locus)
        # Keep only sites that stay on the same chromosome after liftover.
        same_contig = mt.locus.contig == 'chr' + mt.locus37.contig
        mt = mt.filter_rows(same_contig)
        mt = mt.key_rows_by(locus=mt.locus, alleles=mt.alleles)
        mt = mt.checkpoint(lifted_path, overwrite=overwrite)
    else:
        mt = hl.read_matrix_table(lifted_path)

    if not release_only:
        return mt

    meta = get_gnomad_meta(data_type)
    return mt.filter_cols(meta[mt.col_key].release)
def intersect_target_ref(ref_mt_filt,
                         snp_list,
                         grch37_or_grch38,
                         intersect_out,
                         overwrite: bool = False):
    """Filter a reference MatrixTable to the variants in ``snp_list``.

    ``snp_list`` is keyed by locus and alleles built from its ``chr``,
    ``pos``, ``ref`` and ``alt`` fields. When the list is on GRCh37 its loci
    are lifted over to GRCh38 first, and variants that fail liftover are
    dropped. The filtered table is repartitioned and checkpointed to
    ``intersect_out``. No value is returned in the code shown.

    :param ref_mt_filt: Path of the MatrixTable to read.
    :param snp_list: Table with ``chr``, ``pos``, ``ref`` and ``alt`` fields.
    :param grch37_or_grch38: Build of ``snp_list``, matched
        case-insensitively. Any other value leaves the table unfiltered.
    :param intersect_out: Checkpoint output path.
    :param overwrite: Overwrite the checkpoint; when false an existing
        checkpoint is read instead.
    """

    def _key_by_variant(table, build):
        # Key a SNP table by (locus, alleles) on the given reference build.
        return table.key_by(locus=hl.locus(hl.str(table.chr),
                                           hl.int(table.pos),
                                           reference_genome=build),
                            alleles=[table.ref, table.alt])

    mt = hl.read_matrix_table(ref_mt_filt)
    build = grch37_or_grch38.lower()

    if build == 'grch38':
        keyed = _key_by_variant(snp_list, 'GRCh38')
        mt = mt.filter_rows(hl.is_defined(keyed[mt.row_key]))
    elif build == 'grch37':
        keyed = _key_by_variant(snp_list, 'GRCh37')
        # Lift the SNP list over to GRCh38, then filter to SNPs in mt.
        # load_liftover() is called for its side effect; its results are unused.
        _rg37, _rg38 = load_liftover()

        lifted = keyed.annotate(new_locus=hl.liftover(keyed.locus, 'GRCh38'))
        lifted = lifted.filter(hl.is_defined(lifted.new_locus))
        lifted = lifted.key_by(locus=lifted.new_locus, alleles=lifted.alleles)
        mt = mt.filter_rows(hl.is_defined(lifted[mt.row_key]))

    mt = mt.repartition(5000)
    mt = mt.checkpoint(intersect_out,
                       overwrite=overwrite,
                       _read_if_exists=not overwrite)
Пример #9
0
def main(args):
    """Export one VCF per genotyping chip, restricted to that chip's loci.

    Reads the MatrixTable at ``args.allreads_prefix + '.mt'`` and the manifest
    at ``args.chip_loci``. Each non-blank manifest line holds a chip name and
    the path of a table with ``chr`` and ``pos`` columns. Chip loci are lifted
    over to GRCh38 and used to filter the rows of the full dataset, which is
    then exported as a block-gzipped VCF.

    :param args: Namespace providing ``allreads_prefix`` and ``chip_loci``.
    """
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    # Only autosomes and the sex chromosomes are kept from the chip tables.
    valid_contigs = [str(contig) for contig in range(1, 23)] + ['X', 'Y']

    # The context manager closes the manifest handle even when a chip fails.
    with hl.hadoop_open(args.chip_loci) as chips:
        for line in chips:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline) in the manifest.
                continue
            chip_name, chip_path = fields[0], fields[1]

            # Raw string: the pattern is a regex and ``\[`` is not a valid
            # Python string escape.
            chip_pos = hl.import_table(chip_path,
                                       filter=r'\[Controls\]',
                                       skip_blank_lines=True)
            chip_pos = chip_pos.filter(
                hl.array(valid_contigs).contains(chip_pos.chr))
            chip_pos = chip_pos.key_by(
                locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

            # liftover chip position info
            chip_pos = chip_pos.annotate(
                new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
            chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
            chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

            # filter full vcf to sites in genotype data
            geno_vcf = full_vcf.filter_rows(
                hl.is_defined(chip_pos[full_vcf.locus]))
            hl.export_vcf(
                geno_vcf,
                'gs://neurogap/high_coverage/NeuroGap_30x_' + chip_name +
                '.vcf.bgz')
Пример #10
0
def get_gnomad_ld_pruned_mt(genome_version="GRCh38"):
    """Return the LD-pruned gnomAD MatrixTable on the requested build.

    For GRCh37 the stored table is read directly. For GRCh38 a lifted-over
    copy is created once (if its path does not exist yet) by lifting over the
    GRCh37 loci, dropping sites that fail liftover, and re-keying by the new
    locus and alleles; the stored GRCh38 table is then read and returned.

    :param genome_version: ``"GRCh37"`` or ``"GRCh38"``.
    :return: The MatrixTable read from the path for that build.
    :raises ValueError: If ``genome_version`` is neither supported build.
    """
    gnomad_ld_pruned_hg37_path = "gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt"
    gnomad_ld_pruned_hg38_path = "gs://seqr-bw/ref/GRCh38/gnomad_ld_pruned.mt"

    if genome_version == "GRCh37":
        return hl.read_matrix_table(gnomad_ld_pruned_hg37_path)
    elif genome_version == "GRCh38":
        if not file_exists(gnomad_ld_pruned_hg38_path):

            grch37 = hl.get_reference('GRCh37')
            # Adding the same chain twice is an error, so check explicitly
            # instead of swallowing every exception; genuine failures (such as
            # an unreadable chain file) now surface.
            if not grch37.has_liftover('GRCh38'):
                grch37.add_liftover("gs://hail-common/references/grch37_to_grch38.over.chain.gz", 'GRCh38')

            gnomad_ld_pruned_mt = hl.read_matrix_table(gnomad_ld_pruned_hg37_path)
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.annotate_rows(liftover_locus=hl.liftover(gnomad_ld_pruned_mt.locus, 'GRCh38'))
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.filter_rows(hl.is_defined(gnomad_ld_pruned_mt.liftover_locus))
            # Keep the original locus under a new name; the lifted locus becomes ``locus``.
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.rename({'locus': 'locus_grch37'}).rename({'liftover_locus': 'locus'})
            gnomad_ld_pruned_mt = gnomad_ld_pruned_mt.key_rows_by('locus', 'alleles')
            gnomad_ld_pruned_mt.write(gnomad_ld_pruned_hg38_path)

        return hl.read_matrix_table(gnomad_ld_pruned_hg38_path)
    else:
        raise ValueError(f"Invalid genome version: {genome_version}")
Пример #11
0
def liftover_to_grch38(input_type: str = None,
                       dirname: str = None,
                       basename: str = None):
    """Lift a dataset over from GRCh37 to GRCh38 and return the written result.

    The input is loaded with ``read_infile``. Rows whose locus fails liftover
    or maps to the negative strand are dropped; the remainder is re-keyed by
    the lifted locus and alleles, written next to the input, and read back.

    :param input_type: Forwarded to ``read_infile``.
    :param dirname: Directory prefix used for the input and the output path.
    :param basename: Base name used for the input and the output path.
    :return: MatrixTable read back from ``{dirname}{basename}.liftover.grch38.mt``.
    """
    lifted_over = f'{dirname}{basename}.liftover.grch38.mt'
    print('\nLifting over to GRCh38')
    mt = read_infile(input_type=input_type, dirname=dirname, basename=basename)

    chain_file = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'
    source_rg = hl.get_reference('GRCh37')
    target_rg = hl.get_reference('GRCh38')
    source_rg.add_liftover(chain_file, target_rg)

    lifted_locus = hl.liftover(mt.locus, 'GRCh38', include_strand=True)
    mt = mt.annotate_rows(new_locus=lifted_locus, old_locus=mt.locus)

    # Keep only sites that lifted over and stayed on the positive strand.
    lifted_ok = hl.is_defined(mt.new_locus)
    on_positive_strand = ~mt.new_locus.is_negative_strand
    mt = mt.filter_rows(lifted_ok & on_positive_strand)

    mt = mt.key_rows_by(locus=mt.new_locus.result, alleles=mt.alleles)

    print(f'\nWriting out data lifted-over to GRCh38 to: {lifted_over}')
    mt.write(lifted_over)

    return hl.read_matrix_table(lifted_over)
Пример #12
0
def liftover_expr(
        locus: hl.expr.LocusExpression, alleles: hl.expr.ArrayExpression,
        destination_ref: hl.ReferenceGenome) -> hl.expr.StructExpression:
    """Build a struct holding the lifted-over locus and strand-corrected alleles.

    :param locus: Input locus.
    :param alleles: Input alleles.
    :param destination_ref: Reference genome to lift over to.
    :return: Struct with ``locus`` (the liftover result) and ``alleles``
        (reverse-complemented when the liftover is on the negative strand).
    """
    lifted = hl.liftover(locus, destination_ref, include_strand=True)

    def _orient(allele):
        # Alleles on the negative strand are reverse-complemented.
        return hl.if_else(lifted.is_negative_strand,
                          hl.reverse_complement(allele),
                          allele)

    return hl.struct(locus=lifted.result, alleles=alleles.map(_orient))
Пример #13
0
 def add_37_coordinates(mt):
     """Annotate a GRCh38 MatrixTable with GRCh37 coordinates via Hail liftover.

     :param mt: MatrixTable from VCF
     :return: MatrixTable with the lifted-over locus in row field ``rg37_locus``
     """
     chain_path = 'gs://hail-common/references/grch38_to_grch37.over.chain.gz'
     source_rg = hl.get_reference('GRCh38')
     target_rg = hl.get_reference('GRCh37')
     # Register the GRCh38 -> GRCh37 chain before building the expression.
     source_rg.add_liftover(chain_path, target_rg)
     return mt.annotate_rows(rg37_locus=hl.liftover(mt.locus, 'GRCh37'))
Пример #14
0
def main():
    """Lift a GRCh37 PLINK dataset over to GRCh38 and export it as PLINK.

    The output uses a custom copy of GRCh38 whose contig names have the
    ``chr`` prefix removed. Rows that fail liftover are dropped.

    :return: 0 on completion.
    """
    # Parse args
    args = parse_args()

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(args.chainfile, rg38)

    # Build a GRCh38 look-alike whose contig names lack the 'chr' prefix.
    stripped_contigs = [name.replace('chr', '') for name in rg38.contigs]
    stripped_lengths = {
        name.replace('chr', ''): length
        for name, length in rg38.lengths.items()
    }
    rg38_custom = hl.ReferenceGenome('rg38_custom', stripped_contigs,
                                     stripped_lengths)

    # Load plink
    plink_prefix = args.in_plink
    mt = hl.import_plink(bed=plink_prefix + '.bed',
                         bim=plink_prefix + '.bim',
                         fam=plink_prefix + '.fam',
                         reference_genome='GRCh37',
                         min_partitions=args.min_partitions)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap the GRCh37 locus for the GRCh38 one, expressed on rg38_custom.
    # The key is dropped first and restored after the locus is replaced.
    mt = mt.key_rows_by()
    custom_locus = hl.locus(mt.contig_GRCh38,
                            mt.locus_GRCh38.position,
                            reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=custom_locus)
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=args.out_plink)

    return 0
def main(args):
    """Merge per-sample gencove VCFs and/or lift the merged table to GRCh38.

    When ``args.merge_vcf`` is set, every manifest row at depth ``'1'`` with
    ``gencove_qc == 'PASS'`` is imported and column-merged, and the result is
    written as a GRCh37 MatrixTable. When ``args.liftover`` is set, that table
    is read back, lifted over to GRCh38, re-keyed and written.

    :param args: Namespace providing ``merge_vcf``, ``liftover`` and
        ``overwrite``.
    :raises ValueError: If merging was requested but no manifest row matched.
    """
    manifest_path = 'gs://neurogap/high_coverage/gencove/fastq_keep.txt'

    # liftover gencove data
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    # Only the '1' depth is processed.
    for cov in ['1']:
        comb = None

        # The context manager closes the manifest handle even on failure.
        with hl.hadoop_open(manifest_path) as all_vcf:
            header = all_vcf.readline().split()
            header_index = {name: idx for idx, name in enumerate(header)}

            if args.merge_vcf:
                for line in all_vcf:
                    # read and merge
                    line = line.split()
                    if line[header_index['depth']] == cov and line[
                            header_index['gencove_qc']] == 'PASS':
                        vcf = hl.import_vcf(line[header_index['path']],
                                            force_bgz=True,
                                            reference_genome='GRCh37',
                                            min_partitions=100)
                        if comb is None:
                            comb = vcf
                        else:
                            comb = comb.union_cols(vcf)

        if args.merge_vcf:
            if comb is None:
                # Fail with a clear message instead of an AttributeError on None.
                raise ValueError(
                    f'No PASS samples at depth {cov} found in {manifest_path}')

            # write out mt
            comb.write('gs://neurogap/high_coverage/gencove/merge_' + cov +
                       '_hg19.mt',
                       overwrite=args.overwrite)

        if args.liftover:
            comb = hl.read_matrix_table(
                'gs://neurogap/high_coverage/gencove/merge_' + cov +
                '_hg19.mt')

            comb = comb.annotate_rows(
                new_locus=hl.liftover(comb.locus, 'GRCh38'))
            # Drop sites that failed liftover before re-keying.
            comb = comb.filter_rows(hl.is_defined(comb.new_locus))
            comb = comb.key_rows_by(locus=comb.new_locus, alleles=comb.alleles)

            # write out mt
            comb.write('gs://neurogap/high_coverage/gencove/merge_' + cov +
                       '_grch38.mt',
                       overwrite=args.overwrite)
Пример #16
0
def lift_data(
    t: Union[hl.MatrixTable, hl.Table],
    gnomad: bool,
    data_type: str,
    path: str,
    rg: hl.genetics.ReferenceGenome,
    overwrite: bool,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Lift an input Table or MatrixTable over to another reference build.

    Sites that fail liftover are counted, logged and removed. The result is
    re-keyed by the lifted locus and alleles, then checkpointed.

    :param t: Table or MatrixTable
    :param gnomad: Whether data is gnomAD data
    :param data_type: Data type (exomes or genomes for gnomAD; not used otherwise)
    :param path: Path to input Table/MatrixTable (if data is not gnomAD data)
    :param rg: Reference genome to lift over to
    :param overwrite: Whether to overwrite data
    :return: Table or MatrixTable with liftover annotations
    """
    # Table and MatrixTable expose differently named methods for the same
    # row-level operations, so the kind is determined once up front.
    is_table = isinstance(t, hl.Table)

    logger.info("Annotating input with liftover coordinates")
    annotations = {
        "new_locus": hl.liftover(t.locus, rg, include_strand=True),
        "old_locus": t.locus,
    }
    if is_table:
        t = t.annotate(**annotations)
    else:
        t = t.annotate_rows(**annotations)

    failed_expr = hl.agg.count_where(hl.is_missing(t.new_locus))
    if is_table:
        num_no_target = t.aggregate(failed_expr)
    else:
        num_no_target = t.aggregate_rows(failed_expr)

    logger.info(f"Filtering out {num_no_target} sites that failed to liftover")
    lifted_ok = hl.is_defined(t.new_locus)
    if is_table:
        t = t.filter(lifted_ok)
    else:
        t = t.filter_rows(lifted_ok)

    new_key = {"locus": t.new_locus.result, "alleles": t.alleles}
    if is_table:
        t = t.key_by(**new_key)
    else:
        t = t.key_rows_by(**new_key)

    logger.info("Writing out lifted over data")
    checkpoint_path = get_checkpoint_path(gnomad, data_type, path, is_table)
    return t.checkpoint(checkpoint_path, overwrite=overwrite)
Пример #17
0
def liftover_expr(
    locus: hl.expr.LocusExpression,
    alleles: hl.expr.ArrayExpression,
    destination_reference: hl.ReferenceGenome,
) -> hl.expr.StructExpression:
    """
    Generate a struct expression describing the liftover of a variant.

    Struct contains:
        - new_locus: Liftover coordinates
        - new_alleles: Liftover alleles (reverse-complemented when the
            liftover is on the negative strand)
        - original_locus: Locus prior to liftover
        - original_alleles: Alleles prior to liftover
        - locus_fail_liftover: Whether the locus failed liftover
        - ref_allele_mismatch: Whether the allele at index 0 of the lifted
            over alleles differs from the sequence context of the lifted
            over locus

    :param locus: Input locus.
    :param alleles: Input alleles.
    :param destination_reference: Desired reference genome build for liftover.
    :return: Struct containing expressions for lifted over locus/alleles as well as original locus/alleles.
    """
    lifted = hl.liftover(locus, destination_reference, include_strand=True)

    def _orient(allele):
        # Negative-strand liftovers need the reverse complement of each allele.
        return hl.if_else(
            lifted.is_negative_strand, hl.reverse_complement(allele), allele
        )

    new_alleles = alleles.map(_orient)
    new_locus = lifted.result
    # Compare the destination reference sequence with the lifted ref allele.
    ref_mismatch = new_locus.sequence_context() != new_alleles[0]

    return hl.struct(
        new_locus=new_locus,
        new_alleles=new_alleles,
        original_locus=locus,
        original_alleles=alleles,
        locus_fail_liftover=hl.is_missing(lifted),
        ref_allele_mismatch=ref_mismatch,
    )
def query(output):  # pylint: disable=too-many-locals
    """Lift the gnomAD v2 loadings table over to GRCh38 and write it out.

    The table at ``GNOMAD_V2_LOADINGS`` is annotated with the lifted locus
    (``liftover``) and the original locus (``old_locus``), re-keyed by the
    lifted locus, and written under ``output``, overwriting any existing table.

    :param output: Output directory prefix.
    """

    hl.init(default_reference='GRCh38')

    gnomad_loadings_path = f'{output}/gnomad_loadings_90k_liftover.ht'

    # Register the GRCh37 -> GRCh38 chain before building liftover expressions.
    source_rg = hl.get_reference('GRCh37')
    target_rg = hl.get_reference('GRCh38')
    loadings = hl.read_table(GNOMAD_V2_LOADINGS)
    source_rg.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
        target_rg)

    lifted_locus = hl.liftover(loadings.locus, 'GRCh38', include_strand=False)
    lifted = loadings.annotate(liftover=lifted_locus, old_locus=loadings.locus)
    lifted = lifted.key_by(locus=lifted.liftover)

    # save gnomad loadings
    lifted.write(gnomad_loadings_path, overwrite=True)
Пример #19
0
def main():
    """Downsample one UK Biobank chromosome, lift it to GRCh38, export PLINK.

    Reads the chromosome from ``sys.argv[1]`` and the Spark core count from
    ``sys.argv[2]``. The bgen file is indexed if needed, restricted to the
    listed samples, filtered on minor allele frequency, lifted over to GRCh38
    (using contig names without the ``chr`` prefix) and exported as PLINK.

    :return: 0 on completion.
    """

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Build a GRCh38 look-alike whose contig names lack the 'chr' prefix.
    stripped_contigs = [name.replace('chr', '') for name in rg38.contigs]
    stripped_lengths = {
        name.replace('chr', ''): length
        for name, length in rg38.lengths.items()
    }
    rg38_custom = hl.ReferenceGenome('rg38_custom', stripped_contigs,
                                     stripped_lengths)

    print('Processing chromosome {0}'.format(chrom))

    bgen_path = in_bgen.format(chrom=chrom)

    # Index bgen if not existing
    if not hl.hadoop_exists(bgen_path + '.idx2'):
        # Map zero-padded contig names "01".."09" to "1".."9".
        zero_padded = {"0" + str(num): str(num) for num in range(1, 10)}
        hl.index_bgen(bgen_path,
                      contig_recoding=zero_padded,
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(bgen_path, entry_fields=['GT'], sample_file=in_sample)

    # Load list samples to keep
    keep_table = hl.import_table(to_keep_list,
                                 no_header=True,
                                 impute=False,
                                 types={'f0': hl.tstr})
    samples_to_keep = keep_table.key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF, taken as the smallest allele frequency at each variant.
    mt = hl.variant_qc(mt)
    qc_with_maf = mt.variant_qc.annotate(MAF=hl.min(mt.variant_qc.AF))
    mt = mt.annotate_rows(variant_qc=qc_with_maf)
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap the GRCh37 locus for the GRCh38 one, expressed on rg38_custom.
    # The key is dropped first and restored after the locus is replaced.
    mt = mt.key_rows_by()
    custom_locus = hl.locus(mt.contig_GRCh38,
                            mt.locus_GRCh38.position,
                            reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=custom_locus)
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
Пример #20
0
# Command-line options: which GERP++ dataset to load and onto which build.
parser.add_argument('-d', required=True, choices=['scores', 'elements'], help='GERP++ dataset to load.')
parser.add_argument('-b', required=True, choices=['GRCh37', 'GRCh38'], help='Reference genome build to load.')
args = parser.parse_args()

# The source data is on UCSC hg19, which is registered here as a custom
# reference genome built from its FASTA file and index.
hg19 = hl.ReferenceGenome.from_fasta_file('hg19', 
                                          'gs://hail-datasets-extracted-data/assemblies/ucsc.hg19.fasta.gz', 
                                          'gs://hail-datasets-extracted-data/assemblies/ucsc.hg19.fasta.fai')

if args.d == 'scores':
    name = 'GERP_scores'
    ht = hl.import_table('gs://hail-datasets-extracted-data/GERP++/GERP++_scores.hg19.tsv.bgz',
                         types={'position': hl.tint, 'N': hl.tfloat, 'S': hl.tfloat}, min_partitions=300)
    # Build hg19 loci: add the 'chr' prefix and rename 'MT' to 'M' in the contig.
    ht = ht.annotate(locus=hl.locus('chr' + ht['chromosome'].replace('MT', 'M'), ht['position'], 'hg19'))
    # Lift the loci over to the requested build; only one branch runs per invocation.
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz', 'GRCh37')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh37'))
    if args.b == 'GRCh38':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19ToHg38.over.chain.gz', 'GRCh38')
        ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh38'))
    # Drop rows whose locus failed liftover, then key the table by locus.
    ht = ht.filter(hl.is_defined(ht['locus']))
    ht = ht.select('locus', 'N', 'S')
    ht = ht.key_by('locus')

if args.d == 'elements':
    name = 'GERP_elements'
    ht = hl.import_table('gs://hail-datasets-extracted-data/GERP++/GERP++_elements.hg19.tsv.bgz',
                         types={'start': hl.tint, 'end': hl.tint, 'S': hl.tfloat, 'p_value': hl.tfloat})
    # Each element becomes an hg19 interval between its start and end loci.
    # NOTE(review): unlike the 'scores' branch, no 'chr' prefix is added to the
    # contig here - confirm the elements file already uses hg19 contig names.
    ht = ht.annotate(interval=hl.interval(hl.locus(ht['chromosome'], ht['start'], 'hg19'),
                                          hl.locus(ht['chromosome'], ht['end'], 'hg19')))
    if args.b == 'GRCh37':
        hg19.add_liftover('gs://hail-datasets-extracted-data/assemblies/hg19tob37.chain.gz', 'GRCh37')
Пример #21
0
import hail as hl

# Lift the row intervals of the GTEx v7 exon read-count matrix table from
# GRCh37 to GRCh38 and write the result as a new matrix table.
input_path = 'gs://hail-datasets/hail-data/gtex_v7_exon_read_counts.GRCh37.mt'
output_path = 'gs://hail-datasets/hail-data/gtex_v7_exon_read_counts.GRCh38.liftover.mt'
chain_path = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'

mt = hl.read_matrix_table(input_path)

# Register the GRCh37 -> GRCh38 chain file before calling hl.liftover.
b37 = hl.get_reference('GRCh37')
b37.add_liftover(chain_path, 'GRCh38')

lifted_interval = hl.liftover(mt.interval, 'GRCh38')
mt = mt.annotate_rows(interval=lifted_interval)

mt.describe()
mt.write(output_path, overwrite=True)
        'position': hl.tint,
        'N': hl.tfloat,
        'S': hl.tfloat
    })

# Register the hg19 assembly (FASTA + index) and the chain file that maps
# hg19 coordinates onto GRCh37.
hg19 = hl.ReferenceGenome.from_fasta_file(
    'hg19', 'gs://hail-datasets/raw-data/assemblies/ucsc.hg19.fasta.gz',
    'gs://hail-datasets/raw-data/assemblies/ucsc.hg19.fasta.fai')
hg19.add_liftover('gs://hail-datasets/raw-data/assemblies/hg19tob37.chain.gz',
                  'GRCh37')

# Build an hg19 locus for every row and persist the table.
ht = ht.annotate(locus=hl.locus(ht.contig, ht.position, 'hg19'))
ht.write('hdfs:///tmp/tmp.ht', overwrite=True)

# NOTE(review): the table read back below is not the path written just
# above ('hdfs:///tmp/tmp.ht'). Presumably both were meant to be the same
# checkpoint file; confirm which path is intended before running.
ht = hl.read_table('hdfs:///tmp/gerp_scores.hg19.ht')

# Lift loci to GRCh37 and discard rows that could not be lifted.
ht = ht.annotate(locus=hl.liftover(ht.locus, 'GRCh37'))
ht = ht.filter(hl.is_defined(ht.locus), keep=True)

# Keep the locus together with both GERP++ score columns. The previous
# selection listed N twice, which left S out of the output.
ht = ht.select(ht.locus, ht.N, ht.S)
ht = ht.key_by(ht.locus)

n_rows = ht.count()
n_partitions = ht.n_partitions()

# Record dataset metadata in the table globals.
ht = ht.annotate_globals(name=name,
                         version=version,
                         reference_genome=reference_genome,
                         n_rows=n_rows,
                         n_partitions=n_partitions)

ht.describe()
ht.write('gs://hail-datasets/hail-data/{n}.{rg}.ht'.format(
Пример #23
0
import hail as hl

# Lift the GTEx v7 significant eQTL associations from GRCh37 to GRCh38,
# re-keying rows on the lifted locus.
input_path = 'gs://hail-datasets/hail-data/gtex_v7_eqtl_significant_associations.GRCh37.mt'
output_path = 'gs://hail-datasets/hail-data/gtex_v7_eqtl_significant_associations.GRCh38.liftover.mt'
chain_path = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'

mt = hl.read_matrix_table(input_path)
mt.describe()

# Register the GRCh37 -> GRCh38 chain file before calling hl.liftover.
b37 = hl.get_reference('GRCh37')
b37.add_liftover(chain_path, 'GRCh38')

# Annotate the lifted locus and drop rows where it is missing.
lifted_locus = hl.liftover(mt.locus, 'GRCh38')
mt = mt.annotate_rows(liftover_locus=lifted_locus)
mt = mt.filter_rows(hl.is_defined(mt.liftover_locus))

# Re-key the rows with the lifted locus as the partition key, then replace
# the original locus field with the lifted one.
row_key_fields = ('liftover_locus', 'alleles', 'gene_id')
mt = mt.partition_rows_by(['liftover_locus'], *row_key_fields)
mt = mt.drop('locus')
mt = mt.rename({'liftover_locus': 'locus'})

mt.describe()
mt.write(output_path, overwrite=True)
Пример #24
0
# Target reference genome build requested on the command line.
build = args.b

# Import the GRCh37 DANN score table with typed position and score columns.
dann_types = {'position': hl.tint, 'DANN_score': hl.tfloat}
ht = hl.import_table(f'{raw_data_root}/DANN_GRCh37.tsv.bgz', types=dann_types)

# Build a GRCh37 locus and a [ref, alt] alleles array for every row.
grch37_locus = hl.locus(ht['chromosome'], ht['position'], 'GRCh37')
ref_alt = hl.array([ht['ref'], ht['alt']])
ht = ht.annotate(locus=grch37_locus, alleles=ref_alt)

if build == 'GRCh38':
    # Register the chain file, lift each locus to GRCh38 and keep only the
    # rows whose lifted locus is defined.
    b37 = hl.get_reference('GRCh37')
    b37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
        'GRCh38')
    ht = ht.annotate(locus=hl.liftover(ht['locus'], 'GRCh38'))
    ht = ht.filter(hl.is_defined(ht['locus']))

# Row and partition counts are taken before the table is re-keyed.
n_rows = ht.count()
n_partitions = ht.n_partitions()

# Key by variant and keep only the score, renamed to 'score'.
ht = (ht.key_by('locus', 'alleles')
        .rename({'DANN_score': 'score'})
        .select('score'))

# Attach dataset metadata as a single global struct.
metadata = hl.struct(name=name,
                     version=hl.null(hl.tstr),
                     reference_genome=build,
                     n_rows=n_rows,
                     n_partitions=n_partitions)
ht = ht.annotate_globals(metadata=metadata)
Пример #25
0
def main(args):
    """Annotate a SNP table with VEP consequences and gnomAD LD to the lead SNP.

    The table at ``args.snp`` must provide ``chromosome``, ``position``,
    ``allele1``, ``allele2`` and ``prob`` columns. Rows are parsed into
    GRCh38 locus/alleles, joined with the gnomAD v3.0 ``vep`` annotation and
    reduced to a most-severe consequence per variant. For every population
    in ``POPS`` a ``gnomad_lead_r2_<pop>`` column is added, holding the
    squared gnomAD v2.1.1 LD value between each variant and the variant
    with the highest ``prob``; variants absent from the LD index keep NaN.
    The final table is written tab-separated to ``args.out``.

    :param args: Namespace with ``snp`` (input table path) and ``out``
        (output path; ``gs://`` paths are opened with ``hl.hadoop_open``).
    """
    ht_snp = hl.import_table(args.snp, impute=True)
    # Build a "chrom:pos:a1:a2" string so hl.parse_variant can produce the
    # locus and alleles fields.
    ht_snp = ht_snp.annotate(variant=hl.delimit([
        ht_snp.chromosome,
        hl.str(ht_snp.position), ht_snp.allele1, ht_snp.allele2
    ],
                                                delimiter=':'))
    ht_snp = ht_snp.annotate(
        **hl.parse_variant(ht_snp.variant, reference_genome='GRCh38'))
    ht_snp = ht_snp.key_by('locus', 'alleles')
    # idx_snp is used further down as the row position in the pandas frame.
    ht_snp = ht_snp.add_index('idx_snp')
    ht_snp = ht_snp.checkpoint(new_temp_file())

    # annotate vep
    gnomad = hl.read_table(
        'gs://gnomad-public-requester-pays/release/3.0/ht/genomes/gnomad.genomes.r3.0.sites.ht'
    )
    ht_snp = ht_snp.join(gnomad.select('vep'), how='left')
    ht_snp = process_consequences(ht_snp)

    # extract most severe: prefer the canonical-transcript consequence and
    # fall back to the overall worst consequence, else missing
    ht_snp = ht_snp.annotate(vep=(hl.case().when(
        hl.is_defined(ht_snp.vep.worst_csq_for_variant_canonical),
        ht_snp.vep.worst_csq_for_variant_canonical).when(
            hl.is_defined(ht_snp.vep.worst_csq_for_variant),
            ht_snp.vep.worst_csq_for_variant).or_missing()),
                             is_canonical_vep=hl.is_defined(
                                 ht_snp.vep.worst_csq_for_variant_canonical))
    ht_snp = ht_snp.annotate(most_severe=hl.if_else(
        hl.is_defined(ht_snp.vep), ht_snp.vep.most_severe_consequence,
        'intergenic_variant'),
                             gene_most_severe=ht_snp.vep.gene_symbol)
    ht_snp = ht_snp.select_globals()
    ht_snp = ht_snp.drop('vep')
    ht_snp = ht_snp.annotate(
        **annotate_consequence_category(ht_snp.most_severe))
    ht_snp = ht_snp.checkpoint(new_temp_file())

    df = ht_snp.key_by().drop('locus', 'alleles', 'variant',
                              'idx_snp').to_pandas()

    # annotate LD
    for pop in POPS:
        ht = hl.read_table(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.adj.ld.variant_indices.ht'
        )
        # NOTE(review): hl.liftover needs a chain file registered on the
        # source reference; none is added in this function, so presumably
        # the caller does it — confirm.
        ht = ht.annotate(locus_hg38=hl.liftover(ht.locus, 'GRCh38'))
        ht = ht.filter(hl.is_defined(ht.locus_hg38))
        ht = ht.key_by('locus_hg38', 'alleles').drop('locus')
        ht = ht_snp.join(ht, 'inner')
        ht = ht.checkpoint(new_temp_file())

        # LD-matrix index of the variant with the highest prob.
        lead_idx = ht.order_by(hl.desc(ht.prob)).head(1).idx.collect()
        idx = ht.idx.collect()
        bm = BlockMatrix.read(
            f'gs://gnomad-public-requester-pays/release/2.1.1/ld/gnomad.genomes.r2.1.1.{pop}.common.ld.bm'
        )
        bm = bm.filter(idx, idx)
        # re-densify triangular matrix
        bm = bm + bm.T - get_diag_mat(bm.diagonal())
        # Keep only the lead variant's row and square its entries.
        bm = bm.filter_rows(
            np.where(np.array(idx) == lead_idx[0])[0].tolist())**2

        idx_snp = ht.idx_snp.collect()
        r2 = bm.to_numpy()[0]
        r2_column = f'gnomad_lead_r2_{pop}'
        df[r2_column] = np.nan
        # Assign with a single positional indexer. The chained form
        # df[col].iloc[rows] = ... may write to a temporary copy and leave
        # df unchanged.
        df.iloc[idx_snp, df.columns.get_loc(r2_column)] = r2

    if args.out.startswith('gs://'):
        fopen = hl.hadoop_open
    else:
        fopen = open

    with fopen(args.out, 'w') as f:
        df.to_csv(f, sep='\t', na_rep='NA', index=False)
Пример #26
0
def main(gnomad_file, chain_file, out_folder, test=None):
    """Export a gnomAD sites table as a variant-annotation parquet dataset.

    Reads the Hail table at ``gnomad_file``, adds per-population alternate
    allele frequencies, lifts each locus to GRCh37 with ``chain_file``,
    derives flat coordinate/allele columns, trims the CADD and VEP structs,
    and writes the selected columns to
    ``<out_folder>/variant-annotation.parquet``.

    :param gnomad_file: Path of the gnomAD Hail table to read.
    :param chain_file: Chain file registered as the GRCh38 -> GRCh37 liftover.
    :param out_folder: Folder that receives the parquet output.
    :param test: If truthy, only the first ``test`` rows are processed.
    """
    parquet_path = f'{out_folder}/variant-annotation.parquet'

    table = hl.read_table(gnomad_file)

    # Restrict to the head of the table when running in test mode.
    if test:
        table = table.head(test)

    # Every variant must have exactly two alleles.
    assert table.all(table.alleles.length() == 2), 'Mono- or multiallelic variants have been found.'

    # Look up, for each population, the position of its '<pop>-adj' entry in
    # the freq array, then collect the matching AF values into one struct.
    freq_index = table.globals.freq_index_dict.collect()[0]
    adj_index = {pop: freq_index[f'{pop}-adj'] for pop in POPULATIONS}
    af_by_pop = {pop: table.freq[i].AF for pop, i in adj_index.items()}
    table = table.annotate(af=hl.struct(**af_by_pop))

    # Register the chain file on GRCh38 and lift every locus to GRCh37.
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')
    grch38.add_liftover(chain_file, grch37)
    table = table.annotate(locus_GRCh37=hl.liftover(table.locus, 'GRCh37'))

    # Flat, build-specific coordinates (contigs without 'chr') and alleles.
    locus_b38 = table.locus
    locus_b37 = table.locus_GRCh37
    table = table.annotate(
        chrom_b38=locus_b38.contig.replace('chr', ''),
        pos_b38=locus_b38.position,
        chrom_b37=locus_b37.contig.replace('chr', ''),
        pos_b37=locus_b37.position,
        ref=table.alleles[0],
        alt=table.alleles[1],
        allele_type=table.allele_info.allele_type,
    )

    # Tidy the CADD struct and expose the key locus as a regular column.
    trimmed_cadd = table.cadd.rename({'raw_score': 'raw'}).drop('has_duplicate')
    table = table.annotate(cadd=trimmed_cadd, locus_GRCh38=table.locus)

    # Remove all global annotations.
    table = table.select_globals()

    # VEP fields that are not wanted in the output.
    unused_vep_fields = (
        'assembly_name', 'allele_string', 'ancestral', 'context', 'end',
        'id', 'input', 'intergenic_consequences', 'seq_region_name',
        'start', 'strand', 'variant_class',
    )
    table = table.annotate(vep=table.vep.drop(*unused_vep_fields))

    # Output columns, in order.
    output_columns = (
        'locus_GRCh38', 'chrom_b38', 'pos_b38',
        'chrom_b37', 'pos_b37',
        'ref', 'alt', 'allele_type', 'vep', 'rsid', 'af', 'cadd', 'filters',
    )

    # Convert to Spark without flattening structs, reduce the partition
    # count and write the parquet dataset, replacing any existing output.
    spark_df = table.select(*output_columns).to_spark(flatten=False)
    spark_df = spark_df.coalesce(OUT_PARTITIONS)
    spark_df.write.mode('overwrite').parquet(parquet_path)
Пример #27
0
import hail as hl

# Lift the GRCh37 GERP score table to GRCh38 and re-key it on the new locus.
input_path = 'gs://hail-datasets/hail-data/gerp_scores.GRCh37.liftover.ht'
output_path = 'gs://hail-datasets/hail-data/gerp_scores.GRCh38.liftover.ht'
chain_path = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'

ht = hl.read_table(input_path)

# Register the GRCh37 -> GRCh38 chain file before calling hl.liftover.
b37 = hl.get_reference('GRCh37')
b37.add_liftover(chain_path, 'GRCh38')

# Annotate the lifted locus and drop rows where it is missing.
lifted_locus = hl.liftover(ht.locus, 'GRCh38')
ht = ht.annotate(liftover_locus=lifted_locus)
ht = ht.filter(hl.is_defined(ht.liftover_locus))

# Key by the lifted locus, then let it take over the 'locus' field name.
ht = ht.key_by('liftover_locus')
ht = ht.drop('locus')
ht = ht.rename({'liftover_locus': 'locus'})

ht.describe()
ht.write(output_path, overwrite=True)
Пример #28
0
def main():
    """Annotate the UK Biobank variant sitelist with GRCh38 positions and rsids.

    Imports the headerless GRCh37 sitelist, lifts each locus to GRCh38,
    converts the table to a Spark dataframe, left-joins rsids loaded from
    the Ensembl VCFs and writes the result as parquet.

    :return: 0 on completion.
    """
    # Args (global)
    chain_file = 'gs://hail-common/references/grch37_to_grch38.over.chain.gz'
    inf = 'gs://genetics-portal-raw/uk_biobank_sumstats/variant_sitelist/ukbiobank_neale_saige_sitelist.190321.tsv'
    in_ensembl = 'gs://genetics-portal-raw/ensembl_grch37_r95/homo_sapiens-chr*.vcf.*.gz'
    out_parquet = 'gs://genetics-portal-raw/uk_biobank_sumstats/variant_sitelist/ukbiobank_neale_saige_sitelist.190321.annotated.parquet'

    # Args (local) -- alternative inputs for a local test run:
    # chain_file = 'input_data/grch37_to_grch38.over.chain.gz'
    # inf = 'ukbiobank_neale_saige_sitelist.head100k.tsv'
    # in_ensembl = 'input_data/homo_sapiens-chr1.head.vcf'
    # out_parquet = 'ukbiobank_neale_saige_sitelist.annotated.parquet'

    # --- Load sitelist -------------------------------------------------------

    # (raw column, final column name, type) for each headerless input column.
    column_spec = (
        ('f0', 'chrom_b37', 'str'),
        ('f1', 'pos_b37', 'int32'),
        ('f2', 'ref', 'str'),
        ('f3', 'alt', 'str'),
    )
    ht = hl.import_table(
        inf,
        no_header=True,
        min_partitions=128,
        types={raw: dtype for raw, _, dtype in column_spec})
    ht = ht.rename({raw: final for raw, final, _ in column_spec})

    # Build the GRCh37 locus and the alleles array, then key by both.
    ht = ht.annotate(locus=hl.locus(ht.chrom_b37, ht.pos_b37, 'GRCh37'),
                     alleles=hl.array([ht.ref, ht.alt]))
    ht = ht.key_by('locus', 'alleles')

    # --- Do liftover ---------------------------------------------------------

    # Register the chain file on GRCh37, targeting GRCh38.
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    ht = ht.annotate(locus_GRCh38=hl.liftover(ht.locus, 'GRCh38'))

    # Convert to Spark, rename the GRCh38 locus columns and drop the
    # GRCh37 locus and alleles columns.
    df = ht.to_spark()
    df = df.withColumnRenamed('locus_GRCh38.contig', 'chrom_b38')
    df = df.withColumnRenamed('locus_GRCh38.position', 'pos_b38')
    df = df.drop('locus.contig', 'locus.position', 'alleles')

    # --- Annotate with rsids -------------------------------------------------

    ensembl = load_ensembl_vcf(in_ensembl)
    join_keys = ['chrom_b37', 'pos_b37', 'ref', 'alt']
    df = df.join(ensembl, on=join_keys, how='left')

    # --- Write output --------------------------------------------------------

    output_columns = ['chrom_b37', 'pos_b37', 'chrom_b38', 'pos_b38',
                      'ref', 'alt', 'rsid']
    df.select(*output_columns).write.parquet(out_parquet, mode='overwrite')

    return 0
Пример #29
0
    Returns the complement of a base
    :param str base: Base to be flipped
    :return: Complement of input base
    :rtype: str
    """
    return (hl.switch(base).when('A', 'T').when('T', 'A').when('G', 'C').when(
        'C', 'G').default(base))


# Import the MPC score table, build locus/alleles from its columns and keep
# only the MPC value alongside the key.
mpc_ht = hl.import_table(MPC_SCORE, impute=True)
mpc_ht = mpc_ht.annotate(
    locus=hl.locus(contig=mpc_ht.chrom, pos=mpc_ht.pos),
    alleles=[mpc_ht.ref, mpc_ht.alt])
mpc_ht = mpc_ht.key_by('locus', 'alleles').select('MPC')

# Lift each locus to GRCh38, recording strand, and drop rows that did not
# lift over.
mpc_lifted = hl.liftover(mpc_ht.locus, 'GRCh38', include_strand=True)
mpc_ht = mpc_ht.annotate(new_locus=mpc_lifted)
mpc_ht = mpc_ht.filter(hl.is_defined(mpc_ht.new_locus))

# When the lifted locus is on the negative strand, complement both alleles;
# otherwise keep them unchanged.
mpc_complemented = [flip_base(mpc_ht.alleles[i]) for i in range(2)]
mpc_ht = mpc_ht.annotate(
    new_alleles=hl.cond(mpc_ht.new_locus.is_negative_strand,
                        mpc_complemented,
                        mpc_ht.alleles))

# Re-key the table on the lifted locus and the strand-adjusted alleles.
mpc_ht = mpc_ht.key_by(locus=mpc_ht.new_locus.result,
                       alleles=mpc_ht.new_alleles)

# Write the result to file.
mpc_output_path = 'gs://raw_data_bipolar_dalio_w1_w2/inputs/fordist_constraint_official_mpc_values_v2_GRCh38.ht'
mpc_ht.write(mpc_output_path, overwrite=True)
Пример #30
0
    Returns the complement of a base
    :param str base: Base to be flipped
    :return: Complement of input base
    :rtype: str
    """
    return (hl.switch(base).when('A', 'T').when('T', 'A').when('G', 'C').when(
        'C', 'G').default(base))


# Import the BSC counts table and key it by locus/alleles built from its
# chrom, pos, ref and alt columns.
ht_bsc = hl.import_table(BSC_COUNTS, impute=True)
ht_bsc = ht_bsc.annotate(
    locus=hl.locus(contig=ht_bsc.chrom, pos=ht_bsc.pos),
    alleles=[ht_bsc.ref, ht_bsc.alt])
ht_bsc = ht_bsc.key_by('locus', 'alleles')

# Lift each locus to GRCh38, recording strand, and drop rows that did not
# lift over.
bsc_lifted = hl.liftover(ht_bsc.locus, 'GRCh38', include_strand=True)
ht_bsc = ht_bsc.annotate(new_locus=bsc_lifted)
ht_bsc = ht_bsc.filter(hl.is_defined(ht_bsc.new_locus))

# When the lifted locus is on the negative strand, complement both alleles;
# otherwise keep them unchanged.
bsc_complemented = [flip_base(ht_bsc.alleles[i]) for i in range(2)]
ht_bsc = ht_bsc.annotate(
    new_alleles=hl.cond(ht_bsc.new_locus.is_negative_strand,
                        bsc_complemented,
                        ht_bsc.alleles))

# Re-key the table on the lifted locus and the strand-adjusted alleles.
ht_bsc = ht_bsc.key_by(locus=ht_bsc.new_locus.result,
                       alleles=ht_bsc.new_alleles)

# Write the result to file.
bsc_output_path = 'gs://raw_data_bipolar_dalio_w1_w2/inputs/BSC_MAC5_counts_GRCh38.ht'
ht_bsc.write(bsc_output_path, overwrite=True)
Пример #31
0
def liftover_intervals(t: hl.Table,
                       keep_missing_interval: bool = False) -> hl.Table:
    """
    Lift the intervals of a table over from GRCh37 to GRCh38.

    The start and end locus of every interval are lifted separately and a
    new GRCh38 interval is built from the two lifted positions. Only rows
    whose lifted start and end both land on the contig named
    ``'chr' + <original contig>`` pass the contig filter. An interval that
    cannot be constructed from the lifted positions is set to missing
    (``invalid_missing=True``).

    # Example input table description
    #
    # ----------------------------------------
    # Global fields:
    #     None
    # ----------------------------------------
    # Row fields:
    #     'interval': interval<locus<GRCh37>>
    # ----------------------------------------
    # Key: ['interval']
    # ----------------------------------------


    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep rows whose GRCh38 interval
        is missing in the output Table.
    :return: Table keyed by the GRCh38 ``interval``, with the original
        interval kept as ``interval_hg37`` and the globals ``date``,
        ``reference_genome`` and ``was_lifted`` set.
    """

    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    # Register the chain file only once per session.
    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz',
            rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )

    # Require both endpoints to stay on the same chromosome after liftover.
    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))

    t = t.key_by()

    t = (t.select(interval=hl.locus_interval(t.start.contig,
                                             t.start.position,
                                             t.end.position,
                                             reference_genome=rg38,
                                             invalid_missing=True),
                  interval_hg37=t.interval))

    # bad intervals
    # hl.agg.counter only has keys for values that were observed, so look
    # up True with a default of 0 to avoid a KeyError when nothing is missing.
    missing = t.aggregate(hl.agg.counter(~hl.is_defined(t.interval)))
    n_missing = missing.get(True, 0)
    logger.info(
        f"Number of missing intervals: {n_missing} out of {t.count()}...")

    # update globals annotations
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {n_missing} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
Пример #32
0
        'Name': 'gene_id',
        'x': 'TPM'
    })

    mt = mt_counts.annotate_entries(TPM=mt_tpm[mt_counts.gene_id,
                                               mt_counts.sample_id]['TPM'])
    mt = mt.annotate_rows(**ht_genes[mt.gene_id])
    mt = mt.annotate_cols(**ht_sample_attributes[mt.sample_id])

    if reference_genome == 'GRCh38':
        b37 = hl.get_reference('GRCh37')
        b37.add_liftover(
            'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
            'GRCh38')
        mt = mt.annotate_rows(
            gene_interval=hl.liftover(mt['gene_interval'], 'GRCh38'))
        mt = mt.filter_rows(hl.is_defined(mt['gene_interval']))

    mt = mt.repartition(20)

elif dataset == 'transcripts':

    name = 'GTEx_transcript_expression'

    ht_transcripts = import_gtf(path=EXTRACT_BUCKET +
                                'GTEx/v7/GTEx_transcripts.v7.GRCh37.gtf.bgz',
                                reference_genome='GRCh37')
    ht_transcripts = ht_transcripts.filter(
        ht_transcripts['feature'] == 'transcript')
    ht_transcripts = ht_transcripts.select(
        'transcript_id', 'strand', 'transcript_name', 'transcript_type',