Exemplo n.º 1
0
def sample_variants(c, args):
    """Run the variant query and dispatch on ``args.command``.

    ``'interactions'``: print a tab-separated header (with the extra
    per-variant columns when ``args.var_mode`` is truthy), then call
    ``sample_gene_interactions``; nothing is returned.

    ``'lof_interactions'``: return whatever ``get_variant_genes`` returns.

    Any other command only executes the query and returns ``None``.
    """
    # The sample mapping is built before the variant query is issued on the
    # same cursor; keep that order.
    idx_to_sample = util.map_indicies_to_samples(c)
    query = "SELECT variant_id, gt_types, gts, gene, impact, biotype, \
                    in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all, chrom, \
                    start, end  \
             FROM variants"
    c.execute(query)

    command = args.command
    if command == 'lof_interactions':
        return get_variant_genes(c, args, idx_to_sample)
    if command != 'interactions':
        return None

    # header: four base columns, plus the variant details in var_mode
    header = ['sample', 'gene', 'order_of_interaction', 'interacting_gene']
    if args.var_mode:
        header += ['var_id', 'chrom', 'start', 'end', 'impact', 'biotype',
                   'in_dbsnp', 'clinvar_sig', 'clinvar_disease_name',
                   'aaf_1kg_all', 'aaf_esp_all']
    print("\t".join(header))
    sample_gene_interactions(c, args, idx_to_sample)
Exemplo n.º 2
0
def get_genotypes(c, args):
    """For each variant, report each sample's genotype
       on a separate line.

    The variant columns come from the ``variants`` table ordered by
    ``chrom, start``.  When ``args.use_header`` is truthy a header line is
    printed first.  Every field of a data line -- the variant columns, the
    sample name and the genotype -- is joined with ``args.separator``, so a
    data line splits on the same delimiter as the header.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    c.execute(query)

    # names of the selected columns with gt_* columns ignored; the index
    # list returned alongside is not needed here.
    (col_names, _) = \
        util.get_col_names_and_indices(c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    separator = args.separator
    if args.use_header:
        print(separator.join(col_names))
    for row in c:
        gts = np.array(cPickle.loads(zlib.decompress(row['gts'])))
        # The variant columns are the same for every sample of this row, so
        # format them once.  len(row) - 1 skips v.gts, the last column.
        variant_fields = [str(row[i]) for i in xrange(len(row) - 1)]
        for idx, gt in enumerate(gts):
            print(separator.join(variant_fields + [idx_to_sample[idx], gt]))
Exemplo n.º 3
0
def get_gtcounts_by_sample(c, args):
    """
    Print a tab-separated table with one line per row of
    sample_genotype_counts: the sample name, its hom-ref, het,
    hom-alt and unknown genotype counts, and the sum of the four.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    count_columns = [
        'num_hom_ref', 'num_het', 'num_hom_alt', 'num_unknown', 'total'
    ]

    # header line
    print('\t'.join(['sample'] + count_columns))

    query = "SELECT *, \
             (num_hom_ref + num_het + num_hom_alt + num_unknown) as total \
             FROM sample_genotype_counts"

    c.execute(query)
    for row in c:
        # translate the stored sample id into the sample's name
        fields = [idx_to_sample[row['sample_id']]]
        fields.extend(row[column] for column in count_columns)
        print("\t".join([str(field) for field in fields]))
Exemplo n.º 4
0
def get_genotypes(c, args):
    """For each variant, report each sample's genotype
       on a separate line.

    Variants are read from the ``variants`` table ordered by
    ``chrom, start``.  A header line is printed first when
    ``args.use_header`` is truthy.  All fields of a data line, including the
    trailing sample name and genotype, are joined with ``args.separator`` so
    that data lines split on the same delimiter as the header.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"

    c.execute(query)

    # Column names of the result with gt_* columns ignored.  The second
    # element of the returned pair (the column indices) is unused.
    (col_names, _) = \
        util.get_col_names_and_indices(c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    separator = args.separator
    if args.use_header:
        print(separator.join(col_names))
    for row in c:
        gts = np.array(cPickle.loads(zlib.decompress(row['gts'])))
        # Format the per-variant columns once per row rather than once per
        # sample.  xrange(len(row) - 1) leaves out v.gts, the last column.
        variant_fields = [str(row[i]) for i in xrange(len(row) - 1)]
        for idx, gt in enumerate(gts):
            print(separator.join(variant_fields + [idx_to_sample[idx], gt]))
Exemplo n.º 5
0
def sample_lof_variants(c, args):
    """Select the ``is_lof='1'`` variants and pass the cursor on.

    The sample index mapping is built first; the cursor is then pointed at
    the query result and handed, together with ``args`` and the mapping, to
    ``sample_lof_interactions``.
    """
    sample_names = util.map_indicies_to_samples(c)
    lof_query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"
    c.execute(lof_query)
    sample_lof_interactions(c, args, sample_names)
Exemplo n.º 6
0
    def __init__(self, db):
        """Remember ``db``, connect, and cache both sample lookups.

        ``db`` is stored as ``self.db`` before ``_connect_to_database`` is
        called.  The two state flags start out ``False``.
        """
        self.for_browser = False
        self.query_executed = False
        self.db = db

        self._connect_to_database()
        cursor = self.c
        # sample name -> index, e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indicies(cursor)
        # index -> sample name, e.g. self.idx_to_sample[323] -> NA20814
        self.idx_to_sample = util.map_indicies_to_samples(cursor)
Exemplo n.º 7
0
def sample_lof_variants(c, args):
    """Run the loss-of-function variant query for ``sample_lof_interactions``.

    Builds the sample index mapping, executes a query for every ``variants``
    row with ``is_lof='1'`` on ``c``, then calls ``sample_lof_interactions``
    with the cursor, ``args`` and the mapping.
    """
    sample_names = util.map_indicies_to_samples(c)

    lof_query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"
    c.execute(lof_query)

    sample_lof_interactions(c, args, sample_names)
Exemplo n.º 8
0
    def __init__(self, db):
        """Set up the instance: store ``db``, connect, load sample lookups.

        ``self.db`` is assigned before ``_connect_to_database`` runs, and
        ``query_executed`` / ``for_browser`` are initialised to ``False``.
        """
        self.for_browser = False
        self.query_executed = False
        self.db = db

        self._connect_to_database()
        cursor = self.c
        # forward lookup: self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indicies(cursor)
        # reverse lookup: self.idx_to_sample[323] -> NA20814
        self.idx_to_sample = util.map_indicies_to_samples(cursor)
Exemplo n.º 9
0
    def __init__(self, db, include_gt_cols=False):
        """Validate the database path, connect, and cache the sample lookups.

        db              -- path that must exist on disk (checked with
                           ``os.path.exists``).
        include_gt_cols -- stored as ``self.include_gt_cols``.

        Raises AssertionError with "<db> does not exist." when the path is
        missing.
        """
        # Raised explicitly instead of via an ``assert`` statement so the
        # check is not stripped when Python runs with -O.  The exception
        # type and message are the same as before.
        if not os.path.exists(db):
            raise AssertionError("%s does not exist." % db)

        self.db = db
        self.query_executed = False
        self.for_browser = False
        self.include_gt_cols = include_gt_cols

        self._connect_to_database()
        # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indicies(self.c)
        # and vice versa. e.g., self.idx_to_sample[323] ->  NA20814
        self.idx_to_sample = util.map_indicies_to_samples(self.c)
Exemplo n.º 10
0
def sample_variants(c, args):
    """Query all variants, print the report header, and start the report.

    The header always has the four interaction columns; ``var_id``,
    ``impact`` and ``biotype`` are added when ``args.var_mode`` is truthy.
    The rows are produced by ``sample_gene_interactions``.
    """
    idx_to_sample = util.map_indicies_to_samples(c)
    query = "SELECT variant_id, gt_types, gts, gene, impact, biotype \
             FROM variants"
    c.execute(query)

    columns = ['sample', 'gene', 'order_of_interaction', 'interacting_gene']
    if args.var_mode:
        columns.extend(['var_id', 'impact', 'biotype'])
    print("\t".join(columns))

    sample_gene_interactions(c, args, idx_to_sample)
Exemplo n.º 11
0
def sample_variants(c, args):
    """Execute the variant query, emit the header, then report interactions.

    With ``args.var_mode`` truthy the tab-separated header carries the
    ``var_id``, ``impact`` and ``biotype`` columns after the four base
    columns; otherwise only the base columns are printed.
    ``sample_gene_interactions`` is then called with the cursor.
    """
    idx_to_sample = util.map_indicies_to_samples(c)
    query = "SELECT variant_id, gt_types, gts, gene, impact, biotype \
             FROM variants"

    c.execute(query)

    columns = ['sample', 'gene', 'order_of_interaction', 'interacting_gene']
    if args.var_mode:
        columns.extend(['var_id', 'impact', 'biotype'])
    print("\t".join(columns))

    sample_gene_interactions(c, args, idx_to_sample)
Exemplo n.º 12
0
def get_ind_lof(c, args):
    """Print one tab-separated line per LoF SNP impact and carrier sample.

    Rows come from ``variants`` joined to ``variant_impacts`` where
    ``i.is_lof='1'`` and ``v.type = 'snp'``.  For each row, every sample
    whose genotype type is ``HET`` or ``HOM_ALT`` gets its own output line.

    ``var_trans_pos`` is the first run of digits in ``aa_change`` that has a
    non-whitespace character on both sides.  ``var_trans_pct`` is that
    position divided by ``aa_length``.  Either is printed as ``None`` when it
    cannot be derived, including when ``aa_change`` holds no such digit run.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print('\t'.join([
        'chrom', 'start', 'end', 'ref', 'alt', 'highest_impact', 'aa_change',
        'var_trans_pos', 'trans_aa_length', 'var_trans_pct', 'sample',
        'genotype', 'gene', 'transcript', 'trans_type'
    ]))

    for r in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(r['gt_types'])))
        gts = np.array(cPickle.loads(zlib.decompress(r['gts'])))

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            # re.search returns None when there is no flanked digit run;
            # in that case position and percentage stay None rather than
            # aborting the whole report.
            match = re.search(r'\S(\d+)\S', aa_change)
            if match is not None:
                transcript_pos = match.group(1)
                if aa_length != 'None':
                    transcript_pct = float(transcript_pos) / float(aa_length)

        # Everything except sample and genotype is constant for the row, so
        # it is formatted once here instead of once per carrier.
        leading = [
            r['chrom'],
            str(r['start']),
            str(r['end']), r['ref'], r['alt'], r['impact'],
            r['aa_change'] or 'None', transcript_pos or 'None',
            # str() keeps join() working should the column hold a number
            # instead of text -- NOTE(review): confirm the column type.
            str(r['aa_length'] or 'None'),
            str(transcript_pct),
        ]
        trailing = [str(r['gene']), str(r['transcript']), r['biotype']]

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join(
                    leading + [idx_to_sample[idx], gts[idx]] + trailing))
Exemplo n.º 13
0
def get_ind_lof(c, args):
    """Report every loss-of-function SNP impact for each carrier sample.

    The query joins ``variants`` to ``variant_impacts`` and keeps rows with
    ``i.is_lof='1'`` and ``v.type = 'snp'``.  A tab-separated line is printed
    for each sample whose genotype type equals ``GT_HET`` or ``GT_HOM_ALT``.

    ``var_trans_pos`` is taken from the first run of digits in ``aa_change``
    that is flanked by non-whitespace characters, and ``var_trans_pct`` is
    that position divided by ``aa_length``.  Both are printed as ``None``
    when unavailable, which includes an ``aa_change`` without such digits.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print('\t'.join(['chrom', 'start', 'end', 'ref', 'alt',
                     'highest_impact', 'aa_change', 'var_trans_pos',
                     'trans_aa_length', 'var_trans_pct',
                     'sample', 'genotype', 'gene', 'transcript',
                     'trans_type']))

    for r in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(r['gt_types'])))
        gts = np.array(cPickle.loads(zlib.decompress(r['gts'])))

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            # A change string without a flanked digit run yields no match;
            # leave both values as None instead of raising IndexError.
            match = re.search(r'\S(\d+)\S', aa_change)
            if match is not None:
                transcript_pos = match.group(1)
                if aa_length != 'None':
                    transcript_pct = float(transcript_pos) / float(aa_length)

        # Row-constant columns are formatted once, outside the sample loop.
        leading = [r['chrom'], str(r['start']),
                   str(r['end']), r['ref'], r['alt'],
                   r['impact'],
                   r['aa_change'] or 'None',
                   transcript_pos or 'None',
                   # str() guards join() against a non-text column value
                   # -- NOTE(review): confirm the column type.
                   str(r['aa_length'] or 'None'),
                   str(transcript_pct)]
        trailing = [str(r['gene']), str(r['transcript']), r['biotype']]

        for idx, gt_type in enumerate(gt_types):
            if gt_type == GT_HET or gt_type == GT_HOM_ALT:
                print("\t".join(
                    leading + [idx_to_sample[idx], gts[idx]] + trailing))
Exemplo n.º 14
0
def get_ind_pathways(c, args):
    """Query variant impacts, print the header, and report pathways.

    The cursor is left on the join of ``variants`` and ``variant_impacts``
    and passed to ``_report_variant_pathways`` together with the sample
    index mapping.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             i.impact, v.gt_types, v.gts, i.gene, \
                             i.transcript \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id"

    c.execute(query)

    # tab-separated header line
    header_columns = (
        'chrom', 'start', 'end', 'ref', 'alt',
        'highest_impact', 'sample', 'genotype',
        'gene', 'transcript', 'pathway',
    )
    print('\t'.join(header_columns))

    _report_variant_pathways(c, args, idx_to_sample)
Exemplo n.º 15
0
def get_variants_by_sample(c, args):
    """
    Print, per sample, how many variants carry a non-ref genotype,
    i.e. num_het + num_hom_alt from sample_genotype_counts.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    # header line
    print('\t'.join(['sample', 'total']))

    query = "SELECT sample_id, \
             (num_het + num_hom_alt) as total \
             FROM sample_genotype_counts"
    c.execute(query)
    for row in c:
        # the table stores sample ids; report the sample's name instead
        name = idx_to_sample[row['sample_id']]
        total = row['total']
        print("\t".join([str(name), str(total)]))
Exemplo n.º 16
0
def get_ind_pathways(c, args):
    """Print the pathway report header and delegate the rows.

    Executes the ``variants`` / ``variant_impacts`` join on ``c``, writes the
    tab-separated header, then calls ``_report_variant_pathways`` with the
    cursor, ``args`` and the sample index mapping.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             i.impact, v.gt_types, v.gts, i.gene, \
                             i.transcript \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id"

    c.execute(query)

    # tab-separated header line
    header_columns = (
        "chrom", "start", "end", "ref", "alt",
        "impact", "sample", "genotype",
        "gene", "transcript", "pathway",
    )
    print("\t".join(header_columns))

    _report_variant_pathways(c, args, idx_to_sample)
Exemplo n.º 17
0
def get_ind_pathways(c, args):
    """Run the variant-impact query and hand the cursor to the reporter.

    After the join of ``variants`` and ``variant_impacts`` has been executed
    the tab-separated header is printed and ``_report_variant_pathways`` is
    called with the cursor, ``args`` and the sample index mapping.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             i.impact, v.gt_types, v.gts, i.gene, \
                             i.transcript \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id"

    c.execute(query)

    # tab-separated header line
    header_columns = (
        'chrom', 'start', 'end', 'ref', 'alt',
        'impact', 'sample', 'genotype',
        'gene', 'transcript', 'pathway',
    )
    print('\t'.join(header_columns))

    _report_variant_pathways(c, args, idx_to_sample)
Exemplo n.º 18
0
def sample_lof_variants(c, args, samples):
    """Query the ``is_lof='1'`` variants, print a header, report interactions.

    The header always has the four interaction columns; the per-variant
    columns are appended when ``args.var_mode`` is truthy.  ``samples`` is
    passed through unchanged to ``sample_lof_interactions``.
    """
    idx_to_sample = util.map_indicies_to_samples(c)
    query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"
    c.execute(query)

    # header
    header = ['sample', 'lof_gene', 'order_of_interaction',
              'interacting_gene']
    if args.var_mode:
        header += ['var_id', 'chrom', 'start', 'end', 'impact', 'biotype',
                   'in_dbsnp', 'clin_sigs', 'aaf_1kg_all', 'aaf_esp_all']
    print("\t".join(header))

    sample_lof_interactions(c, args, idx_to_sample, samples)
Exemplo n.º 19
0
def sample_lof_variants(c, args, samples):
    """Select the ``is_lof='1'`` variants, emit the header, run the report.

    Four interaction columns are always printed; with ``args.var_mode``
    truthy the per-variant columns (including the two ClinVar ones) follow.
    ``samples`` is forwarded as-is to ``sample_lof_interactions``.
    """
    idx_to_sample = util.map_indicies_to_samples(c)
    query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"
    c.execute(query)

    # header
    header = ['sample', 'lof_gene', 'order_of_interaction',
              'interacting_gene']
    if args.var_mode:
        header += ['var_id', 'chrom', 'start', 'end', 'impact', 'biotype',
                   'in_dbsnp', 'clinvar_sig', 'clinvar_disease_name',
                   'aaf_1kg_all', 'aaf_esp_all']
    print("\t".join(header))

    sample_lof_interactions(c, args, idx_to_sample, samples)
Exemplo n.º 20
0
def get_gtcounts_by_sample(c, args):
    """
    For each row of sample_genotype_counts, print the sample name,
    the four genotype class counts and their total, tab-separated.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    count_columns = ['num_hom_ref', 'num_het',
                     'num_hom_alt', 'num_unknown', 'total']

    # header line
    print('\t'.join(['sample'] + count_columns))

    query = "SELECT *, \
             (num_hom_ref + num_het + num_hom_alt + num_unknown) as total \
             FROM sample_genotype_counts"
    c.execute(query)
    for row in c:
        # report the sample by name rather than by its stored id
        fields = [idx_to_sample[row['sample_id']]]
        fields.extend(row[column] for column in count_columns)
        print("\t".join([str(field) for field in fields]))
Exemplo n.º 21
0
def get_compound_hets(c, args):
    """
    Report candidate compound heterozygous mutations.

    Pass 1 collects, per sample and gene, every coding site at which the
    sample is a phased heterozygote.  Pass 2 prints each ordered pair of
    such sites whose alternate alleles sit on different haplotypes.
    """
    # numpy array index -> sample name
    # e.g. 0 == 109400005
    #     37 == 147800025
    idx_to_sample = util.map_indicies_to_samples(c)

    # sample -> gene -> list of candidate het sites
    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    # is_exonic - what about splice?
    query = "SELECT * FROM variants \
             WHERE is_coding = 1"
    c.execute(query)

    # pass 1: gather the candidate heterozygotes; refined in pass 2.
    for row in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(row['gt_types'])))
        gt_phases = np.array(cPickle.loads(zlib.decompress(row['gt_phases'])))
        gt_bases = np.array(cPickle.loads(zlib.decompress(row['gts'])))

        site = Site(row)

        # skip sites excluded by the user's options
        if (site.num_hets > 1 and not args.allow_other_hets) or \
                (not site.is_lof and args.only_lof):
            continue

        for idx, gt_type in enumerate(gt_types):
            # only heterozygous samples are of interest
            if gt_type != GT_HET:
                continue

            sample = idx_to_sample[idx]
            sample_site = copy(site)
            sample_site.phased = gt_phases[idx]

            # unphased genotypes cannot be assigned to a haplotype
            if not sample_site.phased:
                continue

            sample_site.gt = gt_bases[idx]
            comp_hets[sample][site.gene].append(sample_site)

    # header
    print("sample\tgene\thet1\thet2")
    # pass 2: keep the het pairs whose alternate alleles were
    # inherited on opposite haplotypes.
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]
            for site1 in candidates:
                for site2 in candidates:
                    if site1 == site2:
                        continue

                    # split each phased genotype into its alleles,
                    # e.g. A|G -> ['A', 'G']
                    alleles_site1 = site1.gt.split('|')
                    alleles_site2 = site2.gt.split('|')

                    # position of the alternate allele within the
                    # genotype, i.e. the haplotype carrying it.
                    # e.g., if ALT=G and alleles_site1=['A', 'G']
                    # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                    alt_hap_1 = alleles_site1.index(site1.alt)
                    alt_hap_2 = alleles_site2.index(site2.alt)

                    # same haplotype -> not a compound heterozygote
                    if alt_hap_1 == alt_hap_2:
                        continue
                    print("\t".join([sample, gene, str(site1), str(site2)]))
Exemplo n.º 22
0
def get_compound_hets(c, args):
    """
    Report candidate compound heterozygous mutations.

    Step 1 collects, per sample and gene, every site with
    ``impact_severity != 'LOW'`` at which the sample is heterozygous.
    Unphased genotypes are dropped unless ``args.ignore_phasing`` is set.

    Step 2 prints each ordered pair of distinct candidate sites within a
    gene.  Unless ``args.ignore_phasing`` is set, a pair is printed only
    when the two alternate alleles lie on different haplotypes.

    Other options read from ``args``: ``allow_other_hets`` and ``only_lof``.
    """
    # build a mapping of the numpy array index to the appropriate sample name
    # e.g. 0 == 109400005
    #     37 == 147800025
    idx_to_sample = util.map_indicies_to_samples(c)

    comp_hets = collections.defaultdict(lambda: collections.defaultdict(list))

    query = "SELECT * FROM variants \
             WHERE impact_severity != 'LOW'"  # is_exonic - what about splice?
    c.execute(query)

    ignore_phasing = args.ignore_phasing

    # step 1. collect all candidate heterozygotes for all
    # genes and samples.  the list will be refined in step 2.
    for row in c:
        gt_types = compression.unpack_genotype_blob(row['gt_types'])
        gt_phases = compression.unpack_genotype_blob(row['gt_phases'])
        gt_bases = compression.unpack_genotype_blob(row['gts'])

        site = Site(row)

        # filter putative sites that the user doesn't care about
        if site.num_hets > 1 and not args.allow_other_hets:
            continue
        if not site.is_lof and args.only_lof:
            continue

        # track each sample that is heterozygous at this site.
        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET:
                sample = idx_to_sample[idx]
                sample_site = copy(site)
                sample_site.phased = gt_phases[idx]

                # require phased genotypes
                if not sample_site.phased and not ignore_phasing:
                    continue

                sample_site.gt = gt_bases[idx]
                # add the site to the list of candidates
                # for this sample/gene
                comp_hets[sample][site.gene].append(sample_site)

    # header
    print("sample\tgene\thet1\thet2")
    # step 2.  now, cull the list of candidate heterozygotes for each
    # gene/sample to those het pairs where the alternate alleles
    # were inherited on opposite haplotypes.
    for sample in comp_hets:
        for gene in comp_hets[sample]:
            candidates = comp_hets[sample][gene]
            for site1 in candidates:
                for site2 in candidates:
                    if site1 == site2:
                        continue

                    # The alleles are only needed for the haplotype test,
                    # so they are not split when phasing is ignored.
                    if not ignore_phasing:
                        # expand the phased genotype at each site into
                        # its alleles.  e.g. A|G -> ['A', 'G']
                        alleles_site1 = site1.gt.split('|')
                        alleles_site2 = site2.gt.split('|')

                        # haplotype on which the alternate allele was
                        # observed at each candidate het. site.
                        # e.g., if ALT=G and alleles_site1=['A', 'G']
                        # then alt_hap_1 = 1.  if ALT=A, then alt_hap_1 = 0
                        # NOTE(review): index() raises ValueError if the
                        # alt string is not one of the genotype's alleles.
                        alt_hap_1 = alleles_site1.index(site1.alt)
                        alt_hap_2 = alleles_site2.index(site2.alt)

                        # it is only a true compound heterozygote iff
                        # the alternates are on opposite haplotypes.
                        if alt_hap_1 == alt_hap_2:
                            continue

                    print("\t".join([sample,
                                     gene,
                                     str(site1),
                                     str(site2)]))
Exemplo n.º 23
0
def get_ind_lof(c, args):
    """Print each loss-of-function SNP impact once per carrier sample.

    The query joins ``variants`` with ``variant_impacts`` and keeps rows
    where ``i.is_lof='1'`` and ``v.type = 'snp'``.  Each sample whose
    genotype type equals ``HET`` or ``HOM_ALT`` gets a tab-separated line.

    ``var_trans_pos`` is the first run of digits in ``aa_change`` with a
    non-whitespace character on each side, and ``var_trans_pct`` is that
    position divided by ``aa_length``.  Both print as ``None`` when they
    cannot be derived, e.g. when ``aa_change`` contains no such digit run.
    """
    idx_to_sample = util.map_indicies_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print("\t".join(
        [
            "chrom",
            "start",
            "end",
            "ref",
            "alt",
            "highest_impact",
            "aa_change",
            "var_trans_pos",
            "trans_aa_length",
            "var_trans_pct",
            "sample",
            "genotype",
            "gene",
            "transcript",
            "trans_type",
        ]
    ))

    for r in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(r["gt_types"])))
        gts = np.array(cPickle.loads(zlib.decompress(r["gts"])))

        aa_change = str(r["aa_change"])
        aa_length = str(r["aa_length"])
        transcript_pos = None
        transcript_pct = None
        if aa_change != "None":
            # No flanked digit run means no match; keep both values None
            # rather than failing with IndexError.
            match = re.search(r"\S(\d+)\S", aa_change)
            if match is not None:
                transcript_pos = match.group(1)
                if aa_length != "None":
                    transcript_pct = float(transcript_pos) / float(aa_length)

        # These columns do not vary by sample, so build them once per row.
        leading = [
            r["chrom"],
            str(r["start"]),
            str(r["end"]),
            r["ref"],
            r["alt"],
            r["impact"],
            r["aa_change"] or "None",
            transcript_pos or "None",
            # str() keeps join() working if the column is not text
            # -- NOTE(review): confirm the column type.
            str(r["aa_length"] or "None"),
            str(transcript_pct),
        ]
        trailing = [str(r["gene"]), str(r["transcript"]), r["biotype"]]

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print("\t".join(
                    leading + [idx_to_sample[idx], gts[idx]] + trailing))