Пример #1
0
def get_genotypes(conn, metadata, args):
    """For each variant, report each sample's genotype
       on a separate line.
    """
    idx_to_sample = util.map_indices_to_samples(metadata)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    res = conn.execute(sql.text(query))

    # build a list of all the column indices that are NOT
    # gt_* columns.  These will be the columns reported
    (col_names, non_gt_idxs) = \
        util.get_col_names_and_indices(metadata.tables["variants"], ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print args.separator.join(col for col in col_names)
    for row in res:
        gts = Z.unpack_genotype_blob(row['gts'])
        for idx, gt in enumerate(gts):
            # xrange(len(row)-1) to avoid printing v.gts
            a = args.separator.join(str(row[i]) for i in xrange(len(row)-1))
            b = args.separator.join([idx_to_sample[idx], gt])
            print args.separator.join((a, b))
Пример #2
0
def get_genotypes(c, args):
    """For each variant, report each sample's genotype
       on a separate line.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"

    c.execute(query)

    # build a list of all the column indices that are NOT
    # gt_* columns.  These will be the columns reported
    (col_names, non_gt_idxs) = \
        util.get_col_names_and_indices(c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print args.separator.join(col for col in col_names)
    for row in c:
        gts = np.array(cPickle.loads(zlib.decompress(row['gts'])))
        for idx, gt in enumerate(gts):
            # xrange(len(row)-1) to avoid printing v.gts
            print args.separator.join(
                str(row[i]) for i in xrange(len(row) - 1)),
            print args.separator.join([idx_to_sample[idx], gt])
Пример #3
0
def sample_lof_variants(c, args, samples):
    idx_to_sample = util.map_indices_to_samples(c)
    query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"
    c.execute(query)

    # header
    if args.var_mode:
        print "\t".join(
            [
                "sample",
                "lof_gene",
                "order_of_interaction",
                "interacting_gene",
                "var_id",
                "chrom",
                "start",
                "end",
                "impact",
                "biotype",
                "in_dbsnp",
                "clinvar_sig",
                "clinvar_disease_name",
                "aaf_1kg_all",
                "aaf_esp_all",
            ]
        )

    elif not args.var_mode:
        print "\t".join(["sample", "lof_gene", "order_of_interaction", "interacting_gene"])

    sample_lof_interactions(c, args, idx_to_sample, samples)
Пример #4
0
    def __init__(self, db, include_gt_cols=False,
                 out_format=DefaultRowFormat(None)):
        """Initialise query state for the database file at ``db``.

        Args:
            db: path to the database file; it must already exist.
            include_gt_cols: stored unchanged as ``self.include_gt_cols``.
            out_format: row formatter; stored as ``self.formatter``, and its
                ``predicate`` becomes the first entry of ``self.predicates``.

        Raises:
            AssertionError: if ``db`` does not exist.  The check is an
                ``assert``, so it is skipped when Python runs with ``-O``.
        """
        # NOTE(review): the default ``DefaultRowFormat(None)`` is evaluated
        # once, when this ``def`` is executed, so every instance created
        # without an explicit ``out_format`` shares that formatter object.
        assert os.path.exists(db), "%s does not exist." % db

        self.db = db
        self.query_executed = False
        self.for_browser = False
        self.include_gt_cols = include_gt_cols

        # try to connect to the provided database
        # (presumably this is what sets ``self.c`` used below -- confirm)
        self._connect_to_database()

        # extract the column names from the sample table.
        # needed for gt-filter wildcard support.
        self._collect_sample_table_columns()

        # list of samples ids for each clause in the --gt-filter
        self.sample_info = collections.defaultdict(list)

        # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indices(self.c)
        # and vice versa. e.g., self.idx_to_sample[323] ->  NA20814
        self.idx_to_sample = util.map_indices_to_samples(self.c)
        self.idx_to_sample_object = util.map_indices_to_sample_objects(self.c)
        self.formatter = out_format
        # the formatter's own predicate seeds the predicate list
        self.predicates = [self.formatter.predicate]
Пример #5
0
def get_gtcounts_by_sample(c, args):
    """
    Report the count of each genotype class
    observed for each sample.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    # report.
    print '\t'.join([
        'sample', 'num_hom_ref', 'num_het', 'num_hom_alt', 'num_unknown',
        'total'
    ])

    query = "SELECT *, \
             (num_hom_ref + num_het + num_hom_alt + num_unknown) as total \
             FROM sample_genotype_counts"

    c.execute(query)
    # count the number of each genotype type obs. for each sample.
    for row in c:
        sample = idx_to_sample[row['sample_id']]
        print "\t".join(
            str(s) for s in [
                sample, row['num_hom_ref'], row['num_het'], row['num_hom_alt'],
                row['num_unknown'], row['total']
            ])
Пример #6
0
def sample_variants(conn, metadata, args):
    idx_to_sample = util.map_indices_to_samples(metadata)
    query = "SELECT variant_id, gt_types, gts, gene, impact, biotype, \
                    in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all, chrom, \
                    start, end  \
             FROM variants"
    res = conn.execute(query)

    if args.command == 'interactions':
        #header
        if args.var_mode:
            print "\t".join(['sample','gene','order_of_interaction', \
                             'interacting_gene', 'var_id', 'chrom', 'start', \
                             'end', 'impact', 'biotype', 'in_dbsnp', \
                             'clinvar_sig', 'clinvar_disease_name', 'aaf_1kg_all', \
                             'aaf_esp_all'])

        if (not args.var_mode):
            print "\t".join(['sample','gene','order_of_interaction', \
                     'interacting_gene'])
        sample_gene_interactions(res, args, idx_to_sample)

    elif args.command == 'lof_interactions':
        samples = get_variant_genes(res, args, idx_to_sample)
        return samples
Пример #7
0
def sample_variants(c, args):
    idx_to_sample = util.map_indices_to_samples(c)
    query = "SELECT variant_id, gt_types, gts, gene, impact, biotype, \
                    in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all, chrom, \
                    start, end  \
             FROM variants"

    c.execute(query)

    if args.command == 'interactions':
        #header
        if args.var_mode:
            print "\t".join(['sample','gene','order_of_interaction', \
                             'interacting_gene', 'var_id', 'chrom', 'start', \
                             'end', 'impact', 'biotype', 'in_dbsnp', \
                             'clinvar_sig', 'clinvar_disease_name', 'aaf_1kg_all', \
                             'aaf_esp_all'])

        if (not args.var_mode):
            print "\t".join(['sample','gene','order_of_interaction', \
                     'interacting_gene'])
        sample_gene_interactions(c, args, idx_to_sample)

    elif args.command == 'lof_interactions':
        samples = get_variant_genes(c, args, idx_to_sample)
        return samples
Пример #8
0
def get_ind_lof(c, args):

    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print '\t'.join([
        'chrom', 'start', 'end', 'ref', 'alt', 'highest_impact', 'aa_change',
        'var_trans_pos', 'trans_aa_length', 'var_trans_pct', 'sample',
        'genotype', 'gene', 'transcript', 'trans_type'
    ])

    for r in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(r['gt_types'])))
        gts = np.array(cPickle.loads(zlib.decompress(r['gts'])))
        gene = str(r['gene'])
        trans = str(r['transcript'])

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            try:
                #transcript_pos for snpEff annotated VCF
                transcript_pos = re.findall('\S(\d+)\S', aa_change)[0]
            except IndexError:
                #transcript_pos for VEP annotated VCF
                if aa_length != 'None' and \
                        aa_length.split("/")[0] != "-":
                    transcript_pos = aa_length.split("/")[0]
        #transcript_pct for snpEff annotated VCF
        if aa_length != 'None' and "/" not in aa_length:
            transcript_pct = float(transcript_pos) / float(aa_length)
        #transcript_pct for VEP annotated VCF
        elif aa_length != 'None' and "/" in aa_length:
            transcript_pct = float(transcript_pos) / float(
                aa_length.split("/")[1])

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print "\t".join([
                    r['chrom'],
                    str(r['start']),
                    str(r['end']), r['ref'], r['alt'], r['impact'],
                    r['aa_change'] or 'None', transcript_pos or 'None',
                    r['aa_length'] or 'None',
                    str(transcript_pct) or 'None', idx_to_sample[idx],
                    gts[idx], gene, trans, r['biotype'] or 'None'
                ])
Пример #9
0
def get_genotypes(c, args):
    """For each variant, report each sample's genotype
       on a separate line.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    c.execute(query)

    # build a list of all the column indices that are NOT
    # gt_* columns.  These will be the columns reported
    (col_names, non_gt_idxs) = \
        util.get_col_names_and_indices(c.description, ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print args.separator.join(col for col in col_names)
    for row in c:
        gts = np.array(cPickle.loads(zlib.decompress(row['gts'])))
        for idx, gt in enumerate(gts):
            # xrange(len(row)-1) to avoid printing v.gts
            print args.separator.join(str(row[i]) for i in xrange(len(row)-1)),
            print args.separator.join([idx_to_sample[idx], gt])
Пример #10
0
    def __init__(self,
                 db,
                 include_gt_cols=False,
                 out_format=DefaultRowFormat(None)):
        """Initialise query state for the database file at ``db``.

        Args:
            db: path to the database file; it must already exist.
            include_gt_cols: stored unchanged as ``self.include_gt_cols``.
            out_format: row formatter; stored as ``self.formatter``, and its
                ``predicate`` becomes the first entry of ``self.predicates``.

        Raises:
            AssertionError: if ``db`` does not exist.  The check is an
                ``assert``, so it is skipped when Python runs with ``-O``.
        """
        # NOTE(review): the default ``DefaultRowFormat(None)`` is evaluated
        # once, when this ``def`` is executed, so every instance created
        # without an explicit ``out_format`` shares that formatter object.
        assert os.path.exists(db), "%s does not exist." % db

        self.db = db
        self.query_executed = False
        self.for_browser = False
        self.include_gt_cols = include_gt_cols

        # try to connect to the provided database
        # (presumably this is what sets ``self.c`` used below -- confirm)
        self._connect_to_database()

        # extract the column names from the sample table.
        # needed for gt-filter wildcard support.
        self._collect_sample_table_columns()

        # list of samples ids for each clause in the --gt-filter
        self.sample_info = collections.defaultdict(list)

        # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indices(self.c)
        # and vice versa. e.g., self.idx_to_sample[323] ->  NA20814
        self.idx_to_sample = util.map_indices_to_samples(self.c)
        self.idx_to_sample_object = util.map_indices_to_sample_objects(self.c)
        self.formatter = out_format
        # the formatter's own predicate seeds the predicate list
        self.predicates = [self.formatter.predicate]
Пример #11
0
def get_ind_lof(c, args):
    
    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print '\t'.join(['chrom', 'start', 'end', 'ref', 'alt',
                     'highest_impact', 'aa_change', 'var_trans_pos',
                     'trans_aa_length', 'var_trans_pct',
                     'sample', 'genotype', 'gene', 'transcript', 'trans_type'])

    for r in c:
        gt_types = Z.unpack_genotype_blob(r['gt_types'])
        gts = Z.unpack_genotype_blob(r['gts'])
        gene = str(r['gene'])
        trans = str(r['transcript'])

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            try:
                #transcript_pos for snpEff annotated VCF
                transcript_pos = re.findall('\S(\d+)\S', aa_change)[0]
            except IndexError:
                #transcript_pos for VEP annotated VCF
                if aa_length != 'None' and \
                        aa_length.split("/")[0] != "-":
                    transcript_pos = aa_length.split("/")[0] 
        #transcript_pct for snpEff annotated VCF        
        if aa_length != 'None' and "/" not in aa_length:
            transcript_pct = float(transcript_pos) / float(aa_length)
        #transcript_pct for VEP annotated VCF
        elif aa_length != 'None' and "/" in aa_length:
            transcript_pct = float(transcript_pos) / float(aa_length.split("/")[1])

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print "\t".join([r['chrom'], str(r['start']),
                                 str(r['end']), r['ref'], r['alt'],
                                 r['impact'],
                                 r['aa_change'] or 'None',
                                 transcript_pos or 'None',
                                 r['aa_length'] or 'None',
                                 str(transcript_pct) or 'None',
                                 idx_to_sample[idx],
                                 gts[idx], gene, trans, r['biotype'] or 'None'])
Пример #12
0
def get_ind_lof(c, args):

    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print '\t'.join(['chrom', 'start', 'end', 'ref', 'alt',
                     'highest_impact', 'aa_change', 'var_trans_pos',
                     'trans_aa_length', 'var_trans_pct',
                     'sample', 'genotype', 'gene', 'transcript', 'trans_type'])

    for r in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(r['gt_types'])))
        gts = np.array(cPickle.loads(zlib.decompress(r['gts'])))
        gene = str(r['gene'])
        trans = str(r['transcript'])

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            transcript_pos = re.findall('\S(\d+)\S', aa_change)[0]
            if aa_length != 'None':
                transcript_pct = float(transcript_pos) / float(aa_length)

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print "\t".join([r['chrom'], str(r['start']),
                                 str(r['end']), r['ref'], r['alt'],
                                 r['impact'],
                                 r['aa_change'] or 'None',
                                 transcript_pos or 'None',
                                 r['aa_length'] or 'None',
                                 str(transcript_pct) or 'None',
                                 idx_to_sample[idx],
                                 gts[idx], gene, trans, r['biotype']])
Пример #13
0
    def __init__(self, db, include_gt_cols=False,
                 out_format=DefaultRowFormat(None)):
        """Initialise query state for the database file at ``db``.

        Args:
            db: path to the database file; it must already exist.
            include_gt_cols: stored unchanged as ``self.include_gt_cols``.
            out_format: row formatter; stored as ``self.formatter``, and its
                ``predicate`` becomes the first entry of ``self.predicates``.

        Raises:
            AssertionError: if ``db`` does not exist.  The check is an
                ``assert``, so it is skipped when Python runs with ``-O``.
        """
        # NOTE(review): the default ``DefaultRowFormat(None)`` is evaluated
        # once, when this ``def`` is executed, so every instance created
        # without an explicit ``out_format`` shares that formatter object.
        assert os.path.exists(db), "%s does not exist." % db

        self.db = db
        self.query_executed = False
        self.for_browser = False
        self.include_gt_cols = include_gt_cols

        # presumably this is what sets ``self.c`` used below -- confirm
        self._connect_to_database()
        # map sample names to indices. e.g. self.sample_to_idx[NA20814] -> 323
        self.sample_to_idx = util.map_samples_to_indices(self.c)
        # and vice versa. e.g., self.idx_to_sample[323] ->  NA20814
        self.idx_to_sample = util.map_indices_to_samples(self.c)
        self.idx_to_sample_object = util.map_indices_to_sample_objects(self.c)
        self.formatter = out_format
        # the formatter's own predicate seeds the predicate list
        self.predicates = [self.formatter.predicate]
Пример #14
0
def get_variants_by_sample(c, args):
    """
    Report the number of variants observed for each sample
    where the sample had a non-ref genotype
    """
    idx_to_sample = util.map_indices_to_samples(c)

    # report.
    print '\t'.join(['sample', 'total'])

    query = "SELECT sample_id, \
             (num_het + num_hom_alt) as total \
             FROM sample_genotype_counts"

    c.execute(query)
    for row in c:
        sample = idx_to_sample[row['sample_id']]
        print "\t".join(str(s) for s in [sample, row['total']])
Пример #15
0
def get_ind_pathways(conn, metadata, args):

    idx_to_sample = util.map_indices_to_samples(metadata)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             i.impact, v.gt_types, v.gts, i.gene, \
                             i.transcript \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id"

    res = conn.execute(sql.text(query))

    # header
    print '\t'.join(['chrom', 'start', 'end', 'ref', 'alt', \
                     'impact', 'sample', 'genotype', \
                     'gene', 'transcript', 'pathway'])

    _report_variant_pathways(res, args, idx_to_sample)
Пример #16
0
def get_variants_by_sample(c, args):
    """
    Report the number of variants observed for each sample
    where the sample had a non-ref genotype
    """
    idx_to_sample = util.map_indices_to_samples(c)

    # report.
    print '\t'.join(['sample', 'total'])

    query = "SELECT sample_id, \
             (num_het + num_hom_alt) as total \
             FROM sample_genotype_counts"
    c.execute(query)
    for row in c:
        sample = idx_to_sample[row['sample_id']]
        print "\t".join(str(s) for s in [sample,
                                         row['total']])
Пример #17
0
def get_ind_lof_pathways(c, args):

    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             i.impact, v.gt_types, v.gts, i.gene, \
                             i.transcript \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1'"

    c.execute(query)

    # header
    print '\t'.join(['chrom', 'start', 'end', 'ref', 'alt', \
                     'impact', 'sample', 'genotype', \
                     'gene', 'transcript', 'pathway'])

    _report_variant_pathways(c, args, idx_to_sample)
Пример #18
0
def get_ind_lof_pathways(c, args):

    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             i.impact, v.gt_types, v.gts, i.gene, \
                             i.transcript \
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1'"

    c.execute(query)

    # header
    print '\t'.join(['chrom', 'start', 'end', 'ref', 'alt', \
                     'impact', 'sample', 'genotype', \
                     'gene', 'transcript', 'pathway'])

    _report_variant_pathways(c, args, idx_to_sample)
Пример #19
0
def sample_lof_variants(conn, metadata, args, samples):
    idx_to_sample = util.map_indices_to_samples(metadata)
    query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"
    res = conn.execute(query)

    #header
    if args.var_mode:
        print "\t".join(['sample','lof_gene','order_of_interaction', \
                    'interacting_gene', 'var_id', 'chrom', 'start', \
                    'end', 'impact','biotype','in_dbsnp', 'clinvar_sig', \
                    'clinvar_disease_name', 'aaf_1kg_all','aaf_esp_all'])

    elif (not args.var_mode):
        print "\t".join(['sample','lof_gene','order_of_interaction', \
                         'interacting_gene'])

    sample_lof_interactions(res, args, idx_to_sample, samples)
Пример #20
0
def get_genotypes(conn, metadata, args):
    """For each variant, report each sample's genotype
       on a separate line.
    """
    idx_to_sample = util.map_indices_to_samples(metadata)

    query = "SELECT  v.chrom, v.start, v.end, \
                     v.ref, v.alt, \
                     v.type, v.sub_type, \
                     v.aaf, v.in_dbsnp, v.gene, \
                     v.gts \
             FROM    variants v \
             ORDER BY chrom, start"
    res = conn.execute(sql.text(query))

    # build a list of all the column indices that are NOT
    # gt_* columns.  These will be the columns reported
    (col_names, non_gt_idxs) = \
        util.get_col_names_and_indices(metadata.tables["variants"], ignore_gt_cols=True)
    col_names.append('sample')
    col_names.append('genotype')

    if args.use_header:
        print args.separator.join(col for col in col_names)

    unpack = Z.unpack_genotype_blob
    import zlib


    for row in res:
        try:
            gts = unpack(row['gts'])
        except zlib.error:
            unpack = Z.snappy_unpack_blob
            gts = unpack(row['gts'])

        for idx, gt in enumerate(gts):
            # xrange(len(row)-1) to avoid printing v.gts
            a = args.separator.join(str(row[i]) for i in xrange(len(row)-1))
            b = args.separator.join([idx_to_sample[idx], gt])
            print args.separator.join((a, b))
Пример #21
0
def sample_lof_variants(c, args, samples):
    idx_to_sample = util.map_indices_to_samples(c)
    query = "SELECT chrom, start, end, \
                             gt_types, gts, gene \
             FROM variants \
             WHERE is_lof='1'"

    c.execute(query)

    #header
    if args.var_mode:
        print "\t".join(['sample','lof_gene','order_of_interaction', \
                    'interacting_gene', 'var_id', 'chrom', 'start', \
                    'end', 'impact','biotype','in_dbsnp', 'clinvar_sig', \
                    'clinvar_disease_name', 'aaf_1kg_all','aaf_esp_all'])

    elif (not args.var_mode):
        print "\t".join(['sample','lof_gene','order_of_interaction', \
                         'interacting_gene'])

    sample_lof_interactions(c, args, idx_to_sample, samples)
Пример #22
0
def sample_variants(c, args):
    idx_to_sample = util.map_indices_to_samples(c)
    query = "SELECT variant_id, gt_types, gts, gene, impact, biotype, \
                    in_dbsnp, clinvar_sig, clinvar_disease_name, aaf_1kg_all, aaf_esp_all, chrom, \
                    start, end  \
             FROM variants"
    c.execute(query)

    if args.command == "interactions":
        # header
        if args.var_mode:
            print "\t".join(
                [
                    "sample",
                    "gene",
                    "order_of_interaction",
                    "interacting_gene",
                    "var_id",
                    "chrom",
                    "start",
                    "end",
                    "impact",
                    "biotype",
                    "in_dbsnp",
                    "clinvar_sig",
                    "clinvar_disease_name",
                    "aaf_1kg_all",
                    "aaf_esp_all",
                ]
            )

        if not args.var_mode:
            print "\t".join(["sample", "gene", "order_of_interaction", "interacting_gene"])
        sample_gene_interactions(c, args, idx_to_sample)

    elif args.command == "lof_interactions":
        samples = get_variant_genes(c, args, idx_to_sample)
        return samples
Пример #23
0
def get_gtcounts_by_sample(c, args):
    """
    Report the count of each genotype class
    observed for each sample.
    """
    idx_to_sample = util.map_indices_to_samples(c)

    # report.
    print '\t'.join(['sample', 'num_hom_ref', 'num_het',
                     'num_hom_alt', 'num_unknown', 'total'])

    query = "SELECT *, \
             (num_hom_ref + num_het + num_hom_alt + num_unknown) as total \
             FROM sample_genotype_counts"
    c.execute(query)
    # count the number of each genotype type obs. for each sample.
    for row in c:
        sample = idx_to_sample[row['sample_id']]
        print "\t".join(str(s) for s in [sample,
                                         row['num_hom_ref'],
                                         row['num_het'],
                                         row['num_hom_alt'],
                                         row['num_unknown'],
                                         row['total']])
Пример #24
0
def get_ind_lof(c, args):

    idx_to_sample = util.map_indices_to_samples(c)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    c.execute(query)

    # header
    print "\t".join(
        [
            "chrom",
            "start",
            "end",
            "ref",
            "alt",
            "highest_impact",
            "aa_change",
            "var_trans_pos",
            "trans_aa_length",
            "var_trans_pct",
            "sample",
            "genotype",
            "gene",
            "transcript",
            "trans_type",
        ]
    )

    for r in c:
        gt_types = np.array(cPickle.loads(zlib.decompress(r["gt_types"])))
        gts = np.array(cPickle.loads(zlib.decompress(r["gts"])))
        gene = str(r["gene"])
        trans = str(r["transcript"])

        aa_change = str(r["aa_change"])
        aa_length = str(r["aa_length"])
        transcript_pos = None
        transcript_pct = None
        if aa_change != "None":
            transcript_pos = re.findall("\S(\d+)\S", aa_change)[0]
            if aa_length != "None":
                transcript_pct = float(transcript_pos) / float(aa_length)

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print "\t".join(
                    [
                        r["chrom"],
                        str(r["start"]),
                        str(r["end"]),
                        r["ref"],
                        r["alt"],
                        r["impact"],
                        r["aa_change"] or "None",
                        transcript_pos or "None",
                        r["aa_length"] or "None",
                        str(transcript_pct) or "None",
                        idx_to_sample[idx],
                        gts[idx],
                        gene,
                        trans,
                        r["biotype"],
                    ]
                )
Пример #25
0
def get_ind_lof(conn, metadata, args):

    idx_to_sample = util.map_indices_to_samples(metadata)

    query = "SELECT v.chrom, v.start, v.end, v.ref, v.alt, \
                             v.impact, v.aa_change, v.aa_length, \
                             v.gt_types, v.gts, i.gene, \
                             i.transcript,  i.biotype\
             FROM variants v, variant_impacts i \
             WHERE v.variant_id = i.variant_id \
             AND i.is_lof='1' \
             AND v.type = 'snp'"

    res = conn.execute(sql.text(query))

    # header
    print '\t'.join([
        'chrom', 'start', 'end', 'ref', 'alt', 'highest_impact', 'aa_change',
        'var_trans_pos', 'trans_aa_length', 'var_trans_pct', 'sample',
        'genotype', 'gene', 'transcript', 'trans_type'
    ])

    unpack = Z.unpack_genotype_blob
    for r in res:
        try:
            gt_types = unpack(r['gt_types'])
            gts = unpack(r['gts'])
        except:
            unpack = Z.snappy_unpack_blob
            gt_types = unpack(r['gt_types'])
            gts = unpack(r['gts'])

        gene = str(r['gene'])
        trans = str(r['transcript'])

        aa_change = str(r['aa_change'])
        aa_length = str(r['aa_length'])
        transcript_pos = None
        transcript_pct = None
        if aa_change != 'None':
            try:
                #transcript_pos for snpEff annotated VCF
                transcript_pos = re.findall('\S(\d+)\S', aa_change)[0]
            except IndexError:
                #transcript_pos for VEP annotated VCF
                if aa_length != 'None' and \
                        aa_length.split("/")[0] != "-":
                    transcript_pos = aa_length.split("/")[0]
        #handle non exonic variants
        if transcript_pos is None:
            transcript_pct = '/'
        #transcript_pct for snpEff annotated VCF
        elif aa_length != 'None' and "/" not in aa_length:
            transcript_pct = float(transcript_pos) / float(aa_length)
        #transcript_pct for VEP annotated VCF
        elif aa_length != 'None' and "/" in aa_length:
            transcript_pct = float(transcript_pos) / float(
                aa_length.split("/")[1])

        for idx, gt_type in enumerate(gt_types):
            if gt_type == HET or gt_type == HOM_ALT:
                print "\t".join([
                    r['chrom'],
                    str(r['start']),
                    str(r['end']), r['ref'], r['alt'], r['impact'],
                    r['aa_change'] or 'None', transcript_pos or 'None',
                    r['aa_length'] or 'None',
                    str(transcript_pct) or 'None', idx_to_sample[idx],
                    gts[idx], gene, trans, r['biotype'] or 'None'
                ])