Exemplo n.º 1
0
    def _makedb(self):
        """Build the SNP database from the input VCF. Internal use only.

        Logs an error and exits with status 1 when ``self.inname`` does not
        exist. Otherwise the output database is loaded, the tables returned
        by ``_createschema`` are created, every record of the VCF is
        converted with ``_insertentry`` and inserted into ``snps`` in
        batches of 100000 rows, the version details are stored through
        ``set_version``, and finally the work is committed and both the
        cursor and the connection are closed.
        """
        log = self.logger

        log.info('Creating SNP database from ...')
        log.info('Input file: %s' % self.inname)
        if not os.path.exists(self.inname):
            log.error('%s: No such file' % self.inname)
            log.error('Database not created')
            sys.exit(1)

        self.load(db=self.outname)
        self.conn.text_factory = str
        self._vcf = VCFParser(self.inname)
        self._infokeys, self._schema = self._createschema()
        for ddl in self._schema:
            self.createtable(ddl, True)

        # Five fixed columns plus one placeholder per INFO field.
        placeholders = ','.join('?' * (5 + len(self._infonum)))
        insert_sql = 'insert into snps values (' + placeholders + ')'
        self.curs = self.conn.cursor()

        batch = []
        total = 0
        read_record = self._vcf.__next__
        while True:
            # The parser signals the end of the file with StopIteration.
            try:
                row = self._insertentry(read_record())
            except StopIteration:
                break
            batch.append(row)

            total += 1
            if total % 100000 == 0:
                # Flush a full batch and start collecting the next one.
                self.curs.executemany(insert_sql, batch)
                log.info('Processed %d entries' % total)
                batch = []

        # Flush whatever is left over from the last partial batch.
        if batch:
            self.curs.executemany(insert_sql, batch)
        log.info('Processed %s entries' % total)

        # Add version details
        modified = dt.fromtimestamp(os.path.getmtime(self.inname))
        fd = modified.strftime('%Y-%m-%d')
        if 'dbsnp' in self.inname:
            version = 'v' + str(self._vcf.meta['dbSNP_BUILD_ID'])
        elif 'ESP6500' in self.inname or 'ExAC' in self.inname:
            # The version is a dot-separated field of the file name:
            # the first field for ESP6500 inputs, the second for ExAC.
            field = 0 if 'ESP6500' in self.inname else 1
            filename = os.path.split(self.inname)[1]
            stem = os.path.splitext(filename)[0]
            version = stem.split('.')[field]
        else:
            # Fall back to the year and month of the file's mtime.
            year, month = fd.split('-')[:2]
            version = "v%s_%s" % (year, month)

        db_name = os.path.split(self.outname)[1]
        self.set_version(db_name, fd, version, total)

        self.conn.commit()
        self.curs.close()
        self.conn.close()
        log.info('... SNP database created')
Exemplo n.º 2
0
def get_varcnt(invcf):
    '''Counts qualifying exonic variants per (chromosome, gene)

    A record contributes to a gene when the prioritized transcript of
    that gene has 'CodingExonic' among the '_'-separated parts of its
    region, its mutation is not 'Syn', and the record's ESPAF value is
    below 5.0. Each record is counted at most once per key, even when
    several alternate alleles hit the same gene.

    Args:
        invcf(str):    VARANT annotated VCF

    Returns:
        varcnt(dictionary):    Maps (chrom, gene) to the number of
                               qualifying records
    '''
    counts = {}
    parser = VCFParser(invcf)
    for rec in parser:
        parser.parseinfo(rec)
        prioritized = vp.prio_trans(vp.parse(rec.info))
        # Keys already counted for this record.
        counted = set()
        for altid, genes in prioritized.items():
            if altid == 'intergenic':
                continue
            for gene, geneant in genes.items():
                txant = geneant['TRANSCRIPT']
                key = (rec.chrom, gene)
                if 'CodingExonic' not in txant.region.split('_'):
                    continue
                if txant.mutation == 'Syn':
                    continue
                # TODO (ESPAF to be replaced by ExAC?)
                if not (rec.info['ESPAF'] < 5.0):
                    continue
                if key in counted:
                    continue
                counted.add(key)
                counts[key] = counts.get(key, 0) + 1
    return counts
Exemplo n.º 3
0
def _get_sample_ids(invcf):
    '''Return sample ids present in the vcf file

    Args:
        invcf:    VCF input handed to VCFParser

    Returns:
        The ``samples`` attribute of the parser opened on invcf
    '''
    vcfo = VCFParser(invcf)
    try:
        return vcfo.samples
    finally:
        # A close() placed after the return statement would never run;
        # the finally clause releases the parser on every exit path.
        vcfo.close()
Exemplo n.º 4
0
    geneannot['TRANSCRIPT'] = [ta]
    if alltrans and \
            geneannot['TRANSCRIPT'][0].region != 'Intergenic':
        for altid, geneinfo in parsed_dict.items():
            if prior_gene in geneinfo:
                for nt in geneinfo[prior_gene]['TRANSCRIPTS']:
                    if nt not in geneannot['TRANSCRIPT']:
                        geneannot['TRANSCRIPT'].append(nt)

    return geneannot


if __name__ == '__main__':
    # Demo driver: for every record of the VCF named by the first command
    # line argument, print the parsed annotation, the prioritized
    # transcript per gene, and the prioritized gene annotation.
    #
    # Every print is a call with one pre-formatted argument, which is
    # valid syntax and gives the same output on Python 2 and Python 3
    # (the bare print statement is a SyntaxError on Python 3).
    import sys
    vcffile = sys.argv[1]
    vcf = VCFParser(vcffile)
    for rec in vcf:
        print('%s %s %s %s %s' % (rec.chrom, rec.pos, rec.ref, rec.alt,
                                  rec.id))
        vcf.parseinfo(rec)
        print('Parsed Varant Annotation - ')
        par_ant = parse(rec.info)
        for ac, val in par_ant.items():
            # The Python 2 print statement writes no separator after an
            # item ending in a tab, hence no space before the value.
            print('%s \t%s' % (ac, val))
        print('Prioritized transcript per gene dictionary - ')
        pt_par_ant = prio_trans(par_ant)
        for ac, val in pt_par_ant.items():
            print('%s \t%s' % (ac, val))
        print('Prioritized gene per vcf record - ')
        ga = get_prior_geneannot(rec.info, alltrans=False)
        print(ga)
        print('\n')
Exemplo n.º 5
0
class SNPDB(db.DB):
    """SNP database built from a VCF file, one ``snps`` row per record."""

    def _getschema(self):
        """Return the base DDL statements for the database.

        The first statement is deliberately left open: ``_createschema``
        appends the INFO columns and the closing parenthesis to it.
        """
        snps_table = """create table snps
                   (chrom text not null,
                    pos int not null,
                    id text not null,
                    ref text not null,
                    alt text not null,
                  """
        position_index = """create index chromposindex on snps (chrom, pos)"""
        id_index = """create index snpid on snps (id)"""
        return [snps_table, position_index, id_index]

    def _makedb(self):
        """Build the SNP database from the input VCF. Internal use only.

        Logs an error and exits with status 1 when ``self.inname`` does not
        exist. Otherwise the output database is loaded, the tables returned
        by ``_createschema`` are created, every record of the VCF is
        converted with ``_insertentry`` and inserted into ``snps`` in
        batches of 100000 rows, the version details are stored through
        ``set_version``, and finally the work is committed and both the
        cursor and the connection are closed.
        """
        log = self.logger

        log.info('Creating SNP database from ...')
        log.info('Input file: %s' % self.inname)
        if not os.path.exists(self.inname):
            log.error('%s: No such file' % self.inname)
            log.error('Database not created')
            sys.exit(1)

        self.load(db=self.outname)
        self.conn.text_factory = str
        self._vcf = VCFParser(self.inname)
        self._infokeys, self._schema = self._createschema()
        for ddl in self._schema:
            self.createtable(ddl, True)

        # Five fixed columns plus one placeholder per INFO field.
        placeholders = ','.join('?' * (5 + len(self._infonum)))
        insert_sql = 'insert into snps values (' + placeholders + ')'
        self.curs = self.conn.cursor()

        batch = []
        total = 0
        read_record = self._vcf.__next__
        while True:
            # The parser signals the end of the file with StopIteration.
            try:
                row = self._insertentry(read_record())
            except StopIteration:
                break
            batch.append(row)

            total += 1
            if total % 100000 == 0:
                # Flush a full batch and start collecting the next one.
                self.curs.executemany(insert_sql, batch)
                log.info('Processed %d entries' % total)
                batch = []

        # Flush whatever is left over from the last partial batch.
        if batch:
            self.curs.executemany(insert_sql, batch)
        log.info('Processed %s entries' % total)

        # Add version details
        modified = dt.fromtimestamp(os.path.getmtime(self.inname))
        fd = modified.strftime('%Y-%m-%d')
        if 'dbsnp' in self.inname:
            version = 'v' + str(self._vcf.meta['dbSNP_BUILD_ID'])
        elif 'ESP6500' in self.inname or 'ExAC' in self.inname:
            # The version is a dot-separated field of the file name:
            # the first field for ESP6500 inputs, the second for ExAC.
            field = 0 if 'ESP6500' in self.inname else 1
            filename = os.path.split(self.inname)[1]
            stem = os.path.splitext(filename)[0]
            version = stem.split('.')[field]
        else:
            # Fall back to the year and month of the file's mtime.
            year, month = fd.split('-')[:2]
            version = "v%s_%s" % (year, month)

        db_name = os.path.split(self.outname)[1]
        self.set_version(db_name, fd, version, total)

        self.conn.commit()
        self.curs.close()
        self.conn.close()
        log.info('... SNP database created')

    def _createschema(self):
        """Extend the base schema with one column per VCF INFO field.

        Executes ``INFOTABLE``, inserts one row per INFO key into the
        ``info`` table, and records in ``self._infonum`` whether each key
        is stored as joined text (its Number entry is not found in the
        string '01').

        Returns the sorted INFO keys and the completed list of DDL
        statements.
        """
        statements = self._getschema()
        info_meta = self._vcf.meta['INFO']
        keys = sorted(info_meta.keys())

        cursor = self.conn.cursor()
        cursor.execute(INFOTABLE)
        register_sql = 'insert into info values (?,?,?,?)'
        self._infonum = {}
        columns = []
        for key in keys:
            knumb, ktype, kdesc = info_meta[key]
            multivalued = knumb not in '01'
            if multivalued:
                # Such fields are stored as text (joined in _insertentry).
                ktype = 'text'
            else:
                ktype = TYPEMAP[RTYPEMAP.get(ktype, 'text')]
            self._infonum[key] = multivalued
            cursor.execute(register_sql, ('info_' + key, ktype, knumb, kdesc))
            columns.append('info_%s %s' % (key, ktype))
        self.conn.commit()
        cursor.close()

        # Close the 'create table snps' statement left open by _getschema.
        statements[0] = statements[0] + ','.join(columns) + ')'
        self.logger.info(statements)
        return keys, statements

    def _insertentry(self, rec):
        """Convert a VCF record into the value list for one ``snps`` row.

        The five fixed columns come first, followed by one value per key
        in ``self._infokeys``: None when the record lacks the key, a
        comma-joined string when ``self._infonum`` flags the key, and the
        raw value otherwise.
        """
        self._vcf.parseinfo(rec)
        row = [
            rec.chrom,
            rec.pos,
            ';'.join(rec.id),
            rec.ref,
            ','.join(rec.alt),
        ]
        lookup = rec.info.get
        for key in self._infokeys:
            value = lookup(key, None)
            if value is not None and self._infonum[key]:
                value = ','.join(str(item) for item in value)
            row.append(value)
        return row
Exemplo n.º 6
0
def _load(invcf, thres_af, nmethod, data=None, sc=1):
    '''Collect candidate variants per sample and gene from a VCF

    Every record of the VCF is parsed; for each sample whose genotype
    passes the checks below, the prioritized transcript of every gene hit
    by the sample's alternate allele is tested against the selected
    search criteria and, when it matches, appended to data[sample][gene].

    Args:
        invcf:       VCF input handed to VCFParser
        thres_af:    Allele frequency threshold passed to isRare
        nmethod:     Passed through to _is_Damaging (used only when sc == 2)
        data(dict):  Results to extend. A falsy value (None or an empty
                     dict) is replaced by a new dict.
        sc(int):     Search criteria, 1, 2 or 3. Any other value appends
                     nothing.

    Returns:
        data(dict):  sample id -> gene -> list of tuples
                     (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig, lcr,
                      sc_ant, snps3d_pred)
    '''
    if not data:
        data = {}
    vcfs = VCFParser(invcf)
    samples = vcfs.samples
    for rec in vcfs:
        vcfs.parseinfo(rec)
        vcfs.parsegenotypes(rec)

        if not _is_HQVar(rec.filter):  # Checks variant is PASS
            continue

        for sid in samples:
            # The sample entry is created before any filtering, so every
            # sample appears in the result even without a retained variant.
            if sid not in data:
                data[sid] = {}

            gi = rec[sid]
            gt, gq = gi.GT, gi.GQ

            # Checks if genotype is not reference or GQ >= 30
            if not _genotype_check(gt, gq):
                continue

            # The alternate allele index is taken from the second allele
            # of a '/'-separated genotype.
            altid = int(gt.split('/')[1])
            # Variant identifier of the form chrom:pos:ref:alt
            var = rec.chrom + ':' + str(rec.pos) + ':' + rec.ref +\
            ':' + rec.alt[altid - 1]
            af, flag = isRare(altid, rec.info, thres_af)
            if not flag:  # Checks if variant is not Rare (AF < 5%) in ExAC
                continue

            # Records carrying the LCR info key are skipped entirely.
            if 'LCR' in rec.info:
                continue

            # ClinVar annotation: dn is the CLNDBN entry of this allele and
            # cln_sig the CLNSIG_MAP label of its significance code. For a
            # '|'-separated list of codes the highest numeric one is used.
            if 'CLNDBN' in rec.info:
                dn = rec.info.CLNDBN[altid - 1]
                sig_num = rec.info.CLNSIG[altid - 1]
                if '|' in sig_num:
                    sig_num = [int(e) for e in sig_num.split('|') if e != '.']
                    if sig_num:
                        sig_num.sort()
                        sig_num = sig_num[-1]
                        cln_sig = CLNSIG_MAP[sig_num]
                    else:
                        cln_sig, dn = '', ''
                elif sig_num != '.':
                    sig_num = int(sig_num)
                    cln_sig = CLNSIG_MAP[sig_num]
                else:
                    dn, cln_sig = '', ''
            else:
                dn, cln_sig = '', ''

            # NOTE(review): the first branch cannot be reached because
            # records with 'LCR' were skipped above, so lcr is always ''.
            if 'LCR' in rec.info:
                lcr = 'LCR'
            else:
                lcr = ''

            # CADD phred score of this allele, or '' when absent or '.'
            if 'CADD_phred' in rec.info:
                val = rec.info['CADD_phred'][altid - 1]
                if val == '.':
                    cadd = ''
                else:
                    cadd = float(val)
            else:
                cadd = ''

            # dbscSNV scores are looked up only when both ref and alt are a
            # single base; the variant is labelled 'Damaging' when either
            # score exceeds 0.6.
            if len(rec.ref) == len(rec.alt[altid - 1]) and len(rec.ref) == 1:
                ada_score, rf_score = get_dbscSNV_ant(rec.chrom, rec.pos,
                                                      rec.ref,
                                                      rec.alt[altid - 1])
                if (ada_score and ada_score > 0.6) or (rf_score
                                                       and rf_score > 0.6):
                    scpred = 'Damaging'
                else:
                    scpred = ''
            else:
                ada_score, rf_score, scpred = '', '', ''
            sc_ant = [scpred, ada_score, rf_score]

            #Parse annotation and prioritize transcript
            pa = vp.prio_trans(vp.parse(rec.info))

            # Ignore the intergenic variants
            if altid not in pa:
                continue

            # eqtl_flag is never changed in this function; it is stored as
            # False in every appended tuple.
            eqtl_flag = False
            for gene, ant in pa[altid].items():
                ta = ant['TRANSCRIPT']
                # NOTE(review): key is not used later in this function.
                key = ta.trans_id + '_' + ta.aa
                # Fresh list per gene; passed to _is_Damaging under sc == 2,
                # which presumably fills it in place - TODO confirm.
                snps3d_pred = ['', '', '', '']

                # SC-1 variant present in Clinvar as Pathogenic or Likely Pathogenic
                if sc == 1:  # Search Criteria 1
                    if (cln_sig in ['Pathogenic', 'Likely pathogenic']):
                        if gene not in data[sid]:
                            data[sid][gene] = []
                        data[sid][gene].append(
                            (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                             lcr, sc_ant, snps3d_pred))

                # SC-2 variant is protein altering + SC-1
                if sc == 2:  # Search Criteria 2
                    if (_is_PASnv(ta) and _is_Damaging(altid, rec.info, ta,
                                snps3d_pred, nmethod)) or _is_NonSense(ta) \
                                or _is_Splicing(ta) or _is_PAIndel(ta) or \
                                scpred == 'Damaging' or (cln_sig in ['Pathogenic',
                                               'Likely pathogenic']):
                        if gene not in data[sid]:
                            data[sid][gene] = []
                        data[sid][gene].append(
                            (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                             lcr, sc_ant, snps3d_pred))

                # SC-3 intronic and UTR variants + SC-1 + SC-2
                # Unlike SC-2, _is_PASnv alone is enough here; _is_Damaging
                # is not consulted.
                if sc == 3:  # Search Criteria 3
                    if _is_Intronic(ta) or _is_UTR(ta) or _is_PASnv(ta) or \
                    _is_NonSense(ta) or _is_Splicing(ta) or _is_PAIndel(ta) \
                    or scpred == 'Damaging' or cln_sig in ['Pathogenic',
                    'Likely pathogenic']:
                        if gene not in data[sid]:
                            data[sid][gene] = []
                        data[sid][gene].append(
                            (ta, af, cadd, gt, eqtl_flag, var, dn, cln_sig,
                             lcr, sc_ant, snps3d_pred))
    return data
Exemplo n.º 7
0
def _get_sample_ids(invcf):
    '''Return sample ids present in the vcf file

    Args:
        invcf:    VCF input handed to VCFParser

    Returns:
        The ``samples`` attribute of the parser opened on invcf
    '''
    vcfo = VCFParser(invcf)
    try:
        return vcfo.samples
    finally:
        # A close() placed after the return statement would never run;
        # the finally clause releases the parser on every exit path.
        vcfo.close()