예제 #1
0
def get_dbscSNV_ant(chrom, pos, ref, alt):
    '''Look up the dbscSNV scores of a variant (splicing-effect check).

    Opens the per-chromosome file '/dbscSNV/dbscSNV1.1.chr<chrom>.gz' with
    Tabix, queries the window (pos - 1, pos + 1) and returns the scores of
    the first record whose position, ref and alt all match.

    Args:
        chrom (str): Chromosome label; concatenated into the file name and
            passed to the tabix query.
        pos (int): Variant position, compared against column 2 of a record.
        ref (str): Reference allele, compared against column 3.
        alt (str): Alternate allele, compared against column 4.

    Returns:
        tuple: (ada_score, rf_score). ada_score is a float and rf_score is
        the score text taken from the last column. Either element is ''
        when the file holds no value for it (empty or '.'), and ('', '') is
        returned when no record matches.
    '''
    missing = ('', '.')
    f = '/dbscSNV/dbscSNV1.1.chr' + chrom + '.gz'
    fo = Tabix(f)
    for rec in fo.query(chrom, pos - 1, pos + 1):
        if int(rec[1]) == pos and rec[2] == ref and rec[3] == alt:
            # The two scores are read from the last two columns.
            ada_score, rf_score = rec[-2].strip(), rec[-1].strip()
            # Only the placeholder is blanked out; a real RF score is kept.
            # (The previous condition was inverted and discarded every real
            # score while letting '.' through.)
            if rf_score in missing:
                rf_score = ''
            # A placeholder ADA score cannot be converted to float; report
            # it with the same '' sentinel used for "not found".
            if ada_score in missing:
                return '', rf_score
            return float(ada_score), rf_score
    return '', ''
예제 #2
0
def _get_coord_depth(invcf, capture_file_path, genelist_config, patient_capture_config):
    """Compute the mean per-exon depth value for every sample.

    For each capture version returned by _load_patient_capture, the matching
    capture BED file is loaded for the configured gene list, the VCF is
    queried for every exon, and one value per record is collected for each
    sample that belongs to that capture version.

    Args:
        invcf: Path of the tabix-indexed VCF to query.
        capture_file_path: Directory that holds the capture BED files.
        genelist_config: Passed to _load_genelist.
        patient_capture_config: Passed to _load_patient_capture.

    Returns:
        dict: sample id -> gene -> (chrom, start, end, exon_number) -> mean
        of the values collected for that exon (np.average).
    """
    genes = _load_genelist(genelist_config)
    samples_by_version = _load_patient_capture(patient_capture_config)

    # Capture BED file name for each capture version.
    bed_by_version = {'v2': '4-Hopkins_clinical_panel_capture_v2paper.bed',
                      'v1b': '4-Hopkins_clinical_panel_capture_v1b.bed'}

    exon_means = {}
    vcf = Tabix(invcf)
    vcf_samples = _get_sample_ids(invcf)

    for version, version_samples in samples_by_version.items():
        bed_path = os.path.join(capture_file_path, bed_by_version[version])
        exons_by_gene = _load_capture(bed_path, genes)
        for gene, exons in exons_by_gene.items():
            for exon_no, (chrom, start, end) in enumerate(exons, 1):
                exon_key = (chrom, start, end, exon_no)
                values_by_sample = {}
                for record in vcf.query(chrom, start, end):
                    # Genotype columns start at index 9 of a VCF record.
                    for sample, call in zip(vcf_samples, record[9:]):
                        if sample not in version_samples:
                            continue
                        if call == './.':
                            value = 0
                        else:
                            fields = call.split(':')
                            # Short genotype strings carry the value last,
                            # longer ones in the third field (presumably
                            # the depth; verify against the VCF FORMAT).
                            value = int(fields[-1] if len(fields) <= 3
                                        else fields[2])
                        values_by_sample.setdefault(sample, []).append(value)

                # Average the collected values of this exon per sample.
                for sample, values in values_by_sample.items():
                    per_gene = exon_means.setdefault(sample, {})
                    per_gene.setdefault(gene, {})[exon_key] = np.average(values)
    return exon_means
예제 #3
0
def gerp(vf, af, name="gerp"):
    """Average the fifth column of the tabix records overlapping each interval.

    Args:
        vf: Interval file opened with BedTool; each interval must expose
            chrom, start, end and name.
        af: Tabix-indexed file whose fifth column (index 4) holds the
            numeric score.
        name (str): Name given to the returned Series.

    Returns:
        Series: interval name -> mean score. Intervals with no overlapping
        record, or whose lookup fails, are left out.
    """
    print("inside gerp")
    v = BedTool(vf)
    t = Tabix(af)

    results = {}

    for var in v:
        try:
            scores = [float(res[4])
                      for res in t.query(var.chrom, var.start, var.end)]
            if scores:
                results[var.name] = sum(scores) / len(scores)
        except Exception:
            # Best effort: an interval that cannot be queried or parsed is
            # skipped. Catching Exception rather than using a bare except
            # keeps KeyboardInterrupt and SystemExit working.
            continue
    print("exit gerp")
    return Series(results, name=name)
예제 #4
0
# Per-sample lookup: every line of the file given as sys.argv[2] is
# "<sample>\t<tabix-indexed file>"; each entry of `positions` is looked up
# in that file.
samples = []
results = {}

for ln in open(sys.argv[2]):
	sample, fn = ln.rstrip().split("\t")
	samples.append(sample)
	results[sample] = {}

#	p = subprocess.Popen(['bcftools', 'view', fn], stdout=subprocess.PIPE)
#	vcf_reader = vcf.VCFReader(p.stdout, 'rb')
#	for record in vcf_reader:
#		pos = (record.CHROM, record.POS)

	print(fn)
	tab = Tabix(fn)
	for pos in positions:
		# NOTE(review): the region string ends with ';' - confirm that
		# Tabix.fetch expects this form.
		search = "%s:%d-%d;" % (pos[0], pos[1], pos[1])
		try:
			itr = tab.fetch(search)
			rec = next(itr)
		except StopIteration:
			print("can't find: %s" % (search,))
			# No record at this position: skip it. Falling through
			# would use an undefined `rec` or reuse the record of the
			# previous position.
			continue

		cols = rec.split("\t")

		# Columns 4-6 of the record are taken as REF, ALT and QUAL.
		record = {}
		record['REF'] = cols[3]
		record['ALT'] = cols[4]
		record['QUAL'] = float(cols[5])
예제 #5
0
def _get_coord_depth(invcf, capture_file_path, patient_capture_config,
                     sample_ids, qgene):
    """Collect per-position values across the captured exons of one gene.

    For every capture version except 'v2' (which is skipped explicitly), the
    capture BED file is loaded for `qgene`, the VCF is queried for each exon
    and, for each sample of that capture version, the record position, the
    parsed value and a label are appended to three parallel lists. Between
    consecutive exons a spacer of zero-valued positions is appended.

    Args:
        invcf: Path of the tabix-indexed VCF to query.
        capture_file_path: Directory that holds the capture BED files.
        patient_capture_config: Passed to _load_patient_capture.
        sample_ids: Passed to _load_patient_capture.
        qgene: Gene passed to _load_capture.

    Returns:
        dict: sample id -> [positions, values, labels]. The label is
        'CR-<exon number>' at the midpoint position of an exon and ''
        everywhere else.
    """
    # Load the patient capture file
    pc = _load_patient_capture(patient_capture_config, sample_ids)

    print(pc)

    # Capture BED file name for each capture version
    CAP = {
        'v2': '4-Hopkins_clinical_panel_capture_v2.bed',
        'v1b': '4-Hopkins_clinical_panel_capture_v1b.bed'
    }

    vcfo = Tabix(invcf)
    samples = _get_sample_ids(invcf)

    # Number of zero-valued positions appended after each exon but the last.
    spacer_len = 105

    coord_depth = {}
    for cver, pl in pc.items():
        if cver == 'v2':
            continue
        capture_file = os.path.join(capture_file_path, CAP[cver])
        cap_exons = _load_capture(capture_file, qgene)
        ex_cnt = len(cap_exons)

        # VCF samples belonging to this capture version. This is the same
        # for every record, so it is computed once instead of being rebuilt
        # inside the record loop (where it stayed undefined, or stale from
        # an earlier exon, whenever a query returned no records).
        tsid = [sid for sid in samples if sid in pl]

        for idx, exon in enumerate(cap_exons):
            exno = idx + 1
            chrom, sp, ep = exon
            # Midpoint of [sp, ep) without building the whole range; floor
            # division also keeps this an int under Python 3.
            midpos = sp + (ep - sp) // 2
            for rec in vcfo.query(chrom, sp, ep):
                pos = int(rec[1])
                # Genotype columns start at index 9 of a VCF record.
                for sid, gt_info in zip(samples, rec[9:]):
                    if sid not in pl:
                        continue
                    track = coord_depth.setdefault(sid, [[], [], []])
                    if gt_info == './.':
                        dp = 0
                    else:
                        gt_d = gt_info.split(':')
                        # Short genotype strings carry the value last,
                        # longer ones in the third field (presumably the
                        # depth; verify against the VCF FORMAT).
                        dp = int(gt_d[-1] if len(gt_d) <= 3 else gt_d[2])
                    track[0].append(pos)
                    track[1].append(dp)
                    track[2].append('CR-' + str(exno) if pos == midpos else '')

            # No spacer after the last exon.
            if exno == ex_cnt:
                continue
            for sid in tsid:
                track = coord_depth.setdefault(sid, [[], [], []])
                track[0].extend(range(ep + 1, ep + 1 + spacer_len))
                track[1].extend([0] * spacer_len)
                track[2].extend([''] * spacer_len)
    return coord_depth
예제 #6
0
def _generate_TSV(outdir, samples, capcoord):
    """Internal Method. Generates two tsv files (one for snv and
    other for indel) that contains the variants reported in 1000
    Genomes Project Phase 3 dataset for given genomic regions.

    The per-chromosome VCF and its tabix index are downloaded with wget
    into `outdir` when they are not already present. Multi-allelic and
    symbolic ALT records are skipped, as are records where every requested
    sample is homozygous reference.

    Args:
        outdir (str): Path where TSV files will be created
        samples (list): List of samples for which the variants
                        will be extracted from 1000 Genomes dataset.
        capcoord (list): List of bed format coordinates wrapped as
                         tuples. For eg.
                         [(chrom, start_pos, end_pos), ...]

    Returns:
        None
    """
    out_snpfile = os.path.join(outdir, 'snp_control_variants_all.tsv')
    out_indelfile = os.path.join(outdir, 'indel_control_variants_all.tsv')

    url_base = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/'
    chroms = [str(e) for e in range(1, 23)] + ['X', 'Y']
    hflag = False

    # The with-block guarantees both outputs are flushed and closed, also
    # when a download or a query fails half-way.
    with open(out_snpfile, 'w') as outsnp, open(out_indelfile, 'w') as outindel:
        for chrom in chroms:
            # NOTE(review): 'Y' goes through the generic v5a file name
            # pattern; confirm that such a file exists on the server.
            if chrom == 'X':
                vcf_name = 'ALL.chrX.phase3_shapeit2_mvncall_integrated_v1b.20130502.genotypes.vcf.gz'
            else:
                vcf_name = 'ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz' % chrom
            fp1 = url_base + vcf_name
            ofp1 = os.path.join(outdir, vcf_name)
            fp2 = fp1 + '.tbi'
            ofp2 = ofp1 + '.tbi'

            # Download the VCF and its index unless already present.
            for url, dest in ((fp1, ofp1), (fp2, ofp2)):
                if not os.path.exists(dest):
                    cmd = "wget %s -O %s" % (url, dest)
                    print(cmd)
                    os.system(cmd)

            # Read the '#CHROM' header line: 9 fixed columns, then samples.
            for line in os.popen("zcat %s | head -260 | grep '#CHROM'" % ofp1):
                fields = line.strip().split('\t')
                h = fields[:9]
                kgsamples = fields[9:]

            # The output header is written once, from the first chromosome.
            if not hflag:
                h += samples
                outsnp.write('\t'.join(h) + '\n')
                outindel.write('\t'.join(h) + '\n')
                hflag = True

            vcfo = Tabix(ofp1)
            for coord in capcoord:
                if coord[0] != chrom:
                    continue
                for rec in vcfo.query(*coord):
                    # Skip symbolic and multi-allelic ALT records.
                    if '<' in rec[4] or ',' in rec[4]:
                        continue

                    # Equal REF/ALT length goes to the SNV control set,
                    # anything else to the indel control set.
                    if len(rec[3]) == len(rec[4]):
                        out = outsnp
                    else:
                        out = outindel

                    d = dict(zip(kgsamples, rec[9:]))
                    sv = [d[sid] for sid in samples]
                    # Skip records where all requested samples share one
                    # homozygous-reference genotype.
                    if len(set(sv)) == 1 and sv[0] in ('0|0', '0/0'):
                        continue
                    out.write('\t'.join(rec[:9] + sv) + '\n')
예제 #7
0
def get_varcnt_gp(invcf, antvcf, capcoord, samples, varcnt, vao):
    '''Returns a dictionary where the keys are sampleid and values are a list
    of length 11 whose the elements are the count of -
    * Nocall
    * VarLQ
    * GTLQ
    * REF allele
    * HetALT
    * HomALT
    * Ts
    * Tv
    * Common Variants (Based on 1000 Genomes)
    * Rare Variants (Based on 1000 Genomes)
    * Novel Variants (Based on 1000 Genomes)

    Args:
        invcf: Tabix-indexed VCF holding the genotypes to count.
        antvcf: Tabix-indexed annotated VCF; its INFO column is searched
            for the KGDB/KGAF and EXACDB/EXACAF entries.
        capcoord: Iterable of coordinate tuples unpacked into the tabix
            query of `invcf`.
        samples: Sample ids to count; other samples in the VCF are ignored.
        varcnt (dict): Filled in place (one fresh counter list per sample)
            and returned.
        vao: Object whose retrieve_genedef(chrom, start, end) returns a
            falsy value for positions that should be skipped (intergenic).

    Records for which the region query of `invcf` fails are skipped.
    Genotypes with GQ < 30 are only counted as GTLQ. Every novel variant
    that is counted is also printed.
    '''

    vcfsamples = get_vcfsamples(invcf)
    for sid in samples:
        # NoCall, VarLQ, GTLQ, REF, HetALT, HomALT, Ts, Tv, Common, Rare, Novel
        varcnt[sid] = [0] * 11
    wanted = set(samples)  # O(1) membership test in the genotype loop

    vcfo = Tabix(invcf)
    antvcfo = Tabix(antvcf)
    for coord in capcoord:
        try:
            qdata = vcfo.query(*coord)
        except Exception:
            # Best effort: a region that cannot be queried is skipped.
            # Exception (not a bare except) leaves KeyboardInterrupt alone.
            continue
        if qdata is None:
            continue
        for rec in qdata:
            var_filter = rec[6]
            gts = rec[9:]
            pos = int(rec[1])

            trans = vao.retrieve_genedef(rec[0], pos, pos)
            if not trans:  # indicating its an intergenic region
                continue

            for sid, gtinfo in zip(vcfsamples, gts):
                if sid not in wanted:
                    continue
                counts = varcnt[sid]

                if gtinfo == '.' or './.' in gtinfo:  # No Call
                    counts[0] += 1
                    continue
                if var_filter != 'PASS':  # VarLQ
                    counts[1] += 1
                    continue
                if '0/0' in gtinfo:  # REF
                    counts[3] += 1
                    continue

                # The first two genotype fields are read as GT and GQ.
                gt, gq = gtinfo.split(':')[:2]
                gq = int(gq)
                a1, a2 = gt.split('/')
                a1, a2 = int(a1), int(a2)

                if gq < 30:  # GTLQ
                    # Nothing else is counted for a low-quality genotype,
                    # so the annotation lookup below is not needed.
                    counts[2] += 1
                    continue

                # Index of the ALT allele carried by this genotype. For a
                # genotype written as '1/0' the second allele is the
                # reference, so fall back to the first one instead of
                # indexing ALT with -1.
                alt_idx = (a2 if a2 > 0 else a1) - 1
                ref, alt = rec[3], rec[4].split(',')[alt_idx]
                ct = get_change_type(ref, alt)

                # Frequency class from the annotation record at the same
                # position: counter index 8 = common, 9 = rare, 10 = novel.
                freq_idx = None
                for e in antvcfo.query(rec[0], pos - 1, pos):
                    if pos != int(e[1]):
                        continue
                    info = e[7]
                    kgaf = 0.0
                    if 'KGDB' in info:
                        kgaf = float(info.split('KGAF=')[1].split(
                            ';')[0].split(',')[alt_idx])
                    exacaf = 0.0
                    if 'EXACDB' in info:
                        exacaf = float(info.split('EXACAF=')[1].split(
                            ';')[0].split(',')[alt_idx])

                    if kgaf == 0.0 and exacaf == 0.0:
                        freq_idx = 10
                    elif kgaf < 0.05 and exacaf < 0.05:
                        freq_idx = 9
                    else:
                        freq_idx = 8
                    break

                # HetALT (index 4) or HomALT (index 5); both are counted
                # the same way afterwards.
                counts[4 if a1 != a2 else 5] += 1
                if ct == 'ts':
                    counts[6] += 1
                elif ct == 'tv':
                    counts[7] += 1
                if freq_idx is not None:
                    counts[freq_idx] += 1
                    if freq_idx == 10:
                        print('%s %s %s %s %s' % (sid, rec[0], rec[1],
                                                  rec[3], rec[4]))
    return varcnt