def load_1kgp(self,raw=None,svtype=None,gen=None,tmp_bed=None):
    """Record the best-overlapping 1000 Genomes entry for each SV of one type.

    The entries of ``raw`` whose fourth field contains ``svtype`` are sorted
    and intersected (``f=0.8, F=0.8, wao=True``) with
    ``<resource_path>annotation_files/<gen>_1000Genomes_<svtype>.bed``; the
    result is written to ``tmp_bed``, parsed, and ``tmp_bed`` is removed.

    For every output row with a non-zero last column, ``self._1kgp`` maps
    ``tokenize_sv(row) + (row[3],)`` to ``(row[-2], fraction)``, where
    ``fraction`` is the last column divided by ``row[2] - row[1]``, formatted
    with two decimals.  Only the highest fraction per key is kept.

    Args:
        raw: iterable of SV records; fields 0-3 are used.
        svtype: SV type label used both as a filter and in the file name.
        gen: genome build label used in the annotation file name.
        tmp_bed: path of the scratch file receiving the intersect output.
    """
    selected = [
        (format_chrom(rec[0]), rec[1], rec[2], rec[3])
        for rec in raw
        if svtype in str(rec[3])
    ]
    calls = BedTool(selected).sort()
    reference_bed = '{}annotation_files/{}_1000Genomes_{}.bed'.format(
        Config().resource_path(), gen, svtype)
    calls.intersect(reference_bed, f=0.8, F=0.8, wao=True, output=tmp_bed)

    with open(tmp_bed, 'r') as handle:
        for line in handle:
            fields = tuple(line.rstrip().split('\t'))
            locus = tokenize_sv(fields) + (str(fields[3]),)
            # With wao=True the final column is the overlap size; 0 means no hit.
            if int(fields[-1]) == 0:
                continue
            span = int(fields[2]) - int(fields[1])
            fraction = format(float(fields[-1]) / span, '.2f')
            best = self._1kgp.get(locus)
            # Keep the first hit, then replace it only with a strictly better one.
            if best is None or float(fraction) > float(best[1]):
                self._1kgp[locus] = (fields[-2], fraction)
    os.remove(tmp_bed)
def init_header(self, date=None, ids=None, Structural_Variant=None, gen=None):
    """Build the VCF header lines and store them in ``self.head``.

    Chromosome lengths are read from the tab-separated two-column file
    ``<resource_path><gen>.genome``.  A ``##contig`` line is emitted for each
    chromosome name in ``Structural_Variant.raw`` (first field of each
    record) that has a known length, in first-seen order.

    Compared with the previous revision, three malformed meta-information
    lines are repaired so the header parses as VCF: the DESCRIPTION INFO line
    was missing its closing ``>``, the GENES INFO line ended in ``>"`` rather
    than ``">``, and the GAP FILTER line had a stray ``>`` after the ID.

    Args:
        date: value written to ``##fileDate``.
        ids: iterable of sample-id strings appended to the ``#CHROM`` line.
        Structural_Variant: object exposing ``raw``, an iterable of SV
            records.
        gen: genome build label used to locate the ``.genome`` file.
    """
    from sv2Config import Config

    chroms, refs = {}, OrderedDict()
    with open('{}{}.genome'.format(Config().resource_path(), gen), 'r') as f:
        for l in f:
            chrom, leng = l.rstrip().split('\t')
            chroms[format_chrom(chrom)] = leng

    for x in Structural_Variant.raw:
        length = chroms.get(format_chrom(x[0]))
        if length is not None:
            # Keyed by the name as it appears in the SV record, not the
            # normalised one, so contig IDs match the records' CHROM field.
            refs[x[0]] = length

    contigs = [
        '##contig=<ID={},length={}>'.format(chrom, refs[chrom])
        for chrom in refs
    ]

    self.head = [
        '##fileformat=VCFv4.1',
        '##fileDate={}'.format(date),
        '##SV2_CMD="{}"'.format(' '.join(map(str, sys.argv[:]))),
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variant">',
        '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
        '##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">',
        '##INFO=<ID=DENOVO_FILTER,Number=1,Type=String,Description="Stringent filter status, recommended for de novo mutation discovery">',
        '##INFO=<ID=REF_GTL,Number=1,Type=Float,Description="Median Phred-adjusted REF genotype likelihood">',
        '##INFO=<ID=AF,Number=1,Type=Float,Description="Alternate allele frequency,in the range (0,1)">',
        '##INFO=<ID=CYTOBAND,Number=.,Type=String,Description="Cytoband(s) overlapping the variant">',
        '##INFO=<ID=REPEATMASKER,Number=2,Type=String,Description="Name and reciprocal overlap of RepeatMasker variant">',
        '##INFO=<ID=1000G_ID,Number=1,Type=String,Description="1000 Genomes Phase 3 integrated SV callset variant identifier">',
        '##INFO=<ID=1000G_OVERLAP,Number=1,Type=Float,Description="Overlap to 1000 Genomes Phase 3 variant, in the range (0,1)">',
        # Fixed: closing '>' was missing.
        '##INFO=<ID=DESCRIPTION,Number=1,Type=String,Description="Verbose description of SV, 1-based coordinates">',
        # Fixed: line ended with '>"' instead of '">'.
        '##INFO=<ID=GENES,Number=1,Type=String,Description="Genes overlapping the variant, pipe-separated by transcripts">',
        '##INFO=<ID=ABPARTS,Number=1,Type=Float,Description="Overlap to antibody parts, in the range (0,1)">',
        '##INFO=<ID=CENTROMERE,Number=1,Type=Float,Description="Centromere overlap, in the range (0,1)">',
        '##INFO=<ID=GAP,Number=1,Type=Float,Description="Overlap to gaps in the reference, in the range (0,1)">',
        '##INFO=<ID=SEGDUP,Number=1,Type=Float,Description="Segmental duplication overlap, in the range (0,1)">',
        '##INFO=<ID=STR,Number=1,Type=Float,Description="Short tandem repeat overlap, in the range (0,1)">',
        '##INFO=<ID=UNMAPPABLE,Number=1,Type=Float,Description="Overlap to DAC Blacklisted Regions, in the range (0,1)">',
        '##FILTER=<ID=ABPARTS,Description="Variant overlaps to antibody parts >50%">',
        '##FILTER=<ID=CENTROMERE,Description="Variant overlaps to centromere >50%">',
        # Fixed: stray '>' after the ID closed the structured field too early.
        '##FILTER=<ID=GAP,Description="Variant overlaps to reference gaps >50%">',
        '##FILTER=<ID=GENOTYPEFAIL,Description="Variant was unable to be genotyped">',
        '##FILTER=<ID=NOALT,Description="No alternate allele detected">',
        '##FILTER=<ID=SEGDUP,Description="Variant overlaps to segmental duplications >50%">',
        '##FILTER=<ID=STR,Description="Variant overlaps to short tandem repeats >50%">',
        '##FILTER=<ID=UNMAPPABLE,Description="Variant overlaps to DAC Blacklisted Regions regions >50%">',
        '##FILTER=<ID=FAIL,Description="Variant failed standard filters">',
        '##FILTER=<ID=PASS,Description="Variant passed standard filters">',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
        '##FORMAT=<ID=CN,Number=1,Type=Float,Description="Copy number estimate">',
        '##FORMAT=<ID=PE,Number=1,Type=Float,Description="Normalized discordant paired-end count">',
        '##FORMAT=<ID=SR,Number=1,Type=Float,Description="Normalized split-read count">',
        '##FORMAT=<ID=SC,Number=1,Type=Float,Description="SNV normalized coverage">',
        '##FORMAT=<ID=NS,Number=1,Type=Integer,Description="Number of SNVs within locus">',
        '##FORMAT=<ID=HA,Number=1,Type=Float,Description="Heterozygous allele ratio">',
        '##FORMAT=<ID=NH,Number=1,Type=Integer,Description="Number of heterozygous SNVs">',
        '##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Phred-scaled genotype likelihood">',
        '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Phred-scaled genotype likelihoods in the order, REF:(0/0), HET:(0/1), HOM:(1/1)">',
        '##ALT=<ID=DEL,Description="Deletion, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
        '##ALT=<ID=DUP,Description="Duplication, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
        # All contig lines are kept as a single newline-joined element, as before.
        '{}'.format('\n'.join(contigs)),
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(
            '\t'.join(ids)),
    ]
def main():
    """Command-line entry point that builds SV2 training features.

    Steps: parse arguments, validate inputs, preprocess each BAM (or reuse
    the ``*sv2_preprocessing.txt`` files found under ``-pre``), extract
    features for each BAM (or reuse the ``*sv2_features.txt`` files found
    under ``-feats``), then pass all feature rows to ``sv2_train_output``
    and remove the temporary directory.

    Fixes relative to the previous revision:

    * the log handle is closed only when ``-L`` was given (previously a
      ``NameError`` was raised at the very end of a run without ``-L``);
    * a missing ``-i`` no longer causes a ``NameError`` on ``Bams``:
      BAMs are demanded up front whenever features must be extracted
      (no ``-feats``), and preprocessing is skipped when there are none;
    * directory scans read each file once through a context manager, so
      handles are no longer leaked;
    * the genome error message lists every accepted build.

    Exits with status 0 after printing usage, or 1 on a fatal input error.
    """
    def _fatal(message):
        # Report a fatal input error on stdout and stderr, then exit(1).
        # Single-argument print(...) behaves the same as the print statement
        # used elsewhere in this Python 2 code base.
        print(message)
        sys.stderr.write(message + '\n')
        sys.exit(1)

    def _column_values(path, column):
        # Return the distinct values found in `column` of the non-comment
        # lines of `path`; files with at most one line yield an empty set.
        with open(path) as handle:
            lines = handle.readlines()
        if len(lines) <= 1:
            return set()
        return set(
            line.rstrip('\n').split('\t')[column]
            for line in lines
            if not line.startswith('#')
        )

    init_time = int(time())
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,usage=splash.replace(" ","",1)+__useage___,add_help=False)
    inArgs = parser.add_argument_group('input arguments')
    genoArgs = parser.add_argument_group('genotype arguments')
    optArgs = parser.add_argument_group('optional arguments')
    inArgs.add_argument('-i','-bam',type=str,default=None,nargs='*')
    inArgs.add_argument('-b','-bed',type=str,default=None,nargs='*')
    inArgs.add_argument('-v','-vcf',type=str,default=None,nargs='*')
    inArgs.add_argument('-snv',type=str,default=None,nargs='*')
    inArgs.add_argument('-p','-ped',type=str,default=None,nargs='*')
    genoArgs.add_argument('-g','-genome',required=False,default='hg19',type=str)
    genoArgs.add_argument('-pcrfree',required=False,default=False,action="store_true")
    genoArgs.add_argument('-M',default=False,required=False,action="store_true")
    genoArgs.add_argument('-pre',required=False,default=None)
    genoArgs.add_argument('-feats',required=False,default=None)
    optArgs.add_argument('-L','-log',default=None,required=False)
    optArgs.add_argument('-T','-tmp-dir',default=os.getcwd()+'/sv2_tmp_'+rand_id(),required=False)
    optArgs.add_argument('-s','-seed',required=False,default=42,type=int)
    optArgs.add_argument('-o','-out',required=False,default="sv2_training_features",type=str)
    optArgs.add_argument('-O','-odir',required=False,default=os.getcwd(),type=str)
    optArgs.add_argument('-h','-help',required=False,action="store_true",default=False)
    args = parser.parse_args()

    bams,bed,vcf,snv,ped = args.i,args.b,args.v,args.snv,args.p
    gen,pcrfree,legacy_m,predir,featsdir = args.g,args.pcrfree,args.M,args.pre,args.feats
    logfh, tmp_dir, seed, ofh, odir = args.L,args.T,args.s,args.o,args.O

    if args.h or len(sys.argv) == 1:
        print(splash+__useage___)
        sys.exit(0)

    # stderr is redirected to the log file for the remainder of the run.
    lfh = None
    if logfh is not None:
        lfh = open(logfh,'w')
        sys.stderr = lfh

    preprocess_files,feats_files = {},{}
    gens = ['hg19','hg38','mm10']
    olog = 'STDOUT' if logfh is None else logfh
    print('sv2 version:{} report bugs to <dantaki at ucsd dot edu> error messages located in {}'.format(__version__,olog))
    Confs = Config()

    # BAMs are needed whenever features are not supplied ready-made; this
    # also covers the original "no -i, no -pre, no -feats" case.
    if bams is None and featsdir is None:
        _fatal('FATAL ERROR: No BAM file specified <-i, -bam FILE ...>')
    if snv is None and predir is None and featsdir is None:
        _fatal('FATAL ERROR: No SNV VCF file specified <-snv FILE ...>')
    if ped is None:
        _fatal('FATAL ERROR: No PED file specified <-p, -ped FILE ...>')
    if bed is None and vcf is None:
        _fatal('FATAL ERROR: No SVs provided <-b, -bed BED ...> <-v,-vcf VCF ...>')
    if gen not in gens:
        _fatal('FATAL ERROR -g must be one of {}. NOT {}'.format(', '.join(gens), gen))

    Peds = ped_init(ped)
    Bams = []
    if bams is not None:
        Bams = bam_init(bams,Peds,snv_init(snv),gen)
    SV = sv_init(bed,vcf,gen)
    ofh = ofh.replace('.vcf','').replace('.out','').replace('.txt','')
    make_dir(tmp_dir)
    tmp_dir = slash_check(tmp_dir)
    if not odir.endswith('/'):
        odir = odir+'/'
    make_dir(odir)

    # ---- PREPROCESSING ----
    if predir is not None:
        predir = slash_check(predir)
        for fh in glob(predir+'*sv2_preprocessing.txt'):
            # Sample id is the first column of each preprocessing row.
            for iid in _column_values(fh, 0):
                if iid in Peds.ids:
                    preprocess_files[iid] = fh
    elif bams is not None:
        outdir = odir+'sv2_preprocessing/'
        make_dir(outdir)
        for bam in Bams:
            preofh = outdir+bam.id+'_sv2_preprocessing.txt'
            preprocess_files[bam.id] = preofh
            preprocess(bam,preofh,seed,gen,tmp_dir)
    report_time(init_time,'PREPROCESSING COMPLETE')

    # ---- FEATURE EXTRACTION ----
    if featsdir is None:
        outdir = odir+'sv2_features/'
        make_dir(outdir)
        for bam in Bams:
            if preprocess_files.get(bam.id) is None:
                sys.stderr.write('WARNING: BAM sample id {} not found in preprocessing files. Skipping ...\n'.format(bam.id))
                continue
            prefh = preprocess_files[bam.id]
            featfh = outdir+bam.id+'_sv2_features.txt'
            feats_files[bam.id] = featfh
            extract_feats(bam,SV.raw,prefh,featfh,gen,pcrfree,legacy_m,Confs,tmp_dir)
    else:
        featsdir = slash_check(featsdir)
        for fh in glob(featsdir+'*sv2_features.txt'):
            # Sample id is the sixth column of each feature row.
            for iid in _column_values(fh, 5):
                if iid in Peds.ids:
                    feats_files[iid] = fh

    feats = []
    train_dir = odir+'sv2_training_features/'
    make_dir(train_dir)
    for iid in feats_files:
        with open(feats_files[iid]) as f:
            for l in f:
                feats.append(tuple(l.rstrip('\n').split('\t')))
    sv2_train_output(feats,Peds,gen,train_dir+ofh)
    shutil.rmtree(tmp_dir)
    if lfh is not None:
        lfh.close()
    report_time(init_time,'FEATURE EXTRACTION COMPLETE')
def mask_bed(fh,gen):
    """Return ``fh`` as a BedTool with the build's excluded regions subtracted.

    The regions come from ``<resource_path><gen>_excluded.bed.gz``.
    """
    from sv2Config import Config
    intervals = BedTool(fh)
    excluded_bed = '{}{}_excluded.bed.gz'.format(Config().resource_path(), gen)
    return intervals.subtract(excluded_bed)
def check_PAR(sv,gen):
    """Return True when ``sv`` intersects ``<resource_path>par_<gen>.bed``.

    ``sv`` is a BED-formatted string; the intersect is run with ``f=0.5`` and
    the result is True when at least one interval is returned.
    """
    from sv2Config import Config
    query = BedTool(sv, from_string=True)
    par_regions = BedTool('{}par_{}.bed'.format(Config().resource_path(), gen))
    hits = query.intersect(par_regions, f=0.5)
    return len(hits) > 0
def check_overlap(self,svs=None,raw=None,gen=None,tmp_dir=None):
    """Annotate SVs against the bundled annotation tracks.

    Populates, keyed by ``tokenize_sv(record)`` unless noted:

    * ``self.cytoband``  - comma-joined band labels;
    * ``self.excluded``  - keyed by locus + (element label,), fraction of
      the locus covered by that excluded element;
    * ``self.repeatmasker`` - ``(name, reciprocal_overlap)`` of the best
      RepeatMasker hit (intersect run with ``f=0.8, F=0.8``);
    * ``self._1kgp`` - via ``load_1kgp`` for DEL and DUP, only when ``gen``
      contains ``'hg'``;
    * ``self.genes`` - pipe-joined per-transcript summaries.

    Fix relative to the previous revision: if the cytoband intersect raised
    ``MalformedBedLineError`` before any record was read, the handler used
    an unbound ``locus`` and failed with ``UnboundLocalError``.  It now only
    writes ``'NA'`` when a locus has actually been seen.

    Args:
        svs: BedTool of SV intervals.  # assumes 3-column records, since
            annotation fields are read from index 3 onwards - TODO confirm
        raw: raw SV records, forwarded to ``load_1kgp``.
        gen: genome build label used in annotation file names.
        tmp_dir: directory prefix (with trailing separator) for the scratch
            file ``tmp_anno.bed``.
    """
    resources = Config().resource_path()

    def _annotation(suffix):
        # Path of an annotation track for the current genome build.
        return '{}annotation_files/{}_{}'.format(resources, gen, suffix)

    def _label(gene_id, transcript, feature):
        # One comma-separated "gene,transcript,feature" summary item.
        return ','.join(map(str, (gene_id, transcript, feature)))

    def _summarize(hits):
        # Collapse the gene-track hits of one SV into ordered summary items.
        ori, exons, introns, trx, genlist = {}, {}, {}, {}, []
        for hit in hits:
            gene = hit.split(',')
            name = gene[0]
            ori[name] = gene[1]
            # Branch order matters: the first matching keyword wins.
            if 'exon' in hit:
                trx[name] = gene[2]
                # Last field looks like "exon_<n>/<total>".
                exon_total = int(gene[-1].split('/').pop())
                exon_num = int(gene[-1].split('/').pop(0).replace('exon_',''))
                exons[(name,'TOTAL')] = exon_total
                if exons.get(name) is None:
                    exons[name] = 1
                    exons[(name,exon_num)] = 1
                elif exons.get((name,exon_num)) is None:
                    # Count each distinct exon number once.
                    exons[name] += 1
                    exons[(name,exon_num)] = 1
            elif 'UTR3' in hit:
                trx[name] = gene[2]
                exons[(name,'UTR3')] = 1
            elif 'UTR5' in hit:
                trx[name] = gene[2]
                exons[(name,'UTR5')] = 1
            elif 'intron' in hit:
                trx[name] = gene[2]
                intron_total = int(gene[-1].split('/').pop())
                intron_num = int(gene[-1].split('/').pop(0).replace('intron_',''))
                introns[(name,'TOTAL')] = intron_total
                if introns.get(name) is None:
                    introns[name] = 1
                    introns[(name,intron_num)] = 1
                elif introns.get((name,intron_num)) is None:
                    introns[name] += 1
                    introns[(name,intron_num)] = 1
            elif 'stream' in hit:
                trx[name] = gene[2]
                # gene[3] is the flank label, e.g. upstream_1kb / downstream_1kb.
                exons[(name,gene[3])] = 1

        for y in trx:
            if ori.get(y) is None:
                continue
            orient = ori[y]
            exoncnt, exontot = 0, 0
            if exons.get(y) is not None:
                exoncnt = exons[y]
            if exons.get((y,'TOTAL')) is not None:
                exontot = exons[(y,'TOTAL')]

            def _has(tag):
                return exons.get((y,tag)) == 1

            # Flank/UTR items are reported only when the opposite side is
            # not hit as well.
            upstream_only = _has('upstream_1kb') and not _has('downstream_1kb')
            downstream_only = _has('downstream_1kb') and not _has('upstream_1kb')
            utr5_only = _has('UTR5') and not _has('UTR3')
            utr3_only = _has('UTR3') and not _has('UTR5')

            # Exon then intron items; identical for both strands.
            body = []
            if exoncnt != 0:
                body.append(_label(y, trx[y], 'exon_{}/{}'.format(exoncnt,exontot)))
            if exons.get(y) is None or exoncnt != exontot:
                # Introns are listed only when not every exon is covered.
                introncnt, introntot = 0, 0
                if introns.get(y) is not None:
                    introncnt = introns[y]
                if introns.get((y,'TOTAL')) is not None:
                    introntot = introns[(y,'TOTAL')]
                if introncnt != 0:
                    body.append(_label(y, trx[y], 'intron_{}/{}'.format(introncnt,introntot)))

            if orient == '+':
                leading = [(upstream_only,'upstream_1kb'), (utr5_only,'UTR5')]
                trailing = [(utr3_only,'UTR3'), (downstream_only,'downstream_1kb')]
            else:
                # Any other strand value lists the flank/UTR items mirrored.
                leading = [(downstream_only,'downstream_1kb'), (utr3_only,'UTR3')]
                trailing = [(utr5_only,'UTR5'), (upstream_only,'upstream_1kb')]

            genlist.extend(_label(y, trx[y], tag) for flag, tag in leading if flag)
            genlist.extend(body)
            genlist.extend(_label(y, trx[y], tag) for flag, tag in trailing if flag)
        return genlist

    # overlap cytobands
    locus = None
    try:
        for entry in svs.intersect(_annotation('cytoband.bed'),wao=True):
            locus = tokenize_sv(entry)
            band = str(entry[3]).replace('chr','')+str(entry[6])
            if self.cytoband.get(locus) is None:
                self.cytoband[locus] = band
            else:
                self.cytoband[locus] = self.cytoband[locus]+','+band
    except pybedtools.cbedtools.MalformedBedLineError:
        # As before, the most recently read locus is marked 'NA'; nothing is
        # written when the failure happened before the first record.
        if locus is not None:
            self.cytoband[locus] = 'NA'

    # overlap excluded elements
    tmp_bed = tmp_dir+'tmp_anno.bed'
    svs.intersect(_annotation('excluded.bed.gz'),wao=True,output=tmp_bed)
    with open(tmp_bed,'r') as f:
        for l in f:
            entry = tuple(l.rstrip().split('\t'))
            locus, ovr = tokenize_sv(entry), int(entry[-1])
            if ovr == 0:
                continue
            key = locus+(str(entry[6]),)
            if self.excluded.get(key) is None:
                self.excluded[key] = ovr
            else:
                self.excluded[key] += ovr
    # NOTE(review): this normalises every entry of self.excluded, including
    # any that were present before this call - confirm the method is only
    # invoked once per instance.
    for key in self.excluded:
        self.excluded[key] = float(self.excluded[key])/(int(key[2])-int(key[1]))
    os.remove(tmp_bed)

    # overlap repeatmasker
    svs.intersect(_annotation('repeatmasker.bed.gz'), f=0.8, F=0.8, wa=True, wb=True,output=tmp_bed)
    with open(tmp_bed,'r') as f:
        for l in f:
            entry = tuple(l.rstrip().split('\t'))
            locus = tokenize_sv(entry)
            rovr = reciprocal_overlap(map(int,(entry[1],entry[4],entry[2],entry[5])))
            name = '{}:{}:{}'.format(entry[7],entry[8],entry[6])
            best = self.repeatmasker.get(locus)
            # Keep the first hit, then only a strictly larger overlap.
            if best is None or rovr > best[1]:
                self.repeatmasker[locus] = (name, rovr)
    os.remove(tmp_bed)

    # overlap 1KGP phase 3 DEL/DUP
    if 'hg' in gen:
        self.load_1kgp(raw,'DEL',gen,tmp_bed)
        self.load_1kgp(raw,'DUP',gen,tmp_bed)

    # overlap genes
    genes = {}
    svs.intersect(_annotation('genes.bed.gz'), wa=True,wb=True,output=tmp_bed)
    with open(tmp_bed,'r') as f:
        for l in f:
            entry = tuple(l.rstrip().split('\t'))
            locus, gene = tokenize_sv(entry), str(entry[6])
            hits = genes.setdefault(locus, [])
            if gene not in hits:
                hits.append(gene)
    os.remove(tmp_bed)

    for x in genes:
        genlist = _summarize(genes[x])
        if len(genlist) >= 1:
            self.genes[x] = '|'.join(genlist)