Example #1
	def load_1kgp(self,raw=None,svtype=None,gen=None,tmp_bed=None):
		sv = BedTool([(format_chrom(x[0]),x[1],x[2],x[3]) for x in raw if svtype in str(x[3])]).sort()
		sv.intersect('{}annotation_files/{}_1000Genomes_{}.bed'.format(Config().resource_path(),gen,svtype), f=0.8, F=0.8, wao=True,output=tmp_bed)
		with open(tmp_bed,'r') as f:
			for l in f:
				x = tuple(l.rstrip().split('\t'))
				locus = tokenize_sv(x)+(str(x[3]),)
				ovr = int(x[-1])  # overlapping bases reported by bedtools -wao
				if ovr==0: continue
				# express the overlap as a fraction of the query SV length
				ovr = format(float(x[-1])/(int(x[2])-int(x[1])),'.2f')
				# keep the 1000 Genomes variant ID with the largest overlap for this locus
				if self._1kgp.get(locus) is None or float(ovr) > float(self._1kgp[locus][1]):
					self._1kgp[locus]=(x[-2],ovr)
		os.remove(tmp_bed)
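The method above intersects the candidate SVs against the 1000 Genomes Phase 3 DEL/DUP call set, requiring 80% reciprocal overlap (f=0.8, F=0.8), and records the best-overlapping 1000G identifier per locus. A minimal sketch of the overlap-fraction arithmetic on one hypothetical -wao output line (the column layout of the annotation BED is an assumption):

# hypothetical -wao line: query SV (4 cols), matched 1000G record, then overlapping bases
line = 'chr1\t1000\t6000\tDEL\tchr1\t1200\t5800\tesv123\t4600'
x = tuple(line.split('\t'))
overlap_bp = int(x[-1])                 # 4600 bases of overlap
sv_len = int(x[2]) - int(x[1])          # 5000 bp query SV
print(format(float(overlap_bp)/sv_len, '.2f'))  # '0.92', stored alongside the ID in x[-2]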
Example #2
File: Vcf.py  Project: sebatlab/SV2
 def init_header(self,
                 date=None,
                 ids=None,
                 Structural_Variant=None,
                 gen=None):
     chroms, refs, contigs = {}, OrderedDict(), []
     from sv2Config import Config
     with open('{}{}.genome'.format(Config().resource_path(), gen),
               'r') as f:
         for l in f:
             chrom, leng = l.rstrip().split('\t')
             chroms[format_chrom(chrom)] = leng
     for x in Structural_Variant.raw:
         if chroms.get(format_chrom(x[0])) != None:
             refs[x[0]] = chroms[format_chrom(x[0])]
     for chrom in refs:
         contigs.append('##contig=<ID={},length={}>'.format(
             chrom, refs[chrom]))
     self.head = [
         '##fileformat=VCFv4.1',
         '##fileDate={}'.format(date),
         '##SV2_CMD="{}"'.format(' '.join(map(str, sys.argv[:]))),
         '##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variant">',
         '##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
         '##INFO=<ID=SVLEN,Number=1,Type=Integer,Description="Difference in length between REF and ALT alleles">',
         '##INFO=<ID=DENOVO_FILTER,Number=1,Type=String,Description="Stringent filter status, recommended for de novo mutation discovery">',
         '##INFO=<ID=REF_GTL,Number=1,Type=Float,Description="Median Phred-adjusted REF genotype likelihood">',
         '##INFO=<ID=AF,Number=1,Type=Float,Description="Alternate allele frequency, in the range (0,1)">',
         '##INFO=<ID=CYTOBAND,Number=.,Type=String,Description="Cytoband(s) overlapping the variant">',
         '##INFO=<ID=REPEATMASKER,Number=2,Type=String,Description="Name and reciprocal overlap of RepeatMasker variant">',
         '##INFO=<ID=1000G_ID,Number=1,Type=String,Description="1000 Genomes Phase 3 integrated SV callset variant identifier">',
         '##INFO=<ID=1000G_OVERLAP,Number=1,Type=Float,Description="Overlap to 1000 Genomes Phase 3 variant, in the range (0,1)">',
         '##INFO=<ID=DESCRIPTION,Number=1,Type=String,Description="Verbose description of SV, 1-based coordinates">',
         '##INFO=<ID=GENES,Number=1,Type=String,Description="Genes overlapping the variant, pipe-separated by transcripts">',
         '##INFO=<ID=ABPARTS,Number=1,Type=Float,Description="Overlap to antibody parts, in the range (0,1)">',
         '##INFO=<ID=CENTROMERE,Number=1,Type=Float,Description="Centromere overlap, in the range (0,1)">',
         '##INFO=<ID=GAP,Number=1,Type=Float,Description="Overlap to gaps in the reference, in the range (0,1)">',
         '##INFO=<ID=SEGDUP,Number=1,Type=Float,Description="Segmental duplication overlap, in the range (0,1)">',
         '##INFO=<ID=STR,Number=1,Type=Float,Description="Short tandem repeat overlap, in the range (0,1)">',
         '##INFO=<ID=UNMAPPABLE,Number=1,Type=Float,Description="Overlap to DAC Blacklisted Regions, in the range (0,1)">',
         '##FILTER=<ID=ABPARTS,Description="Variant overlaps to antibody parts >50%">',
         '##FILTER=<ID=CENTROMERE,Description="Variant overlaps to centromere >50%">',
         '##FILTER=<ID=GAP,Description="Variant overlaps to reference gaps >50%">',
         '##FILTER=<ID=GENOTYPEFAIL,Description="Variant was unable to be genotyped">',
         '##FILTER=<ID=NOALT,Description="No alternate allele detected">',
         '##FILTER=<ID=SEGDUP,Description="Variant overlaps to segmental duplications >50%">',
         '##FILTER=<ID=STR,Description="Variant overlaps to short tandem repeats >50%">',
         '##FILTER=<ID=UNMAPPABLE,Description="Variant overlaps to DAC Blacklisted Regions >50%">',
         '##FILTER=<ID=FAIL,Description="Variant failed standard filters">',
         '##FILTER=<ID=PASS,Description="Variant passed standard filters">',
         '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
         '##FORMAT=<ID=CN,Number=1,Type=Float,Description="Copy number estimate">',
         '##FORMAT=<ID=PE,Number=1,Type=Float,Description="Normalized discordant paired-end count">',
         '##FORMAT=<ID=SR,Number=1,Type=Float,Description="Normalized split-read count">',
         '##FORMAT=<ID=SC,Number=1,Type=Float,Description="SNV normalized coverage">',
         '##FORMAT=<ID=NS,Number=1,Type=Integer,Description="Number of SNVs within locus">',
         '##FORMAT=<ID=HA,Number=1,Type=Float,Description="Heterozygous allele ratio">',
         '##FORMAT=<ID=NH,Number=1,Type=Integer,Description="Number of heterozygous SNVs">',
         '##FORMAT=<ID=SQ,Number=1,Type=Float,Description="Phred-scaled genotype likelihood">',
         '##FORMAT=<ID=GL,Number=G,Type=Float,Description="Phred-scaled genotype likelihoods in the order, REF:(0/0), HET:(0/1), HOM:(1/1)">',
         '##ALT=<ID=DEL,Description="Deletion, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
         '##ALT=<ID=DUP,Description="Duplication, if 80% reciprocal overlap with RepeatMasker element, the class, name, and family are given separated by colons">',
         '{}'.format('\n'.join(contigs)),
         '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'.format(
             '\t'.join(ids)),
     ]
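After init_header() runs, self.head holds the complete VCF meta-information block, ending with the ##contig lines and the #CHROM header row. A hypothetical usage sketch for writing it out (the instance name and output path are assumptions, not from the source):

# assumes `vcf_out` is a Vcf instance on which init_header() has already been called
with open('sv2_genotypes.vcf', 'w') as out:
    out.write('\n'.join(vcf_out.head) + '\n')
    # variant records (one tab-delimited line per SV) would follow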
Example #3
def main():
	init_time = int(time())
	parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter,usage=splash.replace("       ","",1)+__useage___,add_help=False)
	inArgs,genoArgs,optArgs = parser.add_argument_group('input arguments'),parser.add_argument_group('genotype arguments'),parser.add_argument_group('optional arguments')
	inArgs.add_argument('-i','-bam',type=str,default=None,nargs='*')
	inArgs.add_argument('-b','-bed',type=str,default=None,nargs='*')
	inArgs.add_argument('-v','-vcf',type=str,default=None,nargs='*')
	inArgs.add_argument('-snv',type=str,default=None,nargs='*')
	inArgs.add_argument('-p','-ped',type=str,default=None,nargs='*')
	genoArgs.add_argument('-g','-genome',required=False,default='hg19',type=str)
	genoArgs.add_argument('-pcrfree',required=False,default=False,action="store_true")
	genoArgs.add_argument('-M',default=False,required=False,action="store_true")
	genoArgs.add_argument('-pre',required=False,default=None)
	genoArgs.add_argument('-feats',required=False,default=None)
	optArgs.add_argument('-L','-log',default=None,required=False)
	optArgs.add_argument('-T','-tmp-dir',default=os.getcwd()+'/sv2_tmp_'+rand_id(),required=False)
	optArgs.add_argument('-s','-seed',required=False,default=42,type=int)
	optArgs.add_argument('-o','-out',required=False,default="sv2_training_features",type=str)
	optArgs.add_argument('-O','-odir',required=False,default=os.getcwd(),type=str)
	optArgs.add_argument('-h','-help',required=False,action="store_true",default=False)
	args = parser.parse_args()
	bams,bed,vcf,snv,ped = args.i,args.b,args.v,args.snv,args.p
	gen,pcrfree,legacy_m,predir,featsdir= args.g,args.pcrfree,args.M,args.pre,args.feats
	logfh, tmp_dir, seed, ofh, odir = args.L,args.T,args.s,args.o,args.O
	_help = args.h
	if (_help==True or len(sys.argv)==1):
		print splash+__useage___
		sys.exit(0)
	if logfh!=None:
		lfh = open(logfh,'w')
		sys.stderr=lfh
	preprocess_files,feats_files={},{}
	gens = ['hg19','hg38','mm10']
	olog = logfh
	if olog == None: olog = 'STDOUT'
	print 'sv2 version:{}    report bugs to <dantaki at ucsd dot edu>       error messages located in {}'.format(__version__,olog)
	Confs=Config()
	if bams==None and predir==None and featsdir==None:
		print 'FATAL ERROR: No BAM file specified <-i, -bam  FILE ...>'
		sys.stderr.write('FATAL ERROR: No BAM file specified <-i, -bam  FILE ...>\n')
		sys.exit(1)
	if snv==None and predir==None and featsdir==None:
		print 'FATAL ERROR: No SNV VCF file specified <-snv  FILE ...>'
		sys.stderr.write('FATAL ERROR: No SNV VCF file specified <-snv  FILE ...>\n')
		sys.exit(1)
	if ped==None:
		print 'FATAL ERROR: No PED file specified <-p, -ped  FILE ...>'
		sys.stderr.write('FATAL ERROR: No PED file specified <-p, -ped  FILE ...>\n')
		sys.exit(1)
	if bed==None and vcf==None:
		print 'FATAL ERROR: No SVs provided <-b, -bed  BED ...> <-v,-vcf  VCF ...>'
		sys.stderr.write('FATAL ERROR: No SVs provided <-b, -bed  BED ...> <-v,-vcf  VCF ...>\n')
		sys.exit(1)
	if gen not in gens:
		print 'FATAL ERROR: -g must be hg19, hg38, or mm10. NOT {}'.format(gen)
		sys.stderr.write('FATAL ERROR: -g must be hg19, hg38, or mm10. NOT {}\n'.format(gen))
		sys.exit(1)
	Peds=ped_init(ped)
	if bams!=None: Bams=bam_init(bams,Peds,snv_init(snv),gen)
	SV = sv_init(bed,vcf,gen)
	ofh = ofh.replace('.vcf','').replace('.out','').replace('.txt','')
	make_dir(tmp_dir)
	tmp_dir=slash_check(tmp_dir)
	if not odir.endswith('/'): odir = odir+'/'
	make_dir(odir)
	"""
	PREPROCESSING
	"""
	if predir == None:
		outdir = odir+'sv2_preprocessing/'
		make_dir(outdir)
		for bam in Bams:
			preofh = outdir+bam.id+'_sv2_preprocessing.txt'
			preprocess_files[bam.id]=preofh
			preprocess(bam,preofh,seed,gen,tmp_dir)
	else:
		predir=slash_check(predir)
		for fh in glob(predir+'*sv2_preprocessing.txt'):
			f = open(fh)
			if sum(1 for l in open(fh)) <= 1: continue
			else:
				preids=[]
				for l in f:
					if l.startswith('#'):continue
					preids.append(l.rstrip('\n').split('\t').pop(0))
			f.close()
			for iid in set(preids):
				if iid in Peds.ids : preprocess_files[iid]=fh
	report_time(init_time,'PREPROCESSING COMPLETE')
	""""
	FEATURE EXTRACTION
	"""
	if featsdir == None:
		outdir = odir+'sv2_features/'
		make_dir(outdir)
		for bam in Bams:
			if preprocess_files.get(bam.id) == None:
				sys.stderr.write('WARNING: BAM sample id {} not found in preprocessing files. Skipping ...\n'.format(bam.id))
				continue
			prefh = preprocess_files[bam.id]
			featfh = outdir+bam.id+'_sv2_features.txt'
			feats_files[bam.id]=featfh
			extract_feats(bam,SV.raw,prefh,featfh,gen,pcrfree,legacy_m,Confs,tmp_dir)
	else:
		featsdir=slash_check(featsdir)
		for fh in glob(featsdir+'*sv2_features.txt'):
			f = open(fh)
			if sum(1 for l in open(fh)) <= 1: continue
			else:
				featsid=[]
				for l in f:
					if l.startswith('#'):continue
					featsid.append(l.rstrip('\n').split('\t').pop(5))
			f.close()
			for iid in set(featsid):
				if iid in Peds.ids : feats_files[iid]=fh
	feats=[]
	train_dir = odir+'sv2_training_features/'
	make_dir(train_dir)
	for iid in feats_files:
		with open(feats_files[iid]) as f:
			for l in f: feats.append(tuple(l.rstrip('\n').split('\t')))
	sv2_train_output(feats,Peds,gen,train_dir+ofh)
	shutil.rmtree(tmp_dir)
	if logfh!=None: lfh.close()
	report_time(init_time,'FEATURE EXTRACTION COMPLETE')
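This entry point appears to wire together preprocessing, feature extraction, and training-feature output. Based on the argparse flags defined above, a hypothetical invocation (the executable name is an assumption) might look like:

    sv2train -i sample.bam -snv sample_snvs.vcf.gz -p family.ped -b candidate_svs.bed -g hg19 -o training_features -O results/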
Example #4
def mask_bed(fh,gen):
	from sv2Config import Config
	return BedTool(fh).subtract('{}{}_excluded.bed.gz'.format(Config().resource_path(),gen))
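mask_bed() uses pybedtools' subtract to drop any portion of the input intervals that overlaps the genome's excluded-region track. A hypothetical usage sketch (the file names are assumptions):

    masked = mask_bed('candidate_svs.bed', 'hg19')
    masked.saveas('candidate_svs_masked.bed')  # BedTool.subtract returns a BedTool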
Example #5
def check_PAR(sv,gen):
	from sv2Config import Config
	in_par = False
	if len(BedTool(sv,from_string=True).intersect(BedTool('{}par_{}.bed'.format(Config().resource_path(),gen)),f=0.5)) > 0: in_par=True
	return in_par
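check_PAR() expects the SV as a BED-formatted string (it builds a BedTool with from_string=True) and returns True when at least half of the interval (f=0.5) falls within a pseudoautosomal region for the given genome build. A hypothetical usage sketch:

    sv = 'chrX\t100000\t2000000\n'     # hypothetical chrX interval
    in_par = check_PAR(sv, 'hg19')     # True if >=50% of the interval lies in a PAR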
Example #6
	def check_overlap(self,svs=None,raw=None,gen=None,tmp_dir=None):
		# overlap cytobands
		try:
			for entry in svs.intersect('{}annotation_files/{}_cytoband.bed'.format(Config().resource_path(),gen),wao=True):
				locus = tokenize_sv(entry)
				if self.cytoband.get(locus)==None: self.cytoband[locus]=str(entry[3]).replace('chr','')+str(entry[6])
				else: self.cytoband[locus]=self.cytoband[locus]+','+str(entry[3]).replace('chr','')+str(entry[6])
		except pybedtools.cbedtools.MalformedBedLineError: self.cytoband[locus]='NA'
		# overlap excluded elements
		tmp_bed=tmp_dir+'tmp_anno.bed'
		svs.intersect('{}annotation_files/{}_excluded.bed.gz'.format(Config().resource_path(),gen),wao=True,output=tmp_bed)
		with open(tmp_bed,'r') as f:
			for l in f:
				entry = tuple(l.rstrip().split('\t'))
				locus, ovr = tokenize_sv(entry),int(entry[-1])
				if ovr==0: continue
				key = locus+(str(entry[6]),)
				if self.excluded.get(key)==None: self.excluded[key]=ovr
				else: self.excluded[key]+=ovr
		for locus in self.excluded: self.excluded[locus]=float(self.excluded[locus])/(int(locus[2])-int(locus[1]))
		os.remove(tmp_bed)
		# overlap repeatmasker
		svs.intersect('{}annotation_files/{}_repeatmasker.bed.gz'.format(Config().resource_path(),gen), f=0.8, F=0.8, wa=True, wb=True,output=tmp_bed)
		with open(tmp_bed,'r') as f:
			for l in f:
				entry = tuple(l.rstrip().split('\t'))
				locus, rovr=tokenize_sv(entry), reciprocal_overlap(map(int,(entry[1],entry[4],entry[2],entry[5])))
				name = '{}:{}:{}'.format(entry[7],entry[8],entry[6])
				# keep the RepeatMasker element with the largest reciprocal overlap for this locus
				if self.repeatmasker.get(locus) is None or rovr > self.repeatmasker[locus][1]:
					self.repeatmasker[locus]=(name, rovr)
		os.remove(tmp_bed)
		# overlap 1KGP phase 3 DEL/DUP
		if 'hg' in gen:
			self.load_1kgp(raw,'DEL',gen,tmp_bed)
			self.load_1kgp(raw,'DUP',gen,tmp_bed)
		# overlap genes
		genes={}
		svs.intersect('{}annotation_files/{}_genes.bed.gz'.format(Config().resource_path(),gen), wa=True,wb=True,output=tmp_bed)
		with open(tmp_bed,'r') as f:
			for l in f:
				entry = tuple(l.rstrip().split('\t'))
				locus, gene = tokenize_sv(entry), str(entry[6])
				if genes.get(locus)==None: genes[locus]=[gene]
				elif gene not in genes[locus]: genes[locus].append(gene)
				else: continue
		os.remove(tmp_bed)
		for x in genes:
			ori,exons,introns,trx,genlist = {},{},{},{},[]
			exon_total,exon_num, intron_total,intron_num,exoncnt,exontot,introncnt,introntot=0,0,0,0,0,0,0,0
			for y in genes[x]:
				# annotation fields: gene name, strand, transcript, feature (e.g. exon_3/12)
				gene = y.split(',')
				ori[gene[0]]=gene[1]
				if 'exon' in y:
					trx[gene[0]]=gene[2]
					exon_total = int(gene[len(gene)-1].split('/').pop())
					exon_num = int(gene[len(gene)-1].split('/').pop(0).replace('exon_',''))
					exons[(gene[0],'TOTAL')]=exon_total
					if exons.get(gene[0])==None:
						exons[gene[0]]=1
						exons[(gene[0],exon_num)]=1
					elif exons.get((gene[0],exon_num)) == None:
						exons[gene[0]]+=1
						exons[(gene[0],exon_num)]=1
					else: continue
				elif 'UTR3' in y:
					trx[gene[0]]=gene[2]
					exons[(gene[0],'UTR3')]=1
				elif 'UTR5' in y:
					trx[gene[0]]=gene[2]
					exons[(gene[0],'UTR5')]=1
				elif 'intron' in y:
					trx[gene[0]]=gene[2]
					intron_total = int(gene[len(gene)-1].split('/').pop())
					intron_num = int(gene[len(gene)-1].split('/').pop(0).replace('intron_',''))
					introns[(gene[0],'TOTAL')]=intron_total
					if introns.get(gene[0])==None:
						introns[gene[0]]=1
						introns[(gene[0],intron_num)]=1
					elif introns.get((gene[0],intron_num)) == None:
						introns[gene[0]]+=1
						introns[(gene[0],intron_num)]=1
					else: continue
				elif 'stream' in y:
					trx[gene[0]]=gene[2]
					exons[(gene[0],gene[3])]=1
				else: continue
			# the strand-aware branches below report features in genomic coordinate order
			for y in trx:
				if ori.get(y) == None: continue
				orient=ori[y]
				exoncnt=0
				exontot=0
				if exons.get(y) != None: exoncnt=exons[y]
				if exons.get((y,'TOTAL')) != None: exontot=exons[(y,'TOTAL')]
				if orient == '+':
					if exons.get((y,'upstream_1kb')) == 1 and exons.get((y,'downstream_1kb')) != 1:
						genlist.append(','.join(map(str,(y,trx[y],'upstream_1kb'))))
					if exons.get((y,'UTR3'))!=1 and exons.get((y,'UTR5')) == 1:
						genlist.append(','.join(map(str,(y,trx[y],'UTR5'))))
					if exoncnt != 0: genlist.append(','.join(map(str,(y,trx[y],'exon_{}/{}'.format(exoncnt,exontot)))))
					if exons.get(y) == None or exoncnt != exontot:
						introncnt,introntot=0,0
						if introns.get(y) != None: introncnt=introns[y]
						if introns.get((y,'TOTAL'))!=None: introntot=introns[(y,'TOTAL')]
						if introncnt !=0: genlist.append(','.join(map(str,(y,trx[y],'intron_{}/{}'.format(introncnt,introntot)))))
					if exons.get((y,'UTR3'))==1 and exons.get((y,'UTR5')) != 1:
						genlist.append(','.join(map(str,(y,trx[y],'UTR3'))))
					if exons.get((y,'upstream_1kb')) != 1 and exons.get((y,'downstream_1kb')) == 1:
						genlist.append(','.join(map(str,(y,trx[y],'downstream_1kb'))))
				else:
					if exons.get((y,'upstream_1kb')) != 1 and exons.get((y,'downstream_1kb')) == 1:
						genlist.append(','.join(map(str,(y,trx[y],'downstream_1kb'))))
					if exons.get((y,'UTR3'))==1 and exons.get((y,'UTR5')) != 1:
						genlist.append(','.join(map(str,(y,trx[y],'UTR3'))))
					if exoncnt != 0: genlist.append(','.join(map(str,(y,trx[y],'exon_{}/{}'.format(exoncnt,exontot)))))
					if exons.get(y) == None or exoncnt != exontot:
						introncnt,introntot=0,0
						if introns.get(y) != None: introncnt=introns[y]
						if introns.get((y,'TOTAL'))!=None: introntot=introns[(y,'TOTAL')]
						if introncnt !=0: genlist.append(','.join(map(str,(y,trx[y],'intron_{}/{}'.format(introncnt,introntot)))))
					if exons.get((y,'UTR3'))!=1 and exons.get((y,'UTR5')) == 1:
						genlist.append(','.join(map(str,(y,trx[y],'UTR5'))))
					if exons.get((y,'upstream_1kb')) == 1 and exons.get((y,'downstream_1kb')) != 1:
						genlist.append(','.join(map(str,(y,trx[y],'upstream_1kb'))))
			if len(genlist)>=1 : self.genes[x]='|'.join(genlist)
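In the RepeatMasker step above, reciprocal_overlap() is called with the two start coordinates followed by the two end coordinates (entry[1], entry[4], entry[2], entry[5]). Its implementation is not shown here; a hypothetical sketch of what such a helper might compute:

    def reciprocal_overlap(coords):
        # coords: (start_a, start_b, end_a, end_b); an illustration, not the SV2 source
        start_a, start_b, end_a, end_b = coords
        overlap = min(end_a, end_b) - max(start_a, start_b)
        if overlap <= 0:
            return 0.0
        # shared bases as a fraction of the larger interval, so a value >= t
        # implies each interval overlaps the other by at least t
        return float(overlap) / max(end_a - start_a, end_b - start_b)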