def to_sv_interval(self):
    """Convert this Pindel record into an SVInterval.

    Returns None when the mapped SV type is not supported by the
    Pindel reader.
    """
    mapped_type = PINDEL_TO_SV_TYPE[self.sv_type]
    if mapped_type not in PindelReader.svs_supported:
        return None
    if mapped_type == "INS":
        # Insertions are emitted as a point interval at the start
        # position, carrying genotype and an extra wiggle of 100.
        return SVInterval(self.chromosome,
                          self.start_pos,
                          self.start_pos,
                          self.name,
                          sv_type=mapped_type,
                          length=self.sv_len,
                          sources=pindel_source,
                          native_sv=self,
                          wiggle=100,
                          info=self.info,
                          gt=self.gt)
    # All other supported types span [start_pos, end_pos].
    return SVInterval(self.chromosome,
                      self.start_pos,
                      self.end_pos,
                      name=self.name,
                      sv_type=mapped_type,
                      length=self.sv_len,
                      sources=pindel_source,
                      info=self.info,
                      native_sv=self)
def to_sv_interval(self):
    """Convert this BreakDancer record into an SVInterval.

    Returns None for SV types outside valid_breakdancer_svs, and for
    unrecognized types (after logging an error).
    """
    if self.sv_type not in valid_breakdancer_svs:
        return None
    if self.sv_type == "DEL" or self.sv_type == "INV":
        return SVInterval(self.chr1,
                          self.pos1 + 1,
                          self.pos2,  # fudge
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=breakdancer_source,
                          cipos=[0, self.pos2 - self.pos1 - abs(self.sv_len)],
                          info=self.info,
                          native_sv=self)
    elif self.sv_type == "INS":
        return SVInterval(self.chr1,
                          self.pos1 + 1,
                          self.pos2,  # fudge
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=breakdancer_source,
                          cipos=[0, self.pos2 - self.pos1],
                          info=self.info,
                          native_sv=self)
    else:
        logger.error("Bad SV type: " + repr(self))
        # Fix: make the None return explicit (previously an implicit
        # fall-through), consistent with the sibling implementation.
        return None
def to_sv_interval(self):
    """Convert this BreakDancer record into an SVInterval.

    Returns None when the SV type is unsupported, when a non-CTX call
    spans two different chromosomes, or when the type is unrecognized.
    """
    kind = self.sv_type
    if kind not in BreakDancerReader.svs_supported:
        return None
    if self.chr1 != self.chr2 and kind != "CTX":
        # Only CTX may legitimately involve two chromosomes.
        logger.error("Bad entry: " + repr(self))
        return None

    # Fields shared by every interval built below.
    shared = dict(name=self.name,
                  sv_type=kind,
                  sources=breakdancer_source,
                  info=self.info,
                  native_sv=self)

    if kind in ("DEL", "INV"):
        return SVInterval(self.chr1,
                          self.pos1 + 1,
                          self.pos2,  # fudge
                          length=self.sv_len,
                          cipos=[0, self.pos2 - self.pos1 - abs(self.sv_len)],
                          **shared)
    if kind == "INS":
        return SVInterval(self.chr1,
                          self.pos1 + 1,
                          self.pos2,  # fudge
                          length=self.sv_len,
                          cipos=[0, self.pos2 - self.pos1],
                          **shared)
    if kind in ("ITX", "CTX"):
        # As in Breakdancer Native output, we always assume that:
        # For CTX: chr2 >= chr1
        # For ITX: pos2 >= pos1
        return SVInterval(self.chr1,
                          self.pos1 + 1,
                          self.pos2,  # fudge
                          length=0,
                          cipos=[0, 0],
                          chrom2=self.chr2,
                          **shared)

    logger.error("Bad SV type: " + repr(self))
    return None
def to_sv_interval(self):
    """Wrap this CNVnator record in an SVInterval (no type filtering)."""
    fields = {
        "name": self.name,
        "sv_type": self.sv_type,
        "length": self.sv_len,
        "sources": cnvnator_source,
        "info": self.info,
        "native_sv": self,
    }
    return SVInterval(self.chromosome, self.start, self.end, **fields)
def to_sv_interval(self):
    """Convert this CNVnator record into an SVInterval.

    Returns None when the SV type is not supported by the reader.
    """
    if self.sv_type in CNVnatorReader.svs_supported:
        return SVInterval(self.chromosome,
                          self.start,
                          self.end,
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=cnvnator_source,
                          info=self.info,
                          native_sv=self)
    return None
def to_sv_interval(self):
    """Convert this BreakSeq record into an SVInterval.

    Returns None when the SV type is not supported by the reader.
    """
    if self.sv_type not in BreakSeqReader.svs_supported:
        return None
    fields = dict(name=self.name,
                  sv_type=self.sv_type,
                  length=self.sv_len,
                  sources=source,
                  cipos=[],
                  info=self.info,
                  native_sv=self)
    return SVInterval(self.chromosome, self.start, self.end, **fields)
def to_sv_interval(self):
    """Convert this record into an SVInterval.

    Returns None when the SV type is not in valid_svs.
    """
    if self.sv_type in valid_svs:
        return SVInterval(self.chromosome,
                          self.start,
                          self.end,
                          name=self.name,
                          sv_type=self.sv_type,
                          length=self.sv_len,
                          sources=source,
                          cipos=[],
                          info=self.info,
                          native_sv=self)
    return None
def run_metasv(args):
    """Run the full MetaSV workflow.

    Loads SV calls from the supported tools (both VCF and native
    formats), merges them within and across tools, optionally refines
    breakpoints via local assembly (SPAdes) followed by AGE, genotypes
    the result, and writes the final merged VCF to args.outdir.

    args -- argparse.Namespace carrying all command-line options.
    Returns an os.EX_* exit code (or 1 for an unindexed reference).
    """
    logger.info("Running MetaSV %s" % __version__)
    logger.info("Arguments are " + str(args))

    # Check if there is work to do.
    # Fix: args.wham_vcf was joined with "," instead of "+", which built a
    # 2-tuple that is always truthy, so this warning could never fire.
    if not (args.pindel_vcf + args.breakdancer_vcf + args.breakseq_vcf +
            args.cnvnator_vcf + args.pindel_native + args.breakdancer_native +
            args.breakseq_native + args.cnvnator_native + args.manta_vcf +
            args.lumpy_vcf + args.cnvkit_vcf + args.wham_vcf):
        logger.warning("Nothing to merge since no SV file specified")

    # Simple check for arguments: assembly needs both executables.
    if not args.disable_assembly:
        if not args.spades:
            logger.error("Spades executable not specified")
            return os.EX_USAGE

        if not args.age:
            logger.error("AGE executable not specified")
            return os.EX_USAGE

    # Create the directories for working
    bedtools_tmpdir = os.path.join(args.workdir, "bedtools")
    create_dirs([args.workdir, args.outdir, bedtools_tmpdir])

    # Reference handling: require a samtools .fai index next to the FASTA.
    if not os.path.isfile(args.reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (args.reference))
        return 1

    fasta_handle = pysam.Fastafile(args.reference) if os.path.isfile(
        args.reference) else None
    contigs = get_contigs(args.reference)
    include_intervals = sorted([
        SVInterval(contig.name,
                   0,
                   contig.length,
                   contig.name,
                   "include",
                   length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(args.chromosomes) if args.chromosomes else set(
        [contig.name for contig in contigs])
    if args.keep_standard_contigs:
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", args.cnvnator_vcf),
                     ("Pindel", args.pindel_vcf),
                     ("BreakDancer", args.breakdancer_vcf),
                     ("BreakSeq", args.breakseq_vcf),
                     ("HaplotypeCaller", args.gatk_vcf),
                     ("Lumpy", args.lumpy_vcf),
                     ("Manta", args.manta_vcf),
                     ("CNVkit", args.cnvkit_vcf),
                     ("WHAM", args.wham_vcf)]
    native_name_list = [("CNVnator", args.cnvnator_native, CNVnatorReader),
                        ("Pindel", args.pindel_native, PindelReader),
                        ("BreakSeq", args.breakseq_native, BreakSeqReader),
                        ("BreakDancer", args.breakdancer_native,
                         BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if args.filter_gaps:
        gaps = args.gaps if args.gaps else get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename:
            continue
        tools.append(toolname)
        intervals[toolname] = defaultdict(list)
        for native_file in nativename:
            for record in svReader(native_file,
                                   svs_to_report=args.svs_to_report):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue

                BD_min_inv_len = args.mean_read_length + 4 * args.isize_sd
                if toolname == "BreakDancer" and interval.sv_type == "INV" and abs(
                        interval.length) < BD_min_inv_len:
                    # Filter BreakDancer artifact INVs with size < readlength+4*isize_sd
                    continue

                if not interval_overlaps_interval_list(
                        interval,
                        gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length (translocations are exempt from the
                    # minimum-length filter)
                    if interval.length < args.minsvlen and interval.sv_type not in [
                            "ITX", "CTX"
                    ]:
                        continue

                    # Set wiggle
                    if interval.sv_type not in ["ITX", "CTX"]:
                        interval.wiggle = max(
                            args.inswiggle
                            if interval.sv_type == "INS" else 0, args.wiggle)
                    else:
                        interval.wiggle = TX_WIGGLE

                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue
        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory means per-chromosome bgzipped VCFs named
                # <contig>.vcf.gz
                logger.info(
                    "Will load from per-chromosome VCFs from directory %s for tool %s"
                    % (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist)
                ]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile,
                           intervals[toolname],
                           gap_intervals,
                           include_intervals,
                           toolname,
                           contig_whitelist,
                           minsvlen=args.minsvlen,
                           wiggle=args.wiggle,
                           inswiggle=args.inswiggle,
                           svs_to_report=args.svs_to_report,
                           maxsvlen=args.maxsvlen)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if args.enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname in intervals:
            tool_out = os.path.join(args.outdir,
                                    "%s.vcf" % (toolname.lower()))
            logger.info("Outputting single tool VCF for %s" %
                        (str(toolname)))

            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [args.sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(args.overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(
                        fasta_handle, args.sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" %
                        (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" %
                         (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" %
                        (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(
                intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        final_intervals.extend(
            merge_intervals_recursively(tool_merged_intervals[sv_type],
                                        args.overlap_ratio))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(args.overlap_ratio)
        interval.fix_pos()
        # Translocations are exempt from the length window.
        if args.minsvlen <= interval.length <= args.maxsvlen or interval.sv_type in [
                "ITX", "CTX"
        ]:
            final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(
        open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [args.sample]

    preasm_vcf = os.path.join(args.workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # final_stats counts records keyed by (type, filter, precision, sources).
    final_stats = {}

    bed_intervals = []
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
                bed_interval = interval.to_bed_interval(args.sample)
                if bed_interval is not None:
                    bed_intervals.append(bed_interval)
    vcf_fd.close()
    vcf_writer.close()

    # Also save a BED file representation of the merged variants without assembly
    merged_bed = None
    if bed_intervals:
        merged_bed = os.path.join(args.workdir, "metasv.bed")
        pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(args.outdir, "variants.vcf")

    # Run assembly here
    if not args.disable_assembly:
        logger.info("Running assembly")

        spades_tmpdir = os.path.join(args.workdir, "spades")
        age_tmpdir = os.path.join(args.workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if args.boost_sc:
            logger.info("Generating Soft-Clipping intervals.")
            assembly_bed = parallel_generate_sc_intervals(
                args.bams,
                list(contig_whitelist),
                merged_bed,
                args.workdir,
                num_threads=args.num_threads,
                min_support_ins=args.min_support_ins,
                min_support_frac_ins=args.min_support_frac_ins,
                max_intervals=args.max_ins_intervals,
                min_mapq=args.min_mapq,
                min_avg_base_qual=args.min_avg_base_qual,
                min_soft_clip=args.min_soft_clip,
                max_nm=args.max_nm,
                min_matches=args.min_matches,
                isize_mean=args.isize_mean,
                isize_sd=args.isize_sd,
                svs_to_softclip=args.svs_to_softclip,
                overlap_ratio=args.overlap_ratio,
                mean_read_length=args.mean_read_length,
                mean_read_coverage=args.mean_read_coverage,
                min_ins_cov_frac=args.min_ins_cov_frac,
                max_ins_cov_frac=args.max_ins_cov_frac,
                assembly_max_tools=args.assembly_max_tools)
            logger.info("Generated intervals for assembly in %s" %
                        assembly_bed)

        logger.info("Will run assembly now")

        assembled_fasta, ignored_bed = run_spades_parallel(
            bams=args.bams,
            spades=args.spades,
            spades_options=args.spades_options,
            bed=assembly_bed,
            work=spades_tmpdir,
            pad=args.assembly_pad,
            nthreads=args.num_threads,
            chrs=list(contig_whitelist),
            max_interval_size=args.spades_max_interval_size,
            timeout=args.spades_timeout,
            svs_to_assemble=args.svs_to_assemble,
            stop_on_fail=args.stop_spades_on_fail,
            max_read_pairs=args.extraction_max_read_pairs,
            assembly_max_tools=args.assembly_max_tools)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed,
            reference=args.reference,
            assembly=assembled_fasta,
            pad=args.assembly_pad,
            age=args.age,
            timeout=args.age_timeout,
            chrs=list(contig_whitelist),
            nthreads=args.num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            min_del_subalign_len=args.min_del_subalign_len,
            min_inv_subalign_len=args.min_inv_subalign_len,
            age_window=args.age_window,
            age_workdir=age_tmpdir)

        # Combine the AGE breakpoints with the intervals skipped by
        # assembly; either side may be empty/None.
        final_bed = os.path.join(args.workdir, "final.bed")
        if breakpoints_bed:
            if ignored_bed:
                pybedtools.BedTool(breakpoints_bed) \
                    .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                    .sort().saveas(final_bed)
            else:
                pybedtools.BedTool(breakpoints_bed).saveas(final_bed)
        elif ignored_bed:
            pybedtools.BedTool(ignored_bed).sort().saveas(final_bed)
        else:
            final_bed = None

        genotyped_bed = parallel_genotype_intervals(
            final_bed,
            args.bams,
            workdir=os.path.join(args.workdir, "genotyping"),
            nthreads=args.num_threads,
            chromosomes=list(contig_whitelist),
            window=args.gt_window,
            isize_mean=args.isize_mean,
            isize_sd=args.isize_sd,
            normal_frac_threshold=args.gt_normal_frac)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=genotyped_bed,
                                  vcf_out=final_vcf,
                                  workdir=args.workdir,
                                  sample=args.sample,
                                  reference=args.reference,
                                  pass_calls=False)
    else:
        # No assembly requested: the pre-assembly VCF is the final one.
        shutil.copy(preasm_vcf, final_vcf)

    pysam.tabix_index(final_vcf, force=True, preset="vcf")
    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")

    return os.EX_OK
def run_metasv(sample,
               reference,
               pindel_vcf=[],
               pindel_native=[],
               breakdancer_vcf=[],
               breakdancer_native=[],
               breakseq_vcf=[],
               breakseq_native=[],
               cnvnator_vcf=[],
               cnvnator_native=[],
               gatk_vcf=[],
               gaps=None,
               filter_gaps=False,
               keep_standard_contigs=False,
               wiggle=WIGGLE,
               overlap_ratio=OVERLAP_RATIO,
               workdir="work",
               outdir="out",
               boost_ins=False,
               bam=None,
               chromosomes=[],
               num_threads=1,
               spades=None,
               age=None,
               disable_assembly=True,
               minsvlen=MIN_SV_LENGTH,
               inswiggle=INS_WIGGLE,
               enable_per_tool_output=False,
               min_support=MIN_SUPPORT,
               min_support_frac=MIN_SUPPORT_FRAC,
               max_intervals=MAX_INTERVALS,
               disable_deletion_assembly=False,
               stop_spades_on_fail=False):
    # NOTE(review): mutable default arguments ([]) above — safe here because
    # the lists are only read/concatenated, never mutated, but worth cleaning.
    """Invoke the MetaSV workflow.

    Positional arguments:
    sample -- Sample name
    reference -- Path to a samtools indexed reference FASTA

    Keyword arguments:
    pindel_vcf -- List of Pindel VCFs generated by SVGenotyper
    pindel_native -- List of Pindel native output files
    breakdancer_vcf -- List of BreakDancer VCFs generated by SVGenotyper
    breakdancer_native -- List of BreakDancer native output files
    breakseq_vcf -- List of BreakSeq2 VCFs
    breakseq_native -- List of BreakSeq native GFF outputs
    cnvnator_vcf -- List of CNVnator VCFs generated by cnvnator2VCF.pl
    cnvnator_native -- List of CNVnator native output files
    gatk_vcf -- List of Indel VCFs generated by GATK's HaplotypeCaller
    gaps -- Gaps BED file
    filter_gaps -- Flag to filter out SVs overlapping gaps (default False)
    keep_standard_contigs -- Flag to only generate SVs for the major contigs
                             1, 2, ..., 22, X, Y, MT (default False)
    wiggle -- Wiggle for SV interval comparision (default 100)
    overlap_ratio -- Reciprocal overlap ratio for SV interval comparison
                     (default 0.5)
    workdir -- Scratch directory for MetaSV (default "work")
    outdir -- Output directory for MetaSV (default "out")
    boost_ins -- Enable MetaSV's soft-clip based insertion detection
                 (default False)
    bam -- Alignment BAM for assembly and insertion detection (default None)
    chromosomes -- If specified, indicates the list of chromosomes to process
                   (default [])
    num_threads -- Number of worker threads to use for assembly steps
                   (default 1)
    spades -- Path for the SPAdes executable (default None)
    age -- Path for the AGE executable (default None)
    disable_assembly -- Flag to disable assembly (default False)
    enable_per_tool_output -- Flag to also output merged calls for each tool
                              (default False)
    """

    # Check if there is work to do
    if not (pindel_vcf + breakdancer_vcf + breakseq_vcf + cnvnator_vcf +
            pindel_native + breakdancer_native + breakseq_native +
            cnvnator_native):
        logger.error("Nothing to do since no SV file specified")
        return 1

    # Create the directories for working
    bedtools_tmpdir = os.path.join(workdir, "bedtools")
    create_dirs([workdir, outdir, bedtools_tmpdir])

    # Reference handling: a samtools .fai index must sit next to the FASTA
    if not os.path.isfile(reference + ".fai"):
        logger.error("Reference file %s is not indexed" % (reference))
        return 1

    fasta_handle = pysam.Fastafile(reference) if os.path.isfile(
        reference) else None
    contigs = get_contigs(reference)
    include_intervals = sorted([
        SVInterval(contig.name,
                   0,
                   contig.length,
                   contig.name,
                   "include",
                   length=contig.length) for contig in contigs
    ])

    # Generate the list of contigs to process
    contig_whitelist = set(chromosomes) if chromosomes else set(
        [contig.name for contig in contigs])
    if keep_standard_contigs:
        contig_whitelist &= set([str(i) for i in xrange(1, 23)] +
                                ["chr%d" % (i) for i in xrange(1, 23)] +
                                ["X", "Y", "MT", "chrX", "chrY", "chrM"])
    logger.info("Only SVs on the following contigs will be reported: %s" %
                (sorted(list(contig_whitelist))))

    # Load the intervals from different files
    vcf_name_list = [("CNVnator", cnvnator_vcf), ("Pindel", pindel_vcf),
                     ("BreakDancer", breakdancer_vcf),
                     ("BreakSeq", breakseq_vcf), ("HaplotypeCaller", gatk_vcf)]
    native_name_list = [("CNVnator", cnvnator_native, CNVnatorReader),
                        ("Pindel", pindel_native, PindelReader),
                        ("BreakSeq", breakseq_native, BreakSeqReader),
                        ("BreakDancer", breakdancer_native,
                         BreakDancerReader)]

    tools = []
    intervals = {}
    sv_types = set()

    gap_intervals = []
    if filter_gaps:
        # Fall back to the bundled gaps file when none is supplied
        if not gaps: gaps = get_gaps_file(contig_whitelist)
        gap_intervals = sorted(load_gap_intervals(gaps))

    # Handles native input
    logger.info("Load native files")
    for toolname, nativename, svReader in native_name_list:
        # If no native file is given, ignore the tool
        if not nativename:
            continue
        tools.append(toolname)
        intervals[toolname] = defaultdict(list)
        for native_file in nativename:
            for record in svReader(native_file):
                interval = record.to_sv_interval()

                if not interval:
                    # This is the case for SVs we want to skip
                    continue
                if not interval_overlaps_interval_list(
                        interval,
                        gap_intervals) and interval.chrom in contig_whitelist:

                    # Check length
                    if interval.length < minsvlen:
                        continue

                    # Set wiggle (insertions get the larger insertion wiggle)
                    if interval.sv_type == "INS":
                        interval.wiggle = max(inswiggle, wiggle)
                    else:
                        interval.wiggle = wiggle

                    intervals[toolname][interval.sv_type].append(interval)
        sv_types |= set(intervals[toolname].keys())

    # Handles the VCF input cases, we will just deal with these cases
    logger.info("Load VCF files")
    for toolname, vcfname in vcf_name_list:
        # If no VCF is given, ignore the tool
        if not vcfname:
            continue
        tools.append(toolname)
        intervals[toolname] = {}

        vcf_list = []
        for vcffile in vcfname:
            if os.path.isdir(vcffile):
                # A directory is expected to hold per-chromosome bgzipped
                # VCFs named <contig>.vcf.gz
                logger.info(
                    "Will load from per-chromosome VCFs from directory %s for tool %s"
                    % (vcffile, toolname))
                vcf_list += [
                    os.path.join(vcffile, "%s.vcf.gz" % contig.name)
                    for contig in contigs
                    if (not contig_whitelist or contig.name in contig_whitelist)
                ]
            else:
                vcf_list.append(vcffile)

        for vcffile in vcf_list:
            load_intervals(vcffile,
                           intervals[toolname],
                           gap_intervals,
                           include_intervals,
                           toolname,
                           contig_whitelist,
                           minsvlen=minsvlen,
                           wiggle=wiggle,
                           inswiggle=inswiggle)
        sv_types |= set(intervals[toolname].keys())

    logger.info("SV types are %s" % (str(sv_types)))
    tool_merged_intervals = {}
    final_intervals = []

    bd_out = os.path.join(outdir, "breakdancer.vcf")
    pindel_out = os.path.join(outdir, "pindel.vcf")
    cnvnator_out = os.path.join(outdir, "cnvnator.vcf")
    breakseq_out = os.path.join(outdir, "breakseq.vcf")

    vcf_out_list = [("BreakDancer", bd_out), ("Pindel", pindel_out),
                    ("CNVnator", cnvnator_out), ("BreakSeq", breakseq_out)]

    # This will just output per-tool VCFs, no intra-tool merging is done yet
    if enable_per_tool_output:
        logger.info("Output per-tool VCFs")
        for toolname, tool_out in vcf_out_list:
            if tool_out is None or toolname not in intervals:
                continue
            logger.info("Outputting single tool VCF for %s" % (str(toolname)))

            vcf_template_reader = vcf.Reader(
                open(os.path.join(mydir, "resources/template.vcf"), "r"))
            vcf_template_reader.samples = [sample]

            intervals_tool = []
            tool_out_fd = open(tool_out, "w")
            vcf_writer = vcf.Writer(tool_out_fd, vcf_template_reader)
            chr_intervals_tool = {contig.name: [] for contig in contigs}
            for sv_type in sv_types:
                if sv_type in intervals[toolname]:
                    intervals_tool.extend([
                        copy.deepcopy(interval)
                        for interval in intervals[toolname][sv_type]
                    ])
            for interval in intervals_tool:
                # Marghoob says that this is just to fill-in some metadata
                interval.do_validation(overlap_ratio)

                interval.fix_pos()
                chr_intervals_tool[interval.chrom].append(interval)

            for contig in contigs:
                chr_intervals_tool[contig.name].sort()
                for interval in chr_intervals_tool[contig.name]:
                    vcf_record = interval.to_vcf_record(fasta_handle, sample)
                    if vcf_record is not None:
                        vcf_writer.write_record(vcf_record)
            tool_out_fd.close()
            vcf_writer.close()
            logger.info("Indexing single tool VCF for %s" % (str(toolname)))
            pysam.tabix_index(tool_out, force=True, preset="vcf")

    # Do merging here
    logger.info("Do merging")
    for sv_type in sv_types:
        logger.info("Processing SVs of type %s" % sv_type)
        tool_merged_intervals[sv_type] = []

        # Do the intra-tool merging
        logger.info("Intra-tool Merging SVs of type %s" % sv_type)
        for tool in tools:
            logger.debug("Is %s in tool keys? %s" %
                         (sv_type, str(intervals[tool].keys())))
            if sv_type not in intervals[tool]:
                logger.debug("%s not in tool %s" % (sv_type, tool))
                continue
            logger.info("First level merging for %s for tool %s" %
                        (sv_type, tool))
            tool_merged_intervals[sv_type] += merge_intervals(
                intervals[tool][sv_type])

        # Do the inter-tool merging
        logger.info("Inter-tool Merging SVs of type %s" % sv_type)
        merged_intervals = merge_intervals(tool_merged_intervals[sv_type])

        # Intervals which overlap well with merged_intervals
        intervals1 = []
        # Intervals which do not overlap well with merged_intervals.
        # Used to filter out small intervals which got merged with large intervals
        intervals2 = []
        # NOTE(review): the names look swapped relative to the comments above
        # — intervals that DO overlap merged_intervals land in intervals2 and
        # the non-overlapping ones in intervals1; confirm intended semantics.
        logger.info("Checking overlaps SVs of type %s" % sv_type)
        for interval in tool_merged_intervals[sv_type]:
            if interval_overlaps_interval_list(interval, merged_intervals,
                                               overlap_ratio, overlap_ratio):
                intervals2.append(interval)
            else:
                intervals1.append(interval)
        final_intervals.extend(
            merge_intervals(intervals1) + merge_intervals(intervals2))

    final_chr_intervals = {contig.name: [] for contig in contigs}
    for interval in final_intervals:
        interval.do_validation(overlap_ratio)
        interval.fix_pos()
        final_chr_intervals[interval.chrom].append(interval)

    # This is the merged VCF without assembly, ok for deletions at this point
    logger.info("Output merged VCF without assembly ")
    vcf_template_reader = vcf.Reader(
        open(os.path.join(mydir, "resources/template.vcf"), "r"))
    vcf_template_reader.samples = [sample]

    preasm_vcf = os.path.join(workdir, "pre_asm.vcf")
    vcf_fd = open(preasm_vcf, "w")
    vcf_writer = vcf.Writer(vcf_fd, vcf_template_reader)

    # final_stats counts output records keyed by
    # (sv_type, filter, precision, sources)
    final_stats = {}

    bed_intervals = []
    merged_bed = os.path.join(workdir, "metasv.bed")
    for contig in contigs:
        final_chr_intervals[contig.name].sort()
        for interval in final_chr_intervals[contig.name]:
            vcf_record = interval.to_vcf_record(fasta_handle)
            if vcf_record is not None:
                key = (interval.sv_type,
                       "PASS" if interval.is_validated else "LowQual",
                       "PRECISE" if interval.is_precise else "IMPRECISE",
                       tuple(sorted(list(interval.sources))))
                if key not in final_stats:
                    final_stats[key] = 0
                final_stats[key] += 1
                vcf_writer.write_record(vcf_record)
                bed_interval = interval.to_bed_interval(sample)
                if bed_interval is not None:
                    bed_intervals.append(bed_interval)

    # Also save a BED file representation of the merged variants without assembly
    pybedtools.BedTool(bed_intervals).saveas(merged_bed)

    vcf_fd.close()
    vcf_writer.close()

    for key in sorted(final_stats.keys()):
        logger.info(str(key) + ":" + str(final_stats[key]))

    final_vcf = os.path.join(outdir, "variants.vcf")

    # Run assembly here
    if not disable_assembly:
        logger.info("Running assembly")
        if spades is None:
            logger.error("Spades executable not specified")
            return 1
        if age is None:
            logger.error("AGE executable not specified")
            return 1

        spades_tmpdir = os.path.join(workdir, "spades")
        age_tmpdir = os.path.join(workdir, "age")

        create_dirs([spades_tmpdir, age_tmpdir])

        assembly_bed = merged_bed

        # this does the improved assembly location finder with softclipped reads
        if boost_ins:
            logger.info("Generating intervals for insertions")
            assembly_bed = parallel_generate_sc_intervals(
                [bam.name],
                list(contig_whitelist),
                merged_bed,
                workdir,
                num_threads=num_threads,
                min_support=min_support,
                min_support_frac=min_support_frac,
                max_intervals=max_intervals)
            logger.info("Generated intervals for assembly in %s" %
                        assembly_bed)

        logger.info("Will run assembly now")

        assembled_fasta, ignored_bed = run_spades_parallel(
            bam=bam.name,
            spades=spades,
            bed=assembly_bed,
            work=spades_tmpdir,
            pad=SPADES_PAD,
            nthreads=num_threads,
            chrs=list(contig_whitelist),
            disable_deletion_assembly=disable_deletion_assembly,
            stop_on_fail=stop_spades_on_fail)
        breakpoints_bed = run_age_parallel(
            intervals_bed=assembly_bed,
            reference=reference,
            assembly=assembled_fasta,
            pad=AGE_PAD,
            age=age,
            chrs=list(contig_whitelist),
            nthreads=num_threads,
            min_contig_len=AGE_MIN_CONTIG_LENGTH,
            age_workdir=age_tmpdir)

        final_bed = os.path.join(workdir, "final.bed")
        # NOTE(review): breakpoints_bed is used unconditionally below; if
        # run_age_parallel can return None this will fail — confirm.
        if ignored_bed:
            pybedtools.BedTool(breakpoints_bed) \
                .cat(pybedtools.BedTool(ignored_bed), postmerge=False) \
                .sort().saveas(final_bed)
        else:
            pybedtools.BedTool(breakpoints_bed).saveas(final_bed)

        logger.info("Output final VCF file")

        convert_metasv_bed_to_vcf(bedfile=final_bed,
                                  vcf_out=final_vcf,
                                  sample=sample,
                                  pass_calls=False)
    else:
        # No assembly requested: the pre-assembly VCF is the final one.
        shutil.copy(preasm_vcf, final_vcf)

    pysam.tabix_index(final_vcf, force=True, preset="vcf")
    logger.info("Clean up pybedtools")

    pybedtools.cleanup(remove_all=True)

    logger.info("All Done!")