def break_contigs(contigs_file, sam_file, output_file):
    """Break contigs by read coverage and write the broken contigs out.

    The contigs are parsed from ``contigs_file`` as FASTA, a
    ``break_by_coverage.ContigBreaker`` is built over them together with the
    alignments in ``sam_file``, and its ``OutputBroken`` method is called
    with ``output_file``.

    Args:
        contigs_file: path to the FASTA file with contigs.
        sam_file: path to a single SAM file.
        output_file: destination passed through to ``OutputBroken``.
    """
    # Read the whole file inside a ``with`` block so the handle is closed
    # deterministically instead of being left to the garbage collector.
    # NOTE(review): the "U" mode flag was removed in Python 3.11; it is kept
    # here to preserve universal-newline handling on older interpreters.
    with open(contigs_file, "rU") as contigs_handle:
        contigs = list(SeqIO.parse(contigs_handle, "fasta"))

    # Only a single SAM file is handled here; the multi-file SamChain variant
    # was left commented out in the original code.
    sam = sam_parser.Samfile(sam_file)

    # last two arguments: K, min0 stretch length to break
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    coverage_breaker.OutputBroken(output_file)
def ConstructSubreferenceFromSam(sam_files):
    """Collect alignment records from SAM files and group them into parts.

    Every record whose ``pos`` is not ``-1`` is wrapped in ``Record``; the
    wrapped records are sorted and passed through ``CollectParts`` twice.

    Args:
        sam_files: iterable of SAM file paths.

    Returns:
        A ``(filtered_recs, subreferences)`` tuple: the result of the first
        ``CollectParts`` pass, and the result of the second pass applied to
        that output.
    """
    # TODO: make online
    # TODO: use config
    aligned_records = sorted(
        Record(alignment)
        for path in sam_files
        for alignment in sam_parser.Samfile(path)
        if alignment.pos != -1
    )

    # NOTE(review): the meaning of the numeric CollectParts arguments is not
    # visible from this function - confirm against its definition.
    filtered_recs = CollectParts(aligned_records, 0, 2, 500)
    subreferences = CollectParts(filtered_recs, 20000, 0, 7000)
    return filtered_recs, subreferences
def moleculo_postprocessing(contigs_file, output_file, sam_files, log):
    """Break and filter scaffolds using read alignments, then write results.

    The scaffolds are parsed from ``contigs_file`` as FASTA, qualities are
    generated from the alignments, and the scaffolds are passed through
    ``SplitAndFilter`` with coverage, N and pattern breakers plus length and
    pattern filters. The result is written with ``OutputResults`` in both
    "fasta" and "fastq" formats.

    Args:
        contigs_file: path to the FASTA file with scaffolds.
        output_file: output name passed to ``OutputResults``; the final log
            message reports ``output_file + ".fastq"``.
        sam_files: iterable of SAM file paths, combined into one ``SamChain``.
        log: logger-like object providing ``info(message)``.
    """
    log.info("===== Starting postprocessing based on read alignment")
    log.info("Processing scaffolds from " + contigs_file)
    log.info("Using read alignments to break and filter scaffolds")

    # Read the whole file inside a ``with`` block so the handle is closed
    # deterministically instead of being left to the garbage collector.
    # NOTE(review): the "U" mode flag was removed in Python 3.11; it is kept
    # here to preserve universal-newline handling on older interpreters.
    with open(contigs_file, "rU") as contigs_handle:
        contigs = list(SeqIO.parse(contigs_handle, "fasta"))

    sam = sam_parser.SamChain(
        [sam_parser.Samfile(sam_file) for sam_file in sam_files])

    # Runs before any breaker/filter is constructed, as in the original code.
    generate_quality.GenerateQuality(contigs, sam)

    # ``pattern`` and ``rc_pattern`` are module-level names defined outside
    # this function.
    pattern_filter = moleculo_filter_contigs.PatternContigFilter(
        contigs, sam, pattern, rc_pattern)
    length_filter = moleculo_filter_contigs.ContigLengthFilter(1500)
    coverage_breaker = break_by_coverage.ContigBreaker(contigs, sam, 100, 50)
    pattern_breaker = break_by_coverage.PatternBreaker(
        pattern, rc_pattern, 150)
    n_breaker = break_by_coverage.NBreaker(3)

    result = SplitAndFilter(contigs, coverage_breaker, length_filter,
                            n_breaker, pattern_breaker, pattern_filter)

    OutputResults(output_file, "fasta", result)
    OutputResults(output_file, "fastq", result)
    log.info("===== Postprocessing finished. Results can be found in "
             + output_file + ".fastq")