def convert_to_sam(self, regions, output):
    """Export the reads of all masked regions as a SAM file.

    @param regions: iterable of (name, start, stop) tuples describing the
        masked regions to export (coordinates presumably 0-based; the +1 on
        read.start below converts to SAM's 1-based POS — TODO confirm).
    @param output: destination filename, or "-" to write to stdout.
    """
    if self.verbosity == "verbose":
        print(" - Converting to SAM: " + output)
    if output == "-":
        fh = sys.stdout
    else:
        fh = open(output, "w")
    i = 0
    # 1: write header.
    # SAM header fields must be TAB-separated (SAM spec v1, section 1.3);
    # space-separated tags are rejected by samtools.  The alignment lines
    # below already used explicit "\t" separators.
    fh.write("@HD\tVN:1.0\tSO:unsorted\n")
    for region in regions:
        if self.input_format == 'bam':
            aligned_reads = BAMParser(region[0], region[1], region[2],
                                      self.alignments, self.verbosity)
        elif self.input_format == 'sslm':
            aligned_reads = SSLMParser(region[0], region[1], region[2],
                                       self.alignments, self.verbosity)
        # Only emit an @SQ line for regions that actually contain reads;
        # peeking at the first item of the read iterator is enough.
        iterator = aligned_reads.parse_reads()
        if next(iterator, None):
            fh.write("@SQ\tSN:" + region[0] + "\tLN:" +
                     str(region[2] - region[1] + 1) + "\n")
        del (iterator, aligned_reads)
    fh.write("@PG\tID:0\tPN:manual_conversion_script\tVN:0.0\n")
    # 2: write alignment lines
    for region in regions:
        if self.verbosity == "verbose":
            print(" - Masked region: " + region[0] + ":" +
                  str(region[1]) + "-" + str(region[2]))
        if self.input_format == 'bam':
            aligned_reads = BAMParser(region[0], region[1], region[2],
                                      self.alignments, self.verbosity)
        elif self.input_format == 'sslm':
            aligned_reads = SSLMParser(region[0], region[1], region[2],
                                       self.alignments, self.verbosity)
        for read in aligned_reads.parse_reads():
            if read.name:
                fh.write(read.name)
            else:
                # Nameless reads get a unique, sequentially numbered name.
                fh.write("unknown_read_" + str(i))
                i += 1
            # Column 5 of a SAM record is the mapping quality, not the
            # strand (the old variable name was misleading); "60" is used
            # here as a fixed placeholder MAPQ.
            mapq = "60"
            fh.write("\t0\t" + region[0] + "\t" + str(read.start + 1) +
                     "\t" + mapq + "\t" + str(read.stop - read.start) +
                     "M\t*\t0\t0\t" + read.sequence + "\t*\tNH:i:1\n")
    # Never close sys.stdout -- the interpreter and the caller still need it.
    if fh is not sys.stdout:
        fh.close()
def run(self, regions, fasta_file):
    """Run fragment detection on every masked region.

    Remembers fasta_file on the instance, then, per region, builds the
    appropriate alignment parser, gathers its statistics, and hands a
    FragmentFinder for that region to self.add_fragments().
    """
    if self.verbosity == "verbose":
        print(" - Running fragment detection")
    self.fasta_file = fasta_file
    for masked_region in regions:
        if self.verbosity == "verbose":
            print(" - Masked region: %s:%s-%s" % (masked_region[0],
                                                  str(masked_region[1]),
                                                  str(masked_region[2])))
            print(" * Acquiring statistics")
        if self.input_format == 'bam':
            parser = BAMParser(masked_region[0], masked_region[1],
                               masked_region[2], self.alignments,
                               self.verbosity)
        elif self.input_format == 'sslm':
            parser = SSLMParser(masked_region[0], masked_region[1],
                                masked_region[2], self.alignments,
                                self.verbosity)
        parser.parse_stats()
        if self.verbosity == "verbose":
            print(" * Detecting fragments")
        finder = FragmentFinder(masked_region, parser)
        self.add_fragments(finder, self.fasta_file)
def test_01_a(self):
    """Integration test: unpack an SSLM fixture, convert it to SAM and
    check the output size.  File size equality is taken as sufficient
    evidence of a correct conversion."""
    archive = '../share/small_RNA-seq_alignments/SRP028959/SRR954958.tar.gz'
    subprocess.call(['tar', '-xzf', archive])
    args = CLI_sslm2sam(['-o', 'tmp/tests/test.sam', 'SRR954958'])
    converter = SSLMParser(args.sslm_directory)
    converter.convert_to_sam(args.output)
    size_matches = (os.stat("tmp/tests/test.sam").st_size == 46985661)
    self.assertTrue(
        size_matches,
        "Incorrect ../share/small_RNA-seq_alignments/SRP028959/test.sam")
    # Clean up only on success, so a failing run leaves the evidence behind.
    if size_matches:
        os.remove("tmp/tests/test.sam")
        os.remove("SRR954958.bam")
        os.remove("SRR954958.bam.bai")
        shutil.rmtree("SRR954958")
def convert_to_bed(self, regions, output):
    """Export the stacked reads of all masked regions as a BED file.

    @param regions: iterable of (name, start, stop) tuples describing the
        masked regions to export.
    @param output: destination filename, or "-" to write to stdout.

    One BED line is written per stacked read; the score column carries the
    stack's hit count, and the strand column is hard-coded to "-".
    """
    if self.verbosity == "verbose":
        print(" - Converting to BED: " + output)
    if output == "-":
        fh = sys.stdout
    else:
        fh = open(output, "w")
    i = 0
    for region in regions:
        if self.verbosity == "verbose":
            print(" - Masked region: " + region[0] + ":" +
                  str(region[1]) + "-" + str(region[2]))
        if self.input_format == 'bam':
            aligned_reads = BAMParser(region[0], region[1], region[2],
                                      self.alignments, self.verbosity)
        elif self.input_format == 'sslm':
            aligned_reads = SSLMParser(region[0], region[1], region[2],
                                       self.alignments, self.verbosity)
        for read_stacked in aligned_reads.parse_reads_stacked():
            read = read_stacked[0]
            numberofhits = read_stacked[1]
            if read.name:
                fh.write(region[0] + "\t" + str(read.start) + "\t" +
                         str(read.stop) + "\t" + read.name + "\t" +
                         str(numberofhits) + "\t-\n")
            else:
                # Nameless reads get a unique, sequentially numbered name.
                fh.write(region[0] + "\t" + str(read.start) + "\t" +
                         str(read.stop) + "\tunknown_read_" + str(i) +
                         "\t" + str(numberofhits) + "\t-\n")
                i += 1
    # Never close sys.stdout -- the interpreter and the caller still need it.
    if fh is not sys.stdout:
        fh.close()
def count_error_with_intensity(self, regions, links, masked_regions,
                               reference_offset=0):
    """Measure 5'/3' boundary errors of predicted fragments vs. annotations.

    All sequences in our library of ncRNAs have been extended with 10 bases.

    @param regions: annotation container; regions.index[...] yields an
        object with a .fragments attribute (list of miRNA annotations).
    @param links: dict mapping ncRNA names to keys of regions.index.
    @param masked_regions: iterable of (ncRNA_name, start, stop) tuples;
        names absent from links are skipped.
    @param reference_offset: shift subtracted from predicted coordinates
        before comparing them to the annotation.
    @return: list of dicts -- one per annotated miRNA overlapping a
        predicted fragment -- with '5p'/'3p' [supporting reads, error]
        pairs and total 'coverage'.
    """
    out = []
    if self.verbosity == "verbose":
        print(" - Running fragment detection")
    for region in masked_regions:
        ncRNA = region[0]
        # 'in' replaces the Python-2-only dict.has_key(); same semantics.
        if ncRNA in links:
            if self.verbosity == "verbose":
                print(" - Analysing: " + ncRNA)
            annotations = regions.index[links[ncRNA]]
            if self.input_format == 'bam':
                aligned_reads = BAMParser(region[0], region[1], region[2],
                                          self.alignments, self.verbosity)
            elif self.input_format == 'sslm':
                aligned_reads = SSLMParser(region[0], region[1], region[2],
                                           self.alignments, self.verbosity)
            aligned_reads.parse_stats()
            predicted_fragments_obj = FragmentFinder(ncRNA, aligned_reads)
            predicted_fragments_obj.run()
            predicted_fragments = predicted_fragments_obj.getResults()
            aligned_reads.count_reads_per_region(
                predicted_fragments_obj.getResults())
            for mirna_annotation in annotations.fragments:
                closest_fragment = self.find_closest_overlapping_fragment(
                    mirna_annotation, predicted_fragments, reference_offset)
                if closest_fragment:
                    errors = self.find_errors(mirna_annotation, [
                        (closest_fragment.start - reference_offset),
                        (closest_fragment.stop - reference_offset)
                    ])  # @todo pass reference_offset to find_errors instead
                    err_5p = errors[0]
                    err_3p = errors[1]
                    out.append({
                        '5p': [closest_fragment.supporting_reads_start,
                               err_5p],
                        '3p': [closest_fragment.supporting_reads_stop,
                               err_3p],
                        'coverage': closest_fragment.supporting_reads
                    })
    return out
def count_reads_per_region(self, regions, links, masked_regions,
                           reference_offset=0):
    """Tabulate fragment-prediction accuracy per annotated miRNA.

    All sequences in our library of ncRNAs have been extended with 10 bases.

    @param regions: annotation container; regions.index[...] yields an
        object with a .fragments attribute (list of miRNA annotations).
    @param links: dict mapping ncRNA names to keys of regions.index.
    @param masked_regions: iterable of (ncRNA_name, start, stop) tuples;
        names absent from links are skipped.
    @param reference_offset: shift applied when matching predicted
        fragments against the annotation.
    @return: stats table keyed by evidence class ('experimental' /
        'not_experimental'); each class holds 5'/3' boundary-error
        histograms (values clamped to [-5, 5] with "<-5"/">5" overflow
        bins) plus predicted / not-predicted counters.
    """
    def fresh_error_bins():
        # One *independent* histogram per class and per end.
        return {"<-5": 0, -5: 0, -4: 0, -3: 0, -2: 0, -1: 0,
                0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, ">5": 0}

    stats_table = {}
    for evidence_class in ('experimental', 'not_experimental'):
        stats_table[evidence_class] = {
            'error_5p': fresh_error_bins(),
            'error_3p': fresh_error_bins(),
            'predicted': 0,
            'not_predicted_no_reads': 0,
            'not_predicted_with_reads': 0
        }
    if self.verbosity == "verbose":
        print(" - Running fragment detection")
    i = 0  # number of annotated pre-miRNAs analysed
    j = 0  # number of annotated miRNAs checked
    for region in masked_regions:
        ncRNA = region[0]
        # 'in' replaces the Python-2-only dict.has_key(); same semantics.
        if ncRNA in links:
            if self.verbosity == "verbose":
                print(" - Analysing: " + ncRNA)
            annotations = regions.index[links[ncRNA]]
            if self.input_format == 'bam':
                aligned_reads = BAMParser(region[0], region[1], region[2],
                                          self.alignments, self.verbosity)
            elif self.input_format == 'sslm':
                aligned_reads = SSLMParser(region[0], region[1], region[2],
                                           self.alignments, self.verbosity)
            aligned_reads.parse_stats()
            predicted_fragments_obj = FragmentFinder(ncRNA, aligned_reads)
            predicted_fragments_obj.run()
            predicted_fragments = predicted_fragments_obj.results
            i += 1
            for annotation in annotations.fragments:
                closest = self.find_closest_overlapping_fragment(
                    annotation, predicted_fragments, reference_offset)
                j += 1
                # Both evidence classes are tallied identically; pick the
                # matching sub-table once instead of duplicating branches.
                if annotation.evidence == "experimental":
                    group = stats_table['experimental']
                else:
                    group = stats_table['not_experimental']
                if closest:
                    errors = self.find_errors(annotation, closest,
                                              reference_offset)
                    err_5p = errors[0]
                    err_3p = errors[1]
                    # Clamp large errors into the overflow bins.
                    if err_5p > 5:
                        err_5p = ">5"
                    elif err_5p < -5:
                        err_5p = "<-5"
                    if err_3p > 5:
                        err_3p = ">5"
                    elif err_3p < -5:
                        err_3p = "<-5"
                    group["predicted"] += 1
                    group["error_5p"][err_5p] += 1
                    group["error_3p"][err_3p] += 1
                else:
                    if annotation.get_supporting_reads() == 0:
                        group["not_predicted_no_reads"] += 1
                    else:
                        group["not_predicted_with_reads"] += 1
    # Same output as the old Python-2 statements `print i, "..."`.
    print("%d annotated pre-miRNAs" % i)
    print("%d annotated miRNAs" % j)
    return stats_table