Пример #1
0
    def convert_to_sam(self, regions, output):
        """Write the reads aligned to `regions` to `output` as SAM text.

        The output consists of an ``@HD`` line, one ``@SQ`` line for every
        region whose parser yields a truthy first read, a ``@PG`` line and
        then one alignment line per parsed read.

        Args:
            regions: records indexed as ``region[0]`` (used as reference
                name) and ``region[1]`` / ``region[2]`` (bounds handed to the
                parser). The collection is iterated twice, so it must be
                re-iterable (e.g. a list).
            output: path of the file to write, or ``"-"`` for ``sys.stdout``.

        Raises:
            ValueError: if a region has to be parsed while
                ``self.input_format`` is neither ``'bam'`` nor ``'sslm'``.
        """
        if self.verbosity == "verbose":
            print("   - Converting to SAM: " + output)

        def open_region(region):
            # Build the parser that matches the configured input format.
            if self.input_format == 'bam':
                parser_class = BAMParser
            elif self.input_format == 'sslm':
                parser_class = SSLMParser
            else:
                raise ValueError(
                    "Unsupported input format: %r (expected 'bam' or 'sslm')"
                    % (self.input_format,))
            return parser_class(region[0], region[1], region[2],
                                self.alignments, self.verbosity)

        to_stdout = (output == "-")
        fh = sys.stdout if to_stdout else open(output, "w")

        try:
            # 1: write header
            fh.write("@HD\tVN:1.0\tSO:unsorted\n")
            for region in regions:
                aligned_reads = open_region(region)
                iterator = aligned_reads.parse_reads()

                # Only regions whose first parsed read is truthy get an @SQ
                # line; LN is the inclusive span of the region bounds.
                if next(iterator, None):
                    fh.write("@SQ\tSN:" + region[0] + "\tLN:" +
                             str(region[2] - region[1] + 1) + "\n")

                # Release the parser before the next region is opened.
                del iterator, aligned_reads
            fh.write("@PG\tID:0\tPN:manual_conversion_script\tVN:0.0\n")

            # 2: write alignment
            mapq = "60"  # constant written to the fifth (MAPQ) column
            unnamed_reads = 0  # running suffix for reads without a name

            for region in regions:
                if self.verbosity == "verbose":
                    print("   - Masked region: " + region[0] + ":" +
                          str(region[1]) + "-" + str(region[2]))

                for read in open_region(region).parse_reads():
                    if read.name:
                        name = read.name
                    else:
                        name = "unknown_read_" + str(unnamed_reads)
                        unnamed_reads += 1

                    # POS is read.start + 1 (presumably 0-based -> 1-based;
                    # confirm against the parsers). The CIGAR is a single
                    # match block of length stop - start.
                    fields = [
                        name, "0", region[0], str(read.start + 1), mapq,
                        str(read.stop - read.start) + "M", "*", "0", "0",
                        read.sequence, "*", "NH:i:1"
                    ]
                    fh.write("\t".join(fields) + "\n")
        finally:
            # Never close the interpreter's stdout; only files we opened.
            if to_stdout:
                fh.flush()
            else:
                fh.close()
Пример #2
0
    def run(self, regions, fasta_file):
        """Run fragment detection on every region and register the results.

        For each region a parser matching ``self.input_format`` is built,
        its ``parse_stats()`` is called, and a ``FragmentFinder`` created
        from the region and the parser is handed to ``self.add_fragments``.

        Args:
            regions: iterable of records indexed as ``region[0]`` (name) and
                ``region[1]`` / ``region[2]`` (bounds handed to the parser).
            fasta_file: stored on ``self.fasta_file`` and forwarded to
                ``self.add_fragments``.

        Raises:
            ValueError: if a region has to be parsed while
                ``self.input_format`` is neither ``'bam'`` nor ``'sslm'``.
        """
        verbose = (self.verbosity == "verbose")
        if verbose:
            print(" - Running fragment detection")

        self.fasta_file = fasta_file

        for region in regions:
            if verbose:
                print("   - Masked region: " + region[0] + ":" +
                      str(region[1]) + "-" + str(region[2]))
                print("     * Acquiring statistics")

            if self.input_format == 'bam':
                parser_class = BAMParser
            elif self.input_format == 'sslm':
                parser_class = SSLMParser
            else:
                # Fail loudly instead of hitting an unbound (or stale,
                # previous-iteration) parser further down.
                raise ValueError(
                    "Unsupported input format: %r (expected 'bam' or 'sslm')"
                    % (self.input_format,))

            aligned_reads = parser_class(region[0], region[1], region[2],
                                         self.alignments, self.verbosity)
            aligned_reads.parse_stats()

            if verbose:
                print("     * Detecting fragments")

            # NOTE(review): the finder is handed over without calling
            # .run() here; presumably add_fragments drives it - confirm.
            predicted_fragments = FragmentFinder(region, aligned_reads)
            self.add_fragments(predicted_fragments, self.fasta_file)
Пример #3
0
    def test_01_a(self):
        """Convert the SRR954958 SSLM archive to SAM and check the file size.

        The archive is unpacked into the working directory, converted to
        ``tmp/tests/test.sam`` and the size of that file is compared with a
        known value. The SAM file is removed only when the check passes, so
        a failing output stays available for inspection; the unpacked
        artefacts are removed in every case.
        """
        archive = '../share/small_RNA-seq_alignments/SRP028959/SRR954958.tar.gz'
        sam_path = "tmp/tests/test.sam"

        try:
            # check_call makes a failed extraction surface immediately
            # instead of as a confusing conversion error later on.
            subprocess.check_call(['tar', '-xzf', archive])

            args = CLI_sslm2sam(['-o', sam_path, 'SRR954958'])
            sslm2bed_converter = SSLMParser(args.sslm_directory)
            sslm2bed_converter.convert_to_sam(args.output)

            # Assume file size is sufficient :)
            self.assertEqual(
                os.stat(sam_path).st_size, 46985661,
                "Incorrect ../share/small_RNA-seq_alignments/SRP028959/test.sam")

            # Only reached when the assertion above passed.
            os.remove(sam_path)
        finally:
            # Clean the working directory even when the test fails, so a
            # failed run does not leave artefacts behind for later tests.
            for leftover in ("SRR954958.bam", "SRR954958.bam.bai"):
                if os.path.exists(leftover):
                    os.remove(leftover)
            shutil.rmtree("SRR954958", ignore_errors=True)
Пример #4
0
    def convert_to_bed(self, regions, output):
        """Write the stacked reads aligned to `regions` to `output` as BED.

        Every item yielded by the parser's ``parse_reads_stacked()`` becomes
        one tab-separated line: region name, ``read.start``, ``read.stop``,
        read name, number of hits and a constant ``-`` in the last column.
        Reads without a name are labelled ``unknown_read_<n>`` with a
        counter that runs across all regions.

        Args:
            regions: iterable of records indexed as ``region[0]`` (name) and
                ``region[1]`` / ``region[2]`` (bounds handed to the parser).
            output: path of the file to write, or ``"-"`` for ``sys.stdout``.

        Raises:
            ValueError: if a region has to be parsed while
                ``self.input_format`` is neither ``'bam'`` nor ``'sslm'``.
        """
        verbose = (self.verbosity == "verbose")
        if verbose:
            print("   - Converting to BED: " + output)

        to_stdout = (output == "-")
        fh = sys.stdout if to_stdout else open(output, "w")

        try:
            unnamed_reads = 0  # running suffix for reads without a name

            for region in regions:
                if verbose:
                    print("   - Masked region: " + region[0] + ":" +
                          str(region[1]) + "-" + str(region[2]))

                if self.input_format == 'bam':
                    parser_class = BAMParser
                elif self.input_format == 'sslm':
                    parser_class = SSLMParser
                else:
                    raise ValueError(
                        "Unsupported input format: %r "
                        "(expected 'bam' or 'sslm')" % (self.input_format,))

                aligned_reads = parser_class(region[0], region[1], region[2],
                                             self.alignments, self.verbosity)

                for read_stacked in aligned_reads.parse_reads_stacked():
                    # Each item is indexed as (read, number of hits).
                    read = read_stacked[0]
                    numberofhits = read_stacked[1]

                    if read.name:
                        name = read.name
                    else:
                        name = "unknown_read_" + str(unnamed_reads)
                        unnamed_reads += 1

                    fields = [
                        region[0], str(read.start), str(read.stop), name,
                        str(numberofhits), "-"
                    ]
                    fh.write("\t".join(fields) + "\n")
        finally:
            # Never close the interpreter's stdout; only files we opened.
            if to_stdout:
                fh.flush()
            else:
                fh.close()
Пример #5
0
    def count_error_with_intensity(self,
                                   regions,
                                   links,
                                   masked_regions,
                                   reference_offset=0):
        """Pair every annotated fragment with its closest prediction.

        All sequences in our library of ncRNAs have been extended with 10
        bases.

        Masked regions whose name (``region[0]``) is not a key of `links`
        are skipped. For the others, fragments are predicted with
        ``FragmentFinder`` and each annotated fragment is matched through
        ``self.find_closest_overlapping_fragment``; annotations without a
        match contribute nothing.

        Args:
            regions: object whose ``index`` mapping is looked up with
                ``links[name]`` to obtain the annotations of a region.
            links: mapping from masked-region name to a key of
                ``regions.index``.
            masked_regions: iterable of records indexed as ``region[0]``
                (name) and ``region[1]`` / ``region[2]`` (parser bounds).
            reference_offset: subtracted from the matched fragment's start
                and stop before they are passed to ``self.find_errors``;
                also forwarded to the closest-fragment lookup.

        Returns:
            A list with one dict per matched annotation:
            ``'5p'`` -> ``[supporting_reads_start, first error]``,
            ``'3p'`` -> ``[supporting_reads_stop, second error]`` and
            ``'coverage'`` -> ``supporting_reads`` of the matched fragment.

        Raises:
            ValueError: if a region has to be parsed while
                ``self.input_format`` is neither ``'bam'`` nor ``'sslm'``.
        """
        out = []
        verbose = (self.verbosity == "verbose")

        if verbose:
            print(" - Running fragment detection")

        for region in masked_regions:
            ncRNA = region[0]
            if ncRNA not in links:
                continue

            if verbose:
                print("   - Analysing: " + ncRNA)

            annotations = regions.index[links[ncRNA]]

            if self.input_format == 'bam':
                parser_class = BAMParser
            elif self.input_format == 'sslm':
                parser_class = SSLMParser
            else:
                raise ValueError(
                    "Unsupported input format: %r (expected 'bam' or 'sslm')"
                    % (self.input_format,))

            aligned_reads = parser_class(region[0], region[1], region[2],
                                         self.alignments, self.verbosity)
            aligned_reads.parse_stats()

            predicted_fragments_obj = FragmentFinder(ncRNA, aligned_reads)
            predicted_fragments_obj.run()
            predicted_fragments = predicted_fragments_obj.getResults()

            # Called before the fragments' supporting_reads* attributes are
            # read below; presumably this is what populates them - confirm.
            aligned_reads.count_reads_per_region(predicted_fragments)

            for mirna_annotation in annotations.fragments:
                closest_fragment = self.find_closest_overlapping_fragment(
                    mirna_annotation, predicted_fragments, reference_offset)

                if not closest_fragment:
                    continue

                # TODO (original note "@todo ,reference_offset"; intent
                # unclear - verify): the offset is applied here rather than
                # passed to find_errors.
                errors = self.find_errors(mirna_annotation, [
                    closest_fragment.start - reference_offset,
                    closest_fragment.stop - reference_offset
                ])

                out.append({
                    '5p': [closest_fragment.supporting_reads_start,
                           errors[0]],
                    '3p': [closest_fragment.supporting_reads_stop,
                           errors[1]],
                    'coverage': closest_fragment.supporting_reads
                })

        return out
Пример #6
0
    def count_reads_per_region(self,
                               regions,
                               links,
                               masked_regions,
                               reference_offset=0):
        """Tabulate how well predicted fragments match the annotations.

        All sequences in our library of ncRNAs have been extended with 10
        bases.

        Masked regions whose name (``region[0]``) is not a key of `links`
        are skipped. For the others, fragments are predicted with
        ``FragmentFinder`` and every annotated fragment is matched through
        ``self.find_closest_overlapping_fragment``. Two summary lines (the
        number of analysed regions and of annotated fragments) are printed
        before returning.

        Args:
            regions: object whose ``index`` mapping is looked up with
                ``links[name]`` to obtain the annotations of a region.
            links: mapping from masked-region name to a key of
                ``regions.index``.
            masked_regions: iterable of records indexed as ``region[0]``
                (name) and ``region[1]`` / ``region[2]`` (parser bounds).
            reference_offset: forwarded to the closest-fragment lookup and
                to ``self.find_errors``.

        Returns:
            A dict with the keys ``'experimental'`` and
            ``'not_experimental'`` (chosen by ``annotation.evidence``). Each
            value holds the histograms ``'error_5p'`` and ``'error_3p'``
            (bins ``-5`` .. ``5`` plus ``"<-5"`` and ``">5"``) and the
            counters ``'predicted'``, ``'not_predicted_no_reads'`` and
            ``'not_predicted_with_reads'``.

        Raises:
            ValueError: if a region has to be parsed while
                ``self.input_format`` is neither ``'bam'`` nor ``'sslm'``.
        """

        def empty_histogram():
            # One bin per error from -5 to 5 plus two overflow bins.
            histogram = {"<-5": 0, ">5": 0}
            for offset in range(-5, 6):
                histogram[offset] = 0
            return histogram

        def empty_stats():
            # Fresh, independent counters for one evidence category.
            return {
                'error_5p': empty_histogram(),
                'error_3p': empty_histogram(),
                'predicted': 0,
                'not_predicted_no_reads': 0,
                'not_predicted_with_reads': 0
            }

        def to_bin(error):
            # Map an error onto its histogram key, clamping the outliers.
            if error > 5:
                return ">5"
            if error < -5:
                return "<-5"
            return error

        stats_table = {
            'experimental': empty_stats(),
            'not_experimental': empty_stats()
        }

        verbose = (self.verbosity == "verbose")
        if verbose:
            print(" - Running fragment detection")

        analysed_regions = 0
        annotated_fragments = 0

        for region in masked_regions:
            ncRNA = region[0]
            if ncRNA not in links:
                continue

            if verbose:
                print("   - Analysing: " + ncRNA)

            annotations = regions.index[links[ncRNA]]

            if self.input_format == 'bam':
                parser_class = BAMParser
            elif self.input_format == 'sslm':
                parser_class = SSLMParser
            else:
                raise ValueError(
                    "Unsupported input format: %r (expected 'bam' or 'sslm')"
                    % (self.input_format,))

            aligned_reads = parser_class(region[0], region[1], region[2],
                                         self.alignments, self.verbosity)
            aligned_reads.parse_stats()

            predicted_fragments_obj = FragmentFinder(ncRNA, aligned_reads)
            predicted_fragments_obj.run()
            predicted_fragments = predicted_fragments_obj.results

            analysed_regions += 1

            for annotation in annotations.fragments:
                closest = self.find_closest_overlapping_fragment(
                    annotation, predicted_fragments, reference_offset)
                annotated_fragments += 1

                if closest:
                    errors = self.find_errors(annotation, closest,
                                              reference_offset)
                    bin_5p = to_bin(errors[0])
                    bin_3p = to_bin(errors[1])

                if annotation.evidence == "experimental":
                    stats = stats_table['experimental']
                else:
                    stats = stats_table['not_experimental']

                if closest:
                    stats["predicted"] += 1
                    stats["error_5p"][bin_5p] += 1
                    stats["error_3p"][bin_3p] += 1
                elif annotation.get_supporting_reads() == 0:
                    stats["not_predicted_no_reads"] += 1
                else:
                    stats["not_predicted_with_reads"] += 1

        # Single-argument print calls give the same output on Python 2 and 3.
        print(str(analysed_regions) + " annotated pre-miRNAs")
        print(str(annotated_fragments) + " annotated miRNAs")

        return stats_table