Exemplo n.º 1
0
def sff_filter(in_file, out_file, iterator_filter, inter):
    """Copy the reads selected by ``iterator_filter`` into a new SFF file.

    Args:
        in_file: Path of the SFF file to read (opened in binary mode).
        out_file: Path of the SFF file to create (opened in binary mode).
        iterator_filter: Callable that receives the record iterator (or,
            when ``inter`` is true, the output of ``pair``) and returns the
            iterable that should be written.
        inter: If true, the records are passed through ``pair`` before
            filtering, and the filtered result is flattened again with
            ``itertools.chain.from_iterable`` before it is written.

    Returns:
        The count reported by ``SffWriter.write_file``. When ``inter`` is
        true that count is halved with floor division, so an integer count
        stays an integer.
    """
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # Reading the manifest failed; write the output without one.
            manifest = None
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            if inter:
                from itertools import chain

                count = writer.write_file(
                    chain.from_iterable(
                        iterator_filter(pair(SffIterator(in_handle)))))
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division: "/=" would turn the pair count into a
                # float wherever true division is in effect (Python 3).
                count //= 2
            else:
                count = writer.write_file(
                    iterator_filter(SffIterator(in_handle)))
    return count
Exemplo n.º 2
0
def sff_filter(in_file, out_file, iterator_filter, inter):
    """Copy the reads selected by ``iterator_filter`` into a new SFF file.

    Args:
        in_file: Path of the SFF file to read (opened in binary mode).
        out_file: Path of the SFF file to create (opened in binary mode).
        iterator_filter: Callable that receives the record iterator (or,
            when ``inter`` is true, the output of ``pair``) and returns the
            iterable that should be written.
        inter: If true, the records are passed through ``pair`` before
            filtering, and the filtered result is flattened again with
            ``itertools.chain.from_iterable`` before it is written.

    Returns:
        The count reported by ``SffWriter.write_file``. When ``inter`` is
        true that count is halved with floor division, so an integer count
        stays an integer.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys_exit("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    with open(in_file, "rb") as in_handle:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # Reading the manifest failed; write the output without one.
            manifest = None
        with open(out_file, "wb") as out_handle:
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            if inter:
                from itertools import chain
                count = writer.write_file(chain.from_iterable(iterator_filter(pair(SffIterator(in_handle)))))
                assert count % 2 == 0, "Odd number of records? %i" % count
                # Floor division: "/=" would turn the pair count into a
                # float wherever true division is in effect (Python 3).
                count //= 2
            else:
                count = writer.write_file(iterator_filter(SffIterator(in_handle)))
    return count
Exemplo n.º 3
0
 def run(self, proc_name=None):
     """Write the reads yielded by ``reads_for_barcode`` to ``<id_str>.sff``.

     ``proc_name`` is stored on the instance once the output file is open.
     If a ValueError is raised while writing, the output file is deleted
     (the original comment attributes this to a barcode with no reads).
     """
     out_path = self.id_str + '.sff'
     try:
         with open(out_path, 'wb') as out_handle:
             self.proc_name = proc_name
             writer = SffWriter(out_handle)
             self.sff_file = writer
             matching_reads = self.reads_for_barcode(self.reads_sff)
             writer.write_file(matching_reads)
             summary = (self._matched_reads, self._processed, self.id_str)
             logger.info("%s reads of %s matched %s" % summary)
     except ValueError:
         # Nothing usable was written, so drop the temporary file
         os.unlink(out_path)
Exemplo n.º 4
0
 def test_no_index(self):
     """Round-trip the records through an index-free SFF and look for an index."""
     # Building the no-index SFF file also exercises that part of SffWriter.
     source_records = list(SeqIO.parse(BytesIO(self.good), "sff"))
     with BytesIO() as buffer:
         written = SffWriter(buffer, index=False).write_file(source_records)
         self.assertEqual(written, len(source_records))

         # Parse back what was just written and compare read identifiers.
         buffer.seek(0)
         reparsed = list(SeqIO.parse(buffer, "sff"))
         self.assertEqual(len(source_records), len(reparsed))
         for before, after in zip(source_records, reparsed):
             self.assertEqual(before.id, after.id)

         # Looking for the index in that data must fail with this message.
         buffer.seek(0)
         with self.assertRaises(ValueError) as caught:
             _sff_find_roche_index(buffer)
         message = str(caught.exception)
         self.assertEqual(message, "No index present in this SFF file")
Exemplo n.º 5
0
def sff_filter(in_file, pos_file, neg_file, wanted):
    """Split an SFF file by whether each read's cleaned ID is in ``wanted``.

    Args:
        in_file: Path of the SFF file to read (opened in binary mode).
        pos_file: Path for reads whose ``clean_name(rec.id)`` is in
            ``wanted``, or None to skip that output.
        neg_file: Path for reads whose ``clean_name(rec.id)`` is not in
            ``wanted``, or None to skip that output.
        wanted: Container of identifiers, used for membership tests.

    Returns:
        Tuple ``(pos_count, neg_count)`` of the counts reported by
        ``SffWriter.write_file``; a skipped output contributes 0.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    # Context managers guarantee every handle is closed even when parsing
    # or writing raises part-way through.
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None

        # This makes two passes through the SFF file which isn't so
        # efficient, but this makes the code simple.
        if pos_file is not None:
            with open(pos_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(
                    rec for rec in SffIterator(in_handle) if clean_name(rec.id) in wanted
                )
        if neg_file is not None:
            with open(neg_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(
                    rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in wanted
                )
    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so the counts are returned for the caller to report.
    return pos_count, neg_count
Exemplo n.º 6
0
 def test_no_index(self):
     """Write an SFF file without an index and check that none is found in it."""
     # Does a lot of work to create a no-index SFF file
     # (in the process checking this bit of SffWriter works)
     records = list(SeqIO.parse(BytesIO(self.good), "sff"))
     with BytesIO() as handle:
         writer = SffWriter(handle, index=False)
         count = writer.write_file(records)
         self.assertEqual(count, len(records))
         handle.seek(0)
         new = list(SeqIO.parse(handle, "sff"))
         self.assertEqual(len(records), len(new))
         for a, b in zip(records, new):
             self.assertEqual(a.id, b.id)
         handle.seek(0)
         # assertRaises fails the test by itself if no ValueError is raised,
         # replacing the manual try/except/else with assertTrue(False, ...).
         with self.assertRaises(ValueError) as cm:
             _sff_find_roche_index(handle)
         self.assertEqual(str(cm.exception), "No index present in this SFF file")
Exemplo n.º 7
0
 def run(self, proc_name=None):
     """Write the reads yielded by ``reads_for_barcode`` to ``<id_str>.sff``.

     ``proc_name`` is stored on the instance once the output file is open.
     If a ValueError is raised while writing, the output file is deleted
     (the original comment attributes this to a barcode with no reads).
     """
     target = self.id_str + '.sff'
     try:
         with open(target, 'wb') as sff_out:
             self.proc_name = proc_name
             self.sff_file = SffWriter(sff_out)
             selected = self.reads_for_barcode(self.reads_sff)
             self.sff_file.write_file(selected)
             counts = (self._matched_reads, self._processed, self.id_str)
             logger.info("%s reads of %s matched %s" % counts)
     except ValueError:
         # Nothing usable was written, so drop the temporary file
         os.unlink(target)
Exemplo n.º 8
0
def sff_filter(in_file, out_file, iterator_filter):
    """Write the records that ``iterator_filter`` lets through to a new SFF file.

    ``in_file`` and ``out_file`` are paths opened in binary mode. The Roche
    XML manifest of the input is passed on to the writer when it can be
    read (None otherwise). Returns whatever ``SffWriter.write_file``
    reports as its count.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        stop_err("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest as read_manifest
    except ImportError:
        # Biopython before 1.56 only had this as a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as read_manifest
    with open(in_file, "rb") as source:
        try:
            xml_manifest = read_manifest(source)
        except ValueError:
            xml_manifest = None
        source.seek(0)
        with open(out_file, "wb") as sink:
            sff_out = SffWriter(sink, xml=xml_manifest)
            source.seek(0)  # start again after getting manifest
            kept_records = iterator_filter(SffIterator(source))
            written = sff_out.write_file(kept_records)
    return written
Exemplo n.º 9
0
def sff_filter(in_file, out_file, iterator_filter):
    """Write the records that ``iterator_filter`` lets through to a new SFF file.

    ``in_file`` and ``out_file`` are paths opened in binary mode. The Roche
    XML manifest of the input is passed on to the writer when it can be
    read (None otherwise). Returns whatever ``SffWriter.write_file``
    reports as its count.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        stop_err("SFF filtering requires Biopython 1.54 or later")
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Biopython before 1.56 only had this as a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    def load_manifest(handle):
        """Return the manifest, or None when reading it raises ValueError."""
        try:
            return ReadRocheXmlManifest(handle)
        except ValueError:
            return None

    with open(in_file, "rb") as reads_in:
        roche_xml = load_manifest(reads_in)
        reads_in.seek(0)
        with open(out_file, "wb") as reads_out:
            sff_writer = SffWriter(reads_out, xml=roche_xml)
            reads_in.seek(0)  # start again after getting manifest
            return sff_writer.write_file(iterator_filter(SffIterator(reads_in)))
Exemplo n.º 10
0
 def test_write(self):
     """Check that SffWriter output equals the example file byte for byte."""
     path = "Roche/E3MFGYR02_random_10_reads.sff"
     with open(path, "rb") as stream:
         manifest = ReadRocheXmlManifest(stream)
     with open(path, "rb") as stream:
         reads = list(SffIterator(stream))

     # First pass: hand the writer a list.
     list_buffer = BytesIO()
     SffWriter(list_buffer, xml=manifest).write_file(reads)
     from_list = list_buffer.getvalue()

     # Second pass: hand the writer a plain iterator instead.
     iter_buffer = BytesIO()
     SffWriter(iter_buffer, xml=manifest).write_file(iter(reads))
     self.assertEqual(from_list, iter_buffer.getvalue())

     # Both must be 100% identical to the file on disk.
     with open(path, "rb") as stream:
         on_disk = stream.read()
     self.assertEqual(len(from_list), len(on_disk))
     self.assertEqual(from_list, on_disk)
     del from_list
Exemplo n.º 11
0
def sff_filter(in_file, pos_file, neg_file, wanted):
    """Split an SFF file by whether each read's cleaned ID is in ``wanted``.

    Args:
        in_file: Path of the SFF file to read (opened in binary mode).
        pos_file: Path for reads whose ``clean_name(rec.id)`` is in
            ``wanted``, or None to skip that output.
        neg_file: Path for reads whose ``clean_name(rec.id)`` is not in
            ``wanted``, or None to skip that output.
        wanted: Container of identifiers, used for membership tests.

    Returns:
        Tuple ``(pos_count, neg_count)`` of the counts reported by
        ``SffWriter.write_file``; a skipped output contributes 0.
    """
    try:
        from Bio.SeqIO.SffIO import SffIterator, SffWriter
    except ImportError:
        sys.exit("SFF filtering requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    pos_count = neg_count = 0
    # Context managers guarantee every handle is closed even when parsing
    # or writing raises part-way through.
    with open(in_file, "rb") as in_handle:  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None

        # This makes two passes through the SFF file which isn't so
        # efficient, but this makes the code simple.
        if pos_file is not None:
            with open(pos_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again after getting manifest
                pos_count = writer.write_file(rec for rec in SffIterator(in_handle)
                                              if clean_name(rec.id) in wanted)
        if neg_file is not None:
            with open(neg_file, "wb") as out_handle:
                writer = SffWriter(out_handle, xml=manifest)
                in_handle.seek(0)  # start again
                neg_count = writer.write_file(rec for rec in SffIterator(in_handle)
                                              if clean_name(rec.id) not in wanted)
    # At the time of writing, Galaxy doesn't show SFF file read counts,
    # so the counts are returned for the caller to report.
    return pos_count, neg_count
Exemplo n.º 12
0
 def test_write(self):
     """Check that SffWriter output equals the example file byte for byte."""
     filename = "Roche/E3MFGYR02_random_10_reads.sff"
     with open(filename, "rb") as source:
         xml_manifest = ReadRocheXmlManifest(source)
     with open(filename, "rb") as source:
         parsed = list(SffIterator(source))

     def render(records):
         """Write ``records`` to an in-memory SFF and return its bytes."""
         sink = BytesIO()
         SffWriter(sink, xml=xml_manifest).write_file(records)
         return sink.getvalue()

     # Written once from the list, then once more from an iterator.
     data = render(parsed)
     self.assertEqual(data, render(iter(parsed)))

     # Check 100% identical to the original:
     with open(filename, "rb") as source:
         expected = source.read()
     self.assertEqual(len(data), len(expected))
     self.assertEqual(data, expected)
     del data
Exemplo n.º 13
0
class PGMBarcode(object):
    """
        One IonTorrent barcode, able to pull its own reads out of an SFF file
    """
    def __init__(self, *args, **kwargs):
        """
            Set up the barcode from keyword arguments; positional arguments
            are accepted but not used.

            Required keywords - id_str, type, sequence, floworder, index,
            annotation, adapter, score_mode, score_cutoff, sfffilepath, max_num
        """
        # These keywords are stored under an attribute of the same name.
        for field in ('id_str', 'type', 'sequence', 'floworder', 'index',
                      'annotation', 'adapter', 'score_mode', 'score_cutoff'):
            setattr(self, field, kwargs[field])
        self.sff_file = None
        self.proc_name = None

        self.reads_sff = kwargs['sfffilepath']
        self.max_num = kwargs['max_num']
        # Running totals maintained by reads_for_barcode
        self._processed = 0
        self._matched_reads = 0

    def _readMatches(self, read):
        """
            read - record for a read from the sff file

            Returns True when the read's barcode equals this barcode's
            sequence, compared in lower case
        """
        expected = self.sequence.lower()
        return expected == self._getReadBarcode(read)

    def _getReadBarcode(self, read):
        """
            Returns the lower-cased part of the read sequence lying between
            the flow_key and the adapter sequence
        """
        annotations = read.annotations
        barcode_start = len(annotations['flow_key'])
        barcode_end = annotations['clip_adapter_left'] - len(self.adapter)
        return str(read.seq)[barcode_start:barcode_end].lower()

    def reads_for_barcode(self, reads_file):
        """
            Generator yielding only the reads of reads_file that match
            this barcode, updating the processed/matched counters as it goes
        """
        for read in SeqIO.parse(reads_file, 'sff'):
            # Stop once max_num reads have been processed ('All' = no limit)
            limit_reached = (self.max_num != 'All'
                             and self._processed == self.max_num)
            if limit_reached:
                return
            if not self._readMatches(read):
                self._processed += 1
                continue
            details = (self.proc_name, self.id_str, read.id)
            logger.debug("%s: %s Matched Read %s" % details)
            self._matched_reads += 1
            yield read
            self._processed += 1

    def run(self, proc_name=None):
        """
            Write this barcode's reads to <id_str>.sff; the file is removed
            again if writing raises ValueError
        """
        out_path = self.id_str + '.sff'
        try:
            with open(out_path, 'wb') as out_handle:
                self.proc_name = proc_name
                self.sff_file = SffWriter(out_handle)
                selected = self.reads_for_barcode(self.reads_sff)
                self.sff_file.write_file(selected)
                totals = (self._matched_reads, self._processed, self.id_str)
                logger.info("%s reads of %s matched %s" % totals)
        except ValueError:
            # No reads for barcode so remove the temporary file
            os.unlink(out_path)
Exemplo n.º 14
0
        # Pad the index block with NUL characters so that its length is a
        # multiple of 8, then verify that it is.
        if padding:
            padding = 8 - padding
        index += chr(0) * padding
        assert len(index) % 8 == 0

        # Ugly bit of code to make a fake index at start
        # NOTE(review): the input handle opened inline here is never closed
        # explicitly within this fragment.
        records = list(SffIterator(
            open("Roche/E3MFGYR02_random_10_reads.sff", "rb")))
        # NOTE(review): the output is opened in text mode ("w") although the
        # input is read in binary mode - confirm this is intended. No close
        # of out_handle is visible in this part of the code.
        out_handle = open(
            "Roche/E3MFGYR02_alt_index_at_start.sff", "w")
        index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
        # Same padding rule as above: round the length up to a multiple of 8.
        padding = len(index) % 8
        if padding:
            padding = 8 - padding
        index += chr(0) * padding
        w = SffWriter(out_handle, index=False, xml=None)
        # Fake the header...
        # The writer's private fields are filled in by hand from the parsed
        # records, with a zero index offset and length for the first pass.
        w._number_of_reads = len(records)
        w._index_start = 0
        w._index_length = 0
        w._key_sequence = records[0].annotations["flow_key"]
        w._flow_chars = records[0].annotations["flow_chars"]
        w._number_of_flows_per_read = len(w._flow_chars)
        w.write_header()
        # The position after the first header write is where the index will
        # start; record it and the index length, then rewind and write the
        # header a second time.
        w._index_start = out_handle.tell()
        w._index_length = len(index)
        out_handle.seek(0)
        w.write_header()  # this time with index info
        # The fake index goes directly after the header, before any read.
        w.handle.write(index)
        for record in records:
            w.write_record(record)
Exemplo n.º 15
0
def main():
    # Parse Command Line
    try:
        tabular_file, cols_arg, in_file, seq_format, out_positive_file, out_negative_file = sys.argv[1:]
    except ValueError:
        stop_err("Expected six arguments, got %i:\n%s" % (len(sys.argv) - 1, " ".join(sys.argv)))
    try:
        columns = [int(arg) - 1 for arg in cols_arg.split(",")]
    except ValueError:
        stop_err("Expected list of columns (comma separated integers), got %s" % cols_arg)

    if out_positive_file == "-" and out_negative_file == "-":
        stop_err("Neither output file requested")

    # Read tabular file and record all specified identifiers
    ids = set()
    handle = open(tabular_file, "rU")
    if len(columns) > 1:
        # General case of many columns
        for line in handle:
            if line.startswith("#"):
                # Ignore comments
                continue
            parts = line.rstrip("\n").split("\t")
            for col in columns:
                ids.add(parts[col])
        print "Using %i IDs from %i columns of tabular file" % (len(ids), len(columns))
    else:
        # Single column, special case speed up
        col = columns[0]
        for line in handle:
            if not line.startswith("#"):
                ids.add(line.rstrip("\n").split("\t")[col])
        print "Using %i IDs from tabular file" % (len(ids))
    handle.close()

    if seq_format.lower() == "sff":
        # Now write filtered SFF file based on IDs from BLAST file
        try:
            from Bio.SeqIO.SffIO import SffIterator, SffWriter
        except ImportError:
            stop_err("Requires Biopython 1.54 or later")

        try:
            from Bio.SeqIO.SffIO import ReadRocheXmlManifest
        except ImportError:
            # Prior to Biopython 1.56 this was a private function
            from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
        in_handle = open(in_file, "rb")  # must be binary mode!
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            manifest = None
        # This makes two passes though the SFF file with isn't so efficient,
        # but this makes the code simple.
        if out_positive_file != "-":
            out_handle = open(out_positive_file, "wb")
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again after getting manifest
            pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
            out_handle.close()
        if out_negative_file != "-":
            out_handle = open(out_negative_file, "wb")
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0)  # start again
            neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
            out_handle.close()
        # And we're done
        in_handle.close()
        # At the time of writing, Galaxy doesn't show SFF file read counts,
        # so it is useful to put them in stdout and thus shown in job info.
        if out_positive_file != "-" and out_negative_file != "-":
            print "%i with and %i without specified IDs" % (pos_count, neg_count)
        elif out_positive_file != "-":
            print "%i with specified IDs" % pos_count
        elif out_negative_file != "-":
            print "%i without specified IDs" % neg_count
    elif seq_format.lower() == "fasta":
        # Write filtered FASTA file based on IDs from tabular file
        reader = fastaReader(open(in_file, "rU"))
        if out_positive_file != "-" and out_negative_file != "-":
            print "Generating two FASTA files"
            positive_writer = fastaWriter(open(out_positive_file, "w"))
            negative_writer = fastaWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the > on the identifer.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
                else:
                    negative_writer.write(record)
            positive_writer.close()
            negative_writer.close()
        elif out_positive_file != "-":
            print "Generating matching FASTA file"
            positive_writer = fastaWriter(open(out_positive_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the > on the identifer.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
            positive_writer.close()
        elif out_negative_file != "-":
            print "Generating non-matching FASTA file"
            negative_writer = fastaWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the > on the identifer.
                if not record.identifier or record.identifier.split()[0][1:] not in ids:
                    negative_writer.write(record)
            negative_writer.close()
    elif seq_format.lower().startswith("fastq"):
        # Write filtered FASTQ file based on IDs from tabular file
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter

        reader = fastqReader(open(in_file, "rU"))
        if out_positive_file != "-" and out_negative_file != "-":
            print "Generating two FASTQ files"
            positive_writer = fastqWriter(open(out_positive_file, "w"))
            negative_writer = fastqWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the @ on the identifer.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
                else:
                    negative_writer.write(record)
            positive_writer.close()
            negative_writer.close()
        elif out_positive_file != "-":
            print "Generating matching FASTQ file"
            positive_writer = fastqWriter(open(out_positive_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the @ on the identifer.
                if record.identifier and record.identifier.split()[0][1:] in ids:
                    positive_writer.write(record)
            positive_writer.close()
        elif out_negative_file != "-":
            print "Generating non-matching FASTQ file"
            negative_writer = fastqWriter(open(out_negative_file, "w"))
            for record in reader:
                # The [1:] is because the fastaReader leaves the @ on the identifer.
                if not record.identifier or record.identifier.split()[0][1:] not in ids:
                    negative_writer.write(record)
            negative_writer.close()
    else:
        stop_err("Unsupported file type %r" % seq_format)
Exemplo n.º 16
0
    # Locate the manifest reader under whichever name this Biopython has.
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    # The input file is opened here only to fetch the manifest and is
    # closed again straight away; the records come from `records` below.
    in_handle = open(in_file, "rb") #must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        # Fall back to writing the output without a manifest
        manifest = None
    in_handle.close()

    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    count = 0
    #This does have the overhead of parsing into SeqRecord objects,
    #but doing the header and index at the low level is too fiddly.
    # Lazily look up each identifier from the tabular file in `records`;
    # a missing identifier surfaces as KeyError while write_file iterates.
    iterator = (records[name] for name in parse_ids(tabular_file, column))
    try:
        count = writer.write_file(iterator)
    except KeyError, err:
        out_handle.close()
        # NOTE(review): `name` here is not the generator expression's loop
        # variable (generator expressions have their own scope); unless
        # `name` is bound elsewhere in the file this lookup raises
        # NameError - confirm.
        if name not in records:
            stop_err("Identifier %r not found in sequence file" % name)
        else:
            raise err
    out_handle.close()
else:
    #Avoid overhead of parsing into SeqRecord objects,
Exemplo n.º 17
0
    except ImportError:
        sys.exit("Requires Biopython 1.54 or later")

    # Locate the manifest reader under whichever name this Biopython has.
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    in_handle = open(in_file, "rb") #must be binary mode!
    # try/finally so both handles are closed even if reading the manifest,
    # renaming or writing fails part-way through.
    try:
        try:
            manifest = ReadRocheXmlManifest(in_handle)
        except ValueError:
            # Fall back to writing the output without a manifest
            manifest = None
        out_handle = open(out_file, "wb")
        try:
            writer = SffWriter(out_handle, xml=manifest)
            in_handle.seek(0) #start again after getting manifest
            count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
        finally:
            out_handle.close()
    finally:
        in_handle.close()
else:
    #Use Galaxy for FASTA, QUAL or FASTQ
    if seq_format.lower() in ["fasta", "csfasta"] \
    or seq_format.lower().startswith("qual"):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter
        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
        from galaxy_utils.sequence.fastq import fastqReader, fastqWriter
        reader = fastqReader(open(in_file, "rU"))
Exemplo n.º 18
0
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    in_handle = open(in_file, "rb") #must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    #This makes two passes though the SFF file with isn't so efficient,
    #but this makes the code simple.
    pos_count = neg_count = 0
    if out_positive_file != "-":
        out_handle = open(out_positive_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0) #start again after getting manifest
        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id in ids)
        out_handle.close()
    if out_negative_file != "-":
        out_handle = open(out_negative_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0) #start again
        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if rec.id not in ids)
        out_handle.close()
    #And we're done
    in_handle.close()
    #At the time of writing, Galaxy doesn't show SFF file read counts,
    #so it is useful to put them in stdout and thus shown in job info.
    print "%i with and %i without specified IDs" % (pos_count, neg_count)
elif seq_format.lower()=="fasta":
Exemplo n.º 19
0
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        #Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
    in_handle = open(in_file, "rb") #must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    #This makes two passes though the SFF file with isn't so efficient,
    #but this makes the code simple.
    pos_count = neg_count = 0
    if out_positive_file is not None:
        out_handle = open(out_positive_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0) #start again after getting manifest
        pos_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) in ids)
        out_handle.close()
    if out_negative_file is not None:
        out_handle = open(out_negative_file, "wb")
        writer = SffWriter(out_handle, xml=manifest)
        in_handle.seek(0) #start again
        neg_count = writer.write_file(rec for rec in SffIterator(in_handle) if clean_name(rec.id) not in ids)
        out_handle.close()
    #And we're done
    in_handle.close()
    #At the time of writing, Galaxy doesn't show SFF file read counts,
    #so it is useful to put them in stdout and thus shown in job info.
    print "%i with and %i without specified IDs" % (pos_count, neg_count)
elif seq_format.lower()=="fasta":
Exemplo n.º 20
0
class PGMBarcode(object):
    """
    A single IonTorrent barcode together with the state needed to pull
    the reads carrying that barcode out of an SFF file.
    """

    def __init__(self, *args, **kwargs):
        """
        Build the barcode from keyword arguments (positional arguments are
        accepted but not used).

        Required keywords: id_str, type, sequence, floworder, index,
        annotation, adapter, score_mode, score_cutoff, sfffilepath and
        max_num.  A missing keyword raises KeyError.

        sfffilepath is stored as ``reads_sff`` (the SFF file that run()
        reads from); max_num is either the string 'All' or the number of
        reads to examine before stopping.
        """
        # Descriptive barcode fields are stored under their keyword names.
        for field in ('id_str', 'type', 'sequence', 'floworder', 'index',
                      'annotation', 'adapter', 'score_mode', 'score_cutoff'):
            setattr(self, field, kwargs[field])

        # Filled in by run().
        self.sff_file = None
        self.proc_name = None

        self.reads_sff = kwargs['sfffilepath']
        self.max_num = kwargs['max_num']

        # Running totals maintained by reads_for_barcode().
        self._processed = 0
        self._matched_reads = 0

    def _readMatches(self, read):
        """
        Return True when the barcode extracted from ``read`` equals this
        barcode's sequence, compared case-insensitively.

        read - record parsed from an sff file
        """
        wanted = self.sequence.lower()
        found = self._getReadBarcode(read)
        return wanted == found

    def _getReadBarcode(self, read):
        """
        Return, in lower case, the slice of the read's sequence that starts
        right after the flow key and stops len(adapter) bases before the
        read's 'clip_adapter_left' annotation.
        """
        annotations = read.annotations
        barcode_start = len(annotations['flow_key'])
        barcode_end = annotations['clip_adapter_left'] - len(self.adapter)
        bases = str(read.seq)
        return bases[barcode_start:barcode_end].lower()

    def reads_for_barcode(self, reads_file):
        """
        Generator yielding only those reads of ``reads_file`` (parsed as
        'sff') whose barcode matches this instance.

        Iteration stops once ``max_num`` reads have been examined, unless
        ``max_num`` is 'All'.  The instance counters ``_processed`` and
        ``_matched_reads`` are updated along the way.
        """
        for read in SeqIO.parse(reads_file, 'sff'):
            # Stop as soon as the requested number of reads was examined
            if self.max_num != 'All' and self._processed == self.max_num:
                return
            matched = self._readMatches(read)
            if matched:
                message = "%s: %s Matched Read %s" % (self.proc_name, self.id_str, read.id)
                logger.debug(message)
                self._matched_reads += 1
                yield read
            # Counted only after a matching read has been consumed
            self._processed += 1

    def run(self, proc_name=None):
        """
        Write every read matching this barcode to the file named
        id_str + '.sff'.

        proc_name - label stored on the instance and used as a prefix in
                    the debug log messages

        If writing raises ValueError the output file is deleted again.
        """
        sffpath = self.id_str + '.sff'
        try:
            with open(sffpath, 'wb') as out_handle:
                self.proc_name = proc_name
                writer = SffWriter(out_handle)
                self.sff_file = writer
                matching_reads = self.reads_for_barcode(self.reads_sff)
                writer.write_file(matching_reads)
                summary = "%s reads of %s matched %s" % (self._matched_reads, self._processed, self.id_str)
                logger.info(summary)
        except ValueError:
            # No reads for barcode so remove the temporary file
            os.unlink(sffpath)
Exemplo n.º 21
0
    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    in_handle = open(in_file, "rb")  # must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.close()

    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    count = 0
    # This does have the overhead of parsing into SeqRecord objects,
    # but doing the header and index at the low level is too fiddly.
    name = None  # We want the variable to leak from the iterator's scope...
    iterator = (records[name] for name in parse_ids(tabular_file, column))
    try:
        count = writer.write_file(iterator)
    except KeyError:
        out_handle.close()
        if name not in records:
            sys.exit("Identifier %r not found in sequence file" % name)
        else:
            raise
    out_handle.close()
else:
Exemplo n.º 22
0
    padding = len(index) % 8
    if padding:
        padding = 8 - padding
    index += chr(0) * padding
    assert len(index) % 8 == 0

    # Ugly bit of code to make a fake index at start
    index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
    padding = len(index) % 8
    if padding:
        padding = 8 - padding
    index += chr(0) * padding
    with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
        records = list(SffIterator(handle))
    with open("Roche/E3MFGYR02_alt_index_at_start.sff", "w") as out_handle:
        w = SffWriter(out_handle, index=False, xml=None)
        # Fake the header...
        w._number_of_reads = len(records)
        w._index_start = 0
        w._index_length = 0
        w._key_sequence = records[0].annotations["flow_key"]
        w._flow_chars = records[0].annotations["flow_chars"]
        w._number_of_flows_per_read = len(w._flow_chars)
        w.write_header()
        w._index_start = out_handle.tell()
        w._index_length = len(index)
        out_handle.seek(0)
        w.write_header()  # this time with index info
        w.handle.write(index)
        for record in records:
            w.write_record(record)
Exemplo n.º 23
0
    except ImportError:
        sys.exit("Requires Biopython 1.54 or later")

    try:
        from Bio.SeqIO.SffIO import ReadRocheXmlManifest
    except ImportError:
        # Prior to Biopython 1.56 this was a private function
        from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest

    in_handle = open(in_file, "rb")  # must be binary mode!
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    in_handle.seek(0)  # start again after getting manifest
    count = writer.write_file(rename_seqrecords(SffIterator(in_handle), rename))
    out_handle.close()
    in_handle.close()
else:
    # Use Galaxy for FASTA, QUAL or FASTQ
    if seq_format.lower() in ["fasta", "csfasta"] or seq_format.lower().startswith(
        "qual"
    ):
        from galaxy_utils.sequence.fasta import fastaReader, fastaWriter

        reader = fastaReader(open(in_file, "rU"))
        writer = fastaWriter(open(out_file, "w"))
        marker = ">"
    elif seq_format.lower().startswith("fastq"):
Exemplo n.º 24
0
                        short_clipped += 1
                elif keep_negatives:
                    if len(seq) >= min_len:
                        negs += 1
                        yield record
                    else:
                        short_neg += 1
    
    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
Exemplo n.º 25
0
                        short_clipped += 1
                elif keep_negatives:
                    if len(seq) >= min_len:
                        negs += 1
                        yield record
                    else:
                        short_neg += 1

    in_handle = open(in_file, "rb")
    try:
        manifest = ReadRocheXmlManifest(in_handle)
    except ValueError:
        manifest = None
    in_handle.seek(0)
    out_handle = open(out_file, "wb")
    writer = SffWriter(out_handle, xml=manifest)
    writer.write_file(process(SffIterator(in_handle)))
    #End of SFF code
elif seq_format.lower().startswith("fastq"):
    in_handle = open(in_file, "rU")
    out_handle = open(out_file, "w")
    reader = fastqReader(in_handle)
    writer = fastqWriter(out_handle)
    if forward:
        for record in reader:
            seq = record.sequence.upper()
            result = primer.search(seq)
            if result:
                #Forward primer, take everything after it
                cut = result.end()
                record.sequence = seq[cut:]
Exemplo n.º 26
0
 try:
     from Bio.SeqIO.SffIO import ReadRocheXmlManifest
 except ImportError:
     #Prior to Biopython 1.56 this was a private function
     from Bio.SeqIO.SffIO import _sff_read_roche_index_xml as ReadRocheXmlManifest
 in_handle = open(in_file, "rb")  #must be binary mode!
 try:
     manifest = ReadRocheXmlManifest(in_handle)
 except ValueError:
     manifest = None
 #This makes two passes through the SFF file which isn't so efficient,
 #but this makes the code simple.
 pos_count = neg_count = 0
 if out_positive_file is not None:
     out_handle = open(out_positive_file, "wb")
     writer = SffWriter(out_handle, xml=manifest)
     in_handle.seek(0)  #start again after getting manifest
     pos_count = writer.write_file(rec for rec in SffIterator(in_handle)
                                   if clean_name(rec.id) in ids)
     out_handle.close()
 if out_negative_file is not None:
     out_handle = open(out_negative_file, "wb")
     writer = SffWriter(out_handle, xml=manifest)
     in_handle.seek(0)  #start again
     neg_count = writer.write_file(rec for rec in SffIterator(in_handle)
                                   if clean_name(rec.id) not in ids)
     out_handle.close()
 #And we're done
 in_handle.close()
 #At the time of writing, Galaxy doesn't show SFF file read counts,
 #so it is useful to put them in stdout and thus shown in job info.