Exemplo n.º 1
0
 def _get_tile_info(self):
     """Collect tile coordinates from the first ``self.max_sample`` reads.

     Each record name is parsed with ``Identifier`` and the resulting
     ``info`` mapping supplies the ``x_coordinate``, ``y_coordinate`` and
     ``tile_number`` fields.

     :return: dict with keys ``'x'`` and ``'y'`` (lists of floats) and
         ``'tiles'`` (list of tile numbers as found in ``info``), one entry
         per sampled read.
     """
     identifiers = []
     with FastqReader(self.filename) as f:
         for i, record in enumerate(f):
             # Stop as soon as the sample is complete instead of parsing
             # the remainder of the file for nothing.
             if i >= self.max_sample:
                 break
             identifiers.append(Identifier(record.name).info)
     tiles = {
         'x': [float(this['x_coordinate']) for this in identifiers],
         'y': [float(this['y_coordinate']) for this in identifiers],
         'tiles': [this['tile_number'] for this in identifiers],
     }
     return tiles
Exemplo n.º 2
0
 def _get_qualities(self):
     """Return per-base quality scores for the first ``self.max_sample`` reads.

     Every quality character is converted to an integer by subtracting 33
     from its code point.

     :return: list of lists of ints, one inner list per sampled read.
     """
     from sequana import logger
     logger.info("Extracting qualities")
     collected = []
     with FastqReader(self.filename) as reader:
         for index, read in enumerate(reader):
             # Guard clause: nothing more to sample past max_sample.
             if not index < self.max_sample:
                 break
             scores = [ord(symbol) -33 for symbol in read.qualities]
             collected.append(scores)
     return collected
Exemplo n.º 3
0
 def test_context_manager(self):
     """Check file-handle lifetime for ``openseq`` and ``FastqReader``.

     ``openseq`` on an already-open handle must leave it open; the
     ``FastqReader`` context manager must drop its ``_file`` on exit.
     """
     path = "tests/data/simple.fastq"

     # A caller-owned handle stays open while and after openseq consumes it.
     with open(path) as handle:
         assert not handle.closed
         list(openseq(handle))
         assert not handle.closed
     assert handle.closed

     # A reader opened from a path owns its file and releases it on exit.
     with FastqReader(path) as reader:
         kept_reader = reader
         assert not reader._file.closed
         list(reader)
         assert not reader._file.closed
     assert kept_reader._file is None
Exemplo n.º 4
0
    def _get_info(self):
        """Populates the data structures for plotting.

        Will be called on request. Reads every record of ``self.filename``
        once and sets ``self.lengths``, ``self.gc_list``, ``self.qualities``,
        ``self.mean_qualities``, ``self.sequences``, ``self.minimum``,
        ``self.maximum``, ``self.gc_content`` and ``self.stats``.

        Qualities, mean qualities and sequences are only kept for the first
        ``self.max_sample`` reads; base counts, GC content and lengths are
        computed over all reads.
        """

        stats = {"A":0, "C":0, "G":0, "T":0, "N":0}
        stats["qualities"] = []
        stats["mean_qualities"] = []
        stats["mean_length"] = 0
        stats["sequences"] = []

        # FIXME this self.N takes time in the constructor
        # do we need it ?
        # NOTE(review): np.empty leaves entries uninitialized; this assumes
        # the file holds exactly self.N records -- confirm.
        self.lengths = np.empty(self.N)
        self.gc_list = []
        total_length = 0
        # Occurrences of each quality character over the whole file.
        C = defaultdict(int)
        if self.verbose:
            pb = Progress(self.N)

        sequences = []
        mean_qualities = []
        qualities = []
        # could use multiprocessing
        # FastxFile has shown some errors while handling gzip files
        # created with zlib (e.g. from atropos). This is now replaced
        # by the Atropos FastqReader for now.
        #fastq = pysam.FastxFile(self.filename)

        with FastqReader(self.filename) as f:
            for i, record in enumerate(f):
                # NOTE(review): a zero-length read makes the divisions by N
                # below raise ZeroDivisionError -- confirm inputs exclude it.
                N = len(record.sequence)
                self.lengths[i] = N

                # we cannot store all qualities and sequences reads, so
                # just max_sample are stored:
                if i < self.max_sample:
                    quality = [ord(x) -33 for x in record.qualities]
                    mean_qualities.append(sum(quality) / N)
                    qualities.append(quality)
                    sequences.append(record.sequence)

                # store count of all qualities
                for k in record.qualities:
                    C[k] += 1

                GG = record.sequence.count('G')
                CC = record.sequence.count('C')
                self.gc_list.append((GG+CC)/float(N)*100)

                # not using a counter, or loop speed up the code
                stats["A"] += record.sequence.count("A")
                stats["C"] += CC
                stats["G"] += GG
                stats["T"] += record.sequence.count("T")
                stats["N"] += record.sequence.count("N")

                # N already holds len(record.sequence)
                total_length += N

                if self.verbose:
                    pb.animate(i+1)

        # other data
        self.qualities = qualities
        self.mean_qualities = mean_qualities
        self.minimum = int(self.lengths.min())
        self.maximum = int(self.lengths.max())
        self.sequences = sequences
        self.gc_content = np.mean(self.gc_list)
        stats['mean_length'] = total_length / float(self.N)
        stats['total_bp'] = stats['A'] + stats['C'] + stats['G'] + stats["T"] + stats['N']
        # Weighted sum of decoded qualities (character code minus 33),
        # divided by the number of A/C/G/T/N bases counted above.
        stats['mean_quality'] = sum([(ord(k) -33)*v for k,v in C.items()]) / stats['total_bp']

        self.stats = stats
Exemplo n.º 5
0
 def test_alphabet(self):
     """Read ``bad_bases.fq`` with the DNA alphabet and check two sequences."""
     path = "tests/data/bad_bases.fq"
     dna = ALPHABETS['dna']
     with FastqReader(path, alphabet=dna) as reader:
         records = list(reader)
         first, second = records[0], records[1]
         assert first.sequence == 'ACGNGGACT'
         assert second.sequence == 'CGGACNNNC'
Exemplo n.º 6
0
 def test_fastq_incomplete(self):
     """A truncated FASTQ record must raise ``FormatError``."""
     truncated = StringIO("@name\nACGT+\n")
     # The reader is built inside the raises() scope, so an error raised
     # during construction or during iteration is accepted.
     with raises(FormatError):
         with FastqReader(truncated) as reader:
             list(reader)
Exemplo n.º 7
0
 def test_fastq_wrongformat(self):
     """Reading ``withplus.fastq`` must raise ``FormatError``."""
     path = "tests/data/withplus.fastq"
     with raises(FormatError):
         with FastqReader(path) as reader:
             list(reader)
Exemplo n.º 8
0
 def test_fastqreader_dos(self):
     """``dos.fastq`` and ``small.fastq`` must yield equal reads."""
     with FastqReader("tests/data/dos.fastq") as dos_reader:
         from_dos = list(dos_reader)
     with FastqReader("tests/data/small.fastq") as unix_reader:
         from_unix = list(unix_reader)
     assert from_dos == from_unix
Exemplo n.º 9
0
 def test_fastqreader(self):
     """Reads from ``simple.fastq`` must equal the ``simple_fastq`` fixture."""
     path = "tests/data/simple.fastq"
     with FastqReader(path) as reader:
         parsed = list(reader)
     assert parsed == simple_fastq