def compare_record(self, old, new, fmt=None, msg=None):
        """Quality aware SeqRecord comparison.

        This will check the mapping between Solexa and PHRED scores.
        It knows to ignore UnknownSeq objects for string matching (i.e. QUAL files).

        Arguments:
         - old - the original record.
         - new - the record derived from it (e.g. after a format conversion).
         - fmt - optional format name; FASTQ variants select the cap applied
           to scores before comparing (62 for "fastq-solexa"/"fastq-illumina",
           93 for "fastq"/"fastq-sanger"). Other known formats use no cap.
         - msg - optional prefix added to every assertion failure message.
        """

        def describe(text):
            """Prefix text with the caller's message, when one was given."""
            if msg is None:
                return text
            return "%s: %s" % (msg, text)

        # Forward msg (rather than None) so failures raised by the parent
        # comparison carry the same context as the checks below.
        super().compare_record(old, new, msg=msg)
        if fmt in ["fastq-solexa", "fastq-illumina"]:
            truncate = 62
        elif fmt in ["fastq", "fastq-sanger"]:
            truncate = 93
        else:
            assert fmt in ["fasta", "qual", "phd", "sff", "tab", None]
            truncate = None

        # Same scale on both sides: compare directly, capping only on mismatch.
        for keyword in ("phred_quality", "solexa_quality"):
            q_old = old.letter_annotations.get(keyword)
            q_new = new.letter_annotations.get(keyword)
            if q_old is None or q_new is None:
                continue
            if truncate is not None and q_old != q_new:
                q_old = [min(q, truncate) for q in q_old]
                q_new = [min(q, truncate) for q in q_new]
            self.assertEqual(
                q_old, q_new, msg=describe("mismatch in %s" % keyword)
            )

        q_old = old.letter_annotations.get("phred_quality")
        q_new = new.letter_annotations.get("solexa_quality")
        if q_old is not None and q_new is not None:
            # Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
            # Assume "old" is the original, and "new" has been converted.
            converted = [
                round(QualityIO.solexa_quality_from_phred(q)) for q in q_old
            ]
            if truncate is not None:
                converted = [min(q, truncate) for q in converted]
            err_msg = describe(
                "mismatch converting phred_quality %s to solexa_quality" % q_old
            )
            self.assertEqual(converted, q_new, msg=err_msg)

        q_old = old.letter_annotations.get("solexa_quality")
        q_new = new.letter_annotations.get("phred_quality")
        if q_old is not None and q_new is not None:
            # Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
            # Assume "old" is the original, and "new" has been converted.
            converted = [
                round(QualityIO.phred_quality_from_solexa(q)) for q in q_old
            ]
            if truncate is not None:
                converted = [min(q, truncate) for q in converted]
            err_msg = describe(
                "mismatch converting solexa_quality %s to phred_quality" % q_old
            )
            self.assertEqual(converted, q_new, msg=err_msg)
 def test_paired(self):
     """Check FASTQ parsing matches FASTA+QUAL parsing."""
     # Context managers make sure every handle is closed, even when
     # parsing raises part way through.
     with open("Quality/example.fasta") as fasta_handle:
         with open("Quality/example.qual") as qual_handle:
             records1 = list(
                 QualityIO.PairedFastaQualIterator(fasta_handle, qual_handle))
     with open("Quality/example.fastq") as fastq_handle:
         records2 = list(SeqIO.parse(fastq_handle, "fastq"))
     # assertTrue replaces the deprecated assert_ alias (removed in 3.12).
     self.assertTrue(compare_records(records1, records2))
示例#3
0
文件: fastq.py 项目: cez026/BarcSeek
def read_fastq(fastq: str, pair: Optional[str] = None) -> Tuple[Read]:
    """Load reads from a FASTQ file, optionally attaching their mates.

    'fastq' is the filename of the forward (or only) FASTQ file.
    'pair' is an optional filename of the reverse FASTQ file; each of its
    records is added to the forward read that has the same read ID.

    Returns a tuple holding the Read objects built from the forward file.
    """
    by_id = {}  # type: Dict[str, Read]
    with open(fastq, 'r') as forward_handle:
        for name, bases, scores in QualityIO.FastqGeneralIterator(
                forward_handle):
            by_id[name] = Read(read_id=name, seq=bases, qual=scores)
    if not pair:
        return tuple(by_id.values())
    with open(pair, 'r') as reverse_handle:
        for name, bases, scores in QualityIO.FastqGeneralIterator(
                reverse_handle):
            # Looked up by ID: a mate without a forward read is a KeyError
            by_id[name].add_reverse(seq=bases, qual=scores)
    return tuple(by_id.values())
示例#4
0
 def check_general_fails(self, filename, good_count):
     """Check FastqGeneralIterator fails right after good_count records.

     The first good_count records of filename must parse without error,
     and fetching the next one must raise ValueError.
     """
     # Default text mode: the "U" flag was removed in Python 3.11 and
     # universal newlines are already the default for text files there.
     # The with-block closes the handle even if an assertion fails.
     with open(filename) as handle:
         tuples = QualityIO.FastqGeneralIterator(handle)
         for i in range(good_count):
             title, seq, qual = next(tuples)  # Make sure no errors!
         self.assertRaises(ValueError, next, tuples)
示例#5
0
def mean_emoji(filename):
    """Print the mean PHRED quality per read position of a FASTQ file as emoji.

    The scores at each position are summed over all reads, divided by the
    number of reads, rounded, and rendered through fastq_emoji_map.
    """
    # Starts at 500 positions and grows on demand for longer reads.
    totals = np.zeros(500)
    seq_count = 0
    longest = 0

    for r in SeqIO.parse(filename, "fastq"):
        scores = r.letter_annotations["phred_quality"]
        read_len = len(scores)
        if read_len > len(totals):
            grown = np.zeros(read_len)
            grown[:len(totals)] = totals
            totals = grown
        totals[:read_len] += scores
        longest = max(longest, read_len)
        seq_count += 1

    # Slice by the longest read seen instead of trimming zeros: a position
    # whose scores are all zero is real data and must keep its place.
    means_fp = totals[:longest] / seq_count
    rounded = means_fp.round()

    fake_seq = "a" * len(rounded)

    record = SeqRecord(Seq(fake_seq),
                       id="test",
                       name="mean scores",
                       description="example with mean fastq socres",
                       letter_annotations={
                           'phred_quality': list(rounded.astype(int))
                       })

    print("".join([
        emojify(fastq_emoji_map[s])
        for s in QualityIO._get_sanger_quality_str(record)
    ]))
示例#6
0
 def test_paired(self):
     """Check FASTQ parsing matches FASTA+QUAL parsing"""
     # Read the paired FASTA/QUAL files while both handles are open
     with open("Quality/example.fasta") as fasta_handle, \
             open("Quality/example.qual") as qual_handle:
         paired = QualityIO.PairedFastaQualIterator(fasta_handle, qual_handle)
         from_fasta_qual = list(paired)
     from_fastq = list(SeqIO.parse("Quality/example.fastq", "fastq"))
     self.assertTrue(compare_records(from_fasta_qual, from_fastq))
 def check_general_fails(self, filename, good_count):
     """Check FastqGeneralIterator raises ValueError after good_count records."""
     records = QualityIO.FastqGeneralIterator(filename)
     failure = "FastqGeneralIterator failed to detect error in %s" % filename
     # The leading records must parse and unpack into three fields cleanly
     for _ in range(good_count):
         title, seq, qual = next(records)
     # The record after those is the broken one
     with self.assertRaises(ValueError, msg=failure):
         title, seq, qual = next(records)
 def check_general_passes(self, filename, record_count):
     """Check FastqGeneralIterator yields record_count well-formed records."""
     records = QualityIO.FastqGeneralIterator(filename)
     # This "raw" parser doesn't check the ASCII characters which means
     # certain invalid FASTQ files will get parsed without errors.
     failure = "FastqGeneralIterator failed to parse %s" % filename
     seen = 0
     for record in records:
         title, seq, qual = record
         self.assertEqual(len(seq), len(qual), msg=failure)
         seen += 1
     self.assertEqual(seen, record_count, msg=failure)
示例#9
0
 def check_general_passes(self, filename, record_count):
     """Check FastqGeneralIterator yields record_count well-formed records.

     Every record must have a sequence and quality string of equal length.
     """
     # Default text mode: the "U" flag was removed in Python 3.11 and
     # universal newlines are already the default for text files there.
     # The with-block closes the handle even if an assertion fails.
     with open(filename) as handle:
         tuples = QualityIO.FastqGeneralIterator(handle)
         # This "raw" parser doesn't check the ASCII characters which means
         # certain invalid FASTQ files will get parsed without errors.
         count = 0
         for title, seq, qual in tuples:
             self.assertEqual(len(seq), len(qual))
             count += 1
         self.assertEqual(count, record_count)
示例#10
0
    def compare_record(self, old, new, truncate, msg):
        """Compare two SeqRecord objects, taking quality scores into account.

        After the parent class comparison, quality scores stored under the
        same key in both records are compared (capped at truncate, if set,
        when they differ). Where one record holds PHRED scores and the other
        Solexa scores, the old scores are converted and compared to the new.
        Failure messages are prefixed with msg.
        """
        super().compare_record(old, new, msg=msg)

        old_scores = old.letter_annotations
        new_scores = new.letter_annotations

        # Same scale on both sides
        for keyword in ("phred_quality", "solexa_quality"):
            before = old_scores.get(keyword)
            after = new_scores.get(keyword)
            if before is None or after is None:
                continue
            if truncate and before != after:
                before = [min(q, truncate) for q in before]
                after = [min(q, truncate) for q in after]
            self.assertEqual(
                before, after, msg="%s: mismatch in %s" % (msg, keyword)
            )

        # Different scales: both mappings are lossy, so treat "old" as the
        # original and "new" as the converted record.
        conversions = (
            (
                "phred_quality",
                "solexa_quality",
                QualityIO.solexa_quality_from_phred,
                "%s: mismatch in phred_quality vs solexa_quality",
            ),
            (
                "solexa_quality",
                "phred_quality",
                QualityIO.phred_quality_from_solexa,
                "%s: mismatch in solexa_quality vs phred_quality",
            ),
        )
        for source_key, target_key, convert, template in conversions:
            before = old_scores.get(source_key)
            after = new_scores.get(target_key)
            if before is None or after is None:
                continue
            expected = [round(convert(q)) for q in before]
            if truncate:
                expected = [min(q, truncate) for q in expected]
            self.assertEqual(expected, after, msg=template % msg)
示例#11
0
    def write_record(self, record):
        """Write one SeqRecord to the handle in Phd format.

        Raises ValueError if the number of quality scores or peak locations
        differs from the sequence length, or if any quality score is None.
        """
        assert record.seq, "No sequence present in SeqRecord"
        # Gives the 'phred_quality' scores, or converted 'solexa_quality'
        # scores if present, else raises a value error
        qualities = QualityIO._get_phred_quality(record)
        peaks = record.letter_annotations.get("peak_location")
        seq_length = len(record.seq)
        if seq_length != len(qualities):
            raise ValueError("Number of phd quality scores does not match "
                             "length of sequence")
        if peaks and seq_length != len(peaks):
            raise ValueError("Number of peak location scores does not "
                             "match length of sequence")
        if None in qualities:
            raise ValueError("A quality value of None was found")

        # Avoid repeating the ID when the description already starts with it
        title = record.description
        if not title.startswith("%s " % record.id):
            title = "%s %s" % (record.id, record.description)
        self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n"
                          % self.clean(title))

        annotations = record.annotations
        for keyword in Phd.CKEYWORDS:
            annot = keyword.lower()
            if annot == "trim":
                trim = annotations.get("trim")
                value = "%s %s %.4f" % trim if trim else None
            elif annot == "trace_peak_area_ratio":
                ratio = annotations.get("trace_peak_area_ratio")
                value = "%.4f" % ratio if ratio else None
            else:
                value = annotations.get(annot)
            # Zero is a value worth writing; other falsy values are skipped
            if value or value == 0:
                self.handle.write("%s: %s\n" % (annot.upper(), value))

        self.handle.write("END_COMMENT\nBEGIN_DNA\n")
        if peaks:
            for site, quality, peak in zip(record.seq, qualities, peaks):
                self.handle.write(
                    "%s %i %i\n" % (site, round(quality), peak))
        else:
            for site, quality in zip(record.seq, qualities):
                self.handle.write("%s %i\n" % (site, round(quality)))

        self.handle.write("END_DNA\nEND_SEQUENCE\n")
示例#12
0
    def write_record(self, record):
        """Write a single Phd record to the file.

        Raises ValueError if the number of quality scores or peak locations
        differs from the sequence length, or if any quality score is None.
        """
        assert record.seq, "No sequence present in SeqRecord"
        # This method returns the 'phred_quality' scores or converted
        # 'solexa_quality' scores if present, else raises a value error
        phred_qualities = QualityIO._get_phred_quality(record)
        peak_locations = record.letter_annotations.get("peak_location")
        # Raise rather than assert: assertions are stripped under "python -O",
        # which would let mismatched records through to the writing loop.
        if len(record.seq) != len(phred_qualities):
            raise ValueError("Number of phd quality scores does not match "
                             "length of sequence")
        if peak_locations:
            if len(record.seq) != len(peak_locations):
                raise ValueError("Number of peak location scores does not "
                                 "match length of sequence")
        if None in phred_qualities:
            raise ValueError("A quality value of None was found")
        # Avoid repeating the ID when the description already starts with it
        if record.description.startswith("%s " % record.id):
            title = record.description
        else:
            title = "%s %s" % (record.id, record.description)
        self.handle.write("BEGIN_SEQUENCE %s\nBEGIN_COMMENT\n"
                          % self.clean(title))
        for annot in [k.lower() for k in Phd.CKEYWORDS]:
            value = None
            if annot == "trim":
                if record.annotations.get("trim"):
                    value = "%s %s %.4f" % record.annotations["trim"]
            elif annot == "trace_peak_area_ratio":
                if record.annotations.get("trace_peak_area_ratio"):
                    value = "%.4f" % record.annotations[
                        "trace_peak_area_ratio"]
            else:
                value = record.annotations.get(annot)
            # Zero is a value worth writing; other falsy values are skipped
            if value or value == 0:
                self.handle.write("%s: %s\n" % (annot.upper(), value))

        self.handle.write("END_COMMENT\nBEGIN_DNA\n")
        for i, site in enumerate(record.seq):
            if peak_locations:
                self.handle.write("%s %i %i\n" % (
                    site,
                    round(phred_qualities[i]),
                    peak_locations[i])
                )
            else:
                self.handle.write("%s %i\n" % (
                    site,
                    round(phred_qualities[i]))
                )

        self.handle.write("END_DNA\nEND_SEQUENCE\n")
示例#13
0
 def test_solexa_to_sanger(self):
     """Mapping check for FASTQ Solexa (-5 to 62) to Sanger (0 to 62)."""
     # The writer does not call the conversion function directly; for speed
     # it uses a cached dictionary of the mappings, which this test covers
     # by round-tripping every Solexa score through SeqIO.
     solexa_scores = range(-5, 63)
     seq = "N" * 68
     qual = "".join(chr(64 + score) for score in solexa_scores)
     expected_phred = [
         round(QualityIO.phred_quality_from_solexa(score))
         for score in solexa_scores
     ]
     source = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     converted = StringIO()
     solexa_records = SeqIO.parse(source, "fastq-solexa")
     SeqIO.write(solexa_records, converted, "fastq-sanger")
     converted.seek(0)
     record = SeqIO.read(converted, "fastq-sanger")
     self.assertEqual(record.seq, seq)
     self.assertEqual(record.letter_annotations["phred_quality"], expected_phred)
示例#14
0
 def test_solexa_to_sanger(self):
     """Mapping check for FASTQ Solexa (-5 to 62) to Sanger (0 to 62)"""
     # The writer does not call the conversion function directly; for speed
     # it uses a cached dictionary of the mappings, which this test covers
     # by round-tripping every Solexa score through SeqIO.
     solexa_scores = list(range(-5, 63))
     seq = "N" * 68
     qual = "".join([chr(64 + score) for score in solexa_scores])
     expected_phred = []
     for score in solexa_scores:
         expected_phred.append(
             round(QualityIO.phred_quality_from_solexa(score)))
     source = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     converted = StringIO()
     solexa_records = SeqIO.parse(source, "fastq-solexa")
     SeqIO.write(solexa_records, converted, "fastq-sanger")
     converted.seek(0)
     record = SeqIO.read(converted, "fastq-sanger")
     self.assertEqual(str(record.seq), seq)
     phred = record.letter_annotations["phred_quality"]
     self.assertEqual(phred, expected_phred)
示例#15
0
def get_vcf_qual(quality):
    '''Map a quality value to an emoji.

    A quality of None maps to the ":question:" emoji. Any other value is
    converted with int(), encoded as a Sanger quality character, and looked
    up in emaps.fastq_emoji_map_binned (falling back to ":heart_eyes:").
    '''
    # Identity test: "== None" defers to the operand's __eq__, which may be
    # overridden (array-like values compare element-wise, for example).
    if quality is None:
        return emojify(":question:")

    # Same trick as FASTQE: wrap the value in a one-base record so the
    # PHRED-to-character encoding can be reused, then map the character.
    # TODO make this better
    record_qual = SeqRecord(Seq('N'), id="test", name="lookup",
                            description="example",
                            letter_annotations={'phred_quality': [int(quality)]})
    mapping_dict_qual_use = emaps.fastq_emoji_map_binned
    original_qual = QualityIO._get_sanger_quality_str(record_qual)
    return "".join([
        emojify(mapping_dict_qual_use.get(s, ":heart_eyes:"))
        for s in original_qual
    ])
示例#16
0
def map_scores(sequence,
               mapping_dict=emaps.fastq_emoji_map,
               default_value=":heart_eyes:",
               mapping_function=emojify,
               spacer=" "):
    '''
    Render the quality scores of a record as a string of mapped symbols.

    :param sequence: record passed to QualityIO._get_sanger_quality_str
    :param mapping_dict: lookup from quality character to symbol name
    :param default_value: symbol name used for characters not in mapping_dict
    :param mapping_function: callable applied to each looked-up symbol name
    :param spacer: string placed between the mapped symbols
    :return: the mapped symbols joined by spacer
    '''
    quality_chars = QualityIO._get_sanger_quality_str(sequence)
    symbols = []
    for char in quality_chars:
        symbol_name = mapping_dict.get(char, default_value)
        symbols.append(mapping_function(symbol_name))
    return spacer.join(symbols)
示例#17
0
def not_trimmed(cur, conf, options, sequence, qual):
    """Copy the records of one cluster to FASTA and QUAL output handles.

    Sequence names whose cluster equals options.species are fetched through
    the cursor cur. The paired FASTA/QUAL input files named in the 'Input'
    section of conf ('sequence' and 'qual' entries) are then scanned, and
    every record with a matching name is written to the sequence and qual
    handles. Both output handles are closed at the end.
    """
    # Query parameters must be a sequence: (x) is just x, while (x,) is a
    # one-element tuple.
    cur.execute('SELECT name FROM sequence WHERE cluster = %s',
                (options.species,))
    dataset = {row[0] for row in cur.fetchall()}
    # Default text mode: the "U" flag was removed in Python 3.11 and
    # universal newlines are already the default for text files there.
    # The with-blocks close the input files, which were previously leaked.
    with open(conf.get('Input', 'sequence')) as fasta_in:
        with open(conf.get('Input', 'qual')) as qual_in:
            # A for loop replaces the Python 2 only seqs.next() calls
            for record in QualityIO.PairedFastaQualIterator(fasta_in, qual_in):
                if record.name in dataset:
                    sequence.write(record.format('fasta'))
                    qual.write(record.format('qual'))
    qual.close()
    sequence.close()
示例#18
0
 def test_sanger_to_solexa(self):
     """Mapping check for FASTQ Sanger (0 to 93) to Solexa (-5 to 62)"""
     # The writer does not call the conversion function directly; for speed
     # it uses a cached dictionary of the mappings, which this test covers
     # by round-tripping every Sanger score through SeqIO.
     phred_scores = list(range(0, 94))
     seq = "N" * 94
     qual = "".join([chr(33 + score) for score in phred_scores])
     # Solexa output is capped at 62
     expected_sol = []
     for score in phred_scores:
         solexa = int(round(QualityIO.solexa_quality_from_phred(score)))
         expected_sol.append(min(62, solexa))
     source = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     converted = StringIO()
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always", BiopythonWarning)
         sanger_records = SeqIO.parse(source, "fastq-sanger")
         SeqIO.write(sanger_records, converted, "fastq-solexa")
         # At most one warning may be recorded for the conversion
         self.assertTrue(len(w) <= 1, w)
     converted.seek(0)
     record = SeqIO.read(converted, "fastq-solexa")
     self.assertEqual(str(record.seq), seq)
     solexa_scores = record.letter_annotations["solexa_quality"]
     self.assertEqual(solexa_scores, expected_sol)
示例#19
0
 def test_sanger_to_solexa(self):
     """Mapping check for FASTQ Sanger (0 to 93) to Solexa (-5 to 62)"""
     # The writer does not call the conversion function directly; for speed
     # it uses a cached dictionary of the mappings, which this test covers
     # by round-tripping every Sanger score through SeqIO.
     phred_scores = list(range(0, 94))
     seq = "N" * 94
     qual = "".join([chr(33 + score) for score in phred_scores])
     # Solexa output is capped at 62
     expected_sol = []
     for score in phred_scores:
         solexa = int(round(QualityIO.solexa_quality_from_phred(score)))
         expected_sol.append(min(62, solexa))
     source = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     converted = StringIO()
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always", BiopythonWarning)
         sanger_records = SeqIO.parse(source, "fastq-sanger")
         SeqIO.write(sanger_records, converted, "fastq-solexa")
         # At most one warning may be recorded for the conversion
         self.assertTrue(len(w) <= 1, w)
     converted.seek(0)
     record = SeqIO.read(converted, "fastq-solexa")
     self.assertEqual(str(record.seq), seq)
     solexa_scores = record.letter_annotations["solexa_quality"]
     self.assertEqual(solexa_scores, expected_sol)
 def test_sanger_to_solexa(self):
     """Mapping check for FASTQ Sanger (0 to 93) to Solexa (-5 to 62)"""
     # The point of this test is the writing code doesn't actually use the
     # solexa_quality_from_phred function directly. For speed it uses a
     # cached dictionary of the mappings.
     seq = "N"*94
     qual = "".join(chr(33+q) for q in range(0,94))
     expected_sol = [min(62,int(round(QualityIO.solexa_quality_from_phred(q))))
                     for q in range(0,94)]
     in_handle = StringIO("@Test\n%s\n+\n%s" % (seq,qual))
     out_handle = StringIO("")
     # Want to ignore the data loss warning. catch_warnings restores the
     # previous filter list on exit; simplefilter inserts at the front of
     # warnings.filters, so popping the last entry removed the wrong filter
     # and left this "ignore" rule active for the rest of the run.
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', BiopythonWarning)
         SeqIO.write(SeqIO.parse(in_handle, "fastq-sanger"),
                     out_handle, "fastq-solexa")
     out_handle.seek(0)
     record = SeqIO.read(out_handle, "fastq-solexa")
     self.assertEqual(str(record.seq), seq)
     self.assertEqual(record.letter_annotations["solexa_quality"],
                      expected_sol)
 def test_solexa_to_sanger(self):
     """Mapping check for FASTQ Solexa (-5 to 62) to Sanger (0 to 62)"""
     # The point of this test is the writing code doesn't actually use the
     # solexa_quality_from_phred function directly. For speed it uses a
     # cached dictionary of the mappings.
     seq = "N" * 68
     qual = "".join(chr(64 + q) for q in range(-5, 63))
     expected_phred = [round(QualityIO.phred_quality_from_solexa(q))
                       for q in range(-5,63)]
     in_handle = StringIO("@Test\n%s\n+\n%s" % (seq, qual))
     out_handle = StringIO("")
     # Want to ignore the data loss warning. catch_warnings restores the
     # previous filter list on exit, whereas resetwarnings() discarded every
     # filter in the process, including those set up by the test runner.
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', UserWarning)
         SeqIO.write(SeqIO.parse(in_handle, "fastq-solexa"), out_handle,
                     "fastq-sanger")
     out_handle.seek(0)
     record = SeqIO.read(out_handle, "fastq-sanger")
     self.assertEqual(str(record.seq), seq)
     self.assertEqual(record.letter_annotations["phred_quality"],
                      expected_phred)
示例#22
0
def compare_record(old, new, truncate=None):
    """Quality aware SeqRecord comparison.

    This will check the mapping between Solexa and PHRED scores.
    It knows to ignore UnknownSeq objects for string matching (i.e. QUAL files).

    Arguments:
     - old - the original record.
     - new - the record derived from it (assumed to be the converted one).
     - truncate - optional cap applied to quality scores before comparing.

    Returns True if the records match, and raises ValueError otherwise.
    """
    if old.id != new.id:
        raise ValueError("'%s' vs '%s' " % (old.id, new.id))
    if old.description != new.description \
            and (old.id + " " + old.description).strip() != new.description:
        raise ValueError("'%s' vs '%s' " % (old.description, new.description))
    if len(old.seq) != len(new.seq):
        raise ValueError("%i vs %i" % (len(old.seq), len(new.seq)))
    if isinstance(old.seq, UnknownSeq) or isinstance(new.seq, UnknownSeq):
        pass
    elif str(old.seq) != str(new.seq):
        if len(old.seq) < 200:
            raise ValueError("'%s' vs '%s'" % (old.seq, new.seq))
        else:
            raise ValueError("'%s...' vs '%s...'" % (old.seq[:100], new.seq[:100]))

    # Same scale on both sides. One loop replaces two copies of this check:
    # one raised the misspelt "ValuerError" (a NameError), the other
    # reported solexa_quality mismatches as "phred_quality".
    for keyword in ("phred_quality", "solexa_quality"):
        if keyword not in old.letter_annotations \
                or keyword not in new.letter_annotations:
            continue
        q_old = old.letter_annotations[keyword]
        q_new = new.letter_annotations[keyword]
        if q_old == q_new:
            continue
        if truncate and [min(q, truncate) for q in q_old] == \
                [min(q, truncate) for q in q_new]:
            continue
        raise ValueError("Mismatch in %s" % keyword)

    if "phred_quality" in old.letter_annotations \
            and "solexa_quality" in new.letter_annotations:
        # Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
        # Assume "old" is the original, and "new" has been converted.
        converted = [round(QualityIO.solexa_quality_from_phred(q))
                     for q in old.letter_annotations["phred_quality"]]
        if truncate:
            converted = [min(q, truncate) for q in converted]
        if converted != new.letter_annotations["solexa_quality"]:
            # Blank separator line (a bare "print" is a no-op on Python 3)
            print()
            print(old.letter_annotations["phred_quality"])
            print(converted)
            print(new.letter_annotations["solexa_quality"])
            raise ValueError("Mismatch in phred_quality vs solexa_quality")
    if "solexa_quality" in old.letter_annotations \
            and "phred_quality" in new.letter_annotations:
        # Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
        # Assume "old" is the original, and "new" has been converted.
        converted = [round(QualityIO.phred_quality_from_solexa(q))
                     for q in old.letter_annotations["solexa_quality"]]
        if truncate:
            converted = [min(q, truncate) for q in converted]
        if converted != new.letter_annotations["phred_quality"]:
            print(old.letter_annotations["solexa_quality"])
            print(converted)
            print(new.letter_annotations["phred_quality"])
            raise ValueError("Mismatch in solexa_quality vs phred_quality")
    return True
示例#23
0
 def test_phred_quality_from_solexa(self):
     """Mapping check for function phred_quality_from_solexa"""
     # (solexa score, expected rounded PHRED score) for the low range,
     # where the two scales differ
     low_range = (
         (-5, 1), (-4, 1), (-3, 2), (-2, 2), (-1, 3),
         (0, 3), (1, 4), (2, 4), (3, 5), (4, 5),
         (5, 6), (6, 7), (7, 8), (8, 9), (9, 10),
     )
     for solexa, phred in low_range:
         self.assertEqual(phred, round(QualityIO.phred_quality_from_solexa(solexa)))
     # From 10 upwards the rounded values are expected to be identical
     for score in range(10, 100):
         self.assertEqual(score, round(QualityIO.phred_quality_from_solexa(score)))
示例#24
0
 def test_solexa_quality_from_phred(self):
     """Mapping check for function solexa_quality_from_phred"""
     # (PHRED score, expected rounded Solexa score) for the low range,
     # where the two scales differ
     low_range = (
         (0, -5), (1, -5), (2, -2), (3, 0), (4, 2),
         (5, 3), (6, 5), (7, 6), (8, 7), (9, 8),
     )
     for phred, solexa in low_range:
         self.assertEqual(solexa, round(QualityIO.solexa_quality_from_phred(phred)))
     # From 10 upwards the rounded values are expected to be identical
     for score in range(10, 100):
         self.assertEqual(score, round(QualityIO.solexa_quality_from_phred(score)))
示例#25
0
def compare_record(old, new, truncate=None):
    """Quality aware SeqRecord comparison.

    This will check the mapping between Solexa and PHRED scores.
    It knows to ignore UnknownSeq objects for string matching (i.e. QUAL files).

    Arguments:
     - old - the original record.
     - new - the record derived from it (assumed to be the converted one).
     - truncate - optional cap applied to quality scores before comparing.

    Returns True if the records match, and raises ValueError otherwise.
    """
    if old.id != new.id:
        raise ValueError("'%s' vs '%s' " % (old.id, new.id))
    if old.description != new.description \
            and (old.id + " " + old.description).strip() != new.description:
        raise ValueError("'%s' vs '%s' " % (old.description, new.description))
    if len(old.seq) != len(new.seq):
        raise ValueError("%i vs %i" % (len(old.seq), len(new.seq)))
    if isinstance(old.seq, UnknownSeq) or isinstance(new.seq, UnknownSeq):
        pass
    elif str(old.seq) != str(new.seq):
        if len(old.seq) < 200:
            raise ValueError("'%s' vs '%s'" % (old.seq, new.seq))
        else:
            raise ValueError("'%s...' vs '%s...'" %
                             (old.seq[:100], new.seq[:100]))

    # Same scale on both sides. One loop replaces two copies of this check:
    # one raised the misspelt "ValuerError" (a NameError), the other
    # reported solexa_quality mismatches as "phred_quality".
    for keyword in ("phred_quality", "solexa_quality"):
        if keyword not in old.letter_annotations \
                or keyword not in new.letter_annotations:
            continue
        q_old = old.letter_annotations[keyword]
        q_new = new.letter_annotations[keyword]
        if q_old == q_new:
            continue
        if truncate and [min(q, truncate) for q in q_old] == \
                [min(q, truncate) for q in q_new]:
            continue
        raise ValueError("Mismatch in %s" % keyword)

    if "phred_quality" in old.letter_annotations \
            and "solexa_quality" in new.letter_annotations:
        # Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
        # Assume "old" is the original, and "new" has been converted.
        converted = [
            round(QualityIO.solexa_quality_from_phred(q))
            for q in old.letter_annotations["phred_quality"]
        ]
        if truncate:
            converted = [min(q, truncate) for q in converted]
        if converted != new.letter_annotations["solexa_quality"]:
            # Blank separator line (a bare "print" is a no-op on Python 3)
            print()
            print(old.letter_annotations["phred_quality"])
            print(converted)
            print(new.letter_annotations["solexa_quality"])
            raise ValueError("Mismatch in phred_quality vs solexa_quality")
    if "solexa_quality" in old.letter_annotations \
            and "phred_quality" in new.letter_annotations:
        # Mapping from Solexa to PHRED is lossy, but so is PHRED to Solexa.
        # Assume "old" is the original, and "new" has been converted.
        converted = [
            round(QualityIO.phred_quality_from_solexa(q))
            for q in old.letter_annotations["solexa_quality"]
        ]
        if truncate:
            converted = [min(q, truncate) for q in converted]
        if converted != new.letter_annotations["phred_quality"]:
            print(old.letter_annotations["solexa_quality"])
            print(converted)
            print(new.letter_annotations["phred_quality"])
            raise ValueError("Mismatch in solexa_quality vs phred_quality")
    return True
示例#26
0
def action(arguments):
    """
    Run the record-filtering pipeline described by the parsed arguments.

    Records are read from ``arguments.sequence_file`` (paired with
    ``arguments.input_qual`` when that is given), passed through every
    filter enabled on the command line, and written to
    ``arguments.output_file``.  A tab-separated per-filter report is then
    written to ``arguments.report_out``.

    Raises ValueError when ``--quality-window-mean-qual`` is given without
    ``--quality-window``, or when the ``trie`` / ``triefind`` modules are
    unavailable.
    """
    if arguments.quality_window_mean_qual and not arguments.quality_window:
        raise ValueError("--quality-window-mean-qual specified without "
                         "--quality-window")

    if trie is None or triefind is None:
        raise ValueError(
            'Missing Bio.trie and/or Bio.triefind modules. Cannot continue')

    pipeline = []
    input_type = fileformat.from_handle(arguments.sequence_file)
    output_type = fileformat.from_handle(arguments.output_file)
    with arguments.sequence_file as source:
        if arguments.input_qual:
            records = QualityIO.PairedFastaQualIterator(
                source, arguments.input_qual)
        else:
            records = SeqIO.parse(source, input_type)

        listener = RecordEventListener()
        if arguments.details_out:
            RecordReportHandler(
                arguments.details_out, arguments.argv,
                arguments.details_comment).register_with(listener)

        # Emit a 'read' event for every record pulled from the input.
        records = listener.iterable_hook('read', records)

        # Assemble the filter chain in command-line order.
        if arguments.min_mean_quality and input_type == 'fastq':
            pipeline.append(QualityScoreFilter(arguments.min_mean_quality))
        if arguments.max_length:
            pipeline.append(MaxLengthFilter(arguments.max_length))
        if arguments.min_length:
            pipeline.append(MinLengthFilter(arguments.min_length))
        if arguments.max_ambiguous is not None:
            pipeline.append(MaxAmbiguousFilter(arguments.max_ambiguous))
        if arguments.pct_ambiguous is not None:
            pipeline.append(PctAmbiguousFilter(arguments.pct_ambiguous))
        if arguments.ambiguous_action:
            pipeline.append(AmbiguousBaseFilter(arguments.ambiguous_action))
        if arguments.quality_window:
            # The window threshold falls back to the whole-read threshold.
            window_threshold = (arguments.quality_window_mean_qual or
                                arguments.min_mean_quality)
            # The windowed filter is placed ahead of every other filter.
            pipeline.insert(
                0,
                WindowQualityScoreFilter(arguments.quality_window,
                                         window_threshold))

        if arguments.barcode_file:
            with arguments.barcode_file:
                barcode_trie = parse_barcode_file(arguments.barcode_file,
                                                  arguments.primer,
                                                  arguments.barcode_header)
            pipeline.append(PrimerBarcodeFilter(barcode_trie))

            if arguments.map_out:
                map_writer = csv.writer(
                    arguments.map_out,
                    quoting=getattr(csv, arguments.quoting),
                    lineterminator='\n')

                def barcode_handler(record, sample, barcode=None):
                    # One (record id, sample) row per 'found_barcode' event.
                    map_writer.writerow((record.id, sample))

                listener.register_handler('found_barcode', barcode_handler)

        for stage in pipeline:
            stage.listener = listener
            records = stage.filter_records(records)

        # Emit a 'write' event for every record that survived all filters.
        records = listener.iterable_hook('write', records)

        with arguments.output_file:
            SeqIO.write(records, arguments.output_file, output_type)

    # Per-filter report; rows are produced lazily while being written.
    with arguments.report_out as report_handle:
        report_writer = csv.DictWriter(
            report_handle,
            BaseFilter.report_fields,
            lineterminator='\n',
            delimiter='\t')
        report_writer.writeheader()
        report_writer.writerows(stage.report_dict() for stage in pipeline)
示例#27
0
        return read


# Step 1: walk the two read iterators in lockstep, filter each mate, and
# route the result:
#   * both mates pass  -> one tab-separated line on stdout
#   * only one passes  -> that mate is written as FASTQ to ``out``
#   * neither passes   -> the fragment is dropped
totreads = 0
passing_reads = 0
while True:
    try:
        # Builtin next() works for both old-style (.next) and new-style
        # (__next__) iterators.
        read = next(R1)
        read2 = next(R2)
    except StopIteration:
        # Stop as soon as either input is exhausted.
        break
    totreads += 1
    fil1 = filterSeq(read, 0.1, 10, 100)
    fil2 = filterSeq(read2, 0.1, 10, 100)
    # Identity tests on purpose: ``!= None`` would dispatch to the record's
    # own comparison operators.
    # NOTE(review): Biopython's SeqRecord is understood to reject rich
    # comparisons with NotImplementedError -- confirm for the installed
    # version.
    if fil1 is not None and fil2 is not None:
        sys.stdout.write("\t".join((
            fil1.id,
            str(fil1.seq),
            str(fil2.seq),
            QualityIO._get_sanger_quality_str(fil1),
            QualityIO._get_sanger_quality_str(fil2),
        )) + "\n")
        passing_reads += 1
    elif fil1 is not None:
        SeqIO.write(fil1, out, "fastq")
    elif fil2 is not None:
        SeqIO.write(fil2, out, "fastq")

sys.stderr.write("\t" + str(passing_reads) + " out of " + str(totreads) +
                 " fragments passed the filtering" + "\n")
# Timestamp for the log comes from the system ``date`` command.
data = commands.getstatusoutput('date')
sys.stderr.write("2nd step: filtering out duplicated fragments at " + data[1] +
                 "\n")
示例#28
0
def _report_progress(pb, done, total):
    """Redraw progress bar *pb* on every 1000th record and on the last one."""
    if done % 1000 == 0 or done == total:
        pb(done)


def main():
    '''Main loop.

    Reads the configuration file named by the command-line options, creates
    the sequence table in the configured MySQL database, then hands every
    record of the paired FASTA/QUAL input to ``linkerWorker`` -- either in
    this process or in up to ``processors`` child processes, depending on
    the ``[Multiprocessing]`` section of the configuration.

    Start time, end time and elapsed minutes are printed to stdout.
    '''
    start_time = time.time()
    options, arg = interface()
    motd()
    # Single-argument print calls produce the same text under Python 2's
    # print statement and Python 3's print function.
    print('Started:  %s' % time.strftime("%a %b %d, %Y  %H:%M:%S",
                                         time.localtime(start_time)))
    conf = ConfigParser.ConfigParser()
    conf.read(options.conf)
    # build our configuration
    params = Parameters(conf)
    conn = MySQLdb.connect(user=params.user, passwd=params.pwd, db=params.db)
    cur = conn.cursor()
    try:
        # crank out a new table for the data
        createSeqTable(cur)
        conn.commit()
        seqcount = sequenceCount(conf.get('Input', 'sequence'))
        # Context managers guarantee both input files are closed again.
        with open(conf.get('Input', 'sequence'), "rU") as seq_handle, \
                open(conf.get('Input', 'qual'), "rU") as qual_handle:
            sequence = QualityIO.PairedFastaQualIterator(seq_handle,
                                                         qual_handle)
            if conf.getboolean('Multiprocessing', 'MULTIPROCESSING'):
                # get num processors
                n_procs = conf.get('Multiprocessing', 'processors')
                if n_procs == 'Auto':
                    # we'll use x-1 cores (where x = avail. cores)
                    n_procs = multiprocessing.cpu_count() - 1
                else:
                    n_procs = int(n_procs)
                # With zero slots no worker could ever be started and the
                # dispatch loop below would spin forever (single-core host).
                n_procs = max(1, n_procs)
                print('Multiprocessing.  Number of processors =  %s' %
                      n_procs)
                pb = progress.bar(0, seqcount, 60)
                workers = []
                for done, record in enumerate(sequence, 1):
                    # Wait for a free slot.  The list is rebuilt instead of
                    # calling remove() during iteration, which skips entries.
                    while len(workers) >= n_procs:
                        workers = [w for w in workers if w.is_alive()]
                    p = multiprocessing.Process(target=linkerWorker,
                                                args=(record, params))
                    p.start()
                    workers.append(p)
                    _report_progress(pb, done, seqcount)
                # Let the last workers finish so the reported run time
                # covers all of the work.
                for w in workers:
                    w.join()
            else:
                print('Not using multiprocessing')
                pb = progress.bar(0, seqcount, 60)
                for done, record in enumerate(sequence, 1):
                    linkerWorker(record, params)
                    _report_progress(pb, done, seqcount)
        print('\n')
    finally:
        # Release the database handles even when processing fails.
        cur.close()
        conn.close()
    end_time = time.time()
    print('Ended:  %s' % time.strftime("%a %b %d, %Y  %H:%M:%S",
                                       time.localtime(end_time)))
    print('\nTime for execution:  %s minutes' %
          ((end_time - start_time) / 60))
示例#29
0
def process_files(options):
    '''Compute FastaStats for each input file and print the per-position
    quality rows as emoji.

    For every file in options.fasta_files a "mean" row is always printed;
    "max" and "min" rows are printed when options.max / options.min are
    set.  With options.bin the binned emoji map is used and the row labels
    carry a "(binned)" suffix.  With options.scale the emoji scale is
    printed first.  When no files are given, statistics are computed from
    standard input (stdin) and only the plain summary is printed.

    Arguments:
       options: the command line options of the program
    Result:
       None
    '''

    def emit_row(summary, label, scores, emoji_map):
        # One tab-separated output row: summary, label, space-joined emoji.
        glyphs = " ".join([
            emojify(emoji_map.get(symbol, ':heart_eyes:'))
            for symbol in QualityIO._get_sanger_quality_str(scores)
        ])
        print(summary, label, glyphs, sep='\t')

    if not options.fasta_files:
        logging.info("Processing FASTA file from stdin")
        stats = FastaStats().from_file(sys.stdin, options.minlen)
        print(stats.pretty("stdin"))
        return

    for fasta_filename in options.fasta_files:
        logging.info("Processing FASTA file from {}".format(fasta_filename))
        try:
            fasta_file = open(fasta_filename)
        except IOError as exception:
            exit_with_error(str(exception), EXIT_FILE_IO_ERROR)
            continue

        with fasta_file:
            stats = FastaStats().from_file(fasta_file, options.minlen)

            if options.scale:
                print_scale(emaps.all_qualities, options.bin)

            # Pick the emoji map and the row labels once per file.
            if options.bin:
                logging.info("Binned calculations")
                emoji_map = emaps.fastq_emoji_map_binned
                max_label = "max (binned)"
                mean_label = "mean (binned)"
                min_label = "min (binned)"
            else:
                emoji_map = emaps.fastq_emoji_map
                max_label = "max"
                mean_label = "mean"
                min_label = "min"

            if options.max:
                logging.info("Calculate max quality per position")
                emit_row(stats.pretty(fasta_filename), max_label,
                         stats.quality_scores_maxs, emoji_map)

            # The mean row is unconditional.
            logging.info("Calculate mean quality per position")
            emit_row(stats.pretty(fasta_filename), mean_label,
                     stats.quality_scores_mean, emoji_map)

            if options.min:
                logging.info("Calculate min quality per position")
                emit_row(stats.pretty(fasta_filename), min_label,
                         stats.quality_scores_mins, emoji_map)
示例#30
0
def _print_emoji_records(handle, seq_map, qual_map):
    '''Print every FASTQ record read from handle as two emoji lines
    (bases, then qualities) preceded by a header line with the record id.

    Symbols missing from seq_map / qual_map fall back to ":heart_eyes:".
    '''
    for seq in SeqIO.parse(handle, "fastq"):
        print(emojify(":arrow_forward:") + "  " + seq.id)
        bioemojify = "".join(
            emojify(seq_map.get(s, ":heart_eyes:")) for s in seq.seq)
        bioemojify_qual = "".join(
            emojify(qual_map.get(s, ":heart_eyes:"))
            for s in QualityIO._get_sanger_quality_str(seq))
        print(bioemojify + "\n" + bioemojify_qual)


def convert_fastq(options):
    '''Convert FASTQ file to emoji. If no FASTQ files are specified on the command line then
    read from the standard input (stdin).

    The base-to-emoji mapping is read as a Python literal from
    options.custom when given, otherwise the built-in sequence map is used.
    The quality-to-emoji mapping is read from options.custom_qual when
    given; otherwise the binned map is used if options.bin is set, else the
    full map.  File names ending in ".gz", and gzip-compressed stdin, are
    decompressed transparently.

    Arguments:
       options: the command line options of the program
    Result:
       None
    '''

    if options.custom:
        with open(options.custom) as f:
            # literal_eval only accepts Python literals, never arbitrary code.
            mapping_dict_use = ast.literal_eval(f.read())
    else:
        mapping_dict_use = local_seq_emoji_map

    if options.custom_qual:
        with open(options.custom_qual) as f:
            mapping_dict_qual_use = ast.literal_eval(f.read())
    elif options.bin:
        mapping_dict_qual_use = emaps.fastq_emoji_map_binned
    else:
        mapping_dict_qual_use = emaps.fastq_emoji_map

    if options.fastq_files:
        for fastq_filename in options.fastq_files:
            # This function handles FASTQ input; the message used to say
            # "FASTA", unlike the stdin branch below.
            logging.info("Processing FASTQ file from %s", fastq_filename)
            try:
                if fastq_filename.endswith(".gz"):
                    fastq_file = gzip.open(fastq_filename, 'rt')
                else:
                    fastq_file = open(fastq_filename)
            except IOError as exception:
                exit_with_error(str(exception), EXIT_FILE_IO_ERROR)
            else:
                with fastq_file:
                    _print_emoji_records(fastq_file, mapping_dict_use,
                                         mapping_dict_qual_use)
    else:
        logging.info("Processing FASTQ file from stdin")
        # gzip streams start with the magic bytes 0x1f 0x8b; peek() looks at
        # them without consuming any input.
        if sys.stdin.buffer.peek(2)[:2] == b'\x1f\x8b':
            stdin_file = gzip.open(sys.stdin.buffer, 'rt')
        else:
            stdin_file = sys.stdin

        _print_emoji_records(stdin_file, mapping_dict_use,
                             mapping_dict_qual_use)
示例#31
0
 def test_solexa_quality_from_phred(self):
     """Check the rounded PHRED to Solexa mapping for PHRED scores 0 to 99.

     Scores 0-9 are checked against a table of expected Solexa values;
     from 10 upwards the rounded Solexa score must equal the PHRED score.
     """
     # Expected Solexa value for PHRED 0, 1, ..., 9 (index == PHRED score).
     low_range = [-5, -5, -2, 0, 2, 3, 5, 6, 7, 8]
     for phred, solexa in enumerate(low_range):
         self.assertEqual(
             solexa, round(QualityIO.solexa_quality_from_phred(phred)))
     for score in range(10, 100):
         self.assertEqual(
             score, round(QualityIO.solexa_quality_from_phred(score)))
示例#32
0
def Main():
    """Assign paired reads to barcodes using a pool of worker threads.

    Parses the command line, publishes the settings the workers read as
    module-level globals (Range, Trim, max_score, barcodes, barcode_len),
    loads the barcode list, opens the two FASTQ inputs and starts
    ``args.parallel`` threads running ``do_stuff`` on the shared ``MainIO``
    object.  Returns once every worker has finished.
    """
    global Range, Trim, max_score, barcodes, barcode_len

    args = ParseArg()

    Range = args.range
    Trim = args.trim
    max_score = args.max_score

    # Base names of the two inputs (everything after the last '/').
    name1 = args.input1.split('/')[-1]
    name2 = args.input2.split('/')[-1]

    #----------- read barcode ----------
    barcodes = []
    with open(args.barcode, 'r') as barcode_file:
        for line in barcode_file:
            barcode = line.strip()
            if not barcode:
                # A blank (e.g. trailing) line must not become an empty
                # barcode nor reset barcode_len to 0.
                continue
            barcodes.append(barcode)
            # barcode_len ends up as the length of the last barcode read.
            barcode_len = len(barcode)

    barcodes.append('unassign')
    #-----------------------------------

    # Keep both inputs open for exactly as long as the workers run.
    with open(args.input1, "rU") as handle1, \
            open(args.input2, "rU") as handle2:
        records = QualityIO.FastqGeneralIterator(handle1)
        records2 = QualityIO.FastqGeneralIterator(handle2)

        Files = MainIO(records, records2, name1, name2)

        # Single-argument print calls behave the same on Python 2 and 3.
        print("start to assign sequence to different barcodes...")
        print("----------")
        num_thread = args.parallel

        workers = []
        for _ in range(num_thread):
            worker = threading.Thread(target=do_stuff, args=(Files, ))
            workers.append(worker)
            worker.start()

        for worker in workers:
            worker.join()
示例#33
0
 def test_phred_quality_from_solexa(self):
     """Check the rounded Solexa to PHRED mapping for Solexa scores -5 to 99.

     Scores -5 to 9 are checked against a table of expected PHRED values;
     from 10 upwards the rounded PHRED score must equal the Solexa score.
     """
     # Expected PHRED value for Solexa -5, -4, ..., 9, in that order.
     low_range = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 10]
     for solexa, phred in enumerate(low_range, -5):
         self.assertEqual(
             phred, round(QualityIO.phred_quality_from_solexa(solexa)))
     for score in range(10, 100):
         self.assertEqual(
             score, round(QualityIO.phred_quality_from_solexa(score)))
示例#34
0
    seq, tag = str(record.seq), str(record.seq)
    seq_match, tag_match, score, start, end = pairwise2.align.localms(
        seq, tag, 5.0, -4.0, -9.0, -0.5, one_alignment_only=True)[0]
    #name = multiprocessing.current_process().name
    #print 'Worker', name, str(record.seq)
    print "Parent: ", os.getppid(), "Child: ", os.getpid(), "Count: ", count
    return


if __name__ == '__main__':
    start_time = time.time()
    conf = ConfigParser.ConfigParser()
    conf.read('mc454.conf')
    #jobs = []
    record = QualityIO.PairedFastaQualIterator(
        open(conf.get('Input', 'sequence'), "rU"),
        open(conf.get('Input', 'qual'), "rU"))
    mproc = True
    if mproc == True:
        count = 0
        try:
            while count < 500:
                #pdb.set_trace()
                jobs = []
                for i in range(multiprocessing.cpu_count()):
                    count += 1
                    p = multiprocessing.Process(target=worker,
                                                args=(record.next(), count))
                    jobs.append(p)
                    p.start()
                #p.join()