Пример #1
0
def convert_phred_scores(fname, out_phred_offset):
    """Convert a fastq file's quality scores to the given phred offset.

    Writes converted records to a sibling file named
    <base>_phred<offset>[.<original extensions>].  Refuses to convert a
    file already at the requested offset and returns -1 in that case.

    fname            -- path to a fastq file (possibly gzipped)
    out_phred_offset -- target offset; must be 33 or 64, else sys.exit
    """
    if out_phred_offset not in [33, 64]:
        sys.exit('Error: out_phred_offset must be 33 or 64. Received %s' %
                 repr(out_phred_offset))

    in_phred_offset = determine_phred_offset(fname)
    if in_phred_offset == out_phred_offset:
        print('Cowardly refusing to convert %s from phred%d to phred%d'
              % (fname, in_phred_offset, out_phred_offset))
        return -1

    phred_diff = out_phred_offset - in_phred_offset

    # Insert _phred<offset> before the extension(s).  Only append the
    # extension part when one exists, so an extensionless fname does not
    # produce a name with a trailing '.' (bug in the original).
    fname_parts = fname.split('.')
    out_fname = fname_parts[0] + '_phred' + str(out_phred_offset)
    if fname_parts[1:]:
        out_fname += '.' + '.'.join(fname_parts[1:])

    from misc_tools import gzip_friendly_open
    with gzip_friendly_open(fname) as f, gzip_friendly_open(out_fname,
                                                            'w') as out:
        while True:
            defline = f.readline().strip()
            if not defline:
                break
            seqline = f.readline().strip()
            plusline = f.readline().strip()
            qualline = f.readline().strip()

            # Shift every quality character by the offset difference.
            out_qualline = ''.join(
                chr(ord(c) + phred_diff) for c in qualline)

            out.write('\n'.join([defline, seqline, plusline, out_qualline]) +
                      '\n')
Пример #2
0
def get_reads_by_name(fastq_dir_or_fpath, read_names):
    """Collect fastq records whose ids appear in read_names.

    Accepts either a single fastq file path or a directory that is
    scanned for paired and unpaired fastq files.  Returns a tuple
    (r1_records, r2_records); r2_records stays empty for unpaired input.
    """
    if os.path.isfile(fastq_dir_or_fpath):
        pe_fpaths, se_fpaths = [], [fastq_dir_or_fpath]
    elif os.path.isdir(fastq_dir_or_fpath):
        pe_fpaths, se_fpaths = find_paired_and_unpaired_files(
            fastq_dir_or_fpath)
    else:
        assert False, 'Fastq dir or fpath required'

    if not isinstance(read_names, set):
        read_names = set(read_names)

    total = len(pe_fpaths) + len(se_fpaths)
    r1_out = []
    r2_out = []

    for idx, (fwd_path, rev_path) in enumerate(pe_fpaths, start=1):
        print('%d of %d: %s' % (idx, total, (fwd_path, rev_path)))
        fwd_iter = SeqIO.parse(gzip_friendly_open(fwd_path), 'fastq')
        rev_iter = SeqIO.parse(gzip_friendly_open(rev_path), 'fastq')
        for fwd_rec, rev_rec in izip(fwd_iter, rev_iter):
            # Paired files must stay in lockstep on read ids.
            assert fwd_rec.id == rev_rec.id, \
                (fwd_path, rev_path, fwd_rec.id, rev_rec.id)
            if str(fwd_rec.id) in read_names:
                r1_out.append(fwd_rec)
                r2_out.append(rev_rec)

    for idx, fpath in enumerate(se_fpaths, start=1):
        print('%d of %d: %s' % (len(pe_fpaths) + idx, total, fpath))
        for rec in SeqIO.parse(gzip_friendly_open(fpath), 'fastq'):
            if str(rec.id) in read_names:
                r1_out.append(rec)

    return r1_out, r2_out
Пример #3
0
def insert_lengths(paired_fq_fpaths, short=True):
    """Tally insert lengths for a pair of matched fastq files.

    Overlapping read pairs are collapsed before measuring; otherwise the
    insert length is taken as the sum of both read lengths.  With
    short=True only the first 10000 pairs are examined.  Returns a
    Counter mapping insert length -> count.
    """
    fwd_fpath, rev_fpath = paired_fq_fpaths
    # Paired filenames are expected to differ in exactly one character
    # (typically the R1/R2 digit).
    assert simple_hamming_distance(fwd_fpath, rev_fpath) == 1, paired_fq_fpaths
    length_counts = Counter()
    pair_iter = izip(SeqIO.parse(gzip_friendly_open(fwd_fpath), 'fastq'),
                     SeqIO.parse(gzip_friendly_open(rev_fpath), 'fastq'))
    if short:
        pair_iter = islice(pair_iter, None, 10000)
    for fwd_rec, rev_rec in pair_iter:
        assert fwd_rec.id == rev_rec.id, '%s\n%s' % (fwd_rec.id, rev_rec.id)
        collapsed = collapse_if_overlapped_pair(
            [str(fwd_rec.seq), str(rev_rec.seq)])
        if isinstance(collapsed, str):
            # Reads overlapped and were merged into a single sequence.
            length_counts[len(collapsed)] += 1
        else:
            # No overlap detected: count both read lengths.
            length_counts[sum(len(part) for part in collapsed)] += 1
    return length_counts
Пример #4
0
def determine_phred_offset(filename, num_reads_to_consider=1000):
    """Infer the phred quality offset (33 or 64) of a fastq file.

    Samples up to num_reads_to_consider reads and inspects the range of
    quality characters: any ASCII value below 50 implies phred+33
    (Illumina 1.8+ / Sanger) and any value above 89 implies phred+64
    (Illumina 1.3-1.7 / Solexa).  An inconclusive sample is retried with
    double the sample size.  Raises AssertionError when the file is
    empty or is exhausted without a verdict.

    Fixes over the original: a file shorter than the requested sample no
    longer triggers the assert when its data is already conclusive, and
    an exhausted-but-inconclusive file fails immediately instead of
    recursing forever over the same data.
    """
    min_val = 126
    max_val = 0
    reads_seen = 0
    file_exhausted = False

    with gzip_friendly_open(filename) as f:
        for _ in range(num_reads_to_consider):
            nameline = f.readline()
            if not nameline:
                # Ran out of reads before reaching the sample size.
                file_exhausted = True
                break
            f.readline()  # sequence line (unused)
            f.readline()  # plus line (unused)
            qualline = f.readline()
            reads_seen += 1
            ascii_vals = [ord(c) for c in qualline.strip()]
            if ascii_vals:
                min_val = min(min_val, min(ascii_vals))
                max_val = max(max_val, max(ascii_vals))

    assert reads_seen, 'Could not determine phred offset'

    if min_val < 50:
        return 33  # Illumina 1.8 and Sanger
    elif max_val > 89:
        return 64  # Illumina 1.3 - 1.7 and Solexa
    # Inconclusive: enlarge the sample, but only if the file actually
    # has more reads to offer -- otherwise re-reading cannot help.
    assert not file_exhausted, 'Could not determine phred offset'
    return determine_phred_offset(filename,
                                  num_reads_to_consider=2 *
                                  num_reads_to_consider)
Пример #5
0
def iterate_ncbi_rna_cds_and_tranlation(fpath):
    """From NCBI RNA annotation file, iterate over CDSs and tranlations.

    NOTE(review): 'tranlation' is a typo for 'translation', but the
    function name is public API; renaming would break callers.

    Parses GenBank records from fpath (possibly gzipped), skips records
    without a CDS feature, and for each remaining record yields
    (record, gene_name, record_name, protein_id, cds, ncbi_translation)
    after verifying the NCBI-provided translation against a local
    translation of the extracted CDS.

    Raises AssertionError when a record has more than one CDS, when a
    required qualifier is not singular, or when the translations differ
    beyond the tolerated mismatch pairs.
    """
    standard_mismatches = set(['X*', 'U*'])  # NCBI often translates * as X

    def get_singular_qualifier(feature, qual_name):
        """Return qualifier known to have exactly one entry given feature and qualifier name."""
        quals = feature.qualifiers[qual_name]
        assert len(quals) == 1, str(quals)
        return quals[0]

    for rec in SeqIO.parse(gzip_friendly_open(fpath), 'gb'):
        cds_feats = [feat for feat in rec.features if feat.type.upper() == 'CDS']
        if not cds_feats:
            continue
        # Exactly one CDS is expected per RNA record.
        assert len(cds_feats) == 1, rec.name
        cds_feat = cds_feats[0]

        gene_name = get_singular_qualifier(cds_feat, 'gene')
        protein_id = get_singular_qualifier(cds_feat, 'protein_id')
        ncbi_translation = get_singular_qualifier(cds_feat, 'translation')
        # codon_start is the GenBank 1-based reading-frame qualifier;
        # presumably cds_extract uses it to trim the frame -- TODO
        # confirm against cds_extract's implementation.
        codon_start = int(get_singular_qualifier(cds_feat, 'codon_start'))
        cds, cds_translation = cds_extract(rec.seq, cds_feat.location, codon_start)
        # Note that mismatching first aa is handled separately for alternative start codons
        mismatches = set(c1+c2 for c1, c2 in
                         izip(ncbi_translation[1:], cds_translation[1:]) if c1 != c2)
        if ncbi_translation[0] != 'M' and ncbi_translation[0] != cds_translation[0]:
            mismatches.add(ncbi_translation[0] + cds_translation[0])
        # Only the known-benign mismatch pairs above are tolerated.
        assert mismatches <= standard_mismatches, \
                '\n'.join([repr(mismatches), rec.name, ncbi_translation, cds_translation])
        yield rec, gene_name, rec.name, protein_id, cds, ncbi_translation
Пример #6
0
def iterate_seqs(fpath):
    """Iterate over the four-line records of a (possibly gzipped) fastq file.

    Yields (defline, seqline, plusline, qualline) tuples with surrounding
    whitespace stripped.  Iteration stops at the first empty defline,
    normally end of file.
    """
    with gzip_friendly_open(fpath) as handle:

        def next_stripped():
            return handle.readline().strip()

        defline = next_stripped()
        while defline:
            # Left-to-right evaluation yields seq, plus, qual in order.
            yield defline, next_stripped(), next_stripped(), next_stripped()
            defline = next_stripped()
Пример #7
0
def get_composition_consensus_quality(fpath=None, records=None):
    """Compute per-position base composition, consensus, and mean quality.

    Supply exactly one of fpath (fastq file, possibly gzipped) or
    records (an indexable collection of SeqRecord-like objects).  All
    reads are assumed to share the length of the first one.

    Returns (composition_dict, consensus, avg_quality) where
    composition_dict maps each base of 'ACGT' to a per-position
    frequency array, consensus is a string with 'N' wherever no base
    reaches a 0.5 majority, and avg_quality is the per-position mean
    phred score.
    """
    if fpath is not None:
        with gzip_friendly_open(fpath) as f:
            seq_len = len(next(SeqIO.parse(f, 'fastq')))
        records_iterator = SeqIO.parse(gzip_friendly_open(fpath), 'fastq')
    else:
        assert records is not None
        seq_len = len(records[0])
        records_iterator = records

    bases = 'ACGT'
    composition_dict = dict((b, np.zeros((seq_len, ))) for b in bases)
    # Rows index phred scores 0-40; columns index read positions.
    # NOTE(review): assumes scores stay below 41 -- TODO confirm.
    quality_counts = np.zeros((41, seq_len))
    num_seqs = 0
    for rec in records_iterator:
        num_seqs += 1
        seq_str = str(rec.seq)
        for b in bases:
            composition_dict[b] += np.array(
                [int(ch == b) for ch in seq_str])
        for pos, q in enumerate(rec.letter_annotations['phred_quality']):
            quality_counts[q, pos] += 1

    # Convert counts to per-position frequencies.
    for b in bases:
        composition_dict[b] = composition_dict[b] / float(num_seqs)

    consensus = []
    for pos in range(seq_len):
        best = max(bases, key=lambda b: composition_dict[b][pos])
        consensus.append(best if composition_dict[best][pos] >= 0.5 else 'N')

    weighted = np.array([quality_counts[q] * q for q in range(41)])
    avg_quality = weighted.sum(axis=0) / num_seqs

    return composition_dict, ''.join(consensus), avg_quality