def convert_phred_scores(fname, out_phred_offset): if out_phred_offset not in [33, 64]: sys.exit('Error: out_phred_offset must be 33 or 64. Received %s' % repr(out_phred_offset)) in_phred_offset = determine_phred_offset(fname) if in_phred_offset == out_phred_offset: print 'Cowardly refusing to convert %s from phred%d to phred%d' \ % (fname, in_phred_offset, out_phred_offset) return -1 phred_diff = out_phred_offset - in_phred_offset fname_parts = fname.split('.') out_fname = fname_parts[0] + '_phred' + str( out_phred_offset) + '.' + '.'.join(fname_parts[1:]) from misc_tools import gzip_friendly_open with gzip_friendly_open(fname) as f, gzip_friendly_open(out_fname, 'w') as out: while True: defline = f.readline().strip() if not defline: break seqline = f.readline().strip() plusline = f.readline().strip() qualline = f.readline().strip() out_qualline = ''.join( [chr(ord(c) + phred_diff) for c in qualline]) out.write('\n'.join([defline, seqline, plusline, out_qualline]) + '\n')
def get_reads_by_name(fastq_dir_or_fpath, read_names): if os.path.isfile(fastq_dir_or_fpath): se_fpaths = [fastq_dir_or_fpath] pe_fpaths = [] elif os.path.isdir(fastq_dir_or_fpath): pe_fpaths, se_fpaths = find_paired_and_unpaired_files( fastq_dir_or_fpath) else: assert False, 'Fastq dir or fpath required' if not isinstance(read_names, set): read_names = set(read_names) out_r1_records = [] out_r2_records = [] for i, (fpath1, fpath2) in enumerate(pe_fpaths): print '%d of %d: %s' % (i + 1, len(pe_fpaths) + len(se_fpaths), (fpath1, fpath2)) for rec1, rec2 in izip( SeqIO.parse(gzip_friendly_open(fpath1), 'fastq'), SeqIO.parse(gzip_friendly_open(fpath2), 'fastq')): assert rec1.id == rec2.id, (fpath1, fpath2, rec1.id, rec2.id) if str(rec1.id) in read_names: out_r1_records.append(rec1) out_r2_records.append(rec2) for i, fpath in enumerate(se_fpaths): print '%d of %d: %s' % (len(pe_fpaths) + i + 1, len(pe_fpaths) + len(se_fpaths), fpath) for rec in SeqIO.parse(gzip_friendly_open(fpath), 'fastq'): if str(rec.id) in read_names: out_r1_records.append(rec) return out_r1_records, out_r2_records
def insert_lengths(paired_fq_fpaths, short=True):
    """Return a Counter of insert lengths for a pair of fastq files.

    Overlapping read pairs are collapsed before measuring; non-overlapping
    pairs contribute the sum of both read lengths. With short=True only the
    first 10000 pairs are examined.
    """
    fpath1, fpath2 = paired_fq_fpaths
    # Paired file names are expected to differ in exactly one character (R1/R2).
    assert simple_hamming_distance(fpath1, fpath2) == 1, paired_fq_fpaths
    length_counts = Counter()
    rec_pairs = izip(SeqIO.parse(gzip_friendly_open(fpath1), 'fastq'),
                     SeqIO.parse(gzip_friendly_open(fpath2), 'fastq'))
    if short:
        rec_pairs = islice(rec_pairs, None, 10000)
    for rec1, rec2 in rec_pairs:
        assert rec1.id == rec2.id, '%s\n%s' % (rec1.id, rec2.id)
        collapsed = collapse_if_overlapped_pair([str(rec1.seq), str(rec2.seq)])
        if isinstance(collapsed, str):
            insert_len = len(collapsed)  # pair overlapped: one merged sequence
        else:
            insert_len = sum(len(s) for s in collapsed)
        length_counts[insert_len] += 1
    return length_counts
def determine_phred_offset(filename, num_reads_to_consider=1000):
    """Infer a fastq file's phred quality offset (33 or 64) by sampling reads.

    Scans up to num_reads_to_consider reads; any quality character below
    ASCII 50 implies phred+33, any above ASCII 89 implies phred+64. If the
    sample is ambiguous, retries with a doubled sample size.

    Raises ValueError if the whole file is consumed and the encoding is
    still ambiguous (a larger sample cannot add information at that point).
    """
    min_val = 126
    max_val = 0
    reads_seen = 0
    with gzip_friendly_open(filename) as f:
        while reads_seen < num_reads_to_consider:
            nameline = f.readline()
            if not nameline:
                # EOF: decide from what we have instead of failing outright.
                # (Previously this asserted, so a file shorter than the sample
                # size crashed even when the offset was already determinable.)
                break
            seqline = f.readline()
            plusline = f.readline()
            qualline = f.readline()
            reads_seen += 1
            ascii_vals = list(map(ord, qualline.strip()))  # numeric quality chars
            min_val = min([min_val] + ascii_vals)
            max_val = max([max_val] + ascii_vals)
    if min_val < 50:
        return 33  # Illumina 1.8+ and Sanger
    elif max_val > 89:
        return 64  # Illumina 1.3 - 1.7 and Solexa
    elif reads_seen < num_reads_to_consider:
        # Entire file consumed and still ambiguous; recursing with a larger
        # sample would re-read the same data forever.
        raise ValueError('Could not determine phred offset for %s' % filename)
    else:
        return determine_phred_offset(filename,
                                      num_reads_to_consider=2 * num_reads_to_consider)
def iterate_ncbi_rna_cds_and_tranlation(fpath):
    """From NCBI RNA annotation file, iterate over CDSs and tranlations.

    Parses a GenBank-format file (possibly gzipped) and, for each record with
    exactly one CDS feature, yields a tuple:
        (record, gene_name, record_name, protein_id, cds_seq, ncbi_translation)
    Records with no CDS feature are skipped; more than one CDS is an error.

    NOTE(review): 'tranlation' (sic) is kept in the name for caller compatibility.
    """
    standard_mismatches = set(['X*', 'U*'])  # NCBI often translates * as X
    def get_singular_qualifier(feature, qual_name):
        """Return qualifier known to have exactly one entry given feature and qualifier name."""
        quals = feature.qualifiers[qual_name]
        assert len(quals) == 1, str(quals)
        return quals[0]
    for rec in SeqIO.parse(gzip_friendly_open(fpath), 'gb'):
        cds_feats = [feat for feat in rec.features if feat.type.upper() == 'CDS']
        if not cds_feats:
            continue  # no coding sequence annotated on this record
        assert len(cds_feats) == 1, rec.name
        cds_feat = cds_feats[0]
        gene_name = get_singular_qualifier(cds_feat, 'gene')
        protein_id = get_singular_qualifier(cds_feat, 'protein_id')
        ncbi_translation = get_singular_qualifier(cds_feat, 'translation')
        codon_start = int(get_singular_qualifier(cds_feat, 'codon_start'))
        # Extract the CDS and translate it ourselves, then cross-check our
        # translation against the one NCBI provides.
        cds, cds_translation = cds_extract(rec.seq, cds_feat.location, codon_start)
        # Note that mismatching first aa is handled separately for alternative start codons
        mismatches = set(c1+c2 for c1, c2 in izip(ncbi_translation[1:], cds_translation[1:])
                         if c1 != c2)
        if ncbi_translation[0] != 'M' and ncbi_translation[0] != cds_translation[0]:
            mismatches.add(ncbi_translation[0] + cds_translation[0])
        # Only the known benign mismatch pairs (X*, U*) are tolerated.
        assert mismatches <= standard_mismatches, \
            '\n'.join([repr(mismatches), rec.name, ncbi_translation, cds_translation])
        yield rec, gene_name, rec.name, protein_id, cds, ncbi_translation
def iterate_seqs(fpath):
    """Yield (defline, seqline, plusline, qualline) tuples from a fastq file.

    The file may be gzipped. Iteration stops at the first empty/blank
    defline (end of file).
    """
    with gzip_friendly_open(fpath) as f:
        next_line = f.readline
        defline = next_line().strip()
        while defline:
            yield (defline,
                   next_line().strip(),
                   next_line().strip(),
                   next_line().strip())
            defline = next_line().strip()
def get_composition_consensus_quality(fpath=None, records=None):
    """Per-position base composition, majority consensus, and average quality.

    Input is either fpath (a fastq file, possibly gzipped) or records (an
    indexable collection of SeqRecord-like objects with .seq and
    .letter_annotations['phred_quality']); all sequences are assumed to share
    the length of the first one.

    Returns (composition_dict, consensus, avg_quality) where composition_dict
    maps each base in 'ACGT' to a per-position frequency array, consensus is a
    string with 'N' where no base reaches 50%, and avg_quality is the mean
    phred score per position.
    """
    if fpath is not None:
        # Peek at the first record only to learn the read length, then
        # reopen for the full pass.
        with gzip_friendly_open(fpath) as f:
            first_rec = next(SeqIO.parse(f, 'fastq'))
        seq_len = len(first_rec)
        records_iterator = SeqIO.parse(gzip_friendly_open(fpath), 'fastq')
    else:
        assert records is not None
        seq_len = len(records[0])
        records_iterator = records
    bases = 'ACGT'
    composition_dict = {b: np.zeros((seq_len, )) for b in bases}
    # Bin counts over the full printable-ASCII phred range. The previous
    # fixed 41 bins raised IndexError on a legitimate Q41 base (Illumina 1.8+).
    max_phred = 94
    quality_counts = np.zeros((max_phred, seq_len))
    num_seqs = 0
    for rec in records_iterator:
        num_seqs += 1
        seq_str = str(rec.seq)
        for b in bases:
            composition_dict[b] += np.array([int(c == b) for c in seq_str])
        for i, q in enumerate(rec.letter_annotations['phred_quality']):
            quality_counts[q, i] += 1
    for b in bases:
        composition_dict[b] = composition_dict[b] / float(num_seqs)
    consensus = []
    for i in range(seq_len):
        cons_base = max(bases, key=lambda b: composition_dict[b][i])
        if composition_dict[cons_base][i] < 0.5:
            cons_base = 'N'  # no majority base at this position
        consensus.append(cons_base)
    # Mean quality per position: sum_q q * count[q, i], normalized by depth.
    avg_quality = (quality_counts * np.arange(max_phred)[:, np.newaxis]).sum(axis=0) / num_seqs
    return composition_dict, ''.join(consensus), avg_quality