def _trim_polyA_record(record, min_polyA_len, min_read_len):
    """
    Trim the trailing polyA stretch from a single FASTQ record.

    Returns a (header, seq, header2, qual) tuple holding the trimmed
    sequence and matching quality string, or None if the record should
    not be written out.
    """
    header, seq, header2, qual = record
    # Reads that do not end in A are not part of the trimmed output
    if not seq.endswith("A"):
        return None
    # Skip sequences that do not end with at least N many As.
    # endswith() is used rather than seq[-N:] so that N == 0 means
    # "no minimum" instead of comparing the whole read against "".
    if not seq.endswith("A" * min_polyA_len):
        return None
    # Get sequence stripped of contiguous stretch of polyAs
    # (rstrip_stretch is defined elsewhere; presumably it removes the
    # trailing run of the given character -- confirm against its source)
    stripped_seq = rstrip_stretch(seq, "A")
    if len(stripped_seq) < min_read_len:
        # Skip altogether reads that are shorter than
        # the required length after trimming
        return None
    # Strip the quality scores to match trimmed sequence
    new_qual = qual[0:len(stripped_seq)]
    return (header, stripped_seq, header2, new_qual)


def trim_polyA_ends(fastq_filename, output_dir, compressed=False,
                    min_polyA_len=3, min_read_len=22):
    """
    Trim polyA ends from reads.

    Reads each record of fastq_filename and writes the polyA-trimmed
    records to '<basename>.trimmed_polyA.fastq.gz' inside output_dir,
    where <basename> is the input file name without its last extension.
    Only reads ending in at least min_polyA_len As, and at least
    min_read_len bases long after trimming, are written; all other
    reads are skipped.

    If the output file already exists, nothing is done.

    Args:
      fastq_filename: path of the FASTQ file to trim.
      output_dir: directory for the output (created via utils.make_dir).
      compressed: accepted for interface compatibility; not used here.
      min_polyA_len: minimum number of trailing As required.
      min_read_len: minimum read length required after trimming.

    Returns:
      The output filename.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension. splitext() keeps the full name when
    # there is no extension, so extensionless inputs do not all map to
    # the same '.trimmed_polyA.fastq.gz' output.
    output_basename = os.path.splitext(os.path.basename(fastq_filename))[0]
    output_basename = "%s.trimmed_polyA.fastq.gz" % (output_basename)
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print(" - Outputting trimmed sequences to: %s" % (output_filename))
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    try:
        output_file = fastq_utils.write_open_fastq(output_filename)
        try:
            t1 = time.time()
            for record in fastq_utils.read_fastq(input_file):
                new_rec = _trim_polyA_record(record,
                                             min_polyA_len,
                                             min_read_len)
                if new_rec is None:
                    continue
                # Write the record with trimmed sequence back out to file
                fastq_utils.write_fastq(output_file, new_rec)
            t2 = time.time()
            print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
        finally:
            # Always release the output handle, even if trimming fails.
            # NOTE(review): a failure still leaves a partial output file
            # that a later run will treat as already done.
            output_file.close()
    finally:
        input_file.close()
    return output_filename
def trim_polyA_ends(fastq_filename, output_dir, compressed=False,
                    min_polyA_len=3, min_read_len=22):
    """
    Trim polyA ends from reads.

    Writes the trimmed reads of fastq_filename to
    '<basename>.trimmed_polyA.fastq.gz' in output_dir (created with
    utils.make_dir), where <basename> is the input file name minus its
    last extension. A read is written only if it ends in at least
    min_polyA_len As and is still at least min_read_len bases long once
    the trailing As are removed; its quality string is cut to the same
    length. Every other read is skipped.

    If the output file already exists the function returns immediately.
    The 'compressed' argument is kept for interface compatibility and
    is not used here.

    Returns the output filename.
    """
    print("Trimming polyA trails from: %s" % (fastq_filename))
    # Strip the trailing extension; splitext() leaves names without an
    # extension intact rather than reducing them to an empty string.
    input_basename = os.path.basename(fastq_filename)
    output_basename = \
        "%s.trimmed_polyA.fastq.gz" % (os.path.splitext(input_basename)[0])
    output_filename = os.path.join(output_dir, output_basename)
    utils.make_dir(output_dir)
    if os.path.isfile(output_filename):
        print("SKIPPING: %s already exists!" % (output_filename))
        return output_filename
    print(" - Outputting trimmed sequences to: %s" % (output_filename))
    # Required polyA suffix; empty when min_polyA_len is 0, in which
    # case endswith() below is always true (no minimum).
    required_tail = "A" * min_polyA_len
    input_file = fastq_utils.read_open_fastq(fastq_filename)
    try:
        output_file = fastq_utils.write_open_fastq(output_filename)
        try:
            t1 = time.time()
            for line in fastq_utils.read_fastq(input_file):
                header, seq, header2, qual = line
                # Only reads ending in A are candidates for output
                if not seq.endswith("A"):
                    continue
                # Skip sequences that do not end with at least N
                # many As
                if not seq.endswith(required_tail):
                    continue
                # Get sequence stripped of contiguous stretch of polyAs
                stripped_seq = rstrip_stretch(seq, "A")
                if len(stripped_seq) < min_read_len:
                    # Skip altogether reads that are shorter than
                    # the required length after trimming
                    continue
                # Strip the quality scores to match trimmed sequence
                new_qual = qual[0:len(stripped_seq)]
                new_rec = (header, stripped_seq, header2, new_qual)
                # Write the record with trimmed sequence back out to file
                fastq_utils.write_fastq(output_file, new_rec)
            t2 = time.time()
            print("Trimming took %.2f mins." % ((t2 - t1) / 60.))
        finally:
            # Close the output even when an error interrupts trimming
            output_file.close()
    finally:
        # The input handle was previously never closed
        input_file.close()
    return output_filename
def get_fastx_entries(fastx_filename, fasta=False, fastq=False):
    """
    Get entries of FASTQ/FASTA file.

    if fasta=True, read file as fasta regardless of extension.
    if fastq=True, read file as fastq regardless of extension.
    If both flags are set, fasta takes precedence. If neither is set,
    the type is taken from get_fastx_type(fastx_filename).

    Returns whatever fasta_utils.read_fasta / fastq_utils.read_fastq
    return for the file, or an empty list if the type is neither
    "fasta" nor "fastq".
    """
    # Explicit flags override detection, so fastq=True is honored even
    # when the file would be detected as fasta.
    if fasta:
        fastx_type = "fasta"
    elif fastq:
        fastx_type = "fastq"
    else:
        fastx_type = get_fastx_type(fastx_filename)
    if fastx_type == "fasta":
        # It's a FASTA file
        return fasta_utils.read_fasta(fastx_filename)
    if fastx_type == "fastq":
        # It's a FASTQ file. read_fastq is given an open handle from
        # read_open_fastq, as the other callers in this file do.
        # NOTE(review): the handle is left open because read_fastq may
        # consume it lazily -- confirm and close at the call site.
        fastq_file = fastq_utils.read_open_fastq(fastx_filename)
        return fastq_utils.read_fastq(fastq_file)
    return []
def get_seq_cycle_profile(self, fastq_filename, first_n_seqs=None):
    """
    Compute the fraction of 'N' bases (unable to sequence) as a
    function of the position of the read.

    Args:
      fastq_filename: path of the FASTQ file to profile.
      first_n_seqs: if not None, only the first first_n_seqs entries
        of the file are examined.

    Returns:
      A list with one float per read position (index 0 is the first
      base): the number of reads with an N at that position divided by
      the number of reads that cover that position. The list is empty
      if no bases were read.
    """
    fastq_file = fastq_utils.read_open_fastq(fastq_filename)
    # Mapping from position in read to number of Ns
    num_n_bases = defaultdict(int)
    # Mapping from position in read to total number of
    # reads in that position
    num_reads = defaultdict(int)
    num_entries = 0
    print("Computing sequence cycle profile for: %s" % (fastq_filename))
    if first_n_seqs is not None:
        print("Looking at first %d sequences only" % (first_n_seqs))
    try:
        for entry in fastq_utils.read_fastq(fastq_file):
            # Stop at requested number of entries if asked to
            if first_n_seqs is not None and num_entries >= first_n_seqs:
                break
            _header1, seq, _header2, _qual = entry
            for pos, base in enumerate(seq):
                if base == "N":
                    # Record occurrences of N
                    num_n_bases[pos] += 1
                num_reads[pos] += 1
            num_entries += 1
    finally:
        # The handle was previously left open
        fastq_file.close()
    # No bases seen (empty file, or first_n_seqs == 0)
    if not num_reads:
        return []
    # Compute fraction of N along each position. Positions are 0-based,
    # so the last one is max(num_reads) and the range must include it.
    percent_n = []
    for base_pos in range(max(num_reads) + 1):
        curr_percent_n = float(num_n_bases[base_pos]) / num_reads[base_pos]
        percent_n.append(curr_percent_n)
    return percent_n