def get_num_reads(self):
    """
    Count the reads in this sample's FASTA/FASTQ input file(s).

    Single-end samples give back one integer count. Paired-end
    samples give back a comma-separated string holding one count
    per mate, in the order the mates are stored in
    self.sample.rawdata: 'num_left_mate,num_right_mate'
    """
    self.logger.info("Getting number of reads.")
    if not self.sample.paired:
        # Single-end: rawdata directly carries the one reads file
        self.logger.info("Getting number of single-end reads.")
        single_entries = fastx_utils.get_fastx_entries(
            self.sample.rawdata.reads_filename)
        return sum(1 for _ in single_entries)
    # Paired-end: rawdata is iterated, one item per mate
    self.logger.info("Getting number of paired-end reads.")
    counts_per_mate = []
    for mate in self.sample.rawdata:
        mate_entries = fastx_utils.get_fastx_entries(mate.reads_filename)
        counts_per_mate.append(sum(1 for _ in mate_entries))
    return ",".join(str(count) for count in counts_per_mate)
def get_num_reads(self):
    """
    Report how many reads the sample's FASTA/FASTQ file(s) contain.

    - Single-end sample: the count is returned as an integer.
    - Paired-end sample: the per-mate counts are returned as one
      comma-separated string: 'num_left_mate,num_right_mate'
    """
    def count_entries(reads_filename):
        # Walk the reader once; only the number of entries matters
        total = 0
        for _ in fastx_utils.get_fastx_entries(reads_filename):
            total += 1
        return total

    self.logger.info("Getting number of reads.")
    if self.sample.paired:
        self.logger.info("Getting number of paired-end reads.")
        # One count per mate, in the order rawdata yields the mates
        per_mate_totals = [count_entries(mate.reads_filename)
                           for mate in self.sample.rawdata]
        return ",".join([str(total) for total in per_mate_totals])
    self.logger.info("Getting number of single-end reads.")
    return count_entries(self.sample.rawdata.reads_filename)
def output_bed_coords_from_fasta(fasta_fname, bed_fname):
    """
    Output event coordinates from a FASTA file into a BED format.

    Every FASTA entry name must consist of ';'-separated fields whose
    second field holds the coordinates (it is passed to
    parse_gff_coords), e.g.:

      >part_id;coords;entry_type

    One BED record is written to bed_fname per FASTA entry, with the
    fields: chrom, start - 1, end, the coordinate string, "1", strand.

    Returns the summed length of all FASTA sequences that were read.

    Raises Exception if an entry name does not contain ';'.
    """
    print("Converting FASTA %s to BED %s" % (fasta_fname, bed_fname))
    total_len = 0
    with open(bed_fname, "w") as bed_out:
        for fasta_name, fasta_seq in \
                fastx_utils.get_fastx_entries(fasta_fname):
            # Assume FASTA entry coordinates are in GFF format.
            # Convert them to BED
            if ";" not in fasta_name:
                raise Exception("Malformed FASTA entry name: %s"
                                % (fasta_name))
            gff_coords = fasta_name.split(";")[1]
            chrom, start, end, strand = parse_gff_coords(gff_coords)
            # Convert start to BED by subtracting one
            bed_start = start - 1
            # Build a real list (not a lazy map object) of string fields
            bed_fields = [str(field) for field in
                          (chrom, bed_start, end, gff_coords, "1", strand)]
            bed_entry = pybedtools.create_interval_from_list(bed_fields)
            # NOTE(review): no newline is added here, so this relies on
            # str(bed_entry) ending the line itself -- confirm.
            bed_out.write("%s" % (str(bed_entry)))
            # Accumulate total length of FASTA seqs
            total_len += len(fasta_seq)
    return total_len
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """
    Load the sequences of a FASTA/FASTQ file into a khmer ktable.

    - fastx_fname: file whose entries are read through
      fastx_utils.get_fastx_entries.
    - kmer_len: kmer length the ktable is created with. Sequences
      shorter than this are skipped.

    Prints how long loading took and returns the ktable.
    """
    t1 = time.time()
    ktable = khmer.new_ktable(kmer_len)
    # Load up the FASTA into ktable
    for _name, fastx_seq in fastx_utils.get_fastx_entries(fastx_fname):
        # Skip very short sequences
        if len(fastx_seq) < kmer_len:
            continue
        ktable.consume(fastx_seq)
    t2 = time.time()
    print("Loading up of seqs into ktable took %.2f seconds." % (t2 - t1))
    return ktable
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """
    Build a khmer ktable from all sequences in a FASTA/FASTQ file.

    - fastx_fname: file whose entries are read through
      fastx_utils.get_fastx_entries.
    - kmer_len: kmer length used to create the ktable; any sequence
      shorter than kmer_len is not consumed.

    The elapsed loading time is printed. Returns the ktable.
    """
    t1 = time.time()
    ktable = khmer.new_ktable(kmer_len)
    # Load up the FASTA into ktable
    for _name, fastx_seq in fastx_utils.get_fastx_entries(fastx_fname):
        # Only sequences at least kmer_len long are consumed
        if len(fastx_seq) >= kmer_len:
            ktable.consume(fastx_seq)
    t2 = time.time()
    print("Loading up of seqs into ktable took %.2f seconds." % (t2 - t1))
    return ktable
def jf_counts_to_dict(jf_counts_fname):
    """
    Load jellyfish counts file (a FASTA file) into
    a dictionary mapping kmers to counts.

    Each FASTA entry is read as (name, sequence): the name is the
    count prefixed by '>', the sequence is the kmer. If a kmer occurs
    more than once, the last count read wins.

    Returns a defaultdict(int) keyed by kmer.

    If jf_counts_fname is not an existing file, an error message is
    printed and the process is terminated with sys.exit(1).
    """
    if not os.path.isfile(jf_counts_fname):
        print("Error: Cannot find jf counts file %s" % (jf_counts_fname))
        sys.exit(1)
    kmer_counts = defaultdict(int)
    entries = fastx_utils.get_fastx_entries(jf_counts_fname, fasta=True)
    for count_field, kmer in entries:
        # Remove prefix '>' from FASTA entry
        kmer_counts[kmer] = int(count_field[1:])
    return kmer_counts
def output_dinuc_shuffled_fasta(fasta_fname, shuffled_fasta_fname,
                                num_shuffles=1):
    """
    Given a FASTA file, output a dinucleotide shuffled version of it.

    - fasta_fname: input FASTA file to read entries from.
    - shuffled_fasta_fname: output file, opened through
      fastx_utils.write_open_fastx.
    - num_shuffles: how many shuffled records to write per input
      entry. Each record keeps the entry's name and uses the first
      result of a separate get_dinuc_shuffles() call.

    The output handle is closed even if reading, shuffling or
    writing raises.
    """
    fasta_out = fastx_utils.write_open_fastx(shuffled_fasta_fname)
    try:
        for fastx_name, fastx_seq in \
                fastx_utils.get_fastx_entries(fasta_fname):
            shuffled_recs = [(fastx_name, get_dinuc_shuffles(fastx_seq)[0])
                             for _ in range(num_shuffles)]
            fasta_utils.write_fasta(fasta_out, shuffled_recs)
    finally:
        # try/finally rather than 'with': whether the handle supports
        # the context manager protocol is not visible from here.
        fasta_out.close()
def jf_counts_to_dict(jf_counts_fname):
    """
    Load jellyfish counts file (a FASTA file) into
    a dictionary mapping kmers to counts.

    The file is read as FASTA entries of the form (name, sequence),
    where the name is '>' followed by the count and the sequence is
    the kmer. A kmer seen again overwrites its earlier count.

    Returns a defaultdict(int) mapping kmer -> count.

    A missing file is reported with a printed error message followed
    by sys.exit(1).
    """
    if not os.path.isfile(jf_counts_fname):
        print("Error: Cannot find jf counts file %s" % (jf_counts_fname))
        sys.exit(1)
    kmer_counts = defaultdict(int)
    for name_field, kmer in fastx_utils.get_fastx_entries(jf_counts_fname,
                                                          fasta=True):
        # Remove prefix '>' from FASTA entry
        count_text = name_field[1:]
        kmer_counts[kmer] = int(count_text)
    return kmer_counts
def output_gff_event_seqs(event_ids, input_fasta_fname, output_fasta_fname,
                          entry_types=None,
                          suffixes=None,
                          remove_repeats=False):
    """
    Given a set of event ids, pull out their sequences from an
    input fasta filename and output these to a separate FASTA file.

    A FASTA entry belongs to an event when its name, without the
    leading '>', starts with one of the event ids. Names are expected
    to have at least three ';'-separated fields:

      >event_id;part_id;entry_type

    (a name with fewer fields raises IndexError).

    - entry_types: optional list of entry types that should be
      outputted, e.g. 'exon', 'intron'. Entries whose third name
      field is not in the list are skipped.

    - suffixes: optional list of suffixes. The first name field must
      end in one of them for the entry to be included.

    - remove_repeats: if True, lowercase letters a-z are deleted from
      each sequence before it is written. A sequence that becomes
      empty is reported and skipped.

    Return the names of the entries that were outputted.
    """
    import re

    num_events = len(event_ids)
    print("Retrieving sequences for %d events" % (num_events))
    print(" - Input FASTA: %s" % (input_fasta_fname))
    print(" - Output FASTA: %s" % (output_fasta_fname))
    # str.startswith / str.endswith accept a tuple of candidates and
    # stop at the first match, so build the tuples once up front
    # instead of scanning every event id for every FASTA entry.
    event_prefixes = tuple(event_ids)
    suffix_options = None if suffixes is None else tuple(suffixes)
    lowercase_re = re.compile("[a-z]")
    kept_fasta_entries = []
    with open(output_fasta_fname, "w") as fasta_out:
        for fasta_name, fasta_seq in \
                fastx_utils.get_fastx_entries(input_fasta_fname):
            fasta_name_fields = fasta_name.split(";")
            entry_type = fasta_name_fields[2]
            # Drop the leading '>' before matching against event ids
            if not fasta_name[1:].startswith(event_prefixes):
                continue
            # If given entry types, check that this sequence
            # is of one of the right entry types; otherwise
            # skip it
            if (entry_types is not None) and \
               (entry_type not in entry_types):
                continue
            # If given suffixes, check that the first field
            # of the FASTA name ends in one of the suffixes
            if (suffix_options is not None) and \
               (not fasta_name_fields[0].endswith(suffix_options)):
                continue
            # If asked, remove repeats from sequence
            if remove_repeats:
                repeatless_seq = lowercase_re.sub("", fasta_seq)
                if len(repeatless_seq) == 0:
                    # NOTE(review): the message says "Not removing",
                    # but the entry is skipped rather than written
                    # unchanged -- confirm which is intended.
                    print("%s is all repeat! Not removing" % (fasta_name))
                    continue
                fasta_seq = repeatless_seq
            fasta_out.write("%s\n" % (fasta_name))
            fasta_out.write("%s\n" % (fasta_seq))
            kept_fasta_entries.append(fasta_name)
    print("Outputted %d entries." % (len(kept_fasta_entries)))
    return kept_fasta_entries