def GC_info(seq, win_len, step):
    """
    Compute %GC statistics for a sequence over sliding windows.

    The overall %GC of ``seq`` is computed with ``GC``; the same function is
    then applied to every window of ``win_len`` characters, moving ``step``
    positions at a time, including the last full window of the sequence.

    :param seq: sequence to analyse (anything sliceable that ``GC`` accepts).
    :param win_len: window length, in characters.
    :param step: offset between the starts of two consecutive windows. Any
        value that is not a positive integer falls back to 1.
    :returns: tuple ``(gc, min_gc, max_gc, sd, cv)`` where ``gc`` is the %GC
        of the whole sequence, ``min_gc``/``max_gc``/``sd`` are the minimum,
        maximum and standard deviation of the per-window %GC, and ``cv`` is
        ``100 * sd / (gc + 1)``. When the window is at least as long as the
        sequence, ``(gc, gc, gc, 0, 0)`` is returned.

    """
    gc = GC(seq)
    seq_len = len(seq)
    if win_len >= seq_len:
        # A single window would cover the whole sequence: no variation.
        return gc, gc, gc, 0, 0

    # Honour the caller's step; guard against values range() cannot use.
    stride = step if isinstance(step, int) and step > 0 else 1

    # "+ 1" so that the last full window (starting at seq_len - win_len) is
    # part of the statistics.
    window_gc = [
        GC(seq[start:start + win_len])
        for start in range(0, seq_len - win_len + 1, stride)
    ]
    sd = numpy.std(window_gc)
    # Applying +1 to GC to make sure we do not divide by 0
    return gc, min(window_gc), max(window_gc), sd, 100. * sd / (gc + 1.)
def bg_len_GC_bins(bg, bg_dir):
    """
    Bin the background sequences by %GC and by length.

    Every record of the FASTA file ``bg`` is stored in ``gc_bins[gc][length]``
    where ``gc`` is the value returned by ``GC`` for the record and ``length``
    is the record length. The bins are then handed to ``print_in_bg_dir``
    together with ``bg_dir``.

    :param bg: path to the background FASTA file.
    :param bg_dir: directory passed to ``print_in_bg_dir``.
    :returns: tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record,
        the list of 101 ``{length: [records]}`` dictionaries, and the length
        of each record.

    """
    # One dictionary per %GC value; GC is used directly as a list index, so
    # it is assumed to return an integer between 0 and 100.
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing fails.
    with open(bg) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths
def fg_len_GC_bins(fg_file):
    """
    Count the foreground sequences per %GC value and per length.

    For every record of the FASTA file ``fg_file``, ``gc_bins[gc][length]``
    is incremented, where ``gc`` is the value returned by ``GC`` for the
    record and ``length`` is the record length.

    :param fg_file: path to the foreground FASTA file.
    :returns: tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record,
        the list of 101 ``{length: count}`` dictionaries, and the length of
        each record.

    """
    # One dictionary per %GC value; GC is used directly as a list index, so
    # it is assumed to return an integer between 0 and 100.
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing fails.
    with open(fg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            length = len(record)
            lengths.append(length)
            length_counts = gc_bins[gc]
            length_counts[length] = length_counts.get(length, 0) + 1
    return gc_list, gc_bins, lengths
def bg_len_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by %GC and by length.

    Every record of the FASTA file ``bg_file`` is stored in
    ``gc_bins[gc][length]`` where ``gc`` is the value returned by ``GC`` for
    the record and ``length`` is the record length. The bins are then handed
    to ``print_in_bg_dir`` together with ``bg_dir``.

    :param bg_file: path to the background FASTA file.
    :param bg_dir: directory passed to ``print_in_bg_dir``.
    :returns: tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record,
        the list of 101 ``{length: [records]}`` dictionaries, and the length
        of each record.

    """
    # One dictionary per %GC value; GC is used directly as a list index, so
    # it is assumed to return an integer between 0 and 100.
    gc_bins = [{} for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing fails.
    with open(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].setdefault(len(record), []).append(record)
            lengths.append(len(record.seq))
    print_in_bg_dir(gc_bins, bg_dir, True)
    return gc_list, gc_bins, lengths
def fg_GC_bins(fg_file):
    """
    Count the foreground sequences per %GC value.

    For every record of the FASTA file ``fg_file``, ``gc_bins[gc]`` is
    incremented, where ``gc`` is the value returned by ``GC`` for the record.

    :param fg_file: path to the foreground FASTA file.
    :returns: tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record,
        the list of 101 per-%GC counts, and the length of each record.

    """
    # One counter per %GC value; GC is used directly as a list index, so it
    # is assumed to return an integer between 0 and 100.
    gc_bins = [0] * 101
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing fails.
    with open(fg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc] += 1
            lengths.append(len(record.seq))
    return gc_list, gc_bins, lengths
def generate_sequences(seqs, winlen, step, nfold):
    """
    Print window-shuffled copies of the input records in FASTA format.

    Each record of ``seqs`` is shuffled ``nfold`` times with
    ``shuffle_window(sequence, winlen, step)``; every shuffled sequence is
    written to standard output as a FASTA entry.

    :returns: tuple ``(gc_values, seq_lengths)`` holding, for every printed
        sequence, the value returned by ``GC`` and the sequence length.

    """
    gc_values = []
    seq_lengths = []
    for record in seqs:
        original = str(record.seq)
        for _ in range(nfold):
            shuffled = shuffle_window(original, winlen, step)
            fasta_entry = SeqRecord(
                Seq(shuffled, generic_dna),
                id="background_seq_{}".format(record.name),
                description="",
            )
            # FASTA text already ends with a newline.
            print(fasta_entry.format("fasta"), end="")
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
    return gc_values, seq_lengths
def bg_GC_bins(bg_file):
    """
    Bin the background sequences by %GC.

    Every record of the FASTA file ``bg_file`` is appended to
    ``gc_bins[gc]``, where ``gc`` is the value returned by ``GC`` for the
    record.

    :param bg_file: path to the background FASTA file.
    :returns: tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record,
        the list of 101 per-%GC record lists, and the length of each record.

    """
    # One list per %GC value; GC is used directly as a list index, so it is
    # assumed to return an integer between 0 and 100.
    gc_bins = [[] for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing fails.
    with open(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].append(record)
            lengths.append(len(record.seq))
    return gc_list, gc_bins, lengths
def generate_sequences(seqs, winlen, step, nfold):
    """
    Print window-shuffled copies of the input records in FASTA format.

    Each record of ``seqs`` is cut into pieces with ``split_seq``. Pieces
    starting with "N" are copied unchanged, other non-empty pieces are
    shuffled with ``shuffle_window(piece, winlen, step)``. This is repeated
    ``nfold`` times per record and every resulting sequence is written to
    standard output as a FASTA entry.

    :returns: tuple ``(gc_values, seq_lengths)`` holding, for every printed
        sequence, the value returned by ``GC`` and the sequence length.

    """
    gc_values = []
    seq_lengths = []
    for record in seqs:
        original = str(record.seq)
        for _ in range(nfold):
            pieces = []
            for piece in split_seq(original):
                if re.match("N", piece):
                    # Keep the piece as it is.
                    pieces.append(piece)
                elif piece:
                    pieces.append(shuffle_window(piece, winlen, step))
            shuffled = "".join(pieces)
            fasta_entry = SeqRecord(
                Seq(shuffled, generic_dna),
                id="background_seq_for_{}".format(record.name),
                description="",
            )
            # FASTA text already ends with a newline.
            print(fasta_entry.format("fasta"), end="")
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
    return gc_values, seq_lengths
def generate_sequences(seqs, nfold):
    """
    Print mononucleotide-shuffled copies of the input records as FASTA.

    Each record of ``seqs`` is shuffled ``nfold`` times by drawing all of its
    characters in random order; every shuffled sequence is written to
    standard output as a FASTA entry.

    :returns: tuple ``(gc_values, seq_lengths)`` holding, for every printed
        sequence, the value returned by ``GC`` and the sequence length.

    """
    gc_values = []
    seq_lengths = []
    for record in seqs:
        original = str(record.seq)
        for _ in range(nfold):
            # Sampling every position without replacement is a permutation.
            permuted = random.sample(original, len(original))
            shuffled = "".join(permuted)
            fasta_entry = SeqRecord(
                Seq(shuffled, generic_dna),
                id="background_seq_for_{}".format(record.name),
                description="",
            )
            # FASTA text already ends with a newline.
            print(fasta_entry.format("fasta"), end="")
            gc_values.append(GC(shuffled))
            seq_lengths.append(len(shuffled))
    return gc_values, seq_lengths
def bg_GC_bins(bg_file, bg_dir):
    """
    Bin the background sequences by %GC.

    Every record of the FASTA file ``bg_file`` is appended to
    ``gc_bins[gc]``, where ``gc`` is the value returned by ``GC`` for the
    record. The bins are then handed to ``print_in_bg_dir`` together with
    ``bg_dir``.

    :param bg_file: path to the background FASTA file.
    :param bg_dir: directory passed to ``print_in_bg_dir``.
    :returns: tuple ``(gc_list, gc_bins, lengths)``: the %GC of each record,
        the list of 101 per-%GC record lists, and the length of each record.

    """
    # One list per %GC value; GC is used directly as a list index, so it is
    # assumed to return an integer between 0 and 100.
    gc_bins = [[] for _ in range(101)]
    gc_list = []
    lengths = []
    # The context manager closes the file even if parsing fails.
    with open(bg_file) as stream:
        for record in SeqIO.parse(stream, "fasta"):
            gc = GC(record.seq)
            gc_list.append(gc)
            gc_bins[gc].append(record)
            lengths.append(len(record.seq))
    print_in_bg_dir(gc_bins, bg_dir)
    return gc_list, gc_bins, lengths
wins = windows.slidingWindow(sequence, size=windowSize, step=overlap, fillvalue="-") # let's count the windows i = 0 for window in wins: if i % 100 == 0: print("[STATUS] \t" + str(i) + " windows processed for " + str(header) + ".", end="\r") # get the sequence seq = ''.join(window) # calculate GC stats currentWindow = str(bin_int) + '-' + str(bin_int + windowSize) PerGC, GCSkew, nucleotideCounts = GC.GCStats(seq) # calculate kmer stats UniqueKmers = kmers.getUniqueKmers(seq, kmerLength) formattedHeader = header.replace(">.", "") GCperWindowCSV.write(formattedHeader + ',' + currentWindow + ',' + str(PerGC) + ',' + str(GCSkew) + ',' + str(UniqueKmers) + '\n') # keep running total of nucleotide counts totalNucleotideCounts = dictionaries.mergeDictionaries( totalNucleotideCounts, nucleotideCounts) if bin_int < seq_length - overlap: bin_int += overlap else: bin_int = 0