def main(params):
    """Build the pool of transcript copies to be sequenced and write it as FASTA.

    Reads isoform transcripts from ``params.transcript_file``, replicates each
    isoform according to its configured abundance, and writes every copy to
    ``params.outfile``.

    Args:
        params: argparse-style namespace; reads ``transcript_file`` (input
            FASTA path), ``outfile`` (output FASTA path) and whatever
            ``generate_abundance`` needs (``logfile``).
    """
    # Consume the FASTA generator inside 'with' so the handle is closed
    # afterwards (the original leaked both file handles).
    with open(params.transcript_file, "r") as transcript_fh:
        isoforms = {acc: seq
                    for acc, seq in misc_functions.read_fasta(transcript_fh)}
    config = set_parameters(len(isoforms))
    all_transcripts_for_sequencing = generate_abundance(isoforms, config, params)
    with open(params.outfile, "w") as out_file:
        for acc, seq in misc_functions.iteritems(all_transcripts_for_sequencing):
            out_file.write(">{0}\n{1}\n".format(acc, seq))
def generate_abundance(isoforms, config, params):
    """Expand each isoform into multiple identical copies per its abundance.

    The abundance of the i-th isoform is picked round-robin from
    ``config["abundance"]`` (the random.choice variant was deliberately
    disabled — kept below for reference). Each copy is logged and named
    ``"<acc>:copy<k>"`` with k in 1..abundance.

    Args:
        isoforms: dict accession -> sequence.
        config: dict with an "abundance" list of integer copy counts.
        params: namespace providing a writable ``logfile``.

    Returns:
        dict mapping "<acc>:copy<k>" -> sequence.
    """
    sequence_pool = {}
    params.logfile.write("ABUNDANCES\n")
    for idx, (acc, seq) in enumerate(misc_functions.iteritems(isoforms)):
        # abundance = random.choice(config["abundance"])
        abundance = config["abundance"][idx % len(config["abundance"])]
        params.logfile.write("{0}, abundance: {1}\n".format(acc, abundance))
        # Copies are numbered from 1. The inner index is renamed so it no
        # longer shadows the outer enumerate counter (the original reused 'i'
        # for both — harmless today, but a trap for future edits).
        for copy_nr in range(1, abundance + 1):
            new_acc = acc + ":copy" + str(copy_nr)
            sequence_pool[new_acc] = seq
    return sequence_pool
def main(params):
    """Generate isoform sequences and write them to ``params.outfile`` as FASTA.

    Two input modes:
      * ``params.exon_file``: evolve a gene family from the exons
        (``create_family``) and derive isoforms by dropping exons.
      * ``params.transcript_file``: take the transcripts as isoforms directly.

    Raises:
        ValueError: if neither input file is given (the original silently fell
            through and crashed later with a NameError on ``isoforms``).
    """
    # config = misc_functions.read_config(params.config)
    config = set_parameters(params)
    print(params.exon_file, params.transcript_file)
    if params.exon_file:
        # Keep all consumers of the read_fasta generator inside 'with' in case
        # it yields lazily; the original never closed the handle.
        with open(params.exon_file, "r") as exon_fh:
            exons = misc_functions.read_fasta(exon_fh)
            tree_evolution = create_family(exons, config)
            isoforms = drop_exons(tree_evolution, config)
    elif params.transcript_file:
        with open(params.transcript_file, "r") as transcript_fh:
            isoforms = {acc: seq
                        for acc, seq in misc_functions.read_fasta(transcript_fh)}
    else:
        raise ValueError("either exon_file or transcript_file must be provided")
    with open(params.outfile, "w") as out_file:
        for acc, seq in misc_functions.iteritems(isoforms):
            out_file.write(">{0}\n{1}\n".format(acc, seq))
def main(params):
    """Create a mutated copy of a single reference transcript, then simulate reads.

    Picks ``params.ed`` distinct positions in the (first and only) reference
    sequence, mutates them, writes the mutated reference to
    ``params.ref_outfile``, and writes simulated reads (generated against the
    mutated reference) to ``params.reads_outfile``.

    Raises:
        ValueError: if ``params.ed`` exceeds the reference length (the original
            rejection loop would spin forever in that case).
    """
    # config = misc_functions.read_config(params.config)
    with open(params.transcript, "r") as ref_fh:
        reference = {acc: seq for acc, seq in misc_functions.read_fasta(ref_fh)}
    ref_seq = list(reference.values())[0]

    # random.sample draws ed distinct positions directly — same uniform
    # distinct-set distribution as the original redraw-until-unique loop,
    # but it terminates (raising ValueError) when ed > len(ref_seq).
    random_positions = set(random.sample(range(len(ref_seq)), params.ed))
    mutated_ref_seq = mutate(ref_seq, random_positions)

    with open(params.ref_outfile, "w") as out_file_ref:
        out_file_ref.write(">{0}\n{1}".format("mutated_ref_seq", mutated_ref_seq))

    # simulate reads
    reads = generate_reads(ref_seq, mutated_ref_seq, params)
    with open(params.reads_outfile, "w") as out_file_reads:
        for acc, seq in misc_functions.iteritems(reads):
            out_file_reads.write(">{0}\n{1}\n".format(acc, seq))
def mutate_member(exons, config, params, mut=True):
    """Mutate (or plainly copy) every exon of one family member.

    Args:
        exons: dict exon number -> sequence.
        config: dict with per-base rates under "mut", "ins" and "del".
        params: namespace providing a writable ``logfile``.
        mut: when False, sequences are copied unchanged but the (all-zero)
            mutation statistics are still logged.

    Returns:
        dict exon number -> (possibly mutated) sequence, or False when
        mutation was requested (``mut=True``) but no mutation occurred.
    """
    leaf_node = {}
    total_mutations = 0
    total_length = 0
    log_string = ""
    for ex_nr, seq in misc_functions.iteritems(exons):
        if mut:
            new_seq, mutation_log, exon_mutations, exon_indels, total_del_length = misc_functions.mutate_sequence(
                seq, config["mut"], config["ins"], config["del"])
            leaf_node[ex_nr] = new_seq
        else:
            new_seq = seq
            leaf_node[ex_nr] = seq
            mutation_log, exon_mutations = "-", 0
        log_string += "mutations in exon: {0}, mutation places: {1}\n".format(
            exon_mutations, mutation_log)
        total_mutations += exon_mutations
        total_length += len(new_seq)

    # Guard clause: mutation was requested but nothing mutated anywhere —
    # signal failure to the caller (original 'else' branch, unchanged).
    if mut and total_mutations == 0:
        print("NO mutations!")
        return False

    # Single logging path; the original duplicated this block verbatim in the
    # 'mut and mutated' and 'not mut' branches.
    params.logfile.write(log_string)
    params.logfile.write("Total mutations: {0}\n".format(total_mutations))
    # Guard an empty exon dict (original raised ZeroDivisionError when
    # mut=False and exons was empty).
    rate = total_mutations / float(total_length) if total_length else 0.0
    params.logfile.write("mutation rate all exons: {0}\n".format(rate))
    return leaf_node
def main(params):
    """Simulate PacBio-like reads from a pool of transcript copies.

    Read lengths are drawn from a triangular distribution — base 0, peak
    10000, base end 45000 — which well approximates the P6-C4 chemistry
    length histogram
    (http://www.slideshare.net/GenomeInABottle/jan2016-pac-bio-giab, slide 13;
    see also http://www.pacb.com/blog/new-chemistry-boosts-average-read/ and
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.random.triangular.html).

    Until ``params.read_count`` reads are produced:
      1. draw a read length from the distribution,
      2. pick a transcript uniformly at random from the pool,
      3. passes = floor(read_length / transcript_length); zero-pass draws are
         discarded. Average accuracy per pass count follows
         https://speakerdeck.com/pacbio/specifics-of-smrt-sequencing-data
         (slide 21, P4-C2 chemistry); one pass is taken as 13% error
         (http://www.sciencedirect.com/science/article/pii/S1672022915001345),
      4. errors are split 11/16 = 68.75% insertions, 4/16 = 25% deletions and
         1/16 = 6.25% substitutions, per
         http://bib.oxfordjournals.org/content/17/1/154.full.pdf and
         http://www.homolog.us/Tutorials/index.php?p=2.8&s=2.

    Writes reads (FASTA) to ``params.outfile``; per-transcript read counts and
    error statistics go to ``params.logfile``.
    """
    # config = misc_functions.read_config(params.config)
    with open(params.sequence_material, "r") as seq_fh:
        sequence_transcripts = dict(misc_functions.read_fasta(seq_fh))

    # passes -> average read accuracy; 18+ passes saturates at 0.999
    quality_function = {
        1: 0.87, 2: 0.95, 3: 0.957, 4: 0.969, 5: 0.981, 6: 0.985,
        7: 0.99, 8: 0.992, 9: 0.994, 10: 0.995, 11: 0.995, 12: 0.995,
        13: 0.996, 14: 0.996, 15: 0.996, 16: 0.999, 17: 0.999, 18: 0.999,
    }

    read_count = 1
    it = 0
    # Draw lengths in batches of 5x the target count; refill when exhausted
    # (zero-pass draws consume entries without producing reads).
    lengths = np.random.triangular(0, 10000, 45000, 5 * params.read_count)
    pacbio_reads = {}
    reads_generated_log = defaultdict(int)
    errors = []
    # Hoisted out of the loop: the original rebuilt this list per iteration.
    transcript_accessions = list(sequence_transcripts.keys())
    while read_count <= params.read_count:
        if it >= len(lengths):
            lengths = np.random.triangular(0, 10000, 45000, 5 * params.read_count)
            it = 0
        read_len = lengths[it]
        acc = random.choice(transcript_accessions)
        transcript = sequence_transcripts[acc]
        passes = int(read_len / len(transcript))
        if passes > 0:
            quality = quality_function[passes] if passes < 18 else 0.999
            subs_rate = (1.0 - quality) * 0.0625
            ins_rate = (1.0 - quality) * 0.6875
            del_rate = (1.0 - quality) * 0.25
            read, error_log, total_error_length, total_indel_length, total_del_length = misc_functions.mutate_sequence(
                transcript, subs_rate, ins_rate, del_rate)
            read_acc = "{0}_read_{1}_error_rate_{2}_total_errors_{3}".format(
                acc, str(read_count),
                total_error_length / float(len(read) + total_del_length),
                total_error_length)
            # params.logfile.write("{0}, error places: {1}\n".format(read_acc, error_log))
            reads_generated_log[acc.split(":copy")[0]] += 1
            errors.append(total_error_length)
            pacbio_reads[read_acc] = read
            read_count += 1
        it += 1

    for acc, abundance in misc_functions.iteritems(reads_generated_log):
        params.logfile.write("{0}\t{1}\n".format(acc, abundance))

    mu, sigma, min_error, max_error, median_error = _error_statistics(errors)
    params.logfile.write(
        "mean error: {0}, sd error:{1}, min_error:{2}, max_error:{3}, median_error:{4}\n"
        .format(mu, sigma, min_error, max_error, median_error))

    # 'with' guarantees the read file is flushed and closed (original leaked it).
    with open(params.outfile, "w") as out_file:
        for acc, seq in misc_functions.iteritems(pacbio_reads):
            out_file.write(">{0}\n{1}\n".format(acc, seq))


def _error_statistics(errors):
    """Return (mean, sample std dev, min, max, median) of the error counts.

    Sample std dev uses the n-1 denominator, as in the original expanded
    x^2 - 2*x*mu + mu^2 formulation; with a single sample it is reported as
    0.0 instead of raising ZeroDivisionError.
    """
    n = float(len(errors))
    mu = sum(errors) / n
    # sqrt(sum((x - mu)^2) / (n - 1)) — algebraically identical to the
    # original's expanded form, but numerically better behaved.
    sigma = (sum((x - mu) ** 2 for x in errors) / (n - 1)) ** 0.5 if n > 1 else 0.0
    ordered = sorted(errors)
    mid = len(ordered) // 2
    if len(ordered) % 2 == 0:
        median = (ordered[mid - 1] + ordered[mid]) / 2.0
    else:
        median = ordered[mid]
    return mu, sigma, ordered[0], ordered[-1], median