def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content

    The nucleotide and purine content of the real motifs is written first
    (row id "real"), followed by one row per simulated motif file found in
    the random hexamers directory. That directory is removed on completion.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run.
            If None, no new simulations are generated.
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)

    # get the motifs
    motifs = sequo.read_motifs(motif_file)

    # if there are not enough simulations, generate them
    # (None cannot be ordered against an int, so check it explicitly)
    existing_simulations = len(os.listdir(hexamers_dir))
    if required_simulations is not None and existing_simulations < required_simulations:
        required = list(range(required_simulations - existing_simulations))
        names, seqs = gen.read_fasta(input_fasta)
        # group the sequences by the part of the name before the first "."
        seqs_list = collections.defaultdict(list)
        for name, seq in zip(names, seqs):
            seqs_list[name.split(".")[0]].append(seq)

        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        all_seqs = []
        for seq_id in seqs_list:
            all_seqs.extend(seqs_list[seq_id])

        # join every sequence into one string separated by "X"
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        # nucleotide contents are written in sorted key order
        real_nt_values = [real_nt_content[nt] for nt in sorted(real_nt_content)]
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(gen.stringify(real_nt_values))))
        for sim_id, (purine_content, nt_content) in enumerate(zip(test_purine_content, test_nt_content), start=1):
            nt_values = [nt_content[nt] for nt in sorted(nt_content)]
            outfile.write("{0},{1},{2}\n".format(sim_id, purine_content, ",".join(gen.stringify(nt_values))))

    # remove the temporary random hexamers directory
    gen.remove_directory(hexamers_dir)
def motif_codon_density(motif_file, output_directory):
    """
    Calculate codon set densities in a motif file and write them to a csv.

    The codon sets are read from "gc_matched_combinations.bed" in the output
    directory (generated first if the file does not exist), with the three
    stop codons appended as a final set. Results are written to
    "<output_directory>/motif_densities/<motif file prefix>.csv".

    Args:
        motif_file (str): path to file containing the motifs
        output_directory (str): path to output directory
    """

    stop_codons = ["TAA", "TAG", "TGA"]

    # only build the codon combinations file if it is not already there
    matched_sets_path = "{0}/gc_matched_combinations.bed".format(output_directory)
    if not os.path.isfile(matched_sets_path):
        seqo.get_gc_matched_motifs(stop_codons, matched_sets_path)

    scratch_dir = "temp_motif_density"
    gen.create_output_directories(scratch_dir)

    # one job per codon set, with the stop codons included as the last set
    codon_sets = gen.read_many_fields(matched_sets_path, "\t")
    codon_sets.append(["TAA", "TAG", "TGA"])
    density_files = simoc.run_simulation_function(
        codon_sets,
        [motif_file, scratch_dir],
        ops.calc_codon_density_in_motifs,
        sim_run=False,
    )

    densities_dir = "{0}/motif_densities".format(output_directory)
    gen.create_output_directories(densities_dir)
    # file name up to the first ".", with any leading directories dropped
    motif_prefix = motif_file.rsplit("/", 1)[-1].partition(".")[0]
    output_file = "{0}/{1}.csv".format(densities_dir, motif_prefix)

    with open(output_file, "w") as outfile:
        outfile.write("id,motifs,density\n")
        for row_id, density_file in enumerate(sorted(density_files), start=1):
            # only the first line of each result file is used
            first_row = gen.read_many_fields(density_file, ",")[0]
            outfile.write("{0},{1},{2}\n".format(row_id, first_row[0], first_row[1]))

    gen.remove_directory(scratch_dir)
def motif_stop_codon_densities(motif_file, motif_controls_directory, required_simulations, output_file):
    """
    Calculate stop codon densities for the real motifs and control motif sets.

    Args:
        motif_file (str): path to file containing the real motifs
        motif_controls_directory (str): path to directory of control motif files
        required_simulations (int): maximum number of control files to use
        output_file (str): path to output file
    """

    # the real motifs are keyed "real", the controls by their index
    filelist = {"real": motif_file}
    control_files = os.listdir(motif_controls_directory)[:required_simulations]
    for i, file in enumerate(control_files):
        filelist[i] = "{0}/{1}".format(motif_controls_directory, file)
    file_ids = list(filelist)

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    args = [filelist, temp_dir]
    outputs = simopc.run_simulation_function(file_ids, args, calculate_stop_codon_densities, sim_run = False)

    with open(output_file, "w") as outfile:
        outfile.write("sim_id,stop_density\n")
        for file in outputs:
            # only the first line of each result file is written
            outfile.write("{0}\n".format(",".join(gen.read_many_fields(file, "\t")[0])))

    # the results have been copied to output_file, so the scratch
    # directory is no longer needed
    gen.remove_directory(temp_dir)
def cds_motif_test(cds_fasta, output_file):
    """
    Calculate the density of every combination of three codons in the cds.

    Args:
        cds_fasta (str): path to cds fasta
        output_file (str): path to output file. One row is written per
            result file, ordered by the first field of the result
            (written as "gc") and then by motif name.
    """

    nts = ["A", "C", "G", "T"]
    # all 64 codons, then every unordered set of three distinct codons
    codon_list = sorted("".join(codon) for codon in it.product(nts, repeat=3))
    combinations = [sorted(i) for i in it.combinations(codon_list, 3)]

    temp_dir = "temp_motif_densities"
    gen.create_output_directories(temp_dir)
    args = [cds_fasta, temp_dir]
    outputs = simoc.run_simulation_function(combinations, args, ops.calc_motif_densities, sim_run=False)

    # densities[gc][motif] = density
    densities = collections.defaultdict(dict)
    for filepath in outputs:
        # the motif name is the file name up to the first "."
        motif = filepath.split("/")[-1].split(".")[0]
        # only the first line of each result file is used
        data = gen.read_many_fields(filepath, ",")[0]
        gc, density = data[0], data[1]
        densities[gc][motif] = density

    with open(output_file, "w") as outfile:
        outfile.write("id,motif,gc,density\n")
        row_id = 0
        for gc in sorted(densities):
            for motif in sorted(densities[gc]):
                row_id += 1
                outfile.write("{0},{1},{2},{3}\n".format(
                    row_id, motif, gc, densities[gc][motif]))
def motif_codon_densities(motif_file, codon_combinations_file, motif_controls_directory, required_simulations, output_file):
    """
    Compare codon set densities in the real motifs with control motif sets.

    For each codon set, the real density is written alongside a normalised
    density: (real - mean of controls) / mean of controls.

    Args:
        motif_file (str): path to file containing the real motifs
        codon_combinations_file (str): path to tab separated file of codon sets
        motif_controls_directory (str): path to directory of control motif files
        required_simulations (int): maximum number of control files to use
        output_file (str): path to output file
    """

    # the real motifs are keyed "real", the controls by their index
    control_names = os.listdir(motif_controls_directory)[:required_simulations]
    filelist = {"real": motif_file}
    for index, control_name in enumerate(control_names):
        filelist[index] = "{0}/{1}".format(motif_controls_directory, control_name)
    file_ids = list(filelist)

    codon_sets = gen.read_many_fields(codon_combinations_file, "\t")

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    result_files = simopc.run_simulation_function(
        file_ids,
        [filelist, codon_sets, temp_dir],
        calculate_motif_densities,
        sim_run = False,
    )

    real_densities = {}
    sim_densities = collections.defaultdict(list)
    for result_file in result_files:
        rows = gen.read_many_fields(result_file, "\t")
        # a result file with "real" in its path holds the real motif densities
        if "real" in result_file:
            for row in rows:
                real_densities[row[0]] = float(row[1])
        else:
            for row in rows:
                sim_densities[row[0]].append(float(row[1]))

    with open(output_file, "w") as outfile:
        outfile.write("codons,gc_content,purine_content,density,nd\n")
        for codon_set in sorted(real_densities):
            real_density = real_densities[codon_set]
            sim_mean = np.mean(sim_densities[codon_set])
            nd = np.divide(real_density - sim_mean, sim_mean)
            fields = [
                codon_set,
                seqo.calc_gc_seqs_combined(codon_set.split("_")),
                sequo.calc_purine_content(codon_set.split("_")),
                real_density,
                nd,
            ]
            outfile.write("{0}\n".format(",".join(gen.stringify(fields))))

    gen.remove_directory(temp_dir)
for i, iteration in enumerate(iterations): print("{0}/{1}".format(i + 1, len(iterations))) if iteration != "real": new_seqs = [] for id in seq_list: new_seqs.append(randomise_seq(seq_list[id])) else: new_seqs = [seq_list[i] for i in seq_list] iteration_densities = {} for codon_set in codon_sets: iteration_densities["_".join( codon_set)] = seqo.calc_motif_density(new_seqs, codon_set) output_file = "{0}/{1}.csv".format(output_directory, iteration) with open(output_file, "w") as outfile: for codon_set in codon_sets: outfile.write("{0},{1}\n".format( "_".join(codon_set), iteration_densities["_".join(codon_set)])) return outputs iterations = ["real"] + list(range(1000)) soc.run_simulation_function(iterations, [seq_list, codon_sets, output_directory], run_simulations, sim_run=False, workers=50)