def calc_values(seq_list):
    '''
    Calculate the stop motif density of every sequence and normalise it
    against randomised densities.

    seq_list: mapping of identifier -> iterable of sequences
    Returns (densities, nds): both map each identifier to a list holding one
    value per sequence, in the order the sequences were given.
    '''
    # real density of the `stops` motifs, one sequence at a time
    real_densities = collections.defaultdict(list)
    for seq_id in seq_list:
        for sequence in seq_list[seq_id]:
            value = seqo.calc_motif_density([sequence], stops)
            real_densities[seq_id].append(value)

    # randomised densities are generated in parallel, 1000 iterations each
    ids = list(seq_list)
    sim_outputs = gen.run_in_parallel(ids, ["foo", seq_list, 1000],
                                      randomise_densities)

    # pool the per-worker dictionaries into one entry per identifier
    pooled = collections.defaultdict(list)
    for worker in sim_outputs:
        partial = worker.get()
        for seq_id in partial:
            pooled[seq_id].extend(partial[seq_id])
    # plain dict: an identifier missing from the simulations raises KeyError
    random_densities = dict(pooled)

    # normalised density: (real - mean(random)) / mean(random)
    nds = collections.defaultdict(list)
    for seq_id in real_densities:
        for index, exon_density in enumerate(real_densities[seq_id]):
            random_mean = np.mean(random_densities[seq_id][index])
            normalised = np.divide(exon_density - random_mean, random_mean)
            nds[seq_id].append(normalised)

    return real_densities, nds
def generate_motifs_sets(motifs, simulations_to_run, output_file, seed_list=None, onebyone=None):
    '''
    Generate n sets of motifs based on the set of motifs provided and write
    them to output_file, one simulated set per line with its motifs joined
    by "|".

    motifs: the set of motifs the simulated sets are based on
    simulations_to_run: the number of sets to generate
    output_file: path of the file the sets are written to
    seed_list: a list of seeds to use (must be of length greater or equal to
    the number of simulations)
    onebyone: passed straight through to gen.run_in_parallel

    Raises Exception if seed_list is given but holds fewer seeds than
    simulations_to_run.
    '''
    #check that there are enough seeds if the seed is set
    if seed_list and simulations_to_run > len(seed_list):
        message = 'The number of seeds must be at least equal to the number of simulations!'
        print(message)
        #same exception type as before, but now it carries the reason
        raise Exception(message)

    #get dinucleotides
    dinucleotides = get_dinucleotides(motifs)

    #one input per simulation
    input_list = list(range(simulations_to_run))

    #build processes
    processes = gen.run_in_parallel(
        input_list, ["foo", motifs, dinucleotides, seed_list],
        generate_motifs, onebyone)

    #collect the results and write them out; the context manager guarantees
    #the file is closed even if a worker raises while being collected
    with open(output_file, "w") as output:
        for process in processes:
            simulants = process.get()
            if simulants:
                for simulant in simulants:
                    output.write('{0}\n'.format("|".join(simulant)))
def run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, output_file):
    '''
    Simulate stop codon counts by picking hexamers from exon sequences and
    write the real and simulated counts to output_file as csv.

    motif_file: comma separated file with a motif in the first field
    exon_fasta: fasta file holding the exon sequences
    output_dir: not used by this function
    required_simulations: number of simulations to run
    output_file: csv receiving a "real" row followed by one row per simulation
    '''
    # only exons of at least 16 nt are kept so both exon ends can be taken
    _, all_exon_seqs = gen.read_fasta(exon_fasta)
    usable_exons = [seq for seq in all_exon_seqs if len(seq) >= 16]

    # first field of every row that does not start with "#" (header rows)
    motif_rows = gen.read_many_fields(motif_file, ",")
    motifs = [row[0] for row in motif_rows if row[0][0] != "#"]

    real_count = se.get_stop_codon_count(motifs)

    # run the simulations in parallel and flatten the per-worker outputs
    sim_ids = list(range(required_simulations))
    workers = gen.run_in_parallel(sim_ids, ["foo", usable_exons, motifs],
                                  simulate_motifs)
    simulated_counts = []
    for worker in workers:
        simulated_counts.extend(worker.get())

    # simulations are numbered from 1 in the output
    rows = ['sim,count\n', 'real,{0}\n'.format(real_count)]
    for number, count in enumerate(simulated_counts, start=1):
        rows.append('{0},{1}\n'.format(number, count))

    with open(output_file, "w") as outfile:
        outfile.writelines(rows)
def ptc_snp_simulation(out_prefix, simulation_output_folder, ptc_file, syn_nonsyn_file, exon_junctions_file, bam_files, required_simulations, exon_junctions_bam_output_folder, use_old_sims=False):
    '''
    Prepare the folders and inputs for the PTC simulations, then run them.

    With use_old_sims set, existing folders are kept (new simulant SNPs are
    not picked); otherwise the simulation folders are created strictly.
    A simulation_output_folder of "None" (the string) selects the default
    "<out_prefix>_simulate_ptc_snps".
    '''
    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_snps".format(out_prefix)
    simulation_bam_analysis_output_folder = "{0}__analysis_simulation_ptc_snps_bam_analysis".format(
        out_prefix)

    # strict creation starts again when the folder already exists
    make_folder = gen.create_directory if use_old_sims else gen.create_strict_directory
    for folder in (simulation_output_folder,
                   simulation_bam_analysis_output_folder):
        make_folder(folder)

    # nonsynonymous snps are written into the simulation output folder
    nonsynonymous_snps_file = "{0}/nonsynonymous_snps.txt".format(
        simulation_output_folder)
    so.filter_by_snp_type(syn_nonsyn_file, nonsynonymous_snps_file, "non")

    simulations = list(range(1, required_simulations + 1))

    # arguments shared by the parallel and the single-run call
    shared_args = [
        out_prefix, simulation_output_folder,
        simulation_bam_analysis_output_folder, ptc_file,
        nonsynonymous_snps_file, exon_junctions_file, bam_files,
        exon_junctions_bam_output_folder
    ]

    if required_simulations > 1:
        # simulations run in parallel; the instance is told so via True
        workers = gen.run_in_parallel(
            simulations, ["foo"] + shared_args + [True, use_old_sims],
            run_ptc_simulation_instance)
        for worker in workers:
            worker.get()
    else:
        # a single simulation is run directly; the instance is passed False
        # so the bams can be processed in parallel like for true data
        run_ptc_simulation_instance([1], *shared_args, False, use_old_sims)
def large_effect_locations_sim():
    '''
    Compare where the large effect PTCs are located with simulated locations.

    Reads the clean_run_2 result files, runs 10000 simulations in parallel
    and prints the real region/end counts together with two p values of the
    form (number of simulations >= real + 1) / (number of simulations + 1).
    '''
    output_prefix = "results/clean_run_2/clean_run"
    # ptc_file is built but not read anywhere below
    ptc_file = "{0}_ptc_file.txt".format(output_prefix)
    relative_positions_file = "{0}_PTC_relative_exon_positions.bed".format(
        output_prefix)
    final_output_file = "{0}__analysis_final_output.txt".format(output_prefix)

    filtered_list = get_filtered_skipped_exons(final_output_file)
    large_effects, non_large_effects = get_large_effect_overlaps(
        filtered_list, 5, 0.025)

    # exon -> [relative position, start, stop]; the first row is skipped
    rel_pos_list = {}
    rows = gen.read_many_fields(relative_positions_file, "\t")
    for row in rows[1:]:
        start, stop = int(row[1]), int(row[2])
        rel_pos_list[row[3]] = [int(row[11]), start, stop]

    real_regions, real_ends = get_ptc_regions(large_effects, rel_pos_list)

    sims = list(range(10000))
    workers = gen.run_in_parallel(
        sims, ["foo", large_effects, non_large_effects, rel_pos_list],
        large_effects_locations_sim)

    # each worker returns (regions, ends); pool them over all workers
    regions = []
    ends = []
    for worker in workers:
        output = worker.get()
        region_batch, end_batch = output[0], output[1]
        regions.extend(region_batch)
        ends.extend(end_batch)

    # second region entry and first end entry are the ones compared
    region_hits = sum(1 for region in regions
                      if region[1] >= real_regions[1])
    ese_region_pval = np.divide(region_hits + 1, len(regions) + 1)
    end_hits = sum(1 for end in ends if end[0] >= real_ends[0])
    ends_pval = np.divide(end_hits + 1, len(ends) + 1)

    print("PTCs in regions (0-2,3-69,70+): {0}".format(real_regions))
    print("Ends (5', 3'): {0}".format(real_ends))
    print("ESE region compared with sims: {0}".format(ese_region_pval))
    print("ESE exon ends with sims: {0}".format(ends_pval))
def main():
    '''
    Simulate the GC rich and the GC poor set and print an empirical p value
    for the observed difference between them.

    Each set is simulated 100 times in parallel with get_sims. 10000 random
    (high, low) pairs are then drawn from the pooled results and the printed
    p value is the fraction of pairs whose difference exceeds `observed`.
    '''
    #Get genomic flux rates and pTGA
    genomic_taa_tga, genomic_tga_taa, genomic_pTGA = get_nulls(full_source)

    #Leave one core free, but os.cpu_count() may return None and a single
    #core machine must still get one worker
    workers = max(1, (os.cpu_count() or 1) - 1)

    def simulate(source):
        #run the 100 simulations for one source and pool the worker outputs
        sims = list(range(1, 101))
        processes = run_in_parallel(
            sims, ['foo', source, genomic_taa_tga, genomic_tga_taa],
            get_sims, workers=workers)
        pooled = []
        for process in processes:
            pooled.extend(process.get())
        return pooled

    #Simulate GC rich set
    high_sims = simulate(high_source)

    #Simulate GC poor set
    low_sims = simulate(low_source)

    print (len(high_sims))

    #Edit this observed difference according to the trio tested
    observed = 0.08544058

    #Now select random pairs to generate p value
    counter = 0
    no_sims = 0
    for _ in tqdm(range(1,10001)):
        random_high = np.random.choice(high_sims)
        random_low = np.random.choice(low_sims)
        diff = random_high - random_low
        if diff > observed:
            counter += 1
        no_sims += 1

    p = counter / no_sims
    print (p)
def retrieve_bams(ftp_site, local_directory, remote_directory, password_file, subset=None):
    '''
    For each .bam file at the ftp site, download it, transfer it to a remote
    server and delete it.

    ftp_site: the remote site that contains the files, as "host/path/to/dir"
    local_directory: the local directory where the files are temporarily stored
    remote_directory: path to directory on remote server the files are transferred to
    password_file: path to file that contains Watson password
    subset: only retrieve this many .bam files (useful for testing)
    '''
    #make sure the local directory exists
    gen.create_directory(local_directory)

    #everything before the first "/" is the host, the rest is the directory
    host, *path_parts = ftp_site.split("/")
    ftp_directory = "/".join(path_parts)
    user = "******"
    password = "******"

    #list the .bam files on the server, then close the connection
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    bam_files = [name for name in ftp.nlst() if name.endswith(".bam")]
    print(len(bam_files))
    ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()

    #get password for Watson
    with open(password_file) as handle:
        expect_password = handle.read().rstrip("\n")

    #the transfer is done by an expect script that runs rsync; this is the
    #template for that script, where "foo" stands for the name of the file
    #to transfer
    #NOTE: the password ends up in plain text inside this string
    expect_template = ("#!/usr/bin/expect\n"
                       "set timeout -1\n"
                       "spawn rsync {0}/foo {1}\n"
                       "expect \"rs949@bssv-watson's password:\"\n"
                       "send \"{2}\\n\";\n"
                       "expect eof\n"
                       "exit")
    expect_string = expect_template.format(local_directory, remote_directory,
                                           expect_password)

    if subset:
        bam_files = bam_files[:subset]

    #retrieve and transfer .bams in parallel
    workers = gen.run_in_parallel(bam_files, [
        "foo", local_directory, host, user, password, ftp_directory,
        expect_string
    ],
                                  retrieve_bams_core,
                                  workers=6)
    for worker in workers:
        worker.get()
def simulate_exon_ese_hits(simulations, relative_positions, large_effects, non_large_effects, exon_seqs, ese_list):
    '''
    Compare the number of large effect PTCs that hit an ESE with simulations
    and print the real count and an empirical p value.

    simulations: number of simulations to run
    relative_positions: PTC entries; the exon name is read from field 3
    large_effects: collection of exon names classed as large effect
    non_large_effects: collection of exon names classed as non large effect
    exon_seqs, ese_list: passed through to the overlap functions

    The p value is
    (number of simulated counts >= real count + 1) / (simulations + 1).
    '''
    # get the information on the ptc and add to the two lists
    large_effect_info = []
    non_large_effect_info = []
    for ptc in relative_positions:
        exon = ptc[3]
        if exon in large_effects:
            large_effect_info.append(ptc)
        elif exon in non_large_effects:
            non_large_effect_info.append(ptc)

    real_ese_overlap = get_ese_overlap_count(large_effect_info, exon_seqs,
                                             ese_list, real=True)

    # the simulations are run once, in parallel; a previous extra serial run
    # of simulate_exon_ese_overlaps only fed a debugging print and doubled
    # the work, so it is no longer performed
    sims = list(range(simulations))
    processes = gen.run_in_parallel(
        sims,
        ["foo", large_effect_info, exon_seqs, ese_list, non_large_effect_info],
        simulate_exon_ese_overlaps)

    outputs = []
    for process in processes:
        outputs.extend(process.get())

    at_least_real = sum(1 for count in outputs if count >= real_ese_overlap)
    pval = np.divide(at_least_real + 1, simulations + 1)

    print("Number of PTCs that hit an ESE in the real cases: {0}/{1}".format(
        real_ese_overlap, len(large_effects)))
    print(
        "Is this a significant number when picking non-large effect cases: {0}"
        .format(pval))
def ese_hit_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_file, window_start, window_end, exclude_cpg, clinvar=None):
    '''
    Simulate ese hits strictly within a region and write the real and the
    simulated hit counts to output_file as csv.

    rel_pos_file: file with the relative positions of the ptcs
    coding_exons_fasta: fasta file with the coding exons
    simulations: number of simulations to run
    output_file: csv with columns simulation, ese_hit_count, cant_count
    ese_file: file holding the eses
    window_start, window_end: bounds of the region considered in each exon
    exclude_cpg, clinvar: accepted but not used by this function
    '''
    # get a list of the relative positions of the ptcs
    relative_positions_list = get_relative_position_list(rel_pos_file)
    # get a list of eses
    ese_list = get_eses_from_file(ese_file)
    # get the coding exons
    coding_exons = get_coding_exons(coding_exons_fasta)

    # restrict to long exons; twice the window end is the value handed to
    # get_long_exons
    long_exons = get_long_exons(relative_positions_list, coding_exons,
                                window_end*2)
    # get the ptcs that are in the window for each of those exons
    window_ptcs = get_ptcs_in_window(long_exons, window_start, window_end,
                                     coding_exons)
    real_ese_hits = get_ese_hits(window_ptcs, coding_exons, ese_list)

    # simulate the hit counts for nt matched mutations
    simulation_list = list(range(simulations))
    processes = gen.run_in_parallel(simulation_list, [
        "foo", simulations, window_ptcs, coding_exons_fasta, ese_list,
        window_start, window_end
    ], simulate_ese_hits)

    # the accumulator must exist before the first worker is merged; merging
    # in place also avoids rebuilding the whole dict for every worker
    simulation_outputs = {}
    for process in processes:
        simulation_outputs.update(process.get())

    with open(output_file, "w") as outfile:
        outfile.write("simulation,ese_hit_count,cant_count\n")
        outfile.write("real,{0},0\n".format(real_ese_hits))
        # simulations are keyed from 0 and written numbered from 1
        for simulation in sorted(simulation_outputs):
            hits = simulation_outputs[simulation]
            outlist = [simulation+1, hits[0], hits[1]]
            outfile.write("{0}\n".format(",".join(gen.stringify(outlist))))
def calc_values(seq_list):
    '''
    Calculate stop density, GC content and motif density for every exon,
    plus the stop densities normalised against randomised sequences.

    seq_list: mapping of identifier -> iterable of exon sequences
    Returns (densities, nds, gcs, ese). densities, gcs and ese are keyed by
    "<identifier>.<exon index>" and hold a one element list per exon.
    '''
    stop_densities = collections.defaultdict(list)
    gc_contents = collections.defaultdict(list)
    ese_densities = collections.defaultdict(list)

    for seq_id in seq_list:
        for index, exon in enumerate(seq_list[seq_id]):
            exon_key = '{0}.{1}'.format(seq_id, index)
            stop_density = seqo.calc_motif_density([exon], stops)
            stop_densities[exon_key].append(stop_density)
            gc_content = seqo.calc_gc_seqs_combined([exon])
            gc_contents[exon_key].append(gc_content)
            ese_density = seqo.calc_motif_density([exon], motifs)
            ese_densities[exon_key].append(ese_density)

    # randomised densities: 1000 iterations per identifier, run in parallel
    iterations = 1000
    identifiers = list(seq_list)
    sim_outputs = gen.run_in_parallel(identifiers,
                                      ["foo", seq_list, iterations],
                                      randomise_densities)
    randomised_densities = process_densities(sim_outputs)

    nds = calc_nds(stop_densities, randomised_densities)
    return stop_densities, nds, gc_contents, ese_densities
def ptc_location_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_overlap_output_file, ese_file=None, only_ese=None, exclude_cpg=None):
    '''
    Simulate the mutation locations of PTCs.
    Take the exon in which each PTC is located and randomly pick a site
    with the same nt composition. Locations of these matched mutations are
    used for the null.

    rel_pos_file: file with the relative positions of the ptcs
    coding_exons_fasta: fasta file with the coding exons
    simulations: number of simulations to run
    output_file: csv receiving the real and simulated ptc positions
    (skipped when only_ese is set)
    ese_overlap_output_file: csv receiving the real and simulated ese overlaps
    ese_file: file holding the eses
    only_ese: if set, only the ese overlap file is written
    exclude_cpg: passed through to simulate_mutation_locations
    '''
    def _write_counts(path, real_counts, simulated):
        # one "real" row followed by one row per simulant
        # NOTE(review): the header labels are presumably meant to be the
        # regions 0-2, 3-69 and 70+; left untouched because anything reading
        # these files may depend on the exact header - confirm
        with open(path, "w") as outfile:
            outfile.write('simulation,0.2,3.69,70+\n')
            outfile.write('real,{0}\n'.format(
                ",".join(gen.stringify(real_counts))))
            for simulant in simulated:
                outfile.write("{0},{1}\n".format(
                    simulant, ",".join(gen.stringify(simulated[simulant]))))

    # get a list of the relative positions of the ptcs
    relative_positions_list = get_relative_position_list(rel_pos_file)
    # get a list of eses
    ese_list = get_eses_from_file(ese_file)
    # get the coding exons
    coding_exons = get_coding_exons(coding_exons_fasta)
    # get the positions of the ptcs
    real_positions = get_ptc_positions(relative_positions_list, coding_exons)
    # get the number of ptcs with ese overlaps
    real_ese_overlap = get_ptc_ese_overlap(relative_positions_list,
                                           coding_exons, ese_list)

    # now do the simulations
    simulant_list = list(range(1, simulations+1))
    processes = gen.run_in_parallel(simulant_list, [
        "foo", simulations, relative_positions_list, coding_exons_fasta,
        ese_list, exclude_cpg
    ], simulate_mutation_locations)

    # each worker returns (positions, ese overlaps) keyed by simulant; merge
    # in place instead of rebuilding both dicts for every worker
    position_list = {}
    ese_overlap_list = {}
    for process in processes:
        result = process.get()
        position_list.update(result[0])
        ese_overlap_list.update(result[1])

    # ignore writing this to file if we just want the ese overlap
    if not only_ese:
        _write_counts(output_file, real_positions, position_list)
    _write_counts(ese_overlap_output_file, real_ese_overlap, ese_overlap_list)


# NOTE(review): the def line of the older, SNP based variant below is
# commented out. Its body is kept here strictly as comments so that it can
# never execute as part of the function above, where its inputs are not
# defined. The layout was reconstructed; confirm before deleting.
#
# def ptc_location_simulation(snp_file, full_bed, cds_fasta, possible_positions_dir, output_directory, required_simulations, coding_exons_file):
#     '''
#     Simulate the snp location.
#     For each snp, pick another site that has the same reference allele and
#     that would generate a ptc with the mutated allele. Repeat n times.
#     '''
#     # return all the possible_locations
#     possible_locations = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: [])))
#     nts = ["A", "C", "G", "T"]
#     for nt in nts:
#         location_file = "{0}/possible_ptc_locations_{1}.fasta".format(possible_positions_dir, nt)
#         entry_names, entry_locations = gen.read_fasta(location_file)
#         for i, name in enumerate(entry_names):
#             exon = name.split(':')[0]
#             aa = name.split(':')[1][0]
#             ma = name.split(':')[1][-1]
#             possible_locations[exon][aa][ma].append(entry_locations[i])
#     # get a list of exons and their lengths
#     exons = gen.read_many_fields(coding_exons_file, "\t")
#     exon_list = {}
#     for exon in exons:
#         exon_list[exon[3]] = int(exon[2]) - int(exon[1])
#     # create a list of required simulations
#     simulations = list(range(1, int(required_simulations) + 1))
#     run_location_simulations(simulations, snp_file, possible_locations, exon_list, output_directory)
def main():
    '''
    Command line entry point of the PTC / exon skipping pipeline.

    Parses the arguments and then runs, in this order and depending on the
    flags given: filtering of the genome data, SNP retrieval, SNP typing,
    filtering of exon junctions, processing of the bam files, optional motif
    filtering of the PTCs, the PSI calculation, the two simulations and the
    PTC location analysis.

    out_prefix is extended along the way ("_out_of_frame", motif suffix), so
    file names built from it depend on where in the function they are built.

    NOTE(review): the nesting of the steps below was reconstructed from a
    whitespace collapsed source - confirm it against the original file.
    '''
    description = "Check whether PTCs are associated with greater rates of exon skipping."
    # argument 7 (number_of_simulations) is declared in `ints`, arguments
    # 10-22 (filter_genome_data ... ptc_location_analysis) in `flags`
    args = gen.parse_arguments(
        description, [
            "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
            "out_prefix", "bam_analysis_folder", "number_of_simulations",
            "simulation_output_folder", "motif_file", "filter_genome_data",
            "get_SNPs", "process_bams", "simulate_ptc_snps",
            "motif_complement", "overwrite_intersect", "use_old_sims",
            "out_of_frame", "simulate_ptcs_with_monomorphic",
            "generate_monomorphic_indices", "ignore_determine_snp_type",
            "ignore_psi_calculation", "ptc_location_analysis"
        ],
        flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        ints=[7])
    gtf, genome_fasta, bams_folder, vcf_folder, panel_file, out_prefix, bam_analysis_folder, number_of_simulations, simulation_output_folder, motif_file, filter_genome_data, get_SNPs, process_bams, simulate_ptc_snps, motif_complement, overwrite_intersect, use_old_sims, out_of_frame, simulate_ptcs_with_monomorphic, generate_monomorphic_indices, ignore_determine_snp_type, ignore_psi_calculation, ptc_location_analysis = args.gtf, args.genome_fasta, args.bams_folder, args.vcf_folder, args.panel_file, args.out_prefix, args.bam_analysis_folder, args.number_of_simulations, args.simulation_output_folder, args.motif_file, args.filter_genome_data, args.get_SNPs, args.process_bams, args.simulate_ptc_snps, args.motif_complement, args.overwrite_intersect, args.use_old_sims, args.out_of_frame, args.simulate_ptcs_with_monomorphic, args.generate_monomorphic_indices, args.ignore_determine_snp_type, args.ignore_psi_calculation, args.ptc_location_analysis

    # used by gen.get_time to report progress after each step
    start = time.time()

    # create any necessary output directories: everything in out_prefix up
    # to the last "/" is treated as the directory part
    directory_splits = out_prefix.split('/')
    directory_paths = "/".join(directory_splits[:-1])
    gen.create_output_directories(directory_paths)
    gen.create_directory('temp_data/')

    # file names derived from the unmodified out_prefix
    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)

    if filter_genome_data:
        #extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf,
                       CDS_bed,
                       CDS_fasta,
                       genome_fasta,
                       all_checks=True,
                       uniquify=True,
                       clean_chrom_only=True,
                       full_chr_name=True)
        gen.get_time(start)

        #group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl(
            "../source_data/GRCh37_ensembl_protein_families.txt", names,
            "{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        print("Extracting and filtering exons...")
        #extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        #only leave exons from transcripts that passed quality control in the extract_cds step above.
        #also only leave a single gene per family
        bo.filter_bed_from_fasta(
            exon_bed,
            CDS_fasta,
            filtered_exon_bed,
            families_file="{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        #extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed,
                                  exon_junctions_file,
                                  window_of_interest=2)
        gen.get_time(start)

        #make another exons bed that only contains fully coding exons.
        #This is because in the final analysis, we should only consider fully protein-coding exons.
        #However, for getting the exon junctions we need the full exons file because fully protein-coding exons might
        #be flanked by exons that are not. This is why we couldn't do this filtering step earlier.
        print(
            "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..."
        )
        bo.check_coding(filtered_exon_bed,
                        CDS_bed,
                        coding_exon_bed,
                        remove_overlapping=True)
        gen.get_time(start)

    # SNP_file is named before out_prefix is changed; everything named from
    # here on carries the "_out_of_frame" suffix when that flag is set
    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    if out_of_frame:
        out_prefix = out_prefix + "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    # CDS_interval_file is built but not read anywhere below
    CDS_interval_file = "{0}_intervals{1}".format(
        os.path.splitext(CDS_fasta)[0],
        os.path.splitext(CDS_fasta)[1])

    #check which individuals were included in Geuvadis
    # only names ending in ".bam" that do not contain "proc" are kept
    full_sample_names = os.listdir(bams_folder)
    full_sample_names = [
        i for i in full_sample_names if i[-4:] == ".bam" and "proc" not in i
    ]
    # the sample name is the part of the file name before the first "."
    sample_names = [(i.split("."))[0] for i in full_sample_names]
    sample_names = [i for i in sample_names if len(i) > 0]
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))
    #for some reason, 17 of the samples from Geuvadis don't appear in the 1000genomes vcf
    #I'm gonna have to get to the bottom of this at some point
    #but at the moment I'm just gonna filter them out
    with open("../source_data/samples_in_vcf.txt") as file:
        samples_in_vcf = file.readlines()
    samples_in_vcf = [i.rstrip("\n") for i in samples_in_vcf]
    sample_names = [i for i in sample_names if i in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))

    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing all sequences for exons with snp
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed,
                            coding_exons_fasta,
                            genome_fasta,
                            names=True)

    if get_SNPs:
        #get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file,
                           sample_names, sample_file, intersect_file,
                           out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file,
                             out_prefix)
        gen.get_time(start)

    # SNP typing runs unless it is explicitly switched off
    if ignore_determine_snp_type:
        pass
    else:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file,
                                 CDS_fasta,
                                 PTC_file,
                                 syn_nonsyn_file,
                                 out_of_frame=out_of_frame,
                                 ref_check=True,
                                 headers=True)
        gen.get_time(start)

    #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step.
    print(
        "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..."
    )
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(
        out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file,
                             PTC_exon_junctions_file)

    #make a list of all the .bam files and modify them to have the full path rather than just the file name
    bam_files = [
        "{0}/{1}".format(bams_folder, i) for i in full_sample_names
        if (i.split("."))[0] in sample_names
    ]

    #in parallel, do the processing on individual .bam files
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    # "None" (the string) means no folder was given on the command line
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)
    if process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            # drop the "_out_of_frame" suffix from the last path component of
            # the junction bam folder
            splits = exon_junctions_bam_output_folder.split('/')
            splits[-1] = splits[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(splits)
        gen.create_directory(exon_junctions_bam_output_folder)
        #we have to do it like this because you can't pass flags into run_in_parallel
        keyword_dict = {"overwrite_intersect": overwrite_intersect}
        processes = gen.run_in_parallel(bam_files, [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict
        ],
                                        nao.process_bam_per_individual,
                                        workers=36)
        for process in processes:
            process.get()
        gen.get_time(start)

    #if required, filter PTCs to only leave ones that overlap motifs from a specified set
    # motif_filtering is set but not read anywhere below
    motif_filtering = False
    if motif_file != "None":
        print(
            "Filtering SNPs based on whether or not they overlap a motif from the specified set..."
        )
        # the motif file name without directory and extension is appended to
        # out_prefix, so all later outputs carry it
        motif_suffix = ((motif_file.split("/"))[-1]).split(".")[0]
        if motif_complement:
            out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix)
        else:
            out_prefix = "{0}_{1}".format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta,
                             PTC_file,
                             motif_file,
                             filtered_ptc,
                             complement=motif_complement)
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    # the PSI calculation runs unless it is explicitly switched off
    if ignore_psi_calculation:
        pass
    else:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    #run the simulation that swaps ptcs for nonsynonymous snps
    if simulate_ptc_snps:
        if simulate_ptc_snps and not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix,
                               simulation_output_folder,
                               PTC_file,
                               syn_nonsyn_file,
                               exon_junctions_file,
                               bam_files,
                               number_of_simulations,
                               exon_junctions_bam_output_folder,
                               use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if simulate_ptcs_with_monomorphic:
        if simulate_ptcs_with_monomorphic and not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        # built from the current out_prefix, which may by now carry the
        # out-of-frame and motif suffixes
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix,
            simulation_output_folder,
            sample_file,
            genome_fasta,
            PTC_file,
            syn_nonsyn_file,
            coding_exon_bed,
            coding_exon_fasta,
            exon_junctions_file,
            bam_files,
            number_of_simulations,
            generate_indices=generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(
            out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(
            out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        # all three inputs must already exist on disk
        if not os.path.exists(coding_exon_fasta) or not os.path.exists(
                snp_relative_exon_position_file) or not os.path.exists(
                    PTC_file):
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file,
                         ptc_location_analysis_output_file)
def run_ptc_monomorphic_simulation_instance(
        simulations,
        out_prefix,
        simulation_output_folder,
        simulation_bam_analysis_output_folder,
        ptc_file,
        syn_nonsyn_file,
        exon_junctions_file,
        bam_files,
        nt_indices_files,
        coding_exon_fasta,
        parallel=False,
        use_old_sims=False):
    '''
    Run the ptc simulations for the required number.

    simulations: the simulation numbers to run
    out_prefix: prefix used to name the exon junction bam folder
    simulation_output_folder: folder that receives one sub folder per simulation
    simulation_bam_analysis_output_folder: folder for the bam analysis and the
    final output of each simulation
    ptc_file: file with the real ptcs; a copy is placed in
    simulation_output_folder and used from then on
    nt_indices_files, coding_exon_fasta: inputs for generating the pseudo ptcs
    parallel: True when the simulations themselves are run in parallel; the
    bams are then processed with a direct call instead of in parallel
    use_old_sims: keep existing folders and existing pseudo ptc / junction files

    NOTE(review): the extent of the loop body was reconstructed from a
    whitespace collapsed source - confirm it against the original file.
    '''
    #iterate over simulations
    counter = 0
    for simulation_number in simulations:
        counter = gen.update_counter(counter, 10, "SIMULATION ")

        #setup a folder to contain the individual simulation inside the simulations output
        simulation_instance_folder = "{0}/ptc_monomorphic_simulation_run_{1}".format(
            simulation_output_folder, simulation_number)
        if not use_old_sims:
            gen.create_strict_directory(simulation_instance_folder)
        else:
            gen.create_directory(simulation_instance_folder)

        # copy ptc file to directory
        real_ptcs_for_sim_file = "{0}/{1}".format(simulation_output_folder,
                                                  ptc_file.split('/')[-1])
        # ptc_file is rebound to the copy below, so from the second
        # simulation on source and destination are the same file; copying a
        # file onto itself is skipped
        if os.path.abspath(ptc_file) != os.path.abspath(
                real_ptcs_for_sim_file):
            gen.copy_file(ptc_file, real_ptcs_for_sim_file)
        ptc_file = real_ptcs_for_sim_file

        #get list of exons
        exon_list = bo.get_fasta_exon_intervals(coding_exon_fasta)

        #generate pseudo ptc snps, unless old ones are reused and exist
        pseudo_monomorphic_ptc_file = "{0}/pseudo_monomorphic_ptc_file_{1}.txt".format(
            simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (
                not (os.path.isfile(pseudo_monomorphic_ptc_file))):
            so.generate_pseudo_monomorphic_ptcs(ptc_file, nt_indices_files,
                                                exon_list,
                                                pseudo_monomorphic_ptc_file)

        #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step when generating pseudo ptcs
        pseudo_monomorphic_ptc_exon_junctions_file = "{0}/filtered_exon_junctions_{1}.bed".format(
            simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (not (
                os.path.isfile(pseudo_monomorphic_ptc_exon_junctions_file))):
            bo.filter_exon_junctions(
                exon_junctions_file, pseudo_monomorphic_ptc_file,
                pseudo_monomorphic_ptc_exon_junctions_file)

        exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
            out_prefix)
        gen.create_directory(exon_junctions_bam_output_folder)

        #run the bam analysis for each
        #(don't parallelize if you're doing the simulations in parallel)
        kw_dict = {
            "ptc_snp_simulation": True,
            "simulation_instance_folder": simulation_instance_folder,
            "simulation_number": simulation_number
        }
        if parallel:
            process_bam_per_individual(
                bam_files, exon_junctions_file,
                pseudo_monomorphic_ptc_exon_junctions_file,
                simulation_bam_analysis_output_folder,
                pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix,
                exon_junctions_bam_output_folder, kw_dict)
        else:
            processes = gen.run_in_parallel(bam_files, [
                "foo", exon_junctions_file,
                pseudo_monomorphic_ptc_exon_junctions_file,
                simulation_bam_analysis_output_folder,
                pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix,
                exon_junctions_bam_output_folder, kw_dict
            ],
                                            process_bam_per_individual,
                                            workers=36)
            for process in processes:
                process.get()

        #process final psi for simulation
        final_file = "{0}/final_output_simulation_{1}.txt".format(
            simulation_bam_analysis_output_folder, simulation_number)
        bmo.compare_PSI(pseudo_monomorphic_ptc_file,
                        simulation_bam_analysis_output_folder,
                        final_file,
                        sim_number=simulation_number)
def ptc_monomorphic_simulation(out_prefix, simulation_output_folder, sample_file, genome_fasta, ptc_file, syn_nonsyn_file, coding_exon_bed, coding_exon_fasta, exon_junctions_file, bam_files, required_simulations, generate_indices=False, use_old_sims=False):
    '''
    Prepare the folders and index files for the monomorphic site PTC
    simulations, then run them.

    With use_old_sims set, new simulant SNPs are not picked from the
    monomorphic sites and existing folders are kept. The indices of the non
    mutated sites are only (re)generated when generate_indices is set and
    use_old_sims is not.
    A simulation_output_folder of "None" (the string) selects the default
    "<out_prefix>_simulate_ptc_monomorphic_sites".
    '''
    print(
        "Running simulation picking monomorphic sites that have the same ancestral allele as a PTC snp..."
    )

    # indices are regenerated only for a fresh run that asked for them
    regenerate_indices = bool(generate_indices) and not use_old_sims

    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_monomorphic_sites".format(
            out_prefix)
    # strict creation starts again when the folder already exists
    if regenerate_indices:
        gen.create_strict_directory(simulation_output_folder)
    else:
        gen.create_directory(simulation_output_folder)

    simulation_bam_analysis_output_folder = "{0}_simulate_ptc_monomorphic_sites_bam_analysis".format(
        out_prefix)

    # one file per nucleotide holding the positions of non mutated sites
    nt_indices_files = {}
    nt_indices_files["A"] = "{0}/nt_indices_no_mutations_A.fasta".format(
        simulation_output_folder)
    nt_indices_files["C"] = "{0}/nt_indices_no_mutations_C.fasta".format(
        simulation_output_folder)
    nt_indices_files["G"] = "{0}/nt_indices_no_mutations_G.fasta".format(
        simulation_output_folder)
    nt_indices_files["T"] = "{0}/nt_indices_no_mutations_T.fasta".format(
        simulation_output_folder)

    if regenerate_indices:
        get_non_mutation_indices(simulation_output_folder, sample_file,
                                 coding_exon_bed, out_prefix, genome_fasta,
                                 nt_indices_files)

    # the bam analysis folder is recreated for every run without old sims
    if use_old_sims:
        gen.create_directory(simulation_bam_analysis_output_folder)
    else:
        gen.create_strict_directory(simulation_bam_analysis_output_folder)

    simulations = list(range(1, required_simulations + 1))

    # arguments shared by the parallel and the single-run call
    shared_args = [
        out_prefix, simulation_output_folder,
        simulation_bam_analysis_output_folder, ptc_file, syn_nonsyn_file,
        exon_junctions_file, bam_files, nt_indices_files, coding_exon_fasta
    ]

    if required_simulations > 1:
        # simulations run in parallel; the instance is told so via True
        workers = gen.run_in_parallel(
            simulations, ["foo"] + shared_args + [True, use_old_sims],
            run_ptc_monomorphic_simulation_instance,
            workers=36)
        for worker in workers:
            worker.get()
    else:
        # a single simulation is run directly; the instance is passed False
        # so the bams can be processed in parallel like for true data
        run_ptc_monomorphic_simulation_instance([1], *shared_args, False,
                                                use_old_sims)
def large_effects_lengths_sim():
    '''
    Compare the lengths and the length periodicity of the large effect exons
    with simulated exon sets.

    Reads the clean_run_2 result files, runs 10000 simulations in parallel
    and prints the real mean length and periodicity together with two
    p values of the form (number of simulations passing + 1) /
    (number of simulations + 1).
    '''
    output_prefix = "results/clean_run_2/clean_run"
    # ptc_file is built but not read anywhere below
    ptc_file = "{0}_ptc_file.txt".format(output_prefix)
    relative_positions_file = "{0}_PTC_relative_exon_positions.bed".format(
        output_prefix)
    final_output_file = "{0}__analysis_final_output.txt".format(output_prefix)

    filtered_list = get_filtered_skipped_exons(final_output_file)
    large_effects, non_large_effects = get_large_effect_overlaps(
        filtered_list, 5, 0.025)

    # exon -> [relative position, start, stop]; the first row is skipped
    rel_pos_list = {}
    rows = gen.read_many_fields(relative_positions_file, "\t")
    for row in rows[1:]:
        start, stop = int(row[1]), int(row[2])
        rel_pos_list[row[3]] = [int(row[11]), start, stop]

    real_lengths, real_periods = get_exon_length_info(large_effects,
                                                      rel_pos_list)

    sims = list(range(10000))
    workers = gen.run_in_parallel(
        sims, ["foo", large_effects, non_large_effects, rel_pos_list],
        sim_exon_length_info)

    # each worker returns (lengths, periods); pool them over all workers
    lengths = []
    periods = []
    for worker in workers:
        output = worker.get()
        length_batch, period_batch = output[0], output[1]
        lengths.extend(length_batch)
        periods.extend(period_batch)

    # simulations whose mean length is no larger than the real mean length
    real_mean_length = np.mean(real_lengths)
    shorter = sum(1 for length in lengths
                  if np.mean(length) <= real_mean_length)
    length_pval = np.divide(shorter + 1, len(lengths) + 1)

    # first period entry is the one compared
    period_hits = sum(1 for period in periods
                      if period[0] >= real_periods[0])
    period_pval = np.divide(period_hits + 1, len(periods) + 1)

    print("Mean large effect exon length: {0}".format(np.mean(real_lengths)))
    print("Real periodicity (0,1,2): {0}".format(real_periods))
    print("Lengths compared with sims: {0}".format(length_pval))
    print("Periodicity with sims: {0}".format(period_pval))