def ptc_snp_simulation(out_prefix, simulation_output_folder, ptc_file, syn_nonsyn_file, exon_junctions_file, bam_files, required_simulations, exon_junctions_bam_output_folder, use_old_sims=False):
    '''
    Set up the PTC simulations and then run them.
    If use_old_sims is True, don't pick new simulant SNPs.
    '''

    #set up the simulation output folder
    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_snps".format(out_prefix)
    if not use_old_sims:
        #if the simulation folder we are specifying already exists, delete it and start again
        gen.create_strict_directory(simulation_output_folder)
    else:
        gen.create_directory(simulation_output_folder)

    #set up the simulation bam analysis output folder
    simulation_bam_analysis_output_folder = "{0}__analysis_simulation_ptc_snps_bam_analysis".format(out_prefix)
    if not use_old_sims:
        #if the folder we are specifying already exists, delete it and start again
        gen.create_strict_directory(simulation_bam_analysis_output_folder)
    else:
        gen.create_directory(simulation_bam_analysis_output_folder)

    #get all the nonsynonymous snps and put them in the simulation output folder
    nonsynonymous_snps_file = "{0}/nonsynonymous_snps.txt".format(simulation_output_folder)
    so.filter_by_snp_type(syn_nonsyn_file, nonsynonymous_snps_file, "non")

    #create a list of simulations to iterate over
    simulations = list(range(1, required_simulations + 1))
    #if you're only doing one simulation, don't parallelize the simulations;
    #parallelize the processing of bams like for true data
    if required_simulations > 1:
        processes = gen.run_in_parallel(simulations, ["foo", out_prefix, simulation_output_folder, simulation_bam_analysis_output_folder, ptc_file, nonsynonymous_snps_file, exon_junctions_file, bam_files, exon_junctions_bam_output_folder, True, use_old_sims], run_ptc_simulation_instance)
        for process in processes:
            process.get()
    else:
        run_ptc_simulation_instance([1], out_prefix, simulation_output_folder, simulation_bam_analysis_output_folder, ptc_file, nonsynonymous_snps_file, exon_junctions_file, bam_files, exon_junctions_bam_output_folder, False, use_old_sims)
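#A minimal usage sketch, assuming the PTC, SNP and junction files were produced
#by the main pipeline beforehand. All paths and file names are hypothetical;
#passing "None" lets the function derive the simulation folder from the prefix.
bam_files = ["bams/HG00096.bam", "bams/HG00097.bam"]
ptc_snp_simulation("results/run1", "None",
                   "results/run1_ptc_file.txt",
                   "results/run1_syn_nonsyn_file.txt",
                   "results/run1_exon_junctions.bed",
                   bam_files, 5,
                   "results/run1__analysis_exon_junction_bams")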
def retrieve_bams(ftp_site, local_directory, remote_directory, password_file, subset=None):
    '''
    For each .bam file at the ftp site, download it, transfer it to a remote server and delete it.
    ftp_site: the remote site that contains the files
    local_directory: the local directory where you want to temporarily store the files
    remote_directory: path to the directory on the remote server where you want to transfer the files
    password_file: path to the file that contains the Watson password
    subset: only retrieve this many .bam files (useful for testing)
    '''

    #create the local directory, if it doesn't exist
    gen.create_directory(local_directory)
    #split the ftp_site address into host and path
    ftp_site = ftp_site.split("/")
    host = ftp_site[0]
    ftp_directory = "/".join(ftp_site[1:])
    user = "******"
    password = "******"
    #connect to the FTP server
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    #get a list of all the .bam files
    all_files = ftp.nlst()
    all_files = [i for i in all_files if i[-4:] == ".bam"]
    print(len(all_files))
    ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()
    #get the password for Watson
    with open(password_file) as file:
        expect_password = "".join(file)
    expect_password = expect_password.rstrip("\n")
    #I will use expect to run scp from the script
    #the way this works is you write an expect script
    #and then use the expect programme to run it
    #this is the string that will be in the script
    #each time, you replace "foo" with the name of the file you want to transfer
    expect_string = "#!/usr/bin/expect\nset timeout -1\nspawn rsync {0}/foo {1}\nexpect \"rs949@bssv-watson's password:\"\nsend \"{2}\\n\";\nexpect eof\nexit".format(local_directory, remote_directory, expect_password)
    if subset:
        all_files = all_files[:subset]
    #retrieve and transfer the .bams in parallel
    processes = gen.run_in_parallel(all_files, ["foo", local_directory, host, user, password, ftp_directory, expect_string], retrieve_bams_core, workers=6)
    for process in processes:
        process.get()
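#A hedged sketch of the transfer step that retrieve_bams_core presumably
#performs for each file (retrieve_bams_core is not shown here, so this is an
#assumption): swap the "foo" placeholder in the expect string for the current
#file name, write the script to disk and run it with the expect programme.
#The function name and the script path are hypothetical.
import subprocess

def transfer_with_expect(expect_string, file_name, script_path="temp_transfer.exp"):
    #substitute the placeholder with the file to transfer
    script = expect_string.replace("foo", file_name)
    with open(script_path, "w") as out:
        out.write(script)
    #run the generated expect script
    subprocess.run(["expect", script_path], check=True)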
def run_ptc_simulation_instance(simulations, out_prefix, simulation_output_folder, simulation_bam_analysis_output_folder, ptc_file, nonsynonymous_snps_file, exon_junctions_file, bam_files, exon_junctions_bam_output_folder, parallel=False, use_old_sims=False):
    '''
    Run the required number of PTC simulations.
    '''

    #iterate over the simulations
    counter = 0
    for simulation_number in simulations:
        counter = gen.update_counter(counter, 10, "SIMULATION ")

        #set up a folder to contain the individual simulation inside the simulations output
        simulation_instance_folder = "{0}/ptc_simulation_run_{1}".format(simulation_output_folder, simulation_number)
        if not use_old_sims:
            gen.create_strict_directory(simulation_instance_folder)
        else:
            gen.create_directory(simulation_instance_folder)

        #generate pseudo ptc snps
        #also need to remove these snps from the file they started in, so create a new remaining snps file
        #we can tweak these if we start running out of snps
        pseudo_ptc_file = "{0}/pseudo_ptc_file_{1}.txt".format(simulation_instance_folder, simulation_number)
        remaining_snps_file = "{0}/remaining_snps_file_{1}.txt".format(simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (not os.path.isfile(pseudo_ptc_file)):
            so.generate_pseudo_ptc_snps(ptc_file, nonsynonymous_snps_file, pseudo_ptc_file, remaining_snps_file, group_by_gene=False, without_replacement=True, match_allele_frequency=True, match_allele_frequency_window=0.05)

        #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step when generating pseudo ptcs
        pseudo_ptc_exon_junctions_file = "{0}/filtered_exon_junctions_{1}.bed".format(simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (not os.path.isfile(pseudo_ptc_exon_junctions_file)):
            bo.filter_exon_junctions(exon_junctions_file, pseudo_ptc_file, pseudo_ptc_exon_junctions_file)
def bam_quality_filter(input_bam, output_bam, quality_greater_than_equal_to=None, quality_less_than_equal_to=None):
    '''
    Filter bam reads by quality.
    quality_greater_than_equal_to: the lower threshold (keep reads with quality >= this value)
    quality_less_than_equal_to: the upper threshold (keep reads with quality <= this value)
    '''

    samtools_args = ["samtools", "view", "-h"]

    #if neither threshold is specified
    if not quality_greater_than_equal_to and not quality_less_than_equal_to:
        print("You must specify at least one threshold to filter reads by.")
        raise Exception
    #if both thresholds are specified
    if quality_greater_than_equal_to and quality_less_than_equal_to:
        #create a temp file
        gen.create_directory("temp_data/")
        temp_file = "temp_data/{0}.{1}.bam".format(os.path.split(output_bam)[1][:-4], random.random())
        #first get everything below the upper threshold
        #need to account for the fact that samtools removes everything below the threshold,
        #so when inverting we need to add 1 to the limit
        args = samtools_args.copy()
        upper_limit = quality_less_than_equal_to + 1
        args.extend(["-q", str(upper_limit), input_bam, "-U", temp_file])
        gen.run_process(args)
        #second, get everything above the lower threshold
        args = samtools_args.copy()
        args.extend(["-bq", str(quality_greater_than_equal_to), temp_file])
        gen.run_process(args, file_for_output=output_bam)
        #clean up files
        gen.remove_file(temp_file)
    #if only the lower threshold is specified
    elif quality_greater_than_equal_to and not quality_less_than_equal_to:
        samtools_args.extend(["-bq", str(quality_greater_than_equal_to), input_bam])
        gen.run_process(samtools_args, file_for_output=output_bam)
    #if only the upper threshold is specified
    elif quality_less_than_equal_to and not quality_greater_than_equal_to:
        #need to account for the fact that samtools removes everything below the threshold,
        #so when inverting we need to add 1 to the limit
        upper_limit = quality_less_than_equal_to + 1
        samtools_args.extend(["-q", str(upper_limit), input_bam, "-U", output_bam])
        gen.run_process(samtools_args)
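#A hedged usage sketch (hypothetical file names): keep reads with 20 <= MAPQ <= 40.
#With both thresholds set, the call is roughly equivalent to running
#  samtools view -h -q 41 input.bam -U kept.bam    (kept.bam holds reads with MAPQ <= 40)
#  samtools view -h -bq 20 kept.bam                (then keep reads with MAPQ >= 20)
bam_quality_filter("input.bam", "filtered.bam",
                   quality_greater_than_equal_to=20,
                   quality_less_than_equal_to=40)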
def fasta_from_intervals_temp_file(bed_file, output_fasta, genome_fasta, random_directory=None):
    '''
    Create a temporary file to hold the fasta extractions
    '''

    random_int = np.random.randint(9999999, size=2)
    if random_directory:
        temp_directory_path = './temp_files/temp_fasta_files_{0}'.format(random_int[0])
    else:
        temp_directory_path = './temp_files/temp_fasta_files'

    #create the temp directory if it doesn't already exist
    gen.create_directory('./temp_files/')
    #delete the temp fasta directory and create it anew
    gen.create_strict_directory(temp_directory_path)

    #set the temporary fasta file path
    temp_fasta_file = '{0}/{1}_{2}{3}'.format(temp_directory_path, os.path.splitext(os.path.basename(output_fasta))[0], random_int[1], os.path.splitext(os.path.basename(output_fasta))[1])

    fasta_from_intervals(bed_file, temp_fasta_file, genome_fasta, force_strand=True, names=True)

    return(temp_fasta_file, temp_directory_path)
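#A hedged usage sketch (hypothetical file names). The caller receives both the
#temporary fasta path and the temporary directory, presumably so it can clean
#the directory up once the sequences have been processed.
temp_fasta, temp_dir = fasta_from_intervals_temp_file("exons.bed", "exons.fasta", "genome.fa")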
def main(): description = "Check whether stop codons are depleted in motif sets by simulating the motif set." args = gen.parse_arguments(description, [ "required_simulations", "all_sets", "ESR", "Ke", "PESE", "RESCUE", "INT3", "RBP_motifs", "filter_RBPs", "split_RBPs" ], flags=[1, 2, 3, 4, 5, 6, 7, 8, 9]) required_simulations, all_sets, ESR, Ke, PESE, RESCUE, INT3, RBP_motifs, filter_RBPs, split_rbps = args.required_simulations, args.all_sets, args.ESR, args.Ke, args.PESE, args.RESCUE, args.INT3, args.RBP_motifs, args.filter_RBPs, args.split_RBPs if split_rbps and not filter_RBPs: print('You must specify the filtered RBPs if you want to split by ND.') raise Exception if not required_simulations: print('You must specify the number of simulations you require.') raise Exception #create the output_directory output_directory = "output_data" gen.create_directory(output_directory) #set up the simulations we want required_sets = [] if all_sets: required_sets.extend([i for i in ese_sets]) else: if ESR: required_sets.append("ESR") if Ke: required_sets.append("Ke400_ESEs") if PESE: required_sets.append("PESE") if RESCUE: required_sets.append("RESCUE") if INT3: required_sets.append("INT3") if RBP_motifs and not filter_RBPs: required_sets.append("RBP_motifs") if RBP_motifs and filter_RBPs: required_sets.append("RBP_motifs_filtered") #check whether any sets have been chosen if len(required_sets) == 0: print("\nPlease choose a motif set to analyse:\n") [print("--{0}".format(i)) for i in sorted(ese_sets)] print("\n") raise Exception #create the necessary files simulation_sets = [] for ese_set in required_sets: if ese_set == "RBP_motifs_filtered": dir_name = "RBP_motifs" else: dir_name = ese_set #create the output directory for the particular motif set motif_output_directory = "{0}/{1}".format(output_directory, dir_name) gen.create_directory(motif_output_directory) if split_rbps: #if we want to split the rbp motifs based on nd, need to create 2 lots of outputs simulated_set_output_pos_nd = "{0}/{1}_simulants_pos_nd_{2}.txt".format( motif_output_directory, dir_name, required_simulations) output_file_pos_nd = "{0}/{1}_stop_counts_pos_nd_{2}.csv".format( motif_output_directory, dir_name, required_simulations) simulation_sets.append([ ese_set, simulated_set_output_pos_nd, output_file_pos_nd, 1, "Positive ND" ]) simulated_set_output_neg_nd = "{0}/{1}_simulants_neg_nd_{2}.txt".format( motif_output_directory, dir_name, required_simulations) output_file_neg_nd = "{0}/{1}_stop_counts_neg_nd_{2}.csv".format( motif_output_directory, dir_name, required_simulations) simulation_sets.append([ ese_set, simulated_set_output_neg_nd, output_file_neg_nd, -1, "Negative ND" ]) else: #create simulated set output, analysis output file simulated_set_output = "{0}/{1}_simulants_{2}.txt".format( motif_output_directory, dir_name, required_simulations) output_file = "{0}/{1}_stop_counts_{2}.csv".format( motif_output_directory, dir_name, required_simulations) simulation_sets.append( [ese_set, simulated_set_output, output_file]) run_simulations(simulation_sets, int(required_simulations))
def intersect_bed(bed_file1, bed_file2, overlap=False, overlap_rec=False, write_both=False, sort=False, output_file=None, force_strand=False, no_name_check=False, no_dups=True, intersect=False, hit_count=False, bed_path=None, intersect_bam=None, write_zero=False, write_bed=False, subtract=None, return_non_overlaps=None, write_none=False):
    """
    Use bedtools to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    Adapted from RS.

    Args:
        bed_file1 (str): path to the first bed file (can be a bam file if intersect_bam=True)
        bed_file2 (str): path to the second bed file
        overlap (float): minimum overlap required as a fraction of the intervals in bed file 1 (EX: 0.8 means that the overlap has to be at least 80% of the intervals in bed file 1).
        overlap_rec (bool): if true, require that the overlap as a fraction of the intervals in file 2 be at least as high as the threshold indicated in -f.
        write_both (bool): if true, return not only the interval from bed file 1 but, tagged onto the end, also the interval from bed file 2 that it overlaps.
        sort (bool): if true, sort the bed files before taking the intersection
        output_file (str): if given, write the output to this file
        force_strand (bool): if true, check that the feature and the bed interval are on the same strand
        no_name_check (bool): if false, check whether the chromosome names are the same in the two bed files
        no_dups (bool): if true, only return each interval once. If false, intervals in bed file 1 that overlap several intervals in bed file 2 will be returned several times (as many times as there are overlaps with different elements in bed file 2)
        intersect (bool): if true, rather than returning the entire interval, only return the part of the interval that overlaps an interval in bed file 2.
        hit_count (bool): for each element in bed file 1, return the number of elements it overlaps in bed file 2
        intersect_bam (bool): if true, intersect a bam file with a bed file. Requires the bam file to be given as the first file
        write_zero (bool): like write_both but also write A intervals that don't overlap with any B intervals
        write_bed (bool): if true, when intersecting a bam file, write the output as bed
        subtract (bool): if true, use subtractBed instead
        return_non_overlaps (bool): if true, only return entries in bed file 1 that don't overlap bed file 2

    Returns:
        bedtools_output (list): list of bed lines from the output file
    """

    gen.create_directory("temp_data/")
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    #have it write the output to a temporary file
    bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, write_both, overlap, sort, no_name_check, no_dups, output_file=temp_file_name, intersect=intersect, hit_number=hit_count, bed_path=bed_path, intersect_bam=intersect_bam, write_zero=write_zero, overlap_rec=overlap_rec, write_bed=write_bed, subtract=subtract, return_non_overlaps=return_non_overlaps, write_none=write_none)
    #move the output to a permanent location only if you want to keep it
    if output_file:
        gen.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = gen.read_many_fields(temp_file_name, "\t")
        gen.remove_file(temp_file_name)
    return(bedtools_output)
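#A hedged usage sketch (hypothetical bed files): return the lines of exons.bed
#whose intervals are covered by repeats.bed over at least 80% of their length,
#requiring matching strands.
overlapping = intersect_bed("exons.bed", "repeats.bed", overlap=0.8, force_strand=True)
#alternatively, write just the overlapping sub-intervals straight to a file
intersect_bed("exons.bed", "repeats.bed", intersect=True, output_file="exon_repeat_overlap.bed")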
def intersect_bed(bed_file1, bed_file2, use_bedops=False, overlap=False, overlap_rec=False, write_both=False, sort=False, output_file=None, force_strand=False, no_name_check=False, no_dups=True, chrom=None, intersect=False, hit_count=False, bed_path=None, intersect_bam=None, write_zero=False, write_bed=False, subtract=None):
    '''Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1 (EX: 0.8 means that the overlap has to be at least 80% of the intervals in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2 be at least as high as the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged onto the end, also the interval from bed file 2 that it overlaps (only valid when using bedtools).
    sort: sort the bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same strand (only valid with bedtools)
    no_name_check: if set to False, check whether the chromosome names are the same in the two bed files (only valid with bedtools)
    no_dups: if True, only return each interval once. If set to False, intervals in bed file 1 that overlap several intervals in bed file 2 will be returned several times (as many times as there are overlaps with different elements in bed file 2)
    chrom: limit the search to a specific chromosome (only valid with bedops, can help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of the interval that overlaps an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it overlaps in bed file 2 (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires the bam file to be given as the first file
    write_zero: like write_both but also write A intervals that don't overlap with any B intervals
    write_bed: when intersecting a bam file, write the output as bed.'''

    gen.create_directory("temp_data/")
    temp_file_name = "temp_data/temp_bed_file{0}.bed".format(random.random())
    #have it write the output to a temporary file
    if use_bedops:
        bedtools_output = run_bedops(bed_file1, bed_file2, force_strand, write_both, chrom, overlap, sort, output_file=temp_file_name, intersect=intersect, hit_number=hit_count, no_dups=no_dups, intersect_bam=intersect_bam, overlap_rec=overlap_rec)
    else:
        bedtools_output = run_bedtools(bed_file1, bed_file2, force_strand, write_both, chrom, overlap, sort, no_name_check, no_dups, output_file=temp_file_name, intersect=intersect, hit_number=hit_count, bed_path=bed_path, intersect_bam=intersect_bam, write_zero=write_zero, overlap_rec=overlap_rec, write_bed=write_bed, subtract=subtract)
    #move the output to a permanent location only if you want to keep it
    if output_file:
        gen.run_process(["mv", temp_file_name, output_file])
    else:
        bedtools_output = gen.read_many_fields(temp_file_name, "\t")
        gen.remove_file(temp_file_name)
    return(bedtools_output)
def main(): description = "Check whether PTCs are associated with greater rates of exon skipping." args = gen.parse_arguments( description, [ "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file", "out_prefix", "bam_analysis_folder", "number_of_simulations", "simulation_output_folder", "motif_file", "filter_genome_data", "get_SNPs", "process_bams", "simulate_ptc_snps", "motif_complement", "overwrite_intersect", "use_old_sims", "out_of_frame", "simulate_ptcs_with_monomorphic", "generate_monomorphic_indices", "ignore_determine_snp_type", "ignore_psi_calculation", "ptc_location_analysis" ], flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], ints=[7]) gtf, genome_fasta, bams_folder, vcf_folder, panel_file, out_prefix, bam_analysis_folder, number_of_simulations, simulation_output_folder, motif_file, filter_genome_data, get_SNPs, process_bams, simulate_ptc_snps, motif_complement, overwrite_intersect, use_old_sims, out_of_frame, simulate_ptcs_with_monomorphic, generate_monomorphic_indices, ignore_determine_snp_type, ignore_psi_calculation, ptc_location_analysis = args.gtf, args.genome_fasta, args.bams_folder, args.vcf_folder, args.panel_file, args.out_prefix, args.bam_analysis_folder, args.number_of_simulations, args.simulation_output_folder, args.motif_file, args.filter_genome_data, args.get_SNPs, args.process_bams, args.simulate_ptc_snps, args.motif_complement, args.overwrite_intersect, args.use_old_sims, args.out_of_frame, args.simulate_ptcs_with_monomorphic, args.generate_monomorphic_indices, args.ignore_determine_snp_type, args.ignore_psi_calculation, args.ptc_location_analysis start = time.time() # create any necessary output diretories directory_splits = out_prefix.split('/') directory_paths = "/".join(directory_splits[:-1]) gen.create_output_directories(directory_paths) gen.create_directory('temp_data/') CDS_fasta = "{0}_CDS.fasta".format(out_prefix) CDS_bed = "{0}_CDS.bed".format(out_prefix) exon_bed = "{0}_exons.bed".format(out_prefix) filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix) exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix) coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix) if filter_genome_data: #extract and filter CDS coordinates and sequences print("Extracting and filtering CDSs...") bo.extract_cds(gtf, CDS_bed, CDS_fasta, genome_fasta, all_checks=True, uniquify=True, clean_chrom_only=True, full_chr_name=True) gen.get_time(start) #group the CDS sequences into families based on sequence similarity print("Grouping sequences into families...") names = gen.read_fasta(CDS_fasta)[0] gen.find_families_ensembl( "../source_data/GRCh37_ensembl_protein_families.txt", names, "{0}_families.txt".format(out_prefix)) gen.get_time(start) print("Extracting and filtering exons...") #extract exon coordinates bo.extract_exons(gtf, exon_bed) #only leave exons from transcripts that passed quality control in the extract_cds step above. #also only leave a single gene per family bo.filter_bed_from_fasta( exon_bed, CDS_fasta, filtered_exon_bed, families_file="{0}_families.txt".format(out_prefix)) gen.get_time(start) #extract exon-exon junction coordinates print("Extracting exon-exon junctions...") bo.extract_exon_junctions(exon_bed, exon_junctions_file, window_of_interest=2) gen.get_time(start) #make another exons bed that only contains fully coding exons. #This is because in the final analysis, we should only consider fully protein-coding exons. 
#However, for getting the exon junctions we need the full exons file because fully protein-coding exons might #be flanked by exons that are not. This is why we couldn't do this filtering step earlier. print( "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..." ) bo.check_coding(filtered_exon_bed, CDS_bed, coding_exon_bed, remove_overlapping=True) gen.get_time(start) SNP_file = "{0}_SNP_file.txt".format(out_prefix) if out_of_frame: out_prefix = out_prefix + "_out_of_frame" PTC_file = "{0}_ptc_file.txt".format(out_prefix) syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix) CDS_interval_file = "{0}_intervals{1}".format( os.path.splitext(CDS_fasta)[0], os.path.splitext(CDS_fasta)[1]) #check which individuals were included in Geuvadis full_sample_names = os.listdir(bams_folder) full_sample_names = [ i for i in full_sample_names if i[-4:] == ".bam" and "proc" not in i ] sample_names = [(i.split("."))[0] for i in full_sample_names] sample_names = [i for i in sample_names if len(i) > 0] print('{0} samples included in Geuvadis...'.format(len(sample_names))) #for some reason, 17 of the samples from Geuvadis don't appear in the 1000genomes vcf #I'm gonna have to get to the bottom of this at some point #but at the moment I'm just gonna filter them out with open("../source_data/samples_in_vcf.txt") as file: samples_in_vcf = file.readlines() samples_in_vcf = [i.rstrip("\n") for i in samples_in_vcf] sample_names = [i for i in sample_names if i in samples_in_vcf] print('{0} samples also in vcf...'.format(len(sample_names))) sample_file = "{0}_sample_file.txt".format(out_prefix) # create a fasta containing all sequences for exons with snp coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix) bo.fasta_from_intervals(coding_exon_bed, coding_exons_fasta, genome_fasta, names=True) if get_SNPs: #get SNPs for the sample intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix) print("Getting SNP data...") so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file, sample_names, sample_file, intersect_file, out_prefix) print("Calculating SNP positions...") so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file, out_prefix) gen.get_time(start) if ignore_determine_snp_type: pass else: print("Determining SNP type...") so.get_snp_change_status(SNP_file, CDS_fasta, PTC_file, syn_nonsyn_file, out_of_frame=out_of_frame, ref_check=True, headers=True) gen.get_time(start) #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step. print( "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..." 
) PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format( out_prefix) bo.filter_exon_junctions(exon_junctions_file, PTC_file, PTC_exon_junctions_file) #make a list of all the .bam files and modify them to have the full path rather than just the file name bam_files = [ "{0}/{1}".format(bams_folder, i) for i in full_sample_names if (i.split("."))[0] in sample_names ] #in parallel, do the processing on individual .bam files exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format( out_prefix) if bam_analysis_folder == "None": bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix) gen.create_directory(bam_analysis_folder) if process_bams: print("Processing RNA-seq data...") if out_of_frame: splits = exon_junctions_bam_output_folder.split('/') splits[-1] = splits[-1].replace('_out_of_frame', '') exon_junctions_bam_output_folder = "/".join(splits) gen.create_directory(exon_junctions_bam_output_folder) #we have to do it like this because you can't pass flags into run_in_parallel keyword_dict = {"overwrite_intersect": overwrite_intersect} processes = gen.run_in_parallel(bam_files, [ "foo", exon_junctions_file, PTC_exon_junctions_file, bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix, exon_junctions_bam_output_folder, keyword_dict ], nao.process_bam_per_individual, workers=36) for process in processes: process.get() gen.get_time(start) #if required, filter PTCs to only leave ones that overlap motifs from a specified set motif_filtering = False if motif_file != "None": print( "Filtering SNPs based on whether or not they overlap a motif from the specified set..." ) motif_suffix = ((motif_file.split("/"))[-1]).split(".")[0] if motif_complement: out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix) else: out_prefix = "{0}_{1}".format(out_prefix, motif_suffix) filtered_ptc = "{0}_ptc_file.txt".format(out_prefix) so.filter_motif_SNPs(CDS_fasta, PTC_file, motif_file, filtered_ptc, complement=motif_complement) PTC_file = filtered_ptc final_file = "{0}__analysis_final_output.txt".format(out_prefix) if ignore_psi_calculation: pass else: print("Calculating PSI...") bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file) #run the simulation that swaps ptcs for nonsynonymous snps if simulate_ptc_snps: if simulate_ptc_snps and not number_of_simulations: print("Please specify the number of simulations") raise Exception nao.ptc_snp_simulation(out_prefix, simulation_output_folder, PTC_file, syn_nonsyn_file, exon_junctions_file, bam_files, number_of_simulations, exon_junctions_bam_output_folder, use_old_sims=use_old_sims) # run the simulation that picks monomorphic sites if simulate_ptcs_with_monomorphic: if simulate_ptcs_with_monomorphic and not number_of_simulations: print("Please specify the number of simulations") raise Exception coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix) if not os.path.exists(coding_exon_fasta): print('Coding exon fasta is required...') raise Exception nao.ptc_monomorphic_simulation( out_prefix, simulation_output_folder, sample_file, genome_fasta, PTC_file, syn_nonsyn_file, coding_exon_bed, coding_exon_fasta, exon_junctions_file, bam_files, number_of_simulations, generate_indices=generate_monomorphic_indices, use_old_sims=use_old_sims) # get the locations of the ptcs if ptc_location_analysis: print("PTC locations analysis...") snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format( out_prefix) ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format( out_prefix) 
coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix) if not os.path.exists(coding_exon_fasta) or not os.path.exists( snp_relative_exon_position_file) or not os.path.exists( PTC_file): print("Please run --filter_genome_data and --get_SNPs first...") raise Exception # need to work out where and what the analysis outputs need to do so.ptc_locations(PTC_file, snp_relative_exon_position_file, ptc_location_analysis_output_file)
def run_ptc_monomorphic_simulation_instance(simulations, out_prefix, simulation_output_folder, simulation_bam_analysis_output_folder, ptc_file, syn_nonsyn_file, exon_junctions_file, bam_files, nt_indices_files, coding_exon_fasta, parallel=False, use_old_sims=False):
    '''
    Run the required number of PTC monomorphic-site simulations.
    '''

    #iterate over the simulations
    counter = 0
    for simulation_number in simulations:
        counter = gen.update_counter(counter, 10, "SIMULATION ")

        #set up a folder to contain the individual simulation inside the simulations output
        simulation_instance_folder = "{0}/ptc_monomorphic_simulation_run_{1}".format(simulation_output_folder, simulation_number)
        if not use_old_sims:
            gen.create_strict_directory(simulation_instance_folder)
        else:
            gen.create_directory(simulation_instance_folder)

        #copy the ptc file to the directory
        real_ptcs_for_sim_file = "{0}/{1}".format(simulation_output_folder, ptc_file.split('/')[-1])
        gen.copy_file(ptc_file, real_ptcs_for_sim_file)
        ptc_file = real_ptcs_for_sim_file

        #get the list of exons
        exon_list = bo.get_fasta_exon_intervals(coding_exon_fasta)

        #generate pseudo ptc snps
        pseudo_monomorphic_ptc_file = "{0}/pseudo_monomorphic_ptc_file_{1}.txt".format(simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (not os.path.isfile(pseudo_monomorphic_ptc_file)):
            so.generate_pseudo_monomorphic_ptcs(ptc_file, nt_indices_files, exon_list, pseudo_monomorphic_ptc_file)

        #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step when generating pseudo ptcs
        pseudo_monomorphic_ptc_exon_junctions_file = "{0}/filtered_exon_junctions_{1}.bed".format(simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (not os.path.isfile(pseudo_monomorphic_ptc_exon_junctions_file)):
            bo.filter_exon_junctions(exon_junctions_file, pseudo_monomorphic_ptc_file, pseudo_monomorphic_ptc_exon_junctions_file)

        exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(out_prefix)
        gen.create_directory(exon_junctions_bam_output_folder)

        #run the bam analysis for each bam
        #(don't parallelize the bams if you're doing the simulations in parallel)
        kw_dict = {"ptc_snp_simulation": True, "simulation_instance_folder": simulation_instance_folder, "simulation_number": simulation_number}
        if parallel:
            process_bam_per_individual(bam_files, exon_junctions_file, pseudo_monomorphic_ptc_exon_junctions_file, simulation_bam_analysis_output_folder, pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix, exon_junctions_bam_output_folder, kw_dict)
        else:
            processes = gen.run_in_parallel(bam_files, ["foo", exon_junctions_file, pseudo_monomorphic_ptc_exon_junctions_file, simulation_bam_analysis_output_folder, pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix, exon_junctions_bam_output_folder, kw_dict], process_bam_per_individual, workers=36)
            for process in processes:
                process.get()

        #process the final psi for the simulation
        final_file = "{0}/final_output_simulation_{1}.txt".format(simulation_bam_analysis_output_folder, simulation_number)
        bmo.compare_PSI(pseudo_monomorphic_ptc_file, simulation_bam_analysis_output_folder, final_file, sim_number=simulation_number)
def ptc_monomorphic_simulation(out_prefix, simulation_output_folder, sample_file, genome_fasta, ptc_file, syn_nonsyn_file, coding_exon_bed, coding_exon_fasta, exon_junctions_file, bam_files, required_simulations, generate_indices=False, use_old_sims=False):
    '''
    Set up the PTC monomorphic-site simulations and then run them.
    If use_old_sims is True, don't pick new simulant SNPs from monomorphic sites.
    '''

    print("Running simulation picking monomorphic sites that have the same ancestral allele as a PTC snp...")

    #set up the simulation output folder
    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_monomorphic_sites".format(out_prefix)
    if not use_old_sims and generate_indices:
        #if the simulation folder we are specifying already exists, delete it and start again
        gen.create_strict_directory(simulation_output_folder)
    else:
        gen.create_directory(simulation_output_folder)

    #set up the simulation bam analysis output folder
    simulation_bam_analysis_output_folder = "{0}_simulate_ptc_monomorphic_sites_bam_analysis".format(out_prefix)

    #create the filepaths that hold the positions of non-mutated sites
    nt_indices_files = {
        "A": "{0}/nt_indices_no_mutations_A.fasta".format(simulation_output_folder),
        "C": "{0}/nt_indices_no_mutations_C.fasta".format(simulation_output_folder),
        "G": "{0}/nt_indices_no_mutations_G.fasta".format(simulation_output_folder),
        "T": "{0}/nt_indices_no_mutations_T.fasta".format(simulation_output_folder),
    }
    if generate_indices and not use_old_sims:
        get_non_mutation_indices(simulation_output_folder, sample_file, coding_exon_bed, out_prefix, genome_fasta, nt_indices_files)

    if not use_old_sims:
        #if the folder we are specifying already exists, delete it and start again
        gen.create_strict_directory(simulation_bam_analysis_output_folder)
    else:
        gen.create_directory(simulation_bam_analysis_output_folder)

    #create a list of simulations to iterate over
    simulations = list(range(1, required_simulations + 1))
    #if you're only doing one simulation, don't parallelize the simulations;
    #parallelize the processing of bams like for true data
    if required_simulations > 1:
        processes = gen.run_in_parallel(simulations, ["foo", out_prefix, simulation_output_folder, simulation_bam_analysis_output_folder, ptc_file, syn_nonsyn_file, exon_junctions_file, bam_files, nt_indices_files, coding_exon_fasta, True, use_old_sims], run_ptc_monomorphic_simulation_instance, workers=36)
        for process in processes:
            process.get()
    else:
        run_ptc_monomorphic_simulation_instance([1], out_prefix, simulation_output_folder, simulation_bam_analysis_output_folder, ptc_file, syn_nonsyn_file, exon_junctions_file, bam_files, nt_indices_files, coding_exon_fasta, False, use_old_sims)