def get_non_mutation_indices(simulation_output_folder, sample_file, coding_exon_bed, out_prefix, genome_fasta, nt_indices_files):
    '''
    Get the indices of all the positions where no SNPs/other variants are reported.
    '''

    # to do:
    # 1. need to convert bed to not have chr name
    # for each vcf file:
    # 1. subtract bed to get all locations where there isn't a mutation, using intersect_bed with subtract=True
    #    bmo.intersect_bed(file1, file2, output_file, subtract=True)
    # 2. using that, extract all sequence pieces using fasta_from_intervals; need to decide if names=True or names=False
    #    fasta_from_intervals(intersect_bed_file, fasta_file, genome_fasta, force_strand=True, names=False)
    # 3. from this fasta file, create a file that contains, for each exon, the indices at which each nt resides
    #    split this so each chr has its own file?
    #    bo.extract_nt_indices(fasta_file, output_file)

    # file to contain the bed intervals with reformatted names
    coding_exon_bed_out = "{0}/format_{1}".format(simulation_output_folder, coding_exon_bed.split('/')[-1])
    # file to contain the regions without a mutation
    coding_exon_bed_out_subtract = "{0}/subtract_{1}".format(simulation_output_folder, coding_exon_bed.split('/')[-1])
    # file to contain the subtracted regions with the original names restored
    coding_exon_bed_out_subtract_format = "{0}/subtract_format_{1}".format(simulation_output_folder, coding_exon_bed.split('/')[-1])
    # file to contain the fasta output of regions containing no mutation
    fasta_no_mutations = "{0}/exon_regions_no_mutations.fasta".format(simulation_output_folder)

    # change the names in the bed file to correspond to the mutation vcf
    bo.change_bed_names(coding_exon_bed, coding_exon_bed_out, full_names=True, header=False)
    # subtract the sample intervals from the renamed bed, leaving only regions that contain no mutations
    bmo.intersect_bed(coding_exon_bed_out, sample_file, write_both=False, output_file=coding_exon_bed_out_subtract, no_dups=False, subtract=True, intersect=True)
    # restore the original names
    bo.change_bed_names(coding_exon_bed_out_subtract, coding_exon_bed_out_subtract_format, full_names=False, header=False)
    # generate fasta of all the regions
    bo.fasta_from_intervals(coding_exon_bed_out_subtract_format, fasta_no_mutations, genome_fasta, force_strand=True, names=True)
    # extract the indices of each location where a mutation doesn't occur
    bo.extract_nt_indices(fasta_no_mutations, nt_indices_files)
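# A minimal usage sketch for get_non_mutation_indices (not part of the pipeline run);
# every path below is hypothetical, and the helper modules bo/bmo must be importable.
#
# get_non_mutation_indices(
#     simulation_output_folder="temp_data/simulation_1",
#     sample_file="temp_data/sample_mutations.vcf",      # variant intervals to subtract
#     coding_exon_bed="results/human_coding_exons.bed",
#     out_prefix="results/human",
#     genome_fasta="../source_data/Homo_sapiens.GRCh37.dna_sm.fa",
#     nt_indices_files="temp_data/simulation_1/nt_indices.txt",
# )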
def check_coding(exons_file, CDSs_file, outfile, remove_overlapping=False):
    '''
    Given a bed file of exon coordinates and a bed file of CDS coordinates,
    write a new bed file that only contains those exon coordinates from the former file that
    1) are fully coding
    2) are internal
    NB! Assumes that all the coordinates are from non-overlapping transcripts.
    If this is not the case, set remove_overlapping to True and it'll remove overlapping intervals.
    '''
    if remove_overlapping:
        bmo.sort_bed(exons_file, exons_file)
        remove_overlaps(exons_file, exons_file)
    # filter out anything that isn't fully coding
    # you have to write_both because you want to make sure that exons
    # haven't been kept because of an overlap to a transcript that doesn't appear in the exons file
    temp_file = "temp_data/temp{0}.txt".format(random.random())
    bmo.intersect_bed(exons_file, CDSs_file, overlap=1, overlap_rec=True, output_file=temp_file, force_strand=True, write_both=True, no_dups=False, no_name_check=False)
    # filter out terminal exons
    # in theory, there shouldn't be any left after the previous step
    # in practice, there may be unannotated UTRs, so it looks like we have a fully coding terminal exon,
    # whereas in reality, the exon is only partially coding
    temp_file2 = "temp_data/temp{0}.txt".format(random.random())
    with open(temp_file2, "w") as o_file:
        # figure out the rank of the last exon for each transcript
        filt_exons = gen.read_many_fields(exons_file, "\t")
        filt_exons = [i for i in filt_exons if len(i) > 3]
        names = [i[3].split(".") for i in filt_exons]
        names = gen.list_to_dict(names, 0, 1, as_list=True)
        names = {i: max([int(j) for j in names[i]]) for i in names}
        coding_exons = gen.read_many_fields(temp_file, "\t")
        for exon in coding_exons:
            overlap_name = exon[9].split(".")
            if overlap_name[0] in names:
                name = exon[3].split(".")
                # skip first exons...
                if name[-1] != "1":
                    last_exon = names[name[0]]
                    # ...and last exons
                    if int(name[-1]) != last_exon:
                        exon = [str(i) for i in exon[:6]]
                        o_file.write("\t".join(exon))
                        o_file.write("\n")
    bmo.sort_bed(temp_file2, temp_file2)
    gen.run_process(["mergeBed", "-i", temp_file2, "-c", "4,5,6", "-o", "distinct,distinct,distinct"], file_for_output=outfile)
    gen.remove_file(temp_file)
    gen.remove_file(temp_file2)
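# A minimal usage sketch for check_coding, with hypothetical file paths; the exon and
# CDS names are assumed to follow the "transcriptID.rank" convention that the function
# parses (e.g. "ENST00000123456.2" for the second exon of that transcript).
#
# check_coding("results/human_exons.bed",
#              "results/human_CDSs.bed",
#              "results/human_internal_coding_exons.bed",
#              remove_overlapping=True)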
def main(): description = "Look at disease snps." arguments = ["disease_snps_file", "output_directory", "results_prefix", "simulations", "ese_file", "intersect_snps", "get_relative_positions", "get_snp_status", "get_info", "simulate_ptc_location", "get_possible_ptc_locations", "required_simulations", "get_overlaps", "intersect_ptcs", "compare_ptcs" ,"get_introns", "compare_distances", "clinvar_ptc_locations", "location_simulation", "exclude_cpg", "ese_hit_simulation", "only_disease", "only_kgenomes", "only_ese", "get_unique_ptcs", "get_unique_rel_pos", "excess_test", "disease_locations_chisquare"] args = gen.parse_arguments(description, arguments, flags = [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, 21, 22, 23,24,25,26,27], ints=[3]) disease_snps_file, output_directory, results_prefix, simulations, ese_file, intersect_snps, get_relative_positions, get_snp_status, get_info, simulate_ptc_location, get_possible_ptc_locations, required_simulations, get_overlaps, intersect_ptcs, compare_ptcs, get_introns, compare_distances, clinvar_ptc_locations, location_simulation, exclude_cpg, ese_hit_simulation, only_disease, only_kgenomes, only_ese, get_unique_ptcs, get_unique_rel_pos, excess_test, disease_locations_chisquare = args.disease_snps_file, args.output_directory, args.results_prefix, args.simulations, args.ese_file, args.intersect_snps, args.get_relative_positions, args.get_snp_status, args.get_info, args.simulate_ptc_location, args.get_possible_ptc_locations, args.required_simulations, args.get_overlaps, args.intersect_ptcs, args.compare_ptcs, args.get_introns, args.compare_distances, args.clinvar_ptc_locations, args.location_simulation, args.exclude_cpg, args.ese_hit_simulation, args.only_disease, args.only_kgenomes, args.only_ese, args.get_unique_ptcs, args.get_unique_rel_pos, args.excess_test, args.disease_locations_chisquare if simulations and not isinstance(simulations, int): print("\nERROR: Please provide the correct number for simulations.\n") raise Exception # create the output directory if it doesnt already exist gen.create_output_directories(output_directory) # disease_snps_file = "./source_data/clinvar_20180429.vcf.gz" disease_snps_index_file = "{0}.tbi".format(disease_snps_file) if not os.path.isfile(disease_snps_file) or not os.path.isfile(disease_snps_index_file): print("\nERROR: Please provide the required disease SNPs file(s).\n") raise Exception # intersect the coding exons with the disease snps exon_bed = "{0}_coding_exons.bed".format(results_prefix) disease_snp_intersect_file_vcf = "{0}/disease_snp_intersect.vcf".format(output_directory) disease_snp_intersect_file_bed = "{0}/disease_snp_intersect.bed".format(output_directory) if intersect_snps: print("Intersecting snps with exons") so.intersect_snps_parallel(exon_bed, disease_snps_file, disease_snp_intersect_file_vcf) so.intersect_vcf_to_bed(exon_bed, disease_snp_intersect_file_vcf, disease_snp_intersect_file_bed, change_names = True) # get relative positions of the snps in cds and exons full_bed = "{0}_CDS.bed".format(results_prefix) disease_snps_relative_exon_positions = "{0}/disease_snp_relative_exon_positions.bed".format(output_directory) disease_snps_relative_cds_positions = "{0}/disease_snp_relative_cds_positions.bed".format(output_directory) if get_relative_positions: print("Getting snp relative positions...") so.get_snp_relative_exon_position(disease_snp_intersect_file_bed, disease_snps_relative_exon_positions) # output to var because this is how the function was made relative_positions = 
gen.read_many_fields(disease_snps_relative_exon_positions, "\t") so.get_snp_relative_cds_position(relative_positions, disease_snps_relative_cds_positions, full_bed) # get the change status of the snps to check them cds_fasta = "{0}_CDS.fasta".format(results_prefix) disease_ptcs_file = "{0}/disease_ptcs.txt".format(output_directory) disease_other_file = "{0}/disease_other_snps.txt".format(output_directory) if get_snp_status: print("Getting snp status...") so.get_snp_change_status(disease_snps_relative_cds_positions, cds_fasta, disease_ptcs_file, disease_other_file) # get intersect between the clinvar ptcs and 1000 genomes ptcs ptc_file = "{0}_ptc_file.txt".format(results_prefix) ptc_intersect_file = "{0}/ptc_intersect.bed".format(output_directory) if intersect_ptcs: temp_disease_ptc_file = "temp_data/{0}".format(random.random()) dso.refactor_ptc_file(disease_ptcs_file, temp_disease_ptc_file) temp_k_genomes_ptc_file = "temp_data/{0}".format(random.random()) dso.refactor_ptc_file(ptc_file, temp_k_genomes_ptc_file, header=True) bao.intersect_bed(temp_k_genomes_ptc_file, temp_disease_ptc_file, write_both = True, no_dups=False, output_file = ptc_intersect_file) gen.remove_file(temp_disease_ptc_file) gen.remove_file(temp_k_genomes_ptc_file) # get a list of ptcs unique to each dataset unique_ptcs = "{0}/disease_ptcs_no_intersect.bed".format(output_directory) unique_ptcs_kgenomes = "{0}/kgenomes_ptcs_no_intersect.bed".format(output_directory) if get_unique_ptcs: dso.get_unique_ptcs(disease_ptcs_file, ptc_file, ptc_intersect_file, unique_ptcs, unique_ptcs_kgenomes) # get the relative positions of the ptcs unique to each dataset unique_ptcs_rel_pos_file = "{0}/disease_ptcs_no_intersect_rel_pos.bed".format(output_directory) kgenomes_relative_positions = "{0}_PTC_relative_exon_positions.bed".format(results_prefix) kgenomes_unique_ptcs_rel_pos_file = "{0}/kgenomes_ptcs_no_intersect_rel_pos.bed".format(output_directory) if get_unique_rel_pos: dso.get_unique_rel_pos(unique_ptcs, disease_snps_relative_exon_positions, unique_ptcs_kgenomes, kgenomes_relative_positions, unique_ptcs_rel_pos_file, kgenomes_unique_ptcs_rel_pos_file) # get the ese file name ese_file_name = ese_file.split('/')[-1].split('.')[0] # get the coding exons fasta file path coding_exons_fasta = "{0}_coding_exons.fasta".format(results_prefix) # snp_relative_positions_file = "{0}_SNP_relative_exon_position.bed".format(results_prefix) # simulation picking random reference allele matched simulants clinvar_location_simulation_file = "{0}/clinvar_ptc_location_simulation.csv".format(output_directory) clinvar_location_simulation_ese_overlap_file = "{0}/clinvar_ptc_location_simulation_{1}_ese_overlaps.csv".format(output_directory, ese_file_name) kgenomes_location_simulation_file = "{0}/1000_genomes_simulations.csv".format(output_directory) kgenomes_location_simulation_ese_overlap_file = "{0}/1000_genomes_simulations_ese_overlaps.csv".format(output_directory) if location_simulation: if not only_kgenomes: print('Running ptc location simulation on disease PTCs...') dso.ptc_location_simulation(unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, clinvar_location_simulation_file, clinvar_location_simulation_ese_overlap_file, ese_file, only_ese, exclude_cpg) if not only_disease: print('Running ptc location simulation on 1000 genomes PTCs...') dso.ptc_location_simulation(kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, kgenomes_location_simulation_file, kgenomes_location_simulation_ese_overlap_file, ese_file, only_ese, 
exclude_cpg) window_start = 3 window_end = 69 clinvar_ese_hit_simulation_file = "{0}/clinvar_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name) kgenomes_ese_hit_simulation_file = "{0}/1000_genomes_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name) # do a simulation picking only sites from within the region if ese_hit_simulation: if not only_kgenomes: print("Simulating ESE hits on the {0}-{1} region for disease PTCs...".format(window_start, window_end)) dso.ese_hit_simulation(unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, clinvar_ese_hit_simulation_file, ese_file, window_start, window_end, exclude_cpg) if not only_disease: print("Simulating ESE hits on the {0}-{1} region for 1000 genomes PTCs...".format(window_start, window_end)) dso.ese_hit_simulation(kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, kgenomes_ese_hit_simulation_file, ese_file, window_start, window_end, exclude_cpg) excess_test_file = "{0}/clinvar_ptc_{1}_{2}_excesses.csv".format(output_directory, window_start, window_end) if excess_test: dso.excess_test(unique_ptcs_rel_pos_file, coding_exons_fasta, excess_test_file) location_test_file = "{0}/clinvar_locations_chisquare.csv".format(output_directory) if disease_locations_chisquare: dso.disease_ptc_location_test(unique_ptcs_rel_pos_file, coding_exons_fasta, location_test_file)
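# A hypothetical command line for this script (saved here as disease_snps.py), assuming
# gen.parse_arguments takes the first five arguments positionally and exposes the flagged
# ones as --name switches; its exact conventions may differ, and all paths are examples.
#
# python disease_snps.py ./source_data/clinvar_20180429.vcf.gz clinvar_output \
#     results/human 1000 ./source_data/eses.txt \
#     --intersect_snps --get_relative_positions --get_snp_status --location_simulation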
def main(): description = "Take an output file from prepare_FANTOM.py and make a file with the expression data for each gene." args = gen.parse_arguments(description, [ "clean_fasta", "promoters_file_name", "cage_file_name", "out_prefix", "TPM_threshold" ], ints=[4]) [ clean_fasta, promoters_file_name, cage_file_name, out_prefix, TPM_threshold ] = [ args.clean_fasta, args.promoters_file_name, args.cage_file_name, args.out_prefix, args.TPM_threshold ] #extract transcript coordinates transcripts_file = "{0}_transcripts_clean.bed".format(out_prefix) bo.extract_features("../source_data/Homo_sapiens.GRCh37.87.gtf", transcripts_file, ["transcript"]) #get the names of the transcripts you're interested names = gen.read_fasta(clean_fasta)[0] #write the coordinates of the promoter regions of those transcripts to file with open(promoters_file_name, "w") as out_file, open(transcripts_file, "r") as in_file: for line in in_file: parsed = (line.rstrip("\n")).split("\t") #parse out the transcript name name = parsed[3].split(".")[0] #skip transcripts that aren't among your transcripts of interest if name in names: #determine the coordinates of a 1001 bp region centered on the TSS (the supposed promoter region) if parsed[5] == "+": current_line = [ "chr" + parsed[0], int(parsed[1]) - 500, int(parsed[1]) + 500 + 1, name, ".", parsed[5] ] elif parsed[5] == "-": current_line = [ "chr" + parsed[0], int(parsed[2]) - 500 - 1, int(parsed[2]) + 500, name, ".", parsed[5] ] else: RuntimeError("Invalid strand information!") out_file.write("\t".join([str(i) for i in current_line])) out_file.write("\n") #check which CAGE peaks overlap which promoters overlapping_peaks_file = "{0}_FANTOM_overlap_peaks.bed".format(out_prefix) bmo.intersect_bed(cage_file_name, promoters_file_name, output_file=overlapping_peaks_file, force_strand=True, write_both=True, no_dups=False) #for each transcript, get all overlapping peaks #(store only the expression information) peaks_dict = {name: [] for name in names} with open(overlapping_peaks_file, "r") as peaks: for peak in peaks: peak = peak.split("\t") name = peak[9] peaks_dict[name].append(peak[3]) #for each transcript, #store the mean TPM within each tissue (averaged over the different peaks #associated to that transcript) mean_dict = {} np.set_printoptions(suppress=True) for name in peaks_dict: if len(peaks_dict[name]) > 0: current_mat = np.array([[float(j) for j in i.split("|")] for i in peaks_dict[name]]) means = np.mean(current_mat, axis=0) mean_dict[name] = means #calculate expression parameters final_dict = {} for gene in mean_dict: expressed = len([i for i in mean_dict[gene] if i > TPM_threshold]) fraction = expressed / len(mean_dict[gene]) maximum = np.max(mean_dict[gene]) median_expr = np.median(mean_dict[gene]) median_if_expressed = np.median( [i for i in mean_dict[gene] if i > TPM_threshold]) final_dict[gene] = [ fraction, maximum, median_expr, median_if_expressed ] output_file_name = "{0}_FANTOM_expression_per_transcript.txt".format( out_prefix) with open(output_file_name, "w") as file: file.write("gene\tbreadth\tmax\tmedian\tmedian_expr\n") for i in sorted(list(final_dict.keys())): if final_dict[i] != None: file.write("\t".join([i] + [str(j) for j in final_dict[i]])) file.write("\n")
def process_bam_per_individual(bam_files, global_exon_junctions_file, PTC_exon_junctions_file, out_folder, PTC_file, syn_nonsyn_file, out_prefix, exon_junctions_bam_output_folder, kw_dict):
    '''
    Do all of the processing on an individual bam, from filtering out low quality data
    to mapping reads to exon-exon junctions.
    For each exon, return information on how many reads fall at different exon-exon junctions.
    '''

    # parse keyword_dict
    # it's done like this to make it easier to parallelize this process
    ptc_snp_simulation = kw_dict.get("ptc_snp_simulation", False)
    simulation_instance_folder = kw_dict.get("simulation_instance_folder", None)
    simulation_number = kw_dict.get("simulation_number", None)
    overwrite_intersect = kw_dict.get("overwrite_intersect", False)
    phase = kw_dict.get("phase", False)

    bam_file_number = len(bam_files)
    for pos, bam_file in enumerate(bam_files):

        # Process:
        # 1. get the number of reads in the bam
        # 2. filter out reads that don't overlap exon-exon junctions
        # 3. filter out reads that don't overlap exon-exon junctions flanking PTC-containing exons
        # 4. filter bams by quality
        #    This gives us a set of "good" quality reads.
        # 5. scale down the total read number proportionally to how many reads were lost in the quality filtering
        # 6. count reads either skipping or including each exon

        print("{0}/{1}: {2}".format(pos, bam_file_number, bam_file))
        sample_name = (bam_file.split("/")[-1]).split(".")[0]
        if ptc_snp_simulation:
            output_file = "{0}/{1}_simulation_{2}.txt".format(out_folder, sample_name, simulation_number)
        else:
            output_file = "{0}/{1}.txt".format(out_folder, sample_name)

        # folder that will contain all of the intermediate steps in the processing of the bam file
        if ptc_snp_simulation:
            proc_folder = "{0}/bam_proc_files".format(simulation_instance_folder)
        else:
            proc_folder = "{0}__analysis_bam_proc_files".format(out_prefix)
        gen.create_output_directories(proc_folder)

        bam_file_parts = os.path.split(bam_file)
        mapq_filtered_bam = "{0}/{1}_filtered_mapq.bam".format(proc_folder, bam_file_parts[1])
        mapq_flag_filtered_bam = "{0}_flag.bam".format(mapq_filtered_bam[:-4])
        mapq_flag_xt_filtered_bam = "{0}_xt.bam".format(mapq_flag_filtered_bam[:-4])
        mapq_flag_xt_nm_filtered_bam = "{0}_nm.bam".format(mapq_flag_xt_filtered_bam[:-4])

        if not os.path.isfile(output_file):

            # 1: get a count of the total reads in the sample, which can be used for normalisation
            # read_count is initialized to None for safety: if the process fails, it won't
            # silently keep whatever value it had at the end of the previous loop iteration.
            # the count is also cached on disk per sample because this step takes forever;
            # we don't want to redo it every time.
            read_count_file_name = "{0}/read_count_{1}.txt".format(exon_junctions_bam_output_folder, sample_name)
            read_count = None
            if os.path.isfile(read_count_file_name):
                with open(read_count_file_name) as file:
                    read_count = int("".join(file))
            else:
                read_count = int(gen.run_process(["samtools", "view", "-c", bam_file]))
                with open(read_count_file_name, "w") as file:
                    file.write(str(read_count))

            # 2: intersect the bam with all exon-exon junctions
            # only has to be done once for each bam
            # also removing "_out_of_frame" from out_prefix if it is present
            global_out_prefix = out_prefix
            if "out_of_frame" in global_out_prefix:
                global_out_prefix = global_out_prefix[:6]
            global_intersect_bam = "{0}/{1}_exon_junctions.bam".format(exon_junctions_bam_output_folder, bam_file_parts[1][:-4])
            if not os.path.isfile(global_intersect_bam) or overwrite_intersect:
                # intersect the filtered bam and the global exon junctions file
                bmo.intersect_bed(bam_file, global_exon_junctions_file, output_file=global_intersect_bam, intersect_bam=True)

            # 3: filter to relevant exon-exon junctions
            # intersect junctions and .bam, and write down the overlapping .bam alignments, without counting
            # this uses intersect_bed with the intersect_bam parameter
            intersect_bam = "{0}/{1}_exon_junction_bam_intersect.bam".format(proc_folder, bam_file_parts[1][:-4])
            # intersect the filtered bam and the ptc exon junctions file
            bmo.intersect_bed(global_intersect_bam, PTC_exon_junctions_file, output_file=intersect_bam, intersect_bam=True)

            # count how many reads there are in the sample after filtering to relevant
            # exon-exon junctions but before quality filtering
            read_count_junctions_no_filter = int(gen.run_process(["samtools", "view", "-c", intersect_bam]))

            # 4: filter .bam alignments by quality
            # takes both upper and lower mapq thresholds
            # outputs a bam file with "_quality_filter_{lower_lim}_{upper_lim}" appended
            # this needs to be done twice and the results merged, because we use both mapq intervals used by Geuvadis
            mapq_intervals = [[251, 255], [175, 181]]
            mapq_filter_filelist = []

            for mapq_interval in mapq_intervals:
                lower_threshold, upper_threshold = mapq_interval[0], mapq_interval[1]
                mapq_filter_file = "{0}/{1}_mapq_filter_{2}_{3}.bam".format(proc_folder, bam_file_parts[1][:-4], lower_threshold, upper_threshold)
                mapq_filter_filelist.append(mapq_filter_file)
                # run the mapq filter
                bmo.bam_quality_filter(intersect_bam, mapq_filter_file, quality_greater_than_equal_to=lower_threshold, quality_less_than_equal_to=upper_threshold)

            # merge the files in the filelist
            bmo.merge_bams(mapq_filter_filelist, mapq_filtered_bam)

            # filter by flags: keep only mapped reads
            # leaves mapped unpaired and paired reads
            bmo.bam_flag_filter(mapq_filtered_bam, mapq_flag_filtered_bam, get_mapped_reads=True)

            # filter bam by XT tag (XT=U)
            bmo.bam_xt_filter(mapq_flag_filtered_bam, mapq_flag_xt_filtered_bam, xt_filter="U")

            # filter bam by NM tag (NM<=6)
            bmo.bam_nm_filter(mapq_flag_xt_filtered_bam, mapq_flag_xt_nm_filtered_bam, nm_less_equal_to=6)

            # 5: scale down the initial count of reads in the sample by the proportion lost during quality filtering
            read_count_junctions_filter = int(gen.run_process(["samtools", "view", "-c", mapq_flag_xt_nm_filtered_bam]))
            prop_kept = np.divide(read_count_junctions_filter, read_count_junctions_no_filter)
            read_count = prop_kept * read_count

            # convert to sam format and phase reads
            intersect_sam = "{0}_phased.sam".format(mapq_flag_xt_nm_filtered_bam[:-4])
            if phase:
                temp_snp_file = "temp_data/snps{0}.txt".format(random.random())
                so.merge_and_header(PTC_file, syn_nonsyn_file, temp_snp_file)
                bmo.phase_bams(temp_snp_file, mapq_flag_xt_nm_filtered_bam, sample_name, intersect_sam)
                gen.remove_file(temp_snp_file)
            else:
                gen.run_process(["samtools", "view", mapq_flag_xt_nm_filtered_bam], file_for_output=intersect_sam)

            # 6: count the number of reads supporting either the skipping or the inclusion of each exon
            junctions = bmo.read_exon_junctions(PTC_exon_junctions_file)
            bmo.count_junction_reads(intersect_sam, junctions, output_file, read_count)
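# The uncalled helper below is a worked example (invented numbers, not pipeline code)
# of the step-5 normalisation arithmetic above: if 150,000 of 200,000 junction reads
# survive quality filtering, the total read count is scaled by 0.75.
def _example_read_count_scaling():
    '''
    Return the quality-scaled read count for a toy sample.
    '''
    read_count = 50000000                        # total reads in the sample
    reads_before, reads_after = 200000, 150000   # junction reads before/after filtering
    prop_kept = np.divide(reads_after, reads_before)  # 0.75
    return prop_kept * read_count                # 37500000.0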