def test_sort_bed(self):
    '''
    sort_bed applied to the stored input must reproduce the stored expected file.
    '''
    source_path = "tests/sort_bed_input.bed"
    reference_path = "tests/sort_bed_expected.bed"
    result_path = "tests/sort_bed_observed.bed"
    #remove any observed file left behind by an earlier run
    hk.remove_file(result_path)
    sort_bed(source_path, result_path)
    #read the expected file first, then the freshly written one
    reference_rows, result_rows = [
        rw.read_many_fields(path, "\t") for path in (reference_path, result_path)
    ]
    self.assertEqual(reference_rows, result_rows)
def test_intersect_bed_no_dups(self):
    '''
    intersect_bed with no_dups=True must reproduce the stored expected file.
    '''
    file_template = "tests/intersect_bed_no_dups_{0}.bed"
    first_bed = file_template.format("input_A")
    second_bed = file_template.format("input_B")
    reference_path = file_template.format("expected")
    result_path = file_template.format("observed")
    #remove any observed file left behind by an earlier run
    hk.remove_file(result_path)
    intersect_bed(first_bed, second_bed, output_file=result_path, no_dups=True)
    reference_rows = rw.read_many_fields(reference_path, "\t")
    result_rows = rw.read_many_fields(result_path, "\t")
    self.assertEqual(reference_rows, result_rows)
def main():
    '''
    Command-line entry point.
    Parse the arguments, read the motifs, set up the three region fastas
    (suffixes _uf, _c and _df) plus the lookup dictionary that goes with them,
    optionally restrict the RBPs to those flagged "True" in all three
    information-content files, and pass everything on to do_dS_calc.
    '''
    parser = argparse.ArgumentParser(description="Calculate the conservation of a series of RBP motifs in exon cores and flanks.")

    #positional arguments, in command-line order: (name, type, help)
    positional_specs = [
        ("features_file_name", str, "name of GTF file with genome features"),
        ("dataset_name", str, "dataset name"),
        ("genome", str, "genome assembly name"),
        ("RBP_file_name", str, "name of file with RBP motifs"),
        ("correspondances_file_name", str, "name of file with correspondances between genes in dataset and orthologs"),
        ("fasta_file_prefix", str, "prefix for fasta files with the sequences"),
        ("output_file_name", str, "file for output data"),
        ("output_folder_name", str, "folder that will contain simulated dS scores"),
        ("alignment_folder_name", str, "name of folder that contains alignments"),
        ("n_sim", int, "number of simulants"),
    ]
    for arg_name, arg_type, arg_help in positional_specs:
        parser.add_argument(arg_name, type = arg_type, help = arg_help)

    #boolean switches: (name, help)
    switch_specs = [
        ("markov", "Should simulants be generated using a Markov model?"),
        ("new_filters", "Should simulants be generated using the sampling method but removing existing motifs and capping mononucleotide runs?"),
        ("goldman_yang", "Should the Goldman & Yang method (rather tahn Yang & Nielsen) be used for calculating dS?"),
        ("validity", "Should RBPs be filtered based on information content?"),
    ]
    for switch_name, switch_help in switch_specs:
        parser.add_argument("--" + switch_name, dest = switch_name, action = "store_true", help = switch_help)

    args = parser.parse_args()

    #dictionary with RBPs as keys and lists of associated motifs as values
    motif_dict = rw.read_motifs(args.RBP_file_name)

    prefix = args.fasta_file_prefix
    uf_fasta = "{0}_uf.fasta".format(prefix)
    df_fasta = "{0}_df.fasta".format(prefix)
    c_fasta = "{0}_c.fasta".format(prefix)

    fs = Feature_Set(args.features_file_name, args.genome)
    fs.set_dataset(args.dataset_name)
    transcripts = fs.get_transcripts()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    CDS = fs.get_CDS()

    #everything needed for mapping between exonic subregions and full CDSs
    region_fastas = [uf_fasta, c_fasta, df_fasta]
    regions_dict = {
        "gene name dict": gene_name_dict,
        "CDS": CDS,
        #same path as the fasta, with the 6-character ".fasta" ending swapped for ".bed"
        "regions bed file": [fasta[:-6] + ".bed" for fasta in region_fastas],
        "fastas": region_fastas,
    }

    if args.validity:
        #leave only those RBPs that pass the information content cutoff in all three regions
        validity_paths = [
            "{0}/sufficient_information_fraction05_fiveprime.csv".format(args.output_folder_name),
            "{0}/sufficient_information_fraction05_core.csv".format(args.output_folder_name),
            "{0}/sufficient_information_fraction05_threeprime.csv".format(args.output_folder_name),
        ]
        validity_tables = [rw.read_many_fields(path, "\t") for path in validity_paths]
        validity_5, validity_core, validity_3 = [list_to_dict(table, 0, 1) for table in validity_tables]
        protein_names = sorted(
            name for name in motif_dict.keys()
            if all(lookup[name] == "True" for lookup in (validity_5, validity_3, validity_core))
        )
    else:
        protein_names = sorted(motif_dict.keys())

    #run conservation analysis
    do_dS_calc(
        protein_names, motif_dict, uf_fasta, df_fasta, c_fasta, args.n_sim,
        args.output_folder_name, args.correspondances_file_name,
        args.alignment_folder_name, args.output_file_name, regions_dict,
        args.markov, args.new_filters, args.goldman_yang,
    )
def test_intersect_bed_hit_count_unsorted(self):
    '''
    intersect_bed with hit_count=True and no_dups=False must reproduce the
    stored expected file.
    '''
    file_template = "tests/intersect_bed_hit_count_unsorted_{0}.bed"
    first_bed = file_template.format("input_A")
    second_bed = file_template.format("input_B")
    reference_path = file_template.format("expected")
    result_path = file_template.format("observed")
    #remove any observed file left behind by an earlier run
    hk.remove_file(result_path)
    intersect_bed(first_bed, second_bed, output_file=result_path, hit_count=True, no_dups=False)
    reference_rows = rw.read_many_fields(reference_path, "\t")
    result_rows = rw.read_many_fields(result_path, "\t")
    self.assertEqual(reference_rows, result_rows)
def get_pp(outroot, subst_model, phy_file, model_file, separate_to_concat_mapping, combined_dict, tuples_mapping, min_inf = None, parse_output = True):
    '''
    Get prior probabilities for all the bases at the different sites in an MSA.
    Note that for phyloFit these are posterior probabilities but they
    are priors for INSIGHT.

    ARGUMENTS
    outroot: value passed to --out-root; the probabilities are read back from
        "<outroot>.postprob".
    subst_model: value passed to --subst-mod.
    phy_file: alignment handed to phyloFit (declared as PHYLIP format).
    model_file: value passed to --init-model.
    separate_to_concat_mapping: {transcript: {position: concatenated coordinate}}.
    combined_dict: {transcript: iterable of positions}; decides which positions
        are reported for each transcript.
    tuples_mapping: maps a concatenated coordinate to the identifier found in
        the second column of the .postprob file.
    min_inf: if truthy, passed to phyloFit through the -I flag.
    parse_output: if False, phyloFit is run but its output is not parsed.

    RETURNS
    {transcript: {position: last four fields of the matching .postprob row}}
    if parse_output is True, None otherwise.
    '''
    #you don't want to compute a tree, just get the posterior probabilities for an existing tree
    #hence all the flags from --post_probs onwards
    arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot,
                 "--subst-mod", subst_model, "--msa-format", "PHYLIP",
                 "--post-probs", "--scale-only", "--no-rates", "--no-freqs", phy_file]
    if min_inf:
        #process arguments have to be strings, so a numeric threshold is converted
        arguments.extend(["-I", str(min_inf)])
    run_process(arguments)

    if not parse_output:
        return None

    #parse into convenient dictionary
    pp_file = "{0}.postprob".format(outroot)
    pp = rw.read_many_fields(pp_file, " ")
    #splitting on single spaces leaves empty fields wherever there are runs of spaces
    pp = [[j for j in i if j] for i in pp]
    #the outgroup nodes are labelled from the inside out, starting from 1
    #NOTE(review): the last four fields are presumably the per-base probabilities - confirm
    pp = {i[1]: i[-4:] for i in pp}

    #map from coordinates in the concatenated alignment to positions in individual CDSs
    pp_final = {}
    for trans in separate_to_concat_mapping:
        concat_coords = separate_to_concat_mapping[trans]
        pp_final[trans] = {position: pp[tuples_mapping[concat_coords[position]]]
                           for position in combined_dict[trans]}
    return pp_final
def main():
    '''
    Command-line entry point: record where peaks fall within the regions listed
    in the exon starts file and write the resulting distance matrices.
    '''
    description = "Record the distribution of peaks for different exons."
    args = hk.parse_arguments(description, ["peaks_file", "gtf", "exon_starts_file", "output_file", "reads_file", "from_end", "intronic", "limit", "nts_before_start", "noncoding", "reads_mode"], flags = [5, 6, 9, 10], ints = [7, 8])

    peaks_file = args.peaks_file
    gtf = args.gtf
    exon_starts_file = args.exon_starts_file
    output_file = args.output_file
    reads_file = args.reads_file
    limit = args.limit
    nts_before_start = args.nts_before_start

    #"exon" features when working with non-coding transcripts, "CDS" features otherwise
    feature_type = "exon" if args.noncoding else "CDS"
    exons = rw.read_gtf(gtf, feature_type, gene=False)

    # the 3' ss that will be analyzed; the fourth column holds the transcript IDs
    junction_rows = rw.read_many_fields(exon_starts_file, "\t")
    valid_junctions = [row[3] for row in junction_rows]

    lengths_dict = co.get_lengths(exons, valid_junctions, intronic=args.intronic)
    if nts_before_start:
        #extend every length by the requested number of nucleotides
        lengths_dict = {name: lengths_dict[name] + nts_before_start for name in lengths_dict}

    #both stems drop the last four characters (the file extension) of the name
    starts_stem = exon_starts_file[:-4]
    reads_stem = reads_file.split("/")[-1][:-4]
    coverage_file_name = "{0}_{1}_coverage.bed".format(starts_stem, reads_stem)
    co.get_coverage(exon_starts_file, reads_file, coverage_file_name)

    peak_distances_all, peak_centres = co.peak_pos_in_exon(exon_starts_file, peaks_file, from_end = args.from_end, reads_mode = args.reads_mode)

    output_stem = output_file[:-4]
    #one matrix for the full peaks, one for the peak centres
    write_dist_mat(peak_distances_all, limit, output_file, lengths_dict,
                   "{0}_intron_names.txt".format(output_stem), None)
    write_dist_mat(peak_centres, limit, "{0}_centres.txt".format(output_stem), lengths_dict,
                   "{0}_centres_intron_names.txt".format(output_stem), None)
def get_data(file):
    '''
    Read an INSIGHT input file (tab-separated) and return only the rows whose
    first field is "site", i.e. the polymorphism data.
    '''
    site_rows = []
    for row in rw.read_many_fields(file, "\t"):
        if row[0] == "site":
            site_rows.append(row)
    return site_rows
def parse_basinhoppin_pos(file):
    '''
    Parse a hit/control positions file.
    Each tab-separated row holds an identifier followed by a comma-separated
    list of integer positions; the parsed rows are turned into a dictionary
    with list_to_dict (column 0 as key, column 1 as value).
    '''
    #not used in main() in the present script but imported into other scripts
    #this is ugly, this function needs to move
    parsed_rows = []
    for row in rw.read_many_fields(file, "\t"):
        #empty fields (e.g. from a trailing comma) are skipped
        coordinates = [int(field) for field in row[1].split(",") if field]
        parsed_rows.append([row[0], coordinates])
    return list_to_dict(parsed_rows, 0, 1)
def parse_pos(file):
    '''
    Parse a hits/controls positions file into a dictionary that maps each key
    (first column) to the list of integer positions given as a comma-separated
    string in the second column.
    '''
    raw_rows = rw.read_many_fields(file, "\t")
    raw_dict = list_to_dict(raw_rows, 0, 1)
    parsed = {}
    for key in raw_dict:
        #empty fields (e.g. from a trailing comma) are skipped
        parsed[key] = [int(field) for field in raw_dict[key].split(",") if field != ""]
    return parsed
def parse_degen(file_name):
    '''
    Parse a degeneracy file into a nested dictionary.
    The second column of each row is a comma-separated list of
    "position:a|b|..." entries; the result maps each transcript ID to
    {int(position): [a, b, ...]}. Entries that do not split into exactly
    two parts on ":" are dropped.
    '''
    raw_rows = rw.read_many_fields(file_name, "\t")
    raw_dict = list_to_dict(raw_rows, 0, 1)
    parsed = {}
    for trans in raw_dict:
        per_position = {}
        for entry in raw_dict[trans].split(","):
            pieces = entry.split(":")
            if len(pieces) != 2:
                continue
            per_position[int(pieces[0])] = pieces[1].split("|")
        parsed[trans] = per_position
    return parsed
def get_mean_freq(SFS_file):
    '''
    Use a Mann-Whitney U-test to compare the frequencies in hits vs controls.
    The SFS file is space-separated: the first field of the first row is n,
    the second row holds the hit counts and the third row the control counts.
    The count in column i (column 0 is skipped) is expanded into that many
    copies of the frequency i / n.
    Prints both medians, the test p-value and the difference between the medians.
    Returns ([hit frequencies, control frequencies], hit median - control median).
    '''
    sfs_rows = rw.read_many_fields(SFS_file, " ")
    sample_size = int(sfs_rows[0][0])

    def expand(counts):
        #turn a row of per-class counts into a flat list of frequencies
        per_class = []
        for freq_class in range(1, len(counts)):
            per_class.append([freq_class / sample_size for _ in range(int(counts[freq_class]))])
        return flatten(per_class)

    hit_freqs = expand(sfs_rows[1])
    control_freqs = expand(sfs_rows[2])

    test_result = scipy.stats.mannwhitneyu(control_freqs, hit_freqs)
    hit_median = np.median(hit_freqs)
    control_median = np.median(control_freqs)
    median_difference = hit_median - control_median

    print("Median MAF in hits: {0}.".format(hit_median))
    print("Median MAF in controls: {0}.".format(control_median))
    print("MWU p: {0}.".format(test_result[1]))
    print("Difference: {0}.\n".format(median_difference))
    return ([hit_freqs, control_freqs], median_difference)
def intersect_bed(bed_file1, bed_file2, use_bedops = False, overlap = False, overlap_rec = False, write_both = False, sort = False, output_file = None, force_strand = False, force_opposite_strand = False, no_name_check = False, no_dups = True, chrom = None, intersect = False, hit_count = False, bed_path = None, intersect_bam=None, write_zero = False, write_bed = False, exclude = False):
    '''
    Use bedtools/bedops to intersect coordinates from two bed files.
    Return those lines in bed file 1 that overlap with intervals in bed file 2.
    The tool writes to a temporary file in temp_data/. If output_file is given,
    the temporary file is moved there and the return value of the tool wrapper
    is returned; otherwise the temporary file is parsed (tab-separated),
    deleted, and the parsed rows are returned.
    Raises an Exception if both force_strand and force_opposite_strand are True.

    OPTIONS
    output_file: write output to this file
    use_bedops: use bedops rather than bedtools. Certain options are only valid
        with one of the two, see below.
    overlap: minimum overlap required as a fraction of the intervals in bed file 1
        (EX: 0.8 means that the overlap has to be at least 80% of the intervals
        in bed file 1).
    overlap_rec: require that the overlap as a fraction of the intervals in file 2
        be at least as high as the threshold indicated in -f.
    write_both: if True, return not only the interval from bed file 1 but, tagged
        onto the end, also the interval from bed file 2 that it overlaps
        (only valid when using bedtools).
    exclude: if True, report intervals that DON'T overlap
    sort: sort bed files before taking the intersection
    force_strand: check that the feature and the bed interval are on the same
        strand (only valid with bedtools)
    force_opposite_strand: if True, check that the feature and the interval are
        on OPPOSITE strands
    no_name_check: if set to False, checks whether the chromosome names are the
        same in the two bed files (only valid with bedtools)
    no_dups: if True, only returns each interval once. If set to false, intervals
        in bed file 1 that overlap several intervals in bed file 2 will be
        returned several times (as many times as there are overlaps with
        different elements in bed file 2)
    chrom: limit search to a specific chromosome (only valid with bedops, can
        help in terms of efficiency)
    intersect: rather than returning the entire interval, only return the part of
        the interval that overlaps an interval in bed file 2.
    hit_count: for each element in bed file 1, return the number of elements it
        overlaps in bed file 2 (only valid with bedtools)
    intersect_bam: intersect a bam file with a bed file. Requires bam file to be
        called first
    write_zero: like write_both but also write A intervals that don't overlap
        with any B intervals
    write_bed: when intersecting a bam file, write output as bed.
    bed_path: handed on to run_bedtools unchanged.
    '''
    if force_strand and force_opposite_strand:
        raise Exception("force_strand and force_opposite_strand can't both be True")

    #the tool always writes to a randomly named scratch file first
    hk.make_dir("temp_data/")
    scratch_path = "temp_data/temp_bed_file{0}.bed".format(random.random())

    #arguments that both wrappers take in the same way
    common_positional = (bed_file1, bed_file2, force_strand, force_opposite_strand,
                         write_both, chrom, overlap, sort)
    common_keywords = {
        "output_file": scratch_path,
        "intersect": intersect,
        "hit_number": hit_count,
        "intersect_bam": intersect_bam,
        "overlap_rec": overlap_rec,
    }

    if use_bedops:
        tool_output = run_bedops(*common_positional, no_dups = no_dups, **common_keywords)
    else:
        bedtools_positional = common_positional + (no_name_check, no_dups)
        tool_output = run_bedtools(*bedtools_positional, bed_path = bed_path,
                                   write_zero = write_zero, write_bed = write_bed,
                                   exclude = exclude, **common_keywords)

    if output_file:
        #keep the result: move it to its permanent location
        hk.run_process(["mv", scratch_path, output_file])
        return tool_output

    #nothing to keep: read the result back in and clean up
    parsed_rows = rw.read_many_fields(scratch_path, "\t")
    hk.remove_file(scratch_path)
    return parsed_rows
def main():
    '''
    Command-line entry point: pool the motifs of a (possibly filtered) set of
    RBPs, calculate dS within the hits of the pooled set and of n_sim simulant
    sets, and write a one-row summary (plus header) to output_file_name.

    Unless full_set is given, RBPs are filtered on their summary value
    (column ND_column of the summary file) and on being flagged "True" in
    sufficient_information_fraction05.csv inside validity_folder_name.

    Raises ValueError if filtering is requested but no summary file was given.
    '''
    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "correspondances_file_name", "alignment_folder_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "output_suffix", "validity_folder_name", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "gene_families", "newer_filters", "baseml"], ints = [7, 12], flags = [15, 16, 17, 18, 19, 20, 21, 22])
    (motifs_file_name, summary_file_name, dataset_name, correspondances_file_name,
     alignment_folder_name, output_folder_name, output_file_name, n_sim,
     features_file_name, genome, families_file_name, fasta_name, ND_column,
     output_suffix, validity_folder_name, negative_ND, new_filters, upper_quarter,
     lower_quarter, full_set, gene_families, newer_filters, baseml) = (
        args.motifs_file_name, args.summary_file_name, args.dataset_name,
        args.correspondances_file_name, args.alignment_folder_name,
        args.output_folder_name, args.output_file_name, args.n_sim,
        args.features_file_name, args.genome, args.families_file_name,
        args.fasta_name, args.ND_column, args.output_suffix,
        args.validity_folder_name, args.negative_ND, args.new_filters,
        args.upper_quarter, args.lower_quarter, args.full_set, args.gene_families,
        args.newer_filters, args.baseml)

    #make a dictionary with RBPs as keys and ND/p values as values.
    summary_dict = None
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]
        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)

    #make a dictionary with RBPs as keys and lists of associated motifs as values
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        if summary_dict is None:
            raise ValueError("A summary file is needed to filter the motifs unless full_set is specified.")
        #which RBPs fulfill the necessary information content criteria?
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(validity_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
        #the filtered result has to stay a dictionary because .values() is called on it below
        if negative_ND:
            #motifs with negative ND
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0) and (validity[RBP] == "True")}
        elif upper_quarter:
            #the most significantly enriched motifs
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] < 0.1) and (validity[RBP] == "True")}
        elif lower_quarter:
            #the most significantly depleted motifs
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] > 0.9) and (validity[RBP] == "True")}
        else:
            #motifs with positive ND
            motifs = {RBP: motifs[RBP] for RBP in motifs if (summary_dict[RBP] >= 0) and (validity[RBP] == "True")}

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))

    make_dir(output_folder_name)

    #prepare a Feature_Set object (a genome gtf associated to a particular genome and to a set of transcript identifiers)
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        transcripts = fs.get_transcripts()
        #NOTE(review): the returned CDS is not used below; the call is kept in case it has side effects
        CDS = fs.get_CDS()
        #paralogous families
        families = rw.read_families(families_file_name)
        #the families file might use gene identifiers, whereas the Feature_Set object uses transcript identifiers
        if gene_families:
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        #pick a random member from each paralogous family
        picked_trans = fs.pick_random_members()
        names = rw.read_fasta(fasta_name)[0]
        #if the fasta does not use the picked identifiers, convert them to gene identifiers
        if picked_trans[0] not in names:
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    method = "baseml" if baseml else "gy"

    #write the input data for the conservation analysis into a file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    try:
        conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_name, input_dict_file_name, picked = picked)

        with open(output_file_name, "w") as output_handle:
            output_handle.write(",".join(["real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
            output_handle.write("\n")

            #make n_sim simulant sets for the motifs, filtering the simulants based on different sets of criteria
            if new_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1)
            elif newer_filters:
                simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = 1, no_duplicates = True, concat = False)
            else:
                simulants = nc.make_simulants(motifs, n_sim, seed = 100)

            #file where the simulants dS values will be stored
            sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, output_suffix)

            #calculate dS within motifs and simulants
            output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
            print(output_dict)
            print("\n")

            #write to output file
            if output_dict is not None:
                output_handle.write(",".join([str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
            else:
                output_handle.write(",".join([str(None)] * 5))
    finally:
        #the temporary input file is removed even if the analysis fails
        if os.path.exists(input_dict_file_name):
            os.remove(input_dict_file_name)
def main():
    '''
    Command-line entry point: pool the motifs of a (possibly filtered) set of
    RBPs, calculate the density of the pooled set and of n_sim simulant sets in
    a sequence fasta, and write a single tab-separated record to
    output_file_name.

    Unless full_set is given, RBPs are filtered on their summary value
    (column ND_column of the summary file). If fasta_name is "random", 100
    random sequences are generated and written to
    RBP/random_sequences_from_genome_comp.fasta, which is then used instead.

    Raises ValueError if filtering is requested but no summary file was given.
    '''
    description = "Calculate the combined density of a set of motif sets."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "dataset_name", "output_folder_name", "output_file_name", "n_sim", "features_file_name", "genome", "families_file_name", "fasta_name", "ND_column", "seed", "output_suffix", "negative_ND", "new_filters", "upper_quarter", "lower_quarter", "full_set", "newer_filters", "two_seqs"], ints = [5, 10, 11], flags = [13, 14, 15, 16, 17, 18, 19])
    (motifs_file_name, summary_file_name, dataset_name, output_folder_name,
     output_file_name, n_sim, features_file_name, genome, families_file_name,
     fasta_name, ND_column, seed, output_suffix, negative_ND, new_filters,
     upper_quarter, lower_quarter, full_set, newer_filters, two_seqs) = (
        args.motifs_file_name, args.summary_file_name, args.dataset_name,
        args.output_folder_name, args.output_file_name, args.n_sim,
        args.features_file_name, args.genome, args.families_file_name,
        args.fasta_name, args.ND_column, args.seed, args.output_suffix,
        args.negative_ND, args.new_filters, args.upper_quarter,
        args.lower_quarter, args.full_set, args.newer_filters, args.two_seqs)

    #make a dictionary with RBPs as keys and ND/p values as values.
    summary_dict = None
    if summary_file_name != "None":
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #because some of the files are tab-separated, while others are comma-separated and have a header row
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
            summary_data = summary_data[1:]
        summary_dict = list_to_dict(summary_data, 0, ND_column, floatify = True)

    #make a dictionary with RBPs as keys and lists of associated motifs as values
    motifs = rw.read_motifs(motifs_file_name)

    #if you only want to be using a subset of the motifs
    if not full_set:
        if summary_dict is None:
            raise ValueError("A summary file is needed to filter the motifs unless full_set is specified.")
        #the filtered result has to stay a dictionary because .values() is called on it below
        if negative_ND:
            #motifs with negative ND
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0}
        elif upper_quarter:
            #the most significantly enriched motifs
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] < 0.1}
        elif lower_quarter:
            #the most significantly depleted motifs
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] > 0.9}
        else:
            #motifs with positive ND
            motifs = {RBP: motifs[RBP] for RBP in motifs if summary_dict[RBP] >= 0}

    #shove all the remaining motifs into a great big flattened and uniquified bag
    motifs = list(set(flatten(list(motifs.values()))))
    print(len(motifs))

    make_dir(output_folder_name)

    #if you want to average over families
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        families = rw.read_families(families_file_name)
        fs.add_families(families)
    else:
        fs = None

    #generate 100 1000 bp long random sequences based on the hg38 mononucleotide composition and use that as your sequence fasta
    if fasta_name == "random":
        names = list(range(100))
        seqs = nc.kmers_from_nc(1000, 100, genome_comp = True)
        fasta_name = "RBP/random_sequences_from_genome_comp.fasta"
        rw.write_to_fasta(names, seqs, fasta_name)

    with open(output_file_name, "w") as output_file:
        #generate n_sim sets of simulant motifs (constraining the space of simulants based on different sets of filters)
        if new_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed)
        elif newer_filters:
            simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, seed = seed, concat = False, no_duplicates = True)
        else:
            #this branch has to bind the same name as the others: it is read right below
            simulants = nc.make_simulants(motifs, n_sim, seed = seed)

        #calculate the density parameters of the motifs in the sequence fasta
        output_dict = nc.get_sequence_set_density(
            fasta_name, None, motifs, simulants, n_sim,
            "{0}/overall_density_{1}.csv".format(output_folder_name, output_suffix),
            "{0}/overall_sim_density_{1}.csv".format(output_folder_name, output_suffix),
            "{0}/overall_positions.csv_{1}".format(output_folder_name, output_suffix),
            "{0}/overall_sim_positions_{1}".format(output_folder_name, output_suffix),
            concat = False, positions = False, feature_set = fs, verbose = True, two_seqs = two_seqs)

        record = [str(output_dict["median density"]),
                  str(np.mean(output_dict["simulated densities"])),
                  str(output_dict["median ND"]),
                  str(output_dict["effective p"]),
                  str(output_dict["Z"]),
                  str(output_dict["depletion p"]),
                  str(len(motifs)),
                  str(output_dict["simulant sd"])]
        #write to output file
        output_file.write("\t".join(record))
        print(record)
def main():
    '''
    Command-line entry point: call peaks in a BED file of NET-seq reads.
    For each of the requested runs, raw peaks are written, merged and filtered;
    the file names of all intermediate and final outputs carry the run number.
    '''
    description = "Call peaks in a BED file of NET-seq reads."

    #argument names paired with their help texts, in command-line order
    argument_specs = [
        ("reads_file", "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read.)."),
        ("gtf", "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!"),
        ("trans_active_file", "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file."),
        ("output_file", "Name of the output file (BED file with peak coordinates)."),
        ("significance_threshold", "Alpha value for calling a position as having a significantly higher local read denisty than expected by chance. Default: 0.01."),
        ("merge", "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21."),
        ("min_reads_per_peak", "Minimum reads per peak. Default: 10."),
        ("iterations", "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, however, they also make the programme very slow. Default: 5."),
        ("min_peak_length", "Minimum length of a peak in nucleotides. Default: 5."),
        ("window_size", "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21"),
        ("runs", "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1."),
        ("neg_control", "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile."),
        ("no_slide", "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density."),
        ("exclude_focal", "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization."),
        ("with_ups_intron", "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon)."),
        ("no_PCR_filter", "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position).)"),
    ]
    argument_names = [spec[0] for spec in argument_specs]
    help_info = [spec[1] for spec in argument_specs]
    #default values, keyed by the index of the argument in the list above
    defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1}

    args = hk.parse_arguments(description, argument_names, floats=[4],
                              ints=[5, 6, 7, 8, 9, 10],
                              flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                              detailed_help=help_info, defaults=defaults)

    reads_file = args.reads_file
    gtf = args.gtf
    trans_active_file = args.trans_active_file
    output_file = args.output_file
    significance_threshold = args.significance_threshold
    merge = args.merge
    min_reads_per_peak = args.min_reads_per_peak
    iterations = args.iterations
    min_peak_length = args.min_peak_length
    window_size = args.window_size
    runs = args.runs
    neg_control = args.neg_control
    no_slide = args.no_slide
    exclude_focal = args.exclude_focal
    with_ups_intron = args.with_ups_intron
    no_PCR_filter = args.no_PCR_filter

    #report the settings of this run
    settings_report = [
        ("Merge distance: {0}", merge),
        ("Minimum number of reads per peak: {0}", min_reads_per_peak),
        ("Minimum peak length: {0}", min_peak_length),
        ("Window size: {0}", window_size),
        ("Significance level: {0}", significance_threshold),
        ("Randomization iterations to perform: {0}", iterations),
        ("Runs: {0}", runs),
    ]
    for message, value in settings_report:
        print(message.format(value))

    #tags that end up in the names of the intermediate files
    neg_str = "_neg_control" if neg_control else ""
    slide_str = "_no_slide" if no_slide else ""
    intron_str = "w_ups_intr" if with_ups_intron else ""

    # 0. make a BED file with the coordinates of transcripts
    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    exons = rw.read_gtf(gtf, "exon")

    # 1. intersect the two files, loop over the result and make a
    # dictionary of reads per pos for each transcript, which has reads
    reads_per_pos = get_reads_per_pos(reads_file, transcripts_file)

    # only leave transcriptionally active genes (one isoform per gene);
    # the first row is skipped and the fourth column holds the transcript IDs
    active_rows = rw.read_many_fields(trans_active_file, "\t")[1:]
    trans_active_genes = [row[3] for row in active_rows]
    kept_reads = {}
    for transcript in reads_per_pos:
        if transcript.split(".")[-1] in trans_active_genes:
            kept_reads[transcript] = reads_per_pos[transcript]
    reads_per_pos = kept_reads

    #name of the GTF file without its directory and without its last four characters
    gtf_stem = gtf.split("/")[-1][:-4]
    output_stem = output_file[:-4]

    for sim in range(runs):
        print("**********{0}**********".format(sim))

        # 2. for each transcript, randomly reshuffle the reads and calculate the
        # nth percentile depending on what the significance threshold is
        # keep positions that are higher than that threshold and write to BED file
        raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4], gtf_stem, iterations, min_reads_per_peak,
            window_size, neg_str, intron_str, slide_str, sim)
        read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format(
            reads_file[:-4], gtf_stem, iterations, window_size, neg_str,
            intron_str, sim)
        new_reads_file = write_raw_peaks(
            reads_per_pos, raw_peak_bed, read_count_file, exons,
            iterations=iterations, min_read_count=min_reads_per_peak,
            window_size=window_size, neg_control=neg_control,
            no_slide=no_slide, exclude_focal=exclude_focal,
            with_ups_intron=with_ups_intron)
        #for a negative control, continue with the reads file returned by write_raw_peaks
        if neg_control:
            reads_file = new_reads_file

        # 3. merge peaks
        merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4], gtf_stem, iterations, window_size, merge,
            neg_str, slide_str, intron_str, sim)
        co.merge_bed(raw_peak_bed, merged_peak_bed, merge)
        peak_count = hk.line_count(merged_peak_bed)
        print("Before filtering, there are {0} peaks.".format(peak_count))

        # 4. filter out peaks that don't have enough reads or are too short.
        # Write final results to file and also write a stats file with the size,
        # read count and overlapping transcript of the peaks
        stats_file = "{0}_stats_{1}_sim.txt".format(output_stem, sim)
        final_peak_bed = "{0}_{1}_sim.bed".format(output_stem, sim)
        filter_peaks(merged_peak_bed, reads_file, read_count_file, final_peak_bed,
                     min_reads_per_peak, min_peak_length, stats_file,
                     no_PCR_filter=no_PCR_filter)
def main():
    """Estimate how conserved the motifs of each RBP are relative to simulants.

    Parses the command line, optionally picks one random member from each
    paralogous family, generates ``n_sim`` simulant motif sets per RBP and
    writes one comma-separated row per RBP (real dS, mean simulated dS,
    normalized dS, effective p, motif count) to ``output_file_name``.
    The temporary input file written for the conservation analysis is removed
    when the analysis finishes or fails.
    """
    parser = argparse.ArgumentParser(description="Calculate the conservation level of a series of RBP motifs.")
    parser.add_argument("features_file_name", type=str, help="name of GTF file with genome features")
    parser.add_argument("dataset_name", type=str, help="dataset name")
    parser.add_argument("genome", type=str, help="genome assembly name")
    parser.add_argument("RBP_file_name", type=str, help="name of file with RBP motifs")
    parser.add_argument("correspondances_file_name", type=str, help="name of file with correspondances between genes in dataset and orthologs")
    parser.add_argument("fasta_file_name", type=str, help="name of fasta file with the sequences")
    parser.add_argument("families_file_name", type=str, help="name of file that contains families")
    parser.add_argument("output_file_name", type=str, help="file for output data")
    parser.add_argument("output_folder_name", type=str, help="folder that will contain simulated dS scores")
    parser.add_argument("alignment_folder_name", type=str, help="name of folder that contains alignments")
    parser.add_argument("n_sim", type=int, help="number of simulants")
    parser.add_argument("--valid_file", nargs="?", const="False")
    parser.add_argument("--gene_families", action="store_true", help="does the families file use gene identifiers?")
    parser.add_argument("--markov", dest="markov", action="store_true", help="Should simulants be generated using a Markov model?")
    parser.add_argument("--new_filters", dest="new_filters", action="store_true", help="Should simulants be generated using the old method but capping mononucleotide runs and removing existing motifs?")
    parser.add_argument("--newer_filters", dest="newer_filters", action="store_true", help="Like new_filters but without concatenation and without allowing duplicates within simulant sets.")
    parser.add_argument("--goldman_yang", dest="goldman_yang", action="store_true", help="Should Goldman & Yang's method be used for calculating dS?")
    parser.add_argument("--baseml", dest="baseml", action="store_true", help="Should baseml be used instead of codeml?")
    args = parser.parse_args()

    features_file_name = args.features_file_name
    dataset_name = args.dataset_name
    genome = args.genome
    RBP_file_name = args.RBP_file_name
    correspondances_file_name = args.correspondances_file_name
    fasta_file_name = args.fasta_file_name
    families_file_name = args.families_file_name
    output_file_name = args.output_file_name
    output_folder_name = args.output_folder_name
    alignment_folder_name = args.alignment_folder_name
    n_sim = args.n_sim
    valid_file = args.valid_file
    gene_families = args.gene_families
    markov = args.markov
    new_filters = args.new_filters
    newer_filters = args.newer_filters
    goldman_yang = args.goldman_yang
    baseml = args.baseml

    #pick a random member from each paralogous family
    if features_file_name != "None":
        fs = Feature_Set(features_file_name, genome)
        fs.set_dataset(dataset_name)
        families = rw.read_families(families_file_name)
        #only fetched when actually needed (either of the two conversions below)
        transcripts = None
        #if the families file uses gene identifiers rather than transcript identifiers
        if gene_families:
            #the transcripts have to be read before they can be used for the conversion
            transcripts = fs.get_transcripts()
            families = fs.convert_families_to_ENST(families, transcripts)
        fs.add_families(families)
        picked_trans = fs.pick_random_members()
        #if the fasta uses gene identifiers but the feature set uses transcript identifiers
        names = rw.read_fasta(fasta_file_name)[0]
        if picked_trans[0] not in names:
            if transcripts is None:
                transcripts = fs.get_transcripts()
            picked = [fs.convert_between_ENST_and_ENSG(i, transcripts, "ENSG") for i in picked_trans]
        else:
            picked = picked_trans
        print(len(picked))
    else:
        picked = None

    motif_dict = rw.read_motifs(RBP_file_name)

    #valid_file says which proteins pass information content criteria. Only analyze the ones that do.
    if not valid_file:
        #no file specified: fall back on the default file in the output folder
        validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
        validity = list_to_dict(validity, 0, 1)
    elif valid_file == "None":
        #the literal string "None" switches the filter off
        validity = {i: "True" for i in motif_dict}
    else:
        validity = rw.read_many_fields(valid_file, "\t")
        validity = list_to_dict(validity, 0, 1)
    protein_names = sorted([name for name in list(motif_dict.keys()) if validity[name] == "True"])

    #which method to use for estimating dS
    if baseml:
        method = "baseml"
    elif goldman_yang:
        method = "gy"
    else:
        method = "yn"

    #write the input data for the conservation analysis to file
    input_dict_file_name = "temp_data/temp_{0}.txt".format(random.random())
    conservation.input_dict_for_dS(correspondances_file_name, alignment_folder_name, fasta_file_name, input_dict_file_name, picked = picked)

    try:
        with open(output_file_name, "w") as file:
            file.write(",".join(["protein_name", "real_dS", "mean_sim_dS", "norm_dS", "p", "motif_number"]))
            file.write("\n")
            for protein in protein_names:
                print(protein)
                motifs = motif_dict[protein]
                #use one of several different methods to generate simulant motifs
                if markov:
                    simulants = nc.make_simulants_markov(motifs, n_sim)
                elif new_filters:
                    simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True)
                elif newer_filters:
                    simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, no_duplicates = True, concat = False, seed = 1)
                else:
                    simulants = nc.make_simulants(motifs, n_sim)
                sim_output_file_name = "{0}/{1}_sim_ds.csv".format(output_folder_name, protein)
                #determine the conservation parameters of the current protein
                output_dict = conservation.dS_from_hits(motifs, alignment_folder_name, input_dict_file_name, n_sim = n_sim, simulants = simulants, sim_output_file_name = sim_output_file_name, method = method)
                print(output_dict)
                print("\n")
                if output_dict is not None:
                    file.write(",".join([protein, str(output_dict["dS"]), str(output_dict["mean simulated dS"]), str(output_dict["normalized dS"]), str(output_dict["effective p"]), str(len(motifs))]))
                else:
                    #no result for this protein: fill the row with "None"
                    file.write(",".join([protein] + [str(None)] * 5))
                file.write("\n")
    finally:
        #don't leave the temporary input file behind, even if the analysis failed
        os.remove(input_dict_file_name)
def main():
    """Measure how often motif neighbours turn into motifs in the ortholog.

    For each RBP (or for the pooled motifs if ``by_RBP`` is not set), the
    fraction of analysed sites that score as a mutation towards a motif is
    computed for the real motifs and for ``n_sim`` simulant sets. The real
    fraction is then compared to the simulated ones (p value and normalized
    fraction) and one tab-separated row per protein is written to
    ``output_file_name``.
    """
    description = "Calculate the conservation of k-mers that are a single point mutation away from being part of a set of motifs."
    args = parse_arguments(description, ["motifs_file_name", "summary_file_name", "output_folder_name", "p_column", "alignment_folder_name", "correspondances_file_name", "output_file_name", "dataset_name", "features_file_name", "n_sim", "output_suffix", "sequences_file_name", "families_file_name", "genome", "by_RBP"], ints = [3, 9], flags = [14])
    motifs_file_name = args.motifs_file_name
    summary_file_name = args.summary_file_name
    output_folder_name = args.output_folder_name
    p_column = args.p_column
    alignment_folder_name = args.alignment_folder_name
    correspondances_file_name = args.correspondances_file_name
    output_file_name = args.output_file_name
    dataset_name = args.dataset_name
    features_file_name = args.features_file_name
    n_sim = args.n_sim
    sequences_file_name = args.sequences_file_name
    families_file_name = args.families_file_name
    genome = args.genome
    by_RBP = args.by_RBP

    RBPs = rw.read_motifs(motifs_file_name)
    #only leave those RBPs that pass information content criteria
    validity = rw.read_many_fields("{0}/sufficient_information_fraction05.csv".format(output_folder_name), "\t")
    validity = list_to_dict(validity, 0, 1)
    RBPs = {i: RBPs[i] for i in RBPs if validity[i] == "True"}

    #if you're not doing this by RBP, pool motifs from the most significantly depleted sets
    if not by_RBP:
        summary_data = rw.read_many_fields(summary_file_name, "\t")
        #a single field per row means the file wasn't tab-delimited: try commas instead
        if len(summary_data[0]) == 1:
            summary_data = rw.read_many_fields(summary_file_name, ",")
        summary_dict = list_to_dict(summary_data, 0, p_column, floatify = True)
        RBPs = {i: RBPs[i] for i in RBPs if summary_dict[i] > 0.9}
        motifs = list(set(flatten(list(RBPs.values()))))
        RBPs = {"all": motifs}

    #randomly pick one gene from each paralogous family
    fs = Feature_Set(features_file_name, genome)
    fs.set_dataset(dataset_name)
    transcripts = fs.get_transcripts()
    families = rw.read_families(families_file_name)
    families = fs.convert_families_to_ENST(families, transcripts)
    fs.add_families(families)
    picked_from_families = fs.pick_random_members()
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    picked = [fs.convert_between_ENST_and_ENSG(i, gene_name_dict, "ENSG") for i in picked_from_families]

    names, CDS = rw.read_fasta(sequences_file_name)

    #make a dictionary where the keys are genes from the focal species and the values are orthologs from another species
    correspondances = rw.read_many_fields(correspondances_file_name, ",")
    correspondance_dict = {}
    for i in correspondances:
        correspondance_dict[i[0]] = i[1]

    output_dict = {}
    #loop over the RBPs
    for protein in sorted(RBPs):
        #fetch the current motifs
        print(protein)
        motifs = RBPs[protein]
        print("There are {0} motifs.".format(len(motifs)))
        #generate all unique motifs that are a single base substitution away from one of the motifs but are not actually in the set
        neighbours = nc.get_neighbours(motifs)
        print("There are {0} neighbours.".format(len(neighbours)))
        #make simulants for the motifs. don't allow simulants to be part of the set of neighbours.
        simulants = nc.make_simulants(motifs, n_sim, remove_existing = True, cap_runs = True, exclude = neighbours, no_duplicates = True, concat = False)
        neighbour_lengths = [len(i) for i in neighbours]
        neighbours = nc.motif_to_regex(neighbours)

        #determine the true frequency at which fourfold degenerate sites that are a single substitution away from a motif in human actually contain the base that
        #would give rise to the motif in the orthologous species
        site_number = 0
        mutation_score = 0
        motifs = [list(i) for i in motifs]
        true_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, motifs, neighbours, neighbour_lengths], get_mutation_to_motif)
        for i in true_result:
            current = i.get()
            site_number = site_number + current[0]
            mutation_score = mutation_score + current[1]
        if site_number > 0:
            real_fraction = mutation_score/site_number
        else:
            real_fraction = None
        print("Real fraction:")
        print(real_fraction)

        #drop the reference to the compiled neighbours before the simulations start
        neighbours = ""
        sim_site_numbers = np.zeros((n_sim))
        sim_mutation_scores = np.zeros((n_sim))
        #obtain this estimate also for each simulant set
        #I'm doing this in this awkward manner because I don't have enough RAM to hold all the simulated neighbours in memory at once
        for sim in range(n_sim):
            if sim%10 == 0:
                print(sim)
            current_simulants = simulants[sim]
            current_neighbours = nc.get_neighbours(current_simulants)
            current_neighbour_lengths = [len(i) for i in current_neighbours]
            current_neighbours = nc.motif_to_regex(current_neighbours)
            current_simulants = [list(i) for i in current_simulants]
            current_result = run_in_parallel(picked, ["foo", correspondance_dict, alignment_folder_name, CDS, names, current_simulants, current_neighbours, current_neighbour_lengths], get_mutation_to_motif)
            for i in current_result:
                current = i.get()
                sim_site_numbers[sim] = sim_site_numbers[sim] + current[0]
                sim_mutation_scores[sim] = sim_mutation_scores[sim] + current[1]

        #normalize the real fraction, calculate p
        #a simulant with no sites gives 0/0 = NaN (x/0 would give inf): neither is a usable
        #fraction, so silence the expected warnings and drop every non-finite value
        with np.errstate(divide = "ignore", invalid = "ignore"):
            sim_fractions = np.divide(sim_mutation_scores, sim_site_numbers)
        sim_fractions = [i for i in sim_fractions if np.isfinite(i)]
        p = ms.calc_eff_p(real_fraction, sim_fractions, greater = False)
        norm_fraction = ms.normalize(real_fraction, sim_fractions)
        output_dict[protein] = [protein, mutation_score, site_number, real_fraction, np.mean(sim_fractions), p, norm_fraction]
        print(output_dict[protein])

    with open(output_file_name, "w") as output_file:
        #write header to output file
        output_file.write("protein\tmutation score\tsite number\treal fraction\tmean sim fraction\tp\tnormalized fraction\n")
        #write the rest of the output data
        for protein in sorted(list(output_dict.keys())):
            to_write = output_dict[protein]
            to_write = [str(i) for i in to_write]
            output_file.write("\t".join(to_write))
            output_file.write("\n")
def main():
    """Pick roughly nucleotide-matched control sites for a set of motif hits.

    Reads the command-line arguments, loads the motifs and the feature set,
    optionally derives the ancestral/macaque CpG dictionaries and then hands
    everything to one of the two control-fitting routines in ``nc``.
    """
    description = "Pick roughly nucleotide-matched control sites for a set of motif hits."
    argument_names = ["fasta", "genome", "features_file", "families_file", "dataset", "motifs_file", "run_number", "hit_file", "niter", "stepsize", "control_file", "error_file", "MSA_file_name_prefix", "anc_CG_file_name", "high_CG_file_name", "exclude_file", "brute_mapping", "verbose", "old_motif_format", "nonsyn_hits", "top_set_only", "remove_GT", "leave_CG", "remove_ancestral_CpG", "replacement_control", "macaque_anc", "remove_macaque_CpG", "big_tree", "pseudoCG", "comprehensive", "context", "prone_sites", "CG_gene_filter", "match_size", "raw", "regions"]
    #positions 16 to 35 in the list above are boolean flags
    args = parse_arguments(description, argument_names, ints = [6, 8, 9], flags = list(range(16, 36)))

    #the string "None" on the command line stands in for a missing file name
    anc_CG_file_name = None if args.anc_CG_file_name == "None" else args.anc_CG_file_name

    #motif data comes in one of two formats
    if args.old_motif_format:
        motifs = rw.read_names(args.motifs_file)[1:]
    else:
        motif_sets = rw.read_motifs(args.motifs_file)
        #if you're doing RBP motifs and only want motifs that were found to be enriched in Savisaar and Hurst 2017
        if args.top_set_only:
            summary_rows = rw.read_many_fields("RBP/RBP_hg38_introncontaining_new.txt", "\t")
            summary_lookup = list_to_dict(summary_rows, 0, 4, floatify = True)
            motif_sets = {RBP: motif_sets[RBP] for RBP in motif_sets if (summary_lookup[RBP] < 0.1)}
        motifs = list(set(flatten(list(motif_sets.values()))))

    #set up the Feature_Set and attach the paralogous families to it, unless told to
    #ignore them (used when analyzing exon flanks/cores)
    fs = Feature_Set(args.features_file, args.genome)
    fs.set_dataset(args.dataset)
    families_file = args.families_file
    if families_file == "None":
        conservation.find_families(args.fasta, "general/{0}".format(args.dataset))
        families_file = "general/{0}_families.txt".format(args.dataset)
    if families_file != "ignore":
        fs.add_families(rw.read_families(families_file))

    general_folder = "DFE/for_everybody"
    make_dir(general_folder)
    #default prefix for the MSA files if none was given
    MSA_file_name_prefix = args.MSA_file_name_prefix
    if MSA_file_name_prefix == "None":
        MSA_file_name_prefix = "{0}/{1}_MSA".format(general_folder, args.dataset)

    #admin
    transcripts = fs.get_transcripts()
    CDSs = fs.get_CDS()
    lengths = fs.get_lengths(CDSs, CDS = True)

    #only consider genes that are not on the sex chromosomes
    sex_chromosomes = ["X", "Y"]
    chrom_dict = {}
    for trans_id in transcripts:
        chrom = transcripts[trans_id][0]
        if chrom not in sex_chromosomes:
            chrom_dict[trans_id] = chrom
    chroms = list(set(chrom_dict.values()))

    #U2S is a dinucleotide-based substitution model, JC69 is mononucleotide-based
    subst_model = "U2S" if args.context else "JC69"

    #names used in the MSA (there's a character restriction in the phylip files so you can't use the full name)
    #NOTE(review): "h**o" looks like a censored "homo" -- confirm against the MSA files before changing it
    if args.big_tree:
        clean_names = ["calli", "chloro", "gorilla", "h**o", "macaca", "pan", "papio", "pongo"]
        phylip_data = {"gorilla_gorilla": [], "callithrix_jacchus": [], "papio_anubis": [], "chlorocebus_sabaeus": [], "homo_sapiens": [], "pongo_abelii": [], "macaca_mulatta": [], "pan_troglodytes": []}
    else:
        clean_names = ["h**o", "pan", "pongo", "macaca"]
        phylip_data = {"homo_sapiens": [], "pongo_abelii": [], "macaca_mulatta": [], "pan_troglodytes": []}

    #the CpG dictionaries are only needed for the CpG-related filters
    if args.remove_ancestral_CpG or args.remove_macaque_CpG or args.CG_gene_filter:
        anc_CG_dict, macaque_CG_dict = get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, args.fasta, anc_CG_file_name, args.high_CG_file_name, fs, macaque_anc = args.macaque_anc, pseudoCG = args.pseudoCG, comprehensive = args.comprehensive, subst_model = subst_model, regions = args.regions)
    else:
        anc_CG_dict = None
        macaque_CG_dict = None

    #keyword arguments common to both fitting routines
    fit_kwargs = {
        "family_seed": 5,
        "CG_gene_filter": args.CG_gene_filter,
        "niter": args.niter,
        "verbose": args.verbose,
        "brute_mapping": args.brute_mapping,
        "stepsize": args.stepsize,
        "write_errors": args.error_file,
        "fs": fs,
        "nonsyn_hits": args.nonsyn_hits,
        "leave_CG": args.leave_CG,
        "remove_ancestral_CpG": args.remove_ancestral_CpG,
        "remove_macaque_CpG": args.remove_macaque_CpG,
        "pseudoCG": args.pseudoCG,
        "prone_sites": args.prone_sites,
        "match_size": args.match_size,
    }
    if args.replacement_control:
        #only the replacement routine is given raw and exclude_file
        fit_controls = nc.fit_control_pos_to_hits_replacement
        fit_kwargs["raw"] = args.raw
        fit_kwargs["exclude_file"] = args.exclude_file
    else:
        fit_controls = nc.fit_control_pos_to_hits_wrapper
    fit_controls(args.fasta, motifs, args.run_number, args.hit_file, args.control_file, anc_CG_dict, macaque_CG_dict, **fit_kwargs)
def main():
    """Calculate the normalized dS of a dataset over one or more trials.

    Each trial either uses the motifs read from ``motifs_file`` or, with
    ``nonsense`` set, a freshly generated set of nonsense 6-mers (for which
    control sites are picked on the fly). The normalized dS of every trial is
    written as a tab-separated row to ``trial_file``. Both temporary phylip
    files are removed at the end.
    """
    description = "Calculate the normalized dS of a dataset."
    args = parse_arguments(description, [
        "dataset", "feature_set", "genome", "families_file", "fasta",
        "hit_file_prefix", "motifs_file", "correspondances", "alignments",
        "suffix", "trials", "trial_file", "old_trial_file", "region_fasta",
        "old_motif_format", "nonsense", "no_families", "newest_only",
        "top_set_only", "calc_p", "reverse_site_numbers", "matched", "degen",
        "regions"
    ], ints=[10], flags=[14, 15, 16, 17, 18, 19, 20, 21, 22, 23])
    dataset = args.dataset
    feature_set = args.feature_set
    genome = args.genome
    families_file = args.families_file
    fasta = args.fasta
    hit_file_prefix = args.hit_file_prefix
    motifs_file = args.motifs_file
    correspondances = args.correspondances
    alignments = args.alignments
    suffix = args.suffix
    trials = args.trials
    trial_file = args.trial_file
    old_trial_file = args.old_trial_file
    region_fasta = args.region_fasta
    old_motif_format = args.old_motif_format
    nonsense = args.nonsense
    no_families = args.no_families
    top_set_only = args.top_set_only
    calc_p = args.calc_p
    reverse_site_numbers = args.reverse_site_numbers
    matched = args.matched
    degen = args.degen
    regions = args.regions

    n_sim = 1000
    print(suffix)

    #set up feature set and families
    fs = Feature_Set(feature_set, genome)
    fs.set_dataset(dataset)
    if no_families:
        picked = fs.names
    else:
        families = rw.read_families(families_file)
        fs.add_families(families)
        picked = fs.pick_random_members()

    #randomly named temporary alignment files
    hit_phylip = "temp_data/temp_{0}.phy".format(random.random())
    control_phylip = "temp_data/temp_control_{0}.phy".format(random.random())

    #with nonsense motifs, the motifs are generated anew in each trial instead
    if not nonsense:
        if old_motif_format:
            motifs = rw.read_names(motifs_file)[1:]
        else:
            motifs = rw.read_motifs(motifs_file)
            if top_set_only:
                summary_data = rw.read_many_fields(
                    "RBP/RBP_hg38_introncontaining_new.txt", "\t")
                summary_dict = list_to_dict(summary_data, 0, 4, floatify=True)
                motifs = {
                    RBP: motifs[RBP]
                    for RBP in motifs if (summary_dict[RBP] < 0.1)
                }
            motifs = list(set(flatten(motifs.values())))

    #suffixes that encode the chosen options in the file names
    if reverse_site_numbers:
        site_number_suffix = "_reversed_site_numbers_"
    else:
        site_number_suffix = ""
    if matched:
        matched_suff = "_matched"
    else:
        matched_suff = ""
    if degen:
        degen_suff = "_degen.txt"
    else:
        degen_suff = ""

    with open(trial_file, "w") as trial_out:
        trial_out.write(
            "trial\tA\tT\tC\tG\told\told_no_hum_CG\tnew_no_human_CG\tnew_no_hum_no_anc_CG\tnew_w_CG\tnew_no_anc_CG\tnew_no_anc_CG_macaque\tnewer_no_human_CG\tnewer_no_hum_no_anc_CG\tnewer_w_CG\tnewer_no_anc_CG\n"
        )
        if old_trial_file != "None":
            #re-use the nucleotide compositions (columns 1 to 4) of a previous run
            old_trials = rw.read_many_fields(old_trial_file, "\t")
            old_trials = old_trials[1:]
            old_trials = [i[1:5] for i in old_trials]
            seed_kmers = 1
        else:
            seed_kmers = None

        #you can do this for loads of trials
        #useful as a negative control if you're generating a new set of nonsense motifs
        #each time
        for trial in range(trials):
            print(trial)
            trial_output = [trial]
            #if you're meant to generate a load of nonsense motifs rather than using real motifs
            if nonsense:
                if old_trial_file != "None":
                    #read in the intended nucleotide composition of the nonsense
                    #motifs from file
                    scaled_comp = [float(i) for i in old_trials[trial]]
                else:
                    #pick nonsense motifs nucleotide composition by chance
                    comp = [random.random() for i in range(4)]
                    scaled_comp = [i / np.sum(comp) for i in comp]
                comp_dict = {
                    i: scaled_comp[pos]
                    for pos, i in enumerate(nc._canon_bases_)
                }
                motifs, obtained_dict = nc.kmers_from_nc(6,
                                                         50,
                                                         comp_dict=comp_dict,
                                                         return_freqs=True,
                                                         seed=seed_kmers)
                #the first entry is a header line for the motifs file
                motifs = ["motifs"] + motifs
                trial_output = trial_output + [
                    obtained_dict[i] for i in nc._canon_bases_
                ]
                temp_motifs_file = "temp_data/temp_motifs.txt"
                rw.write_names(motifs, temp_motifs_file)

            print(
                "===NEW METHOD WITH NO ANCESTRAL CpG (MACAQUE, BIG TREE, CONTEXT), REPLACEMENT CONTROL==="
            )
            hit_file = "{0}_hits_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            control_file = "{0}_controls_no_anc_CG_only_macaque_big_context{1}_replace.txt{2}".format(
                hit_file_prefix, matched_suff, degen_suff)
            if nonsense:
                #nonsense motifs have no precomputed hits/controls: generate them now
                hit_file = "temp_data/temp_hits{0}.txt".format(random.random())
                control_file = "temp_data/temp_controls{0}.txt".format(
                    random.random())
                error_file = "temp_data/temp_error{0}.txt".format(
                    random.random())
                get_control_sites(
                    fasta, genome, feature_set, families_file, dataset,
                    temp_motifs_file, hit_file, control_file, error_file,
                    "DFE/for_everybody/filtered_hg38_85_pc_multiexon_anc_CG_big_context_threshold05.txt",
                    [
                        "--leave_CG", "--context", "--remove_ancestral_CpG",
                        "--macaque_anc", "--big_tree", "--replacement_control"
                    ])
            get_density(fasta, motifs, fs)
            norm_ds = get_new_method_results(hit_file,
                                             control_file,
                                             hit_phylip,
                                             control_phylip,
                                             correspondances,
                                             alignments,
                                             fasta,
                                             regions=regions,
                                             global_fasta=region_fasta,
                                             fs=fs)
            trial_output.append(norm_ds)
            if calc_p:
                #NOTE(review): these statistics are computed but not written to trial_file --
                #presumably get_sim_p stores what is needed in sim_ds_file; confirm
                p, low_CI, high_CI, sd, Z = get_sim_p(
                    norm_ds,
                    hit_file,
                    control_file,
                    correspondances,
                    alignments,
                    fasta,
                    n_sim,
                    reverse_site_numbers=reverse_site_numbers,
                    sim_ds_file=
                    "{0}{1}_sim_norm_ds_no_anc_CG_only_macaque_big_context{2}_replace.txt{3}"
                    .format(hit_file_prefix, site_number_suffix, matched_suff,
                            degen_suff))
            trial_output = "\t".join([str(i) for i in trial_output])
            trial_out.write(trial_output)
            trial_out.write("\n")

    #clean up both temporary alignment files, not just the hit one
    remove_file(hit_phylip)
    remove_file(control_phylip)
def main():
    """Construct a site frequency spectrum that only considers motif-disrupting SNPs.

    For the hit sites, SNPs are filtered down to those that disrupt a motif,
    and the rate at which each potential substitution would be disruptive is
    recorded. For the control sites, SNPs are then kept when their minor
    allele is among the bases chosen as "disruptive" by ``get_disrupt_bases``
    from those rates. The output file holds ``N`` on the first line, followed
    by the hit SFS and the control SFS, each space-separated on its own line.
    """
    description = "Construct a site frequency spectrum that only considers motif-disrupting SNPs."
    args = parse_arguments(description, ["fasta", "output_file", "motif_file", "anc_file", "control_file", "SNPs_file", "N", "old_motif_format", "human", "ancestral"], ints = [6], flags = [7, 8, 9])
    fasta = args.fasta
    output_file = args.output_file
    motif_file = args.motif_file
    anc_file = args.anc_file
    control_file = args.control_file
    SNPs_file = args.SNPs_file
    N = args.N
    old_motif_format = args.old_motif_format
    human = args.human
    ancestral = args.ancestral

    names, seqs = rw.read_fasta(fasta)
    #look-up from sequence name to sequence, built once so that the loops below don't have
    #to scan names for every transcript; setdefault keeps the first occurrence of a name,
    #like names.index would
    seq_by_name = {}
    for name, seq in zip(names, seqs):
        seq_by_name.setdefault(name, seq)

    #I use two different formats for storing sequence motifs,
    #got to know which one it is
    if old_motif_format:
        motifs = rw.read_names(motif_file)[1:]
        print(len(motifs))
    else:
        motifs = rw.read_motifs(motif_file)
        motifs = sorted(list(set(flatten(list(motifs.values())))))

    #get the lengths of the motifs and compile lookahead regexes
    #that recognize the whole motif but only store the position of the first bases
    #these will be needed when searching for the motifs
    motif_lengths = [len(i) for i in motifs]
    motif_regex = nc.motif_to_regex(motifs)
    #I'm gonna treat CG and GC as two 2-bp motifs, use the same code as when searching for, say,
    #ESE motifs
    CG_2mers = ["CG", "GC"]
    CG_lengths = [2, 2]
    CG_regex = nc.motif_to_regex(CG_2mers)
    motifs = [list(i) for i in motifs]

    if ancestral:
        anc_pos = rw.read_pos(anc_file)

    #read in hit and control positions
    controls = rw.read_pos(control_file)
    #the hits file is expected to be named like the controls file, with "hits" for "controls"
    hit_file = re.sub("controls", "hits", control_file)
    hits = rw.read_pos(hit_file)

    #read in SNP data
    SNPs = rw.read_many_fields(SNPs_file, "\t")
    #the column at index 2 in the SNPs file contains positions that need to be discarded from analysis because they contain unanalyzable SNP data
    to_remove = list_to_dict(SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    to_remove = {i: [int(j) for j in to_remove[i] if j not in ["error", ""]] for i in to_remove}
    SNPs = list_to_dict(SNPs, 0, 1)

    #all the SNPs associated to a transcript
    full_SNPs = {}
    #disruptive SNPs only
    clean_SNPs = {}
    minor_alleles = {}
    #the number of hit positions where, say, a T could theoretically substitute to an A (i.e. all T positions)
    transitions_total = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}
    #the same as above but only counting those substitutions that would turn a motif into a non-motif
    transitions_disr = {i: {j: 0 for j in nc._canon_bases_} for i in nc._canon_bases_}

    #this block of code filters the true SNPs to only leave those that are disruptive
    #and also calculates the probability of being disruptive for all potential SNPs
    with open("{0}_degen.txt".format(hit_file), "w") as hit_degen_file:
        counter = 0
        for trans in names:
            counter = update_counter(counter, 1000)
            if trans in controls:
                if trans in SNPs:
                    trans_SNPs = SNPs[trans]
                else:
                    trans_SNPs = []
                trans_SNPs, clean_SNPs, full_SNPs, minor_alleles = parse_SNPs(trans_SNPs, clean_SNPs, full_SNPs, minor_alleles, trans)
                current_seq = seq_by_name[trans]
                fourfold_pos = nc.get_4fold_deg(current_seq)
                #CpG filtering
                if human:
                    CG_pos = nc.get_motif_set_density(CG_regex, CG_lengths, current_seq, concat = True)["positions"]
                    fourfold_pos = [i for i in fourfold_pos if i not in CG_pos]
                if ancestral:
                    fourfold_pos = [i for i in fourfold_pos if i not in anc_pos[trans]]
                all_sites, clean_SNPs, transitions_total, transitions_disr, hit_degen_file = check_disruption(motif_regex, current_seq, motifs, motif_lengths, fourfold_pos, full_SNPs, clean_SNPs, minor_alleles, trans, transitions_total, transitions_disr, hit_degen_file, to_remove)
                hit_degen_file.write("\n")

    #restrict to_remove to control transcripts and drop positions that appear in full_SNPs
    to_remove = {i: [j for j in to_remove[i] if j not in full_SNPs[i]] for i in to_remove if i in controls}
    hit_SFS = get_SFS(hits, clean_SNPs, to_remove, N)

    transitions = get_transitions(transitions_disr, transitions_total)
    print(transitions)

    #this block randomly assigns certain SNPs at simulant positions to be disruptive,
    #with the probability of that happening proportional to the frequency with which potential substitutions
    #of that nucleotide composition would be disruptive for true (motif) sites
    with open("{0}_degen.txt".format(control_file), "w") as control_degen_file:
        control_SNPs = {}
        counter = 0
        for trans in controls:
            control_degen_file.write("{0}\t".format(trans))
            counter = update_counter(counter, 1000)
            control_SNPs[trans] = {}
            trans_SNPs = full_SNPs[trans]
            current_seq = seq_by_name[trans]
            for site in controls[trans]:
                if trans not in to_remove or site not in to_remove[trans]:
                    ref_allele = current_seq[site]
                    disrupt_bases = get_disrupt_bases(ref_allele, transitions)
                    control_degen_file.write("{0}:{1},".format(site, "|".join(disrupt_bases)))
                    if site in trans_SNPs:
                        minor_allele = minor_alleles[trans][site]
                        if minor_allele in disrupt_bases:
                            control_SNPs[trans][site] = trans_SNPs[site]
            control_degen_file.write("\n")

    control_SFS = get_SFS(controls, control_SNPs, to_remove, N)

    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")
def get_ancestral_CG(outroot, subst_model, phy_files, model_file, tuples_mapping_dict, anc_CG_file_name, high_CG = None, min_inf = None, macaque = False, comprehensive = False, from_model = False):
    '''
    Get a dictionary that says for each transcript which positions were ancestrally CpG/GpC.

    If anc_CG_file_name points to an existing file, the positions are simply read from it.
    Otherwise phyloFit is run on each of the alignments in phy_files, the posterior
    probabilities it writes to "<outroot>.postprob" are parsed and, if a file name was
    supplied, the result is written to anc_CG_file_name.

    Arguments:
    outroot: value for phyloFit's --out-root
    subst_model: value for phyloFit's --subst-mod; "JC69" is treated as mononucleotide, anything else as dinucleotide
    phy_files: PHYLIP alignments to run phyloFit on; a second one is used for the positions listed in high_CG
    model_file: existing model for --init-model (used with JC69 or if from_model is set)
    tuples_mapping_dict: for each transcript, a dictionary from position to alignment tuple
    anc_CG_file_name: file to read the result from/write it to (None or "None" for no file)
    high_CG: optional dictionary from transcript to positions that use the second set of posteriors
    min_inf: if set, passed on to phyloFit as -I
    macaque, comprehensive: decide which ancestral node(s) the posteriors are taken from
    from_model: use model_file rather than estimating a new model

    Returns a dictionary from transcript to a sorted list of positions.
    '''
    #if a file name hasn't been supplied or if the file with the supplied name doesn't exist, determine
    #CpG positions again, otherwise just read them in from the file
    if not anc_CG_file_name or anc_CG_file_name == "None" or not os.path.exists(anc_CG_file_name):
        #you need several in case you have a high_CG dictionary
        pps = []
        for phy_file in phy_files:
            if subst_model == "JC69" or from_model:
                #use an existing substitution model
                arguments = ["phyloFit", "--init-model", model_file, "--out-root", outroot, "--subst-mod", subst_model, "--msa-format", "PHYLIP", "--post-probs", "--scale-only", phy_file]
            else:
                #estimate a new model
                arguments = ["phyloFit", "--out-root", outroot, "--subst-mod", subst_model, "--msa-format", "PHYLIP", "--tree", "DFE/full_tree.tree", "--post-probs", phy_file]
            if subst_model == "JC69":
                block_size = 4
                tuple_pos_lim = 2
                shift_in_tuple = 0
            else:
                #for dinucleotide models
                block_size = 16
                tuple_pos_lim = 3
                shift_in_tuple = 9
            #turn off when testing
            if min_inf:
                arguments.extend(["-I", min_inf])
            run_process(arguments)
            #read in posterior probabilities of having various nucleotides ancestrally
            pp_file = "{0}.postprob".format(outroot)
            pp = rw.read_many_fields(pp_file, " ")
            #drop the empty fields that result from runs of spaces, and the first two lines
            pp = [[j for j in i if j] for i in pp]
            pp = pp[2:]
            #the posterior probability that you had a CpG at a position has to be greater
            #than threshold for a position to be counted as ancestrally CpG
            threshold = 0.5
            #will be over-written if you're doing big tree
            human_pos = 0
            #the outgroup nodes are labelled from the outside in, starting from 1
            if macaque:
                #it's to know whether we're doing big tree or little tree
                if len(pp[0]) == 14:
                    #little tree, mononucleotide
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (3 * block_size): len(i) - (2 * block_size)]] for i in pp}
                elif len(pp[0]) > 14:
                    #big tree/dinucleotide (i.e. it'll give you nonsense if you're trying to do context with the little tree)
                    #the shift_in_tuple is to do with the fact that if you're doing U2S, you want the second tuple and not the first
                    human_pos = 3 + shift_in_tuple
                    if comprehensive:
                        #you want to get all nodes except for node 0, which is the outgroup-ingroup ancestor
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (j * block_size): len(i) - ((j - 1) * block_size)] for j in range(1, 7)] for i in pp}
                    else:
                        pp = {"_".join(i[1:tuple_pos_lim]): [i[len(i) - (6 * block_size): len(i) - (5 * block_size)]] for i in pp}
                else:
                    #for tests etc. where you might only have, say, two species
                    pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            else:
                pp = {"_".join(i[1:tuple_pos_lim]): [i[-block_size:]] for i in pp}
            pps.append(pp)

        anc_CG = {}
        #just to get the length
        example_pp = pps[0][list(pps[0].keys())[0]]
        for trans in tuples_mapping_dict:
            #tuples_mapping_dict has the alignment tuple corresponding to each position
            #because the phyloFit output is organized by tuples, not by positions
            anc_CG[trans] = []
            for node_pos in range(len(example_pp)):
                #if you're using dinucleotides
                if subst_model != "JC69":
                    for pos in sorted(tuples_mapping_dict[trans].keys())[1:]:
                        try:
                            pp_number = 0
                            #if you're gonna produce different output dictionaries for high and low GC regions
                            if high_CG:
                                if pos in high_CG[trans]:
                                    pp_number = 1
                            current_tuple = tuples_mapping_dict[trans][pos]
                            #don't consider positions where there is an alignment gap for human
                            if current_tuple[human_pos] != "*":
                                if current_tuple in pps[pp_number]:
                                    current_pp = pps[pp_number][current_tuple][node_pos]
                                else:
                                    #fall back on the other set of posteriors
                                    current_pp = pps[abs(pp_number - 1)][current_tuple][node_pos]
                                #because it can be either GC or CG, hence 6 or 9
                                if float(current_pp[6]) > threshold or float(current_pp[9]) > threshold:
                                    #you're always testing the second member in the dinucleotide
                                    anc_CG[trans].append(pos - 1)
                                    anc_CG[trans].append(pos)
                        except KeyError:
                            #missing tuples are tolerated at positions that are multiples of 100
                            #NOTE(review): presumably because of chunk boundaries in the alignments -- confirm
                            #anywhere else, re-raise the original error so the offending key isn't lost
                            if pos % 100 != 0:
                                raise
                else:
                    #if you're using mononucleotides, you have to keep track of what the previous nucleotide was
                    C_prev = False
                    G_prev = False
                    for pos in sorted(tuples_mapping_dict[trans].keys()):
                        pp_number = 0
                        if high_CG:
                            if pos in high_CG[trans]:
                                pp_number = 1
                        current_C = False
                        current_G = False
                        current_tuple = tuples_mapping_dict[trans][pos]
                        if current_tuple[human_pos] != "*":
                            current_pp = pps[pp_number][current_tuple][node_pos]
                            #if current is C and previous was G
                            if float(current_pp[1]) > threshold:
                                if G_prev:
                                    anc_CG[trans].append(G_pos)
                                    anc_CG[trans].append(pos)
                                current_C = True
                            #if current is G and previous was C
                            if float(current_pp[2]) > threshold:
                                if C_prev:
                                    anc_CG[trans].append(C_pos)
                                    anc_CG[trans].append(pos)
                                current_G = True
                            C_prev = False
                            G_prev = False
                            if current_C:
                                C_prev = True
                                #you need to specify the position explicitly because it's not necessarily
                                #the last one if there were dashes
                                C_pos = pos
                            if current_G:
                                G_prev = True
                                G_pos = pos
            #a position can have been added more than once
            anc_CG[trans] = sorted(list(set(anc_CG[trans])))
        remove_file(pp_file)
        if anc_CG_file_name and anc_CG_file_name != "None":
            with open(anc_CG_file_name, "w") as file:
                for trans in anc_CG:
                    to_write = "\t".join([trans, ",".join([str(i) for i in anc_CG[trans]])])
                    file.write(to_write)
                    file.write("\n")
    else:
        #parse
        anc_CG = rw.read_many_fields(anc_CG_file_name, "\t")
        #skip lines that don't have exactly a transcript and a position field
        anc_CG = [i for i in anc_CG if len(i) == 2]
        anc_CG = list_to_dict(anc_CG, 0, 1)
        anc_CG = {trans: [int(pos) for pos in anc_CG[trans].split(",") if pos != ""] for trans in anc_CG}
    return anc_CG
def main():
    '''
    Read in a series of input files on the sequence specificities of RBPs, filter the data and write a set of motifs for each RBP.
    Arguments (see Methods for further details on the input data files):
    upper_threshold, lower_threshold: the longest and shortest a motif is allowed to be, respectively
    RBPDB_experiments: path to RBPDB experiments file
    RBPDB proteins: path to RBPDB proteins file
    RBPDB_PWMs: path to file containing RBPDB PWM identifier to RBP mapping
    pwm_dir: path to directory containing RBPDB PWMs
    RBPmap_PSSMs: path to directory containing RBPmap PSSMs
    SFmap_proteins: path to file containing motifs from SFmap
    RNAcompete_information: path to summary file from CIS-BP RNA
    RNAcompete_PWMs: path to directory containing CIS-BP RNA PWMs
    final_motifs_file_name: name for output file
    plot_name: file for plot displaying the distribution of motif set sizes
    species: the species for which motifs are required
    '''
    description = "Compile a set of motifs putatively recognized by RNA-binding proteins."
    args = parse_arguments(description, ["upper_threshold", "lower_threshold", "RBPDB_experiments", "RBPDB_proteins", "RBPDB_PWMs", "pwm_dir", "RBPmap_PSSMs", "SFmap_proteins", "RNAcompete_information", "RNAcompete_PWMs", "final_motifs_file_name", "plot_name", "species"], ints = [0, 1])
    upper_threshold = args.upper_threshold
    lower_threshold = args.lower_threshold
    RBPDB_experiments = args.RBPDB_experiments
    RBPDB_proteins = args.RBPDB_proteins
    RBPDB_PWMs = args.RBPDB_PWMs
    pwm_dir = args.pwm_dir
    RBPmap_PSSMs = args.RBPmap_PSSMs
    SFmap_proteins = args.SFmap_proteins
    RNAcompete_information = args.RNAcompete_information
    RNAcompete_PWMs = args.RNAcompete_PWMs
    final_motifs_file_name = args.final_motifs_file_name
    plot_name = args.plot_name
    species = args.species

    #the species labels are defined once so that every comparison below uses the same spelling
    #(one of the human checks used to be misspelled and therefore never matched)
    #NOTE(review): the human label is reproduced exactly as it appears in this file; it looks like a
    #masked spelling of the Latin name - confirm against the species column of the RBPDB proteins file
    human = "H**o sapiens"
    mouse = "Mus musculus"

    def count_unique(records, column):
        #number of distinct values found in the given column
        return len(set([record[column] for record in records]))

    db_fields = rw.read_many_fields(RBPDB_experiments, ",")
    #skip the header
    db_fields = db_fields[1:]
    print("There are {0} RBPDB experiments.".format(len(db_fields)))

    db_proteins = rw.read_many_fields(RBPDB_proteins, ",")
    #only keep the proteins of the requested species
    db_proteins = [i for i in db_proteins if i[6] == species]
    species_proteins = set([i[4] for i in db_proteins])
    db_fields = [i for i in db_fields if i[3] in species_proteins]
    protein_number_before = count_unique(db_fields, 3)
    print("{0} were performed in {1}.\n".format(len(db_fields), species))

    db_fields = [i for i in db_fields if i[2] != ""]
    protein_number_after = count_unique(db_fields, 3)
    #from here on, every record is [protein, source, column 0, column 1, motif]
    db_fields = [[i[3], "RBPDB", i[0], i[1], i[2]] for i in db_fields]
    print("After removing experiments with no reported motif, {0} proteins remain of the initial {1}.\n".format(protein_number_after, protein_number_before))

    bases = np.array(["A", "C", "G", "U"])
    db_pwm_list = rw.read_many_fields(RBPDB_PWMs, "\t")
    for i in db_pwm_list:
        if i[1] in species_proteins:
            current_file_name = "{0}/{1}.pwm".format(pwm_dir, i[0])
            current_PWM = rw.read_many_fields(current_file_name, delimiter = " ")
            current_PWM = [[float(k) for k in row if k != ""] for row in current_PWM]
            consensus = nc.consensus_from_PWM(current_PWM, bases, 0)
            #the second underscore-separated field of the PWM identifier is stored as the PMID
            PMID = i[0].split("_")[1]
            db_fields.append([i[1], "RBPDB_PWM", PMID, "SELEX", consensus])
    protein_number_after = count_unique(db_fields, 0)
    print("After adding additional sequences from SELEX PWMs (RBPDB), there are {0} proteins.\n".format(protein_number_after))

    if species == mouse:
        RBPmap_proteins = rw.read_many_fields("RBP/RBPmap_proteins.csv", ",")
        RBPmap_proteins = list_to_dict(RBPmap_proteins, 0, 1)
        #proteins whose RBPmap entry mentions this identifier are skipped below
        RNAc_source = [i for i in RBPmap_proteins if "23846655" in RBPmap_proteins[i]]
    else:
        RNAc_source = []
    for file_name in os.listdir(RBPmap_PSSMs):
        #RBPmap and SFmap don't distinguish between human and mouse motifs
        if "human" in file_name:
            protein_name = file_name.split("_")[0]
            if protein_name not in RNAc_source:
                initial_pssm = rw.read_many_fields(os.path.join(RBPmap_PSSMs, file_name), delimiter = "\t")
                #drop the first row and the first column before converting to numbers
                current_pssm = [[float(j) for j in row[1:]] for row in initial_pssm[1:]]
                consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
                #SRSF... names are stored as SFRS...
                if protein_name[:4] == "SRSF":
                    protein_name = "SFRS" + protein_name[4:]
                db_fields.append([protein_name, "RBPmap_PWM", "NULL", "various", consensus])
    protein_number_after = count_unique(db_fields, 0)
    print("After adding additional sequences from RBPmap PSSMs, there are {0} proteins.\n".format(protein_number_after))

    SFmap_data = rw.read_many_fields(SFmap_proteins, delimiter = ",")
    for i in SFmap_data:
        if "," in i[1]:
            #several motifs: separate them with semicolons like in the other sources
            temp_split = [j.upper() for j in i[1].split(", ")]
            i[1] = ";".join(temp_split)
        else:
            i[1] = i[1].upper()
        db_fields.append([i[0], "SFmap", "NULL", "various", i[1]])
    protein_number_after = count_unique(db_fields, 0)
    print("After adding motifs from SFmap, there are {0} proteins.\n".format(protein_number_after))

    RNAc = rw.read_many_fields(RNAcompete_information, delimiter = "\t")
    #skip the header and empty lines
    RNAc = [i for i in RNAc[1:] if i]
    if species == human:
        RNAc = [i for i in RNAc if i[3] != "." and i[8] == "D"]
    if species == mouse:
        RNAc = [i for i in RNAc if i[3] != "."]
    PSSM_folder = RNAcompete_PWMs
    for record in RNAc:
        motif_name = record[3]
        initial_pssm = rw.read_many_fields(os.path.join(PSSM_folder, "{0}.txt".format(motif_name)), delimiter = "\t")
        if initial_pssm == []:
            #empty PSSMs are only reported if they don't come from the RBPDB paper
            if record[19] != "21036867":
                print(record)
        else:
            current_pssm = [[float(j) for j in row[1:]] for row in initial_pssm[1:]]
            consensus = nc.consensus_from_PWM(current_pssm, bases, 0.25, transform = True)
            db_fields.append([record[6], "CIS-BP_RNA_PWM", record[19], record[14], consensus])
    protein_number_after = count_unique(db_fields, 0)
    print("After adding motifs from CIS-BP RNA, there are {0} proteins.\n".format(protein_number_after))

    #split multi-motif records, trim flanking Ns and filter by length
    to_delete = set()
    #records are appended to db_fields while it is being looped over: the appended
    #records are visited later in this same loop and go through the same checks
    for pos, i in enumerate(db_fields):
        if ";" in i[4]:
            if "; " in i[4]:
                temp_split = i[4].split("; ")
            else:
                temp_split = i[4].split(";")
            temp_split = [((j.upper()).lstrip("N")).rstrip("N") for j in temp_split]
            temp_split = [j for j in temp_split if len(j) <= upper_threshold and len(j) >= lower_threshold and "(" not in j]
            if temp_split:
                #the first motif stays in the current record, the others get records of their own
                db_fields[pos][4] = temp_split[0]
                for j in temp_split[1:]:
                    db_fields.append([i[0], i[1], i[2], i[3], j])
            else:
                to_delete.add(pos)
        else:
            i[4] = (((i[4]).upper()).rstrip("N")).lstrip("N")
            if len(i[4]) > upper_threshold or len(i[4]) < lower_threshold or "(" in i[4]:
                to_delete.add(pos)
    db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete]
    protein_number_after = count_unique(db_fields, 0)
    print("After only keeping motifs of length {0}-{1} bp, {2} proteins remain.\n".format(lower_threshold, upper_threshold, protein_number_after))

    protein_names = list(set([i[0] for i in db_fields]))
    if species == mouse:
        protein_names_file = "RBP/RBP_names_for_checking.txt"
        with open(protein_names_file, "w") as file:
            for name in protein_names:
                file.write("{0}\n".format(name))
        MGI_file = "RBP/MGI_correspondances.txt"
        MGI = rw.read_many_fields(MGI_file, "\t")
        MGI_names_all = [i[0] for i in MGI[1:]]
        #names for which column 3 is identical to the input name
        found = [i[0] for i in MGI if i[0] == i[3]]
        MGI = {i[0]: i[3] for i in MGI[1:] if i[0] not in found}

    #exact-name replacements for human; the BRUNOL and SFRS prefix rules are handled separately below
    human_aliases = {
        "A2BP1": "RBFOX1",
        "FOX1": "RBFOX1",
        "SFRS13A": "SRSF10",
        "CUGBP": "CELF1",
        "Fusip1": "SRSF10",
        "HuR": "ELAVL1",
        "MBNL": "MBNL1",
        "PTB": "PTBP1",
        "QK1": "QKI",
        "RBM9": "RBFOX2",
        "STAR-PAP": "TUT1",
        "YB-1": "YBX1",
        "hnRNPK": "HNRNPK",
        "hnRNPLL": "HNRNPLL",
        "HNRPLL": "HNRNPLL",
    }
    to_delete = set()
    for pos, i in enumerate(db_fields):
        if species == mouse:
            #first letter in upper case, the rest in lower case
            i[0] = "".join([i[0][0].upper(), i[0][1:].lower()])
            #will get rid of Hnrnpcl1, which didn't return anything in the MGI search.
            if i[0] not in MGI_names_all:
                to_delete.add(pos)
            elif i[0] not in found:
                i[0] = MGI[i[0]]
        elif species == human:
            name = i[0]
            if name in human_aliases:
                i[0] = human_aliases[name]
            elif name[:6] == "BRUNOL":
                i[0] = "CELF{0}".format(name[-1])
            elif name[:4] == "SFRS":
                i[0] = "SRSF{0}".format(name[4:])
    db_fields = [i for pos, i in enumerate(db_fields) if pos not in to_delete]
    protein_number_after = count_unique(db_fields, 0)
    print("After cleaning up protein IDs, {0} proteins remain.\n".format(protein_number_after))

    #group the records by protein
    protein_dict = {}
    for i in db_fields:
        protein_dict.setdefault(i[0], []).append(i)

    #pop rather than del so that a protein that is already absent doesn't abort the run
    if species == human:
        for name in ["PPIE", "MIR1236", "PABPC4"]:
            protein_dict.pop(name, None)
        print("After removing PPIE, PABPC4 and MIR1236, {0} proteins remain.\n".format(len(protein_dict)))
    elif species == mouse:
        protein_dict.pop("Pabpc4", None)
        print("After removing Pabpc4, {0} proteins remain.\n".format(len(protein_dict)))

    for i in protein_dict:
        if i == "ELAVL1":
            protein_dict[i].append(['ELAVL1', 'synthetic', 'synthetic', 'synthetic', 'UUWGDUU'])
        elif i == "ELAVL2":
            protein_dict[i].append(['ELAVL2', 'synthetic', 'synthetic', 'synthetic', 'RWUUYAUUUWR'])
        #sort by motif so that identical motifs end up next to each other
        records = sorted(protein_dict[i], key = lambda x:x[4])
        to_delete = set()
        for j in range(1, len(records)):
            if records[j][4] == records[j - 1][4]:
                #merge the source information of the earlier duplicate into the later one
                for k in range(1, 4):
                    records[j][k] = ",".join([records[j][k], records[j - 1][k]])
                to_delete.add(j - 1)
        protein_dict[i] = [record for j, record in enumerate(records) if j not in to_delete]

    #move the motif to the second column
    for i in protein_dict:
        protein_dict[i] = [[j[0], j[4], j[1], j[2], j[3]] for j in protein_dict[i]]

    print("\n")
    print("Writing motifs to {0}.\n".format(final_motifs_file_name))
    motif_numbers = []
    with open(final_motifs_file_name, "w") as final_motifs_file:
        for i in sorted(list(protein_dict.keys())):
            final_motifs_file.write(">{0}\n".format(i))
            current_motifs = [j[1] for j in protein_dict[i]]
            DNA_motifs = [nc.DNA_RNA_conversion(j) for j in current_motifs]
            unravelled_motifs = [nc.unravel_consensus(j) for j in DNA_motifs]
            unravelled_motifs = flatten(unravelled_motifs)
            #sorted so that the output file is identical from run to run
            unravelled_motifs = sorted(set(unravelled_motifs))
            print("Writing {0} motifs for {1}.".format(len(unravelled_motifs), i))
            motif_numbers.append(len(unravelled_motifs))
            unravelled_motifs = "|".join(unravelled_motifs)
            final_motifs_file.write("{0}\n".format(unravelled_motifs))

    plt.figure(1)
    plotting.histogram(motif_numbers, 50, x_lab = "Motif number", y_lab = "Frequency", title = None)
    plotting.save_and_show([10, 10], 100, plot_name)
def main():
    """
    Generate a NET-seq control set with the same -2:2 nucleotide distribution as the true set.

    Command-line arguments (parsed with hk.parse_arguments):
    active_genes_file: tab-delimited file with a header; the fourth column is used to
        select the transcripts to keep
    gtf: GTF file from which transcript coordinates are extracted
    PolII_file: BED file with the reads
    fasta: genome FASTA (sequence names without the "chr" prefix)
    outfile: BED file for the control positions
    chrom_sizes: tab-delimited file with chromosome names and sizes
    """
    # the two literals are concatenated, hence the trailing space in the first one
    description = "Generate a NET-seq control set that would have the same distribution of -2:2 nucleotides " \
                  "as the true set."
    args = hk.parse_arguments(description, [
        "active_genes_file", "gtf", "PolII_file", "fasta", "outfile",
        "chrom_sizes"
    ])
    active_genes_file, gtf, PolII_file, fasta, outfile, chrom_sizes = args.active_genes_file, args.gtf, args.PolII_file, args.fasta, args.outfile, args.chrom_sizes

    chrom_sizes = rw.read_many_fields(chrom_sizes, delimiter="\t")
    chrom_sizes = hk.list_to_dict(chrom_sizes, 0, 1, intify=True)

    # get transcriptionally active genes and make a BED file with their coordinates
    print("Getting the coordinates of transcriptionally active genes...")
    trans_active_genes = rw.read_many_fields(active_genes_file, "\t")[1:]
    # a set, because membership is tested once per transcript below
    trans_active_genes = {i[3] for i in trans_active_genes}
    transcripts_file = "{0}_transcripts_all.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    transcripts_dict = {}
    # this will be used for getting the k-mers in the transcripts
    filtered_transcripts_file_plus3 = "{0}_trans_act_only_plus3.bed".format(
        transcripts_file[:-4])
    # this will be used for filtering the reads
    filtered_transcripts_file = "{0}_trans_act_only.bed".format(
        transcripts_file[:-4])
    with open(filtered_transcripts_file, "w") as ft_file, open(transcripts_file) as t_file, open(
            filtered_transcripts_file_plus3, "w") as ft_file3:
        reader = csv.reader(t_file, delimiter="\t")
        writer = csv.writer(ft_file, delimiter="\t")
        writer3 = csv.writer(ft_file3, delimiter="\t")
        for line in reader:
            if line[3] in trans_active_genes:
                writer.writerow(line)
                # this is because if a read falls at the first position, you will need to know the
                # preceding two bases. Same if it falls at the last position.
                line[1] = str((int(line[1])) - 3)
                line[2] = str((int(line[2])) + 3)
                writer3.writerow(line)
                # note that it is the extended coordinates that are stored
                transcripts_dict[line[3]] = line

    print("Filtering reads to the transcripts...")
    # filter reads to only ones that overlap these transcripts
    transcripts_PolII = "{0}_transcripts.bed".format(PolII_file[:-4])
    co.intersect_bed(PolII_file,
                     filtered_transcripts_file,
                     force_strand=True,
                     output_file=transcripts_PolII)

    print("Extracting FASTA from the transcript coordinates...")
    # the genome FASTA is formatted as N rather than chrN
    filtered_transcripts_file_no_chr = "{0}_trans_act_only_plus3_no_chr.bed".format(
        transcripts_file[:-4])
    hk.run_process(["sed", "s/^chr//", filtered_transcripts_file_plus3],
                   file_for_output=filtered_transcripts_file_no_chr)
    filtered_transcripts_fasta_no_chr = "{0}_trans_act_only_plus3.fasta".format(
        transcripts_file[:-4])
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed",
        filtered_transcripts_file_no_chr, "-fo",
        filtered_transcripts_fasta_no_chr, "-s", "-name"
    ])

    print("Mapping kmers to transcript positions...")
    kmer_dict = map_kmers_to_positions(filtered_transcripts_fasta_no_chr,
                                       k=6,
                                       focal_pos=3)

    print("Extracting the starting dinucleotide for each read...")
    starting_dints_PolII = "{0}_transcripts_starting_6mers.bed".format(
        PolII_file[:-4])
    starting_dints_PolII_fasta = "{0}_transcripts_starting_6mers.fasta".format(
        PolII_file[:-4])
    co.extend_intervals(transcripts_PolII,
                        starting_dints_PolII,
                        3,
                        3,
                        remove_chr=True)
    hk.run_process([
        "bedtools", "getfasta", "-fi", fasta, "-bed", starting_dints_PolII,
        "-fo", starting_dints_PolII_fasta, "-s"
    ])

    print("Picking random control positions...")
    pick_random_positions(transcripts_PolII,
                          starting_dints_PolII_fasta,
                          outfile,
                          kmer_dict,
                          transcripts_dict,
                          chrom_sizes=chrom_sizes)

    print("Making single nucleotide resolution file...")
    snr_file = "{0}_snr.bed".format(outfile[:-4])
    co.snr_bed(outfile, snr_file)

    print(
        "Removing reads that overlap potential splice intermediate positions..."
    )
    no_si_snr_file = "{0}_snr_no_si.bed".format(outfile[:-4])
    # the output file name used to be computed but never handed to intersect_bed
    co.intersect_bed(snr_file,
                     "data/Genomes/GTFs/dm6/dmel-all-r6.18_exon_ends_chr.gtf",
                     force_strand=True,
                     exclude=True,
                     no_dups=False,
                     output_file=no_si_snr_file)
def _add_SNPs_from_file(SNPs_file, controls, SNPs, to_remove_all):
    '''
    Read one SNP file and add the transcripts that appear in controls to SNPs and to_remove_all (both modified in place).

    Each row of the tab-delimited file is used as: transcript, "|"-separated SNP records
    (comma-separated fields; field 0 is stored as the integer key and field 3 as the
    integer value), comma-separated positions to remove.
    '''
    current_SNPs = rw.read_many_fields(SNPs_file, "\t")
    to_remove = list_to_dict(current_SNPs, 0, 2)
    to_remove = {i: to_remove[i].split(",") for i in to_remove}
    current_SNPs = list_to_dict(current_SNPs, 0, 1)
    for trans in current_SNPs:
        if trans in controls:
            SNPs[trans] = {}
            trans_SNPs = current_SNPs[trans]
            if trans_SNPs:
                trans_SNPs = [i.split(",") for i in trans_SNPs.split("|")]
                #this is where you get the allele count
                trans_SNPs = list_to_dict(trans_SNPs, 0, 3)
                trans_SNPs = {int(i): int(trans_SNPs[i]) for i in trans_SNPs}
                SNPs[trans] = trans_SNPs
            to_remove_all[trans] = [int(i) for i in to_remove[trans] if i not in ["error", ""]]


def main():
    '''
    Prepare input file for running MultiDFEest.

    Writes three lines to output_file: N, the space-separated SFS for the hit
    positions and the space-separated SFS for the control positions.
    '''
    description = "Prepare input file for running MultiDFEest."
    args = parse_arguments(description, [
        "hit_file", "control_file", "SNPs_file_prefix", "N", "output_file",
        "per_chrom_files", "shuffle"
    ], ints=[3], flags=[5, 6])
    hit_file, control_file, SNPs_file_prefix, N, output_file, per_chrom_files, shuffle = args.hit_file, args.control_file, args.SNPs_file_prefix, args.N, args.output_file, args.per_chrom_files, args.shuffle

    hits = parse_pos(hit_file)
    controls = parse_pos(control_file)
    if shuffle:
        hits, controls = shuffle_dictionaries(hits, controls)

    SNPs = {}
    to_remove_all = {}
    #if the data is stored chromosome by chromosome, rather than all combined
    if per_chrom_files:
        for chrom in range(1, 23):
            SNPs_file = "{0}{1}.bed".format(SNPs_file_prefix, str(chrom))
            try:
                _add_SNPs_from_file(SNPs_file, controls, SNPs, to_remove_all)
            except FileNotFoundError:
                #chromosomes without a file are skipped
                pass
    else:
        #the prefix is the full name of the single combined file
        _add_SNPs_from_file(SNPs_file_prefix, controls, SNPs, to_remove_all)

    hit_SFS = get_SFS(hits, SNPs, to_remove_all, N)
    control_SFS = get_SFS(controls, SNPs, to_remove_all, N)
    with open(output_file, "w") as file:
        file.write("{0}\n".format(N))
        file.write(" ".join([str(i) for i in hit_SFS]))
        file.write("\n")
        file.write(" ".join([str(i) for i in control_SFS]))
        file.write("\n")
def main():
    """
    Aggregate various statistics on the splicing events you're studying.

    Builds a table with one row per junction (the fourth column of the exon start
    coordinates file) and one column per statistic, and writes it to output_file
    as tab-delimited text. The first row holds the column names.
    """
    description = "Aggregate various statistics on the splicing events you're studying."
    args = hk.parse_arguments(description, [
        "gtf", "polII_bed", "exon_start_coords", "truncated_exons_file",
        "genome_file", "output_file"
    ])
    gtf = args.gtf
    polII_bed = args.polII_bed
    exon_start_coords = args.exon_start_coords
    truncated_exons_file = args.truncated_exons_file
    genome_file = args.genome_file
    output_file = args.output_file

    CDSs = rw.read_gtf(gtf, "CDS", gene=False)
    exons = rw.read_gtf(gtf, "exon", gene=False)
    exon_starts = rw.read_many_fields(exon_start_coords,
                                      skip_header=False,
                                      delimiter="\t")
    # index the records by the junction identifier
    exon_starts = {record[3]: record for record in exon_starts}

    # a single column with the sorted junction identifiers, topped with a header cell
    out_array = np.array(sorted(exon_starts.keys()), dtype="str")
    out_array.shape = (len(exon_starts.keys()), 1)
    out_array = np.vstack((["junction"], out_array))

    #1. exon size
    exon_sizes = co.get_lengths(CDSs, exon_starts.keys())
    out_array = add_to_array(out_array, exon_sizes, "exon_size")
    print("Exon size done.")

    #2. exon number
    exon_numbers = co.get_exon_number(exons, exon_starts.keys())
    out_array = add_to_array(out_array, exon_numbers, "exon_number")
    print("Exon number done.")

    #3. exon rank (from start and end)
    exon_rank_start, exon_rank_end = co.get_exon_rank(exons, exon_starts)
    out_array = add_to_array(out_array, exon_rank_start, "exon_rank_from_start")
    out_array = add_to_array(out_array, exon_rank_end, "exon_rank_from_end")
    print("Exon rank done.")

    #4. upstream intron size
    upstream_sizes = co.get_upstream_intron_size(exons, exon_rank_start)
    out_array = add_to_array(out_array, upstream_sizes, "upstream_intron_size")
    downstream_sizes = co.get_upstream_intron_size(exons,
                                                   exon_rank_start,
                                                   downstream=True)
    out_array = add_to_array(out_array, downstream_sizes, "downstream_intron_size")
    print("Intron size done.")

    if truncated_exons_file != "None":
        #5. Pol II density per transcript
        dens_per_trans_file = "{0}_dens_per_trans.txt".format(polII_bed[:-4])
        dens_per_trans_junctions = get_dens_per_trans(truncated_exons_file,
                                                      polII_bed,
                                                      dens_per_trans_file,
                                                      out_array[1:, 0])
        out_array = add_to_array(out_array, dens_per_trans_junctions,
                                 "polII_dens_per_trans")
        print("Pol II density done.")

    #6. exon GC4 and GC content
    genome = Fasta(genome_file)
    exon_GC4 = get_exon_GC4(CDSs, exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, exon_GC4, "exon_GC4")
    exon_GC = get_exon_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, exon_GC, "exon_GC")
    print("Exon GC done.")

    #7. upstream intron GC content
    intron_GC = get_upstream_intron_GC(exons, exon_rank_start, genome)
    out_array = add_to_array(out_array, intron_GC, "upstream_intron_GC")
    print("Intron GC done.")

    #8. splice site strength
    # (column name, upstream, five, number of intronic bases) for each call
    ss_settings = [
        ("upstream_5ss_strength", True, True, 6),
        ("upstream_3ss_strength", True, False, 20),
        ("downstream_5ss_strength", False, True, 6),
    ]
    for column_name, upstream, five, intronic in ss_settings:
        strengths = nc.get_ss_strength(exons,
                                       genome_file,
                                       upstream=upstream,
                                       five=five,
                                       exonic=3,
                                       intronic=intronic)
        out_array = add_to_array(out_array, strengths, column_name)
    print("Splice site strength done.")

    with open(output_file, "w") as file:
        for row_index in range(0, out_array.shape[0]):
            row = out_array[row_index, :]
            file.write("\t".join([str(cell) for cell in row]))
            file.write("\n")
def main():
    '''
    Prepare a clean dataset of protein-coding genes.

    Steps: collect the transcript IDs of protein-coding genes from the GTF, keep those
    with an intact ORF, keep the longest transcript per gene, keep genes that have an
    ortholog in the comparator species and that pass the conservation filter, then
    write the ortholog pairs ("general/<genome>_<ortholog_genome>_pc_pairs.csv"), the
    retained ORF sequences and a filtered dataset.
    '''
    parser = argparse.ArgumentParser(description="Prepare a clean dataset of protein-coding genes.")
    parser.add_argument("features_file_name", type = str, help = "name of GTF file with genome features")
    parser.add_argument("ortholog_features_file_name", type = str, help = "name of GTF file with genome features for the orthologous genome")
    parser.add_argument("genome", type = str, help = "genome assembly name")
    parser.add_argument("ortholog_genome", type = str, help = "ortholog genome assembly name")
    parser.add_argument("dataset_name", type = str, help = "dataset name")
    parser.add_argument("ortholog_dataset_name", type = str, help = "ortholog dataset name")
    parser.add_argument("orthologs_file_name", type = str, help = "csv with orthologous pairs")
    #the help text used to be a copy of the one for orthologs_file_name
    parser.add_argument("dS_threshold", type = float, help = "dS threshold for the conservation filter")
    parser.add_argument("alignment_folder", type = str, help = "folder where phy alignment files will be stored")
    parser.add_argument("raw_orth_seq_file", type = str, help = "file with the raw ortholog CDS sequences (downloaded via ensembl biomart)")
    args = parser.parse_args()
    features_file_name = args.features_file_name
    ortholog_features_file_name = args.ortholog_features_file_name
    genome = args.genome
    ortholog_genome = args.ortholog_genome
    dataset_name = args.dataset_name
    ortholog_dataset_name = args.ortholog_dataset_name
    orthologs_file_name = args.orthologs_file_name
    dS_threshold = args.dS_threshold
    alignment_folder = args.alignment_folder
    raw_orth_seq_file = args.raw_orth_seq_file

    make_dir(alignment_folder)

    #raw string, so that \w and \d reach the regex engine rather than being (invalid) string escapes
    trans_id_pattern = re.compile(r"ENS\w*T\d*")
    ids_to_keep = []
    #loop over an ensembl GTF file
    with open(features_file_name) as features_file:
        #skip the metadata
        for i in range(5):
            features_file.readline()
        for i in features_file:
            #only consider features that have been localized to chromosomes and that are from protein-coding genes
            if "PATCH" not in i and "gene_biotype \"protein_coding\"" in i and i[0] in "123456789XY" and i[1] in "0123456789XY\t":
                trans_id_obj = re.search(trans_id_pattern, i)
                if trans_id_obj:
                    trans_id = trans_id_obj.group(0)
                    #store the transcript ID
                    ids_to_keep.append(trans_id)

    #make a list of the unique transcript IDs you got in the previous step
    ids_to_keep = list(set(ids_to_keep))

    #create a feature set object from the transcript IDs,
    #that is to say, make a file that has all the associated gene feature annotations
    fs = Feature_Set(features_file_name, genome)
    #the dataset only needs to be created if it didn't exist previously
##    fs.create_dataset(dataset_name, input_list = ids_to_keep)
    fs.set_dataset(dataset_name)
    print("Created dataset with {0} transcripts.".format(len(fs.names)))

    #this file will have the mappings between genes from the focal species and genes from the orthologus species
    final_pairs_file_name = "general/{0}_{1}_pc_pairs.csv".format(genome, ortholog_genome)

    CDS = fs.get_CDS()
    CDS = {i: CDS[i] for i in CDS if CDS[i]}
    #write the full ORF sequences of the genes to FASTA, filtering based on reading frame integrity. Also check that
    #there are no premature termination codons.
    fs.write_full_CDS(CDS, check_ORF = True, bare_name = True, PTC_check = True)
    full_CDS_file = "{0}_{1}_full_CDS.fasta".format(fs.features_file_name[:-4], fs.dataset)
    ids_to_keep = rw.read_fasta(full_CDS_file)[0]
    print("{0} transcripts pass the check for ORF integrity.".format(len(ids_to_keep)))

    transcripts = fs.get_transcripts()
    transcripts = {i: transcripts[i] for i in ids_to_keep}
    #for genes with several associated transcript IDs, only keep the longest.
    gene_name_dict = fs.get_gene_name_dict(transcripts)
    ids_to_keep = []
    for gene in gene_name_dict:
        current_CDS = [CDS[j] for j in gene_name_dict[gene]]
        current_lengths = [sum([j[0][3] - j[0][2] + 1 for j in k]) for k in current_CDS]
        id_to_keep = gene_name_dict[gene][current_lengths.index(max(current_lengths))]
        ids_to_keep.append(id_to_keep)
    print("After only keeping one transcript per gene (the longest), {0} transcripts remain.".format(len(ids_to_keep)))

    #this is a file that has the orthologs of your gens from Ensmebl biomart
    orth_data = rw.read_many_fields(orthologs_file_name, ",")
    #make a dictionary for the gene-to-ortholog mapping
    pairs_dict = {}
    for line in orth_data:
        pairs_dict.setdefault(line[1], []).append(line[2])

    #only keep genes for which there is an ortholog in the comparator species
    #transcript identifiers
    ids_to_keep = [i for i in ids_to_keep if i in pairs_dict]
    #gene identifiers
    orth_ids_to_keep = list(pairs_dict.values())
    orth_ids_to_keep = list(set(flatten(orth_ids_to_keep)))

    #create a feature set for the other species based on the genes that are orthologous to the genes in your focal set
    orth_fs = Feature_Set(ortholog_features_file_name, ortholog_genome)
##    orth_fs.create_dataset(ortholog_dataset_name, input_list = orth_ids_to_keep, input_type = "gene")
    orth_fs.set_dataset(ortholog_dataset_name)
    orth_CDS = orth_fs.get_CDS()
    orth_CDS = {i: orth_CDS[i] for i in orth_CDS if orth_CDS[i]}
    #write the ortholog ORFs to FASTA. Filter based on reading frame integrity and PTC content.
    orth_fs.write_full_CDS(orth_CDS, check_ORF = True, bare_name = True, PTC_check = True)
    orth_full_CDS_file = "{0}_{1}_full_CDS.fasta".format(ortholog_features_file_name[:-4], ortholog_dataset_name)

    #in some cases, if the genome assembly for the ortholog is not very good, it can take forever to get the sequences using faidx.
    #In that case, you can get the sequences via biomart. Uncomment the code below!
##    rw.write_names(list(orth_CDS.keys()), "general/{0}_trans_IDs.txt".format(ortholog_dataset_name))
##    with open(raw_orth_seq_file) as file:
##        raw_orth_seq = "".join(file)
##    raw_orth_seq = re.sub("([A-Z])\n([A-Z])", "\\1\\2", raw_orth_seq)
##    raw_orth_seq = raw_orth_seq.split("\n")
##    raw_orth_seq = [i for i in raw_orth_seq if len(i) > 0]
##    raw_orth_names = [i for i in raw_orth_seq if i[0] == ">"]
##    raw_orth_seq = [i for i in raw_orth_seq if i[0] != ">"]
##    with open(orth_full_CDS_file, "w") as file:
##        for pos, seq in enumerate(raw_orth_seq):
##            ORF_check = check_ORF_integrity(seq, PTC_check = True)
##            if ORF_check[0]:
##                file.write("{0}\n".format(raw_orth_names[pos]))
##                file.write("{0}\n".format(seq))
##            else:
##                print(pos)
##                print(ORF_check[1])
##                print(raw_orth_names[pos])
##                print(seq)
##                print("\n")

    #read in the full ORF sequences from both species
    CDS_names, CDS_seq = rw.read_fasta(full_CDS_file)
    orth_CDS_names, orth_CDS_seq = rw.read_fasta(orth_full_CDS_file)
    #name-to-sequence lookups, so that the lists don't have to be searched once per transcript.
    #setdefault keeps the first sequence if a name occurs more than once, like list.index would.
    seq_by_name = {}
    for name, seq in zip(CDS_names, CDS_seq):
        seq_by_name.setdefault(name, seq)
    orth_seq_by_name = {}
    for name, seq in zip(orth_CDS_names, orth_CDS_seq):
        orth_seq_by_name.setdefault(name, seq)

    orth_transcripts = orth_fs.get_transcripts()
    orth_gene_name_dict = orth_fs.get_gene_name_dict(orth_transcripts)

    final_pairs = {}
    counter = 0
    #loop over the remaining genes
    for i in ids_to_keep:
        #progress report
        if counter%1000 == 0:
            print(counter)
        counter = counter + 1
        #get the IDs of the orthologous genes in the ortholog species
        orth_ids = pairs_dict[i]
        #get all the associated transcript identifiers
        orth_ids_trans = [list(orth_gene_name_dict[j]) for j in orth_ids if j in orth_gene_name_dict]
        orth_ids_trans = flatten(orth_ids_trans)
        focal_seq = seq_by_name[i]
        #get all the ortholog ORF sequences.
        #some of the transcripts produced from the gene might be non-coding or have a wonky ORF and
        #therefore not appear in the CDS fasta: those are dropped
        orth_ids_trans = [j for j in orth_ids_trans if j in orth_seq_by_name]
        orth_seqs = [orth_seq_by_name[j] for j in orth_ids_trans]
        #check that the sequence from the focal species aligns to an ortholog with dN/dS below 0.5 and dS below the specified threshold
        if orth_ids_trans:
            conservation_check = keep_conserved_pc(i, orth_ids_trans, focal_seq, orth_seqs, dS_threshold, alignment_folder)
            if conservation_check[0]:
                #also store which ortholog transcript gave the lowest dS in the alignment
                final_pairs[i] = conservation_check[1]
    print("After filtering by conservation, {0} transcripts remain.".format(len(final_pairs)))

    #write the final retained ortholog gene pairs to file
    #(newline = "" is what the csv module asks for when writing)
    with open(final_pairs_file_name, "w", newline = "") as file:
        output_writer = csv.writer(file, delimiter = ",")
        for i in final_pairs:
            output_writer.writerow([i, final_pairs[i]])
    print("Wrote ortholog pairs to {0}.".format(final_pairs_file_name))

    #write the remaining ORF sequences to fasta
    CDS_seq = [i for pos, i in enumerate(CDS_seq) if CDS_names[pos] in final_pairs]
    CDS_names = [i for i in CDS_names if i in final_pairs]
    rw.write_to_fasta(CDS_names, CDS_seq, "general/filtered_{0}_wo_low_omega.fasta".format(dataset_name))

    #create a feature set with the remaining genes
    filtered_fs = Feature_Set(features_file_name, genome)
    filtered_fs.create_dataset("filtered_{0}".format(dataset_name), input_list = list(final_pairs.keys()))
    print("All done.")
def main():
    '''
    Record splicing distance.

    Command-line arguments (parsed via hk.parse_arguments):
        input_file: path to the reads file. The last four characters are
            treated as the extension; the matching ".bed" file is used as the
            read source (the conversion call is commented out, so that BED file
            is presumably expected to exist already).
        gtf: GTF file from which "CDS" features are read.
        output_folder: folder that receives the per-sample output files.
        trans_active_file: tab-separated file with one header line; column 4
            (index 3) is read as the transcript ID to keep.
        window_size (int): window passed to write_exon_starts/write_dist_mat.
        intron_window_size (int): window passed to write_intron_starts.
        outsuffix: suffix inserted into output names; the literal string
            "None" means no suffix.
        leave_terminal (flag): if not set, the last exon of every transcript
            is dropped; if set, output names carry "_with_terminal".

    Writes a series of BED/text files; returns nothing.
    '''
    description = "Record splicing distance."
    args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7])
    input_file = args.input_file
    gtf = args.gtf
    output_folder = args.output_folder
    trans_active_file = args.trans_active_file
    window_size = args.window_size
    intron_window_size = args.intron_window_size
    outsuffix = args.outsuffix
    leave_terminal = args.leave_terminal

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    # file name without directory and without its 4-character extension
    input_stem = bare_input_path[:-4]

    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, input_stem)
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_rows = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs; a set keeps the filter below
    # linear instead of scanning a list once per transcript
    trans_active_genes = {row[3] for row in trans_active_rows}
    exons = {i: exons[i] for i in exons if i in trans_active_genes}

    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""

    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)
    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, input_stem, intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)
    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, input_stem, window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)
    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)

    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, input_stem, outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True, output_file=exon_junction_bed, force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    sr_distances = {}
    ur_distances = {}
    found_count = 0
    file_size = hk.line_count(exon_junction_bed)
    # will store all the intron names for which there are
    # either spliced or unspliced reads (in order of first appearance)
    valid_junctions = []
    # companion set so the per-read membership test is O(1)
    seen_junctions = set()
    # stays at -1 if the intersect file is empty, so the read total below is 0
    pos = -1
    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")
            # the last field keeps its newline, so re-joined lines are written as-is
            line = line.split("\t")
            # reads that end at the last nucleotide of an exon
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]
            if intermediate_read:
                continue
            # check that it ends within the exon just downstream of
            # the 3' ss that is being analyzed
            if not NGS.check_position_in_exon(line, exons):
                continue
            # 'spliced', 'unspliced' or 'None' (=can't analyze)
            read_type = NGS.analyze_cigar(line, overhang = 5)
            if not read_type:
                continue
            if intron_name not in seen_junctions:
                seen_junctions.add(intron_name)
                valid_junctions.append(intron_name)
            splice_dist = NGS.get_splice_dist(line)
            if read_type == "S":
                sfile.write("\t".join([str(i) for i in line]))
                found_count = found_count + 1
                sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
            else:
                ufile.write("\t".join([str(i) for i in line]))
                ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    total_reads = pos + 1
    if total_reads:
        print("Proportion of spliced reads: {0}.".format(found_count/total_reads))
    else:
        # previously this case crashed with a NameError on the loop variable
        print("Proportion of spliced reads: NA (no reads in {0}).".format(exon_junction_bed))

    # for each valid junction, calculate the length of the exonic sequence
    # afterwards, so that you wouldn't consider intronic sequence in the distance
    # matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, input_stem, outsuffix, terminal_suff))
    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, input_stem, window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, input_stem, outsuffix, terminal_suff))
def mDFEest(model, input_file, n_spikes = None, repetitions = None, fold_SFS = True, pop_change = False, seed = None):
    '''
    Wraps call to multiDFEest.

    model: one of "lognormal", "gamma", "beta", "spikes", "steps" or
        "six_spikes".
    input_file: path to the input file; it is copied into ../multidfe
        unless a file with the same name is already there.
    n_spikes: number of spikes/steps; required for "spikes" and "steps".
    repetitions: value passed after -ranrep for the spikes/steps/six_spikes
        models.
    fold_SFS: passed as -sfsfold 1 if truthy, otherwise 0.
    pop_change: passed (inverted) as -conpop, see comment below.
    seed: if truthy, "GSL_RNG_SEED=<seed>" is prepended to the command.

    Returns a dictionary with the "key:value" fields parsed from the first
    line of the <input>.MAXL.out file (values as floats), plus "AIC" and
    "model".
    Raises ValueError (a subclass of Exception) for an unknown model name or
    a missing n_spikes.
    '''
    sfs_fold_code = 1 if fold_SFS else 0
    #this looks weird but is normal: this value will be the value of conpop in the multiDFE call, meaning it'll be 1 with constant population size
    conpop_code = 0 if pop_change else 1

    #convert the English distribution names into multiDFEest model codes
    #(model code, parameter number for calculating AIC)
    two_parameter_models = {"lognormal": (4, 2), "gamma": (2, 2), "beta": (3, 2)}
    flags = []
    if model in two_parameter_models:
        model_code, par_number = two_parameter_models[model]
    elif model in ("spikes", "steps"):
        model_code = 0 if model == "spikes" else 1
        if not n_spikes:
            raise ValueError("To be able to use a {0} model, you need to specify the number of {0}.".format(model))
        par_number = (2 * n_spikes) - 1
        flags = ["-ranrep", repetitions, "-nspikes", n_spikes]
    elif model == "six_spikes":
        model_code = 5
        par_number = 5
        flags = ["-ranrep", repetitions]
    else:
        raise ValueError("{0} is not a valid model name!".format(model))

    input_file_short = input_file.split("/")[-1]
    #do the analysis in the directory where multiDFEest is stored
    if not os.path.exists("../multidfe/{0}".format(input_file_short)):
        run_process(["cp", input_file, "../multidfe"])
    MDE_output = "{0}.MAXL.out".format(input_file_short)

    arguments = ["./MultiDFE", "-N1", 100, "-conpop", conpop_code, "-sfsfold", sfs_fold_code, "-selmode", model_code, "-file", input_file_short]
    if seed:
        # NOTE(review): the environment assignment is passed as the first
        # command token; whether that works depends on how run_process
        # launches the command - confirm.
        seed_string = "GSL_RNG_SEED={0}".format(seed)
        arguments = [seed_string] + arguments
    arguments.extend(flags)

    current_dir = os.getcwd()
    os.chdir("../multidfe")
    try:
        print(" ".join([str(i) for i in arguments]))
        #run multiDFEest
        run_process(arguments)
        #parse output
        fields = rw.read_many_fields(MDE_output, "\t")[0]
        pairs = [field.split(":") for field in fields if ":" in field]
        output = {pair[0]: float(pair[1]) for pair in pairs}
        #get the log likelihood and calculate AIC
        ll = output["L"]
        print("\n")
        print(par_number)
        print(ll)
        output["AIC"] = (2 * par_number) - (2 * ll)
        if n_spikes:
            output["model"] = "{0}_{1}".format(model, n_spikes)
        else:
            output["model"] = model
        remove_file(MDE_output)
    finally:
        #always return to the original directory, even if the run or the parsing failed
        os.chdir(current_dir)
    return(output)
def get_CpG_dicts(CDSs, chroms, MSA_file_name_prefix, lengths, clean_names, phylip_data, fasta, anc_CG_file_name, high_CG_file_name, fs, macaque_anc = False, pseudoCG = False, comprehensive = False, subst_model = None, return_tuples = False, regions = False):
    '''
    Get two dictionaries, one that says for each transcript which positions are CpG/GpC in macaque
    and one which positions were likely CpG/GpC in the human-macaque ancestor.

    If anc_CG_file_name is empty, "None", or does not exist yet, the CpG data
    are computed from scratch, chromosome by chromosome, from the MSA files
    named "<MSA_file_name_prefix>_<chrom>.txt" (retrieved via eo if missing).
    Otherwise the macaque CpG positions are read from
    "<anc_CG_file_name minus 4 chars>_macaque.txt".

    high_CG_file_name: tab-separated file (ID followed by integer fields), or
        the string "None" to disable.
    pseudoCG: use C..T / A..G instead of C..G / G..C as the dinucleotides.
    comprehensive: only allowed when phylip_data has at least 8 elements.
    regions: if truthy, map the result onto the records of fasta using the
        matching bed file (fasta path with "fasta" replaced by "bed").
    return_tuples: if truthy, also return the tuples mapping dictionary
        (None when existing data were read in).

    Returns (anc_CG_dict, macaque_CG_dict) or, with return_tuples,
    (anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full).
    Raises ValueError (a subclass of Exception) if comprehensive filtering is
    requested with fewer than 8 elements in phylip_data.
    '''
    #the parsed records are not used below; the call is kept so that an unreadable fasta still fails up front
    names, seqs = rw.read_fasta(fasta)
    #if you're gonna determine ancestral CpG positions from scratch rather than reading them in from an existing file
    #if you want to have the name of the file determined automatically
    if (not anc_CG_file_name) or (anc_CG_file_name == "None"):
        new_CG = True
        # NOTE(review): phy_file is never used afterwards, so no automatic
        # name actually reaches write_anc_CG - confirm intent. The call is
        # kept because it advances the random number generator.
        phy_file = "temp_data/temp_anc_CG{0}.txt".format(random.random())
    #if you want to give the file a name yourself
    elif not os.path.exists(anc_CG_file_name):
        new_CG = True
    else:
        new_CG = False

    if new_CG:
        print("Will get new CpG data...")
        if len(phylip_data) < 8 and comprehensive:
            raise ValueError("Comprehensive CpG filtering only in big tree mode!")
        #if you want to pretend some other dinucleotide are CpG
        #the hyphens are there in case the two nucleotides are separated by an indel
        #(raw strings: same patterns as before, without the invalid "\-" escape)
        if pseudoCG:
            CG_kmers = [r"C[\-]*T", r"A[\-]*G"]
        else:
            CG_kmers = [r"C[\-]*G", r"G[\-]*C"]
        CG_kmers = [re.compile(kmer) for kmer in CG_kmers]
        macaque_CG_dict = {}
        anc_CG_concat_full = [[[""]], [[""]]]
        tuples_mapping_dict_full = {}
        for chrom in chroms:
            print(chrom)
            #only leave those CDSs that are on the current chromosome
            current_CDSs = {i: CDSs[i] for i in CDSs if CDSs[i][0][0][0] == chrom}
            coords_file = "temp_data/coords_file{0}.txt".format(random.random())
            #check if the MSA is already at the specified location, otherwise retrieve it
            MSA_file = "{0}_{1}.txt".format(MSA_file_name_prefix, chrom)
            if not os.path.isfile(MSA_file):
                print("Obtaining MSA...")
                eo.get_MSA_gene_list(current_CDSs, coords_file, "EPO", "primates", 85, "homo_sapiens", MSA_file)
                os.remove(coords_file)
                eo.flush_tables("localhost", "mysql", "fackel")
            MSA_raw = eo.parse_MSA_output(MSA_file)
            if high_CG_file_name != "None":
                high_CG_rows = rw.read_many_fields(high_CG_file_name, "\t")
                high_CG = {row[0]: [int(j) for j in row[1:]] for row in high_CG_rows}
            else:
                high_CG = None
            #get concatenated sequences (for determining ancestral CpG positions) and macaque CpG information for this chromosome
            anc_CG_concat, macaque_CG_dict, tuples_mapping_dict = get_CpG_dicts_core(MSA_raw, lengths, phylip_data, CG_kmers, macaque_anc, macaque_CG_dict, high_CG, comprehensive = comprehensive, subst_model = subst_model)
            remove_file(coords_file)
            #add that information to the global dictionaries
            anc_CG_concat_full, tuples_mapping_dict_full = update_anc_CG(anc_CG_concat_full, anc_CG_concat, tuples_mapping_dict_full, tuples_mapping_dict)
        phy_files = write_anc_CG(anc_CG_concat_full, anc_CG_file_name, clean_names, macaque_CG_dict)
        pp_file = anc_CG_file_name
    else:
        print("Will read in existing CpG data...")
        pp_file = None
        #placeholder string, passed on to get_ancestral_CG unchanged
        phy_files = "None"
        high_CG = None
        tuples_mapping_dict_full = None
        macaque_CG_file_name = "{0}_macaque.txt".format(anc_CG_file_name[:-4])
        macaque_rows = rw.read_many_fields(macaque_CG_file_name, "\t")
        macaque_rows = [row for row in macaque_rows if len(row) == 2]
        macaque_CG_dict = list_to_dict(macaque_rows, 0, 1)
        #turn each comma-separated string of positions into a list of integers
        macaque_CG_dict = {trans_id: [int(position) for position in macaque_CG_dict[trans_id].split(",") if position != ""] for trans_id in macaque_CG_dict}

    anc_CG_dict = get_ancestral_CG(pp_file, subst_model, phy_files, "DFE/UCSC_model.mod", tuples_mapping_dict_full, anc_CG_file_name, high_CG = high_CG, macaque = macaque_anc, comprehensive = comprehensive)
    #clean up the temporary phylip files; skip the "None" placeholder, which
    #would otherwise be iterated character by character ("N", "o", "n", "e")
    if phy_files != "None":
        for phy_file_name in phy_files:
            remove_file(phy_file_name)

    #if you're looking at exon cores/flanks rather than full CDSs
    if regions:
        #you need to have matching bed/fasta files for this to work (with the records in the same order)
        bed = fasta.replace("fasta", "bed")
        transcripts = fs.get_transcripts()
        #for each flank/core, figure out what positions it covers in the full CDS
        mapping_dict = conservation.map_regions_to_CDS(fasta, bed, fs, transcripts, CDSs, trans_ids = True)
        anc_CG_dict = region_CpG(mapping_dict, anc_CG_dict)

    if return_tuples:
        return(anc_CG_dict, macaque_CG_dict, tuples_mapping_dict_full)
    else:
        return(anc_CG_dict, macaque_CG_dict)