def add_overlaps(diff_micropeptides, overlap_file, name, bed_df_a, bed_df_b, exons):
    msg = "Reading overlaps file: {}".format(overlap_file)
    logger.info(msg)

    overlap_bed = bed_utils.read_bed(overlap_file)

    msg = "Finding overlaps"
    logger.info(msg)

    a_overlaps = bed_utils.get_bed_overlaps(bed_df_a, overlap_bed, exons=exons)
    a_overlaps_ids = {to.a_info for to in a_overlaps}

    b_overlaps = bed_utils.get_bed_overlaps(bed_df_b, overlap_bed, exons=exons)
    b_overlaps_ids = {to.a_info for to in b_overlaps}

    m_match_a = diff_micropeptides['A'].isin(a_overlaps_ids)
    m_match_b = diff_micropeptides['B'].isin(b_overlaps_ids)

    match_name_a = "{}_A".format(name)
    match_name_b = "{}_B".format(name)

    diff_micropeptides[match_name_a] = 'No'
    diff_micropeptides[match_name_b] = 'No'

    diff_micropeptides.loc[m_match_a, match_name_a] = 'Yes'
    diff_micropeptides.loc[m_match_b, match_name_b] = 'Yes'

    return diff_micropeptides
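# The following is a minimal, self-contained sketch of the Yes/No labelling
# pattern used in add_overlaps, assuming only pandas. The identifiers and the
# overlap sets below are made up; in the script they come from
# bed_utils.get_bed_overlaps.
import pandas as pd

example_df = pd.DataFrame({'A': ['orf_1', 'orf_2'], 'B': ['orf_3', 'orf_4']})
a_overlaps_ids = {'orf_1'}   # ids from condition A that overlap the feature file
b_overlaps_ids = {'orf_4'}   # ids from condition B that overlap the feature file

name = "lincRNA"             # hypothetical --overlap-names entry
example_df["{}_A".format(name)] = 'No'
example_df["{}_B".format(name)] = 'No'

# flip only the rows whose identifiers appear in the overlap id sets
example_df.loc[example_df['A'].isin(a_overlaps_ids), "{}_A".format(name)] = 'Yes'
example_df.loc[example_df['B'].isin(b_overlaps_ids), "{}_B".format(name)] = 'Yes'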
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script clusters the ORFs based on their subcodon "
        "counts using a DP-GMM; the means and weights of the clusters are "
        "written to a pickle file which consists of a list. The first element "
        "of the list is the means, and the second is the weights.")

    parser.add_argument('bf', help="The Bayes factor file containing counts")
    parser.add_argument('out', help="The output (pickle) file")

    parser.add_argument('--fraction', help="The top <fraction> genes, based "
                        "on normalized read counts, will be used for clustering",
                        type=float, default=default_fraction)
    parser.add_argument('--max-iter', help="The maximum number of iterations "
                        "for clustering", type=int, default=default_max_iter)
    parser.add_argument('--n-components', help="The maximum number of "
                        "clusters", type=int, default=default_n_components)
    parser.add_argument('--seed', help="The seed for the random number "
                        "generator", type=int, default=default_seed)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading BF file"
    logger.info(msg)
    bf = bed_utils.read_bed(args.bf)

    msg = "Extracting top k% of ORFs"
    logger.info(msg)

    # calculate the normalized read coverage
    total_read_coverage = bf["x_1_sum"] + bf["x_2_sum"] + bf["x_3_sum"]
    rpk = total_read_coverage / bf['orf_len']
    sorted_rpk_indices = np.argsort(rpk)

    # and get the best args.fraction of them
    num_orfs = int(len(rpk) * args.fraction)
    top_k_orfs = sorted_rpk_indices.tail(num_orfs)

    msg = "Finding subcodon clusters"
    logger.info(msg)

    x_i_fields = ["x_1_sum", "x_2_sum", "x_3_sum"]
    X = bf.iloc[top_k_orfs.values][x_i_fields]

    model = np_utils.fit_bayesian_gaussian_mixture(
        X, max_iter=args.max_iter, n_components=args.n_components, seed=args.seed)

    msg = "Writing means and weights to disk"
    logger.info(msg)

    to_pkl = [model.means_, model.weights_]
    with open(args.out, 'wb') as out:
        pickle.dump(to_pkl, out)
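# fit_bayesian_gaussian_mixture is presumably a thin wrapper around
# sklearn.mixture.BayesianGaussianMixture; the sketch below shows an
# equivalent direct call on synthetic count data and stores the means and
# weights in the same [means, weights] pickle layout the script uses. The
# wrapper's exact defaults are an assumption here.
import pickle
import numpy as np
from sklearn.mixture import BayesianGaussianMixture

X = np.random.RandomState(8675309).poisson(10, size=(500, 3))  # stand-in for the x_i_sum counts

model = BayesianGaussianMixture(n_components=100, max_iter=500, random_state=8675309)
model.fit(X)

with open("subcodon-clusters.pkl", 'wb') as out:
    pickle.dump([model.means_, model.weights_], out)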
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script visualizes the metagene profiles for each ORF type "
        "present in a given BED12+ file. It visualizes the mean and variance of normalized "
        "profiles in the first 21-bp, last 21-bp, and across all other 21-bp windows.")

    parser.add_argument('orfs', help="The BED12+ file containing the ORFs")
    parser.add_argument('profiles', help="The (mtx) file containing the ORF profiles")
    parser.add_argument('out', help="The base output name. The output filenames will be of "
                        "the form: <out>.<orf-type>.<image-type>.")

    parser.add_argument('--min-profile', help="The minimum value of the sum over the profile "
                        "to include it in the analysis", type=float, default=default_min_profile)
    parser.add_argument('--max-orfs', help="At most this many ORFs of each type will be "
                        "used to create the figures. They will be sampled randomly from among those "
                        "which meet the min-profile constraint.", type=int, default=default_max_orfs)
    parser.add_argument('--title', help="The prefix to use for the title of the plots",
                        default=default_title)
    parser.add_argument('--image-type', help="The type of image files to create. The type "
                        "must be recognized by matplotlib.", default=default_image_type)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    msg = "Reading profiles"
    logger.info(msg)
    profiles = scipy.io.mmread(args.profiles).tocsr()

    msg = "Extracting the metagene profiles and creating the images"
    logger.info(msg)

    orf_type_groups = orfs.groupby('orf_type')
    orf_type_groups.apply(extract_profiles_and_plot, profiles, args)

    msg = "Finished"
    logger.info(msg)
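# The description above refers to the mean and variance of normalized profiles
# in 21-bp windows. The sketch below shows that windowing on a dense 1-D
# profile with plain numpy; the real script operates on the sparse (mtx)
# profiles and a helper (extract_profiles_and_plot) that is not shown here.
import numpy as np

profile = np.random.RandomState(1).poisson(2, size=210).astype(float)
profile = profile / profile.sum()            # normalize the profile

window = 21
n_windows = len(profile) // window
windows = profile[:n_windows * window].reshape(n_windows, window)

first_mean, first_var = windows[0].mean(), windows[0].var()      # first 21 bp
last_mean, last_var = windows[-1].mean(), windows[-1].var()      # last 21 bp
middle_mean = windows[1:-1].mean(axis=0)                         # all other windows, per position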
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script extracts the differential micropeptides from two " "conditions. Please see the documentation in redmine for more details.\n\n" "Please see the pyensembl (https://github.com/hammerlab/pyensembl) " "documentation for more information about the ensembl release and species." ) parser.add_argument('config', help="The (yaml) config file") parser.add_argument('name_a', help="The name of the first condition") parser.add_argument('name_b', help="The name of the second condition") parser.add_argument('out', help="The output (.csv.gz or .xlsx) file") parser.add_argument( '-a', '--append-sheet', help="If this flag is given, " "then a worksheet with the name '<name_a>,<name_b>' will be appended " "to the .xlsx file given by out (if it exists)", action='store_true') parser.add_argument( '-f', '--filter', help="If this flag is present, then " "the output will be filtered to include only the differential " "micropeptides with the highest KL-divergence and read coverage", action='store_true') parser.add_argument( '--read-filter-percent', help="If the the --filter flag " "is given, then only the top --read-filter-percent micropeptides will " "be considered for the final output. They still must meet the KL-" "divergence filtering criteria.", type=float, default=default_read_filter_percent) parser.add_argument( '--kl-filter-percent', help="If the the --filter flag " "is given, then only the top --read-kl-percent micropeptides will " "be considered for the final output. They still must meet the read " "coverage filtering criteria.", type=float, default=default_kl_filter_percent) parser.add_argument( '--id-matches', help="This is a list of files which " "contain ORF identifiers to compare to the differential micropeptides. " "For each of the files given, two columns will be added to the output " "which indicate if either A or B appear in the respective file. Each " "file should have a single ORF identifier on each line and contain " "nothing else.", nargs='*', default=default_id_matches) parser.add_argument( '--id-match-names', help="A name to include in the " "output file for each --id-matches file. The number of names must " "match the number of files.", nargs='*', default=default_id_match_names) parser.add_argument( '--overlaps', help="This is a list of bed12+ files " "which will be compared to the differential micropeptides. Two columns " "(one for A, one for B) will be added to the output which indicate if " "the respective micropeptides overlap a feature in each file by at " "least 1 bp.", nargs='*', default=default_overlaps) parser.add_argument( '--overlap-names', help="A name to include in the " "output file for each --overlaps file. The number of names must match " "the number of files.", nargs='*', default=default_overlap_names) parser.add_argument( '-r', '--ensembl-release', help="The version of Ensembl " "to use when mapping transcript identifiers to gene identifiers", type=int, default=default_ensembl_release) parser.add_argument( '-s', '--ensembl-species', help="The Ensembl species " "to use when mapping transcript identifiers to gene identifiers", default=default_ensembl_species) parser.add_argument( '--a-is-single-sample', help="By default, this script " "assumes the predictions come from merged replicates. If name_a is from " "a single sample, this flag should be given. 
It is necessary to find " "the correct filenames.", action='store_true') parser.add_argument( '--b-is-single-sample', help="By default, this script " "assumes the predictions come from merged replicates. If name_b is from " "a single sample, this flag should be given. It is necessary to find " "the correct filenames.", action='store_true') parser.add_argument('--fields-to-keep', help="The fields to keep from the " "Bayes factor file for each condition", nargs='*', default=default_fields_to_keep) parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) " "length of ORFs considered as micropeptides", type=int, default=default_max_micropeptide_len) parser.add_argument( '--do-not-fix-tcons', help="By default, the \"TCONS_\" " "identifiers from StringTie, etc., do not parse correctly; this script " "update the identifiers so that will parse correctly unless instructed not " "to. The script is likely to crash if the identifiers are not fixed.", action='store_true') logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "Loading ensembl database" logger.info(msg) ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, species=args.ensembl_species) ensembl.db msg = "Checking the id-match and overlaps files" logger.info(msg) if len(args.id_matches) != len(args.id_match_names): msg = ("The number of --id-matches files and --id-match-names do not " "match. {} files and {} names".format(len(args.id_matches), len(args.id_match_names))) raise ValueError(msg) if len(args.overlaps) != len(args.overlap_names): msg = ("The number of --overlaps files and --overlaps-names do not " "match. {} files and {} names".format(len(args.overlaps), len(args.overlap_names))) raise ValueError(msg) utils.check_files_exist(args.id_matches) utils.check_files_exist(args.overlaps) if args.filter: msg = "Validating filter percentages" logger.info(msg) math_utils.check_range(args.read_filter_percent, 0, 1, variable_name="--read-filter-percent") math_utils.check_range(args.kl_filter_percent, 0, 1, variable_name="--kl-filter-percent") msg = "Extracting file names" logger.info(msg) config = yaml.load(open(args.config)) note_str = config.get('note', None) # keep multimappers? is_unique = not ('keep_riboseq_multimappers' in config) # and the smoothing parameters fraction = config.get('smoothing_fraction', None) reweighting_iterations = config.get('smoothing_reweighting_iterations', None) lengths_a = None offsets_a = None if args.a_is_single_sample: lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets( config, args.name_a, is_unique=is_unique) bayes_factors_a = filenames.get_riboseq_bayes_factors( config['riboseq_data'], args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations) if not os.path.exists(bayes_factors_a): msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.". format(args.name_a, bayes_factors_a)) raise FileNotFoundError(msg) predicted_orfs_a = filenames.get_riboseq_predicted_orfs( config['riboseq_data'], args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations, is_filtered=True, is_chisq=False) if not os.path.exists(predicted_orfs_a): msg = ( "Could not find the predictions bed file for {}. ({}). Quitting.". 
format(args.name_a, predicted_orfs_a)) raise FileNotFoundError(msg) lengths_b = None offsets_b = None if args.b_is_single_sample: lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets( config, args.name_b, is_unique=is_unique) bayes_factors_b = filenames.get_riboseq_bayes_factors( config['riboseq_data'], args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations) if not os.path.exists(bayes_factors_b): msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.". format(args.name_b, bayes_factors_b)) raise FileNotFoundError(msg) predicted_orfs_b = filenames.get_riboseq_predicted_orfs( config['riboseq_data'], args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations, is_filtered=True, is_chisq=False) if not os.path.exists(predicted_orfs_b): msg = ( "Could not find the predictions bed file for {}. ({}). Quitting.". format(args.name_b, predicted_orfs_b)) raise FileNotFoundError(msg) exons_file = filenames.get_exons(config['genome_base_path'], config['genome_name'], note=config.get('orf_note')) if not os.path.exists(exons_file): msg = "Could not find the exons file ({}). Quitting.".format( exons_file) raise FileNotFoundError(msg) msg = "Reading the exons" logger.info(msg) exons = bed_utils.read_bed(exons_file) msg = "Reading the BF files" logger.info(msg) bf_df_a = bed_utils.read_bed(bayes_factors_a) bf_df_b = bed_utils.read_bed(bayes_factors_b) msg = "Reading the predictions files" logger.info(msg) bed_df_a = bed_utils.read_bed(predicted_orfs_a) bed_df_b = bed_utils.read_bed(predicted_orfs_b) differential_micropeptide_dfs = [] # extract micropeptides msg = "Extracting micropeptides" logger.info(msg) m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len micropeptides_a = bed_df_a[m_micropeptides_a] micropeptides_b = bed_df_b[m_micropeptides_b] long_orfs_a = bed_df_a[~m_micropeptides_a] long_orfs_b = bed_df_b[~m_micropeptides_b] msg = "Finding micropeptides in A with no overlap in B" logger.info(msg) micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a, bed_df_b, exons=exons) micropeptides_a_no_match_b_df = pd.DataFrame() micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b) micropeptides_a_no_match_b_df['B'] = None micropeptides_a_no_match_b_df['kl'] = np.inf micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only' differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df) msg = "Finding micropeptides in B with no overlap in A" logger.info(msg) micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b, bed_df_a, exons=exons) micropeptides_b_no_match_a_df = pd.DataFrame() micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a) micropeptides_b_no_match_a_df['A'] = None micropeptides_b_no_match_a_df['kl'] = np.inf micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only' differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df) msg = "Finding overlapping micropeptides" logger.info(msg) micropeptides_a_micropeptides_b_df = get_overlap_df( micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b) differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df) micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b, 'micro_a_long_b', bf_df_a, bf_df_b) 
differential_micropeptide_dfs.append(micropeptides_a_long_b_df) micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b, 'long_a_micro_b', bf_df_a, bf_df_b) differential_micropeptide_dfs.append(micropeptides_b_long_a_df) differential_micropeptides_df = pd.concat(differential_micropeptide_dfs) msg = "Adding read count information" logger.info(msg) res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep], left_on='A', right_on='id', how='left') to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep} res = res.rename(columns=to_rename) res = res.drop('id_A', axis=1) res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id', how='left') to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep} res = res.rename(columns=to_rename) res = res.drop('id_B', axis=1) id_columns = ['A', 'B'] res = res.drop_duplicates(subset=id_columns) if not args.do_not_fix_tcons: # replace TCONS_ with TCONS res['A'] = res['A'].str.replace("TCONS_", "TCONS") res['B'] = res['B'].str.replace("TCONS_", "TCONS") msg = "Extracting the genes and their biotypes using pyensembl" logger.info(msg) ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, species=args.ensembl_species) ensembl_transcript_ids = set(ensembl.transcript_ids()) biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A', ensembl, ensembl_transcript_ids) biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B', ensembl, ensembl_transcript_ids) biotypes_a = utils.remove_nones(biotypes_a) biotypes_b = utils.remove_nones(biotypes_b) biotypes_a = pd.DataFrame(biotypes_a) biotypes_b = pd.DataFrame(biotypes_b) res = res.merge(biotypes_a, on='A', how='left') res = res.merge(biotypes_b, on='B', how='left') msg = "Pulling annotations from mygene.info" logger.info(msg) # pull annotations from mygene gene_info_a = mygene_utils.query_mygene(res['gene_id_A']) gene_info_b = mygene_utils.query_mygene(res['gene_id_B']) # and add the mygene info res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id', how='left') to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns} to_rename.pop('gene_id') res = res.rename(columns=to_rename) res = res.drop('gene_id', axis=1) res = res.merge(gene_info_b, left_on='gene_id_B', right_on='gene_id', how='left') to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns} to_rename.pop('gene_id') res = res.rename(columns=to_rename) res = res.drop('gene_id', axis=1) msg = "Removing duplicates" logger.info(msg) id_columns = ['A', 'B'] res = res.drop_duplicates(subset=id_columns) msg = "Adding --id-matches columns" logger.info(msg) for (id_match_file, name) in zip(args.id_matches, args.id_match_names): res = add_id_matches(res, id_match_file, name) msg = "Adding --overlaps columns" logger.info(msg) for (overlap_file, name) in zip(args.overlaps, args.overlap_names): res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons) msg = "Sorting by in-frame reads" logger.info(msg) res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0) res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0) res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B'] res = res.sort_values('x_1_sum', ascending=False) if args.filter: msg = "Filtering the micropeptides by read coverage and KL-divergence" logger.info(msg) x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top', ascending=False) num_x_1_sum_ranks = x_1_sum_ranks.max() max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent m_good_x_1_sum_rank = x_1_sum_ranks <= 
max_good_x_1_sum_rank msg = ("Number of micropeptides passing read filter: {}".format( sum(m_good_x_1_sum_rank))) logger.debug(msg) kl_ranks = res['kl'].rank(method='dense', na_option='top', ascending=False) num_kl_ranks = kl_ranks.max() max_good_kl_rank = num_kl_ranks * args.kl_filter_percent m_good_kl_rank = kl_ranks <= max_good_kl_rank msg = ("Number of micropeptides passing KL filter: {}".format( sum(m_good_kl_rank))) logger.debug(msg) m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank msg = ("Number of micropeptides passing both filters: {}".format( sum(m_both_filters))) logger.debug(msg) res = res[m_both_filters] msg = "Writing differential micropeptides to disk" logger.info(msg) if not args.append_sheet: utils.write_df(res, args.out, index=False) else: sheet_name = "{},{}".format(args.name_a, args.name_b) utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
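# A small pandas sketch of the rank-based filtering applied above when
# --filter is given: rank the rows in descending order and keep the rows whose
# rank falls within the top fraction. The column name and fraction are
# placeholders for x_1_sum / kl and the --*-filter-percent options.
import pandas as pd

res = pd.DataFrame({'x_1_sum': [100.0, 5.0, 40.0, 0.0]})
filter_percent = 0.5

ranks = res['x_1_sum'].rank(method='min', na_option='top', ascending=False)
max_good_rank = ranks.max() * filter_percent
m_good = ranks <= max_good_rank          # True for the top 50% of rows by x_1_sum

res_filtered = res[m_good]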
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script labels the ORFs found with extract-orf-coordinates " "based on their exon structure and relation to the annotated, canonical " "ORFs. It requires the exon blocks for the ORFs (created with " "split-bed12-blocks). It completely reads in the ORFs, so unless otherwise " "desired for some reason, the input and output files can be the same.") parser.add_argument('annotated_transcripts', help="The annotated transcripts " "for the genome, in bed12+ format") parser.add_argument('extracted_orfs', help="The ORFs extracted from the " "transcripts, in bed12+ format") parser.add_argument('orf_exons', help="The exon blocks for the ORFs, in " "bed6+ format") parser.add_argument('out', help="The output (bed12+.gz) file") parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use for " "a few parts of the script", type=int, default=default_num_cpus) parser.add_argument( '-f', '--filter', help="If this flag is given, then ORFs " "which are completely covered by an annotated transcript are discarded. " "Presumably, this is used to filter uninteresting ORFs from de novo " "assemblies.", action='store_true') parser.add_argument( '-e', '--annotated-exons', help="If the --filter flag is " "given, the annotated transcript exons can optionally be provided with " "this option. If they are not given, they will be split from the annotated " "transcripts. That is generally not a very expensive operation relative to " "everything else in the labeling script. If --filter is not given, then " "these are ignored.", default=default_annotated_exons) parser.add_argument( '-n', '--nonoverlapping-label', help="If this option is " "given, then ORFs which do not overlap the annotated transcripts at all " "will be given this label. Otherwise, they will be labeled as \"suspect\"", default=default_nonoverlapping_label) parser.add_argument( '-l', '--label-prefix', help="This string is prepended " "to all labels assigned to ORFs. 
For example, it is a useful way to " "indicate ORFs from de novo assemblies are \"novel.\" In any case, this " "*is not* prepended to \"canonical\" ORFs.", default=default_label_prefix) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "Reading annotated transcripts" logger.info(msg) annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts) msg = "Reading extracted ORFs and exons" logger.info(msg) extracted_orfs = bed_utils.read_bed(args.extracted_orfs) extracted_orf_exons = bed_utils.read_bed(args.orf_exons) msg = "Found {} extracted ORFs with {} exons".format( len(extracted_orfs), len(extracted_orf_exons)) logger.debug(msg) # check if we want to remove the extracted_orfs completely covered by # the annotated transcripts if args.filter: msg = ("Removing extracted ORFs which are completely covered by the " "annotated transcripts") logger.info(msg) # we need the annotated transcript exons if args.annotated_exons is None: msg = "Splitting the annotated transcripts into exon blocks" logger.info(msg) annotated_exons = bed_utils.split_bed12(annotated_transcripts, num_cpus=args.num_cpus, progress_bar=True) else: msg = "Reading the annotated transcript exons" logger.info(msg) annotated_exons = bed_utils.read_bed(args.annotated_exons) msg = "Finding completely covered extracted ORFs" logger.info(msg) nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons, annotated_exons, min_a_overlap=1) m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids) extracted_orfs = extracted_orfs[m_unfiltered] # also discard the unnecessary exons m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids) extracted_orf_exons = extracted_orf_exons[m_unfiltered] msg = "After filtering, {} extracted ORFs remain".format( len(extracted_orfs)) logger.info(msg) # if the nonoverlapping-label is given, annotate and remove the ORFs # which do not at all overlap the annotations if args.nonoverlapping_label is not None: nonoverlapping_ids = bed_utils.subtract_bed( extracted_orfs, annotated_transcripts, exons_a=extracted_orf_exons, exons_b=annotated_exons) m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids) extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping] m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids) extracted_orfs.loc[m_nonoverlapping, 'orf_type'] = args.nonoverlapping_label msg = ("Found {} ORFs completely nonoverlapping annotated transcripts". 
format(len(nonoverlapping_ids))) logger.info(msg) msg = "Removing the annotated UTRs from the transcripts" logger.info(msg) canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts, num_cpus=args.num_cpus) msg = "Splitting the canonical ORFs into exons" logger.info(msg) canonical_orf_exons = bed_utils.split_bed12(canonical_orfs, num_cpus=args.num_cpus, progress_bar=True) msg = "Extracting annotated 5' leader regions" logger.info(msg) five_prime_regions = bed_utils.retain_all_five_prime_of_thick( annotated_transcripts, num_cpus=args.num_cpus) if len(five_prime_regions) == 0: msg = "No annotated 5' leader regions were found" logger.warning(msg) msg = "Splitting the 5' leaders into exons" logger.info(msg) five_prime_exons = bed_utils.split_bed12(five_prime_regions, num_cpus=args.num_cpus, progress_bar=True) msg = "Extracting annotated 3' trailer regions" logger.info(msg) three_prime_regions = bed_utils.retain_all_three_prime_of_thick( annotated_transcripts, num_cpus=args.num_cpus) if len(three_prime_regions) == 0: msg = "No annotated 3' trailer regions were found" logger.warning(msg) msg = "Splitting the 3' trailers into exons" logger.info(msg) three_prime_exons = bed_utils.split_bed12(three_prime_regions, num_cpus=args.num_cpus, progress_bar=True) msg = "Splitting noncoding transcripts into exons" logger.info(msg) m_no_thick_start = annotated_transcripts['thick_start'] == -1 m_no_thick_end = annotated_transcripts['thick_end'] == -1 m_no_thick = m_no_thick_start & m_no_thick_end noncoding_transcripts = annotated_transcripts[m_no_thick] noncoding_exons = bed_utils.split_bed12(noncoding_transcripts, num_cpus=args.num_cpus, progress_bar=True) msg = "Marking canonical and extracted ORFs with the same stop codon" logger.info(msg) # first, add the true ORF end m_forward_canonical = canonical_orfs['strand'] == '+' m_reverse_canonical = canonical_orfs['strand'] == '-' m_forward_extracted = extracted_orfs['strand'] == '+' m_reverse_extracted = extracted_orfs['strand'] == '-' canonical_orfs['orf_end'] = canonical_orfs['end'] canonical_orfs.loc[m_reverse_canonical, 'orf_end'] = canonical_orfs.loc[m_reverse_canonical, 'start'] extracted_orfs['orf_end'] = extracted_orfs['end'] extracted_orfs.loc[m_reverse_extracted, 'orf_end'] = extracted_orfs.loc[m_reverse_extracted, 'start'] # now, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs merge_fields = ['seqname', 'strand', 'orf_end'] canonical_extracted_orf_ends = canonical_orfs.merge( extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted']) # now, pull this into a set zip_it = zip(canonical_extracted_orf_ends['id_canonical'], canonical_extracted_orf_ends['id_extracted']) canonical_extracted_matching_ends = {(c, a) for c, a in zip_it} msg = "Finding ORFs which exactly overlap the canonical ORFs" logger.info(msg) exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons, min_a_overlap=1, min_b_overlap=1) exact_match_orf_ids = {o.b_info for o in exact_matches} m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids) extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches] m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids) extracted_orfs.loc[m_canonical, 'orf_type'] = 'canonical' msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids)) logger.info(msg) msg = "Finding ORFs which are extended versions of the canonical ORFs" logger.info(msg) extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons, 
min_a_overlap=1) # make sure the "end"s match before calling something an extended match extended_match_ids = { m.b_info for m in tqdm.tqdm(extended_matches) if (m.a_info, m.b_info) in canonical_extracted_matching_ends } m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids) extracted_orf_exons = extracted_orf_exons[~m_extended_matches] m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids) l = "{}canonical_extended".format(args.label_prefix) extracted_orfs.loc[m_canonical_extended, 'orf_type'] = l msg = "Found {} canonical_extended ORFs".format(len(extended_match_ids)) logger.info(msg) msg = "Finding ORFs which are truncated versions of the canonical ORFs" logger.info(msg) truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons, min_b_overlap=1) # make sure the "end"s match before calling something a truncated match truncated_match_ids = { m.b_info for m in tqdm.tqdm(truncated_matches) if (m.a_info, m.b_info) in canonical_extracted_matching_ends } m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids) extracted_orf_exons = extracted_orf_exons[~m_truncated_matches] m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids) l = "{}canonical_truncated".format(args.label_prefix) extracted_orfs.loc[m_canonical_truncated, 'orf_type'] = l msg = "Found {} canonical_truncated ORFs".format(len(truncated_match_ids)) logger.info(msg) msg = ("Labeling ORFs which are completely covered by a canonical ORF but " "do not share its stop codon") logger.info(msg) # anything in "truncated matches" which *does not* share a stop codon with # the match is a "within" orf within_ids = { m.b_info for m in truncated_matches if m.b_info not in truncated_match_ids } m_within_matches = extracted_orf_exons['id'].isin(within_ids) extracted_orf_exons = extracted_orf_exons[~m_within_matches] m_within = extracted_orfs['id'].isin(within_ids) l = "{}within".format(args.label_prefix) extracted_orfs.loc[m_within, 'orf_type'] = l msg = "Found {} within ORFs".format(len(within_ids)) logger.info(msg) msg = "Finding out-of-frame overlaps" logger.info(msg) out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons) msg = "Finding leader overlaps" logger.info(msg) leader_matches = bed_utils.get_bed_overlaps(five_prime_exons, extracted_orf_exons) msg = "Finding trailer overlaps" logger.info(msg) trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons, extracted_orf_exons) msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a " "canonical ORF and annotated leaders or trailers") logger.info(msg) out_of_frame_ids = {m.b_info for m in out_of_frame_matches} leader_ids = {m.b_info for m in leader_matches} trailer_ids = {m.b_info for m in trailer_matches} leader_overlap_ids = out_of_frame_ids & leader_ids trailer_overlap_ids = out_of_frame_ids & trailer_ids m_leader_overlap_matches = extracted_orf_exons['id'].isin( leader_overlap_ids) extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches] m_trailer_overlap_matches = extracted_orf_exons['id'].isin( trailer_overlap_ids) extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches] m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids) l = "{}five_prime_overlap".format(args.label_prefix) extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = l m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids) l = "{}three_prime_overlap".format(args.label_prefix) extracted_orfs.loc[m_three_prime_overlap, 
'orf_type'] = l msg = "Found {} five_prime_overlap ORFs".format(len(leader_overlap_ids)) logger.info(msg) msg = "Found {} three_prime_overlap ORFs".format(len(trailer_overlap_ids)) logger.info(msg) msg = "Finding ORFs completely within 5' leaders" logger.info(msg) leader_matches = bed_utils.get_bed_overlaps(five_prime_exons, extracted_orf_exons, min_b_overlap=1) leader_ids = {m.b_info for m in leader_matches} m_leader_matches = extracted_orf_exons['id'].isin(leader_ids) extracted_orf_exons = extracted_orf_exons[~m_leader_matches] m_five_prime = extracted_orfs['id'].isin(leader_ids) l = "{}five_prime".format(args.label_prefix) extracted_orfs.loc[m_five_prime, 'orf_type'] = l msg = "Found {} five_prime ORFs".format(len(leader_ids)) logger.info(msg) msg = "Finding ORFs completely within 3' trailers" logger.info(msg) trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons, extracted_orf_exons, min_b_overlap=1) trailer_ids = {m.b_info for m in trailer_matches} m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids) extracted_orf_exons = extracted_orf_exons[~m_trailer_matches] m_three_prime = extracted_orfs['id'].isin(trailer_ids) l = "{}three_prime".format(args.label_prefix) extracted_orfs.loc[m_three_prime, 'orf_type'] = l msg = "Found {} three_prime ORFs".format(len(trailer_ids)) logger.info(msg) msg = "Finding ORFs completely within annotated, noncoding transcripts" logger.info(msg) noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons, extracted_orf_exons, min_b_overlap=1) noncoding_ids = {m.b_info for m in noncoding_matches} m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids) extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches] m_noncoding = extracted_orfs['id'].isin(noncoding_ids) l = "{}noncoding".format(args.label_prefix) extracted_orfs.loc[m_noncoding, 'orf_type'] = l msg = "Found {} noncoding ORFs".format(len(noncoding_ids)) logger.info(msg) # all of the remaining ORFs fall into the "suspect" category suspect_ids = {orf_id for orf_id in extracted_orf_exons['id']} m_suspect = extracted_orfs['id'].isin(suspect_ids) l = "{}suspect".format(args.label_prefix) extracted_orfs.loc[m_suspect, 'orf_type'] = l msg = "Found {} \"suspect\" ORFs".format(len(suspect_ids)) logger.info(msg) m_no_orf_type = extracted_orfs['orf_type'].isnull() msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type)) logger.info(msg) msg = "Writing ORFs with types to disk" logger.info(msg) fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type'] extracted_orfs = extracted_orfs[fields] extracted_orfs = bed_utils.sort(extracted_orfs) bed_utils.write_bed(extracted_orfs, args.out)
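# A small pandas sketch of the strand-aware "orf_end" column used above to
# match stop codons: the biological end of an ORF is the 'end' coordinate on
# the forward strand but the 'start' coordinate on the reverse strand. The
# coordinates below are made up.
import pandas as pd

orfs = pd.DataFrame({'seqname': ['chr1', 'chr1'],
                     'strand':  ['+', '-'],
                     'start':   [100, 500],
                     'end':     [400, 800]})

m_reverse = orfs['strand'] == '-'
orfs['orf_end'] = orfs['end']
orfs.loc[m_reverse, 'orf_end'] = orfs.loc[m_reverse, 'start']
# forward ORF: orf_end == 400; reverse ORF: orf_end == 500 (its stop codon side)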
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script creates a pie chart which shows the proportion of " "each ORF type in a given BED12+ file. Optionally, the ORFs can be grouped " "into similar types.") parser.add_argument('orfs', help="The BED12+ file with the ORFs") parser.add_argument('out', help="The output (image) file") parser.add_argument('--title', help="The title to use for the plot", default=default_title) parser.add_argument('--use-groups', help="If this flag is given, the the ORFs " "will be grouped", action='store_true') args = parser.parse_args() orfs = bed_utils.read_bed(args.orfs) strands = ['+', '-'] fracs = [] labels = [] for strand in ['+', '-']: m_strand = orfs['strand'] == strand orf_type_groups = orfs[m_strand].groupby('orf_type') counts = orf_type_groups.size() if args.use_groups: lab = ribo_utils.orf_type_labels fr = [get_orf_label_counts(counts, l) for l in lab] else: fr = counts.values lab = np.array(counts.index) lab = ["{} ({})".format(l, f) for l, f in zip(lab, fr)] fracs.append(fr) labels.append(lab) fig, axes = plt.subplots(ncols=2, figsize=(10, 5)) cmap = plt.cm.Blues colors = cmap(np.linspace(0., 1., len(labels[0]))) # forward strand ORFs extra_artists = [] if sum(fracs[0]) > 0: patches, texts = axes[0].pie(fracs[0], colors=colors) lgd = axes[0].legend(patches, labels[0], loc="center right", bbox_to_anchor=(0, 0.5)) axes[0].set_title("Strand: {}".format(strands[0])) extra_artists.append(lgd) else: title = "Strand: {}. No ORFs".format(strands[0]) axes[0].set_title(title) axes[0].set_axis_off() # reverse strand ORFs if sum(fracs[1]) > 0: patches, texts = axes[1].pie(fracs[1], colors=colors) lgd = axes[1].legend(patches, labels[1], loc="center right", bbox_to_anchor=(2.0, 0.5)) axes[1].set_title("Strand: {}".format(strands[1])) extra_artists.append(lgd) else: title = "Strand: {}. No ORFs".format(strands[1]) axes[1].set_title(title) axes[1].set_axis_off() if len(args.title) > 0: sup = fig.suptitle(args.title) extra_artists.append(sup) fig.savefig(args.out, bbox_extra_artists=extra_artists, bbox_inches='tight')
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script extracts all of the ORFs from the given transcripts. " "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives " "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG " "output for this script is _very_ verbose. It is not recommended to run this " "script with that logging level.") parser.add_argument('transcripts_bed', help="The bed12 file containing the " "transcript information") parser.add_argument('transcripts_fasta', help="The fasta file containing the " "spliced transcript sequences") parser.add_argument('out', help="The output (bed12+1 gz) file") parser.add_argument('--start-codons', help="A list of codons which will be " "treated as start codons when extracting ORFs", nargs='+', default=default_start_codons) parser.add_argument('--stop-codons', help="A list of codons which will be " "treated as stop codons when extracting ORFs", nargs='+', default=default_stop_codons) slurm.add_sbatch_options(parser) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) # check if we wanted to use slurm if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return msg = "Compiling start and stop codon regular expressions" logger.info(msg) start_codons_re = '|'.join(args.start_codons) stop_codons_re = '|'.join(args.stop_codons) start_codons_re = re.compile(start_codons_re) stop_codons_re = re.compile(stop_codons_re) msg = "Reading transcripts bed file" logger.info(msg) transcripts_bed = bed_utils.read_bed(args.transcripts_bed) msg = "Creating the sequence iterator" logger.info(msg) transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta) transcripts_iter = ((get_transcript(transcript_header, transcripts_bed), transcript_sequence) for (transcript_header, transcript_sequence) in transcripts_fasta) msg = "Finding all ORFs" logger.info(msg) orfs = parallel.apply_parallel_iter(transcripts_iter, args.num_cpus, get_orfs, start_codons_re, stop_codons_re, total=len(transcripts_bed), progress_bar=True) msg = "Joining ORFs in a large data frame" logger.info(msg) orfs = pd.concat(orfs) msg = "Removing duplicate ORFs" logger.info(msg) orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS) msg = "Numbering remaining ORFs" logger.info(msg) orfs['orf_num'] = np.arange(len(orfs)) msg = "Writing ORFs to disk" logger.info(msg) bed_utils.write_bed(orfs, args.out)
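# A minimal sketch of the start/stop regular-expression idea used above:
# alternation over the codon lists, then an in-frame scan of a transcript
# sequence. This is a simplification of get_orfs, which is not shown here.
import re

start_codons_re = re.compile('|'.join(['ATG']))
stop_codons_re = re.compile('|'.join(['TAA', 'TGA', 'TAG']))

seq = "CCATGAAATTTGGGTAACC"

for m in start_codons_re.finditer(seq):
    start = m.start()
    # scan downstream, in frame, for the first stop codon
    for pos in range(start + 3, len(seq) - 2, 3):
        if stop_codons_re.match(seq, pos):
            print("ORF from {} to {} (0-based, stop codon included)".format(start, pos + 3))
            break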
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script uses the mygene.info service to find annotations " "for the transcripts associated with the ORFs in the given bed file. In " "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, " "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.") parser.add_argument('bed', help="The bed file") parser.add_argument('out', help="The output file. Its type will be inferred " "from its extension.") parser.add_argument('--do-not-trim', help="By default, the script will " "attempt to trim transcript identifiers such that they are valid Ensembl " "identifiers. If this flag is given, no trimming will take place.", action='store_true') parser.add_argument('--scopes', help="A list of scopes to use when querying " "mygene.info. Please see the documentation for more information about " "valid scopes: http://mygene.info/doc/query_service.html#available_fields", nargs='*', default=default_scopes) parser.add_argument('--do-not-convert-ids', help="By default, the script will " "treat the identifiers in the file as transcript identifiers. It first " "maps those to gene identifiers, and then it uses those to find the " "gene annotations. If the identifiers are already gene ids (or whatever " "is specified by scopes), then the first mapping is not necessary and " "can be skipped using this flag.", action='store_true') logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) convert_ids = not args.do_not_convert_ids msg = "Reading the bed file" logger.info(msg) bed = bed_utils.read_bed(args.bed) bed = bed[fields_to_keep] msg = "Extracting transcript ids" logger.info(msg) trim = not args.do_not_trim orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim) orf_ids_df = pd.DataFrame(orf_ids) if convert_ids: msg = "Querying transcript to gene id mapping" logger.info(msg) gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id']) else: gene_ids = pd.DataFrame() gene_ids['transcript_id'] = orf_ids_df['transcript_id'] gene_ids['gene_id'] = orf_ids_df['transcript_id'] msg = "Querying gene annotations" logger.info(msg) res_df = mygene_utils.query_mygene(gene_ids['gene_id']) msg = "Combining gene annotations with transcript ids" logger.info(msg) res_df = gene_ids.merge(res_df, on='gene_id', how='inner') msg = "Combining transcript annotations with ORF ids" logger.info(msg) orf_ids_fields = ['transcript_id', 'orf_id'] res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner') msg = "Combining ORF annotations with ORF predictions" logger.info(msg) res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left') msg = "Writing ORF annotations to disk" logger.info(msg) utils.write_df(res_df, args.out, index=False)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script constructs the profile for each ORF. It " "first adjusts the mapped read positions to properly align with " "the P-sites. Second, it uses a custom chrom-sweep algorithm to " "find the coverage of each position in each exon of each ORF. Finally, " "the ORF exons are glued together to find the profile of the entire ORF." ) parser.add_argument('bam', help="The bam file including filtered (unique, " "etc.) alignments") parser.add_argument('orfs', help="The (bed12) file containing the ORFs") parser.add_argument('exons', help="The (bed6+2) file containing the exons") parser.add_argument('out', help="The (mtx.gz) output file containing the " "ORF profiles") parser.add_argument( '-l', '--lengths', help="If any values are given, " "then only reads which have those lengths will be included in the " "signal construction.", type=int, default=default_lengths, nargs='*') parser.add_argument( '-o', '--offsets', help="The 5' end of reads will be " "shifted by this amount. There must be one offset value for each " "length (given by the --lengths argument.", type=int, default=default_offsets, nargs='*') parser.add_argument('-k', '--num-exons', help="If k>0, then only the " "first k exons will be processed.", type=int, default=default_num_exons) parser.add_argument( '-g', '--num-groups', help="The number of groups into " "which to split the exons. More groups means the progress bar is " "updated more frequently but incurs more overhead because of the " "parallel calls.", type=int, default=default_num_groups) parser.add_argument('--seqname-prefix', help="If present, this string " "will be prepended to the seqname field of the ORFs.", default=default_seqname_prefix) slurm.add_sbatch_options(parser) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv)) logger.info(msg) # make sure the number of lengths and offsets match if len(args.lengths) != len(args.offsets): msg = "The number of --lengths and --offsets do not match." 
raise ValueError(msg) # make sure the necessary files exist required_files = [args.bam, args.orfs, args.exons] msg = "[extract-orf-profiles]: Some input files were missing: " utils.check_files_exist(required_files, msg=msg) if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return msg = "Finding P-sites" logger.info(msg) p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets) # we do not need the data frame anymore, so save some memory msg = "Reading exons" logger.info(msg) exons = bed_utils.read_bed(args.exons) msg = "Reading ORFs" logger.info(msg) orfs = bed_utils.read_bed(args.orfs) if len(args.seqname_prefix) > 0: orfs['seqname'] = args.seqname_prefix + orfs['seqname'] exons['seqname'] = args.seqname_prefix + exons['seqname'] if args.num_exons > 0: exons = exons.head(args.num_exons) num_orfs = orfs['orf_num'].max() + 1 max_orf_len = orfs['orf_len'].max() msg = "Adding the ORF index to the exons" logger.info(msg) orf_fields = ['id', 'orf_num'] exons_orfs = exons.merge(orfs[orf_fields], on='id') msg = "Splitting exons and P-sites" logger.info(msg) exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups) exons_dfs = [] psites_dfs = [] for group_index, exon_group in exon_groups: # pull out only the p-sites that come from these chromosomes seqnames = set(exon_group['seqname'].unique()) m_psites = p_sites['seqname'].isin(seqnames) exons_dfs.append(exon_group) psites_dfs.append(p_sites[m_psites]) # we no longer need the full list of psites del p_sites del exons_orfs del exon_groups del exons gc.collect() exons_psites = zip(exons_dfs, psites_dfs) msg = "Finding all P-site intersections" logger.info(msg) sum_profiles = parallel.apply_parallel_iter(exons_psites, args.num_cpus, get_all_p_site_intersections, num_orfs, max_orf_len, progress_bar=True, total=args.num_groups) msg = "Combining the ORF profiles into one matrix" logger.info(msg) f = lambda x, y: x + y sum_profiles = functools.reduce(f, sum_profiles) sum_profiles_lil = sum_profiles.tolil() msg = "Flipping the reverse strand profiles" logger.info(msg) m_reverse = orfs['strand'] == '-' reverse_orfs = orfs[m_reverse] for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()): orf_num = reverse_orf['orf_num'] if sum_profiles[orf_num].sum() == 0: continue orf_len = reverse_orf['orf_len'] dense = utils.to_dense(sum_profiles, orf_num, length=orf_len) dense = dense[::-1] sum_profiles_lil[orf_num, :orf_len] = dense msg = "Writing the sparse matrix to disk" logger.info(msg) math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
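# A small sketch of the P-site adjustment described above: the 5' end of each
# read is shifted by the offset associated with its length. The lengths and
# offsets below are illustrative only; ribo_utils.get_p_sites computes the
# actual shift (including the reverse-strand handling) from the BAM file.
import pandas as pd

reads = pd.DataFrame({'start':  [1000, 2000, 3000],
                      'length': [28, 29, 31]})

offsets = {28: 12, 29: 12, 31: 13}   # hypothetical length -> offset mapping

reads['p_site'] = reads['start'] + reads['length'].map(offsets)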
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="This script creates a bar chart which shows the count of " "each ORF type in a given BED12+ file. Optionally, the ORFs can be " "grouped into similar types.") parser.add_argument('orfs', help="The BED12+ file with the ORFs") parser.add_argument('out', help="The output (image) file") parser.add_argument('--title', help="The title to use for the plot", default=default_title) parser.add_argument('--use-groups', help="If this flag is given, the ORFs " "will be grouped", action='store_true') parser.add_argument('--fontsize', default=default_fontsize) parser.add_argument('--legend-fontsize', default=default_legend_fontsize) parser.add_argument('--ymax', type=int, default=default_ymax) logging_utils.add_logging_options(parser) args = parser.parse_args() msg = "Reading bed file" logger.info(msg) bed = bed_utils.read_bed(args.orfs) if args.use_groups: bed['orf_type_group'] = bed['orf_type'].map( ribo_utils.orf_type_labels_reverse_mapping) orf_type_counts = bed.groupby(['orf_type_group', 'strand']).size() orf_type_counts = orf_type_counts.reset_index(name="count") orf_type_counts['display_name'] = orf_type_counts[ 'orf_type_group'].map(ribo_utils.orf_type_labels_display_name_map) else: orf_type_counts = bed.groupby(['orf_type', 'strand']).size() orf_type_counts = orf_type_counts.reset_index(name="count") orf_type_counts['display_name'] = orf_type_counts['orf_type'].map( ribo_utils.orf_type_display_name_map) msg = "Creating the bar chart" color = sns.palettes.color_palette("Set3", n_colors=3) fig, ax = plt.subplots(figsize=(9, 5)) sns.barplot(x="display_name", y="count", hue="strand", data=orf_type_counts, ax=ax, zorder=-1, palette='Set3') sns.despine() ax.legend(loc='upper right', bbox_to_anchor=(1.0, 0.95), fontsize=args.legend_fontsize, frameon=True, framealpha=0.9, title="Strand") mpl_utils.set_legend_title_fontsize(ax, args.fontsize) ax.set_yscale('log') ax.set_ylim((1, args.ymax)) ax.set_ylabel("Number of ORFs", fontsize=args.fontsize) ax.set_xlabel("", fontsize=0) # rotate the ORF type names mpl_utils.set_ticklabels_fontsize(ax, args.fontsize) mpl_utils.set_ticklabel_rotation(ax, axis='x', rotation=90) # place the ORF type names in the middle of the bar for ticklabel in ax.xaxis.get_ticklabels(): p = ticklabel.get_position() ticklabel.set_position((p[0], 0.1)) ticklabel.set_verticalalignment('bottom') if args.title is not None: ax.set_title(args.title, fontsize=args.fontsize) if args.out is not None: fig.savefig(args.out, bbox_inches='tight')
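# A tiny sketch of the count table built above for seaborn: group by ORF type
# and strand, count, and turn the grouped result back into ordinary columns.
import pandas as pd

bed = pd.DataFrame({'orf_type': ['canonical', 'canonical', 'five_prime'],
                    'strand':   ['+', '-', '+']})

orf_type_counts = bed.groupby(['orf_type', 'strand']).size().reset_index(name="count")
# orf_type_counts now has one row per (orf_type, strand) pair with its count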
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "Given a list of ORFs with associated Bayes factors and a fasta " "sequence file, this script extracts the sequences of the ORFs whose Bayes factor " "exceeds the given threshold. Finally, biopython is used to translate the " "selected ORFs into protein sequences.\n\n" "The min-length and minimum-profile-sum filters are applied in the obvious way.\n\n" "For both BF and chi-square predictions, only ORFs which have more reads in the " "first reading frame than either of the other two will be selected as translated. " "(This is called the 'frame filter' below.)\n\n" "The selection based on Bayes factors follows this logic: if max_bf_var is given, " "then it and min_bf_mean are taken as a hard threshold on the estimated Bayes " "factor mean. If min_bf_likelihood is given, then this min_bf_mean is taken as the " "boundary value; that is, an ORF is \"translated\" if:\n\n" "\t\t[P(bf > min_bf_mean)] > min_bf_likelihood\n\n" "If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is taken as a " "hard threshold on the mean for selecting translated ORFs.\n\n" "If both max_bf_var and min_bf_likelihood are given, then both filters will be " "applied and the result will be the intersection.\n\n" "If the --use-chi-square option is given, the significance value is " "Bonferroni-corrected based on the number of ORFs which meet the length, profile " "and frame filters.") parser.add_argument('bayes_factors', help="The file containing the ORFs and Bayes' " "factors (BED12+)") parser.add_argument('fasta', help="The *genome* fasta file") parser.add_argument('predicted_orfs', help="The (output) BED12+ file containing " "the predicted ORFs.") parser.add_argument( 'predicted_dna_sequences', help="The (output) fasta file " "containing the predicted ORF sequences, as DNA sequences") parser.add_argument( 'predicted_protein_sequences', help="The (output) fasta file " "containing the predicted ORF sequences, as protein sequences") parser.add_argument( '--select-longest-by-stop', help="If this flag is given, then " "the selected ORFs will be merged based on stop codons. In particular, only the " "longest translated ORF at each stop codon will be selected.", action='store_true') parser.add_argument( '--select-best-overlapping', help="If this flag is given, then " "only the ORF with the highest estimated Bayes factor will be kept among each " "set of overlapping ORFs. N.B. 
This filter is applied *AFTER* selecting the " "longest ORF at each stop codon, if the --select-longest-by-stop flag is " "given.", action='store_true') parser.add_argument('--min-length', help="The minimum length to predict an ORF " "as translated", type=int, default=default_min_length) parser.add_argument('--min-bf-mean', help="The minimum Bayes' factor mean to predict " "an ORF as translated (use --help for more details)", type=float, default=default_min_bf_mean) parser.add_argument('--max-bf-var', help="The maximum Bayes' factor variance to predict " "an ORF as translated (use --help for more details)", type=float, default=default_max_bf_var) parser.add_argument( '--min-bf-likelihood', help="If given, then this is taken a threshold " "on the likelihood of translation (use --help for more details)", type=float, default=default_min_bf_likelihood) parser.add_argument( '--use-chi-square', help="If this flag is present, the the " "chi square value will be used to predict ORFs rather than the Bayes' factor", action='store_true') parser.add_argument( '--chisq-significance-level', help="If using chi square, then this " "value is Bonferroni corrected and used as the significance cutoff", type=float, default=default_chisq_significance_level) parser.add_argument('--filtered-orf-types', help="A list of ORF types which will be " "removed before selecting the final prediction set.", nargs='*', default=default_filtered_orf_types) parser.add_argument( '--filter-non-canonical-overlaps', help="If this flag is given, then " "--filtered-orf-types will be extended with the non-canonical overlap types ({})." .format(non_canonical_overlap_orf_types_str), action='store_true') logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) # first, extract all of the predictions which exceed the threshold msg = "Reading Bayes factor information" logger.info(msg) bayes_factors = bed_utils.read_bed(args.bayes_factors) if args.filter_non_canonical_overlaps: args.filtered_orf_types.extend(non_canonical_overlap_orf_types) if len(args.filtered_orf_types) > 0: filtered_orf_types_str = ','.join(args.filtered_orf_types) msg = "Filtering these ORF types: {}".format(filtered_orf_types_str) logger.info(msg) m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types) bayes_factors = bayes_factors[~m_orf_types] msg = "Identifying ORFs which meet the prediction thresholds" logger.info(msg) all_orfs, bf_orfs, chisq_orfs = ribo_utils.get_predicted_orfs( bayes_factors, min_bf_mean=args.min_bf_mean, max_bf_var=args.max_bf_var, min_bf_likelihood=args.min_bf_likelihood, min_length=args.min_length, chisq_alpha=args.chisq_significance_level, select_longest_by_stop=args.select_longest_by_stop) if args.use_chi_square: predicted_orfs = chisq_orfs else: predicted_orfs = bf_orfs msg = "Number of selected ORFs: {}".format(len(predicted_orfs)) logger.info(msg) if args.select_best_overlapping: msg = "Finding overlapping ORFs" logger.info(msg) merged_intervals = bed_utils.merge_all_intervals(predicted_orfs) msg = "Selecting best among overlapping ORFs" logger.info(msg) predicted_orfs = parallel.apply_iter_simple( merged_intervals['merged_ids'], get_best_overlapping_orf, predicted_orfs, progress_bar=True) predicted_orfs = pd.DataFrame(predicted_orfs) msg = "Sorting selected ORFs" logger.info(msg) predicted_orfs = bed_utils.sort(predicted_orfs) msg = "Writing selected ORFs to disk" logger.info(msg) bed_utils.write_bed(predicted_orfs, args.predicted_orfs) # now get the sequences msg = 
"Extracting predicted ORFs DNA sequence" logger.info(msg) split_exons = True transcript_sequences = bed_utils.get_all_bed_sequences( predicted_orfs, args.fasta, split_exons) fastx_utils.write_fasta(transcript_sequences, args.predicted_dna_sequences, compress=False) # translate the remaining ORFs into protein sequences msg = "Converting predicted ORF sequences to amino acids" logger.info(msg) records = fastx_utils.get_read_iterator(args.predicted_dna_sequences) protein_records = {r[0]: Bio.Seq.translate(r[1]) for r in records} fastx_utils.write_fasta(protein_records.items(), args.predicted_protein_sequences, compress=False)
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a line graph showing the length distributions "
        "of the various types of ORFs. Optionally, it can also include the length "
        "distribution of ORFs downloaded from uniprot. If uniprot ORFs are given, then the "
        "KL-divergence between the type distributions and the uniprot ORFs is calculated.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available",
        default=default_uniprot)
    parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in "
        "the plot", default=default_uniprot_label)
    parser.add_argument('--title', help="The title to use for the plot",
        default=default_title)

    parser.add_argument('--use-groups', help="If this flag is given, then the ORFs "
        "will be grouped", action='store_true')

    args = parser.parse_args()

    orfs = bed_utils.read_bed(args.orfs)

    if args.use_groups:
        orf_lengths = [get_orf_lengths(orfs, ribo_utils.orf_type_labels_mapping[label])
            for label in ribo_utils.orf_type_labels]

        prediction_labels = [latex.get_latex_safe_string(l)
            for l in ribo_utils.orf_type_labels]

        prediction_lengths_list = orf_lengths
    else:
        orf_lengths = [get_orf_lengths(orfs, [orf_type])
            for orf_type in ribo_utils.orf_types]

        prediction_labels = [latex.get_latex_safe_string(l)
            for l in ribo_utils.orf_types]

        prediction_lengths_list = orf_lengths

    if os.path.exists(args.uniprot):
        truth_nt_lengths = bio.get_uniprot_nt_lengths(args.uniprot)
        truth_label = args.uniprot_label
    else:
        truth_nt_lengths = None
        truth_label = None

    # input: truth_nt_lengths (array-like)
    #        prediction_lengths_list (list of array-likes)
    #        truth_label (string)
    #        prediction_labels (list of strings)
    #
    # if truth_nt_lengths is not given, then the KL-divergence calculations
    # will be skipped (and the truth distribution will not be shown)

    fontsize = 20
    legend_fontsize = 20
    title_fontsize = 20
    linewidth = 4

    # plot the empirical distribution of ORF lengths
    hist_min = 200
    hist_max = 5250
    hist_step = 200
    hist_range = (hist_min, hist_max)
    hist_bins = np.arange(hist_min, hist_max, hist_step)

    if truth_nt_lengths is not None:
        truth_hist, _ = np.histogram(truth_nt_lengths, bins=hist_bins,
            range=hist_range, density=True)
    else:
        truth_hist = None

    prediction_hists = []
    for prediction_lengths in prediction_lengths_list:
        prediction_hist, _ = np.histogram(prediction_lengths, bins=hist_bins,
            range=hist_range, density=True)
        prediction_hists.append(prediction_hist)

    # now, normalize the histograms and add a small offset so empty bins
    # do not cause problems for the KL-divergence
    if truth_hist is not None:
        truth_hist = truth_hist / np.sum(truth_hist)
        truth_hist += 1e-3

    for i, prediction_hist in enumerate(prediction_hists):
        prediction_hists[i] = prediction_hist / np.sum(prediction_hist)
        prediction_hists[i] += 1e-3

    kls = []
    if truth_hist is not None:
        for i, prediction_hist in enumerate(prediction_hists):
            kl = math_utils.calculate_symmetric_kl_divergence(truth_hist,
                prediction_hist, scipy.stats.entropy)
            kls.append(kl)

            # and update the label
            prediction_labels[i] = '{}, KL: ${:.2f}$'.format(prediction_labels[i], kl)

    # convert the histograms to percentages
    if truth_hist is not None:
        truth_hist = 100 * truth_hist

    for i, prediction_hist in enumerate(prediction_hists):
        prediction_hists[i] *= 100

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth
    x = np.arange(len(hist_bins) - 1)

    truth_cm_offset = 0.1
    if truth_hist is not None:
        color = cm(truth_cm_offset)
        ax.plot(x, truth_hist, label=truth_label, linewidth=linewidth, color=color)

    color_range = 1 - 2 * truth_cm_offset
    for i, prediction_hist in enumerate(prediction_hists):
        color = i / len(prediction_hists) * color_range
        color += 2 * truth_cm_offset
        color = cm(color)
        ax.plot(x, prediction_hist, label=prediction_labels[i],
            linewidth=linewidth, color=color)

    ax.set_xlabel('Length (bp)', fontsize=fontsize)
    ax.set_ylabel(r'\% of predicted ORFs', fontsize=fontsize)
    if len(args.title) > 0:
        ax.set_title(args.title, fontsize=fontsize)

    ax.set_xticks(x[::2])
    ax.set_xticklabels(hist_bins[::2], fontsize=fontsize, rotation=90)

    ax.set_ylim((0, 20))
    ax.set_xlim((0, len(hist_bins)))

    # hide the "0" tick label
    yticks = ax.yaxis.get_major_ticks()
    yticks[0].label1.set_visible(False)

    # chop off everything from 3000 on
    index_of_3000 = 14
    ax.set_xlim((0, index_of_3000))

    lgd = ax.legend(loc='center right', fontsize=legend_fontsize,
        bbox_to_anchor=(1.75, 0.5))
    ax.tick_params(axis='both', which='major', labelsize=fontsize)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd,))
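# A minimal sketch (not part of the script above) of the symmetric KL-divergence used
# to annotate the prediction labels. It assumes that
# math_utils.calculate_symmetric_kl_divergence(p, q, f) simply averages f(p, q) and
# f(q, p); because the histograms are normalized and offset by 1e-3 above, the
# divergence stays finite even when a bin is empty.
import numpy as np
import scipy.stats

def symmetric_kl_divergence(p, q, divergence=scipy.stats.entropy):
    """Return the mean of D(p || q) and D(q || p)."""
    return 0.5 * (divergence(p, q) + divergence(q, p))

# example with two toy (already normalized) length histograms
p = np.array([0.5, 0.3, 0.2]) + 1e-3
q = np.array([0.2, 0.3, 0.5]) + 1e-3
print("symmetric KL: {:.2f}".format(symmetric_kl_divergence(p, q)))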
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""
        This script uses Hamiltonian MCMC with Stan to estimate translation parameters
        for a set of regions (presumably ORFs). Roughly, it takes as input:

        (1) a set of regions (ORFs) and their corresponding profiles
        (2) a "translated" model which gives the probability that a region is translated
        (3) an "untranslated" model which gives the probability that a region is not translated

        The script first smoothes the profiles using LOWESS. It then calculates both the
        Bayes' factor (using the smoothed profile) and the chi-square value (using the
        raw counts) for each ORF.
        """)

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which predictions will "
        "be made (BED12+)")
    parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, then only the "
        "chi-square test will be performed for each ORF. This can also be a way to get the "
        "counts within each of the ORFs.", action='store_true')

    parser.add_argument('--translated-models', help="The models to use as H_t (pkl)",
        nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)",
        nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only ORFs with "
        "those types are processed.", nargs='*', default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)
    parser.add_argument('--min-length', help="ORFs with length less than this value will "
        "not be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than this value will "
        "not be processed", type=int, default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number of reads) "
        "less than this value will not be processed.", type=float,
        default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS",
        type=float, default=default_fraction)
    parser.add_argument('--reweighting-iterations', help="The number of reweighting "
        "iterations to use in LOWESS. Please see the statsmodels documentation for a "
        "detailed description of this parameter.", type=int,
        default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seed to use for inference",
        type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to use",
        type=int, default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC iterations to use "
        "for each chain", type=int, default=default_iterations)

    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs will be "
        "processed", type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)
    parser.add_argument('--do-not-compress', help="If this flag is present, the output will "
        "not be compressed; otherwise, it will be written in GZip format",
        action='store_true')
    parser.add_argument('-g', '--num-groups', help="The number of groups into which to split "
        "the ORFs. More groups means the progress bar is updated more frequently but incurs "
        "more overhead because of the parallel calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)

    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    # orf type
    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile: profiles_sums is a column vector, so np.where gives the
    # row indices (ORF numbers) which pass the filter
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb')) for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb')) for bm in args.untranslated_models]

    # share the sparse profiles matrix with the worker processes as raw arrays
    profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    bfs_l = parallel.apply_parallel_split(
        regions,
        args.num_cpus,
        get_all_bayes_factors_args,
        num_groups=args.num_groups,
        progress_bar=True
    )

    bfs = pd.concat(bfs_l)

    # write the results as a BED12+ file
    bed_utils.write_bed(bfs, args.out)
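# A minimal sketch (not part of the script above) of how a worker such as
# get_all_bayes_factors_args might rebuild the profiles matrix from the shared
# RawArrays created in main. The exact reconstruction used by the original helper is
# an assumption here; the key idea is that np.frombuffer gives zero-copy views of the
# shared memory, which scipy can wrap back into a CSR matrix, so each worker avoids
# pickling and copying the full count matrix.
import ctypes
import numpy as np
import scipy.sparse

def get_shared_profiles():
    """Rebuild the CSR profiles matrix from the module-level shared arrays."""
    data = np.frombuffer(profiles_data, dtype=ctypes.c_double)
    indices = np.frombuffer(profiles_indices, dtype=ctypes.c_int)
    indptr = np.frombuffer(profiles_indptr, dtype=ctypes.c_int)
    shape = tuple(np.frombuffer(profiles_shape, dtype=ctypes.c_int))
    return scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)

# a worker can then slice out the profile for a single ORF without copying the
# entire matrix into each process:
#   profile = get_shared_profiles()[orf_num].toarray().flatten()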