def add_overlaps(diff_micropeptides, overlap_file, name, bed_df_a, bed_df_b, exons):
    msg = "Reading overlaps file: {}".format(overlap_file)
    logger.info(msg)
    overlap_bed = bed_utils.read_bed(overlap_file)

    msg = "Finding overlaps"
    logger.info(msg)

    a_overlaps = bed_utils.get_bed_overlaps(bed_df_a, overlap_bed, exons=exons)
    a_overlaps_ids = {to.a_info for to in a_overlaps}

    b_overlaps = bed_utils.get_bed_overlaps(bed_df_b, overlap_bed, exons=exons)
    b_overlaps_ids = {to.a_info for to in b_overlaps}

    m_match_a = diff_micropeptides['A'].isin(a_overlaps_ids)
    m_match_b = diff_micropeptides['B'].isin(b_overlaps_ids)

    match_name_a = "{}_A".format(name)
    match_name_b = "{}_B".format(name)

    diff_micropeptides[match_name_a] = 'No'
    diff_micropeptides[match_name_b] = 'No'

    diff_micropeptides.loc[m_match_a, match_name_a] = 'Yes'
    diff_micropeptides.loc[m_match_b, match_name_b] = 'Yes'

    return diff_micropeptides
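# A minimal, self-contained sketch of the Yes/No flagging pattern used by
# add_overlaps above, with plain pandas and pre-computed id sets instead of the
# bed_utils overlap calls. The function name, column names, and id sets are
# illustrative, not part of the original script.
def flag_id_membership(df, id_sets, name):
    """Add '<name>_A'/'<name>_B' columns marking whether the ids in columns
    'A' and 'B' appear in the corresponding id set."""
    for side in ('A', 'B'):
        column = "{}_{}".format(name, side)
        df[column] = 'No'
        df.loc[df[side].isin(id_sets[side]), column] = 'Yes'
    return df

# example usage (with pandas):
#   df = pd.DataFrame({'A': ['orf1', 'orf2'], 'B': ['orf3', None]})
#   flag_id_membership(df, {'A': {'orf1'}, 'B': set()}, 'uniprot')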
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script visualizes the metagene profiles for each ORF type "
        "present in a given BED12+ file. It shows the mean and variance of normalized "
        "profiles in the first 21-bp window, the last 21-bp window, and across all "
        "other 21-bp windows.")

    parser.add_argument('orfs', help="The BED12+ file containing the ORFs")
    parser.add_argument('profiles', help="The (mtx) file containing the ORF profiles")
    parser.add_argument('out', help="The base output name. The output filenames will be of "
        "the form: <out>.<orf-type>.<image-type>.")

    parser.add_argument('--min-profile', help="The minimum value of the sum over the profile "
        "to include it in the analysis", type=float, default=default_min_profile)

    parser.add_argument('--max-orfs', help="At most this many ORFs of each type will be "
        "used to create the figures. They will be sampled randomly from among those "
        "which meet the min-profile constraint.", type=int, default=default_max_orfs)

    parser.add_argument('--title', help="The prefix to use for the title of the plots",
        default=default_title)

    parser.add_argument('--image-type', help="The type of image files to create. The type "
        "must be recognized by matplotlib.", default=default_image_type)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    msg = "Reading profiles"
    logger.info(msg)
    profiles = scipy.io.mmread(args.profiles).tocsr()

    msg = "Extracting the metagene profiles and creating the images"
    logger.info(msg)

    orf_type_groups = orfs.groupby('orf_type')
    orf_type_groups.apply(extract_profiles_and_plot, profiles, args)

    msg = "Finished"
    logger.info(msg)
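# extract_profiles_and_plot is not shown in this section. The sketch below
# illustrates the kind of per-type extraction it performs for one ORF-type
# group: select the rows of the sparse profile matrix for that type, drop
# low-signal ORFs, normalize, and summarize the first and last 21-nt windows.
# The 'orf_num' column and the 21-nt window follow the script's description;
# the function name, default threshold, and return format are illustrative.
def summarize_orf_type_profiles(orf_type_df, profiles, min_profile=5.0, window=21):
    rows = orf_type_df['orf_num'].values
    type_profiles = profiles[rows].toarray().astype(float)

    # keep only ORFs with enough total signal
    totals = type_profiles.sum(axis=1)
    type_profiles = type_profiles[totals >= min_profile]
    if len(type_profiles) == 0:
        return None

    # normalize each profile so the windows are comparable across ORFs
    type_profiles = type_profiles / type_profiles.sum(axis=1, keepdims=True)

    first = type_profiles[:, :window]
    last = type_profiles[:, -window:]
    return {
        'first_mean': first.mean(axis=0), 'first_var': first.var(axis=0),
        'last_mean': last.mean(axis=0), 'last_var': last.var(axis=0),
    }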
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="Collect the individual read length ORF profiles (mtx) created " "by 'create-read-length-orf-profiles' into a single 'sparse tensor'. " "N.B. This script is called by 'create-read-length-orf-profiles', however" "we still call each sample independently for condition, lengths and offsets") parser.add_argument('config', help="The (yaml) config file") parser.add_argument('name', help="The name of either one of the 'riboseq_samples'" "or 'riboseq_biological_replicates' from the config file.") parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses" "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written" "using the matrix market format (base-1 indexing).") parser.add_argument('-c', '--is-condition', help="If this flag is present, " "then 'name' will be taken to be a condition name. The profiles for " "all relevant replicates of the condition will be created.", action='store_true') parser.add_argument('--add-ids', help="If this flag is present, " "then orf_ids will be added to the final output.", action='store_true') logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "Reading config file" logger.info(msg) config = yaml.load(open(args.config), Loader=yaml.FullLoader) # pull out what we need from the config file is_unique = not ('keep_riboseq_multimappers' in config) note = config.get('note', None) if args.add_ids: orf_note = config.get('orf_note', None) orfs_file = filenames.get_orfs( config['genome_base_path'], config['genome_name'], note=orf_note ) orfs = bed_utils.read_bed(orfs_file) names = [args.name] if args.is_condition: riboseq_replicates = ribo_utils.get_riboseq_replicates(config) names = [n for n in riboseq_replicates[args.name]] # keep a map from the lengths to the combined profiles length_profile_map = {} for name in names: msg = "Processing sample: {}".format(name) logger.info(msg) # get the lengths and offsets which meet the required criteria from the config file lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets( config, name, is_unique=is_unique, default_params=metagene_options ) if len(lengths) == 0: msg = ("No periodic read lengths and offsets were found. Try relaxing " "min_metagene_profile_count, min_metagene_bf_mean, " "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Qutting.") logger.critical(msg) return for length, offset in zip(lengths, offsets): mtx = filenames.get_riboseq_profiles( config['riboseq_data'], name, length=[length], offset=[offset], is_unique=is_unique, note=note ) mtx = scipy.io.mmread(mtx).tocsr() prior_mtx = length_profile_map.get(length, None) if prior_mtx is None: length_profile_map[length] = mtx else: length_profile_map[length] = prior_mtx + mtx if args.add_ids: with gzip.open(args.out, 'wb') as target_gz: for length, mtx in length_profile_map.items(): mtx = mtx.tocoo() msg = "Writing ORF profiles. length: {}.".format(length) logger.info(msg) for row, col, val in zip(mtx.row, mtx.col, mtx.data): # orf_num are both zero-based, since we are now using coo orf_id = orfs.loc[orfs['orf_num'] == row]['id'].values[0] s = "{} {} {} {} {}\n".format(row, orf_id, col, length, val) target_gz.write(s.encode()) else: with gzip.open(args.out, 'wb') as target_gz: for length, mtx in length_profile_map.items(): mtx = mtx.tocoo() msg = "Writing ORF profiles. 
length: {}.".format(length) logger.info(msg) for row, col, val in zip(mtx.row, mtx.col, mtx.data): s = "{} {} {} {}\n".format(row, col, length, val) target_gz.write(s.encode())
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script extracts the differential micropeptides from two " "conditions. Please see the documentation in redmine for more details.\n\n" "Please see the pyensembl (https://github.com/hammerlab/pyensembl) " "documentation for more information about the ensembl release and species." ) parser.add_argument('config', help="The (yaml) config file") parser.add_argument('name_a', help="The name of the first condition") parser.add_argument('name_b', help="The name of the second condition") parser.add_argument('out', help="The output (.csv.gz or .xlsx) file") parser.add_argument( '-a', '--append-sheet', help="If this flag is given, " "then a worksheet with the name '<name_a>,<name_b>' will be appended " "to the .xlsx file given by out (if it exists)", action='store_true') parser.add_argument( '-f', '--filter', help="If this flag is present, then " "the output will be filtered to include only the differential " "micropeptides with the highest KL-divergence and read coverage", action='store_true') parser.add_argument( '--read-filter-percent', help="If the the --filter flag " "is given, then only the top --read-filter-percent micropeptides will " "be considered for the final output. They still must meet the KL-" "divergence filtering criteria.", type=float, default=default_read_filter_percent) parser.add_argument( '--kl-filter-percent', help="If the the --filter flag " "is given, then only the top --read-kl-percent micropeptides will " "be considered for the final output. They still must meet the read " "coverage filtering criteria.", type=float, default=default_kl_filter_percent) parser.add_argument( '--id-matches', help="This is a list of files which " "contain ORF identifiers to compare to the differential micropeptides. " "For each of the files given, two columns will be added to the output " "which indicate if either A or B appear in the respective file. Each " "file should have a single ORF identifier on each line and contain " "nothing else.", nargs='*', default=default_id_matches) parser.add_argument( '--id-match-names', help="A name to include in the " "output file for each --id-matches file. The number of names must " "match the number of files.", nargs='*', default=default_id_match_names) parser.add_argument( '--overlaps', help="This is a list of bed12+ files " "which will be compared to the differential micropeptides. Two columns " "(one for A, one for B) will be added to the output which indicate if " "the respective micropeptides overlap a feature in each file by at " "least 1 bp.", nargs='*', default=default_overlaps) parser.add_argument( '--overlap-names', help="A name to include in the " "output file for each --overlaps file. The number of names must match " "the number of files.", nargs='*', default=default_overlap_names) parser.add_argument( '-r', '--ensembl-release', help="The version of Ensembl " "to use when mapping transcript identifiers to gene identifiers", type=int, default=default_ensembl_release) parser.add_argument( '-s', '--ensembl-species', help="The Ensembl species " "to use when mapping transcript identifiers to gene identifiers", default=default_ensembl_species) parser.add_argument( '--a-is-single-sample', help="By default, this script " "assumes the predictions come from merged replicates. If name_a is from " "a single sample, this flag should be given. 
It is necessary to find " "the correct filenames.", action='store_true') parser.add_argument( '--b-is-single-sample', help="By default, this script " "assumes the predictions come from merged replicates. If name_b is from " "a single sample, this flag should be given. It is necessary to find " "the correct filenames.", action='store_true') parser.add_argument('--fields-to-keep', help="The fields to keep from the " "Bayes factor file for each condition", nargs='*', default=default_fields_to_keep) parser.add_argument('--max-micropeptide-len', help="The maximum (inclusive) " "length of ORFs considered as micropeptides", type=int, default=default_max_micropeptide_len) parser.add_argument( '--do-not-fix-tcons', help="By default, the \"TCONS_\" " "identifiers from StringTie, etc., do not parse correctly; this script " "update the identifiers so that will parse correctly unless instructed not " "to. The script is likely to crash if the identifiers are not fixed.", action='store_true') logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "Loading ensembl database" logger.info(msg) ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, species=args.ensembl_species) ensembl.db msg = "Checking the id-match and overlaps files" logger.info(msg) if len(args.id_matches) != len(args.id_match_names): msg = ("The number of --id-matches files and --id-match-names do not " "match. {} files and {} names".format(len(args.id_matches), len(args.id_match_names))) raise ValueError(msg) if len(args.overlaps) != len(args.overlap_names): msg = ("The number of --overlaps files and --overlaps-names do not " "match. {} files and {} names".format(len(args.overlaps), len(args.overlap_names))) raise ValueError(msg) utils.check_files_exist(args.id_matches) utils.check_files_exist(args.overlaps) if args.filter: msg = "Validating filter percentages" logger.info(msg) math_utils.check_range(args.read_filter_percent, 0, 1, variable_name="--read-filter-percent") math_utils.check_range(args.kl_filter_percent, 0, 1, variable_name="--kl-filter-percent") msg = "Extracting file names" logger.info(msg) config = yaml.load(open(args.config), Loader=yaml.FullLoader) note_str = config.get('note', None) # keep multimappers? is_unique = not ('keep_riboseq_multimappers' in config) # and the smoothing parameters fraction = config.get('smoothing_fraction', None) reweighting_iterations = config.get('smoothing_reweighting_iterations', None) lengths_a = None offsets_a = None if args.a_is_single_sample: lengths_a, offsets_a = ribo_utils.get_periodic_lengths_and_offsets( config, args.name_a, is_unique=is_unique) bayes_factors_a = filenames.get_riboseq_bayes_factors( config['riboseq_data'], args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations) if not os.path.exists(bayes_factors_a): msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.". format(args.name_a, bayes_factors_a)) raise FileNotFoundError(msg) predicted_orfs_a = filenames.get_riboseq_predicted_orfs( config['riboseq_data'], args.name_a, length=lengths_a, offset=offsets_a, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations, is_filtered=True, is_chisq=False) if not os.path.exists(predicted_orfs_a): msg = ( "Could not find the predictions bed file for {}. ({}). Quitting.". 
format(args.name_a, predicted_orfs_a)) raise FileNotFoundError(msg) lengths_b = None offsets_b = None if args.b_is_single_sample: lengths_b, offsets_b = ribo_utils.get_periodic_lengths_and_offsets( config, args.name_b, is_unique=is_unique) bayes_factors_b = filenames.get_riboseq_bayes_factors( config['riboseq_data'], args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations) if not os.path.exists(bayes_factors_b): msg = ("Could not find the Bayes factor file for {}. ({}). Quitting.". format(args.name_b, bayes_factors_b)) raise FileNotFoundError(msg) predicted_orfs_b = filenames.get_riboseq_predicted_orfs( config['riboseq_data'], args.name_b, length=lengths_b, offset=offsets_b, is_unique=is_unique, note=note_str, fraction=fraction, reweighting_iterations=reweighting_iterations, is_filtered=True, is_chisq=False) if not os.path.exists(predicted_orfs_b): msg = ( "Could not find the predictions bed file for {}. ({}). Quitting.". format(args.name_b, predicted_orfs_b)) raise FileNotFoundError(msg) exons_file = filenames.get_exons(config['genome_base_path'], config['genome_name'], note=config.get('orf_note')) if not os.path.exists(exons_file): msg = "Could not find the exons file ({}). Quitting.".format( exons_file) raise FileNotFoundError(msg) msg = "Reading the exons" logger.info(msg) exons = bed_utils.read_bed(exons_file) msg = "Reading the BF files" logger.info(msg) bf_df_a = bed_utils.read_bed(bayes_factors_a) bf_df_b = bed_utils.read_bed(bayes_factors_b) msg = "Reading the predictions files" logger.info(msg) bed_df_a = bed_utils.read_bed(predicted_orfs_a) bed_df_b = bed_utils.read_bed(predicted_orfs_b) differential_micropeptide_dfs = [] # extract micropeptides msg = "Extracting micropeptides" logger.info(msg) m_micropeptides_a = bed_df_a['orf_len'] <= args.max_micropeptide_len m_micropeptides_b = bed_df_b['orf_len'] <= args.max_micropeptide_len micropeptides_a = bed_df_a[m_micropeptides_a] micropeptides_b = bed_df_b[m_micropeptides_b] long_orfs_a = bed_df_a[~m_micropeptides_a] long_orfs_b = bed_df_b[~m_micropeptides_b] msg = "Finding micropeptides in A with no overlap in B" logger.info(msg) micropeptides_a_no_match_b = bed_utils.subtract_bed(micropeptides_a, bed_df_b, exons=exons) micropeptides_a_no_match_b_df = pd.DataFrame() micropeptides_a_no_match_b_df['A'] = list(micropeptides_a_no_match_b) micropeptides_a_no_match_b_df['B'] = None micropeptides_a_no_match_b_df['kl'] = np.inf micropeptides_a_no_match_b_df['overlap_type'] = 'micro_a_only' differential_micropeptide_dfs.append(micropeptides_a_no_match_b_df) msg = "Finding micropeptides in B with no overlap in A" logger.info(msg) micropeptides_b_no_match_a = bed_utils.subtract_bed(micropeptides_b, bed_df_a, exons=exons) micropeptides_b_no_match_a_df = pd.DataFrame() micropeptides_b_no_match_a_df['B'] = list(micropeptides_b_no_match_a) micropeptides_b_no_match_a_df['A'] = None micropeptides_b_no_match_a_df['kl'] = np.inf micropeptides_b_no_match_a_df['overlap_type'] = 'micro_b_only' differential_micropeptide_dfs.append(micropeptides_b_no_match_a_df) msg = "Finding overlapping micropeptides" logger.info(msg) micropeptides_a_micropeptides_b_df = get_overlap_df( micropeptides_a, micropeptides_b, 'micro_a_micro_b', bf_df_a, bf_df_b) differential_micropeptide_dfs.append(micropeptides_a_micropeptides_b_df) micropeptides_a_long_b_df = get_overlap_df(micropeptides_a, long_orfs_b, 'micro_a_long_b', bf_df_a, bf_df_b) 
differential_micropeptide_dfs.append(micropeptides_a_long_b_df) micropeptides_b_long_a_df = get_overlap_df(long_orfs_a, micropeptides_b, 'long_a_micro_b', bf_df_a, bf_df_b) differential_micropeptide_dfs.append(micropeptides_b_long_a_df) differential_micropeptides_df = pd.concat(differential_micropeptide_dfs) msg = "Adding read count information" logger.info(msg) res = differential_micropeptides_df.merge(bf_df_a[args.fields_to_keep], left_on='A', right_on='id', how='left') to_rename = {f: "{}_A".format(f) for f in args.fields_to_keep} res = res.rename(columns=to_rename) res = res.drop('id_A', axis=1) res = res.merge(bf_df_b[args.fields_to_keep], left_on='B', right_on='id', how='left') to_rename = {f: "{}_B".format(f) for f in args.fields_to_keep} res = res.rename(columns=to_rename) res = res.drop('id_B', axis=1) id_columns = ['A', 'B'] res = res.drop_duplicates(subset=id_columns) if not args.do_not_fix_tcons: # replace TCONS_ with TCONS res['A'] = res['A'].str.replace("TCONS_", "TCONS") res['B'] = res['B'].str.replace("TCONS_", "TCONS") msg = "Extracting the genes and their biotypes using pyensembl" logger.info(msg) ensembl = pyensembl.EnsemblRelease(release=args.ensembl_release, species=args.ensembl_species) ensembl_transcript_ids = set(ensembl.transcript_ids()) biotypes_a = parallel.apply_df_simple(res, get_transcript_and_biotype, 'A', ensembl, ensembl_transcript_ids) biotypes_b = parallel.apply_df_simple(res, get_transcript_and_biotype, 'B', ensembl, ensembl_transcript_ids) biotypes_a = utils.remove_nones(biotypes_a) biotypes_b = utils.remove_nones(biotypes_b) biotypes_a = pd.DataFrame(biotypes_a) biotypes_b = pd.DataFrame(biotypes_b) res = res.merge(biotypes_a, on='A', how='left') res = res.merge(biotypes_b, on='B', how='left') msg = "Pulling annotations from mygene.info" logger.info(msg) # pull annotations from mygene gene_info_a = mygene_utils.query_mygene(res['gene_id_A']) gene_info_b = mygene_utils.query_mygene(res['gene_id_B']) # and add the mygene info res = res.merge(gene_info_a, left_on='gene_id_A', right_on='gene_id', how='left') to_rename = {f: "{}_A".format(f) for f in gene_info_a.columns} to_rename.pop('gene_id') res = res.rename(columns=to_rename) res = res.drop('gene_id', axis=1) res = res.merge(gene_info_b, left_on='gene_id_B', right_on='gene_id', how='left') to_rename = {f: "{}_B".format(f) for f in gene_info_a.columns} to_rename.pop('gene_id') res = res.rename(columns=to_rename) res = res.drop('gene_id', axis=1) msg = "Removing duplicates" logger.info(msg) id_columns = ['A', 'B'] res = res.drop_duplicates(subset=id_columns) msg = "Adding --id-matches columns" logger.info(msg) for (id_match_file, name) in zip(args.id_matches, args.id_match_names): res = add_id_matches(res, id_match_file, name) msg = "Adding --overlaps columns" logger.info(msg) for (overlap_file, name) in zip(args.overlaps, args.overlap_names): res = add_overlaps(res, overlap_file, name, bed_df_a, bed_df_b, exons) msg = "Sorting by in-frame reads" logger.info(msg) res['x_1_sum_A'] = res['x_1_sum_A'].fillna(0) res['x_1_sum_B'] = res['x_1_sum_B'].fillna(0) res['x_1_sum'] = res['x_1_sum_A'] + res['x_1_sum_B'] res = res.sort_values('x_1_sum', ascending=False) if args.filter: msg = "Filtering the micropeptides by read coverage and KL-divergence" logger.info(msg) x_1_sum_ranks = res['x_1_sum'].rank(method='min', na_option='top', ascending=False) num_x_1_sum_ranks = x_1_sum_ranks.max() max_good_x_1_sum_rank = num_x_1_sum_ranks * args.read_filter_percent m_good_x_1_sum_rank = x_1_sum_ranks <= 
max_good_x_1_sum_rank msg = ("Number of micropeptides passing read filter: {}".format( sum(m_good_x_1_sum_rank))) logger.debug(msg) kl_ranks = res['kl'].rank(method='dense', na_option='top', ascending=False) num_kl_ranks = kl_ranks.max() max_good_kl_rank = num_kl_ranks * args.kl_filter_percent m_good_kl_rank = kl_ranks <= max_good_kl_rank msg = ("Number of micropeptides passing KL filter: {}".format( sum(m_good_kl_rank))) logger.debug(msg) m_both_filters = m_good_x_1_sum_rank & m_good_kl_rank msg = ("Number of micropeptides passing both filters: {}".format( sum(m_both_filters))) logger.debug(msg) res = res[m_both_filters] msg = "Writing differential micropeptides to disk" logger.info(msg) if not args.append_sheet: pandas_utils.write_df(res, args.out, index=False) else: sheet_name = "{},{}".format(args.name_a, args.name_b) utils.append_to_xlsx(res, args.out, sheet=sheet_name, index=False)
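# A minimal sketch of the rank-percentile filter applied above: keep only the
# rows that fall in the top `read_percent` by in-frame reads and the top
# `kl_percent` by KL-divergence. The column names mirror the ones used above;
# the percentage defaults are illustrative (the script uses
# --read-filter-percent and --kl-filter-percent).
def filter_by_rank_percent(res, read_percent=0.25, kl_percent=0.25):
    read_ranks = res['x_1_sum'].rank(method='min', na_option='top', ascending=False)
    m_good_reads = read_ranks <= read_ranks.max() * read_percent

    kl_ranks = res['kl'].rank(method='dense', na_option='top', ascending=False)
    m_good_kl = kl_ranks <= kl_ranks.max() * kl_percent

    return res[m_good_reads & m_good_kl]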
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='''Label the ORFs based on their transcript exon structure wrt the annotated transcripts.''') parser.add_argument('annotated_transcripts', help='''The annotated transcripts for the genome in BED12+ format.''') parser.add_argument('extracted_orfs', help='''The ORFs extracted from the transcripts in BED12+ format.''') parser.add_argument('out', help='''The output (BED12+.gz) file.''') parser.add_argument('-e', '--annotated-exons', help='''The annotated transcript exons can be passed with this option. If they are not given, they will be split from the annotated transcripts.''', default=None) parser.add_argument('-o', '--orf-exons', help='''The exon blocks for the ORFs, in BED6+ format, obtained from "split-bed12-blocks". If they are not given, they will be split from the extracted ORFs.''', default=None) parser.add_argument('-n', '--nonoverlapping-label', help='''If this option is given, then the ORFs which do not overlap the annotated transcripts at all will be given this label. By default, remaining oof overlapping ORFs are assigned the "overlap" label. If not given, the ORFs outside of annotated regions are labeled as "suspect".''', default=None) parser.add_argument('-l', '--label-prefix', help='''This string is prepended to all labels assigned to ORFs, e.g. to indicate ORFs from a de novo assembly (Rp-Bp assigns the label "novel" to these, however the string is not prepended to "canonical ORFs").''', default='') parser.add_argument('-f', '--filter', help='''If this flag is given, then ORFs which are completely covered by an annotated transcript are discarded. Use to filter uninteresting ORFs from a de novo assembly.''', action='store_true') parser.add_argument('-p', '--num-cpus', help='''The number of CPUs to use to perform BED operations.''', type=int, default=default_num_cpus) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "Reading annotated transcripts" logger.info(msg) annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts) # get the annotated transcript exons if args.annotated_exons is None: msg = "Splitting the annotated transcripts into exon blocks" logger.info(msg) annotated_exons = bed_utils.split_bed12(annotated_transcripts, num_cpus=args.num_cpus, progress_bar=True) else: msg = "Reading the annotated transcript exons" logger.info(msg) annotated_exons = bed_utils.read_bed(args.annotated_exons) msg = "Reading extracted ORFs" logger.info(msg) extracted_orfs = bed_utils.read_bed(args.extracted_orfs) if args.orf_exons is None: msg = "Splitting the extracted ORFs into exon blocks" logger.info(msg) extracted_orf_exons = bed_utils.split_bed12(extracted_orfs, num_cpus=args.num_cpus, progress_bar=True) else: msg = "Reading the extracted ORFs exons" logger.info(msg) extracted_orf_exons = bed_utils.read_bed(args.orf_exons) msg = "Found {} extracted ORFs with {} exons".format( len(extracted_orfs), len(extracted_orf_exons)) logger.debug(msg) # filter out the ORFs that are entirely within annotated transcripts if args.filter: msg = "Removing ORFs which are completely covered by the annotated transcripts" logger.info(msg) nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons, annotated_exons, min_a_overlap=1) m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids) extracted_orfs = extracted_orfs[m_unfiltered] # discard the unnecessary exons m_unfiltered = 
extracted_orf_exons['id'].isin(nonoverlapping_ids) extracted_orf_exons = extracted_orf_exons[m_unfiltered] msg = "After filtering, {} extracted ORFs remain".format( len(extracted_orfs)) logger.info(msg) # annotate and remove the ORFs which do not at all overlap the annotations if args.nonoverlapping_label is not None: nonoverlapping_ids = bed_utils.subtract_bed( extracted_orfs, annotated_transcripts, exons_a=extracted_orf_exons, exons_b=annotated_exons) m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids) extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping] m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids) extracted_orfs.loc[m_nonoverlapping, 'orf_type'] = args.nonoverlapping_label msg = ("Found {} ORFs completely non-overlapping annotated transcripts" .format(len(nonoverlapping_ids))) logger.info(msg) msg = "Removing the annotated UTRs from the transcripts" logger.info(msg) canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts, num_cpus=args.num_cpus) msg = "Splitting the canonical ORFs into exons" logger.info(msg) canonical_orf_exons = bed_utils.split_bed12(canonical_orfs, num_cpus=args.num_cpus, progress_bar=True) msg = "Extracting annotated 5' leader regions" logger.info(msg) five_prime_regions = bed_utils.retain_all_five_prime_of_thick( annotated_transcripts, num_cpus=args.num_cpus) if len(five_prime_regions) == 0: msg = "No annotated 5' leader regions were found" logger.warning(msg) msg = "Splitting the 5' leaders into exons" logger.info(msg) five_prime_exons = bed_utils.split_bed12(five_prime_regions, num_cpus=args.num_cpus, progress_bar=True) msg = "Extracting annotated 3' trailer regions" logger.info(msg) three_prime_regions = bed_utils.retain_all_three_prime_of_thick( annotated_transcripts, num_cpus=args.num_cpus) if len(three_prime_regions) == 0: msg = "No annotated 3' trailer regions were found" logger.warning(msg) msg = "Splitting the 3' trailers into exons" logger.info(msg) three_prime_exons = bed_utils.split_bed12(three_prime_regions, num_cpus=args.num_cpus, progress_bar=True) msg = "Splitting non-coding transcripts into exons" logger.info(msg) m_no_thick_start = annotated_transcripts['thick_start'] == -1 m_no_thick_end = annotated_transcripts['thick_end'] == -1 m_no_thick = m_no_thick_start & m_no_thick_end noncoding_transcripts = annotated_transcripts[m_no_thick] noncoding_exons = bed_utils.split_bed12(noncoding_transcripts, num_cpus=args.num_cpus, progress_bar=True) # First, remove all in-frame (canonical, canonical variants), and also within and oof ORFs msg = "Marking canonical and extracted ORFs with the same stop codon" logger.info(msg) # first, add the "true" ORF end m_reverse_canonical = canonical_orfs['strand'] == '-' canonical_orfs['orf_end'] = canonical_orfs['end'] canonical_orfs.loc[m_reverse_canonical, 'orf_end'] = canonical_orfs.loc[m_reverse_canonical, 'start'] m_reverse_extracted = extracted_orfs['strand'] == '-' extracted_orfs['orf_end'] = extracted_orfs['end'] extracted_orfs.loc[m_reverse_extracted, 'orf_end'] = extracted_orfs.loc[m_reverse_extracted, 'start'] # then, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs merge_fields = ['seqname', 'strand', 'orf_end'] canonical_extracted_orf_ends = canonical_orfs.merge( extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted']) # finally, pull this into a set zip_it = zip(canonical_extracted_orf_ends['id_canonical'], canonical_extracted_orf_ends['id_extracted']) canonical_extracted_matching_ends = {(c, a) 
for c, a in zip_it} msg = "Finding ORFs which exactly overlap the canonical ORFs" logger.info(msg) exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons, min_a_overlap=1, min_b_overlap=1) exact_match_orf_ids = {m.b_info for m in exact_matches} m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids) extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches] m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids) label = 'canonical' extracted_orfs.loc[m_canonical, 'orf_type'] = label msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids)) logger.info(msg) msg = "Finding truncated canonical ORFs" logger.info(msg) truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons, min_b_overlap=1) truncated_match_ids = { m.b_info for m in truncated_matches if (m.a_info, m.b_info) in canonical_extracted_matching_ends } m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids) extracted_orf_exons = extracted_orf_exons[~m_truncated_matches] m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids) msg = "Finding extended canonical ORFs" logger.info(msg) extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons, min_a_overlap=1) # For standard assembly, we also need to make sure that # all extended matches are fully contained within the # transcript structure (i.e start upstream but otherwise # have the same structure). if args.nonoverlapping_label is None: transcript_matches = bed_utils.get_bed_overlaps(annotated_exons, extracted_orf_exons, min_b_overlap=1) transcript_match_pairs = {(m.a_info, m.b_info) for m in transcript_matches} extended_match_ids = { m.b_info for m in extended_matches if (m.a_info, m.b_info) in transcript_match_pairs and ( m.a_info, m.b_info) in canonical_extracted_matching_ends } else: extended_match_ids = { m.b_info for m in extended_matches if (m.a_info, m.b_info) in canonical_extracted_matching_ends } m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids) extracted_orf_exons = extracted_orf_exons[~m_extended_matches] m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids) m_canonical_variants = m_canonical_truncated | m_canonical_extended label = "{}canonical_variant".format(args.label_prefix) extracted_orfs.loc[m_canonical_variants, 'orf_type'] = label msg = "Found {} canonical_variant ORFs".\ format(len(extended_match_ids | truncated_match_ids)) logger.info(msg) msg = ("Finding within canonical ORFs that do not share an " "annotated stop codon with a canonical ORF (e.g. 
in " "frame stop, out-of-frame)") logger.info(msg) within_ids = { m.b_info for m in truncated_matches if m.b_info not in truncated_match_ids } m_within_matches = extracted_orf_exons['id'].isin(within_ids) extracted_orf_exons = extracted_orf_exons[~m_within_matches] m_within = extracted_orfs['id'].isin(within_ids) label = "{}within".format(args.label_prefix) extracted_orfs.loc[m_within, 'orf_type'] = label msg = "Found {} within ORFs".format(len(within_ids)) logger.info(msg) # find all overlapping ORFs msg = "Finding all UTR overlap matches" logger.info(msg) out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons, extracted_orf_exons) leader_matches = bed_utils.get_bed_overlaps(five_prime_exons, extracted_orf_exons) trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons, extracted_orf_exons) msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a " "canonical ORF and annotated leaders or trailers") logger.info(msg) # We need to choose how to ensure that up-/downstream overlaps are unique. # Where an ORF overlaps both the 5'UTR and the 3'UTR of different same # sense overlapping transcripts, it is assigned by default to the downstream overlap. # For de novo, everything is labeled as overlap. leader_match_pairs = {(m.a_info, m.b_info) for m in leader_matches} trailer_match_pairs = {(m.a_info, m.b_info) for m in trailer_matches} if args.nonoverlapping_label is None: # For standard assembly, we also need to make sure that # all overlap matches are fully contained within the # transcript structure. transcript_matches = bed_utils.get_bed_overlaps(annotated_exons, extracted_orf_exons, min_b_overlap=1) transcript_match_pairs = {(m.a_info, m.b_info) for m in transcript_matches} leader_overlap_pairs = { (m.a_info, m.b_info) for m in out_of_frame_matches if (m.a_info, m.b_info) in leader_match_pairs and ( m.a_info, m.b_info) not in trailer_match_pairs and ( m.a_info, m.b_info) in transcript_match_pairs } trailer_overlap_pairs = { (m.a_info, m.b_info) for m in out_of_frame_matches if (m.a_info, m.b_info) in trailer_match_pairs and ( m.a_info, m.b_info) not in leader_match_pairs and ( m.a_info, m.b_info) in transcript_match_pairs } # We do not assign preference where the ORF overlaps both sides # of the coding sequence on the same transcript, any ORF # satisfying both will be labeled simply as overlap. 
overlap_ids = { m.b_info for m in out_of_frame_matches if (m.a_info, m.b_info) in leader_match_pairs and ( m.a_info, m.b_info) in trailer_match_pairs and ( m.a_info, m.b_info) in transcript_match_pairs } trailer_overlap_ids = { pair[1] for pair in trailer_overlap_pairs if pair[1] not in overlap_ids } leader_overlap_ids = { pair[1] for pair in leader_overlap_pairs if pair[1] not in trailer_overlap_ids and pair[1] not in overlap_ids } m_overlap_matches = extracted_orf_exons['id'].isin(overlap_ids) extracted_orf_exons = extracted_orf_exons[~m_overlap_matches] m_leader_overlap_matches = extracted_orf_exons['id'].isin( leader_overlap_ids) extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches] m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids) label = "{}five_prime_overlap".format(args.label_prefix) extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = label m_trailer_overlap_matches = extracted_orf_exons['id'].isin( trailer_overlap_ids) extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches] m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids) label = "{}three_prime_overlap".format(args.label_prefix) extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = label msg = "Found {} five_prime_overlap ORFs".format( len(leader_overlap_ids)) logger.info(msg) msg = "Found {} three_prime_overlap ORFs".format( len(trailer_overlap_ids)) logger.info(msg) else: overlap_ids = {m.b_info for m in out_of_frame_matches} overlap_ids |= {m.b_info for m in leader_matches} overlap_ids |= {m.b_info for m in trailer_matches} m_overlap_matches = extracted_orf_exons['id'].isin(overlap_ids) extracted_orf_exons = extracted_orf_exons[~m_overlap_matches] m_overlap = extracted_orfs['id'].isin(overlap_ids) label = "{}overlap".format(args.label_prefix) extracted_orfs.loc[m_overlap, 'orf_type'] = label msg = "Found {} overlap ORFs".format(len(overlap_ids)) logger.info(msg) msg = "Finding ORFs completely within 5' or 3' leaders" logger.info(msg) leader_matches = bed_utils.get_bed_overlaps(five_prime_exons, extracted_orf_exons, min_b_overlap=1) leader_ids = {m.b_info for m in leader_matches} m_leader_matches = extracted_orf_exons['id'].isin(leader_ids) extracted_orf_exons = extracted_orf_exons[~m_leader_matches] m_five_prime = extracted_orfs['id'].isin(leader_ids) label = "{}five_prime".format(args.label_prefix) extracted_orfs.loc[m_five_prime, 'orf_type'] = label msg = "Found {} five_prime ORFs".format(len(leader_ids)) logger.info(msg) trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons, extracted_orf_exons, min_b_overlap=1) trailer_ids = {m.b_info for m in trailer_matches} m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids) extracted_orf_exons = extracted_orf_exons[~m_trailer_matches] m_three_prime = extracted_orfs['id'].isin(trailer_ids) label = "{}three_prime".format(args.label_prefix) extracted_orfs.loc[m_three_prime, 'orf_type'] = label msg = "Found {} three_prime ORFs".format(len(trailer_ids)) logger.info(msg) msg = "Finding ORFs completely within annotated, non-coding transcripts" logger.info(msg) noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons, extracted_orf_exons, min_b_overlap=1) noncoding_ids = {m.b_info for m in noncoding_matches} m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids) extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches] m_noncoding = extracted_orfs['id'].isin(noncoding_ids) label = "{}noncoding".format(args.label_prefix) extracted_orfs.loc[m_noncoding, 'orf_type'] = label msg 
= "Found {} noncoding ORFs".format(len(noncoding_ids)) logger.info(msg) # all of the remaining ORFs fall into the "suspect" category suspect_ids = {orf_id for orf_id in extracted_orf_exons['id']} m_suspect = extracted_orfs['id'].isin(suspect_ids) label = "{}suspect".format(args.label_prefix) extracted_orfs.loc[m_suspect, 'orf_type'] = label n_suspect_ids = len(suspect_ids) msg = "Remaining {} ORFs labeled as suspect".format(n_suspect_ids) logger.info(msg) m_no_orf_type = extracted_orfs['orf_type'].isnull() msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type)) logger.info(msg) msg = "Writing ORFs with labels to disk" logger.info(msg) extracted_orfs = bed_utils.sort(extracted_orfs) msg = ("The ORF labels will be written to {} in the next major release.". format(args.out)) logger.warning(msg) additional_columns = ['orf_num', 'orf_len', 'orf_type'] fields = bed_utils.bed12_field_names + additional_columns orfs_genomic = extracted_orfs[fields] bed_utils.write_bed(orfs_genomic, args.extracted_orfs) label_columns = ['id', 'duplicates', 'orf_type'] extracted_orfs = extracted_orfs[label_columns] bed_utils.write_bed(extracted_orfs, args.out)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script creates a line graph showing the length distributions " "of the various types of ORFs. Optionally, it can also include the length " "distribution of ORFs downloaded from uniprot. If uniprot ORFs are given, then the " "KL-divergence between the type distributions and the uniprot ORFs is calculated." ) parser.add_argument('orfs', help="The BED12+ file with the ORFs") parser.add_argument('out', help="The output (image) file") parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available", default=default_uniprot) parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in " "the plot", default=default_uniprot_label) parser.add_argument('--title', help="The title to use for the plot", default=default_title) parser.add_argument('--use-groups', help="If this flag is given, the the ORFs " "will be grouped", action='store_true') args = parser.parse_args() orfs = bed_utils.read_bed(args.orfs) if args.use_groups: orf_lengths = [ get_orf_lengths(orfs, ribo_utils.orf_type_labels_mapping[label]) for label in ribo_utils.orf_type_labels ] prediction_labels = [ latex.get_latex_safe_string(l) for l in ribo_utils.orf_type_labels ] prediction_lengths_list = orf_lengths else: orf_lengths = [ get_orf_lengths(orfs, [orf_type]) for orf_type in ribo_utils.orf_types ] prediction_labels = [ latex.get_latex_safe_string(l) for l in ribo_utils.orf_types ] prediction_lengths_list = orf_lengths if os.path.exists(args.uniprot): truth_nt_lengths = bio.get_uniprot_nt_lengths(args.uniprot) truth_label = args.uniprot_label else: truth_nt_lengths = None truth_label = None #prediction_lengths_list = [bf_lengths, chisq_lengths] #prediction_labels = ['BF', r'$\chi^2$'] # input: truth_nt_lengths (array-like) # prediction_lengths_list (list of array-likes) # truth_label (string) # prediction_labels (list of array-likes) # # if truth_nt_lengths is not defined, then the KL-divergence calculations # will be skipped (and it will not be shown) fontsize = 20 legend_fontsize = 20 title_fontsize = 20 linewidth = 4 # plot the empirical distribution of ORF lengths hist_min = 200 hist_max = 5250 hist_step = 200 hist_range = (hist_min, hist_max) hist_bins = np.arange(hist_min, hist_max, hist_step) if truth_nt_lengths is not None: truth_hist, _ = np.histogram(truth_nt_lengths, bins=hist_bins, range=hist_range, density=True) else: truth_hist = None prediction_hists = [] for prediction_lengths in prediction_lengths_list: prediction_hist, _ = np.histogram(prediction_lengths, bins=hist_bins, range=hist_range, density=True) prediction_hists.append(prediction_hist) # now, normalize the histograms if truth_hist is not None: truth_hist = truth_hist / np.sum(truth_hist) truth_hist += 1e-3 for i, prediction_hist in enumerate(prediction_hists): prediction_hists[i] = prediction_hist / np.sum(prediction_hist) prediction_hists[i] += 1e-3 kls = [] if truth_hist is not None: for i, prediction_hist in enumerate(prediction_hists): kl = math_utils.calculate_symmetric_kl_divergence( truth_hist, prediction_hist, scipy.stats.entropy) kls.append(kl) # and update the label prediction_labels[i] = '{}, KL: ${:.2f}$'.format( prediction_labels[i], kl) if truth_hist is not None: truth_hist = 100 * truth_hist for i, prediction_hist in enumerate(prediction_hists): prediction_hists[i] *= 100 fig, ax = plt.subplots(figsize=(10, 5)) cm = plt.cm.gist_earth x = np.arange(len(hist_bins) - 1) truth_cm_offset = 0.1 
if truth_hist is not None: color = cm(truth_cm_offset) ax.plot(x, truth_hist, label=truth_label, linewidth=linewidth, color=color) color_range = 1 - 2 * truth_cm_offset for i, prediction_hist in enumerate(prediction_hists): color = i / len(prediction_hists) * color_range color += 2 * truth_cm_offset color = cm(color) ax.plot(x, prediction_hist, label=prediction_labels[i], linewidth=linewidth, color=color) ax.set_xlabel('Length (bp)', fontsize=fontsize) ax.set_ylabel('\% of predicted ORFs', fontsize=fontsize) if len(args.title) > 0: ax.set_title(args.title, fontsize=fontsize) ax.set_xticks(x[::2]) ax.set_xticklabels(hist_bins[::2], fontsize=fontsize, rotation=90) ax.set_ylim((0, 20)) ax.set_xlim((0, len(hist_bins))) # hide the "0" tick label yticks = ax.yaxis.get_major_ticks() yticks[0].label1.set_visible(False) # chop off everything from 3000 on index_of_3000 = 14 ax.set_xlim((0, index_of_3000)) #ax.set_xlim((0, len(uniprot_hist)-1)) lgd = ax.legend(loc='center right', fontsize=legend_fontsize, bbox_to_anchor=(1.75, 0.5)) ax.tick_params(axis='both', which='major', labelsize=fontsize) fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd, ))
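# A small sketch of the divergence computation referenced above. Both length
# distributions are binned, normalized, and lightly smoothed so that empty
# bins do not produce infinite values, then compared with a symmetrized
# KL-divergence. The bin edges and smoothing constant match the values used
# above; the exact definition inside math_utils.calculate_symmetric_kl_divergence
# may differ, so treat this as an approximation.
import numpy as np
import scipy.stats

def symmetric_kl_of_lengths(lengths_a, lengths_b, hist_min=200, hist_max=5250,
                            hist_step=200, smoothing=1e-3):
    bins = np.arange(hist_min, hist_max, hist_step)
    hist_a, _ = np.histogram(lengths_a, bins=bins)
    hist_b, _ = np.histogram(lengths_b, bins=bins)

    # normalize and smooth so the divergence is finite even with empty bins
    hist_a = hist_a / hist_a.sum() + smoothing
    hist_b = hist_b / hist_b.sum() + smoothing

    # scipy.stats.entropy(p, q) is the (asymmetric) KL-divergence D(p || q)
    return scipy.stats.entropy(hist_a, hist_b) + scipy.stats.entropy(hist_b, hist_a)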
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="""This script constructs the profile for each ORF. It first adjusts the mapped read positions to properly align with the P-sites. Second, it uses a custom chrom-sweep algorithm to find the coverage of each position in each exon of each ORF. Finally, the ORF exons are glued together to find the profile of the entire ORF.""") parser.add_argument('bam', help="The bam file including filtered (unique, etc.) alignments") parser.add_argument('orfs', help="The (bed12) file containing the ORFs") parser.add_argument('exons', help="The (bed6+2) file containing the exons") parser.add_argument('out', help="The (mtx.gz) output file containing the ORF profiles") parser.add_argument('-l', '--lengths', help="""If any values are given, then only reads which have those lengths will be included in the signal construction.""", type=int, default=[], nargs='*') parser.add_argument('-o', '--offsets', help="""The 5' end of reads will be shifted by this amount. There must be one offset value for each length (given by the --lengths argument.""", type=int, default=[], nargs='*') parser.add_argument('-k', '--num-exons', help="If k>0, then only the first k exons will be processed.", type=int, default=0) parser.add_argument('-g', '--num-groups', help=""""The number of groups into which to split the exons. More groups means the progress bar is updated more frequently but incurs more overhead because of the parallel calls.""", type=int, default=default_num_groups) parser.add_argument('--seqname-prefix', help="""If present, this string will be prepended to the seqname field of the ORFs.""", default='') slurm.add_sbatch_options(parser) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv)) logger.info(msg) # make sure the number of lengths and offsets match if len(args.lengths) != len(args.offsets): msg = "The number of --lengths and --offsets do not match." 
raise ValueError(msg) # make sure the necessary files exist required_files = [args.bam, args.orfs, args.exons] msg = "[extract-orf-profiles]: Some input files were missing: " utils.check_files_exist(required_files, msg=msg) if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return msg = "Finding P-sites" logger.info(msg) p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets) # we do not need the data frame anymore, so save some memory msg = "Reading exons" logger.info(msg) exons = bed_utils.read_bed(args.exons) msg = "Reading ORFs" logger.info(msg) orfs = bed_utils.read_bed(args.orfs) if len(args.seqname_prefix) > 0: orfs['seqname'] = args.seqname_prefix + orfs['seqname'] exons['seqname'] = args.seqname_prefix + exons['seqname'] if args.num_exons > 0: exons = exons.head(args.num_exons) num_orfs = orfs['orf_num'].max() + 1 max_orf_len = orfs['orf_len'].max() msg = "Adding the ORF index to the exons" logger.info(msg) orf_fields = ['id', 'orf_num'] exons_orfs = exons.merge(orfs[orf_fields], on='id') msg = "Splitting exons and P-sites" logger.info(msg) exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups) exons_dfs = [] psites_dfs = [] for group_index, exon_group in exon_groups: # pull out only the p-sites that come from these chromosomes seqnames = set(exon_group['seqname'].unique()) m_psites = p_sites['seqname'].isin(seqnames) exons_dfs.append(exon_group) psites_dfs.append(p_sites[m_psites]) # we no longer need the full list of psites del p_sites del exons_orfs del exon_groups del exons gc.collect() exons_psites = zip(exons_dfs, psites_dfs) msg = "Finding all P-site intersections" logger.info(msg) sum_profiles = parallel.apply_parallel_iter( exons_psites, args.num_cpus, get_all_p_site_intersections, num_orfs, max_orf_len, progress_bar=True, total=len(exons_dfs), backend='multiprocessing' ) msg = "Combining the ORF profiles into one matrix" logger.info(msg) f = lambda x, y: x+y sum_profiles = functools.reduce(f, sum_profiles) sum_profiles_lil = sum_profiles.tolil() msg = "Flipping the reverse strand profiles" logger.info(msg) m_reverse = orfs['strand'] == '-' reverse_orfs = orfs[m_reverse] for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()): orf_num = reverse_orf['orf_num'] if sum_profiles[orf_num].sum() == 0: continue orf_len = reverse_orf['orf_len'] dense = utils.to_dense(sum_profiles, orf_num, length=orf_len) dense = dense[::-1] sum_profiles_lil[orf_num, :orf_len] = dense msg = "Writing the sparse matrix to disk" logger.info(msg) math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
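# A self-contained sketch of the reverse-strand flip performed at the end of
# the script above: each ORF profile is stored left-to-right in genomic
# coordinates, so profiles of '-' strand ORFs must be reversed to run
# 5' to 3'. The function name is illustrative; the matrix plays the role of
# sum_profiles_lil and the DataFrame the role of orfs.
import numpy as np

def flip_reverse_profiles(profiles_lil, orfs_df):
    """profiles_lil: scipy.sparse.lil_matrix with one row per orf_num.
    orfs_df: DataFrame with 'orf_num', 'orf_len', and 'strand' columns."""
    m_reverse = orfs_df['strand'] == '-'
    for _, orf in orfs_df[m_reverse].iterrows():
        orf_num, orf_len = orf['orf_num'], orf['orf_len']
        dense = np.asarray(profiles_lil[orf_num, :orf_len].todense()).ravel()
        if dense.sum() == 0:
            continue
        profiles_lil[orf_num, :orf_len] = dense[::-1]
    return profiles_lil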
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a bar chart which shows the count of "
        "each ORF type in a given BED12+ file. Optionally, the ORFs can be "
        "grouped into similar types.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--title', help="The title to use for the plot",
        default=default_title)

    parser.add_argument('--use-groups', help="If this flag is given, the ORFs "
        "will be grouped", action='store_true')

    parser.add_argument('--fontsize', type=int, default=default_fontsize)
    parser.add_argument('--legend-fontsize', type=int, default=default_legend_fontsize)
    parser.add_argument('--ymax', type=int, default=default_ymax)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.orfs)

    if args.use_groups:
        bed['orf_type_group'] = bed['orf_type'].map(
            ribo_utils.orf_type_labels_reverse_mapping)
        orf_type_counts = bed.groupby(['orf_type_group', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type_group'].map(
            ribo_utils.orf_type_labels_display_name_map)
    else:
        orf_type_counts = bed.groupby(['orf_type', 'strand']).size()
        orf_type_counts = orf_type_counts.reset_index(name="count")
        orf_type_counts['display_name'] = orf_type_counts['orf_type'].map(
            ribo_utils.orf_type_display_name_map)

    msg = "Creating the bar chart"
    logger.info(msg)

    color = sns.palettes.color_palette("Set3", n_colors=3)

    fig, ax = plt.subplots(figsize=(9, 5))
    sns.barplot(x="display_name", y="count", hue="strand", data=orf_type_counts,
        ax=ax, zorder=-1, palette='Set3', log=True)
    sns.despine()

    ax.legend(loc='upper right', bbox_to_anchor=(1.0, 0.95),
        fontsize=args.legend_fontsize, frameon=True, framealpha=0.9, title="Strand")
    mpl_utils.set_legend_title_fontsize(ax, args.fontsize)

    #ax.set_yscale('log')
    #ax.set_ylim((1, args.ymax))

    ax.set_ylabel("Number of ORFs", fontsize=args.fontsize)
    ax.set_xlabel("", fontsize=0)

    # rotate the ORF type names
    mpl_utils.set_ticklabels_fontsize(ax, args.fontsize)
    mpl_utils.set_ticklabel_rotation(ax, axis='x', rotation=90)

    # place the ORF type names in the middle of the bar
    for ticklabel in ax.xaxis.get_ticklabels():
        p = ticklabel.get_position()
        ticklabel.set_position((p[0], 0.1))
        ticklabel.set_verticalalignment('bottom')

    if args.title is not None:
        ax.set_title(args.title, fontsize=args.fontsize)

    if args.out is not None:
        fig.savefig(args.out, bbox_inches='tight')
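# A tiny sketch of how the plotted counts above are assembled: group by
# (type, strand), count rows, and map each type to a display name. The mapping
# dict passed in is a stand-in for ribo_utils.orf_type_display_name_map.
def count_orf_types(bed, display_name_map):
    counts = bed.groupby(['orf_type', 'strand']).size().reset_index(name='count')
    counts['display_name'] = counts['orf_type'].map(display_name_map)
    return counts

# example usage (with pandas):
#   bed = pd.DataFrame({'orf_type': ['canonical', 'five_prime', 'canonical'],
#                       'strand': ['+', '+', '-']})
#   count_orf_types(bed, {'canonical': 'Canonical', 'five_prime': "5' leader"})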
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script uses the mygene.info service to find annotations "
        "for the transcripts associated with the ORFs in the given bed file. In "
        "particular, it extracts information from Swiss-Prot, TrEMBL, Interpro, "
        "PDB, Pfam, PROSITE, the Gene Ontology, and KEGG.")

    parser.add_argument('bed', help="The bed file")
    parser.add_argument('out', help="The output file. Its type will be inferred "
        "from its extension.")

    parser.add_argument('--do-not-trim', help="By default, the script will "
        "attempt to trim transcript identifiers such that they are valid Ensembl "
        "identifiers. If this flag is given, no trimming will take place.",
        action='store_true')

    parser.add_argument('--scopes', help="A list of scopes to use when querying "
        "mygene.info. Please see the documentation for more information about "
        "valid scopes: http://mygene.info/doc/query_service.html#available_fields",
        nargs='*', default=default_scopes)

    parser.add_argument('--do-not-convert-ids', help="By default, the script will "
        "treat the identifiers in the file as transcript identifiers. It first "
        "maps those to gene identifiers, and then it uses those to find the "
        "gene annotations. If the identifiers are already gene ids (or whatever "
        "is specified by scopes), then the first mapping is not necessary and "
        "can be skipped using this flag.", action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    convert_ids = not args.do_not_convert_ids

    msg = "Reading the bed file"
    logger.info(msg)
    bed = bed_utils.read_bed(args.bed)
    bed = bed[fields_to_keep]

    msg = "Extracting transcript ids"
    logger.info(msg)
    trim = not args.do_not_trim
    orf_ids = parallel.apply_iter_simple(bed['id'], parse_orf_id, trim)
    orf_ids_df = pd.DataFrame(orf_ids)

    if convert_ids:
        msg = "Querying transcript to gene id mapping"
        logger.info(msg)
        gene_ids = mygene_utils.get_transcript_to_gene_mapping(orf_ids_df['transcript_id'])
    else:
        gene_ids = pd.DataFrame()
        gene_ids['transcript_id'] = orf_ids_df['transcript_id']
        gene_ids['gene_id'] = orf_ids_df['transcript_id']

    msg = "Querying gene annotations"
    logger.info(msg)
    res_df = mygene_utils.query_mygene(gene_ids['gene_id'])

    msg = "Combining gene annotations with transcript ids"
    logger.info(msg)
    res_df = gene_ids.merge(res_df, on='gene_id', how='inner')

    msg = "Combining transcript annotations with ORF ids"
    logger.info(msg)
    orf_ids_fields = ['transcript_id', 'orf_id']
    res_df = orf_ids_df[orf_ids_fields].merge(res_df, on='transcript_id', how='inner')

    msg = "Combining ORF annotations with ORF predictions"
    logger.info(msg)
    res_df = bed.merge(res_df, left_on='id', right_on='orf_id', how='left')

    msg = "Writing ORF annotations to disk"
    logger.info(msg)
    pandas_utils.write_df(res_df, args.out, index=False)
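# A compact sketch of the merge chain above: gene-level annotations are joined
# back to transcript ids, then to ORF ids, and finally onto the BED records.
# All frames and column names here are illustrative stand-ins for the outputs
# of parse_orf_id, mygene_utils.get_transcript_to_gene_mapping, and
# mygene_utils.query_mygene.
def attach_annotations(bed, orf_ids_df, gene_ids, annotations):
    """bed: BED records with an 'id' column.
    orf_ids_df: 'orf_id' plus the 'transcript_id' parsed from it.
    gene_ids: 'transcript_id' to 'gene_id' mapping.
    annotations: per-'gene_id' annotation columns."""
    res = gene_ids.merge(annotations, on='gene_id', how='inner')
    res = orf_ids_df[['transcript_id', 'orf_id']].merge(res, on='transcript_id', how='inner')
    return bed.merge(res, left_on='id', right_on='orf_id', how='left')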
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a pie chart which shows the proportion of "
        "each ORF type in a given BED12+ file. Optionally, the ORFs can be grouped "
        "into similar types.")

    parser.add_argument('orfs', help="The BED12+ file with the ORFs")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('--title', help="The title to use for the plot",
        default=default_title)

    parser.add_argument('--use-groups', help="If this flag is given, the ORFs "
        "will be grouped", action='store_true')

    args = parser.parse_args()

    orfs = bed_utils.read_bed(args.orfs)

    strands = ['+', '-']
    fracs = []
    labels = []
    for strand in ['+', '-']:
        m_strand = orfs['strand'] == strand
        orf_type_groups = orfs[m_strand].groupby('orf_type')
        counts = orf_type_groups.size()

        if args.use_groups:
            lab = ribo_utils.orf_type_labels
            fr = [get_orf_label_counts(counts, l) for l in lab]
        else:
            fr = counts.values
            lab = np.array(counts.index)

        lab = ["{} ({})".format(l, f) for l, f in zip(lab, fr)]

        fracs.append(fr)
        labels.append(lab)

    fig, axes = plt.subplots(ncols=2, figsize=(10, 5))

    cmap = plt.cm.Blues
    colors = cmap(np.linspace(0., 1., len(labels[0])))

    # forward strand ORFs
    extra_artists = []
    if sum(fracs[0]) > 0:
        patches, texts = axes[0].pie(fracs[0], colors=colors)
        lgd = axes[0].legend(patches, labels[0], loc="center right",
                             bbox_to_anchor=(0, 0.5))
        axes[0].set_title("Strand: {}".format(strands[0]))
        extra_artists.append(lgd)
    else:
        title = "Strand: {}. No ORFs".format(strands[0])
        axes[0].set_title(title)
        axes[0].set_axis_off()

    # reverse strand ORFs
    if sum(fracs[1]) > 0:
        patches, texts = axes[1].pie(fracs[1], colors=colors)
        lgd = axes[1].legend(patches, labels[1], loc="center right",
                             bbox_to_anchor=(2.0, 0.5))
        axes[1].set_title("Strand: {}".format(strands[1]))
        extra_artists.append(lgd)
    else:
        title = "Strand: {}. No ORFs".format(strands[1])
        axes[1].set_title(title)
        axes[1].set_axis_off()

    if len(args.title) > 0:
        sup = fig.suptitle(args.title)
        extra_artists.append(sup)

    fig.savefig(args.out, bbox_extra_artists=extra_artists, bbox_inches='tight')
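# get_orf_label_counts is not shown in this section. Under the assumption,
# consistent with the length-distribution script above, that
# ribo_utils.orf_type_labels_mapping maps a group label to the list of ORF
# types it contains, a minimal version would simply sum the per-type counts.
# The function below is a hypothetical sketch along those lines.
def get_orf_label_counts_sketch(counts, orf_types_in_label):
    """counts: pandas Series indexed by orf_type (from groupby().size()).
    orf_types_in_label: the list of ORF types belonging to one group label."""
    present = [t for t in orf_types_in_label if t in counts.index]
    return counts.loc[present].sum() if present else 0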
def main(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="""Given a list of ORFs with associated Bayes factors and a fasta sequence file, this script extracts the sequences of the ORFs whose Bayes factor exceeds the given threshold. Finally, biopython is used to translate the selected ORFs into protein sequences. The min-length and minimum-profile-sum filters are applied in the obvious way. For both BF and chi-square predictions, only ORFs which have more reads in the first reading frame than either of the other two will be selected as translated. (This is called the 'frame filter' below.) The selection based on Bayes factors follows this logic: if max_bf_var is given, then it and min_bf_mean are taken as a hard threshold on the estimated Bayes factor mean. If min_bf_likelihood is given, then this min_bf_mean is taken as the boundary value; that is, an ORF is 'translated' if: [P(bf > min_bf_mean)] > min_bf_likelihood. If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is taken as a hard threshold on the mean for selecting translated ORFs. If both max_bf_var and min_bf_likelihood are given, then both filters will be applied and the result will be the intersection. If the --use-chi-square option is given, the significance value is Bonferroni-corrected based on the number of ORFs which meet the length, profile and frame filters.""") parser.add_argument('bayes_factors', help="""The file containing the ORFs and Bayes' factors (BED12+).""") parser.add_argument('fasta', help="The *genome* fasta file") parser.add_argument('predicted_orfs', help="""The (output) BED12+ file containing the predicted ORFs.""") parser.add_argument('predicted_dna_sequences', help="""The (output) fasta file containing the predicted ORF sequences, as DNA sequences.""") parser.add_argument('predicted_protein_sequences', help="""The (output) fasta file containing the predicted ORF sequences, as protein sequences.""") parser.add_argument('--select-longest-by-stop', help="""If this flag is given, then the selected ORFs will be merged based on stop codons. In particular, only the longest translated ORF at each stop codon will be selected.""", action='store_true') parser.add_argument('--select-best-overlapping', help="""If this flag is given, then only the ORF with the highest estimated Bayes factor will be kept among each set of overlapping ORFs. N.B. 
This filter is applied *AFTER* selecting the longest ORF at each stop codon, if the --select-longest-by-stop flag is given.""", action='store_true') parser.add_argument('--min-length', help="The minimum length to predict an ORF as translated", type=int, default=translation_options['orf_min_length']) parser.add_argument('--min-bf-mean', help="""The minimum Bayes' factor mean to predict an ORF as translated (use --help for more details)""", type=float, default=translation_options['min_bf_mean']) parser.add_argument('--max-bf-var', help="""The maximum Bayes' factor variance to predict an ORF as translated (use --help for more details).""", type=float, default=translation_options['max_bf_var']) parser.add_argument('--min-bf-likelihood', help="""If given, then this is taken as a threshold on the likelihood of translation (use --help for more details).""", type=float, default=translation_options['min_bf_likelihood']) parser.add_argument('--min-profile', help="""The minimum sum across all reading frames to consider an ORF as translated""", type=float, default=translation_options['orf_min_profile_count']) parser.add_argument('--chi-square-only', help="""If this flag is present, then the chi square value will be used to predict ORFs rather than the Bayes' factor.""", action='store_true') parser.add_argument('--chisq-significance-level', help="""If using chi square, then this value is Bonferroni-corrected and used as the significance cutoff, else it is ignored.""", type=float, default=translation_options['chisq_alpha']) parser.add_argument('--filtered-orf-types', help="""A list of ORF types which will be removed before selecting the final prediction set.""", nargs='*', default=[]) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) # first, extract all of the predictions which exceed the threshold msg = "Reading Bayes factor information" logger.info(msg) bayes_factors = bed_utils.read_bed(args.bayes_factors) if len(args.filtered_orf_types) > 0: filtered_orf_types_str = ','.join(args.filtered_orf_types) msg = "Filtering these ORF types: {}".format(filtered_orf_types_str) logger.info(msg) m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types) bayes_factors = bayes_factors[~m_orf_types] msg = "Identifying ORFs which meet the prediction thresholds" logger.info(msg) all_orfs, predicted_orfs = ribo_utils.get_predicted_orfs( bayes_factors, min_signal=args.min_profile, min_length=args.min_length, min_bf_mean=args.min_bf_mean, max_bf_var=args.max_bf_var, min_bf_likelihood=args.min_bf_likelihood, chisq_alpha=args.chisq_significance_level, select_longest_by_stop=args.select_longest_by_stop, use_chi_square=args.chi_square_only ) msg = "Number of selected ORFs: {}".format(len(predicted_orfs)) logger.info(msg) if args.select_best_overlapping: msg = "Finding overlapping ORFs" logger.info(msg) merged_intervals = bed_utils.merge_all_intervals(predicted_orfs) msg = "Selecting best among overlapping ORFs" logger.info(msg) predicted_orfs = parallel.apply_iter_simple( merged_intervals['merged_ids'], get_best_overlapping_orf, predicted_orfs, progress_bar=True ) predicted_orfs = pd.DataFrame(predicted_orfs) msg = "Sorting selected ORFs" logger.info(msg) predicted_orfs = bed_utils.sort(predicted_orfs) msg = "Writing selected ORFs to disk" logger.info(msg) bed_utils.write_bed(predicted_orfs, args.predicted_orfs) # now get the sequences msg = "Extracting predicted ORF DNA sequences" logger.info(msg) split_exons = True transcript_sequences =
bed_utils.get_all_bed_sequences( predicted_orfs, args.fasta, split_exons ) fastx_utils.write_fasta(transcript_sequences, args.predicted_dna_sequences, compress=False) # translate the remaining ORFs into protein sequences msg = "Converting predicted ORF sequences to amino acids" logger.info(msg) records = fastx_utils.get_read_iterator(args.predicted_dna_sequences) protein_records = { r[0]: Bio.Seq.translate(r[1]) for r in records } fastx_utils.write_fasta( protein_records.items(), args.predicted_protein_sequences, compress=False )
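# The docstring of the prediction script above describes the min_bf_likelihood rule as
# [P(bf > min_bf_mean)] > min_bf_likelihood. The actual selection is implemented inside
# ribo_utils.get_predicted_orfs; the sketch below only illustrates that rule under a normal
# approximation of the estimated Bayes factor. The column names ('bayes_factor_mean',
# 'bayes_factor_var') and the use of a normal approximation are assumptions for illustration,
# not necessarily how the library computes the likelihood internally.
import numpy as np
import scipy.stats

def likelihood_filter(bayes_factors, min_bf_mean, min_bf_likelihood):
    bf_mean = bayes_factors['bayes_factor_mean']
    bf_var = bayes_factors['bayes_factor_var']
    # P(bf > min_bf_mean), assuming bf ~ Normal(bf_mean, bf_var)
    likelihood = 1 - scipy.stats.norm.cdf(min_bf_mean, loc=bf_mean, scale=np.sqrt(bf_var))
    return likelihood > min_bf_likelihood

# Example with hypothetical thresholds: m_likelihood = likelihood_filter(bayes_factors, 5, 0.5)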
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='''Extract the ORFs from the given transcripts and write as a BED12+ file. Additional fields, 'orf_len' and 'orf_num', give the length of each ORF and its index (used to write the ORF profiles). A third additional field records duplicated ORFs from transcript variants.''') parser.add_argument('transcripts_bed', help='''The BED12 file containing the transcript information.''') parser.add_argument('transcripts_fasta', help='''The fasta file containing the spliced transcript sequences.''') parser.add_argument('out', help='''The output (BED12+ gz) file.''') parser.add_argument('--start-codons', help='''A list of codons which will be treated as start codons when extracting the ORFs.''', nargs='+', default=default_start_codons) parser.add_argument('--stop-codons', help='''A list of codons which will be treated as stop codons when extracting the ORFs.''', nargs='+', default=default_stop_codons) slurm.add_sbatch_options(parser) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) # check if we want to use slurm if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return msg = "Compiling start and stop codon regular expressions" logger.info(msg) start_codons_re = '|'.join(args.start_codons) stop_codons_re = '|'.join(args.stop_codons) start_codons_re = re.compile(start_codons_re) stop_codons_re = re.compile(stop_codons_re) msg = "Reading transcripts bed file" logger.info(msg) transcripts_bed = bed_utils.read_bed(args.transcripts_bed) msg = "Creating the sequence iterator" logger.info(msg) transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta) transcripts_iter = ((get_transcript(transcript_header, transcripts_bed), transcript_sequence) for (transcript_header, transcript_sequence) in transcripts_fasta) msg = "Finding all ORFs" logger.info(msg) orfs = parallel.apply_parallel_iter(transcripts_iter, args.num_cpus, get_orfs, start_codons_re, stop_codons_re, total=len(transcripts_bed), progress_bar=True) msg = "Joining ORFs in a large data frame" logger.info(msg) orfs = pd.concat(orfs) orfs.reset_index(drop=True, inplace=True) # Duplicates are removed arbitrarily (keep='first'); however, the ids of all matching # transcripts are kept in the 'duplicates' field for reference msg = "Marking and removing duplicate ORFs" logger.info(msg) groupby_duplicates = orfs.groupby(DUPLICATE_FIELDS, as_index=False).agg({'id': ','.join}) orfs = orfs.merge(groupby_duplicates, how='left', on=DUPLICATE_FIELDS) orfs.drop_duplicates(subset=DUPLICATE_FIELDS, inplace=True, keep='first') orfs.rename(columns={'id_x': 'id', 'id_y': 'duplicates'}, inplace=True) msg = "Numbering remaining ORFs" logger.info(msg) orfs['orf_num'] = np.arange(len(orfs)) msg = "Writing ORFs to disk" logger.info(msg) bed_utils.write_bed(orfs, args.out)
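# get_orfs (used by the extraction script above) also maps each ORF back to genomic
# coordinates via the transcript's exon blocks; the sketch below only shows the core scan it
# has to perform on a spliced transcript sequence: for each start-codon match, take the first
# downstream in-frame stop codon. The function name and the (start, end) return format are
# illustrative only; the real implementation returns a BED-like data frame.
def find_orf_spans(sequence, start_codons_re, stop_codons_re):
    # positions of all stop-codon matches in the (spliced) transcript sequence
    stop_positions = [m.start() for m in stop_codons_re.finditer(sequence)]
    orfs = []
    for start_match in start_codons_re.finditer(sequence):
        start = start_match.start()
        # the first stop codon downstream of this start and in the same reading frame
        in_frame_stops = [p for p in stop_positions if p > start and (p - start) % 3 == 0]
        if len(in_frame_stops) > 0:
            orfs.append((start, min(in_frame_stops) + 3))  # end includes the stop codon
    return orfs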
def main(): global profiles_data, profiles_indices, profiles_indptr, profiles_shape global translated_models, untranslated_models global args parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="""This script uses Hamiltonian MCMC with Stan to estimate translation parameters for a set of regions (presumably ORFs). Roughly, it takes as input: (1) a set of regions (ORFs) and their corresponding profiles; (2) a "translated" model, which gives the probability that a region is translated; and (3) an "untranslated" model, which gives the probability that a region is not translated. The script first smooths the profiles using LOWESS. It then calculates both the Bayes' factor (using the smoothed profile) and the chi-square value (using the raw counts) for each ORF.""" ) parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)") parser.add_argument( 'regions', help="The regions (ORFs) for which predictions will be made (BED12+)") parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)") parser.add_argument('--chi-square-only', help="""If this flag is present, then only the chi square test will be performed for each ORF. This can also be a way to get the counts within each of the ORFs.""", action='store_true') parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+') parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+') # filtering options parser.add_argument( '--orf-types', help="If values are given, then only ORFs with those types are processed.", nargs='*', default=translation_options['orf_types']) parser.add_argument('--orf-type-field', default=default_orf_type_field) parser.add_argument( '--min-length', help="ORFs with length less than this value will not be processed", type=int, default=translation_options['orf_min_length_pre']) parser.add_argument( '--max-length', help="ORFs with length greater than this value will not be processed", type=int, default=translation_options['orf_max_length_pre']) parser.add_argument( '--min-profile', help="""ORFs with profile sum (i.e., number of reads) less than this value will not be processed.""", type=float, default=translation_options['orf_min_profile_count_pre']) # smoothing options parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS", type=float, default=translation_options['smoothing_fraction']) parser.add_argument( '--reweighting-iterations', help="The number of reweighting " "iterations to use in LOWESS. " "Please see the statsmodels documentation for a " "detailed description of this parameter.", type=int, default=translation_options['smoothing_reweighting_iterations'])
" "Please see the statsmodels documentation for a " "detailed description of this parameter.", type=int, default=translation_options['smoothing_reweighting_iterations']) # MCMC options parser.add_argument('-s', '--seed', help="The random seeds to use for inference", type=int, default=translation_options['seed']) parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int, default=translation_options['chains']) parser.add_argument( '-i', '--iterations', help="The number of MCMC iterations to use for each chain", type=int, default=translation_options['translation_iterations']) # behavior options parser.add_argument( '--num-orfs', help="If n>0, then only this many ORFs will be processed", type=int, default=0) parser.add_argument('--orf-num-field', default=default_orf_num_field) parser.add_argument('--do-not-compress', help="Unless otherwise specified, the output will " "be written in GZip format", action='store_true') parser.add_argument('-g', '--num-groups', help="The number of groups into which to split " "the ORFs. More groups means the progress bar is " "updated more frequently but incurs more overhead " "because of the parallel calls.", type=int, default=default_num_groups) slurm.add_sbatch_options(parser) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return # read in the regions and apply the filters msg = "Reading and filtering ORFs" logger.info(msg) regions = bed_utils.read_bed(args.regions) # by default, keep everything m_filters = np.array([True] * len(regions)) if len(args.orf_types) > 0: m_orf_type = regions[args.orf_type_field].isin(args.orf_types) m_filters = m_orf_type & m_filters # min length if args.min_length > 0: m_min_length = regions['orf_len'] >= args.min_length m_filters = m_min_length & m_filters # max length if args.max_length > 0: m_max_length = regions['orf_len'] <= args.max_length m_filters = m_max_length & m_filters # min profile profiles = scipy.io.mmread(args.profiles).tocsr() profiles_sums = profiles.sum(axis=1) good_orf_nums = np.where(profiles_sums >= args.min_profile) good_orf_nums = set(good_orf_nums[0]) m_profile = regions['orf_num'].isin(good_orf_nums) m_filters = m_profile & m_filters regions = regions[m_filters] if args.num_orfs > 0: regions = regions.head(args.num_orfs) regions = regions.reset_index(drop=True) msg = "Number of regions after filtering: {}".format(len(regions)) logger.info(msg) logger.debug("Reading models") translated_models = [ pickle.load(open(tm, 'rb')) for tm in args.translated_models ] untranslated_models = [ pickle.load(open(bm, 'rb')) for bm in args.untranslated_models ] profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat) profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices) profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr) profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape) with suppress_stdout_stderr(): bfs_l = parallel.apply_parallel_split(regions, args.num_cpus, get_all_bayes_factors_args, num_groups=args.num_groups, progress_bar=True, backend='multiprocessing') bfs = pd.concat(bfs_l) # write the results as a bed12+ file bed_utils.write_bed(bfs, args.out)