def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script labels the ORFs found with extract-orf-coordinates "
        "based on their exon structure and relation to the annotated, canonical "
        "ORFs. It requires the exon blocks for the ORFs (created with "
        "split-bed12-blocks). It completely reads in the ORFs, so unless otherwise "
        "desired for some reason, the input and output files can be the same.")

    parser.add_argument('annotated_transcripts', help="The annotated transcripts "
        "for the genome, in bed12+ format")
    parser.add_argument('extracted_orfs', help="The ORFs extracted from the "
        "transcripts, in bed12+ format")
    parser.add_argument('orf_exons', help="The exon blocks for the ORFs, in "
        "bed6+ format")
    parser.add_argument('out', help="The output (bed12+.gz) file")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use for "
        "a few parts of the script", type=int, default=default_num_cpus)

    parser.add_argument('-f', '--filter', help="If this flag is given, then ORFs "
        "which are completely covered by an annotated transcript are discarded. "
        "Presumably, this is used to filter uninteresting ORFs from de novo "
        "assemblies.", action='store_true')

    parser.add_argument('-e', '--annotated-exons', help="If the --filter flag is "
        "given, the annotated transcript exons can optionally be provided with "
        "this option. If they are not given, they will be split from the annotated "
        "transcripts. That is generally not a very expensive operation relative to "
        "everything else in the labeling script. If --filter is not given, then "
        "these are ignored.", default=default_annotated_exons)

    parser.add_argument('-n', '--nonoverlapping-label', help="If this option is "
        "given, then ORFs which do not overlap the annotated transcripts at all "
        "will be given this label. Otherwise, they will be labeled as \"suspect\".",
        default=default_nonoverlapping_label)

    parser.add_argument('-l', '--label-prefix', help="This string is prepended "
        "to all labels assigned to ORFs. For example, it is a useful way to "
        "indicate that ORFs from de novo assemblies are \"novel\". In any case, "
        "this *is not* prepended to \"canonical\" ORFs.",
        default=default_label_prefix)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    msg = "Reading extracted ORFs and exons"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)
    extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(
        len(extracted_orfs), len(extracted_orf_exons))
    logger.debug(msg)

    # the annotated exons are only read (or split) when --filter is given; if
    # they remain None, subtract_bed is assumed to split them itself when needed
    annotated_exons = None

    # check if we want to remove the extracted_orfs completely covered by
    # the annotated transcripts
    if args.filter:
        msg = ("Removing extracted ORFs which are completely covered by the "
               "annotated transcripts")
        logger.info(msg)

        # we need the annotated transcript exons
        if args.annotated_exons is None:
            msg = "Splitting the annotated transcripts into exon blocks"
            logger.info(msg)

            annotated_exons = bed_utils.split_bed12(
                annotated_transcripts, num_cpus=args.num_cpus, progress_bar=True)
        else:
            msg = "Reading the annotated transcript exons"
            logger.info(msg)

            annotated_exons = bed_utils.read_bed(args.annotated_exons)

        msg = "Finding completely covered extracted ORFs"
        logger.info(msg)

        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orf_exons, annotated_exons, min_a_overlap=1)

        m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]

        # also discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(
            len(extracted_orfs))
        logger.info(msg)

    # if the nonoverlapping-label is given, annotate and remove the ORFs
    # which do not at all overlap the annotations
    if args.nonoverlapping_label is not None:
        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orfs,
            annotated_transcripts,
            exons_a=extracted_orf_exons,
            exons_b=annotated_exons)

        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]

        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping, 'orf_type'] = \
            args.nonoverlapping_label

        msg = "Found {} ORFs completely nonoverlapping annotated transcripts".format(
            len(nonoverlapping_ids))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(
        annotated_transcripts, num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(
        canonical_orfs, num_cpus=args.num_cpus, progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(
        five_prime_regions, num_cpus=args.num_cpus, progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)

    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(
        three_prime_regions, num_cpus=args.num_cpus, progress_bar=True)

    msg = "Splitting noncoding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]

    noncoding_exons = bed_utils.split_bed12(
        noncoding_transcripts, num_cpus=args.num_cpus, progress_bar=True)

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the true ORF end: for reverse-strand ORFs, the biological
    # "end" of the ORF (the stop codon) is the "start" of the bed interval
    m_reverse_canonical = canonical_orfs['strand'] == '-'
    m_reverse_extracted = extracted_orfs['strand'] == '-'

    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical, 'orf_end'] = \
        canonical_orfs.loc[m_reverse_canonical, 'start']

    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted, 'orf_end'] = \
        extracted_orfs.loc[m_reverse_extracted, 'start']

    # now, find extracted ORFs with the same "orf_end" (and seqname, strand)
    # as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(
        extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted'])

    # now, pull this into a set
    zip_it = zip(canonical_extracted_orf_ends['id_canonical'],
                 canonical_extracted_orf_ends['id_extracted'])
    canonical_extracted_matching_ends = {(c, a) for c, a in zip_it}

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

    exact_matches = bed_utils.get_bed_overlaps(
        canonical_orf_exons, extracted_orf_exons,
        min_a_overlap=1, min_b_overlap=1)

    exact_match_orf_ids = {o.b_info for o in exact_matches}
    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    extracted_orfs.loc[m_canonical, 'orf_type'] = 'canonical'

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding ORFs which are extended versions of the canonical ORFs"
    logger.info(msg)

    extended_matches = bed_utils.get_bed_overlaps(
        canonical_orf_exons, extracted_orf_exons, min_a_overlap=1)

    # make sure the "end"s match before calling something an extended match
    extended_match_ids = {
        m.b_info for m in tqdm.tqdm(extended_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)
    label = "{}canonical_extended".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_extended, 'orf_type'] = label

    msg = "Found {} canonical_extended ORFs".format(len(extended_match_ids))
    logger.info(msg)

    msg = "Finding ORFs which are truncated versions of the canonical ORFs"
    logger.info(msg)

    truncated_matches = bed_utils.get_bed_overlaps(
        canonical_orf_exons, extracted_orf_exons, min_b_overlap=1)

    # make sure the "end"s match before calling something a truncated match
    truncated_match_ids = {
        m.b_info for m in tqdm.tqdm(truncated_matches)
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)
    label = "{}canonical_truncated".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_truncated, 'orf_type'] = label

    msg = "Found {} canonical_truncated ORFs".format(len(truncated_match_ids))
    logger.info(msg)

    msg = ("Labeling ORFs which are completely covered by a canonical ORF but "
           "do not share its stop codon")
    logger.info(msg)

    # anything in "truncated matches" which *does not* share a stop codon with
    # the match is a "within" orf
    within_ids = {
        m.b_info for m in truncated_matches
        if m.b_info not in truncated_match_ids
    }

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)
    label = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = label

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)

    msg = "Finding out-of-frame overlaps"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(
        canonical_orf_exons, extracted_orf_exons)

    msg = "Finding leader overlaps"
    logger.info(msg)
    leader_matches = bed_utils.get_bed_overlaps(
        five_prime_exons, extracted_orf_exons)

    msg = "Finding trailer overlaps"
    logger.info(msg)
    trailer_matches = bed_utils.get_bed_overlaps(
        three_prime_exons, extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
           "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    out_of_frame_ids = {m.b_info for m in out_of_frame_matches}
    leader_ids = {m.b_info for m in leader_matches}
    trailer_ids = {m.b_info for m in trailer_matches}

    leader_overlap_ids = out_of_frame_ids & leader_ids
    trailer_overlap_ids = out_of_frame_ids & trailer_ids

    m_leader_overlap_matches = extracted_orf_exons['id'].isin(
        leader_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

    m_trailer_overlap_matches = extracted_orf_exons['id'].isin(
        trailer_overlap_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

    m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)
    label = "{}five_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = label

    m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)
    label = "{}three_prime_overlap".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = label

    msg = "Found {} five_prime_overlap ORFs".format(len(leader_overlap_ids))
    logger.info(msg)

    msg = "Found {} three_prime_overlap ORFs".format(len(trailer_overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(
        five_prime_exons, extracted_orf_exons, min_b_overlap=1)
    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)
    label = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = label

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 3' trailers"
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(
        three_prime_exons, extracted_orf_exons, min_b_overlap=1)
    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)
    label = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = label

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, noncoding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(
        noncoding_exons, extracted_orf_exons, min_b_overlap=1)
    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)
    label = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = label

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = set(extracted_orf_exons['id'])

    m_suspect = extracted_orfs['id'].isin(suspect_ids)
    label = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = label

    msg = "Found {} \"suspect\" ORFs".format(len(suspect_ids))
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()
    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with types to disk"
    logger.info(msg)

    fields = bed_utils.bed12_field_names + ['orf_num', 'orf_len', 'orf_type']
    extracted_orfs = extracted_orfs[fields]
    extracted_orfs = bed_utils.sort(extracted_orfs)

    bed_utils.write_bed(extracted_orfs, args.out)
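
# A hypothetical invocation of the labeling script (the `label-orfs` entry
# point name is taken from the program list checked by prepare-rpbp-genome
# below; all file names are placeholders):
#
#   label-orfs annotated-transcripts.bed orfs.bed orf-exons.bed \
#       labeled-orfs.bed.gz --num-cpus 4 --filter --label-prefix novel_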
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts all of the ORFs from the given "
        "transcripts. It writes the result as a bed12+1 file. The additional "
        "field, 'orf_len', gives the length of the respective ORF. It removes "
        "duplicate ORFs.\n\nN.B. The DEBUG output for this script is _very_ "
        "verbose. It is not recommended to run this script with that logging "
        "level.")

    parser.add_argument('transcripts_bed', help="The bed12 file containing the "
        "transcript information")
    parser.add_argument('transcripts_fasta', help="The fasta file containing "
        "the spliced transcript sequences")
    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons', help="A list of codons which will be "
        "treated as start codons when extracting ORFs", nargs='+',
        default=default_start_codons)
    parser.add_argument('--stop-codons', help="A list of codons which will be "
        "treated as stop codons when extracting ORFs", nargs='+',
        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)
    transcripts_iter = (
        (get_transcript(transcript_header, transcripts_bed), transcript_sequence)
        for (transcript_header, transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(
        transcripts_iter,
        args.num_cpus,
        get_orfs,
        start_codons_re,
        stop_codons_re,
        total=len(transcripts_bed),
        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)
    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)
    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)
    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
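
# A hypothetical invocation (`extract-orf-coordinates` is the entry point name
# referenced in the labeling script's description above; file names are
# placeholders, and --num-cpus is assumed to be added by
# slurm.add_sbatch_options, as the use of args.num_cpus suggests):
#
#   extract-orf-coordinates transcripts.bed transcripts.fasta orfs.bed.gz \
#       --start-codons ATG --stop-codons TAA TGA TAG --num-cpus 4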
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Given a list of ORFs with associated Bayes factors and a "
        "fasta sequence file, this script extracts the sequences of the ORFs "
        "whose Bayes factor exceeds the given threshold. Finally, biopython is "
        "used to translate the selected ORFs into protein sequences.\n\n"
        "The min-length and minimum-profile-sum filters are applied in the "
        "obvious way.\n\n"
        "For both BF and chi-square predictions, only ORFs which have more reads "
        "in the first reading frame than either of the other two will be selected "
        "as translated. (This is called the 'frame filter' below.)\n\n"
        "The selection based on Bayes factors follows this logic: if max_bf_var "
        "is given, then it and min_bf_mean are taken as a hard threshold on the "
        "estimated Bayes factor mean. If min_bf_likelihood is given, then "
        "min_bf_mean is taken as the boundary value; that is, an ORF is "
        "\"translated\" if:\n\n"
        "\t\t[P(bf > min_bf_mean)] > min_bf_likelihood\n\n"
        "If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is "
        "taken as a hard threshold on the mean for selecting translated ORFs.\n\n"
        "If both max_bf_var and min_bf_likelihood are given, then both filters "
        "will be applied and the result will be the intersection.\n\n"
        "If the --use-chi-square option is given, the significance value is "
        "Bonferroni-corrected based on the number of ORFs which meet the length, "
        "profile and frame filters.")

    parser.add_argument('bayes_factors', help="The file containing the ORFs and "
        "Bayes' factors (BED12+)")
    parser.add_argument('fasta', help="The *genome* fasta file")
    parser.add_argument('predicted_orfs', help="The (output) BED12+ file "
        "containing the predicted ORFs.")
    parser.add_argument('predicted_dna_sequences', help="The (output) fasta file "
        "containing the predicted ORF sequences, as DNA sequences")
    parser.add_argument('predicted_protein_sequences', help="The (output) fasta "
        "file containing the predicted ORF sequences, as protein sequences")

    parser.add_argument('--select-longest-by-stop', help="If this flag is given, "
        "then the selected ORFs will be merged based on stop codons. In "
        "particular, only the longest translated ORF at each stop codon will be "
        "selected.", action='store_true')

    parser.add_argument('--select-best-overlapping', help="If this flag is given, "
        "then only the ORF with the highest estimated Bayes factor will be kept "
        "among each set of overlapping ORFs. N.B. This filter is applied *AFTER* "
        "selecting the longest ORF at each stop codon, if the "
        "--select-longest-by-stop flag is given.", action='store_true')

    parser.add_argument('--min-length', help="The minimum length to predict an "
        "ORF as translated", type=int, default=default_min_length)

    parser.add_argument('--min-bf-mean', help="The minimum Bayes' factor mean to "
        "predict an ORF as translated (use --help for more details)", type=float,
        default=default_min_bf_mean)
    parser.add_argument('--max-bf-var', help="The maximum Bayes' factor variance "
        "to predict an ORF as translated (use --help for more details)",
        type=float, default=default_max_bf_var)
    parser.add_argument('--min-bf-likelihood', help="If given, then this is taken "
        "as a threshold on the likelihood of translation (use --help for more "
        "details)", type=float, default=default_min_bf_likelihood)

    parser.add_argument('--use-chi-square', help="If this flag is present, then "
        "the chi square value will be used to predict ORFs rather than the "
        "Bayes' factor", action='store_true')
    parser.add_argument('--chisq-significance-level', help="If using chi square, "
        "then this value is Bonferroni corrected and used as the significance "
        "cutoff", type=float, default=default_chisq_significance_level)

    parser.add_argument('--filtered-orf-types', help="A list of ORF types which "
        "will be removed before selecting the final prediction set.", nargs='*',
        default=default_filtered_orf_types)

    parser.add_argument('--filter-non-canonical-overlaps', help="If this flag is "
        "given, then --filtered-orf-types will be extended with the non-canonical "
        "overlap types ({}).".format(non_canonical_overlap_orf_types_str),
        action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # first, extract all of the predictions which exceed the threshold
    msg = "Reading Bayes factor information"
    logger.info(msg)

    bayes_factors = bed_utils.read_bed(args.bayes_factors)

    if args.filter_non_canonical_overlaps:
        args.filtered_orf_types.extend(non_canonical_overlap_orf_types)

    if len(args.filtered_orf_types) > 0:
        filtered_orf_types_str = ','.join(args.filtered_orf_types)
        msg = "Filtering these ORF types: {}".format(filtered_orf_types_str)
        logger.info(msg)

        m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types)
        bayes_factors = bayes_factors[~m_orf_types]

    msg = "Identifying ORFs which meet the prediction thresholds"
    logger.info(msg)

    all_orfs, bf_orfs, chisq_orfs = ribo_utils.get_predicted_orfs(
        bayes_factors,
        min_bf_mean=args.min_bf_mean,
        max_bf_var=args.max_bf_var,
        min_bf_likelihood=args.min_bf_likelihood,
        min_length=args.min_length,
        chisq_alpha=args.chisq_significance_level,
        select_longest_by_stop=args.select_longest_by_stop)

    if args.use_chi_square:
        predicted_orfs = chisq_orfs
    else:
        predicted_orfs = bf_orfs

    msg = "Number of selected ORFs: {}".format(len(predicted_orfs))
    logger.info(msg)

    if args.select_best_overlapping:
        msg = "Finding overlapping ORFs"
        logger.info(msg)

        merged_intervals = bed_utils.merge_all_intervals(predicted_orfs)

        msg = "Selecting best among overlapping ORFs"
        logger.info(msg)

        predicted_orfs = parallel.apply_iter_simple(
            merged_intervals['merged_ids'],
            get_best_overlapping_orf,
            predicted_orfs,
            progress_bar=True)
        predicted_orfs = pd.DataFrame(predicted_orfs)

    msg = "Sorting selected ORFs"
    logger.info(msg)
    predicted_orfs = bed_utils.sort(predicted_orfs)

    msg = "Writing selected ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(predicted_orfs, args.predicted_orfs)

    # now get the sequences
    msg = "Extracting predicted ORF DNA sequences"
    logger.info(msg)

    split_exons = True
    transcript_sequences = bed_utils.get_all_bed_sequences(
        predicted_orfs, args.fasta, split_exons)

    fastx_utils.write_fasta(transcript_sequences, args.predicted_dna_sequences,
                            compress=False)

    # translate the remaining ORFs into protein sequences
    msg = "Converting predicted ORF sequences to amino acids"
    logger.info(msg)

    records = fastx_utils.get_read_iterator(args.predicted_dna_sequences)
    protein_records = {r[0]: Bio.Seq.translate(r[1]) for r in records}

    fastx_utils.write_fasta(protein_records.items(),
                            args.predicted_protein_sequences, compress=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates all of the files necessary for "
        "downstream analysis performed with the rpbp package.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('--overwrite', help="If this flag is present, existing "
        "files will be overwritten.", action='store_true')

    star_utils.add_star_options(parser)
    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    call = not args.do_not_call

    # check that all of the necessary programs are callable
    programs = [
        'extract-orf-coordinates',
        'label-orfs',
        'bowtie2-build-s',
        'split-bed12-blocks',
        'gtf-to-bed12',
        args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta',
        'ribosomal_fasta', 'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    # check that the required files are present
    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]

    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]

    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # now, check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = bio.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = star_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # get the main orfs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # eventually, we will use these names
    annotated_orfs = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)

    annotated_exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'), is_annotated=True, is_de_novo=False)

    orfs_genomic = filenames.get_orfs(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    exons_file = filenames.get_exons(
        config['genome_base_path'], config['genome_name'],
        note=config.get('orf_note'))

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config, is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)

        de_novo_exons_file = filenames.get_exons(
            config['genome_base_path'], config['genome_name'],
            note=config.get('orf_note'), is_annotated=False, is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]
        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            fields = bed_utils.bed12_field_names + [
                'orf_num', 'orf_len', 'orf_type'
            ]
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to the --do-not-call flag"
            logger.info(msg)

        exons_files = [annotated_exons_file, de_novo_exons_file]
        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to the --do-not-call flag"
            logger.info(msg)

    else:
        # finally, make sure our files are named correctly
        if os.path.exists(annotated_orfs):
            utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            utils.create_symlink(annotated_exons_file, exons_file, call)
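
# A hypothetical invocation (`prepare-rpbp-genome` matches the `source` tag
# passed to check_files_exist above; the config path is a placeholder, and
# --mem/--num-cpus are assumed to come from the sbatch/STAR option groups):
#
#   prepare-rpbp-genome config.yaml --num-cpus 8 --mem 32G --overwrite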
def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=r"""
            This script uses Hamiltonian MCMC with Stan to estimate translation
            parameters for a set of regions (presumably ORFs). Roughly, it takes
            as input:

            (1) a set of regions (ORFs) and their corresponding profiles
            (2) a "translated" model which gives the probability that a region
                is translated
            (3) an "untranslated" model which gives the probability that a
                region is not translated

            The script first smoothes the profiles using LOWESS. It then
            calculates both the Bayes' factor (using the smoothed profile) and
            \chi^2 value (using the raw counts) for each ORF.
        """)

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")
    parser.add_argument('regions', help="The regions (ORFs) for which "
        "predictions will be made (BED12+)")
    parser.add_argument('out', help="The output file for the Bayes' factors "
        "(BED12+)")

    parser.add_argument('--chi-square-only', help="If this flag is present, "
        "then only the chi square test will be performed for each ORF. This can "
        "also be a way to get the counts within each of the ORFs.",
        action='store_true')

    parser.add_argument('--translated-models', help="The models to use as H_t "
        "(pkl)", nargs='+')
    parser.add_argument('--untranslated-models', help="The models to use as H_u "
        "(pkl)", nargs='+')

    ### filtering options
    parser.add_argument('--orf-types', help="If values are given, then only "
        "ORFs with those types are processed.", nargs='*',
        default=default_orf_types)
    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument('--min-length', help="ORFs with length less than this "
        "value will not be processed", type=int, default=default_min_length)
    parser.add_argument('--max-length', help="ORFs with length greater than "
        "this value will not be processed", type=int,
        default=default_max_length)
    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., "
        "number of reads) less than this value will not be processed.",
        type=float, default=default_min_profile)

    ### smoothing options
    parser.add_argument('--fraction', help="The fraction of signal to use in "
        "LOWESS", type=float, default=default_fraction)
    parser.add_argument('--reweighting-iterations', help="The number of "
        "reweighting iterations to use in LOWESS. Please see the statsmodels "
        "documentation for a detailed description of this parameter.", type=int,
        default=default_reweighting_iterations)

    ### MCMC options
    parser.add_argument('-s', '--seed', help="The random seeds to use for "
        "inference", type=int, default=default_seed)
    parser.add_argument('-c', '--chains', help="The number of MCMC chains to "
        "use", type=int, default=default_chains)
    parser.add_argument('-i', '--iterations', help="The number of MCMC "
        "iterations to use for each chain", type=int,
        default=default_iterations)

    ### behavior options
    parser.add_argument('--num-orfs', help="If n>0, then only this many ORFs "
        "will be processed", type=int, default=default_num_orfs)
    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress', help="By default, the output is "
        "written in gzip format; if this flag is given, the output will not be "
        "compressed", action='store_true')

    parser.add_argument('-g', '--num-groups', help="The number of groups into "
        "which to split the ORFs. More groups means the progress bar is updated "
        "more frequently but incurs more overhead because of the parallel "
        "calls.", type=int, default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [pickle.load(open(tm, 'rb'))
                         for tm in args.translated_models]
    untranslated_models = [pickle.load(open(bm, 'rb'))
                           for bm in args.untranslated_models]

    # share the components of the sparse profile matrix with the worker
    # processes via read-only shared memory
    profiles_data = multiprocessing.RawArray(ctypes.c_double,
                                             profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)

    bfs_l = parallel.apply_parallel_split(
        regions,
        args.num_cpus,
        get_all_bayes_factors_args,
        num_groups=args.num_groups,
        progress_bar=True)

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)
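
# The profile matrix is shared with the worker processes through the
# module-level RawArrays populated in main() above. A minimal sketch of how a
# worker might rebuild the read-only csr_matrix from those buffers; this
# helper is illustrative only (it is not called by the pipeline itself), and
# the dtypes assume the usual 64-bit ctypes.c_double and 32-bit ctypes.c_int.
def _rebuild_shared_profiles():
    import numpy as np
    import scipy.sparse

    # wrap the shared buffers without copying
    data = np.frombuffer(profiles_data, dtype=np.float64)
    indices = np.frombuffer(profiles_indices, dtype=np.int32)
    indptr = np.frombuffer(profiles_indptr, dtype=np.int32)
    shape = tuple(np.frombuffer(profiles_shape, dtype=np.int32))

    # reassemble the CSR matrix from its three component arrays
    return scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)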