def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script creates a simple latex document containing the read " "filtering images, metagene profiles and analysis, and standard section text." ) parser.add_argument('config', help="The (yaml) config file for the project") parser.add_argument('out', help="The path for the output files") parser.add_argument( '--show-orf-periodicity', help="If this flag is " "present, bar charts showing the periodicity of each ORF type will be " "included in the report.", action='store_true') parser.add_argument( '--show-read-length-bfs', help="If this flag is given, " "plots showing the Bayes factor at each offset for each read length " "are included in the report.", action='store_true') parser.add_argument('--overwrite', help="If this flag is present, existing files will " "be overwritten.", action='store_true') parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this " "number of reads will not be included in the report.", type=int, default=metagene_options['min_metagene_image_count']) parser.add_argument('--image-type', help="The type of image types to create. This " "must be an extension which matplotlib can interpret.", default=default_image_type) parser.add_argument( '-c', '--create-fastqc-reports', help="If this flag is given, then " "fastqc reports will be created for most fastq and bam files. By default, they are " "not created.", action='store_true') parser.add_argument('--tmp', help="If the fastqc reports are created, " "they will use this location for temp files", default=None) parser.add_argument( '--note', help="If this option is given, it will be used in the " "filenames.\n\nN.B. This REPLACES the note in the config file.", default=None) slurm.add_sbatch_options(parser, num_cpus=default_num_cpus) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) config = yaml.load(open(args.config), Loader=yaml.FullLoader) if args.note is not None: config['note'] = args.note note = config.get('note', None) sample_names = sorted(config['riboseq_samples'].keys()) # keep multimappers? is_unique = not ('keep_riboseq_multimappers' in config) programs = [ 'create-read-length-metagene-profile-plot', 'visualize-metagene-profile-bayes-factor', 'get-all-read-filtering-counts', 'samtools', 'visualize-read-filtering-counts', 'get-read-length-distribution', 'plot-read-length-distribution' ] if args.create_fastqc_reports: programs.extend(['fastqc', 'java']) shell_utils.check_programs_exist(programs) if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return # make sure the path to the output file exists os.makedirs(args.out, exist_ok=True) # first, create the read filtering information... create_read_filtering_plots(args.config, config, args) # ... and all the other figures. 
for name in sample_names: periodic_offsets = filenames.get_periodic_offsets( config['riboseq_data'], name, is_unique=is_unique, note=note) offsets_df = pd.read_csv(periodic_offsets) create_figures(args.config, config, name, offsets_df, args) min_metagene_profile_count = config.get( 'min_metagene_profile_count', metagene_options['min_metagene_profile_count']) min_bf_mean = config.get('min_metagene_bf_mean', metagene_options['min_metagene_bf_mean']) max_bf_var = config.get('max_metagene_bf_var', metagene_options['max_metagene_bf_var']) min_bf_likelihood = config.get( 'min_metagene_bf_likelihood', metagene_options['min_metagene_bf_likelihood']) project_name = config.get("project_name", default_project_name) title = "Preprocessing results for {}".format(project_name) tex_file = os.path.join(args.out, "preprocessing-report.tex") with open(tex_file, 'w') as out: latex.begin_document(out, title, abstract, commands=commands) latex.section(out, "Introduction") latex.clearpage(out) latex.newpage(out) latex.section(out, "Mapping and filtering") latex.write(out, mapping_and_filtering_text) # the read filtering figures read_filtering_image = filenames.get_riboseq_read_filtering_counts_image( config['riboseq_data'], note=note, image_type=args.image_type) n = "no-rrna-{}".format(note) no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image( config['riboseq_data'], note=n, image_type=args.image_type) latex.begin_figure(out) latex.write_graphics(out, read_filtering_image, width=0.45) latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45) latex.write_caption(out, read_filtering_caption, label=read_filtering_label) latex.end_figure(out) latex.clearpage(out) # the read length distributions latex.section(out, "Read length distributions", label=length_distribution_section_label) msg = "Writing length distribution figures" logger.info(msg) latex.begin_table(out, "cc") latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"]) for name in sample_names: data = config['riboseq_samples'][name] read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image( config['riboseq_data'], name, is_unique=False, note=note, image_type=args.image_type) unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image( config['riboseq_data'], name, is_unique=True, note=note, image_type=args.image_type) msg = "Looking for image file: {}".format( read_length_distribution_image) logger.debug(msg) if os.path.exists(read_length_distribution_image): latex.write_graphics(out, read_length_distribution_image, width=0.45) else: msg = "Could not find image: {}".format( read_length_distribution_image) logger.warning(msg) text = "Missing: {}\n\n".format(name) latex.write(out, text) latex.write_column_sep(out) msg = "Looking for image file: {}".format( unique_read_length_distribution_image) logger.debug(msg) if os.path.exists(unique_read_length_distribution_image): latex.write_graphics(out, unique_read_length_distribution_image, width=0.45) else: msg = "Could not find image: {}".format( unique_read_length_distribution_image) logger.warning(msg) text = "Missing: {}\n\n".format(name) latex.write(out, text) latex.write_row_sep(out) latex.end_table(out) latex.clearpage(out) latex.section(out, "Read length periodicity", label=periodicity_label) for name in sample_names: i = 0 data = config['riboseq_samples'][name] msg = "Processing sample: {}".format(name) logger.info(msg) logger.debug("overwrite: {}".format(args.overwrite)) periodic_offsets = 
filenames.get_periodic_offsets( config['riboseq_data'], name, is_unique=is_unique, note=note) offsets_df = pd.read_csv(periodic_offsets) min_read_length = int(offsets_df['length'].min()) max_read_length = int(offsets_df['length'].max()) latex.begin_table(out, "YY") header = "\\multicolumn{2}{c}{" + name + "}" header = [header] latex.write_header(out, header) for length in range(min_read_length, max_read_length + 1): msg = "Processing length: {}".format(length) logger.info(msg) # check which offset is used # select the row for this length mask_length = offsets_df['length'] == length # TODO: this is sometimes length 0. why? if sum(mask_length) == 0: continue length_row = offsets_df[mask_length].iloc[0] # now, check all of the filters offset = int(length_row['highest_peak_offset']) offset_status = "Used for analysis" if max_bf_var is not None: if ((length_row['highest_peak_bf_mean'] <= min_bf_mean) or length_row['highest_peak_bf_var'] >= max_bf_var): offset_status = "BF mean too small or BF var too high" if min_bf_likelihood is not None: likelihood = 1 - scipy.stats.norm.cdf( min_bf_mean, length_row['highest_peak_bf_mean'], np.sqrt(length_row['highest_peak_bf_var'])) if likelihood <= min_bf_likelihood: offset_status = "Likehood too small" if (max_bf_var is None) and (min_bf_likelihood is None): if length_row['highest_peak_bf_mean'] <= min_bf_mean: offset_status = "BF mean too small" if length_row[ 'highest_peak_profile_sum'] < min_metagene_profile_count: offset_status = "Count too small" if length_row[ 'highest_peak_profile_sum'] < args.min_visualization_count: msg = "Not enough reads of this length. Skipping." logger.warning(msg) continue metagene_profile_image = filenames.get_metagene_profile_image( config['riboseq_data'], name, image_type=args.image_type, is_unique=is_unique, length=length, note=note) #title = ("length: {}. P-site offset: {}. \\newline status: {}" #"\n".format(length, offset, offset_status)) #latex.write(out, title, size="scriptsize") title = ("Length: {}. P-site offset: {}. Status: {}\n".format( length, offset, offset_status)) if args.show_read_length_bfs: title = "\scriptsize{" + title + "}" title = "\\multicolumn{2}{c}{" + title + "}" latex.write(out, title) latex.write_row_sep(out) else: latex.write(out, title, size="scriptsize") latex.write_graphics(out, metagene_profile_image, width=0.45) i += 1 if i % 2 == 1: latex.write_column_sep(out) else: latex.write_row_sep(out) if args.show_read_length_bfs: bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image( config['riboseq_data'], name, image_type=args.image_type, is_unique=is_unique, length=length, note=note) #latex.centering(out) latex.write_graphics(out, bayes_factor_image, width=0.45) i += 1 if i % 2 == 1: latex.write_column_sep(out) else: latex.write_row_sep(out) if i % 2 == 1: latex.write_row_sep(out) latex.end_table(out) latex.clearpage(out) ### ORF type metagene profiles if args.show_orf_periodicity: title = "ORF type periodicity" latex.section(out, title) strands = ['+', '-'] for sample_name in sample_names: i = 0 try: lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets( config, sample_name, is_unique=is_unique, default_params=metagene_options) except FileNotFoundError: msg = ( "Could not parse out lengths and offsets for sample: {}. 
" "Skipping".format(sample_name)) logger.error(msg) continue orf_type_profile_base = filenames.get_orf_type_profile_base( config['riboseq_data'], sample_name, length=lengths, offset=offsets, is_unique=is_unique, note=note, subfolder='orf-profiles') for orf_type in ribo_utils.orf_types: for strand in strands: orf_type_profile = filenames.get_orf_type_profile_image( orf_type_profile_base, orf_type, strand, image_type=args.image_type) msg = "Looking for image file: {}".format( orf_type_profile) logger.debug(msg) if os.path.exists(orf_type_profile): if i % 4 == 0: latex.begin_figure(out) i += 1 latex.write_graphics(out, orf_type_profile, height=0.23) if i % 4 == 0: latex.end_figure(out) latex.clearpage(out) if (i > 0) and (i % 4 != 0): latex.end_figure(out) latex.clearpage(out) latex.end_document(out) tex_filename = os.path.basename(tex_file) latex.compile(args.out, tex_filename) if args.create_fastqc_reports: parallel.apply_parallel_iter(config['riboseq_samples'].items(), args.num_cpus, create_fastqc_reports, config, args)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This is a helper script to submit a set of samples to SLURM. It can
        also be used to run a set of samples sequentially. Due to limitations on the config
        file specification, all of the samples must use the same reference indices obtained
        by running 'create-base-genome-profile'.""")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('--tmp', help="The temp directory", default=None)

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('--profiles-only', help="""If this flag is present, then only
        the pre-processing part of the pipeline will be called, i.e. profiles will be
        created for each sample specified in the config file, but no predictions will
        be made.""", action='store_true')

    parser.add_argument('--merge-replicates', help="""If this flag is present, then the
        ORF profiles from the replicates will be merged before making the final
        predictions.""", action='store_true')

    parser.add_argument('--run-replicates', help="""If this flag is given with the
        --merge-replicates flag, then both the replicates and the individual samples
        will be run. This flag has no effect if --merge-replicates is not given.""",
        action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. This
        feature is implemented piecemeal. If the --do-not-call flag is given, then nothing
        will be deleted.""", action='store_true')

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus, mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs', 'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # handle all option strings to call the pipeline script
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the pipeline script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles; in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        if args.merge_replicates:
            msg = ("The --profiles-only option was given; this option has "
                   "precedence, and it will override the --merge-replicates option!")
            logger.warning(msg)
            args.merge_replicates = False
        profiles_only_str = "--profiles-only"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = ("The --run-replicates option was given without the --merge-replicates "
               "option. It will be ignored.")
        logger.warning(msg)

    # collect the job_ids in case we are using slurm and need to merge replicates
    rep_to_condition = ribo_utils.get_riboseq_replicates_reverse_map(config)
    job_ids_mapping = defaultdict(list)

    sample_names = sorted(config['riboseq_samples'].keys())
    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_rpbp".format(sample_name))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, mem_str, tmp_str,
            do_not_call_str, overwrite_str, profiles_only_str,
            keep_intermediate_str, logging_str, star_str, flexbar_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids_mapping[rep_to_condition[sample_name]].append(job_id)

    # now, if we are running the "standard" pipeline, we are done
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):
        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        job_ids = job_ids_mapping[condition_name]
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script runs the Rp-Bp pipeline on a given sample. It requires
        a YAML config file that includes a number of keys. Please see the documentation
        for a complete description.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")
    parser.add_argument('config', help="The (yaml) configuration file")
    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=None)

    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')

    parser.add_argument('--profiles-only', help="""If this flag is present, then only
        the ORF profiles will be created.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. This
        feature is implemented piecemeal. If the --do-not-call flag is given, then nothing
        will be deleted.""", action='store_true')

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus, mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # if using slurm, submit the script, but we cannot use sys.argv directly
    # as the shell strips the quotes around the arguments
    if args.use_slurm:
        cmd = "{}".format(' '.join("'" + s + "'" if '"' in s else s for s in sys.argv))
        slurm.check_sbatch(cmd, args=args)
        return

    # handle all option strings to call programs
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        args.raw_data, args.config, args.name, args.num_cpus, mem_str,
        do_not_call_str, overwrite_str, keep_intermediate_str, logging_str,
        tmp_str, star_str, flexbar_str))
    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str,
        overwrite_str, logging_str))
    shell_utils.check_call(cmd)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching annotations, including
        labelled ORFs, for use with the Rp-Bp periodicity estimation and ORF translation
        prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite', help='''If this flag is present, existing files
        will be overwritten.''', action='store_true')

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus, mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta',
        'ribosomal_fasta', 'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    annotated_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                  config['genome_name'],
                                                  note=config.get('orf_note'),
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'))

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'], args, config,
                 is_annotated=False, is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files, sort_bed=True)
            fields = bed_utils.bed6_field_names + ['exon_index', 'transcript_start']
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                    config['genome_name'],
                                                    note=config.get('orf_note'),
                                                    is_annotated=False,
                                                    is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str))
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files, sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if config['de_novo_gtf'].endswith('gff') == use_gff3_specs:
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                   config['de_novo_gtf'],
                                                   gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                           overwrite=args.overwrite, call=call)
        else:
            msg = ("Skipping concatenation due to mismatch in format specifications "
                   "(GTF2/GFF3) for reference and de novo annotations. Symlink to "
                   "reference annotations created.")
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files
        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(annotated_labeled_orfs):
            shell_utils.create_symlink(annotated_labeled_orfs, labeled_orfs, call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
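# When a de novo assembly is present, the annotated and de novo ORF BED files are
# concatenated and the ORFs renumbered so that 'orf_num' stays unique across both
# sources. A rough pandas equivalent of what bed_utils.concatenate plus the
# renumbering above accomplishes; it assumes the files carry a header row with the
# pipeline's BED12+ field names, and the paths in the commented call are
# placeholders.
import pandas as pd


def concatenate_orf_beds(bed_files):
    frames = [pd.read_csv(f, sep='\t') for f in bed_files]
    concatenated = pd.concat(frames, ignore_index=True)
    # sort like a BED file, then assign a fresh, globally unique ORF index
    concatenated = concatenated.sort_values(['seqname', 'start']).reset_index(drop=True)
    concatenated['orf_num'] = range(len(concatenated))
    return concatenated


# orfs = concatenate_orf_beds(["annotated.orfs.bed.gz", "de-novo.orfs.bed.gz"])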
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script constructs the profile for each ORF. It first adjusts
        the mapped read positions to properly align with the P-sites. Second, it uses
        a custom chrom-sweep algorithm to find the coverage of each position in each
        exon of each ORF. Finally, the ORF exons are glued together to find the profile
        of the entire ORF.""")

    parser.add_argument('bam', help="The bam file including filtered (unique, etc.) alignments")
    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")
    parser.add_argument('exons', help="The (bed6+2) file containing the exons")
    parser.add_argument('out', help="The (mtx.gz) output file containing the ORF profiles")

    parser.add_argument('-l', '--lengths', help="""If any values are given, then only
        reads which have those lengths will be included in the signal construction.""",
        type=int, default=[], nargs='*')

    parser.add_argument('-o', '--offsets', help="""The 5' end of reads will be shifted
        by this amount. There must be one offset value for each length (given by the
        --lengths argument).""", type=int, default=[], nargs='*')

    parser.add_argument('-k', '--num-exons', help="If k>0, then only the first k exons "
        "will be processed.", type=int, default=0)

    parser.add_argument('-g', '--num-groups', help="""The number of groups into which
        to split the exons. More groups means the progress bar is updated more
        frequently but incurs more overhead because of the parallel calls.""",
        type=int, default=default_num_groups)

    parser.add_argument('--seqname-prefix', help="""If present, this string will be
        prepended to the seqname field of the ORFs.""", default='')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)
    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    # we do not need the data frame anymore, so save some memory
    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for group_index, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full list of p-sites
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()

    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)

    sum_profiles = parallel.apply_parallel_iter(
        exons_psites,
        args.num_cpus,
        get_all_p_site_intersections,
        num_orfs,
        max_orf_len,
        progress_bar=True,
        total=len(exons_dfs),
        backend='multiprocessing'
    )

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)

    f = lambda x, y: x + y
    sum_profiles = functools.reduce(f, sum_profiles)
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()):
        orf_num = reverse_orf['orf_num']

        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
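# Profiles are accumulated in the matrix left-to-right, so ORFs on the reverse
# strand have to be reversed within their own length before being written out,
# exactly as in the loop above. A self-contained illustration of that LIL-matrix
# manipulation on toy data:
import numpy as np
import scipy.sparse

num_orfs_toy, max_orf_len_toy = 2, 9
profiles_toy = scipy.sparse.lil_matrix((num_orfs_toy, max_orf_len_toy))
profiles_toy[1, :6] = np.array([5.0, 0, 0, 3, 0, 1])  # a reverse-strand ORF of length 6

orf_num_toy, orf_len_toy = 1, 6
dense = profiles_toy[orf_num_toy, :orf_len_toy].toarray().ravel()
profiles_toy[orf_num_toy, :orf_len_toy] = dense[::-1]  # flip only within the ORF length

print(profiles_toy.toarray())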
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script identifies the ORF peptide matches for all samples in "
                    "a project.")

    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field', help="The field to use for "
        "filtering the peptides from MaxQuant", default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value', help="All peptides with a value "
        "greater than the filter value will be removed", type=float,
        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator', help="The separator in the "
        "peptide file", default=default_peptide_separator)

    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    call = not args.do_not_call

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files', 'peptide_cell_type_analysis', 'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict, 'peptide_separator')

    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():

        if cell_type not in cell_types:
            msg = ("Could not find cell_type specification. Please check the config "
                   "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".format(
                cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = ("Could not find peptide_file specification. Please check "
                       "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'], cell_type, peptide_file,
                is_filtered=True, note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str, peptide_filter_value_str,
                peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Extract the ORF profiles for each specified read length "
                    "and offset independently, creating one sparse matrix file (mtx) for "
                    "each read length. These are then collected into a 'sparse tensor'.")

    parser.add_argument('config', help="The yaml config file.")

    parser.add_argument('name', help="The name of either one of the 'riboseq_samples' "
        "or 'riboseq_biological_replicates' from the config file.")

    parser.add_argument('out', help="The output (txt.gz) file. N.B. The output uses "
        "base-0 indexing, contrary to the unsmoothed ORF profiles, which are written "
        "using the matrix market format (base-1 indexing).")

    parser.add_argument('-c', '--is-condition', help="If this flag is present, "
        "then 'name' will be taken to be a condition name. The profiles for "
        "all relevant replicates of the condition will be created.",
        action='store_true')

    parser.add_argument('--add-ids', help="If this flag is present, "
        "then orf_ids will be added to the final output.", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    msg = "[create-read-length-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    msg = "Reading config file"
    logger.info(msg)
    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # pull out what we need from the config file
    is_unique = not ('keep_riboseq_multimappers' in config)
    seqname_str = utils.get_config_argument(config, 'seqname_prefix')
    note = config.get('note', None)
    orf_note = config.get('orf_note', None)

    orfs = filenames.get_orfs(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note
    )

    exons = filenames.get_exons(
        config['genome_base_path'],
        config['genome_name'],
        note=orf_note,
        is_orf=True
    )

    # make sure the necessary files exist
    required_files = [orfs, exons]
    msg = "[create-read-length-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    # process one sample or all samples from condition
    names = [args.name]
    is_condition_str = ""
    if args.is_condition:
        is_condition_str = "--is-condition"
        riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
        names = [n for n in riboseq_replicates[args.name]]

    job_ids = []
    for name in names:

        msg = "Processing sample: {}".format(name)
        logger.info(msg)

        # now the relevant files
        bam = filenames.get_riboseq_bam(
            config['riboseq_data'],
            name,
            is_unique=is_unique,
            note=note
        )

        # make sure the necessary files exist
        required_files = [bam]
        msg = "[create-read-length-orf-profiles]: Some input files were missing: "
        utils.check_files_exist(required_files, msg=msg)

        # get the lengths and offsets which meet the required criteria from the config file
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique
        )

        if len(lengths) == 0:
            msg = ("No periodic read lengths and offsets were found. Try relaxing "
                   "min_metagene_profile_count, min_metagene_bf_mean, "
                   "max_metagene_bf_var, and/or min_metagene_bf_likelihood. Quitting.")
            logger.critical(msg)
            return

        for length, offset in zip(lengths, offsets):
            lengths_str = "--lengths {}".format(length)
            offsets_str = "--offsets {}".format(offset)

            mtx = filenames.get_riboseq_profiles(
                config['riboseq_data'],
                name,
                length=[length],
                offset=[offset],
                is_unique=is_unique,
                note=note
            )

            cmd = "extract-orf-profiles {} {} {} {} {} {} {} {} {}".format(
                bam, orfs, exons, mtx, lengths_str, offsets_str,
                seqname_str, cpus_str, logging_str)

            job_id = slurm.check_sbatch(cmd, args=args)
            job_ids.append(job_id)

    add_ids_str = ""
    if args.add_ids:
        add_ids_str = "--add-ids"

    cmd = "collect-read-length-orf-profiles {} {} {} {} {} {}".format(
        args.config, args.name, args.out, is_condition_str,
        add_ids_str, logging_str)

    slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Extract the ORFs from the given transcripts and write as a BED12+
        file. Additional fields, 'orf_len' and 'orf_num', give the length of each ORF and
        its index (used to write the ORF profiles). A third additional field records
        duplicated ORFs from transcript variants.''')

    parser.add_argument('transcripts_bed', help='''The BED12 file containing the
        transcript information.''')

    parser.add_argument('transcripts_fasta', help='''The fasta file containing the
        spliced transcript sequences.''')

    parser.add_argument('out', help='''The output (BED12+ gz) file.''')

    parser.add_argument('--start-codons', help='''A list of codons which will be treated
        as start codons when extracting the ORFs.''', nargs='+',
        default=default_start_codons)

    parser.add_argument('--stop-codons', help='''A list of codons which will be treated
        as stop codons when extracting the ORFs.''', nargs='+',
        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)
    transcripts_iter = ((get_transcript(transcript_header, transcripts_bed),
                         transcript_sequence)
                        for (transcript_header, transcript_sequence)
                        in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)
    orfs.reset_index(drop=True, inplace=True)

    # duplicates are removed arbitrarily (keep the first), however we keep
    # all matching transcripts for reference
    msg = "Marking and removing duplicate ORFs"
    logger.info(msg)

    groupby_duplicates = orfs.groupby(DUPLICATE_FIELDS,
                                      as_index=False).agg({'id': ','.join})
    orfs = orfs.merge(groupby_duplicates, how='left', on=DUPLICATE_FIELDS)
    orfs.drop_duplicates(subset=DUPLICATE_FIELDS, inplace=True, keep='first')
    orfs.rename(columns={'id_x': 'id', 'id_y': 'duplicates'}, inplace=True)

    msg = "Numbering remaining ORFs"
    logger.info(msg)
    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
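# get_orfs (applied in parallel above) scans each spliced transcript sequence for
# start/stop codon pairs in the same reading frame, using the compiled regular
# expressions. A toy, self-contained version of that scan; the real implementation
# additionally maps each ORF back to genomic coordinates via the BED12 blocks:
import re

start_re_toy = re.compile('|'.join(['ATG']))
stop_re_toy = re.compile('|'.join(['TAA', 'TAG', 'TGA']))


def find_orfs(seq):
    orfs = []
    for start in (m.start() for m in start_re_toy.finditer(seq)):
        for stop in (m.start() for m in stop_re_toy.finditer(seq, start)):
            if (stop - start) % 3 == 0:         # same reading frame
                orfs.append((start, stop + 3))  # include the stop codon
                break                           # keep only the first in-frame stop
    return orfs


print(find_orfs("CCATGAAATGACCCTAA"))  # [(2, 11)]: ATG AAA TGA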
def main(): global profiles_data, profiles_indices, profiles_indptr, profiles_shape global translated_models, untranslated_models global args parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description="""This script uses Hamiltonian MCMC with Stan to estimate translation parameters for a set of regions (presumably ORFs). Roughly, it takes as input: (1) a set of regions (ORFs) and their corresponding profiles (2) a "translated" model which gives the probability that a region is translated (3) an "untranslated" model which gives the probability that a region is not translated. The script first smoothes the profiles using LOWESS. It then calculates both the Bayes' factor (using the smoothed profile) and chi2 value (using the raw counts) for each ORF.""" ) parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)") parser.add_argument( 'regions', help="The regions (ORFs) for which predictions will be made (BED12+)") parser.add_argument('out', help="The output file for the Bayes' factors (BED12+)") parser.add_argument('--chi-square-only', help="""If this flag is present, then only the chi square test will be performed for each ORF. This can also be a way to get the counts within each of the ORFs.""", action='store_true') parser.add_argument('--translated-models', help="The models to use as H_t (pkl)", nargs='+') parser.add_argument('--untranslated-models', help="The models to use as H_u (pkl)", nargs='+') # filtering options parser.add_argument( '--orf-types', help= "If values are given, then only orfs with those types are processed.", nargs='*', default=translation_options['orf_types']) parser.add_argument('--orf-type-field', default=default_orf_type_field) parser.add_argument( '--min-length', help="ORFs with length less than this value will not be processed", type=int, default=translation_options['orf_min_length_pre']) parser.add_argument( '--max-length', help="ORFs with length greater than this value will not be processed", type=int, default=translation_options['orf_max_length_pre']) parser.add_argument( '--min-profile', help="""ORFs with profile sum (i.e., number of reads) less than this value will not be processed.""", type=float, default=translation_options['orf_min_profile_count_pre']) # smoothing options parser.add_argument('--fraction', help="The fraction of signal to use in LOWESS", type=float, default=translation_options['smoothing_fraction']) parser.add_argument( '--reweighting-iterations', help="The number of reweighting " "iterations to use in LOWESS. 
" "Please see the statsmodels documentation for a " "detailed description of this parameter.", type=int, default=translation_options['smoothing_reweighting_iterations']) # MCMC options parser.add_argument('-s', '--seed', help="The random seeds to use for inference", type=int, default=translation_options['seed']) parser.add_argument('-c', '--chains', help="The number of MCMC chains to use", type=int, default=translation_options['chains']) parser.add_argument( '-i', '--iterations', help="The number of MCMC iterations to use for each chain", type=int, default=translation_options['translation_iterations']) # behavior options parser.add_argument( '--num-orfs', help="If n>0, then only this many ORFs will be processed", type=int, default=0) parser.add_argument('--orf-num-field', default=default_orf_num_field) parser.add_argument('--do-not-compress', help="Unless otherwise specified, the output will " "be written in GZip format", action='store_true') parser.add_argument('-g', '--num-groups', help="The number of groups into which to split " "the ORFs. More groups means the progress bar is " "updated more frequently but incurs more overhead " "because of the parallel calls.", type=int, default=default_num_groups) slurm.add_sbatch_options(parser) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return # read in the regions and apply the filters msg = "Reading and filtering ORFs" logger.info(msg) regions = bed_utils.read_bed(args.regions) # by default, keep everything m_filters = np.array([True] * len(regions)) if len(args.orf_types) > 0: m_orf_type = regions[args.orf_type_field].isin(args.orf_types) m_filters = m_orf_type & m_filters # min length if args.min_length > 0: m_min_length = regions['orf_len'] >= args.min_length m_filters = m_min_length & m_filters # max length if args.max_length > 0: m_max_length = regions['orf_len'] <= args.max_length m_filters = m_max_length & m_filters # min profile profiles = scipy.io.mmread(args.profiles).tocsr() profiles_sums = profiles.sum(axis=1) good_orf_nums = np.where(profiles_sums >= args.min_profile) good_orf_nums = set(good_orf_nums[0]) m_profile = regions['orf_num'].isin(good_orf_nums) m_filters = m_profile & m_filters regions = regions[m_filters] if args.num_orfs > 0: regions = regions.head(args.num_orfs) regions = regions.reset_index(drop=True) msg = "Number of regions after filtering: {}".format(len(regions)) logger.info(msg) logger.debug("Reading models") translated_models = [ pickle.load(open(tm, 'rb')) for tm in args.translated_models ] untranslated_models = [ pickle.load(open(bm, 'rb')) for bm in args.untranslated_models ] profiles_data = multiprocessing.RawArray(ctypes.c_double, profiles.data.flat) profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices) profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr) profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape) with suppress_stdout_stderr(): bfs_l = parallel.apply_parallel_split(regions, args.num_cpus, get_all_bayes_factors_args, num_groups=args.num_groups, progress_bar=True, backend='multiprocessing') bfs = pd.concat(bfs_l) # write the results as a bed12+ file bed_utils.write_bed(bfs, args.out)
def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, description= "This script creates the plots which detail the basic characteristics " "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if " "possible) a latex report for them.") parser.add_argument('config', help="The (yaml) config file") parser.add_argument('out', help="The base output directory for the latex report") parser.add_argument( '--show-unfiltered-orfs', help="If this flag is " "present, bar charts showing the distribution of the types of the " "unfiltered ORF set will be included", action='store_true') parser.add_argument( '--show-orf-periodicity', help="If this flag is " "present, bar charts showing the periodicity of each ORF type will be " "included in the report.", action='store_true') parser.add_argument('--uniprot', help="The uniprot ORF lengths, if available", default=default_uniprot) parser.add_argument('--uniprot-label', help="The label to use for the uniprot ORFs in " "the plot", default=default_uniprot_label) parser.add_argument('--image-type', help="The format of the image files. This must be " "a format usable by matplotlib.", default=default_image_type) parser.add_argument('--overwrite', help="If this flag is present, existing files will " "be overwritten.", action='store_true') parser.add_argument( '--note', help="If this option is given, it will be used in the " "filenames.\n\nN.B. This REPLACES the note in the config file.", default=None) parser.add_argument( '--show-chisq', help="If this flag is given, then the " "results from Rp-chi will be included in the document; otherwise, they " "will not be created or shown.", action='store_true') parser.add_argument('-t', '--tmp', help="A location for temporary files", default=None) slurm.add_sbatch_options(parser, num_cpus=default_num_cpus) logging_utils.add_logging_options(parser) args = parser.parse_args() logging_utils.update_logging(args) config = yaml.load(open(args.config), Loader=yaml.FullLoader) # keep multimappers? is_unique = not ('keep_riboseq_multimappers' in config) programs = [ 'create-orf-length-distribution-line-graph', 'create-orf-types-bar-chart', 'visualize-orf-type-metagene-profiles' ] shell_utils.check_programs_exist(programs) required_keys = ['riboseq_data', 'riboseq_samples'] utils.check_keys_exist(config, required_keys) if args.use_slurm: cmd = ' '.join(sys.argv) slurm.check_sbatch(cmd, args=args) return # by default, we will not include chisq chisq_values = [False] if args.show_chisq: chisq_values = [True, False] filtered_values = [True] if args.show_unfiltered_orfs: filtered_values = [True, False] grouped_values = [True, False] # make sure the path to the output file exists os.makedirs(args.out, exist_ok=True) # first, create all of the figures create_all_figures(config, args) note_str = config.get('note', None) out_note_str = note_str if args.note is not None and len(args.note) > 0: out_note_str = args.note fraction = config.get('smoothing_fraction', None) reweighting_iterations = config.get('smoothing_reweighting_iterations', None) project_name = config.get("project_name", default_project_name) title = "Rp-Bp prediction analysis for {}".format(project_name) abstract = "This document shows the results of the Rp-Bp pipeline analysis." 
#tex_file = os.path.join(args.out, "prediction-report.tex") tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str) with open(tex_file, 'w') as out: latex.begin_document(out, title, abstract) latex.write(out, "\n") latex.clearpage(out) ### ORF type distributions title = "Predicted ORF type distributions" latex.section(out, title) # first, handle all of the regular datasets sample_names = sorted(config['riboseq_samples'].keys()) # and check if we also have replicates replicate_names = [] if 'riboseq_biological_replicates' in config: replicate_names = sorted( ribo_utils.get_riboseq_replicates(config).keys()) strands = ["+", "-"] i = 0 for sample_name in sample_names: try: lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets( config, sample_name, is_unique=is_unique, default_params=metagene_options) except FileNotFoundError: msg = ( "Could not parse out lengths and offsets for sample: {}. " "Skipping".format(sample_name)) logger.error(msg) continue caption = "ORF types: {}".format(sample_name) is_first = True # first, just dump all of the bar charts to the page it = itertools.product(grouped_values, chisq_values, filtered_values) for is_grouped, is_chisq, is_filtered in it: if is_chisq: f = None rw = None else: f = fraction rw = reweighting_iterations orf_types_bar_chart = filenames.get_orf_types_bar_chart( config['riboseq_data'], sample_name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str, image_type=args.image_type, fraction=f, reweighting_iterations=rw, is_grouped=is_grouped, is_chisq=is_chisq, is_filtered=is_filtered) msg = "Looking for image file: {}".format(orf_types_bar_chart) logger.debug(msg) if os.path.exists(orf_types_bar_chart): if is_first or (i % 4 == 0): latex.begin_figure(out) is_first = False i += 1 latex.write_graphics(out, orf_types_bar_chart, height=0.15) if i % 6 == 0: latex.write_caption(out, caption) latex.end_figure(out) latex.clearpage(out) else: msg = "Could not find image: {}".format( orf_types_bar_chart) logger.warning(msg) if (i > 0) and (i % 6) != 0: latex.write_caption(out, caption) latex.end_figure(out) #latex.clearpage(out) if i % 6 != 0: latex.clearpage(out) # now, if the config file specifies replicates, create figures for those i = 0 for replicate_name in replicate_names: lengths = None offsets = None caption = "ORF types: {}".format(replicate_name) it = itertools.product(grouped_values, chisq_values, filtered_values) is_first = True for is_grouped, is_chisq, is_filtered in it: if is_chisq: f = None rw = None else: f = fraction rw = reweighting_iterations orf_types_bar_chart = filenames.get_orf_types_bar_chart( config['riboseq_data'], replicate_name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str, image_type=args.image_type, fraction=f, reweighting_iterations=rw, is_grouped=is_grouped, is_chisq=is_chisq, is_filtered=is_filtered) msg = "Looking for image file: {}".format(orf_types_bar_chart) logger.debug(msg) if os.path.exists(orf_types_bar_chart): if is_first or (i % 4 == 0): latex.begin_figure(out) is_first = False i += 1 latex.write_graphics(out, orf_types_bar_chart, height=0.15) if i % 6 == 0: latex.write_caption(out, caption) latex.end_figure(out) latex.clearpage(out) else: msg = "Could not find image: {}".format( orf_types_bar_chart) logger.warning(msg) if (i > 0) and (i % 6) != 0: latex.write_caption(out, caption) latex.end_figure(out) #latex.clearpage(out) if i % 6 != 0: latex.clearpage(out) ### ORF type length distributions title = "Predicted ORF type length distributions" 
latex.section(out, title) i = 0 for sample_name in sample_names: try: lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets( config, sample_name, is_unique=is_unique, default_params=metagene_options) except FileNotFoundError: msg = ( "Could not parse out lengths and offsets for sample: {}. " "Skipping".format(sample_name)) logger.error(msg) continue caption = "ORF type length distributions: {}".format(sample_name) is_first = True it = itertools.product(grouped_values, chisq_values) for is_grouped, is_chisq in it: if is_chisq: f = None rw = None else: f = fraction rw = reweighting_iterations orf_length_line_graph = filenames.get_orf_length_distribution_line_graph( config['riboseq_data'], sample_name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str, image_type=args.image_type, fraction=f, reweighting_iterations=rw, is_grouped=is_grouped, is_chisq=is_chisq) if os.path.exists(orf_length_line_graph): if is_first or (i % 4 == 0): latex.begin_figure(out) is_first = False i += 1 latex.write_graphics(out, orf_length_line_graph, height=0.15) if i % 4 == 0: latex.write_caption(out, caption) latex.end_figure(out) latex.clearpage(out) else: msg = "Could not find image: {}".format( orf_length_line_graph) logger.debug(msg) if (i > 0) and (i % 4) != 0: latex.write_caption(out, caption) latex.end_figure(out) #latex.clearpage(out) if i % 4 != 0: latex.clearpage(out) # now, if the config file specifies replicates, create figures for those i = 0 for replicate_name in replicate_names: lengths = None offsets = None caption = "ORF types: {}".format(replicate_name) is_first = True it = itertools.product(grouped_values, chisq_values) for is_grouped, is_chisq in it: if is_chisq: f = None rw = None else: f = fraction rw = reweighting_iterations orf_length_line_graph = filenames.get_orf_length_distribution_line_graph( config['riboseq_data'], replicate_name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str, image_type=args.image_type, fraction=f, reweighting_iterations=rw, is_grouped=is_grouped, is_chisq=is_chisq) if os.path.exists(orf_length_line_graph): if is_first or (i % 4 == 0): latex.begin_figure(out) is_first = False i += 1 latex.write_graphics(out, orf_length_line_graph, height=0.15) if i % 4 == 0: latex.write_caption(out, caption) latex.end_figure(out) latex.clearpage(out) else: msg = "Could not find image: {}".format( orf_length_line_graph) logger.debug(msg) if (i > 0) and (i % 4) != 0: latex.write_caption(out, caption) latex.end_figure(out) #latex.clearpage(out) if i % 4 != 0: latex.clearpage(out) ### ORF type metagene profiles if args.show_orf_periodicity: title = "Predicted ORF type metagene profiles" latex.section(out, title) i = 0 for sample_name in sample_names: try: lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets( config, sample_name, is_unique=is_unique, default_params=metagene_options) except FileNotFoundError: msg = ( "Could not parse out lengths and offsets for sample: {}. 
" "Skipping".format(sample_name)) logger.error(msg) continue caption = "ORF type metagene profiles: {}".format(sample_name) is_first = True for is_chisq in chisq_values: if is_chisq: f = None rw = None else: f = fraction rw = reweighting_iterations orf_type_profile_base = filenames.get_orf_type_profile_base( config['riboseq_data'], sample_name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str, fraction=f, reweighting_iterations=rw, is_chisq=is_chisq) it = itertools.product(ribo_utils.orf_types, strands) for orf_type, strand in it: orf_type_profile = filenames.get_orf_type_profile_image( orf_type_profile_base, orf_type, strand, args.image_type) msg = "Looking for image file: {}".format( orf_type_profile) logger.debug(msg) if os.path.exists(orf_type_profile): if is_first or (i % 4 == 0): latex.begin_figure(out) is_first = False i += 1 latex.write_graphics(out, orf_type_profile, height=0.23) if i % 4 == 0: latex.write_caption(out, caption) latex.end_figure(out) latex.clearpage(out) else: msg = "Could not find image: {}".format( orf_type_profile) logger.warning(msg) if (i > 0) and (i % 4 != 0): latex.write_caption(out, caption) latex.end_figure(out) #latex.clearpage(out) if i % 4 != 0: latex.clearpage(out) i = 0 for replicate_name in replicate_names: lengths = None offsets = None caption = "ORF type metagene profiles: {}".format( replicate_name) is_first = True for is_chisq in chisq_values: if is_chisq: f = None rw = None else: f = fraction rw = reweighting_iterations orf_type_profile_base = filenames.get_orf_type_profile_base( config['riboseq_data'], replicate_name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str, fraction=f, reweighting_iterations=rw, is_chisq=is_chisq) it = itertools.product(ribo_utils.orf_types, strands) for orf_type, strand in it: orf_type_profile = filenames.get_orf_type_profile_image( orf_type_profile_base, orf_type, strand, args.image_type) if os.path.exists(orf_type_profile): if is_first or (i % 4 == 0): latex.begin_figure(out) is_first = False i += 1 latex.write_graphics(out, orf_type_profile, height=0.23) if i % 4 == 0: latex.write_caption(out, caption) latex.end_figure(out) latex.clearpage(out) else: msg = "Could not find image: {}".format( orf_type_profile) logger.debug(msg) if (i > 0) and (i % 4 != 0): latex.write_caption(out, caption) latex.end_figure(out) #latex.clearpage(out) if i % 4 != 0: latex.clearpage(out) latex.end_document(out) tex_filename = os.path.basename(tex_file) latex.compile(args.out, tex_filename)