def create_all_figures(config, args):

    # first, handle all of the regular datasets
    is_replicate = False
    sample_names = sorted(config['riboseq_samples'].keys())
    sample_name_map = ribo_utils.get_sample_name_map(config)
    samples = [
        (name, sample_name_map[name], is_replicate) for name in sample_names
    ]

    is_replicate = True
    replicate_names = sorted(ribo_utils.get_riboseq_replicates(config).keys())
    condition_name_map = ribo_utils.get_riboseq_condition_name_map(config)
    conditions = [
        (name, condition_name_map[name], is_replicate) for name in replicate_names
    ]

    all_names = samples + conditions

    parallel.apply_parallel_iter(
        all_names,
        args.num_cpus,
        _create_figures,
        config,
        args,
        progress_bar=True
    )
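# N.B. Throughout these scripts, parallel.apply_parallel_iter is assumed to
# behave roughly like the sketch below: it applies `func` to each item of
# `items` (passing any extra positional arguments after the item) across
# `num_cpus` worker processes and returns the list of results. The real
# helper (from the misc/pbiotools package) also supports `progress_bar=` and
# `total=` keywords; this minimal multiprocessing-based version is only
# illustrative, not the actual implementation.
import functools
import multiprocessing

def _call_with_args(func, args, item):
    return func(item, *args)

def apply_parallel_iter_sketch(items, num_cpus, func, *args):
    """Apply func(item, *args) to every item using a process pool."""
    with multiprocessing.Pool(num_cpus) as pool:
        # bind the shared trailing arguments; the pool supplies each item
        worker = functools.partial(_call_with_args, func, args)
        return pool.map(worker, items)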
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config))

    res = parallel.apply_parallel_iter(
        config['riboseq_samples'].items(),
        args.num_cpus,
        get_counts,
        config,
        args)

    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
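# The get_counts worker is not shown here. Per the description above, it
# parses FastQC results with the crimson package; the following is only a
# hedged sketch of that kind of call, and the exact payload layout may
# differ between crimson versions.
from crimson import fastqc

def parse_fastqc_sketch(fastqc_data_path):
    # crimson converts the FastQC plain-text report into a nested dict,
    # keyed by module name
    payload = fastqc.parse(fastqc_data_path)
    # the "Basic Statistics" module includes, e.g., the total sequence count
    return payload["Basic Statistics"]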
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Summarize the evaluation metrics for all scenarios")
    clu.add_config(parser)
    parser.add_argument('out')
    clu.add_cv_options(parser)
    clu.add_num_cpus(parser)
    automl_utils.add_blas_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    if len(args.folds) == 0:
        args.folds = list(range(1, 11))
    clu.validate_folds_options(args)

    required_keys = ['base_path', 'training_scenarios_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    if automl_utils.spawn_for_blas(args):
        return

    scenarios = utils.list_subdirs(config['training_scenarios_path'])
    use_random_forests = [False]  # , True]

    it = itertools.product(scenarios, use_random_forests)

    all_stats = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        get_stats_summary,
        args,
        config
    )

    msg = "Combining statistics"
    logger.info(msg)

    all_stats_df = pd.DataFrame(all_stats)
    pd_utils.write_df(
        all_stats_df,
        args.out,
        create_path=True,
        do_not_compress=True,
        index=False
    )
def _refit_random_forest_ensemble(self):
    # we can just retrain these from scratch
    ret = parallel.apply_parallel_iter(
        self.solvers, self.args.num_cpus, self._fit_regressor)
    self.solver_asl_regressors_ = dict(ret)

    # now, build the stacking datasets using the new regressors
    self.X_stacking_train = self._get_stacking_model_dataset_rf(self.X_train)
    best_solvers = self.orig_y_train.idxmin(axis=1)
    self.y_stacking_train = self.le_.transform(best_solvers)

    # and actually update the stacking model
    self.stacking_model_ = self.stacking_model.fit(
        self.X_stacking_train, self.y_stacking_train)
def _refit_asl_wrapper_ensemble(self):
    # first, we need to refit each individual regressor
    ret = parallel.apply_parallel_iter(
        self.solvers, self.args.num_cpus, self._refit_asl_regressors)

    # the _refit_asl_regressors method has the side effect of updating
    # the state of the asl_wrappers.... sorry, pure functional folks.

    # rebuild the stacking model
    self.X_stacking_train = self._get_stacking_model_dataset_asl(self.X_train)
    best_solvers = self.orig_y_train.idxmin(axis=1)
    self.y_stacking_train = self.le_.transform(best_solvers)

    new_ensemble = automl_utils.retrain_asl_wrapper(
        self.stacking_model_, self.X_stacking_train, self.y_stacking_train)

    # and update the ensemble of the model
    self.stacking_model_.ensemble_ = new_ensemble
def _fit_regressors(self):
    # create the regressors for each solver
    if self.use_random_forests:
        self.solver_asl_regressors = {
            solver: RandomForestRegressor(n_estimators=100)
            for solver in self.solvers
        }
    else:
        self.solver_asl_regressors = {
            solver: automl_utils.AutoSklearnWrapper(
                estimator_named_step="regressor", args=self.args)
            for solver in self.solvers
        }

    # fit the regressors
    ret = parallel.apply_parallel_iter(
        self.solvers, self.args.num_cpus, self._fit_regressor)
    self.solver_asl_regressors_ = dict(ret)

    return self
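# The three methods above implement a stacking ensemble for algorithm
# selection: one runtime regressor per solver, plus a classifier stacked on
# top which picks a solver from the regressors' predicted runtimes. A
# minimal, self-contained sketch of the same pattern using plain
# scikit-learn (all names here are illustrative, not the classes used above):
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

class StackingSelectorSketch:
    def __init__(self, solvers):
        self.solvers = solvers

    def fit(self, X, y_runtimes):
        # one runtime regressor per solver; y_runtimes is a DataFrame with
        # one column of observed runtimes per solver
        self.regressors_ = {
            s: RandomForestRegressor(n_estimators=100).fit(X, y_runtimes[s])
            for s in self.solvers
        }

        # the stacking features are the predicted runtimes of each solver
        X_stacking = np.column_stack(
            [self.regressors_[s].predict(X) for s in self.solvers])

        # the stacking label is the solver with the smallest observed runtime
        self.le_ = LabelEncoder().fit(self.solvers)
        y_stacking = self.le_.transform(y_runtimes.idxmin(axis=1))

        self.stacking_model_ = RandomForestClassifier().fit(
            X_stacking, y_stacking)
        return self

    def predict(self, X):
        X_stacking = np.column_stack(
            [self.regressors_[s].predict(X) for s in self.solvers])
        labels = self.stacking_model_.predict(X_stacking)
        return self.le_.inverse_transform(labels)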
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Run the Bayesian optimization-based approach for "
        "training models for algorithm selection.")
    clu.add_config(parser)
    clu.add_scenario(parser)
    clu.add_simple_presolver_options(parser)
    clu.add_num_cpus(parser)
    clu.add_cv_options(parser)

    automl_utils.add_automl_options(parser, default_total_training_time=20)
    automl_utils.add_blas_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    if len(args.folds) == 0:
        args.folds = list(range(args.num_folds))
    clu.validate_folds_options(args)

    required_keys = ['base_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    # check if we need to spawn a new process for blas
    if automl_utils.spawn_for_blas(args):
        return

    pipeline = parallel.apply_parallel_iter(
        args.folds,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of unique reads in the "
        "given files, all of which must be the same type. In the case of bam "
        "files, it only counts primary alignments (so it does not "
        "double-count multimappers, and it does not include unmapped reads "
        "present in the file).")

    parser.add_argument('files', help="The fasta, fastq or bam files", nargs='+')
    parser.add_argument('-o', '--out', help="The (csv.gz) output file "
        "containing the lengths and counts", required=True)

    parser.add_argument('-f', '--file-type', help="The type of the files. All "
        "files must be of the same type. If the \"AUTO\" file type is given, "
        "then the type will be guessed based on the extension of the first "
        "file using the following heuristic: \"bam\" if the extension is "
        "\".bam\" or \".sam\"; \"fastq\" if the extension is \"fastq\", "
        "\"fastq.gz\", \"fq\", or \"fq.gz\"; \"fasta\" otherwise.",
        choices=file_type_choices, default=default_file_type)

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.file_type == "AUTO":
        args.file_type = guess_file_type(args.files[0])
        msg = "The guessed file type is: {}".format(args.file_type)
        logger.info(msg)

    # grab the correct function pointer
    get_length_distribution = file_type_get_length_distribution[args.file_type]

    msg = "Collecting all read length distributions"
    logger.info(msg)

    all_length_distribution_dfs = parallel.apply_parallel_iter(
        args.files, args.num_cpus, get_length_distribution, progress_bar=True)

    msg = "Combining data frames into one large df"
    logger.info(msg)

    length_distribution_df = pd.concat(all_length_distribution_dfs)

    msg = "Writing counts to disk"
    logger.info(msg)

    pd_utils.write_df(length_distribution_df, args.out, index=False)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script converts bam files to bigWig files. It is mostly "
        "a port of this script: "
        "https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py "
        "by Brad Chapman which avoids a few dependencies.\n\nThe wigToBigWig "
        "program (from UCSC tools) must be in the path.\n\nN.B. If given, the "
        "start and end coordinates must be base-0.")

    parser.add_argument('bam', help="The bam file", nargs='+')
    parser.add_argument('-o', '--overwrite', help="If this flag is given, then "
        "the bigWig file will be created whether it exists or not",
        action='store_true')
    parser.add_argument('-c', '--chrom', help="If specified, only alignments "
        "from this chromosome will be in the output", default=default_chrom)
    parser.add_argument('-s', '--start', help="If specified, only alignments "
        "from this position will be in the output", default=default_start)
    parser.add_argument('-e', '--end', help="If specified, only alignments "
        "up to this position will be in the output", default=default_end)
    parser.add_argument('-n', '--normalize', help="If this flag is given, "
        "then values will be normalized to reads per million",
        action='store_true')
    parser.add_argument('-t', '--use-tempfile', help="If this flag is given, "
        "then a temp file will be used to avoid permission issues",
        action='store_true')
    parser.add_argument('-k', '--keep-wig', help="If this flag is given, then "
        "the wiggle file will not be deleted", action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['wigToBigWig']
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    parallel.apply_parallel_iter(
        args.bam, args.num_cpus, bam_to_wiggle, args, progress_bar=True)
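# The bam_to_wiggle worker is not shown here. A hedged sketch of the core
# conversion it is assumed to perform: write per-position coverage as a
# variableStep wiggle track with pysam, then call the external wigToBigWig
# program named above. The function and variable names are illustrative,
# and the bam file is assumed to be indexed.
import subprocess
import pysam

def bam_to_bigwig_sketch(bam_file, sizes_file, out_prefix):
    wig_file = out_prefix + ".wig"
    with pysam.AlignmentFile(bam_file) as bam, open(wig_file, "w") as wig:
        for chrom in bam.references:
            wig.write("variableStep chrom={}\n".format(chrom))
            for col in bam.pileup(chrom):
                # wiggle positions are 1-based; pysam positions are 0-based
                wig.write("{} {}\n".format(col.reference_pos + 1, col.nsegments))

    # usage: wigToBigWig <in.wig> <chrom.sizes> <out.bw>
    subprocess.check_call(
        ["wigToBigWig", wig_file, sizes_file, out_prefix + ".bw"])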
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions, etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template parts of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned",
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned",
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int,
        default=default_num_cpus)
    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "process is spawned with this flag and otherwise the same "
        "arguments. This flag is not intended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)

    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solvers are present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-auto-sklearn]: the solver is not present in the "
                "ASlib scenario. given: {}. choices: {}".format(
                    solver, solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.load(open(args.config))
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        # There is a lot going on with setting these environment variables.
        # Please see the following references:
        #
        #   Turning off assertions so we can parallelize sklearn across
        #   multiple CPUs for different solvers/folds
        #   https://github.com/celery/celery/issues/1709
        #
        #   Controlling OpenBLAS threads
        #   https://github.com/automl/auto-sklearn/issues/166
        #
        #   Other environment variables controlling thread usage
        #   http://stackoverflow.com/questions/30791550
        ###

        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)

        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts all of the ORFs from the given transcripts. "
        "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
        "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
        "output for this script is _very_ verbose. It is not recommended to run this "
        "script with that logging level.")

    parser.add_argument('transcripts_bed', help="The bed12 file containing the "
        "transcript information")
    parser.add_argument('transcripts_fasta', help="The fasta file containing the "
        "spliced transcript sequences")
    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons', help="A list of codons which will be "
        "treated as start codons when extracting ORFs", nargs='+',
        default=default_start_codons)
    parser.add_argument('--stop-codons', help="A list of codons which will be "
        "treated as stop codons when extracting ORFs", nargs='+',
        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)

    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)
    transcripts_iter = (
        (get_transcript(transcript_header, transcripts_bed), transcript_sequence)
        for (transcript_header, transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(
        transcripts_iter,
        args.num_cpus,
        get_orfs,
        start_codons_re,
        stop_codons_re,
        total=len(transcripts_bed),
        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)

    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)

    bed_utils.write_bed(orfs, args.out)
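# The get_orfs worker itself is not shown here. As an illustration of how
# the compiled start/stop codon regexes above can be used, the following
# sketch scans a transcript sequence for ORFs: for each start codon match,
# it finds the first in-frame stop codon downstream. This is a hedged sketch
# of the general technique, not the actual implementation.
import re

def find_orfs_sketch(sequence, start_re=re.compile('ATG'),
                     stop_re=re.compile('TAA|TAG|TGA')):
    """Yield (start, end) pairs of candidate ORFs in the given sequence."""
    for start_match in start_re.finditer(sequence):
        start = start_match.start()
        # compiled patterns accept a pos argument, so we can resume the
        # stop-codon search just past the start codon
        for stop_match in stop_re.finditer(sequence, start + 3):
            if (stop_match.start() - start) % 3 == 0:
                # end is exclusive and includes the stop codon
                yield (start, stop_match.start() + 3)
                break

# e.g. list(find_orfs_sketch("CCATGAAATGATT")) == [(2, 11)]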
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')
    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')
    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int,
        default=default_min_visualization_count)
    parser.add_argument('--image-type', help="The type of images to create. This "
        "must be an extension which matplotlib can interpret.",
        default=default_image_type)
    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')
    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))

    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)

    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)
    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)
    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")
        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)
        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)
        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")
        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]

            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note,
                image_type=args.image_type)
            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out, read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out, unique_read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0
            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))
                if args.show_read_length_bfs:
                    title = "\\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type,
                        is_unique=is_unique, length=length, note=note)

                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(
            config['riboseq_samples'].items(), args.num_cpus,
            create_fastqc_reports, config, args)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script constructs the profile for each ORF. It "
        "first adjusts the mapped read positions to properly align with "
        "the P-sites. Second, it uses a custom chrom-sweep algorithm to "
        "find the coverage of each position in each exon of each ORF. Finally, "
        "the ORF exons are glued together to find the profile of the entire ORF.")

    parser.add_argument('bam', help="The bam file including filtered (unique, "
        "etc.) alignments")
    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")
    parser.add_argument('exons', help="The (bed6+2) file containing the exons")
    parser.add_argument('out', help="The (mtx.gz) output file containing the "
        "ORF profiles")

    parser.add_argument('-l', '--lengths', help="If any values are given, "
        "then only reads which have those lengths will be included in the "
        "signal construction.", type=int, default=default_lengths, nargs='*')
    parser.add_argument('-o', '--offsets', help="The 5' end of reads will be "
        "shifted by this amount. There must be one offset value for each "
        "length (given by the --lengths argument).", type=int,
        default=default_offsets, nargs='*')

    parser.add_argument('-k', '--num-exons', help="If k>0, then only the "
        "first k exons will be processed.", type=int,
        default=default_num_exons)
    parser.add_argument('-g', '--num-groups', help="The number of groups into "
        "which to split the exons. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.", type=int, default=default_num_groups)

    parser.add_argument('--seqname-prefix', help="If present, this string "
        "will be prepended to the seqname field of the ORFs.",
        default=default_seqname_prefix)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)

    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)

    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for group_index, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full list of psites, so save some memory
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()

    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)

    sum_profiles = parallel.apply_parallel_iter(
        exons_psites,
        args.num_cpus,
        get_all_p_site_intersections,
        num_orfs,
        max_orf_len,
        progress_bar=True,
        total=args.num_groups)

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)

    f = lambda x, y: x + y
    sum_profiles = functools.reduce(f, sum_profiles)
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()):
        orf_num = reverse_orf['orf_num']

        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]

        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
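# A small, self-contained illustration of the combine step above: each
# parallel worker returns a sparse (num_orfs x max_orf_len) count matrix,
# and functools.reduce sums them into a single profile matrix. The shapes
# and values below are made up for the example.
import functools
import numpy as np
import scipy.sparse

num_orfs, max_orf_len = 3, 5

# pretend these came back from three parallel workers
partial_profiles = [
    scipy.sparse.csr_matrix(np.random.poisson(1, (num_orfs, max_orf_len)))
    for _ in range(3)
]

# element-wise sum across workers; sparse matrices support '+'
combined = functools.reduce(lambda x, y: x + y, partial_profiles)

# lil format allows efficient row assignment, e.g. for flipping a
# reverse-strand profile in place as done above
combined_lil = combined.tolil()
combined_lil[0, :max_orf_len] = combined_lil[0, :max_orf_len].toarray()[0][::-1]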
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')
    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')
    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')
    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int,
        default=default_min_visualization_count)
    parser.add_argument('--image-type', help="The type of images to create. This "
        "must be an extension which matplotlib can interpret.",
        default=default_image_type)
    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')
    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config))

    if args.note is not None:
        config['note'] = args.note

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    config = yaml.load(open(args.config))

    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information
    create_read_filtering_plots(args.config, config, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)
    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)
    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    sample_names = sorted(config['riboseq_samples'].keys())

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")
        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)
        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, height=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, height=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)
        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions",
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")
        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]

            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note,
                image_type=args.image_type)
            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(
                read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out, read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out, unique_read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0
            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            create_figures(args.config, config, name, offsets_df, args)

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type,
                    is_unique=is_unique, length=length, note=note)

                title = ("length: {}. P-site offset: {}. \\newline status: {}"
                    "\n".format(length, offset, offset_status))
                latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:
                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type,
                        is_unique=is_unique, length=length, note=note)

                    latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths,
                    offset=offsets, is_unique=is_unique, note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)

                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(
            config['riboseq_samples'].items(), args.num_cpus,
            create_fastqc_reports, config, args)