Example #1

def create_all_figures(config, args):
    
    # first, handle all of the regular datasets

    is_replicate = False
    sample_names = sorted(config['riboseq_samples'].keys())
    sample_name_map = ribo_utils.get_sample_name_map(config)
    samples = [(name, sample_name_map[name], is_replicate) 
        for name in sample_names
    ]
    
    is_replicate = True
    replicate_names = sorted(ribo_utils.get_riboseq_replicates(config).keys())
    condition_name_map = ribo_utils.get_riboseq_condition_name_map(config)
    conditions = [(name, condition_name_map[name], is_replicate)
        for name in replicate_names
    ]

    all_names = samples + conditions
    parallel.apply_parallel_iter(
        all_names,
        args.num_cpus,
        _create_figures,
        config,
        args,
        progress_bar=True
    )
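Every example on this page fans work out through parallel.apply_parallel_iter from the author's misc/pymisc utilities. The helper itself is not shown; judging from the call sites, it calls func(item, *args) for each item across num_cpus worker processes and returns the list of results. A minimal stand-in under that assumption (the progress_bar and total keywords are accepted but ignored in this sketch):

import multiprocessing

def apply_parallel_iter(items, num_procs, func, *args,
                        progress_bar=False, total=None):
    # sketch only: the real helper presumably wires progress_bar and
    # total into a tqdm progress bar; this stand-in ignores them
    if num_procs == 1:
        return [func(item, *args) for item in items]

    # each worker call receives (item, *args), matching the call sites
    # above; func must be picklable (a top-level function or bound method)
    with multiprocessing.Pool(processes=num_procs) as pool:
        return pool.starmap(func, [(item,) + args for item in items])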
Example #2

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    utils.write_df(res_df, args.out, index=False)
Example #3

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use", 
        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(), 
                                        args.num_cpus, 
                                        get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
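get_counts is not part of this listing. For pd.DataFrame(res) to build one row per sample, each call must return a dict (or None for samples to skip, which the filtering step above removes). A hypothetical sketch of that contract; the raw-fastq counting and the field names are illustrative, not the pipeline's actual logic:

import gzip

def get_counts(name_data, config, args):
    # one (sample_name, fastq_path) pair from config['riboseq_samples'].items()
    name, fastq = name_data

    opener = gzip.open if fastq.endswith('.gz') else open
    with opener(fastq, 'rt') as f:
        # a fastq record is exactly four lines
        raw_count = sum(1 for _ in f) // 4

    # one dict per sample becomes one row of the final data frame
    return {'note': name, 'raw_data_count': raw_count}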
Example #4
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Summarize the evaluation metrics for all scenarios")
    
    clu.add_config(parser)
    parser.add_argument('out')

    clu.add_cv_options(parser)
    clu.add_num_cpus(parser)

    automl_utils.add_blas_options(parser)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    if len(args.folds) == 0:
        args.folds = list(range(1, 11))
    clu.validate_folds_options(args)

    required_keys = ['base_path', 'training_scenarios_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    if automl_utils.spawn_for_blas(args):
        return

    scenarios = utils.list_subdirs(config['training_scenarios_path'])
    use_random_forests = [False]  # set to [False, True] to also try random forests
    it = itertools.product(scenarios, use_random_forests)

    all_stats = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        get_stats_summary,
        args,
        config
    )

    msg = "Combining statistics"
    logger.info(msg)

    all_stats_df = pd.DataFrame(all_stats)
    pd_utils.write_df(
        all_stats_df,
        args.out,
        create_path=True,
        do_not_compress=True,
        index=False
    )
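Here the items handed to apply_parallel_iter come from itertools.product, so each item is a (scenario, use_random_forests) pair that the worker unpacks itself. A skeleton of that contract (loading the actual evaluation results is elided, and the returned field names are assumptions):

def get_stats_summary(scenario_rf, args, config):
    # each item produced by itertools.product is a (scenario, flag) pair
    scenario, use_random_forests = scenario_rf

    # ... load and summarize the per-fold evaluation results here ...

    # a dict per combination becomes one row of all_stats_df
    return {'scenario': scenario, 'use_random_forests': use_random_forests}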
Example #5
def create_all_figures(config, args):

    # first, handle all of the regular datasets

    is_replicate = False
    sample_names = sorted(config['riboseq_samples'].keys())
    sample_name_map = ribo_utils.get_sample_name_map(config)
    samples = [(name, sample_name_map[name], is_replicate)
               for name in sample_names]

    is_replicate = True
    replicate_names = sorted(ribo_utils.get_riboseq_replicates(config).keys())
    condition_name_map = ribo_utils.get_riboseq_condition_name_map(config)
    conditions = [(name, condition_name_map[name], is_replicate)
                  for name in replicate_names]

    all_names = samples + conditions
    parallel.apply_parallel_iter(all_names,
                                 args.num_cpus,
                                 _create_figures,
                                 config,
                                 args,
                                 progress_bar=True)
Example #6
    def _refit_random_forest_ensemble(self):

        # we can just retrain these from scratch
        ret = parallel.apply_parallel_iter(self.solvers, self.args.num_cpus,
                                           self._fit_regressor)

        self.solver_asl_regressors_ = dict(ret)

        # now, build the stacking datasets using the new regressors
        self.X_stacking_train = self._get_stacking_model_dataset_rf(
            self.X_train)

        best_solvers = self.orig_y_train.idxmin(axis=1)
        self.y_stacking_train = self.le_.transform(best_solvers)

        # and actually update the stacking model
        self.stacking_model_ = self.stacking_model.fit(self.X_stacking_train,
                                                       self.y_stacking_train)
Example #7
    def _refit_asl_wrapper_ensemble(self):

        # first, we need to refit each individual regressors
        ret = parallel.apply_parallel_iter(self.solvers, self.args.num_cpus,
                                           self._refit_asl_regressors)

        # the _refit_asl_regressors method has the side effect of updating
        # the state of the asl_wrappers.... sorry pure functional folks.

        # rebuild the stacking model
        self.X_stacking_train = self._get_stacking_model_dataset_asl(
            self.X_train)
        best_solvers = self.orig_y_train.idxmin(axis=1)
        self.y_stacking_train = self.le_.transform(best_solvers)

        new_ensemble = automl_utils.retrain_asl_wrapper(
            self.stacking_model_, self.X_stacking_train, self.y_stacking_train)

        # and update the ensemble of the model
        self.stacking_model_.ensemble_ = new_ensemble
Example #8
    def _fit_regressors(self):

        # create the regressors for each solver

        if self.use_random_forests:
            self.solver_asl_regressors = {
                solver: RandomForestRegressor(n_estimators=100)
                for solver in self.solvers
            }
        else:
            self.solver_asl_regressors = {
                solver: automl_utils.AutoSklearnWrapper(
                    estimator_named_step="regressor", args=self.args)
                for solver in self.solvers
            }

        # fit the regressors
        ret = parallel.apply_parallel_iter(self.solvers, self.args.num_cpus,
                                           self._fit_regressor)

        self.solver_asl_regressors_ = dict(ret)

        return self
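_fit_regressor is not shown, but both this method and _refit_random_forest_ensemble above call dict(ret) on its results, so it must return a (solver, fitted_model) pair. A minimal sketch under that assumption (the X_train/y_train attribute names are guesses from context):

    def _fit_regressor(self, solver):
        # look up the unfitted regressor created in _fit_regressors
        model = self.solver_asl_regressors[solver]

        # learn to predict this solver's runtime from the instance features
        model_ = model.fit(self.X_train, self.y_train[solver])

        # a (key, value) pair, so dict(ret) yields a solver -> model map
        return solver, model_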
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Run the Bayesian optimization-based approach for "
        "training models for algorithm selection.")

    clu.add_config(parser)
    clu.add_scenario(parser)
    clu.add_simple_presolver_options(parser)
    clu.add_num_cpus(parser)
    clu.add_cv_options(parser)

    automl_utils.add_automl_options(parser, default_total_training_time=20)
    automl_utils.add_blas_options(parser)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    if len(args.folds) == 0:
        args.folds = list(range(args.num_folds))
    clu.validate_folds_options(args)

    required_keys = ['base_path']
    config = as_asl_utils.load_config(args.config, required_keys)

    # check if we need to spawn a new process for blas
    if automl_utils.spawn_for_blas(args):
        return

    pipeline = parallel.apply_parallel_iter(args.folds,
                                            args.num_cpus,
                                            _outer_cv,
                                            args,
                                            config,
                                            progress_bar=True)
Example #10
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script counts the number of unique reads in the "
        "given files, all of which must be the same type. In the case of bam "
        "files, it only counts primary alignments (so it does not "
        "double-count multimappers, and it does not include unmapped reads "
        "present in the file.")

    parser.add_argument('files',
                        help="The fasta, fastq or bam files",
                        nargs='+')
    parser.add_argument('-o',
                        '--out',
                        help="The (csv.gz) output file "
                        "containing the lengths and counts",
                        required=True)

    parser.add_argument(
        '-f',
        '--file-type',
        help="The type of the files. All files must be of the same type. If "
        "the \"AUTO\" file type is given, then the type will be guessed based "
        "on the extension of the first file using the following heuristic: "
        "\"bam\" if the extension is \".bam\" or \".sam\"; \"fastq\" if the "
        "extension is \"fastq\", \"fastq.gz\", \"fq\", or \"fq.gz\"; "
        "\"fasta\" otherwise.",
        choices=file_type_choices,
        default=default_file_type)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of CPUs to use",
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.file_type == "AUTO":
        args.file_type = guess_file_type(args.files[0])
        msg = "The guessed file type is: {}".format(args.file_type)
        logger.info(msg)

    # grab the correct function pointer
    get_length_distribution = file_type_get_length_distribution[args.file_type]

    msg = "Collecting all read length distributions"
    logger.info(msg)

    all_length_distribution_dfs = parallel.apply_parallel_iter(
        args.files, args.num_cpus, get_length_distribution, progress_bar=True)

    msg = "Combining data frames into one large df"
    logger.info(msg)
    length_distribution_df = pd.concat(all_length_distribution_dfs)

    msg = "Writing counts to disk"
    logger.info(msg)

    pd_utils.write_df(length_distribution_df, args.out, index=False)
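The --file-type help text above fully specifies the AUTO heuristic, so guess_file_type can be reconstructed almost mechanically. A sketch of that heuristic (not necessarily the original implementation):

def guess_file_type(filename):
    name = filename.lower()

    # "bam" if the extension is ".bam" or ".sam"
    if name.endswith(('.bam', '.sam')):
        return 'bam'

    # "fastq" if the extension is ".fastq", ".fastq.gz", ".fq", or ".fq.gz"
    if name.endswith(('.fastq', '.fastq.gz', '.fq', '.fq.gz')):
        return 'fastq'

    # "fasta" otherwise
    return 'fasta'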
Example #11
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script converts bam files to bigWig files. It is mostly "
        "a port of this script: https://github.com/chapmanb/bcbb/blob/master/nextgen/scripts/bam_to_wiggle.py "
        "by Brad Chapman which avoids a few dependencies.\n\nThe wigToBigWig "
        "program (from UCSC tools) must be in the path.\n\nN.B. If given, the "
        "start and end coordinates must be base-0.")

    parser.add_argument('bam', help="The bam file", nargs='+')
    parser.add_argument(
        '-o',
        '--overwrite',
        help="If this flag is given, then "
        "the bigWig file will be created whether it exists or not",
        action='store_true')
    parser.add_argument('-c',
                        '--chrom',
                        help="If specified, only alignments "
                        "from this chromosome will be in the output",
                        default=default_chrom)
    parser.add_argument('-s',
                        '--start',
                        help="If specified, only alignments "
                        "from this position will be in the output",
                        default=default_start)
    parser.add_argument('-e',
                        '--end',
                        help="If specified, only alignments "
                        "up to this position will be in the output",
                        default=default_end)

    parser.add_argument('-n',
                        '--normalize',
                        help="If this flag is given, "
                        "then values will be normalized to reads per million",
                        action='store_true')

    parser.add_argument(
        '-t',
        '--use-tempfile',
        help="If this flag is given, "
        "then a temp file will be used to avoid permission issues",
        action='store_true')

    parser.add_argument('-k',
                        '--keep-wig',
                        help="If this flag is given, then "
                        "the wiggle file will not be deleted",
                        action='store_true')

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['wigToBigWig']
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    parallel.apply_parallel_iter(args.bam,
                                 args.num_cpus,
                                 bam_to_wiggle,
                                 args,
                                 progress_bar=True)
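The --normalize flag promises values in reads per million. A plausible sketch of the scaling step inside bam_to_wiggle; the helper name is hypothetical, and pysam's mapped count requires the bam file to be indexed:

import pysam

def get_rpm_scale(bam_file):
    with pysam.AlignmentFile(bam_file) as bam:
        n_mapped = bam.mapped  # number of mapped reads, read from the index

    # multiply each position's coverage by this factor for reads per million
    return 1.0e6 / n_mapped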
Example #12
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script trains a model to predict the runtime for a "
        "solver from an ASlib scenario using autosklearn. It assumes an "
        "\"outer\" cross-validation strategy, and it only trains a model for "
        "the indicated folds and solvers. It then writes the learned model to "
        "disk. It *does not* collect any statistics, make predictions ,etc.")

    parser.add_argument('scenario', help="The ASlib scenario")
    
    parser.add_argument('out', help="A template string for the filenames for "
        "the learned models. They are written with joblib.dump, so they need "
        "to be read back in with joblib.load. ${solver} and ${fold} are the "
        "template part of the string. It is probably necessary to surround "
        "this argument with single quotes in order to prevent shell "
        "replacement of the template parts.")

    parser.add_argument('--config', help="A (yaml) config file which specifies "
        "options controlling the learner behavior")

    parser.add_argument('--solvers', help="The solvers for which models will "
        "be learned. By default, models for all solvers are learned", 
        nargs='*', default=[])

    parser.add_argument('--folds', help="The outer-cv folds for which a model "
        "will be learned. By default, models for all folds are learned", 
        type=int, nargs='*', default=[])

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use "
        "for parallel solver/fold training", type=int, 
        default=default_num_cpus)
    
    parser.add_argument('--num-blas-threads', help="The number of threads to "
        "use for parallelizing BLAS. The total number of CPUs will be "
        "\"num_cpus * num_blas_cpus\". Currently, this flag only affects "
        "OpenBLAS and MKL.", type=int, default=default_num_blas_cpus)

    parser.add_argument('--do-not-update-env', help="By default, num-blas-threads "
        "requires that relevant environment variables are updated. Likewise, "
        "if num-cpus is greater than one, it is necessary to turn off python "
        "assertions due to an issue with multiprocessing. If this flag is "
        "present, then the script assumes those updates are already handled. "
        "Otherwise, the relevant environment variables are set, and a new "
        "process is spawned with this flag and otherwise the same "
        "arguments. This flag is not intended for external users.",
        action='store_true')

    automl_utils.add_automl_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # see which folds to run
    folds = args.folds
    if len(folds) == 0:
        folds = range(1, 11)

    for f in folds:
        math_utils.check_range(f, 1, 10, variable_name="fold")

    # and which solvers
    msg = "Reading ASlib scenario"
    logger.info(msg)
    scenario = ASlibScenario()
    scenario.read_scenario(args.scenario)

    # ensure the selected solver is present
    solvers = args.solvers
    if len(solvers) == 0:
        solvers = scenario.algorithms

    for solver in solvers:
        if solver not in scenario.algorithms:
            solver_str = ','.join(scenario.algorithms)
            msg = ("[train-auto-sklear]: the solver is not present in the "
                "ASlib scenario. given: {}. choices: {}".format(solver, 
                solver_str))
            raise ValueError(msg)

    if args.config is not None:
        msg = "Reading config file"
        logger.info(msg)
        config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    else:
        config = {}

    # everything is present, so update the environment variables and spawn a
    # new process, if necessary
    if not args.do_not_update_env:
        ###
        #
        # There is a lot going on with setting these environment variables.
        # please see the following references:
        #
        #   Turning off assertions so we can parallelize sklearn across
        #   multiple CPUs for different solvers/folds
        #       https://github.com/celery/celery/issues/1709
        #
        #   Controlling OpenBLAS threads
        #       https://github.com/automl/auto-sklearn/issues/166
        #
        #   Other environment variables controlling thread usage
        #       http://stackoverflow.com/questions/30791550
        #
        ###
        
        # we only need to turn off the assertions if we parallelize across cpus
        if args.num_cpus > 1:
            os.environ['PYTHONOPTIMIZE'] = "1"

        # openblas
        os.environ['OPENBLAS_NUM_THREADS'] = str(args.num_blas_threads)
        
        # mkl blas
        os.environ['MKL_NUM_THREADS'] = str(args.num_blas_threads)

        # other stuff from the SO post
        os.environ['OMP_NUM_THREADS'] = str(args.num_blas_threads)
        os.environ['NUMEXPR_NUM_THREADS'] = str(args.num_blas_threads)

        cmd = ' '.join(shlex.quote(a) for a in sys.argv)
        cmd += " --do-not-update-env"
        shell_utils.check_call(cmd)
        return

    msg = "Learning regressors"
    logger.info(msg)

    it = itertools.product(solvers, folds)
    regressors = parallel.apply_parallel_iter(
        it,
        args.num_cpus,
        _outer_cv,
        args,
        config,
        progress_bar=True
    )
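The environment-variable block above is the heart of the BLAS workaround: set the thread limits, then re-invoke the identical command line with a guard flag so the child process skips this branch. The same pattern, distilled into a reusable sketch (the helper name is made up, and the standard library stands in for shell_utils.check_call):

import os
import shlex
import subprocess
import sys

def respawn_with_env(extra_env, guard_flag="--do-not-update-env"):
    # set the thread-control variables before the child is spawned
    os.environ.update({key: str(val) for key, val in extra_env.items()})

    # rebuild the exact command line, plus the guard flag
    cmd = ' '.join(shlex.quote(a) for a in sys.argv)
    cmd += " " + guard_flag

    subprocess.check_call(cmd, shell=True)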
Example #13
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts all of the ORFs from the given transcripts. "
        "It writes the result as a bed12+1 file. The additional field, 'orf_len', gives "
        "the length of the respective ORF. It removes duplicate ORFs.\n\nN.B. The DEBUG "
        "output for this script is _very_ verbose. It is not recommended to run this "
        "script with that logging level.")

    parser.add_argument('transcripts_bed',
                        help="The bed12 file containing the "
                        "transcript information")

    parser.add_argument('transcripts_fasta',
                        help="The fasta file containing the "
                        "spliced transcript sequences")

    parser.add_argument('out', help="The output (bed12+1 gz) file")

    parser.add_argument('--start-codons',
                        help="A list of codons which will be "
                        "treated as start codons when extracting ORFs",
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help="A list of codons which will be "
                        "treated as stop codons when extracting ORFs",
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we wanted to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    transcripts_iter = (
        (get_transcript(transcript_header, transcripts_bed), transcript_sequence)
        for (transcript_header, transcript_sequence) in transcripts_fasta
    )

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)

    msg = "Removing duplicate ORFs"
    logger.info(msg)

    orfs = orfs.drop_duplicates(subset=DUPLICATE_FIELDS)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
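get_orfs itself is not included in this listing. The core of any such extraction is: for each start-codon match, take the first in-frame stop codon downstream. A self-contained sketch of that scan, which ignores the bed12 coordinate bookkeeping the real worker also performs:

import re

def find_orf_spans(sequence, start_codons_re, stop_codons_re):
    # group in-frame stop positions by reading frame; the canonical stop
    # codons (TAA, TAG, TGA) cannot overlap, so finditer finds them all
    stops = {0: [], 1: [], 2: []}
    for m in stop_codons_re.finditer(sequence):
        stops[m.start() % 3].append(m.start())

    for m in start_codons_re.finditer(sequence):
        start = m.start()
        # first stop codon in the same frame after this start codon
        downstream = [s for s in stops[start % 3] if s > start]
        if downstream:
            yield start, downstream[0] + 3  # the span includes the stop codon

For example, list(find_orf_spans("ATGAAATAA", re.compile("ATG"), re.compile("TAA|TAG|TGA"))) yields [(0, 9)].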
Example #14

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('--show-orf-periodicity', help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.", action='store_true')

    parser.add_argument('--show-read-length-bfs', help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.", action='store_true')

    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')

    parser.add_argument('--min-visualization-count', help="Read lengths with fewer than this "
        "number of reads will not be included in the report.", type=int, 
        default=default_min_visualization_count)

    parser.add_argument('--image-type', help="The type of image types to create. This "
        "must be an extension which matplotlib can interpret.", default=default_image_type)

    parser.add_argument('-c', '--create-fastqc-reports', help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.", action='store_true')
    
    parser.add_argument('--tmp', help="If the fastqc reports are created, "
        "they will use this location for temp files", default=default_tmp)
     
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.", default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    if args.note is not default_note:
        config['note'] = args.note
    note = config.get('note', None)
    
    sample_names = sorted(config['riboseq_samples'].keys())
    
    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])
        
    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)
   
    # first, create the read filtering information... 
    create_read_filtering_plots(args.config, config, args)
    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'], 
            name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean", 
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var", 
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)
    
        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out, read_filtering_caption, label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out, "Read length distributions", 
            label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out, ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=False, note=note, 
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'], name, is_unique=True, note=note, 
                image_type=args.image_type)

            
            msg = "Looking for image file: {}".format(read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out, read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(read_length_distribution_image)
                logger.warning(msg)
                
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)
            

            msg = "Looking for image file: {}".format(unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out, unique_read_length_distribution_image, width=0.45)
            else:
                msg = "Could not find image: {}".format(unique_read_length_distribution_image)
                logger.warning(msg)
            
                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

            

        latex.end_table(out)
        latex.clearpage(out)


        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0

            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))
    
            periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'], 
                name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())
    

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used
                
                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"
                
                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"
                
                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'], name, image_type=args.image_type, 
                    is_unique=is_unique, length=length, note=note)
                
                #title = ("length: {}. P-site offset: {}. \\newline status: {}"
                    #"\n".format(length, offset, offset_status))
                #latex.write(out, title, size="scriptsize")
                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(length, offset, offset_status))
                if args.show_read_length_bfs:
                    title = "\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)


                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'], name, image_type=args.image_type,
                        is_unique=is_unique, length=length, note=note)

                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)
            
            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = ("Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue
                
                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'], sample_name, length=lengths, offset=offsets, 
                    is_unique=is_unique, note=note, subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand, 
                            image_type=args.image_type)


                        msg = "Looking for image file: {}".format(orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out, orf_type_profile, height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(), 
            args.num_cpus, 
            create_fastqc_reports, config, args)
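create_fastqc_reports receives the same (sample_name, fastq_path) pairs as the other workers. A hypothetical sketch, reusing the shell_utils.check_call helper seen elsewhere in these examples; the output layout and the exact fastqc invocation are assumptions:

import os
import shlex

def create_fastqc_reports(name_data, config, args):
    name, fastq = name_data

    # a guessed layout; the real pipeline derives paths via filenames helpers
    out_dir = os.path.join(config['riboseq_data'], 'fastqc', name)
    os.makedirs(out_dir, exist_ok=True)

    # honor --tmp for fastqc's temp files when it is given
    tmp_str = "--dir {}".format(shlex.quote(args.tmp)) if args.tmp else ""

    cmd = "fastqc --outdir {} --extract {} {}".format(
        shlex.quote(out_dir), tmp_str, shlex.quote(fastq))
    shell_utils.check_call(cmd)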
Example #15
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script constructs the profile for each ORF. It "
        "first adjusts the mapped read positions to properly align with "
        "the P-sites. Second, it uses a custom chrom-sweep algorithm to "
        "find the coverage of each position in each exon of each ORF. Finally, "
        "the ORF exons are glued together to find the profile of the entire ORF."
    )

    parser.add_argument('bam',
                        help="The bam file including filtered (unique, "
                        "etc.) alignments")
    parser.add_argument('orfs', help="The (bed12) file containing the ORFs")
    parser.add_argument('exons', help="The (bed6+2) file containing the exons")
    parser.add_argument('out',
                        help="The (mtx.gz) output file containing the "
                        "ORF profiles")

    parser.add_argument(
        '-l',
        '--lengths',
        help="If any values are given, "
        "then only reads which have those lengths will be included in the "
        "signal construction.",
        type=int,
        default=default_lengths,
        nargs='*')
    parser.add_argument(
        '-o',
        '--offsets',
        help="The 5' end of reads will be "
        "shifted by this amount. There must be one offset value for each "
        "length (given by the --lengths argument.",
        type=int,
        default=default_offsets,
        nargs='*')

    parser.add_argument('-k',
                        '--num-exons',
                        help="If  k>0, then only the "
                        "first k exons will be processed.",
                        type=int,
                        default=default_num_exons)
    parser.add_argument(
        '-g',
        '--num-groups',
        help="The number of groups into "
        "which to split the exons. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the "
        "parallel calls.",
        type=int,
        default=default_num_groups)

    parser.add_argument('--seqname-prefix',
                        help="If present, this string "
                        "will be prepended to the seqname field of the ORFs.",
                        default=default_seqname_prefix)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[extract-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # make sure the number of lengths and offsets match
    if len(args.lengths) != len(args.offsets):
        msg = "The number of --lengths and --offsets do not match."
        raise ValueError(msg)

    # make sure the necessary files exist
    required_files = [args.bam, args.orfs, args.exons]
    msg = "[extract-orf-profiles]: Some input files were missing: "
    utils.check_files_exist(required_files, msg=msg)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Finding P-sites"
    logger.info(msg)

    p_sites = ribo_utils.get_p_sites(args.bam, args.lengths, args.offsets)

    msg = "Reading exons"
    logger.info(msg)
    exons = bed_utils.read_bed(args.exons)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bed_utils.read_bed(args.orfs)

    if len(args.seqname_prefix) > 0:
        orfs['seqname'] = args.seqname_prefix + orfs['seqname']
        exons['seqname'] = args.seqname_prefix + exons['seqname']

    if args.num_exons > 0:
        exons = exons.head(args.num_exons)

    num_orfs = orfs['orf_num'].max() + 1
    max_orf_len = orfs['orf_len'].max()

    msg = "Adding the ORF index to the exons"
    logger.info(msg)

    orf_fields = ['id', 'orf_num']
    exons_orfs = exons.merge(orfs[orf_fields], on='id')

    msg = "Splitting exons and P-sites"
    logger.info(msg)
    exon_groups = pandas_utils.split_df(exons_orfs, args.num_groups)

    exons_dfs = []
    psites_dfs = []

    for group_index, exon_group in exon_groups:
        # pull out only the p-sites that come from these chromosomes
        seqnames = set(exon_group['seqname'].unique())
        m_psites = p_sites['seqname'].isin(seqnames)

        exons_dfs.append(exon_group)
        psites_dfs.append(p_sites[m_psites])

    # we no longer need the full list of psites
    del p_sites
    del exons_orfs
    del exon_groups
    del exons
    gc.collect()
    exons_psites = zip(exons_dfs, psites_dfs)

    msg = "Finding all P-site intersections"
    logger.info(msg)

    sum_profiles = parallel.apply_parallel_iter(exons_psites,
                                                args.num_cpus,
                                                get_all_p_site_intersections,
                                                num_orfs,
                                                max_orf_len,
                                                progress_bar=True,
                                                total=args.num_groups)

    msg = "Combining the ORF profiles into one matrix"
    logger.info(msg)

    sum_profiles = functools.reduce(lambda x, y: x + y, sum_profiles)
    sum_profiles_lil = sum_profiles.tolil()

    msg = "Flipping the reverse strand profiles"
    logger.info(msg)

    m_reverse = orfs['strand'] == '-'
    reverse_orfs = orfs[m_reverse]

    for idx, reverse_orf in tqdm.tqdm(reverse_orfs.iterrows()):
        orf_num = reverse_orf['orf_num']

        if sum_profiles[orf_num].sum() == 0:
            continue

        orf_len = reverse_orf['orf_len']
        dense = utils.to_dense(sum_profiles, orf_num, length=orf_len)
        dense = dense[::-1]
        sum_profiles_lil[orf_num, :orf_len] = dense

    msg = "Writing the sparse matrix to disk"
    logger.info(msg)
    math_utils.write_sparse_matrix(args.out, sum_profiles_lil)
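The worker here, get_all_p_site_intersections, receives one (exon_group, p_sites) pair and must return something that supports "+", since the results are reduced into a single matrix above. A heavily simplified sketch, assuming the bed6+2 exons carry an orf_num and a transcript-relative start for each exon; the real implementation uses a chrom-sweep over sorted intervals rather than this naive filter:

import scipy.sparse

def get_all_p_site_intersections(exons_psites, num_orfs, max_orf_len):
    exons, p_sites = exons_psites
    profiles = scipy.sparse.lil_matrix((num_orfs, max_orf_len))

    for exon in exons.itertuples():
        # P-sites that fall within this exon (the column names are assumptions)
        m = ((p_sites['seqname'] == exon.seqname) &
             (p_sites['start'] >= exon.start) &
             (p_sites['start'] < exon.end))

        for pos in p_sites.loc[m, 'start']:
            # map the genomic position to its offset within the ORF profile
            rel = exon.transcript_start + (pos - exon.start)
            profiles[exon.orf_num, rel] += 1

    # csr matrices support "+", which the reduce step relies on
    return profiles.tocsr()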
Example #16

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text."
    )
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument(
        '--show-read-length-bfs',
        help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.",
        action='store_true')

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument('--min-visualization-count',
                        help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int,
                        default=default_min_visualization_count)

    parser.add_argument('--image-type',
                        help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)

    parser.add_argument(
        '-c',
        '--create-fastqc-reports',
        help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.",
        action='store_true')

    parser.add_argument('--tmp',
                        help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=default_tmp)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    if args.note is not default_note:
        config['note'] = args.note

    note = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts',
        'samtools',
        'visualize-read-filtering-counts',
        'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information
    create_read_filtering_plots(args.config, config, args)

    min_metagene_profile_count = config.get(
        "min_metagene_profile_count", default_min_metagene_profile_count)

    min_metagene_profile_bayes_factor_mean = config.get(
        "min_metagene_profile_bayes_factor_mean",
        default_min_metagene_profile_bayes_factor_mean)

    max_metagene_profile_bayes_factor_var = config.get(
        "max_metagene_profile_bayes_factor_var",
        default_max_metagene_profile_bayes_factor_var)

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    sample_names = sorted(config['riboseq_samples'].keys())

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures

        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, height=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, height=0.45)
        latex.write_caption(out,
                            read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out,
                      "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out,
                           ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=False,
                note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=True,
                note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(
                read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out,
                                     read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out,
                                     unique_read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0

            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            create_figures(args.config, config, name, offsets_df, args)

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if length_row['highest_peak_bf_mean'] < min_metagene_profile_bayes_factor_mean:
                    offset_status = "BF mean too small"

                if length_row['highest_peak_bf_var'] > max_metagene_profile_bayes_factor_var:
                    offset_status = "BF variance too high"

                if length_row['highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row['highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'],
                    name,
                    image_type=args.image_type,
                    is_unique=is_unique,
                    length=length,
                    note=note)

                title = ("length: {}. P-site offset: {}. \\newline status: {}"
                         "\n".format(length, offset, offset_status))
                latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'],
                        name,
                        image_type=args.image_type,
                        is_unique=is_unique,
                        length=length,
                        note=note)

                    latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config, sample_name, is_unique=is_unique)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)