def create_read_filtering_plots(config_file, config, args):
    """Tabulate the read filtering counts and draw the two summary plots.

    Three external commands are issued through
    ``shell_utils.call_if_not_exists``: one which writes the counts file, and
    two which visualize it (once as it is, once with ``--without-rrna``).

    Args:
        config_file: path of the configuration file; it is forwarded to the
            commands and is the input dependency of the counting step.
        config: the loaded configuration; ``riboseq_data`` is required and
            ``note`` is optional.
        args: parsed command line options; ``overwrite``, ``num_cpus`` and
            ``image_type`` are read here, and the logging options are
            extracted with ``logging_utils``.
    """
    note = config.get('note', None)
    riboseq_data = config['riboseq_data']

    # step 1: the table with the filtering counts
    counts_file = filenames.get_riboseq_read_filtering_counts(riboseq_data,
                                                              note=note)

    overwrite_flag = "--overwrite" if args.overwrite else ""
    logging_flags = logging_utils.get_logging_options_string(args)
    cpus_flag = "--num-cpus {}".format(args.num_cpus)

    count_cmd = "get-all-read-filtering-counts {} {} {} {} {}".format(
        config_file, counts_file, overwrite_flag, cpus_flag, logging_flags)
    shell_utils.call_if_not_exists(count_cmd, [counts_file],
                                   in_files=[config_file],
                                   overwrite=args.overwrite,
                                   call=True)

    # step 2: one image with every category, one without the rrna matches.
    # Each entry is (note used for the image name, plot title, command).
    # NOTE(review): if no note is configured, the second image is named with
    # the literal "no-rrna-None".
    plot_specs = (
        (note,
         "Read filtering counts",
         "visualize-read-filtering-counts {} {} {} --config {}"),
        ("no-rrna-{}".format(note),
         "Read filtering counts, no ribosomal matches",
         "visualize-read-filtering-counts {} {} {} --config {} --without-rrna"),
    )

    for image_note, title, template in plot_specs:
        image = filenames.get_riboseq_read_filtering_counts_image(
            riboseq_data, note=image_note, image_type=args.image_type)

        # the title contains spaces, so it must be quoted for the shell
        title_flag = "--title {}".format(shlex.quote(title))
        plot_cmd = template.format(counts_file, image, title_flag, config_file)

        shell_utils.call_if_not_exists(plot_cmd, [image],
                                       in_files=[counts_file],
                                       overwrite=args.overwrite,
                                       call=True)
def create_fastqc_reports(name_data, config, args):
    """Create the fastqc reports for every processing stage of one sample.

    fastqc is run (through ``shell_utils.call_if_not_exists``) on the raw
    data, the adapter-trimmed reads, the reads with and without rRNA matches,
    the genome alignments and the alignments selected by ``is_unique``.
    Afterwards, a warning is logged which lists every expected report that is
    not present on disk.

    Nothing is executed unless ``args.create_fastqc_reports`` is set.

    Args:
        name_data: a ``(name, raw data file)`` pair for the sample.
        config: the loaded configuration; ``riboseq_data`` is required,
            ``note`` and ``keep_riboseq_multimappers`` are optional.
        args: parsed command line options; ``create_fastqc_reports``,
            ``tmp`` and ``overwrite`` are read here.
    """
    name, data = name_data
    logger.info("{}: creating fastqc reports".format(name))

    note = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_data = config['riboseq_data']

    # first, the files which will be inspected
    raw_data = data
    without_adapters = filenames.get_without_adapters_fastq(riboseq_data,
                                                            name,
                                                            note=note)
    with_rrna = filenames.get_with_rrna_fastq(riboseq_data, name, note=note)
    without_rrna = filenames.get_without_rrna_fastq(riboseq_data,
                                                    name,
                                                    note=note)
    genome_bam = filenames.get_riboseq_bam(riboseq_data, name, note=note)
    unique_bam = filenames.get_riboseq_bam(riboseq_data,
                                           name,
                                           is_unique=is_unique,
                                           note=note)

    if not args.create_fastqc_reports:
        return

    # now, the report expected for each of those files
    raw_data_fastqc = filenames.get_raw_data_fastqc_data(riboseq_data,
                                                         raw_data)
    without_adapters_fastqc = filenames.get_without_adapters_fastqc_data(
        riboseq_data, name, note=note)
    with_rrna_fastqc = filenames.get_with_rrna_fastqc_data(riboseq_data,
                                                           name,
                                                           note=note)
    without_rrna_fastqc = filenames.get_without_rrna_fastqc_data(
        riboseq_data, name, note=note)
    genome_bam_fastqc = filenames.get_riboseq_bam_fastqc_data(riboseq_data,
                                                              name,
                                                              note=note)
    unique_bam_fastqc = filenames.get_riboseq_bam_fastqc_data(
        riboseq_data, name, is_unique=is_unique, note=note)

    # and the location given to fastqc as --outdir for each kind of report
    raw_data_fastqc_path = filenames.get_raw_data_fastqc_path(riboseq_data)
    without_adapters_fastqc_path = filenames.get_without_adapters_fastqc(
        riboseq_data)
    with_rrna_fastqc_path = filenames.get_with_rrna_fastqc(riboseq_data)
    without_rrna_fastqc_path = filenames.get_without_rrna_fastqc(riboseq_data)
    without_rrna_mapping_fastqc_path = filenames.get_riboseq_bam_fastqc_path(
        riboseq_data)

    fastqc_tmp_str = ""
    if args.tmp is not None:
        fastqc_tmp_str = "--dir {}".format(args.tmp)

    # one entry per fastqc run:
    #   (input file, output directory, expected report, debug messages)
    jobs = [
        (raw_data, raw_data_fastqc_path, raw_data_fastqc,
         ["Looking for raw data fastqc report: '{}'".format(raw_data_fastqc)]),
        (without_adapters, without_adapters_fastqc_path,
         without_adapters_fastqc, []),
        (with_rrna, with_rrna_fastqc_path, with_rrna_fastqc, []),
        (without_rrna, without_rrna_fastqc_path, without_rrna_fastqc, []),
        (genome_bam, without_rrna_mapping_fastqc_path, genome_bam_fastqc,
         ["genome_bam: '{}'".format(genome_bam),
          "genome_bam_fastqc: '{}'".format(genome_bam_fastqc)]),
        (unique_bam, without_rrna_mapping_fastqc_path, unique_bam_fastqc, []),
    ]

    # create the fastqc reports if they do not exist
    for in_file, out_dir, report, debug_msgs in jobs:
        for debug_msg in debug_msgs:
            logger.debug(debug_msg)

        cmd = "fastqc --outdir {} --extract {} {}".format(
            out_dir, in_file, fastqc_tmp_str)
        shell_utils.call_if_not_exists(cmd, [report],
                                       in_files=[in_file],
                                       overwrite=args.overwrite)

    # in some cases, fastqc can fail. make sure all of the reports are
    # present. The list is derived from the jobs, so every report which was
    # requested above (including the one for the reads with rRNA matches) is
    # verified.
    missing_files = [
        report for _, _, report, _ in jobs if not os.path.exists(report)
    ]

    if missing_files:
        msg = "The following fastqc reports were not created correctly:\n"
        msg += '\n'.join(missing_files)
        logger.warning(msg)
def create_figures(config_file, config, name, offsets_df, args):
    """ Create all of the figures of the preprocessing report for one
        dataset.

        In order: the read length distribution (table and two plots), one
        metagene profile plot per read length with enough reads (optionally
        with the matching Bayes' factor plot), and, if requested, the ORF-type
        metagene profiles.

        Args:
            config_file: not used by this function
            config: the loaded configuration
            name: the name of the dataset
            offsets_df: a data frame with (at least) the columns "length" and
                "highest_peak_profile_sum"
            args: parsed command line options; image_type, overwrite,
                min_visualization_count, show_read_length_bfs and
                show_orf_periodicity are read here
    """

    def run(cmd, inputs, outputs):
        # helper for the commands below which are explicitly called
        shell_utils.call_if_not_exists(cmd,
                                       outputs,
                                       in_files=inputs,
                                       overwrite=args.overwrite,
                                       call=True)

    logging_str = logging_utils.get_logging_options_string(args)
    note = config.get('note', None)
    note_str = filenames.get_note_string(note)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    image_type_str = "--image-type {}".format(args.image_type)

    shortest = int(offsets_df['length'].min())
    longest = int(offsets_df['length'].max())
    min_read_length_str = "--min-read-length {}".format(shortest)
    max_read_length_str = "--max-read-length {}".format(longest)

    logger.info(
        "{}: Getting and visualizing read length distribution".format(name))

    riboseq_data = config['riboseq_data']

    # all aligned reads, and the uniquely aligned reads
    genome_bam = filenames.get_riboseq_bam(riboseq_data, name, note=note)
    unique_bam = filenames.get_riboseq_bam(riboseq_data,
                                           name,
                                           is_unique=is_unique,
                                           note=note)

    # the read length counts
    length_distribution = filenames.get_riboseq_read_length_distribution(
        riboseq_data, name, note=note)

    distribution_cmd = "get-read-length-distribution {} {} --out {} {}".format(
        genome_bam, unique_bam, length_distribution, logging_str)
    run(distribution_cmd, [genome_bam, unique_bam], [length_distribution])

    # the title shown in the plots: the mapped sample name if there is one,
    # otherwise the dataset name followed by the note
    if 'riboseq_sample_name_map' in config:
        title = config['riboseq_sample_name_map'].get(name)
    else:
        title = None

    if title is None:
        title = "{}{}".format(name, note_str)

    # the two read length plots. Each entry is:
    #   (title template, is_unique for the image name,
    #    flag for the sample name looked up in the distribution file)
    distribution_plots = (
        ("{}, All aligned reads", False, False),
        ("{}, Uniquely aligned reads", is_unique, True),
    )

    for title_template, image_is_unique, name_is_unique in distribution_plots:
        title_str = "--title={}".format(
            shlex.quote(title_template.format(title)))

        # the basename for the distribution file
        sample_name = "{}{}{}".format(
            name, note_str, filenames.get_unique_string(name_is_unique))

        image = filenames.get_riboseq_read_length_distribution_image(
            riboseq_data,
            name,
            is_unique=image_is_unique,
            note=note,
            image_type=args.image_type)

        plot_cmd = "plot-read-length-distribution {} {} {} {} {} {}".format(
            length_distribution, sample_name, image, title_str,
            min_read_length_str, max_read_length_str)
        run(plot_cmd, [length_distribution], [image])

    # visualize the metagene profiles
    logger.info(
        "{}: Visualizing metagene profiles and Bayes' factors".format(name))

    metagene_profiles = filenames.get_metagene_profiles(riboseq_data,
                                                        name,
                                                        is_unique=is_unique,
                                                        note=note)

    profile_bayes_factor = filenames.get_metagene_profiles_bayes_factors(
        riboseq_data, name, is_unique=is_unique, note=note)

    # NOTE(review): the parsed frame is never used; the read is kept because
    # it raises here if the profiles cannot be read.
    pd.read_csv(metagene_profiles)

    for length in range(shortest, longest + 1):

        # make sure we had some reads of that length
        rows_for_length = offsets_df[offsets_df['length'] == length]
        if len(rows_for_length) == 0:
            continue
        length_row = rows_for_length.iloc[0]

        # make sure we have enough reads to visualize
        peak_sum = length_row['highest_peak_profile_sum']
        if peak_sum < args.min_visualization_count:
            continue

        # the metagene profile for this read length
        profile_image = filenames.get_metagene_profile_image(
            riboseq_data,
            name,
            image_type=args.image_type,
            is_unique=is_unique,
            length=length,
            note=note)

        title_str = "--title {}".format(
            shlex.quote("{}. length: {}".format(title, length)))
        profile_cmd = "create-read-length-metagene-profile-plot {} {} {} {}".format(
            metagene_profiles, length, profile_image, title_str)
        run(profile_cmd, [metagene_profiles], [profile_image])

        if not args.show_read_length_bfs:
            continue

        # and the Bayes' factor
        bf_image = filenames.get_metagene_profile_bayes_factor_image(
            riboseq_data,
            name,
            image_type=args.image_type,
            is_unique=is_unique,
            length=length,
            note=note)

        bf_title = "Metagene profile Bayes' factors: {}. length: {}".format(
            title, length)
        title_str = "--title {}".format(shlex.quote(bf_title))
        fontsize_str = "--font-size 15"

        bf_cmd = "visualize-metagene-profile-bayes-factor {} {} {} {} {}".format(
            profile_bayes_factor, length, bf_image, title_str, fontsize_str)
        run(bf_cmd, [profile_bayes_factor], [bf_image])

    # the orf-type metagene profiles
    if not args.show_orf_periodicity:
        return

    logger.info("{}: Visualizing the ORF type metagene profiles".format(title))

    try:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config,
            name,
            is_unique=is_unique,
            default_params=metagene_options)
    except FileNotFoundError:
        logger.error(
            "Could not parse out lengths and offsets for sample: {}. Skipping".
            format(name))
        return

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    profiles = filenames.get_riboseq_profiles(riboseq_data,
                                              name,
                                              length=lengths,
                                              offset=offsets,
                                              is_unique=is_unique,
                                              note=note)

    title_str = "--title {}".format(
        shlex.quote("{}, ORF-type periodicity".format(title)))

    orf_type_profile_base = filenames.get_orf_type_profile_base(
        riboseq_data,
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note,
        subfolder='orf-profiles')

    # the expected images: every orf type on the forward strand, followed by
    # every orf type on the reverse strand
    orf_type_images = [
        filenames.get_orf_type_profile_image(orf_type_profile_base, orf_type,
                                             strand, args.image_type)
        for strand in ("+", "-")
        for orf_type in ribo_utils.orf_types
    ]

    orf_cmd = "visualize-orf-type-metagene-profiles {} {} {} {} {} {}".format(
        orfs_genomic, profiles, orf_type_profile_base, title_str,
        image_type_str, logging_str)

    # this call relies on the default of the "call" argument
    shell_utils.call_if_not_exists(orf_cmd,
                                   orf_type_images,
                                   in_files=[orfs_genomic, profiles],
                                   overwrite=args.overwrite)
def main():
    """Command line entry point which creates the base genome profile.

    The raw reads go through the following steps, each of them issued with
    ``shell_utils.call_if_not_exists`` so that existing output is reused
    unless ``--overwrite`` is given:

    0. flexbar removes the adapter sequences
    1. bowtie2 separates the reads which align to the ribosomal index
    2. STAR aligns the remaining reads to the genome; its output is symlinked
       to the name expected by the rest of the pipeline and indexed with
       samtools
    3. unless ``keep_riboseq_multimappers`` is in the config, the
       multimapping reads are removed

    Raises:
        KeyError: presumably raised by ``utils.check_keys_exist`` if a
            required configuration key is missing (its behavior is defined
            outside of this function).
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="Creates base genome profile.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('-t', '--tmp', help="""The location for temporary files. If not
        specified, program-specific temp locations are used.""", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # close the configuration file as soon as it has been parsed.
    # NOTE(review): FullLoader is not meant for untrusted documents; this
    # assumes that the configuration file is written by the user.
    with open(args.config) as config_handle:
        config = yaml.load(config_handle, Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    # 'star_index' is read in Step 2; checking it here reports a missing key
    # before flexbar and bowtie2 are run rather than after them
    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'star_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    call = not args.do_not_call

    # nothing is deleted if the commands are not actually called
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # Step 0: Running flexbar to remove adapter sequences

    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(config['riboseq_data'],
                                                         args.name,
                                                         note=note)
    without_adapters = filenames.get_without_adapters_fastq(config['riboseq_data'],
                                                            args.name,
                                                            note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence', 'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file', 'adapters')

    # get all options, command line options override defaults
    flexbar_option_str = pgrm_utils.get_final_args(flexbar_options, args.flexbar_options)

    cmd = "flexbar -r {} -t {} {} {} {} -n {}".format(raw_data,
                                                      flexbar_target,
                                                      adapter_seq_str,
                                                      adapter_file_str,
                                                      flexbar_option_str,
                                                      args.num_cpus)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {
        without_adapters: fastx_utils.check_fastq_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments

    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name,
                                                    note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name,
                                              note=note)

    # --un-gz receives the reads which did not align to the ribosomal index,
    # --al-gz the ones which did
    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus,
        config['ribosomal_index'],
        without_adapters,
        out,
        without_rrna,
        with_rrna)

    in_files = [without_adapters]
    in_files.extend(pgrm_utils.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {
        without_rrna: fastx_utils.check_fastq_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files, to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome

    star_output_prefix = filenames.get_riboseq_bam_base(config['riboseq_data'],
                                                        args.name,
                                                        note=note)
    genome_star_bam = "{}{}".format(star_output_prefix, "Aligned.sortedByCoord.out.bam")

    # get all options, command line options override defaults

    mem_bytes = utils.human2bytes(args.mem)
    star_options['limitBAMsortRAM'] = mem_bytes

    if args.tmp is not None:
        star_tmp_name = str(args.name + "_STARtmp")
        star_tmp_dir = pgrm_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_options['outTmpDir'] = star_tmp_dir

    star_option_str = pgrm_utils.get_final_args(star_options, args.star_options)

    # If GFF3 specs, then we need to inform STAR.
    # Whether we have de novo or not, the format of "config['gtf']" has precedence.
    sjdb_gtf_tag_str = ""
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)
    if use_gff3_specs:
        sjdb_gtf_tag_str = "--sjdbGTFtagExonParentTranscript Parent"

    cmd = ("{} --runThreadN {} --genomeDir {} --sjdbGTFfile {} {} --readFilesIn {} "
        "{} --outFileNamePrefix {}".format(args.star_executable,
                                                 args.num_cpus,
                                                 config['star_index'],
                                                 gtf_file,
                                                 sjdb_gtf_tag_str,
                                                 without_rrna,
                                                 star_option_str,
                                                 star_output_prefix))
    in_files = [without_rrna]
    in_files.extend(pgrm_utils.get_star_index_files(config['star_index']))
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {
        genome_star_bam: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite,
                                   call=call, keep_delete_files=keep_delete_files, to_delete=to_delete)

    # now, we need to symlink the (genome) STAR output to that expected by the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name,
                                                  note=note)

    if os.path.exists(genome_star_bam):
        shell_utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the bamtools index
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                       args.name,
                                                       is_unique=True,
                                                       note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(genome_sorted_bam,
                                                      unique_genome_filename,
                                                      tmp_str)

    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {
        unique_genome_filename: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite,
                                   call=call, keep_delete_files=keep_delete_files, to_delete=to_delete)
示例#5
0
def get_orfs(gtf, args, config, is_annotated=False, is_de_novo=False):
    """ Process a GTF file into its labeled ORFs.

        Five commands are chained, each one through
        shell_utils.call_if_not_exists: the GTF is converted to BED12, the
        transcript sequences are extracted, the ORF coordinates are
        extracted, the ORFs are split into exons, and the ORFs are labeled
        with respect to the annotated transcripts.

        Args:
            gtf: the GTF file to process
            args: parsed command line options; do_not_call, num_cpus and
                overwrite are read here, together with the logging options
            config: the loaded configuration
            is_annotated, is_de_novo: forwarded to the filenames helpers; in
                addition, is_de_novo adds the "novel" options for label-orfs
    """

    def run_step(cmd, inputs, outputs):
        # all of the steps share the overwrite and call settings
        shell_utils.call_if_not_exists(cmd,
                                       outputs,
                                       in_files=inputs,
                                       overwrite=args.overwrite,
                                       call=not args.do_not_call)

    chr_name_str = "--chr-name-file {}".format(
        os.path.join(config['star_index'], 'chrName.txt'))

    logging_str = logging_utils.get_logging_options_string(args)
    cpus_str = "--num-cpus {}".format(args.num_cpus)

    genome_base_path = config['genome_base_path']
    genome_name = config['genome_name']

    # the keyword arguments shared by the filenames helpers below
    kind = dict(is_annotated=is_annotated, is_de_novo=is_de_novo)
    noted_kind = dict(kind, note=config.get('orf_note'))

    # step 1: a BED12 file with the transcripts
    transcript_bed = filenames.get_bed(genome_base_path,
                                       genome_name,
                                       is_merged=False,
                                       **kind)

    bed_cmd = "gtf-to-bed12 {} {} {} {} {}".format(
        gtf, transcript_bed, chr_name_str, cpus_str, logging_str)
    run_step(bed_cmd, [gtf], [transcript_bed])

    # step 2: the transcript sequences
    transcript_fasta = filenames.get_transcript_fasta(genome_base_path,
                                                      genome_name,
                                                      **kind)

    fasta_cmd = "extract-bed-sequences {} {} {} {}".format(
        transcript_bed, config['fasta'], transcript_fasta, logging_str)
    run_step(fasta_cmd, [transcript_bed, config['fasta']], [transcript_fasta])

    # step 3: the ORFs of the transcripts, in genomic coordinates
    orfs_genomic = filenames.get_orfs(genome_base_path, genome_name,
                                      **noted_kind)

    start_codons_str = utils.get_config_argument(config,
                                                 'start_codons',
                                                 default=default_start_codons)
    stop_codons_str = utils.get_config_argument(config,
                                                'stop_codons',
                                                default=default_stop_codons)

    orfs_cmd = "extract-orf-coordinates {} {} {} {} {} {} {}".format(
        transcript_bed, transcript_fasta, orfs_genomic, cpus_str,
        start_codons_str, stop_codons_str, logging_str)
    run_step(orfs_cmd, [transcript_fasta, transcript_bed], [orfs_genomic])

    # step 4: the ORF exons, used to label the ORFs
    exons_file = filenames.get_exons(genome_base_path, genome_name,
                                     **noted_kind)

    exons_cmd = "split-bed12-blocks {} {} --num-cpus {} {}".format(
        orfs_genomic, exons_file, args.num_cpus, logging_str)
    run_step(exons_cmd, [orfs_genomic], [exons_file])

    # step 5: the labels. The reference is always the annotated BED file,
    # whatever kind of GTF file is being processed.
    labeled_orfs = filenames.get_labels(genome_base_path, genome_name,
                                        **noted_kind)

    annotated_bed = filenames.get_bed(genome_base_path,
                                      genome_name,
                                      is_merged=False,
                                      is_annotated=True)

    orf_exons_str = '--orf-exons {}'.format(exons_file)

    de_novo_str = (
        '--label-prefix "novel_" --filter --nonoverlapping-label "novel"'
        if is_de_novo else "")

    label_cmd = "label-orfs {} {} {} {} {} {} {}".format(
        annotated_bed, orfs_genomic, labeled_orfs, orf_exons_str, de_novo_str,
        logging_str, cpus_str)

    #  ** this function overwrites the input file `orfs_genomic`
    run_step(label_cmd, [annotated_bed, orfs_genomic, exons_file],
             [labeled_orfs])
示例#6
0
def main():
    """Prepare the indices and ORF annotation files for the Rp-Bp pipeline.

    The function parses the command line, loads the YAML configuration and
    verifies that the required programs, configuration keys and input files
    are available. If ``args.use_slurm`` is set, the command line is handed
    to ``slurm.check_sbatch`` and nothing else is done. Otherwise it:

    * builds the ribosomal index with ``bowtie2-build-s``;
    * builds the genome index with the STAR executable;
    * calls ``get_orfs`` for the reference annotations (``config['gtf']``);
    * if ``de_novo_gtf`` is in the configuration, calls ``get_orfs`` for the
      de novo annotations, concatenates the annotated and de novo ORF, exon
      and label files, and concatenates both annotation files for STAR
      (or symlinks the reference annotations when one file ends with
      ``gff`` and the other does not);
    * otherwise symlinks the annotated files (when they exist) to the file
      names without the annotated/de novo qualifiers.

    Commands run through ``shell_utils.call_if_not_exists`` receive
    ``call = not args.do_not_call`` and ``overwrite = args.overwrite``.

    Returns:
        None
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching 
        annotations, including labelled ORFs, for use with the Rp-Bp periodicity estimation 
        and ORF translation prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite',
                        help='''If this flag is present, existing files
        will be overwritten.''',
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # use a context manager so that the configuration file handle is closed
    # NOTE(review): FullLoader is not meant for untrusted input; the
    # configuration file is assumed to be supplied by the user running this.
    with open(args.config) as config_handle:
        config = yaml.load(config_handle, Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # these values are used to build every file name below
    genome_base_path = config['genome_base_path']
    genome_name = config['genome_name']
    orf_note = config.get('orf_note')

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline; for each kind of file
    # there is an "annotated" variant and a variant without qualifiers
    annotated_orfs = filenames.get_orfs(genome_base_path,
                                        genome_name,
                                        note=orf_note,
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(genome_base_path,
                                      genome_name,
                                      note=orf_note)

    annotated_exons_file = filenames.get_exons(genome_base_path,
                                               genome_name,
                                               note=orf_note,
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(genome_base_path,
                                     genome_name,
                                     note=orf_note)

    annotated_labeled_orfs = filenames.get_labels(genome_base_path,
                                                  genome_name,
                                                  note=orf_note,
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(genome_base_path,
                                        genome_name,
                                        note=orf_note)

    # the format is decided from the file name suffix only
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(genome_base_path,
                                 genome_name,
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(genome_base_path,
                                          genome_name,
                                          note=orf_note,
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            # renumber the ORFs over the concatenated (sorted) records
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(genome_base_path,
                                                 genome_name,
                                                 note=orf_note,
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(genome_base_path,
                                                    genome_name,
                                                    note=orf_note,
                                                    is_annotated=False,
                                                    is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str))
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files,
                                                     sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if config['de_novo_gtf'].endswith('gff') == use_gff3_specs:
            # awk drops the lines starting with '#' from both files
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                   config['de_novo_gtf'],
                                                   gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite,
                                           call=call)
        else:
            # the two literals are joined by the parser, so the separating
            # space has to be part of the first one
            msg = (
                "Skipping concatenation due to mismatch in format specifications (GTF2/GFF3) "
                "for reference and de novo annotations. Symlink to reference annotations created."
            )
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files
        symlinks = [
            (annotated_orfs, orfs_genomic),
            (annotated_exons_file, exons_file),
            (annotated_labeled_orfs, labeled_orfs),
            (config['gtf'], gtf_file),
        ]
        for source_file, link_name in symlinks:
            # the sources may be missing, e.g. when nothing was called
            if os.path.exists(source_file):
                shell_utils.create_symlink(source_file, link_name, call)
示例#7
0
def main():
    """Run the steps that produce the ORF profiles for one Ribo-seq sample.

    The function parses the command line, loads the YAML configuration,
    checks that the required programs and configuration keys are available
    and then builds and dispatches the following commands through
    ``shell_utils.call_if_not_exists``:

    1. ``create-base-genome-profile`` (always invoked; ``--do-not-call`` is
       forwarded to it instead of suppressing the call);
    2. ``extract-metagene-profiles``;
    3. ``estimate-metagene-profile-bayes-factors``;
    4. ``select-periodic-offsets``;
    5. ``extract-orf-profiles``.

    Steps 2-5 are called with ``call = not args.do_not_call``. If
    ``ribo_utils.get_periodic_lengths_and_offsets`` returns no lengths, a
    critical message is logged and the function returns before step 5.

    Returns:
        None
    """
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script runs all of the processing necessary to 
        produce the signals used for ORF translation prediction. In particular, it creates the 
        metagene profiles, selected the periodic fragments and generate the ORF profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files 
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. 
        This feature is implemented piecemeal. If the --do-not-call flag is given, 
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    # use a context manager so that the configuration file handle is closed
    # NOTE(review): FullLoader is not meant for untrusted input; the
    # configuration file is assumed to be supplied by the user running this.
    with open(args.config) as config_handle:
        config = yaml.load(config_handle, Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we do call the preprocessing script,
    # but that it does not run anything
    call = not args.do_not_call
    do_not_call_argument = "" if call else "--do-not-call"
    overwrite_argument = "--overwrite" if args.overwrite else ""
    keep_intermediate_str = (
        "--keep-intermediate-files" if args.keep_intermediate_files else "")

    # quote the path like --mem below, so that a location containing spaces
    # or shell metacharacters is passed on as a single argument
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers; only the presence of the key
    # is tested, not its value
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data,
        args.config,
        args.name,
        args.num_cpus,
        do_not_call_argument,
        overwrite_argument,
        logging_str,
        star_str,
        tmp_str,
        flexbar_str,
        keep_intermediate_str,
        mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available,
    # we still want to call the base pipeline, which is why the raw data
    # file is not listed as an input file.
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles

    start_upstream_str = utils.get_config_argument(
        config,
        'metagene_start_upstream',
        'start-upstream',
        default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(
        config,
        'metagene_start_downstream',
        'start-downstream',
        default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(
        config,
        'metagene_end_upstream',
        'end-upstream',
        default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(
        config,
        'metagene_end_downstream',
        'end-downstream',
        default=metagene_options['metagene_end_downstream'])

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename,
        transcript_bed,
        metagene_profiles,
        args.num_cpus,
        logging_str,
        start_upstream_str,
        start_downstream_str,
        end_upstream_str,
        end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'],
        args.name,
        is_unique=is_unique,
        note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')

    periodic_models_str = "--periodic-models {}".format(
        ' '.join(periodic_models))
    non_periodic_models_str = "--nonperiodic-models {}".format(
        ' '.join(non_periodic_models))

    periodic_offset_start_str = utils.get_config_argument(
        config,
        'periodic_offset_start',
        default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(
        config,
        'periodic_offset_end',
        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(
        config,
        'metagene_profile_length',
        default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(config,
                                         'seed',
                                         default=metagene_options['seed'])
    chains_str = utils.get_config_argument(config,
                                           'chains',
                                           default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(
        config,
        'metagene_iterations',
        'iterations',
        default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)

    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
        config,
        args.name,
        args.do_not_call,
        is_unique=is_unique,
        default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    # the elements are joined directly, so they are expected to be strings
    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')

    # extract the riboseq profiles for each orf; the alignments are read from
    # the bam file computed above (same arguments, so the name is reused)
    unique_filename = riboseq_bam_filename

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
        unique_filename,
        orfs_genomic,
        exons_file,
        profiles_filename,
        lengths_str,
        offsets_str,
        logging_str,
        seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
示例#8
0
def _create_figures(name_pretty_name_is_replicate, config, args):
    """ This function creates all of the figures in the prediction report
        for the given dataset.
    """
    name, pretty_name, is_replicate = name_pretty_name_is_replicate

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]

    logging_str = logging_utils.get_logging_options_string(args)

    note_str = config.get('note', None)
    out_note_str = config.get('note', None)
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    image_type_str = "--image-type {}".format(args.image_type)
    num_cpus_str = "--num-cpus {}".format(args.num_cpus)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # if this is a replicate, we do not worry about lengths and offsets
    if is_replicate:
        lengths = None
        offsets = None
    else:
        try:
            lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                config,
                name,
                is_unique=is_unique,
                default_params=metagene_options)
        except FileNotFoundError:
            msg = ("Could not parse out lengths and offsets for sample: {}. "
                   "Skipping".format(name))
            logger.error(msg)
            return

    unsmoothed_profiles = filenames.get_riboseq_profiles(
        config['riboseq_data'],
        name,
        length=lengths,
        offset=offsets,
        is_unique=is_unique,
        note=note_str,
        is_smooth=False)

    msg = "{}: creating the ORF types bar charts".format(name)
    logger.debug(msg)

    it = itertools.product(grouped_values, chisq_values, filtered_values)

    for is_grouped, is_chisq, is_filtered in it:

        is_grouped_str = ""
        if is_grouped:
            is_grouped_str = ", Grouped"

        is_filtered_str = ""
        if is_filtered:
            is_filtered_str = ", Filtered"

        if is_chisq:
            title_str = "{}{}{}, Rp-$\chi^2$".format(pretty_name,
                                                     is_grouped_str,
                                                     is_filtered_str)
            title_str = shlex.quote(title_str)
            title_str = "--title {}".format(title_str)

            f = None
            rw = None

            orfs = filenames.get_riboseq_predicted_orfs(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=note_str,
                is_chisq=True,
                is_filtered=is_filtered)

        else:
            title_str = "{}{}{}, Rp-Bp".format(pretty_name, is_grouped_str,
                                               is_filtered_str)
            title_str = shlex.quote(title_str)
            title_str = "--title {}".format(title_str)

            f = fraction
            rw = reweighting_iterations
            orfs = filenames.get_riboseq_predicted_orfs(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=note_str,
                fraction=f,
                reweighting_iterations=rw,
                is_filtered=is_filtered)

        use_groups_str = ""
        if is_grouped:
            use_groups_str = "--use-groups"

        orf_types_bar_chart = filenames.get_orf_types_bar_chart(
            config['riboseq_data'],
            name,
            length=lengths,
            offset=offsets,
            is_unique=is_unique,
            note=out_note_str,
            image_type=args.image_type,
            fraction=f,
            reweighting_iterations=rw,
            is_grouped=is_grouped,
            is_chisq=is_chisq,
            is_filtered=is_filtered)

        cmd = "create-orf-types-bar-chart {} {} {} {}".format(
            orfs, orf_types_bar_chart, title_str, use_groups_str)

        in_files = [orfs]
        out_files = [orf_types_bar_chart]
        shell_utils.call_if_not_exists(cmd,
                                       out_files,
                                       in_files=in_files,
                                       overwrite=args.overwrite)

    msg = "{}: creating the ORF length distributions line graph".format(name)
    logger.debug(msg)

    uniprot_str = ""
    uniprot_label_str = ""
    if os.path.exists(args.uniprot):
        uniprot_str = "--uniprot {}".format(args.uniprot)
        uniprot_label_str = shlex.quote(args.uniprot_label)
        uniprot_label_str = "--uniprot-label {}".format(uniprot_label_str)

    for is_grouped in grouped_values:
        for is_chisq in chisq_values:

            if is_chisq:
                title_str = "{}, Rp-$\chi^2$".format(pretty_name)
                title_str = shlex.quote(title_str)
                title_str = "--title {}".format(title_str)

                f = None
                rw = None

                orfs = filenames.get_riboseq_predicted_orfs(
                    config['riboseq_data'],
                    name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note_str,
                    is_chisq=True)

            else:
                title_str = "{}, Rp-Bp".format(pretty_name)
                title_str = shlex.quote(title_str)
                title_str = "--title {}".format(title_str)

                f = fraction
                rw = reweighting_iterations

                orfs = filenames.get_riboseq_predicted_orfs(
                    config['riboseq_data'],
                    name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note_str,
                    fraction=f,
                    reweighting_iterations=rw)

            use_groups_str = ""
            if is_grouped:
                use_groups_str = "--use-groups"

            orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                image_type=args.image_type,
                fraction=f,
                reweighting_iterations=rw,
                is_grouped=is_grouped,
                is_chisq=is_chisq)

            cmd = (
                "create-orf-length-distribution-line-graph {} {} {} {} {} {}".
                format(orfs, orf_length_line_graph, title_str, use_groups_str,
                       uniprot_str, uniprot_label_str))

            in_files = [orfs]
            out_files = [orf_length_line_graph]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite)

    if args.show_orf_periodicity:
        msg = "{}: creating the ORF type metagene profiles".format(name)
        logger.debug(msg)

        for is_chisq in chisq_values:

            if is_chisq:
                title_str = "{}, Rp-$\chi^2$".format(pretty_name)
                title_str = shlex.quote(title_str)
                title_str = "--title {}".format(title_str)
                f = None
                rw = None
                is_smooth = False
                profiles = unsmoothed_profiles

                orfs = filenames.get_riboseq_predicted_orfs(
                    config['riboseq_data'],
                    name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note_str,
                    is_chisq=True,
                    is_filtered=is_filtered)

            else:
                title_str = "{}, Rp-Bp".format(pretty_name)
                title_str = shlex.quote(title_str)
                title_str = "--title {}".format(title_str)

                f = fraction
                rw = reweighting_iterations
                is_smooth = False
                profiles = unsmoothed_profiles

                orfs = filenames.get_riboseq_predicted_orfs(
                    config['riboseq_data'],
                    name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note_str,
                    fraction=f,
                    reweighting_iterations=rw)

            orf_type_profile_base = filenames.get_orf_type_profile_base(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str,
                fraction=f,
                reweighting_iterations=rw,
                is_chisq=is_chisq)

            strand = "+"
            orf_type_profiles_forward = [
                filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                     orf_type, strand,
                                                     args.image_type)
                for orf_type in ribo_utils.orf_types
            ]

            strand = "-"
            orf_type_profiles_reverse = [
                filenames.get_orf_type_profile_image(orf_type_profile_base,
                                                     orf_type, strand,
                                                     args.image_type)
                for orf_type in ribo_utils.orf_types
            ]

            cmd = ("visualize-orf-type-metagene-profiles {} {} {} {} {} {}".
                   format(orfs, profiles, orf_type_profile_base, title_str,
                          image_type_str, logging_str))

            in_files = [orfs]
            out_files = orf_type_profiles_forward + orf_type_profiles_reverse
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite)