def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script collects counts of riboseq reads filtered at each step in "
        "the micropeptide prediction pipeline. It mostly parses fastqc results (using the "
        "crimson python package).")
    parser.add_argument('config', help="The yaml config file")
    parser.add_argument('out', help="The output csv file with the counts")
    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)
    parser.add_argument('--overwrite', action='store_true')

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    res = parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                       args.num_cpus, get_counts, config, args)
    res = [r for r in res if r is not None]
    res_df = pd.DataFrame(res)

    pandas_utils.write_df(res_df, args.out, index=False)
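
The heavy lifting here is delegated to parallel.apply_parallel_iter. As a rough mental model, a minimal sketch assuming it simply fans the function out over the iterable with a fixed worker count (the actual pbio.misc.parallel API may differ):

from concurrent.futures import ProcessPoolExecutor

def apply_parallel_iter_sketch(items, num_cpus, func, *args):
    # submit func(item, *args) for each item and collect results in order
    with ProcessPoolExecutor(max_workers=num_cpus) as executor:
        futures = [executor.submit(func, item, *args) for item in items]
        return [f.result() for f in futures]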
Example #2
File: setup.py Project: HeyLifeHD/rp-bp
def _post_install(force_recompile):

    import site
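    # reload site so that packages installed earlier in this setup run
    # become importable in this process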
    importlib.reload(site)

    import pbio.ribo.ribo_filenames as filenames
    import pbio.misc.shell_utils as shell_utils

    smf = [os.path.join("rpbp_models", s) for s in stan_model_files]

    models_base = filenames.get_default_models_base()
    spf = [os.path.join(models_base, s) for s in stan_pickle_files]

    # Compile and pickle the Stan models
    if force_recompile:
        for stan, pickle in zip(smf, spf):
            _pickle_it(stan, pickle)
    else:  # default
        for stan, pickle in zip(smf, spf):
            if os.path.exists(pickle):
                msg = "A model already exists at: {}. Skipping.".format(pickle)
                logging.warning(msg)
                continue
            _pickle_it(stan, pickle)

    # Check for the prerequisite programs
    programs = ['flexbar']
    shell_utils.check_programs_exist(programs,
                                     raise_on_error=False,
                                     package_name='flexbar',
                                     logger=logger)

    programs = ['STAR']
    shell_utils.check_programs_exist(programs,
                                     raise_on_error=False,
                                     package_name='STAR',
                                     logger=logger)

    programs = ['bowtie2', 'bowtie2-build-s']
    shell_utils.check_programs_exist(programs,
                                     raise_on_error=False,
                                     package_name='bowtie2',
                                     logger=logger)

    programs = ['samtools']
    shell_utils.check_programs_exist(programs,
                                     raise_on_error=False,
                                     package_name='SAMtools',
                                     logger=logger)
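
The _pickle_it helper used above is defined elsewhere in setup.py; a plausible sketch, assuming PyStan 2.x (the real helper may differ):

import logging
import os
import pickle

import pystan

def _pickle_it_sketch(stan_file, pickle_file):
    # compile the Stan model and cache it so later runs can skip compilation
    logging.info("Compiling %s", stan_file)
    model = pystan.StanModel(file=stan_file)
    os.makedirs(os.path.dirname(pickle_file), exist_ok=True)
    with open(pickle_file, "wb") as out:
        pickle.dump(model, out, protocol=pickle.HIGHEST_PROTOCOL)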
Example #3
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a simple latex document containing the read "
        "filtering images, metagene profiles and analysis, and standard section text."
    )
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument(
        '--show-read-length-bfs',
        help="If this flag is given, "
        "plots showing the Bayes factor at each offset for each read length "
        "are included in the report.",
        action='store_true')

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument('--min-visualization-count',
                        help="Read lengths with fewer than this "
                        "number of reads will not be included in the report.",
                        type=int,
                        default=metagene_options['min_metagene_image_count'])

    parser.add_argument('--image-type',
                        help="The type of image types to create. This "
                        "must be an extension which matplotlib can interpret.",
                        default=default_image_type)

    parser.add_argument(
        '-c',
        '--create-fastqc-reports',
        help="If this flag is given, then "
        "fastqc reports will be created for most fastq and bam files. By default, they are "
        "not created.",
        action='store_true')

    parser.add_argument('--tmp',
                        help="If the fastqc reports are created, "
                        "they will use this location for temp files",
                        default=None)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    if args.note is not None:
        config['note'] = args.note
    note = config.get('note', None)

    sample_names = sorted(config['riboseq_samples'].keys())

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-read-length-metagene-profile-plot',
        'visualize-metagene-profile-bayes-factor',
        'get-all-read-filtering-counts', 'samtools',
        'visualize-read-filtering-counts', 'get-read-length-distribution',
        'plot-read-length-distribution'
    ]

    if args.create_fastqc_reports:
        programs.extend(['fastqc', 'java'])

    shell_utils.check_programs_exist(programs)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    # first, create the read filtering information...
    create_read_filtering_plots(args.config, config, args)
    # ... and all the other figures.
    for name in sample_names:
        periodic_offsets = filenames.get_periodic_offsets(
            config['riboseq_data'], name, is_unique=is_unique, note=note)
        offsets_df = pd.read_csv(periodic_offsets)
        create_figures(args.config, config, name, offsets_df, args)

    min_metagene_profile_count = config.get(
        'min_metagene_profile_count',
        metagene_options['min_metagene_profile_count'])

    min_bf_mean = config.get('min_metagene_bf_mean',
                             metagene_options['min_metagene_bf_mean'])

    max_bf_var = config.get('max_metagene_bf_var',
                            metagene_options['max_metagene_bf_var'])

    min_bf_likelihood = config.get(
        'min_metagene_bf_likelihood',
        metagene_options['min_metagene_bf_likelihood'])

    project_name = config.get("project_name", default_project_name)
    title = "Preprocessing results for {}".format(project_name)

    tex_file = os.path.join(args.out, "preprocessing-report.tex")
    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract, commands=commands)

        latex.section(out, "Introduction")

        latex.clearpage(out)
        latex.newpage(out)

        latex.section(out, "Mapping and filtering")
        latex.write(out, mapping_and_filtering_text)

        # the read filtering figures
        read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=note, image_type=args.image_type)

        n = "no-rrna-{}".format(note)
        no_rrna_read_filtering_image = filenames.get_riboseq_read_filtering_counts_image(
            config['riboseq_data'], note=n, image_type=args.image_type)

        latex.begin_figure(out)
        latex.write_graphics(out, read_filtering_image, width=0.45)
        latex.write_graphics(out, no_rrna_read_filtering_image, width=0.45)
        latex.write_caption(out,
                            read_filtering_caption,
                            label=read_filtering_label)
        latex.end_figure(out)

        latex.clearpage(out)

        # the read length distributions
        latex.section(out,
                      "Read length distributions",
                      label=length_distribution_section_label)

        msg = "Writing length distribution figures"
        logger.info(msg)

        latex.begin_table(out, "cc")

        latex.write_header(out,
                           ["All aligned reads", "Uniquely-aligning reads"])

        for name in sample_names:
            data = config['riboseq_samples'][name]
            read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=False,
                note=note,
                image_type=args.image_type)

            unique_read_length_distribution_image = filenames.get_riboseq_read_length_distribution_image(
                config['riboseq_data'],
                name,
                is_unique=True,
                note=note,
                image_type=args.image_type)

            msg = "Looking for image file: {}".format(
                read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(read_length_distribution_image):
                latex.write_graphics(out,
                                     read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_column_sep(out)

            msg = "Looking for image file: {}".format(
                unique_read_length_distribution_image)
            logger.debug(msg)

            if os.path.exists(unique_read_length_distribution_image):
                latex.write_graphics(out,
                                     unique_read_length_distribution_image,
                                     width=0.45)
            else:
                msg = "Could not find image: {}".format(
                    unique_read_length_distribution_image)
                logger.warning(msg)

                text = "Missing: {}\n\n".format(name)
                latex.write(out, text)

            latex.write_row_sep(out)

        latex.end_table(out)
        latex.clearpage(out)

        latex.section(out, "Read length periodicity", label=periodicity_label)

        for name in sample_names:
            i = 0

            data = config['riboseq_samples'][name]

            msg = "Processing sample: {}".format(name)
            logger.info(msg)

            logger.debug("overwrite: {}".format(args.overwrite))

            periodic_offsets = filenames.get_periodic_offsets(
                config['riboseq_data'], name, is_unique=is_unique, note=note)
            offsets_df = pd.read_csv(periodic_offsets)

            min_read_length = int(offsets_df['length'].min())
            max_read_length = int(offsets_df['length'].max())

            latex.begin_table(out, "YY")

            header = "\\multicolumn{2}{c}{" + name + "}"
            header = [header]
            latex.write_header(out, header)

            for length in range(min_read_length, max_read_length + 1):
                msg = "Processing length: {}".format(length)
                logger.info(msg)

                # check which offset is used

                # select the row for this length
                mask_length = offsets_df['length'] == length

                # TODO: this is sometimes length 0. why?
                if sum(mask_length) == 0:
                    continue

                length_row = offsets_df[mask_length].iloc[0]

                # now, check all of the filters
                offset = int(length_row['highest_peak_offset'])
                offset_status = "Used for analysis"

                if max_bf_var is not None:
                    if ((length_row['highest_peak_bf_mean'] <= min_bf_mean) or
                            length_row['highest_peak_bf_var'] >= max_bf_var):
                        offset_status = "BF mean too small or BF var too high"

                if min_bf_likelihood is not None:
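                    # P(true BF mean > min_bf_mean) under a normal
                    # approximation with the estimated mean and variance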
                    likelihood = 1 - scipy.stats.norm.cdf(
                        min_bf_mean, length_row['highest_peak_bf_mean'],
                        np.sqrt(length_row['highest_peak_bf_var']))
                    if likelihood <= min_bf_likelihood:
                        offset_status = "Likehood too small"

                if (max_bf_var is None) and (min_bf_likelihood is None):
                    if length_row['highest_peak_bf_mean'] <= min_bf_mean:
                        offset_status = "BF mean too small"

                if length_row[
                        'highest_peak_profile_sum'] < min_metagene_profile_count:
                    offset_status = "Count too small"

                if length_row[
                        'highest_peak_profile_sum'] < args.min_visualization_count:
                    msg = "Not enough reads of this length. Skipping."
                    logger.warning(msg)
                    continue

                metagene_profile_image = filenames.get_metagene_profile_image(
                    config['riboseq_data'],
                    name,
                    image_type=args.image_type,
                    is_unique=is_unique,
                    length=length,
                    note=note)

                #title = ("length: {}. P-site offset: {}. \\newline status: {}"
                #"\n".format(length, offset, offset_status))
                #latex.write(out, title, size="scriptsize")
                title = ("Length: {}. P-site offset: {}. Status: {}\n".format(
                    length, offset, offset_status))
                if args.show_read_length_bfs:
                    title = "\scriptsize{" + title + "}"
                    title = "\\multicolumn{2}{c}{" + title + "}"
                    latex.write(out, title)
                    latex.write_row_sep(out)
                else:
                    latex.write(out, title, size="scriptsize")

                latex.write_graphics(out, metagene_profile_image, width=0.45)

                i += 1
                if i % 2 == 1:
                    latex.write_column_sep(out)
                else:
                    latex.write_row_sep(out)

                if args.show_read_length_bfs:

                    bayes_factor_image = filenames.get_metagene_profile_bayes_factor_image(
                        config['riboseq_data'],
                        name,
                        image_type=args.image_type,
                        is_unique=is_unique,
                        length=length,
                        note=note)

                    #latex.centering(out)
                    latex.write_graphics(out, bayes_factor_image, width=0.45)

                    i += 1
                    if i % 2 == 1:
                        latex.write_column_sep(out)
                    else:
                        latex.write_row_sep(out)

            if i % 2 == 1:
                latex.write_row_sep(out)

            latex.end_table(out)
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "ORF type periodicity"
            latex.section(out, title)

            strands = ['+', '-']
            for sample_name in sample_names:
                i = 0

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                orf_type_profile_base = filenames.get_orf_type_profile_base(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=note,
                    subfolder='orf-profiles')

                for orf_type in ribo_utils.orf_types:
                    for strand in strands:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base,
                            orf_type,
                            strand,
                            image_type=args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):
                            if i % 4 == 0:
                                latex.begin_figure(out)

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.end_figure(out)
                                latex.clearpage(out)

                if (i > 0) and (i % 4 != 0):
                    latex.end_figure(out)
                    latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)

    if args.create_fastqc_reports:
        parallel.apply_parallel_iter(config['riboseq_samples'].items(),
                                     args.num_cpus, create_fastqc_reports,
                                     config, args)
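
The create_fastqc_reports worker passed to apply_parallel_iter is defined elsewhere in this script; a hedged sketch of the per-sample call (the real function also covers the intermediate bam files and honors --tmp; the unpacking below is an assumption):

import subprocess

def create_fastqc_reports_sketch(name_data, config, args):
    # run fastqc on the raw fastq for one sample; --noextract keeps the
    # report zipped instead of unpacking it next to the input
    name, raw_fastq = name_data
    subprocess.run(["fastqc", "--noextract", raw_fastq], check=True)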
Example #4
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="Creates base genome profile.")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)
    
    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('-t', '--tmp', help="""The location for temporary files. If not
        specified, program-specific temp locations are used.""", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files
        will be overwritten.""", action='store_true')

    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-base-genome-profile]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'remove-multimapping-reads'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    call = not args.do_not_call
    keep_delete_files = args.keep_intermediate_files or args.do_not_call

    # Step 0: Running flexbar to remove adapter sequences

    raw_data = args.raw_data
    flexbar_target = filenames.get_without_adapters_base(config['riboseq_data'],
                                                         args.name,
                                                         note=note)
    without_adapters = filenames.get_without_adapters_fastq(config['riboseq_data'],
                                                            args.name,
                                                            note=note)

    adapter_seq_str = utils.get_config_argument(config, 'adapter_sequence', 'adapter-seq')
    adapter_file_str = utils.get_config_argument(config, 'adapter_file', 'adapters')

    # get all options, command line options override defaults
    flexbar_option_str = pgrm_utils.get_final_args(flexbar_options, args.flexbar_options)

    cmd = "flexbar -r {} -t {} {} {} {} -n {}".format(raw_data,
                                                      flexbar_target,
                                                      adapter_seq_str,
                                                      adapter_file_str,
                                                      flexbar_option_str,
                                                      args.num_cpus)
    in_files = [raw_data]
    out_files = [without_adapters]
    file_checkers = {
        without_adapters: fastx_utils.check_fastq_file
    }
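    # call_if_not_exists is assumed to skip the command when its outputs
    # already exist (unless --overwrite is given), validate them with the
    # file_checkers, and honor --do-not-call via call; see the sketch after
    # this example.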
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite, call=call)

    # Step 1: Running bowtie2 to remove rRNA alignments

    out = utils.abspath("dev", "null")  # we do not care about the alignments
    without_rrna = filenames.get_without_rrna_fastq(config['riboseq_data'],
                                                    args.name,
                                                    note=note)
    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name,
                                              note=note)

    cmd = "bowtie2 -p {} --very-fast -x {} -U {} -S {} --un-gz {} --al-gz {}".format(
        args.num_cpus,
        config['ribosomal_index'],
        without_adapters,
        out,
        without_rrna,
        with_rrna)

    in_files = [without_adapters]
    in_files.extend(pgrm_utils.get_bowtie2_index_files(config['ribosomal_index']))
    out_files = [without_rrna, with_rrna]
    to_delete = [without_adapters]
    file_checkers = {
        without_rrna: fastx_utils.check_fastq_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite, call=call,
                                   keep_delete_files=keep_delete_files, to_delete=to_delete)

    # Step 2: Running STAR to align rRNA-depleted reads to genome

    star_output_prefix = filenames.get_riboseq_bam_base(config['riboseq_data'],
                                                        args.name,
                                                        note=note)
    genome_star_bam = "{}{}".format(star_output_prefix, "Aligned.sortedByCoord.out.bam")

    # get all options, command line options override defaults

    mem_bytes = utils.human2bytes(args.mem)
    star_options['limitBAMsortRAM'] = mem_bytes

    if args.tmp is not None:
        star_tmp_name = "{}_STARtmp".format(args.name)
        star_tmp_dir = pgrm_utils.create_star_tmp(args.tmp, star_tmp_name)
        star_options['outTmpDir'] = star_tmp_dir

    star_option_str = pgrm_utils.get_final_args(star_options, args.star_options)

    # If the annotation is in GFF3 format, we need to inform STAR.
    # Whether or not we have a de novo assembly, the format of config['gtf'] takes precedence.
    sjdb_gtf_tag_str = ""
    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)
    if use_gff3_specs:
        sjdb_gtf_tag_str = "--sjdbGTFtagExonParentTranscript Parent"

    cmd = ("{} --runThreadN {} --genomeDir {} --sjdbGTFfile {} {} --readFilesIn {} "
        "{} --outFileNamePrefix {}".format(args.star_executable,
                                                 args.num_cpus,
                                                 config['star_index'],
                                                 gtf_file,
                                                 sjdb_gtf_tag_str,
                                                 without_rrna,
                                                 star_option_str,
                                                 star_output_prefix))
    in_files = [without_rrna]
    in_files.extend(pgrm_utils.get_star_index_files(config['star_index']))
    to_delete = [without_rrna]
    out_files = [genome_star_bam]
    file_checkers = {
        genome_star_bam: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite,
                                   call=call, keep_delete_files=keep_delete_files, to_delete=to_delete)
    
    # now, we need to symlink the (genome) STAR output to that expected by the rest of the pipeline
    genome_sorted_bam = filenames.get_riboseq_bam(config['riboseq_data'],
                                                  args.name,
                                                  note=note)

    if os.path.exists(genome_star_bam):
        shell_utils.create_symlink(genome_star_bam, genome_sorted_bam, call)
    else:
        msg = ("Could not find the STAR genome bam alignment file. Unless "
               "--do-not-call was given, this is a problem.")
        logger.warning(msg)

    # create the bamtools index
    cmd = "samtools index -b {}".format(genome_sorted_bam)
    shell_utils.check_call(cmd, call=call)

    # check if we want to keep multimappers
    if 'keep_riboseq_multimappers' in config:
        return

    # remove multimapping reads from the genome file
    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    unique_genome_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                       args.name,
                                                       is_unique=True,
                                                       note=note)

    cmd = "remove-multimapping-reads {} {} {}".format(genome_sorted_bam, 
                                                      unique_genome_filename,
                                                      tmp_str)

    in_files = [genome_sorted_bam]
    out_files = [unique_genome_filename]
    to_delete = [genome_star_bam, genome_sorted_bam]
    file_checkers = {
        unique_genome_filename: bam_utils.check_bam_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers, overwrite=args.overwrite,
                                   call=call, keep_delete_files=keep_delete_files, to_delete=to_delete)
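
Every step in this example funnels through shell_utils.call_if_not_exists. A minimal sketch of its assumed contract (the real pbio.misc.shell_utils implementation may differ, e.g. in logging and error handling):

import os
import subprocess

def call_if_not_exists_sketch(cmd, out_files, in_files=None, file_checkers=None,
                              overwrite=False, call=True, keep_delete_files=False,
                              to_delete=None):
    if not call:
        return
    missing = [f for f in (in_files or []) if not os.path.exists(f)]
    if missing:
        raise FileNotFoundError("missing input files: {}".format(missing))
    if out_files and all(os.path.exists(f) for f in out_files) and not overwrite:
        return  # all outputs already present; nothing to do
    subprocess.run(cmd, shell=True, check=True)
    for filename, checker in (file_checkers or {}).items():
        if not checker(filename):
            raise RuntimeError("output failed validation: {}".format(filename))
    if not keep_delete_files:
        for filename in (to_delete or []):
            if os.path.exists(filename):
                os.remove(filename)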
Example #5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script runs the Rp-Bp pipelines 
        on a given sample. It requires a YAML config file that includes a number of keys. 
        Please see the documentation for a complete description.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument(
        'name', help="The name for the dataset, used in the created files")

    parser.add_argument('--tmp', help="The temp directory", default=None)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files "
                        "will be overwritten.",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="""If this flag is present, then only 
        the ORF profiles will be created""",
                        action='store_true')

    parser.add_argument('-k',
                        '--keep-intermediate-files',
                        help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""",
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'ribosomal_index', 'star_index', 'genome_base_path',
        'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # if using slurm, submit the script, but we cannot use sys.argv directly
    # as the shell strips the quotes around the arguments
    if args.use_slurm:
        cmd = "{}".format(' '.join("'" + s + "'" if '"' in s else s
                                   for s in sys.argv))
        slurm.check_sbatch(cmd, args=args)
        return

    # handle all option strings to call programs
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we still call the preprocessing script,
    # but it does not run anything
    call = not args.do_not_call
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(shlex.quote(args.tmp))

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    cmd = ("create-orf-profiles {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}"
           .format(args.raw_data, args.config, args.name, args.num_cpus,
                   mem_str, do_not_call_str, overwrite_str,
                   keep_intermediate_str, logging_str, tmp_str, star_str,
                   flexbar_str))

    shell_utils.check_call(cmd)

    # check if we only want to create the profiles
    if args.profiles_only:
        return

    # then we predict the ORFs
    cmd = ("predict-translated-orfs {} {} --num-cpus {} {} {} {}".format(
        args.config, args.name, args.num_cpus, do_not_call_str, overwrite_str,
        logging_str))
    shell_utils.check_call(cmd)
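
The quoting in the --use-slurm branch above exists because the shell strips quotes before Python sees sys.argv. A small illustration with made-up argument values:

argv = ["run-rpbp-pipeline", "sample.fastq.gz", "config.yaml", "sample1",
        "--flexbar-options", '--pre-trim-left "2"']
cmd = ' '.join("'" + s + "'" if '"' in s else s for s in argv)
# only the element containing a double quote is re-wrapped in single quotes,
# so the resubmitted command passes it through as a single argument again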
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This is a helper script to submit a set of
        samples to SLURM. It can also be used to run a set of samples sequentially. Due to limitations 
        on the config file specification, all of the samples must use the same reference indices 
        obtained by running 'create-base-genome-profile.""")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('--tmp', help="The temp directory", default=None)

    parser.add_argument('--overwrite',
                        help="""If this flag is present, existing files 
        will be overwritten.""",
                        action='store_true')

    parser.add_argument('--profiles-only',
                        help="""If this flag is present, then only
        the pre-processing part of the pipeline will be called, i.e. profiles
        will be created for each sample specified in the config file, but no predictions
        will be made.""",
                        action='store_true')

    parser.add_argument('--merge-replicates',
                        help="""If this flag is present, then
        the ORF profiles from the replicates will be merged before making the final
        predictions""",
                        action='store_true')

    parser.add_argument('--run-replicates',
                        help="""If this flag is given with the
        --merge-replicates flag, then both the replicates and the individual
        samples will be run. This flag has no effect if --merge-replicates is not
        given.""",
                        action='store_true')

    parser.add_argument('-k',
                        '--keep-intermediate-files',
                        help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be
        deleted. This feature is implemented piecemeal. If the --do-not-call flag
        is given, then nothing will be deleted.""",
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar', args.star_executable, 'samtools', 'bowtie2',
        'create-base-genome-profile', 'remove-multimapping-reads',
        'extract-metagene-profiles', 'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets', 'extract-orf-profiles',
        'estimate-orf-bayes-factors', 'select-final-prediction-set',
        'create-orf-profiles', 'predict-translated-orfs', 'run-rpbp-pipeline'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data', 'riboseq_samples', 'ribosomal_index', 'star_index',
        'genome_base_path', 'genome_name', 'fasta', 'gtf'
    ]
    utils.check_keys_exist(config, required_keys)

    # handle all option strings to call the pipeline script
    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we still call the pipeline script, but it does not run anything
    call = not args.do_not_call
    do_not_call_str = ""
    if not call:
        do_not_call_str = "--do-not-call"
    args.do_not_call = False

    overwrite_str = ""
    if args.overwrite:
        overwrite_str = "--overwrite"

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    # check if we only want to create the profiles, in this case
    # we call run-rpbp-pipeline with the --profiles-only option
    profiles_only_str = ""
    if args.profiles_only:
        if args.merge_replicates:
            msg = (
                "The --profiles-only option was given. This option has "
                "precedence, and it will override the --merge-replicates option!"
            )
            logger.warning(msg)
        args.merge_replicates = False
        profiles_only_str = "--profiles-only"

    # if we merge the replicates, then we only use the rpbp script to create
    # the ORF profiles, but we still make predictions
    if args.merge_replicates and not args.run_replicates:
        profiles_only_str = "--profiles-only"

    if args.run_replicates and not args.merge_replicates:
        msg = (
            "The --run-replicates option was given without the --merge-replicates "
            "option. It will be ignored.")
        logger.warning(msg)

    # collect the job_ids in case we are using slurm and need to merge replicates
    rep_to_condition = ribo_utils.get_riboseq_replicates_reverse_map(config)
    job_ids_mapping = defaultdict(list)

    sample_names = sorted(config['riboseq_samples'].keys())

    for sample_name in sample_names:
        data = config['riboseq_samples'][sample_name]

        tmp_str = ""
        if args.tmp is not None:
            tmp = os.path.join(args.tmp, "{}_rpbp".format(sample_name))
            tmp_str = "--tmp {}".format(tmp)

        cmd = "run-rpbp-pipeline {} {} {} --num-cpus {} {} {} {} {} {} {} {} {} {}".format(
            data, args.config, sample_name, args.num_cpus, mem_str, tmp_str,
            do_not_call_str, overwrite_str, profiles_only_str,
            keep_intermediate_str, logging_str, star_str, flexbar_str)

        job_id = slurm.check_sbatch(cmd, args=args)
        job_ids_mapping[rep_to_condition[sample_name]].append(job_id)

    # now, if we are running the "standard" pipeline, we are done
    if not args.merge_replicates:
        return

    # otherwise, we need to merge the replicates for each condition
    riboseq_replicates = ribo_utils.get_riboseq_replicates(config)
    merge_replicates_str = "--merge-replicates"

    for condition_name in sorted(riboseq_replicates.keys()):

        # then we predict the ORFs
        cmd = "predict-translated-orfs {} {} --num-cpus {} {} {} {} {}".format(
            args.config, condition_name, args.num_cpus, do_not_call_str,
            overwrite_str, logging_str, merge_replicates_str)

        job_ids = job_ids_mapping[condition_name]
        slurm.check_sbatch(cmd, args=args, dependencies=job_ids)
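
slurm.check_sbatch is assumed to return the submitted job id so that the merge step can declare dependencies on the per-sample jobs. A hypothetical sketch of that submission logic (the real pbio helper also builds the sbatch options from args):

import subprocess

def check_sbatch_sketch(cmd, dependencies=None):
    # --parsable makes sbatch print just the job id; afterok delays the job
    # until all listed dependencies have finished successfully
    sbatch_cmd = ["sbatch", "--parsable"]
    if dependencies:
        sbatch_cmd.append("--dependency=afterok:" + ":".join(map(str, dependencies)))
    sbatch_cmd += ["--wrap", cmd]
    result = subprocess.run(sbatch_cmd, capture_output=True, text=True, check=True)
    return result.stdout.strip()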
Example #7
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching 
        annotations, including labelled ORFs, for use with the Rp-Bp periodicity estimation 
        and ORF translation prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite',
                        help='''If this flag is present, existing files
        will be overwritten.''',
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    annotated_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                  config['genome_name'],
                                                  note=config.get('orf_note'),
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'))

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(
            config['genome_base_path'],
            config['genome_name'],
            note=config.get('orf_note'),
            is_annotated=False,
            is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str))
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files,
                                                     sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                   config['de_novo_gtf'],
                                                   gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite,
                                           call=call)
        else:
            msg = (
                "Skipping concatenation due to a mismatch in format specifications "
                "(GTF2/GFF3) between the reference and de novo annotations. Symlink "
                "to reference annotations created."
            )
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files

        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(annotated_labeled_orfs):
            shell_utils.create_symlink(annotated_labeled_orfs, labeled_orfs,
                                       call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
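
bed_utils.concatenate is assumed to stack the BED files into one data frame, optionally sorting by position. A pandas sketch under that assumption (the 'seqname' and 'start' column names are guesses at the pbio conventions):

import pandas as pd

def concatenate_sketch(bed_files, sort_bed=True):
    frames = [pd.read_csv(f, sep="\t") for f in bed_files]
    concatenated = pd.concat(frames, ignore_index=True)
    if sort_bed:
        concatenated = concatenated.sort_values(["seqname", "start"])
    return concatenated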
Example #8
def main():
    
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""This script runs all of the processing necessary to 
        produce the signals used for ORF translation prediction. In particular, it creates the 
        metagene profiles, selected the periodic fragments and generate the ORF profiles.""")

    parser.add_argument('raw_data', help="The raw data file (fastq[.gz])")

    parser.add_argument('config', help="The (yaml) configuration file")

    parser.add_argument('name', help="The name for the dataset, used in the created files")

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
                        type=int, default=default_num_cpus)

    parser.add_argument('--mem', help="The amount of RAM to request", default=default_mem)

    parser.add_argument('--tmp', help="The location for temp files", default=None)

    parser.add_argument('--do-not-call', action='store_true')

    parser.add_argument('--overwrite', help="""If this flag is present, existing files 
        will be overwritten.""", action='store_true')
         
    parser.add_argument('-k', '--keep-intermediate-files', help="""If this flag is given,
        then all intermediate files will be kept; otherwise, they will be deleted. 
        This feature is implemented piecemeal. If the --do-not-call flag is given, 
        then nothing will be deleted.""", action='store_true')

    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    pgrm_utils.add_flexbar_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "[create-orf-profiles]: {}".format(' '.join(sys.argv))
    logger.info(msg)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # check that all of the necessary programs are callable
    programs = [
        'flexbar',
        args.star_executable,
        'samtools',
        'bowtie2',
        'create-base-genome-profile',
        'remove-multimapping-reads',
        'extract-metagene-profiles',
        'estimate-metagene-profile-bayes-factors',
        'select-periodic-offsets',
        'extract-orf-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'riboseq_data',
        'ribosomal_index',
        'gtf',
        'genome_base_path',
        'genome_name'
    ]
    utils.check_keys_exist(config, required_keys)

    note = config.get('note', None)
    models_base = config.get('models_base', default_models_base)

    logging_str = logging_utils.get_logging_options_string(args)
    star_str = pgrm_utils.get_star_options_string(args)
    flexbar_str = pgrm_utils.get_flexbar_options_string(args)

    # handle do_not_call so that we still call the preprocessing script,
    # but it does not run anything
    call = not args.do_not_call
    do_not_call_argument = ""
    if not call:
        do_not_call_argument = "--do-not-call"

    overwrite_argument = ""
    if args.overwrite:
        overwrite_argument = "--overwrite"

    keep_intermediate_str = ""
    if args.keep_intermediate_files:
        keep_intermediate_str = "--keep-intermediate-files"

    tmp_str = ""
    if args.tmp is not None:
        tmp_str = "--tmp {}".format(args.tmp)

    mem_str = "--mem {}".format(shlex.quote(args.mem))

    # check if we want to keep multimappers
    is_unique = 'keep_riboseq_multimappers' not in config

    riboseq_raw_data = args.raw_data
    riboseq_bam_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                     args.name,
                                                     is_unique=is_unique,
                                                     note=note)

    cmd = ("create-base-genome-profile {} {} {} --num-cpus {} {} {} {} {} {} {} {} {}".format(
        riboseq_raw_data,
        args.config,
        args.name,
        args.num_cpus,
        do_not_call_argument,
        overwrite_argument,
        logging_str,
        star_str,
        tmp_str,
        flexbar_str,
        keep_intermediate_str,
        mem_str))

    # There could be cases where we start somewhere in the middle of creating
    # the base genome profile. So even if the "raw data" is not available, 
    # we still want to call the base pipeline.
    # in_files = [riboseq_raw_data]
    in_files = []
    out_files = [riboseq_bam_filename]
    # we always call this, and pass --do-not-call through
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=True)

    # Extract the metagene profiles

    start_upstream_str = utils.get_config_argument(config,
                                                   'metagene_start_upstream',
                                                   'start-upstream',
                                                   default=metagene_options['metagene_start_upstream'])
    start_downstream_str = utils.get_config_argument(config,
                                                     'metagene_start_downstream',
                                                     'start-downstream',
                                                     default=metagene_options['metagene_start_downstream'])
    end_upstream_str = utils.get_config_argument(config,
                                                 'metagene_end_upstream',
                                                 'end-upstream',
                                                 default=metagene_options['metagene_end_upstream'])
    end_downstream_str = utils.get_config_argument(config,
                                                   'metagene_end_downstream',
                                                   'end-downstream',
                                                   default=metagene_options['metagene_end_downstream'])
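
    # get_config_argument is assumed to render a config key as a CLI option
    # string with a fallback default, e.g. "--start-upstream 300". A
    # hypothetical sketch of that behavior (not the actual pbio.misc.utils API):
    def _get_config_argument_sketch(config, key, option_name=None, default=None):
        value = config.get(key, default)
        if value is None:
            return ""
        option_name = option_name or key.replace("_", "-")
        return "--{} {}".format(option_name, value)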

    metagene_profiles = filenames.get_metagene_profiles(config['riboseq_data'],
                                                        args.name,
                                                        is_unique=is_unique,
                                                        note=note)

    # use the canonical transcripts for extracting the metagene profiles
    transcript_bed = filenames.get_bed(config['genome_base_path'],
                                       config['genome_name'],
                                       is_merged=False,
                                       is_annotated=True)

    cmd = ("extract-metagene-profiles {} {} {} --num-cpus {} {} {} {} {} {}".format(
        riboseq_bam_filename,
        transcript_bed,
        metagene_profiles,
        args.num_cpus,
        logging_str,
        start_upstream_str,
        start_downstream_str,
        end_upstream_str,
        end_downstream_str))

    in_files = [riboseq_bam_filename, transcript_bed]
    out_files = [metagene_profiles]
    file_checkers = {
        metagene_profiles: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # estimate the periodicity for each offset for all read lengths
    metagene_profile_bayes_factors = filenames.get_metagene_profiles_bayes_factors(
        config['riboseq_data'],
        args.name,
        is_unique=is_unique,
        note=note)

    periodic_models = filenames.get_models(models_base, 'periodic')
    non_periodic_models = filenames.get_models(models_base, 'nonperiodic')
    
    periodic_models_str = ' '.join(periodic_models)
    non_periodic_models_str = ' '.join(non_periodic_models)

    periodic_models_str = "--periodic-models {}".format(periodic_models_str)
    non_periodic_models_str = "--nonperiodic-models {}".format(non_periodic_models_str)

    periodic_offset_start_str = utils.get_config_argument(config,
                                                          'periodic_offset_start',
                                                          default=metagene_options['periodic_offset_start'])
    periodic_offset_end_str = utils.get_config_argument(config,
                                                        'periodic_offset_end',
                                                        default=metagene_options['periodic_offset_end'])
    metagene_profile_length_str = utils.get_config_argument(config,
                                                            'metagene_profile_length',
                                                            default=metagene_options['metagene_profile_length'])
    seed_str = utils.get_config_argument(config,
                                         'seed',
                                         default=metagene_options['seed'])
    chains_str = utils.get_config_argument(config,
                                           'chains',
                                           default=metagene_options['chains'])
    iterations_str = utils.get_config_argument(config,
                                               'metagene_iterations',
                                               'iterations',
                                               default=metagene_options['metagene_iterations'])

    cmd = ("estimate-metagene-profile-bayes-factors {} {} --num-cpus {} {} {} "
           "{} {} {} {} {} {} {}".format(metagene_profiles,
                                         metagene_profile_bayes_factors,
                                         args.num_cpus,
                                         periodic_models_str,
                                         non_periodic_models_str,
                                         periodic_offset_start_str,
                                         periodic_offset_end_str,
                                         metagene_profile_length_str,
                                         seed_str,
                                         chains_str,
                                         iterations_str,
                                         logging_str))

    in_files = [metagene_profiles]
    in_files.extend(periodic_models)
    in_files.extend(non_periodic_models)
    out_files = [metagene_profile_bayes_factors]
    file_checkers = {
        metagene_profile_bayes_factors: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)
    
    # select the best read lengths for constructing the signal
    periodic_offsets = filenames.get_periodic_offsets(config['riboseq_data'],
                                                      args.name,
                                                      is_unique=is_unique,
                                                      note=note)

    cmd = "select-periodic-offsets {} {}".format(metagene_profile_bayes_factors,
                                                 periodic_offsets)

    in_files = [metagene_profile_bayes_factors]
    out_files = [periodic_offsets]
    file_checkers = {
        periodic_offsets: utils.check_gzip_file
    }
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   file_checkers=file_checkers,
                                   overwrite=args.overwrite, call=call)

    # get the lengths and offsets which meet the required criteria from the config file
    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config,
                                                                   args.name,
                                                                   args.do_not_call,
                                                                   is_unique=is_unique,
                                                                   default_params=metagene_options)

    if len(lengths) == 0:
        msg = ("No periodic read lengths and offsets were found. Try relaxing "
               "min_metagene_profile_count, min_metagene_bf_mean, max_metagene_bf_var, "
               "and/or min_metagene_bf_likelihood. Quitting.")
        logger.critical(msg)
        return

    lengths_str = ' '.join(lengths)
    offsets_str = ' '.join(offsets)
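    # lengths and offsets are parallel lists of strings, e.g. (hypothetical
    # values) lengths = ['29', '30'] and offsets = ['12', '13'], which become
    # "--lengths 29 30 --offsets 12 13" for extract-orf-profiles below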

    seqname_prefix_str = utils.get_config_argument(config, 'seqname_prefix')
    
    # extract the riboseq profiles for each orf
    unique_filename = filenames.get_riboseq_bam(config['riboseq_data'],
                                                args.name,
                                                is_unique=is_unique,
                                                note=note)

    profiles_filename = filenames.get_riboseq_profiles(config['riboseq_data'],
                                                       args.name,
                                                       length=lengths,
                                                       offset=offsets,
                                                       is_unique=is_unique,
                                                       note=note)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    cmd = ("extract-orf-profiles {} {} {} {} --lengths {} --offsets {} {} {} --num-cpus {} ".format(
        unique_filename,
        orfs_genomic,
        exons_file,
        profiles_filename,
        lengths_str,
        offsets_str,
        logging_str,
        seqname_prefix_str,
        args.num_cpus))

    in_files = [orfs_genomic, exons_file, unique_filename]
    out_files = [profiles_filename]

    # todo: implement a file checker for mtx files (see the sketch below)
    shell_utils.call_if_not_exists(cmd, out_files, in_files=in_files,
                                   overwrite=args.overwrite, call=call)
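
# The TODO above asks for an .mtx file checker. A minimal sketch, assuming the
# ORF profiles are written in MatrixMarket format; the function name below is
# hypothetical and not part of the pipeline:
def check_mtx_file(filename):
    """Return True if `filename` starts with a MatrixMarket banner.

    This only validates the header line, which is cheap; a full parse
    (e.g. scipy.io.mmread) would be needed to guarantee the file is intact.
    """
    try:
        with open(filename, 'rb') as in_file:
            return in_file.readline().startswith(b'%%MatrixMarket')
    except OSError:
        return False

# usage sketch: file_checkers = {profiles_filename: check_mtx_file}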
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script identifies the orf peptide matches for all samples in "
        "a project.")
    parser.add_argument('config', help="The (yaml) config file")

    parser.add_argument('--peptide-filter-field',
                        help="The field to use for "
                        "filtering the peptides from MaxQuant",
                        default=default_peptide_filter_field)

    parser.add_argument('--peptide-filter-value',
                        help="All peptides with a value "
                        "greater than the filter value will be removed",
                        type=float,
                        default=default_peptide_filter_value)

    parser.add_argument('--peptide-separator',
                        help="The separator in the "
                        "peptide file",
                        default=default_peptide_separator)

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in "
        "the output filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    logging_str = logging_utils.get_logging_options_string(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    call = not args.do_not_call

    programs = ['get-orf-peptide-matches']
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'peptide_files', 'peptide_cell_type_analysis', 'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    args_dict = vars(args)

    peptide_filter_field_str = utils.get_config_argument(
        args_dict, 'peptide_filter_field')
    peptide_filter_value_str = utils.get_config_argument(
        args_dict, 'peptide_filter_value')
    peptide_separator_str = utils.get_config_argument(args_dict,
                                                      'peptide_separator')

    num_cpus_str = utils.get_config_argument(args_dict, 'num_cpus')

    cell_types = ribo_utils.get_riboseq_cell_type_samples(config)
    for cell_type, peptide_files in config['peptide_cell_type_analysis'].items():
        if cell_type not in cell_types:
            msg = (
                "Could not find cell_type specification. Please check the config "
                "file: {}".format(cell_type))
            logger.warning(msg)
            continue

        cell_type_protein = ribo_filenames.get_riboseq_cell_type_protein(
            config['riboseq_data'], cell_type, is_filtered=True, note=note_str)

        if not os.path.exists(cell_type_protein):
            msg = ("Could not find cell_type protein fasta. Skipping: {}".
                   format(cell_type_protein))
            logger.warning(msg)
            continue

        for peptide_file in peptide_files:
            if peptide_file not in config['peptide_files']:
                msg = (
                    "Could not find peptide_file specification. Please check "
                    "the config file: {}".format(peptide_file))
                logger.warning(msg)
                continue

            peptide_txt_file = config['peptide_files'][peptide_file]

            if not os.path.exists(peptide_txt_file):
                msg = ("Could not find peptide.txt file. Skipping: {}".format(
                    peptide_txt_file))
                logger.warning(msg)
                continue

            peptide_matches = ribo_filenames.get_riboseq_peptide_matches(
                config['riboseq_data'],
                cell_type,
                peptide_file,
                is_filtered=True,
                note=out_note_str)

            cmd = "get-orf-peptide-matches {} {} {} {} {} {} {} {}".format(
                cell_type_protein, peptide_txt_file, peptide_matches,
                num_cpus_str, peptide_filter_field_str,
                peptide_filter_value_str, peptide_separator_str, logging_str)

            slurm.check_sbatch(cmd, args=args)
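
# Example #9 builds its command line almost entirely with utils.get_config_argument.
# A minimal sketch of the assumed contract (a hypothetical reimplementation, not
# the pbio source): look up `key` in a dict and render it as a CLI option, or
# return an empty string when the key is absent and no default is given.
def get_config_argument(config, key, option_name=None, default=None):
    value = config.get(key, default)
    if value is None:
        return ""
    if option_name is None:
        option_name = key.replace('_', '-')
    if isinstance(value, (list, tuple)):
        value = ' '.join(str(v) for v in value)
    return "--{} {}".format(option_name, value)

# e.g. get_config_argument({'seed': 42}, 'seed') returns "--seed 42"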
Example #10
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates the plots which detail the basic characteristics "
        "of the ORF predictions from the Rp-Bp pipeline. It also creates and compiles (if "
        "possible) a latex report for them.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('out',
                        help="The base output directory for the latex report")

    parser.add_argument(
        '--show-unfiltered-orfs',
        help="If this flag is "
        "present, bar charts showing the distribution of the types of the "
        "unfiltered ORF set will be included",
        action='store_true')

    parser.add_argument(
        '--show-orf-periodicity',
        help="If this flag is "
        "present, bar charts showing the periodicity of each ORF type will be "
        "included in the report.",
        action='store_true')

    parser.add_argument('--uniprot',
                        help="The uniprot ORF lengths, if available",
                        default=default_uniprot)
    parser.add_argument('--uniprot-label',
                        help="The label to use for the uniprot ORFs in "
                        "the plot",
                        default=default_uniprot_label)

    parser.add_argument('--image-type',
                        help="The format of the image files. This must be "
                        "a format usable by matplotlib.",
                        default=default_image_type)

    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=None)

    parser.add_argument(
        '--show-chisq',
        help="If this flag is given, then the "
        "results from Rp-chi will be included in the document; otherwise, they "
        "will not be created or shown.",
        action='store_true')

    parser.add_argument('-t',
                        '--tmp',
                        help="A location for temporary files",
                        default=None)

    slurm.add_sbatch_options(parser, num_cpus=default_num_cpus)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = [
        'create-orf-length-distribution-line-graph',
        'create-orf-types-bar-chart', 'visualize-orf-type-metagene-profiles'
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # by default, we will not include chisq
    chisq_values = [False]
    if args.show_chisq:
        chisq_values = [True, False]

    filtered_values = [True]
    if args.show_unfiltered_orfs:
        filtered_values = [True, False]

    grouped_values = [True, False]
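    # itertools.product over these three lists (used below) enumerates every
    # (is_grouped, is_chisq, is_filtered) combination; with the defaults above
    # that is just (True, False, True) and (False, False, True)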

    # make sure the output directory exists
    os.makedirs(args.out, exist_ok=True)

    # first, create all of the figures
    create_all_figures(config, args)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    project_name = config.get("project_name", default_project_name)
    title = "Rp-Bp prediction analysis for {}".format(project_name)
    abstract = "This document shows the results of the Rp-Bp pipeline analysis."

    #tex_file = os.path.join(args.out, "prediction-report.tex")
    tex_file = filenames.get_rpbp_prediction_report(args.out, out_note_str)

    with open(tex_file, 'w') as out:

        latex.begin_document(out, title, abstract)

        latex.write(out, "\n")

        latex.clearpage(out)

        ### ORF type distributions
        title = "Predicted ORF type distributions"
        latex.section(out, title)

        # first, handle all of the regular datasets
        sample_names = sorted(config['riboseq_samples'].keys())

        # and check if we also have replicates
        replicate_names = []
        if 'riboseq_biological_replicates' in config:
            replicate_names = sorted(
                ribo_utils.get_riboseq_replicates(config).keys())

        strands = ["+", "-"]

        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config,
                    sample_name,
                    is_unique=is_unique,
                    default_params=metagene_options)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF types: {}".format(sample_name)
            is_first = True

            # first, just dump all of the bar charts to the page
            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    # the batch size must match the `i % 6` conditions below;
                    # with `i % 4` a new figure could begin before the previous
                    # one was closed
                    if is_first or (i % 6 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            it = itertools.product(grouped_values, chisq_values,
                                   filtered_values)

            is_first = True

            for is_grouped, is_chisq, is_filtered in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_types_bar_chart = filenames.get_orf_types_bar_chart(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq,
                    is_filtered=is_filtered)

                msg = "Looking for image file: {}".format(orf_types_bar_chart)
                logger.debug(msg)

                if os.path.exists(orf_types_bar_chart):
                    # as above, keep the batch size in sync with the `i % 6`
                    # conditions below
                    if is_first or (i % 6 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out, orf_types_bar_chart, height=0.15)

                    if i % 6 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_types_bar_chart)
                    logger.warning(msg)

            if (i > 0) and (i % 6) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 6 != 0:
            latex.clearpage(out)

        ### ORF type length distributions
        title = "Predicted ORF type length distributions"
        latex.section(out, title)

        i = 0
        for sample_name in sample_names:

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config,
                    sample_name,
                    is_unique=is_unique,
                    default_params=metagene_options)
            except FileNotFoundError:
                msg = (
                    "Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(sample_name))
                logger.error(msg)
                continue

            caption = "ORF type length distributions: {}".format(sample_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    sample_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        # now, if the config file specifies replicates, create figures for those
        i = 0
        for replicate_name in replicate_names:
            lengths = None
            offsets = None

            caption = "ORF types: {}".format(replicate_name)

            is_first = True
            it = itertools.product(grouped_values, chisq_values)

            for is_grouped, is_chisq in it:

                if is_chisq:
                    f = None
                    rw = None
                else:
                    f = fraction
                    rw = reweighting_iterations

                orf_length_line_graph = filenames.get_orf_length_distribution_line_graph(
                    config['riboseq_data'],
                    replicate_name,
                    length=lengths,
                    offset=offsets,
                    is_unique=is_unique,
                    note=out_note_str,
                    image_type=args.image_type,
                    fraction=f,
                    reweighting_iterations=rw,
                    is_grouped=is_grouped,
                    is_chisq=is_chisq)

                if os.path.exists(orf_length_line_graph):

                    if is_first or (i % 4 == 0):
                        latex.begin_figure(out)
                        is_first = False

                    i += 1
                    latex.write_graphics(out,
                                         orf_length_line_graph,
                                         height=0.15)

                    if i % 4 == 0:
                        latex.write_caption(out, caption)
                        latex.end_figure(out)
                        latex.clearpage(out)

                else:
                    msg = "Could not find image: {}".format(
                        orf_length_line_graph)
                    logger.debug(msg)

            if (i > 0) and (i % 4) != 0:
                latex.write_caption(out, caption)
                latex.end_figure(out)
                #latex.clearpage(out)

        if i % 4 != 0:
            latex.clearpage(out)

        ### ORF type metagene profiles
        if args.show_orf_periodicity:
            title = "Predicted ORF type metagene profiles"
            latex.section(out, title)

            i = 0
            for sample_name in sample_names:

                try:
                    lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                        config,
                        sample_name,
                        is_unique=is_unique,
                        default_params=metagene_options)
                except FileNotFoundError:
                    msg = (
                        "Could not parse out lengths and offsets for sample: {}. "
                        "Skipping".format(sample_name))
                    logger.error(msg)
                    continue

                caption = "ORF type metagene profiles: {}".format(sample_name)

                is_first = True

                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        sample_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:
                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        msg = "Looking for image file: {}".format(
                            orf_type_profile)
                        logger.debug(msg)
                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)

                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.warning(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

            i = 0
            for replicate_name in replicate_names:
                lengths = None
                offsets = None

                caption = "ORF type metagene profiles: {}".format(
                    replicate_name)
                is_first = True
                for is_chisq in chisq_values:

                    if is_chisq:
                        f = None
                        rw = None
                    else:
                        f = fraction
                        rw = reweighting_iterations

                    orf_type_profile_base = filenames.get_orf_type_profile_base(
                        config['riboseq_data'],
                        replicate_name,
                        length=lengths,
                        offset=offsets,
                        is_unique=is_unique,
                        note=out_note_str,
                        fraction=f,
                        reweighting_iterations=rw,
                        is_chisq=is_chisq)

                    it = itertools.product(ribo_utils.orf_types, strands)

                    for orf_type, strand in it:

                        orf_type_profile = filenames.get_orf_type_profile_image(
                            orf_type_profile_base, orf_type, strand,
                            args.image_type)

                        if os.path.exists(orf_type_profile):

                            if is_first or (i % 4 == 0):
                                latex.begin_figure(out)
                                is_first = False

                            i += 1
                            latex.write_graphics(out,
                                                 orf_type_profile,
                                                 height=0.23)

                            if i % 4 == 0:
                                latex.write_caption(out, caption)
                                latex.end_figure(out)
                                latex.clearpage(out)
                        else:
                            msg = "Could not find image: {}".format(
                                orf_type_profile)
                            logger.debug(msg)

                if (i > 0) and (i % 4 != 0):
                    latex.write_caption(out, caption)
                    latex.end_figure(out)
                    #latex.clearpage(out)

            if i % 4 != 0:
                latex.clearpage(out)

        latex.end_document(out)

    tex_filename = os.path.basename(tex_file)
    latex.compile(args.out, tex_filename)
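
# The report above repeats one "batch images into LaTeX figures" pattern many
# times. A minimal sketch of a helper that captures it (hypothetical, not part
# of the latex module), assuming latex.begin_figure, write_graphics,
# write_caption, end_figure and clearpage behave as they are used above:
def write_image_batches(out, image_files, caption, batch_size=6, height=0.15):
    """Write existing images in figures of at most `batch_size` images each."""
    i = 0
    for image_file in image_files:
        if not os.path.exists(image_file):
            logger.warning("Could not find image: {}".format(image_file))
            continue

        if i % batch_size == 0:
            latex.begin_figure(out)

        i += 1
        latex.write_graphics(out, image_file, height=height)

        if i % batch_size == 0:
            latex.write_caption(out, caption)
            latex.end_figure(out)
            latex.clearpage(out)

    # close a partially filled final figure
    if i % batch_size != 0:
        latex.write_caption(out, caption)
        latex.end_figure(out)
        latex.clearpage(out)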