# NOTE: the following examples are excerpted from the Rp-Bp pipeline scripts.
# Each `main` assumes the module-level imports and helpers of its original
# file (e.g., argparse, numpy as np, pandas as pd, and the project's bed_utils,
# fastx_utils, logging_utils, parallel, shell_utils and slurm modules, plus a
# module-level `logger`).

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""Convert featureCounts output to BED12 with exon-union coordinates at
        the meta-feature level.""")

    parser.add_argument('tsv', help="The featureCounts tsv file")
    parser.add_argument('out', help="The (output) BED12 file, compressed by default")

    parser.add_argument('-p', '--num-cpus', help="The number of CPUs to use",
        type=int, default=12)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading featureCount tsv file"
    logger.info(msg)
    
    tsv = pd.read_csv(args.tsv, 
                      usecols=['Geneid', 'Chr', 'Start', 'End', 'Strand', 'Length'], 
                      sep='\t', 
                      comment='#')
    
    msg = "Merging..."
    logger.info(msg)
    merged = parallel.apply_parallel(tsv, args.num_cpus, merge_gene_group)  
    merged = pd.DataFrame(merged)
    
    msg = "Sorting..."
    logger.info(msg)
    # We will break ties among transcripts by the order in which they appear
    # in the GTF file. This is the same way STAR breaks ties.
    merged = bed_utils.sort(merged)

    msg = "Writing BED12 to disk"
    logger.info(msg)
    
    # use a copy so we do not mutate the shared module-level field name list
    fields = bed_utils.bed12_field_names + ['length']
    bed_utils.write_bed(merged[fields], args.out)
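
# A minimal sketch of the `merge_gene_group` helper used above (hypothetical;
# the real implementation ships with this script). featureCounts reports each
# meta-feature as ';'-joined exon coordinates, which are collapsed here into a
# single BED12-style record with exon-union blocks; the field names are assumed
# to match bed_utils.bed12_field_names.
def merge_gene_group(row):
    chrom = row['Chr'].split(';')[0]
    strand = row['Strand'].split(';')[0]
    # featureCounts coordinates are 1-based, inclusive; BED is 0-based, half-open
    starts = [int(s) - 1 for s in row['Start'].split(';')]
    ends = [int(e) for e in row['End'].split(';')]

    # merge overlapping or bookended exons into union blocks
    exons = sorted(zip(starts, ends))
    blocks = [list(exons[0])]
    for s, e in exons[1:]:
        if s <= blocks[-1][1]:
            blocks[-1][1] = max(blocks[-1][1], e)
        else:
            blocks.append([s, e])

    start, end = blocks[0][0], blocks[-1][1]
    return {
        'seqname': chrom, 'start': start, 'end': end, 'id': row['Geneid'],
        'score': 0, 'strand': strand, 'thick_start': start, 'thick_end': end,
        'color': 0, 'num_exons': len(blocks),
        'exon_lengths': ','.join(str(e - s) for s, e in blocks),
        'exon_genomic_relative_starts': ','.join(str(s - start) for s, _ in blocks),
        'length': row['Length'],
    }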

# Example 2

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Label the ORFs based on their exon structure
        with respect to the annotated transcripts.''')

    parser.add_argument('annotated_transcripts',
                        help='''The annotated transcripts for the genome
        in BED12+ format.''')

    parser.add_argument('extracted_orfs',
                        help='''The ORFs extracted from the transcripts 
        in BED12+ format.''')

    parser.add_argument('out', help='''The output (BED12+.gz) file.''')

    parser.add_argument('-e',
                        '--annotated-exons',
                        help='''The annotated transcript 
        exons can be passed with this option. If they are not given, they will be 
        split from the annotated transcripts.''',
                        default=None)

    parser.add_argument('-o',
                        '--orf-exons',
                        help='''The exon blocks for the ORFs, in BED6+ format, 
        obtained from "split-bed12-blocks". If they are not given, they will be split from the
        extracted ORFs.''',
                        default=None)

    parser.add_argument('-n',
                        '--nonoverlapping-label',
                        help='''If this option is given, then ORFs
        which do not overlap the annotated transcripts at all are given this label, and
        the remaining (out-of-frame) overlapping ORFs are simply assigned the "overlap"
        label. If this option is not given, ORFs outside of annotated regions are
        eventually labeled as "suspect".''',
                        default=None)

    parser.add_argument('-l',
                        '--label-prefix',
                        help='''This string is prepended to all labels
        assigned to ORFs, e.g. to indicate ORFs from a de novo assembly (Rp-Bp assigns
        the label "novel" to these; the prefix is not prepended to the "canonical" label).''',
                        default='')

    parser.add_argument('-f',
                        '--filter',
                        help='''If this flag is given, then ORFs
        which are completely covered by an annotated transcript are discarded. Use to filter 
        uninteresting ORFs from a de novo assembly.''',
                        action='store_true')

    parser.add_argument('-p',
                        '--num-cpus',
                        help='''The number of CPUs to use to perform
            BED operations.''',
                        type=int,
                        default=default_num_cpus)

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    msg = "Reading annotated transcripts"
    logger.info(msg)
    annotated_transcripts = bed_utils.read_bed(args.annotated_transcripts)

    # get the annotated transcript exons
    if args.annotated_exons is None:
        msg = "Splitting the annotated transcripts into exon blocks"
        logger.info(msg)

        annotated_exons = bed_utils.split_bed12(annotated_transcripts,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)
    else:
        msg = "Reading the annotated transcript exons"
        logger.info(msg)

        annotated_exons = bed_utils.read_bed(args.annotated_exons)

    msg = "Reading extracted ORFs"
    logger.info(msg)
    extracted_orfs = bed_utils.read_bed(args.extracted_orfs)

    if args.orf_exons is None:
        msg = "Splitting the extracted ORFs into exon blocks"
        logger.info(msg)
        extracted_orf_exons = bed_utils.split_bed12(extracted_orfs,
                                                    num_cpus=args.num_cpus,
                                                    progress_bar=True)
    else:
        msg = "Reading the extracted ORFs exons"
        logger.info(msg)
        extracted_orf_exons = bed_utils.read_bed(args.orf_exons)

    msg = "Found {} extracted ORFs with {} exons".format(
        len(extracted_orfs), len(extracted_orf_exons))
    logger.debug(msg)

    # filter out the ORFs that are entirely within annotated transcripts
    if args.filter:
        msg = "Removing ORFs which are completely covered by the annotated transcripts"
        logger.info(msg)

        nonoverlapping_ids = bed_utils.subtract_bed(extracted_orf_exons,
                                                    annotated_exons,
                                                    min_a_overlap=1)
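        # subtract_bed presumably returns the ids of the records in the first
        # BED that survive the subtraction, i.e., those *not* completely
        # covered (min_a_overlap=1) by the annotated exons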
        m_unfiltered = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs = extracted_orfs[m_unfiltered]
        # discard the unnecessary exons
        m_unfiltered = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[m_unfiltered]

        msg = "After filtering, {} extracted ORFs remain".format(
            len(extracted_orfs))
        logger.info(msg)

    # label and remove the ORFs which do not overlap the annotations at all
    if args.nonoverlapping_label is not None:
        nonoverlapping_ids = bed_utils.subtract_bed(
            extracted_orfs,
            annotated_transcripts,
            exons_a=extracted_orf_exons,
            exons_b=annotated_exons)
        m_nonoverlapping = extracted_orf_exons['id'].isin(nonoverlapping_ids)
        extracted_orf_exons = extracted_orf_exons[~m_nonoverlapping]
        m_nonoverlapping = extracted_orfs['id'].isin(nonoverlapping_ids)
        extracted_orfs.loc[m_nonoverlapping,
                           'orf_type'] = args.nonoverlapping_label

        msg = ("Found {} ORFs completely non-overlapping annotated transcripts"
               .format(len(nonoverlapping_ids)))
        logger.info(msg)

    msg = "Removing the annotated UTRs from the transcripts"
    logger.info(msg)
    canonical_orfs = bed_utils.retain_all_thick_only(annotated_transcripts,
                                                     num_cpus=args.num_cpus)

    msg = "Splitting the canonical ORFs into exons"
    logger.info(msg)
    canonical_orf_exons = bed_utils.split_bed12(canonical_orfs,
                                                num_cpus=args.num_cpus,
                                                progress_bar=True)

    msg = "Extracting annotated 5' leader regions"
    logger.info(msg)
    five_prime_regions = bed_utils.retain_all_five_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(five_prime_regions) == 0:
        msg = "No annotated 5' leader regions were found"
        logger.warning(msg)

    msg = "Splitting the 5' leaders into exons"
    logger.info(msg)
    five_prime_exons = bed_utils.split_bed12(five_prime_regions,
                                             num_cpus=args.num_cpus,
                                             progress_bar=True)

    msg = "Extracting annotated 3' trailer regions"
    logger.info(msg)
    three_prime_regions = bed_utils.retain_all_three_prime_of_thick(
        annotated_transcripts, num_cpus=args.num_cpus)

    if len(three_prime_regions) == 0:
        msg = "No annotated 3' trailer regions were found"
        logger.warning(msg)

    msg = "Splitting the 3' trailers into exons"
    logger.info(msg)
    three_prime_exons = bed_utils.split_bed12(three_prime_regions,
                                              num_cpus=args.num_cpus,
                                              progress_bar=True)

    msg = "Splitting non-coding transcripts into exons"
    logger.info(msg)

    m_no_thick_start = annotated_transcripts['thick_start'] == -1
    m_no_thick_end = annotated_transcripts['thick_end'] == -1
    m_no_thick = m_no_thick_start & m_no_thick_end
    noncoding_transcripts = annotated_transcripts[m_no_thick]
    noncoding_exons = bed_utils.split_bed12(noncoding_transcripts,
                                            num_cpus=args.num_cpus,
                                            progress_bar=True)

    # First, label and remove all in-frame ORFs (canonical, canonical variants),
    # then the "within" and out-of-frame ORFs

    msg = "Marking canonical and extracted ORFs with the same stop codon"
    logger.info(msg)

    # first, add the "true" ORF end: for reverse-strand ORFs, the biological
    # end (the stop codon) is at the genomic start
    m_reverse_canonical = canonical_orfs['strand'] == '-'
    canonical_orfs['orf_end'] = canonical_orfs['end']
    canonical_orfs.loc[m_reverse_canonical,
                       'orf_end'] = canonical_orfs.loc[m_reverse_canonical,
                                                       'start']

    m_reverse_extracted = extracted_orfs['strand'] == '-'
    extracted_orfs['orf_end'] = extracted_orfs['end']
    extracted_orfs.loc[m_reverse_extracted,
                       'orf_end'] = extracted_orfs.loc[m_reverse_extracted,
                                                       'start']

    # then, find extracted ORFs with the same "orf_end" (and seqname, strand) as canonical ORFs
    merge_fields = ['seqname', 'strand', 'orf_end']
    canonical_extracted_orf_ends = canonical_orfs.merge(
        extracted_orfs, on=merge_fields, suffixes=['_canonical', '_extracted'])

    # finally, pull these pairs into a set
    zip_it = zip(canonical_extracted_orf_ends['id_canonical'],
                 canonical_extracted_orf_ends['id_extracted'])
    canonical_extracted_matching_ends = set(zip_it)

    msg = "Finding ORFs which exactly overlap the canonical ORFs"
    logger.info(msg)

    exact_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                               extracted_orf_exons,
                                               min_a_overlap=1,
                                               min_b_overlap=1)
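    # get_bed_overlaps presumably returns match records whose a_info and b_info
    # fields hold the ids of the overlapping A and B entries; requiring both
    # min_a_overlap=1 and min_b_overlap=1 keeps only exact structural matches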

    exact_match_orf_ids = {m.b_info for m in exact_matches}

    m_exact_orf_matches = extracted_orf_exons['id'].isin(exact_match_orf_ids)
    extracted_orf_exons = extracted_orf_exons[~m_exact_orf_matches]

    m_canonical = extracted_orfs['id'].isin(exact_match_orf_ids)
    label = 'canonical'
    extracted_orfs.loc[m_canonical, 'orf_type'] = label

    msg = "Found {} canonical ORFs".format(len(exact_match_orf_ids))
    logger.info(msg)

    msg = "Finding truncated canonical ORFs"
    logger.info(msg)

    truncated_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    truncated_match_ids = {
        m.b_info
        for m in truncated_matches
        if (m.a_info, m.b_info) in canonical_extracted_matching_ends
    }

    m_truncated_matches = extracted_orf_exons['id'].isin(truncated_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_truncated_matches]

    m_canonical_truncated = extracted_orfs['id'].isin(truncated_match_ids)

    msg = "Finding extended canonical ORFs"
    logger.info(msg)

    extended_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                  extracted_orf_exons,
                                                  min_a_overlap=1)

    # For a standard assembly, we also need to make sure that
    # all extended matches are fully contained within the
    # transcript structure (i.e., they start upstream but otherwise
    # have the same structure).
    if args.nonoverlapping_label is None:

        transcript_matches = bed_utils.get_bed_overlaps(annotated_exons,
                                                        extracted_orf_exons,
                                                        min_b_overlap=1)
        transcript_match_pairs = {(m.a_info, m.b_info)
                                  for m in transcript_matches}

        extended_match_ids = {
            m.b_info
            for m in extended_matches
            if (m.a_info, m.b_info) in transcript_match_pairs and (
                m.a_info, m.b_info) in canonical_extracted_matching_ends
        }

    else:

        extended_match_ids = {
            m.b_info
            for m in extended_matches
            if (m.a_info, m.b_info) in canonical_extracted_matching_ends
        }

    m_extended_matches = extracted_orf_exons['id'].isin(extended_match_ids)
    extracted_orf_exons = extracted_orf_exons[~m_extended_matches]

    m_canonical_extended = extracted_orfs['id'].isin(extended_match_ids)
    m_canonical_variants = m_canonical_truncated | m_canonical_extended

    label = "{}canonical_variant".format(args.label_prefix)
    extracted_orfs.loc[m_canonical_variants, 'orf_type'] = label

    msg = "Found {} canonical_variant ORFs".\
          format(len(extended_match_ids | truncated_match_ids))
    logger.info(msg)

    msg = ("Finding within canonical ORFs that do not share an "
           "annotated stop codon with a canonical ORF (e.g. in "
           "frame stop, out-of-frame)")
    logger.info(msg)

    within_ids = {
        m.b_info
        for m in truncated_matches if m.b_info not in truncated_match_ids
    }

    m_within_matches = extracted_orf_exons['id'].isin(within_ids)
    extracted_orf_exons = extracted_orf_exons[~m_within_matches]

    m_within = extracted_orfs['id'].isin(within_ids)
    label = "{}within".format(args.label_prefix)
    extracted_orfs.loc[m_within, 'orf_type'] = label

    msg = "Found {} within ORFs".format(len(within_ids))
    logger.info(msg)

    # find all overlapping ORFs
    msg = "Finding all UTR overlap matches"
    logger.info(msg)
    out_of_frame_matches = bed_utils.get_bed_overlaps(canonical_orf_exons,
                                                      extracted_orf_exons)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons)

    msg = ("Labeling ORFs which have (out-of-frame) overlaps with both a "
           "canonical ORF and annotated leaders or trailers")
    logger.info(msg)

    # We need to choose how to ensure that up-/downstream overlaps are unique.
    # Where an ORF overlaps both the 5'UTR and the 3'UTR of different same-sense
    # overlapping transcripts, it is assigned to the downstream overlap by default.
    # For a de novo assembly, everything is labeled as "overlap".

    leader_match_pairs = {(m.a_info, m.b_info) for m in leader_matches}
    trailer_match_pairs = {(m.a_info, m.b_info) for m in trailer_matches}

    if args.nonoverlapping_label is None:

        # For standard assembly, we also need to make sure that
        # all overlap matches are fully contained within the
        # transcript structure.
        transcript_matches = bed_utils.get_bed_overlaps(annotated_exons,
                                                        extracted_orf_exons,
                                                        min_b_overlap=1)

        transcript_match_pairs = {(m.a_info, m.b_info)
                                  for m in transcript_matches}

        leader_overlap_pairs = {
            (m.a_info, m.b_info)
            for m in out_of_frame_matches
            if (m.a_info, m.b_info) in leader_match_pairs and (
                m.a_info, m.b_info) not in trailer_match_pairs and (
                    m.a_info, m.b_info) in transcript_match_pairs
        }

        trailer_overlap_pairs = {
            (m.a_info, m.b_info)
            for m in out_of_frame_matches
            if (m.a_info, m.b_info) in trailer_match_pairs and (
                m.a_info, m.b_info) not in leader_match_pairs and (
                    m.a_info, m.b_info) in transcript_match_pairs
        }

        # We do not assign preference where the ORF overlaps both sides
        # of the coding sequence on the same transcript; any ORF
        # satisfying both is simply labeled "overlap".
        overlap_ids = {
            m.b_info
            for m in out_of_frame_matches
            if (m.a_info, m.b_info) in leader_match_pairs and (
                m.a_info, m.b_info) in trailer_match_pairs and (
                    m.a_info, m.b_info) in transcript_match_pairs
        }

        trailer_overlap_ids = {
            pair[1]
            for pair in trailer_overlap_pairs if pair[1] not in overlap_ids
        }

        leader_overlap_ids = {
            pair[1]
            for pair in leader_overlap_pairs if
            pair[1] not in trailer_overlap_ids and pair[1] not in overlap_ids
        }

        m_overlap_matches = extracted_orf_exons['id'].isin(overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_overlap_matches]

        m_leader_overlap_matches = extracted_orf_exons['id'].isin(
            leader_overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_leader_overlap_matches]

        m_five_prime_overlap = extracted_orfs['id'].isin(leader_overlap_ids)
        label = "{}five_prime_overlap".format(args.label_prefix)
        extracted_orfs.loc[m_five_prime_overlap, 'orf_type'] = label

        m_trailer_overlap_matches = extracted_orf_exons['id'].isin(
            trailer_overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_trailer_overlap_matches]

        m_three_prime_overlap = extracted_orfs['id'].isin(trailer_overlap_ids)
        label = "{}three_prime_overlap".format(args.label_prefix)
        extracted_orfs.loc[m_three_prime_overlap, 'orf_type'] = label

        msg = "Found {} five_prime_overlap ORFs".format(
            len(leader_overlap_ids))
        logger.info(msg)
        msg = "Found {} three_prime_overlap ORFs".format(
            len(trailer_overlap_ids))
        logger.info(msg)

    else:

        overlap_ids = {m.b_info for m in out_of_frame_matches}
        overlap_ids |= {m.b_info for m in leader_matches}
        overlap_ids |= {m.b_info for m in trailer_matches}

        m_overlap_matches = extracted_orf_exons['id'].isin(overlap_ids)
        extracted_orf_exons = extracted_orf_exons[~m_overlap_matches]

    m_overlap = extracted_orfs['id'].isin(overlap_ids)
    label = "{}overlap".format(args.label_prefix)
    extracted_orfs.loc[m_overlap, 'orf_type'] = label

    msg = "Found {} overlap ORFs".format(len(overlap_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within 5' or 3' leaders"
    logger.info(msg)

    leader_matches = bed_utils.get_bed_overlaps(five_prime_exons,
                                                extracted_orf_exons,
                                                min_b_overlap=1)

    leader_ids = {m.b_info for m in leader_matches}

    m_leader_matches = extracted_orf_exons['id'].isin(leader_ids)
    extracted_orf_exons = extracted_orf_exons[~m_leader_matches]

    m_five_prime = extracted_orfs['id'].isin(leader_ids)
    label = "{}five_prime".format(args.label_prefix)
    extracted_orfs.loc[m_five_prime, 'orf_type'] = label

    msg = "Found {} five_prime ORFs".format(len(leader_ids))
    logger.info(msg)

    trailer_matches = bed_utils.get_bed_overlaps(three_prime_exons,
                                                 extracted_orf_exons,
                                                 min_b_overlap=1)

    trailer_ids = {m.b_info for m in trailer_matches}

    m_trailer_matches = extracted_orf_exons['id'].isin(trailer_ids)
    extracted_orf_exons = extracted_orf_exons[~m_trailer_matches]

    m_three_prime = extracted_orfs['id'].isin(trailer_ids)
    label = "{}three_prime".format(args.label_prefix)
    extracted_orfs.loc[m_three_prime, 'orf_type'] = label

    msg = "Found {} three_prime ORFs".format(len(trailer_ids))
    logger.info(msg)

    msg = "Finding ORFs completely within annotated, non-coding transcripts"
    logger.info(msg)

    noncoding_matches = bed_utils.get_bed_overlaps(noncoding_exons,
                                                   extracted_orf_exons,
                                                   min_b_overlap=1)

    noncoding_ids = {m.b_info for m in noncoding_matches}

    m_noncoding_matches = extracted_orf_exons['id'].isin(noncoding_ids)
    extracted_orf_exons = extracted_orf_exons[~m_noncoding_matches]

    m_noncoding = extracted_orfs['id'].isin(noncoding_ids)
    label = "{}noncoding".format(args.label_prefix)
    extracted_orfs.loc[m_noncoding, 'orf_type'] = label

    msg = "Found {} noncoding ORFs".format(len(noncoding_ids))
    logger.info(msg)

    # all of the remaining ORFs fall into the "suspect" category
    suspect_ids = set(extracted_orf_exons['id'])

    m_suspect = extracted_orfs['id'].isin(suspect_ids)
    label = "{}suspect".format(args.label_prefix)
    extracted_orfs.loc[m_suspect, 'orf_type'] = label

    n_suspect_ids = len(suspect_ids)
    msg = "Remaining {} ORFs labeled as suspect".format(n_suspect_ids)
    logger.info(msg)

    m_no_orf_type = extracted_orfs['orf_type'].isnull()
    msg = "Found {} unlabeled ORFs".format(sum(m_no_orf_type))
    logger.info(msg)

    msg = "Writing ORFs with labels to disk"
    logger.info(msg)

    extracted_orfs = bed_utils.sort(extracted_orfs)

    msg = ("The ORF labels will be written to {} in the next major release.".
           format(args.out))
    logger.warning(msg)

    additional_columns = ['orf_num', 'orf_len', 'orf_type']
    fields = bed_utils.bed12_field_names + additional_columns
    orfs_genomic = extracted_orfs[fields]
    # the ORF types are written back into the extracted ORFs file
    bed_utils.write_bed(orfs_genomic, args.extracted_orfs)

    label_columns = ['id', 'duplicates', 'orf_type']
    extracted_orfs = extracted_orfs[label_columns]
    bed_utils.write_bed(extracted_orfs, args.out)

# Example 3

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Prepare a reference genome and matching
        annotations, including labeled ORFs, for use with the Rp-Bp periodicity estimation
        and ORF translation prediction pipeline.''')

    parser.add_argument('config', help='''The (yaml) configuration file''')

    parser.add_argument('--overwrite',
                        help='''If this flag is present, existing files
        will be overwritten.''',
                        action='store_true')

    slurm.add_sbatch_options(parser,
                             num_cpus=default_num_cpus,
                             mem=default_mem)
    logging_utils.add_logging_options(parser)
    pgrm_utils.add_star_options(parser, star_executable)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    with open(args.config) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    # check required callable programs, config keys and files
    programs = [
        'extract-orf-coordinates', 'label-orfs', 'bowtie2-build-s',
        'split-bed12-blocks', 'gtf-to-bed12', args.star_executable
    ]
    shell_utils.check_programs_exist(programs)

    required_keys = [
        'genome_base_path', 'genome_name', 'gtf', 'fasta', 'ribosomal_fasta',
        'ribosomal_index', 'star_index'
    ]
    utils.check_keys_exist(config, required_keys)

    files = [config['gtf'], config['fasta'], config['ribosomal_fasta']]
    if 'de_novo_gtf' in config:
        files += [config['de_novo_gtf']]
    utils.check_files_exist(files, source='prepare-rpbp-genome')

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    call = not args.do_not_call

    # the rRNA index
    cmd = "bowtie2-build-s {} {}".format(config['ribosomal_fasta'],
                                         config['ribosomal_index'])

    in_files = [config['ribosomal_fasta']]
    out_files = pgrm_utils.get_bowtie2_index_files(config['ribosomal_index'])
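    # call_if_not_exists presumably runs the command only when the output
    # files do not already exist (or --overwrite is given)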
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # the STAR index
    mem = utils.human2bytes(args.mem)
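    # human2bytes presumably converts strings like "32G" into a byte count
    # (e.g., 34359738368, assuming 1024-based units) for --limitGenomeGenerateRAM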
    cmd = ("{} --runMode genomeGenerate --genomeDir {} --genomeFastaFiles {} "
           "--runThreadN {} --limitGenomeGenerateRAM {}".format(
               args.star_executable, config['star_index'], config['fasta'],
               args.num_cpus, mem))

    in_files = [config['fasta']]
    out_files = pgrm_utils.get_star_index_files(config['star_index'])
    shell_utils.call_if_not_exists(cmd,
                                   out_files,
                                   in_files=in_files,
                                   overwrite=args.overwrite,
                                   call=call)

    # get the ORFs
    get_orfs(config['gtf'], args, config, is_annotated=True, is_de_novo=False)

    # we will use these files later in the pipeline
    annotated_orfs = filenames.get_orfs(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'),
                                        is_annotated=True,
                                        is_de_novo=False)

    orfs_genomic = filenames.get_orfs(config['genome_base_path'],
                                      config['genome_name'],
                                      note=config.get('orf_note'))

    annotated_exons_file = filenames.get_exons(config['genome_base_path'],
                                               config['genome_name'],
                                               note=config.get('orf_note'),
                                               is_annotated=True,
                                               is_de_novo=False)

    exons_file = filenames.get_exons(config['genome_base_path'],
                                     config['genome_name'],
                                     note=config.get('orf_note'))

    annotated_labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                                  config['genome_name'],
                                                  note=config.get('orf_note'),
                                                  is_annotated=True,
                                                  is_de_novo=False)

    labeled_orfs = filenames.get_labels(config['genome_base_path'],
                                        config['genome_name'],
                                        note=config.get('orf_note'))

    use_gff3_specs = config['gtf'].endswith('gff')
    gtf_file = filenames.get_gtf(config['genome_base_path'],
                                 config['genome_name'],
                                 is_gff3=use_gff3_specs,
                                 is_star_input=True)

    # now, check if we have a de novo assembly
    if 'de_novo_gtf' in config:
        get_orfs(config['de_novo_gtf'],
                 args,
                 config,
                 is_annotated=False,
                 is_de_novo=True)

        # we need to concat the ORF and exon files
        de_novo_orfs = filenames.get_orfs(config['genome_base_path'],
                                          config['genome_name'],
                                          note=config.get('orf_note'),
                                          is_annotated=False,
                                          is_de_novo=True)

        orfs_files = [annotated_orfs, de_novo_orfs]

        orfs_files_str = ' '.join(orfs_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            orfs_genomic, orfs_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(orfs_files, sort_bed=True)
            concatenated_bed['orf_num'] = range(len(concatenated_bed))
            additional_columns = ['orf_num', 'orf_len', 'orf_type']
            fields = bed_utils.bed12_field_names + additional_columns
            bed_utils.write_bed(concatenated_bed[fields], orfs_genomic)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_exons_file = filenames.get_exons(config['genome_base_path'],
                                                 config['genome_name'],
                                                 note=config.get('orf_note'),
                                                 is_annotated=False,
                                                 is_de_novo=True)

        exons_files = [annotated_exons_file, de_novo_exons_file]

        exons_files_str = ' '.join(exons_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            exons_file, exons_files_str))
        logger.info(msg)

        if call:
            concatenated_bed = bed_utils.concatenate(exons_files,
                                                     sort_bed=True)
            fields = bed_utils.bed6_field_names + [
                'exon_index', 'transcript_start'
            ]
            bed_utils.write_bed(concatenated_bed[fields], exons_file)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        de_novo_labeled_orfs = filenames.get_labels(
            config['genome_base_path'],
            config['genome_name'],
            note=config.get('orf_note'),
            is_annotated=False,
            is_de_novo=True)

        label_files = [annotated_labeled_orfs, de_novo_labeled_orfs]

        label_files_str = ' '.join(label_files)
        msg = ("Concatenating files. Output file: {}; Input files: {}".format(
            labeled_orfs, label_files_str))
        logger.info(msg)

        if call:
            # not sorted, as is
            concatenated_bed = bed_utils.concatenate(label_files,
                                                     sort_bed=False)
            bed_utils.write_bed(concatenated_bed, labeled_orfs)
        else:
            msg = "Skipping concatenation due to --call value"
            logger.info(msg)

        # we also need to concat the annotations to inform STAR
        # there is no particular reason to merge and sort the files, so
        # we just concatenate them...
        if (config['de_novo_gtf'].endswith('gff') == use_gff3_specs):
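            # awk '!/^#/' prints every non-comment line, i.e., it concatenates
            # the two annotation files while dropping header lines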
            cmd = ("awk '!/^#/' {} {} > {}".format(config['gtf'],
                                                   config['de_novo_gtf'],
                                                   gtf_file))
            in_files = [config['gtf'], config['de_novo_gtf']]
            out_files = [gtf_file]
            shell_utils.call_if_not_exists(cmd,
                                           out_files,
                                           in_files=in_files,
                                           overwrite=args.overwrite,
                                           call=call)
        else:
            msg = (
                "Skipping concatenation due to a mismatch in format specifications "
                "(GTF2/GFF3) for the reference and de novo annotations. A symlink to "
                "the reference annotations will be created."
            )
            logger.warning(msg)
            if os.path.exists(config['gtf']):
                shell_utils.create_symlink(config['gtf'], gtf_file, call)

    else:
        # if we do not have a de novo assembly, symlink the files

        if os.path.exists(annotated_orfs):
            shell_utils.create_symlink(annotated_orfs, orfs_genomic, call)

        if os.path.exists(annotated_exons_file):
            shell_utils.create_symlink(annotated_exons_file, exons_file, call)

        if os.path.exists(annotated_labeled_orfs):
            shell_utils.create_symlink(annotated_labeled_orfs, labeled_orfs,
                                       call)

        if os.path.exists(config['gtf']):
            shell_utils.create_symlink(config['gtf'], gtf_file, call)
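
# Example 4
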
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                     description="""Given a list of ORFs with associated Bayes 
        factors and a fasta sequence file, this script extracts the sequences of the ORFs whose 
        Bayes factor exceeds the given threshold. Finally, biopython is used to translate the 
        selected ORFs into protein sequences. The min-length and minimum-profile-sum filters 
        are applied in the obvious way. For both BF and chi-square predictions, only ORFs 
        which have more reads in the first reading frame than either of the other two will 
        be selected as translated. (This is called the 'frame filter' below.) The selection 
        based on Bayes factors follows this logic: if max_bf_var is given, then it and 
        min_bf_mean are taken as a hard threshold on the estimated Bayes factor mean. 
        If min_bf_likelihood is given, then this min_bf_mean is taken as the boundary value; 
        that is, an ORF is 'translated' if: [P(bf > min_bf_mean)] > min_bf_likelihood.
        If both max_bf_var and min_bf_likelihood are None, then min_bf_mean is taken as a
        hard threshold on the mean for selecting translated ORFs. If both max_bf_var and 
        min_bf_likelihood are given, then both filters will be applied and the result will 
        be the intersection. If the --use-chi-square option is given, the significance value is
        Bonferroni-corrected based on the number of ORFs which meet the length, profile
        and frame filters.""")

    parser.add_argument('bayes_factors', help="""The file containing the ORFs and Bayes'
        factors (BED12+).""")

    parser.add_argument('fasta', help="The *genome* fasta file")

    parser.add_argument('predicted_orfs', help="""The (output) BED12+ file containing
        the predicted ORFs.""")

    parser.add_argument('predicted_dna_sequences', help="""The (output) fasta file 
        containing the predicted ORF sequences, as DNA sequences.""")

    parser.add_argument('predicted_protein_sequences', help="""The (output) fasta file 
        containing the predicted ORF sequences, as protein sequences.""")

    parser.add_argument('--select-longest-by-stop', help="""If this flag is given, then
        the selected ORFs will be merged based on stop codons. In particular, only the
        longest translated ORF at each stop codon will be selected.""", action='store_true')

    parser.add_argument('--select-best-overlapping', help="""If this flag is given, then
        only the ORF with the highest estimated Bayes factor will be kept among each
        set of overlapping ORFs. N.B. This filter is applied *AFTER* selecting the
        longest ORF at each stop codon, if the --select-longest-by-stop flag is given.""",
                        action='store_true')

    parser.add_argument('--min-length', help="The minimum length to predict an ORF as translated",
                        type=int, default=translation_options['orf_min_length'])
    
    parser.add_argument('--min-bf-mean', help="""The minimum Bayes' factor mean to predict
        an ORF as translated (use --help for more details)""",
                        type=float, default=translation_options['min_bf_mean'])

    parser.add_argument('--max-bf-var', help="""The maximum Bayes' factor variance to predict
        an ORF as translated (use --help for more details).""",
                        type=float, default=translation_options['max_bf_var'])

    parser.add_argument('--min-bf-likelihood', help="""If given, then this is taken as a
        threshold on the likelihood of translation (use --help for more details).""",
                        type=float, default=translation_options['min_bf_likelihood'])

    parser.add_argument('--min-profile', help="""The minimum sum across all reading frames to consider
        an ORF as translated""", type=float, default=translation_options['orf_min_profile_count'])
   
    parser.add_argument('--chi-square-only', help="""If this flag is present, then the
        chi square value will be used to predict ORFs rather than the Bayes' factor.""",
                        action='store_true')

    parser.add_argument('--chisq-significance-level', help="""If using chi square, then this
        value is Bonferroni corrected and used as the significance cutoff, else it is ignored.""",
                        type=float, default=translation_options['chisq_alpha'])

    parser.add_argument('--filtered-orf-types', help="""A list of ORF types which will be
        removed before selecting the final prediction set.""", nargs='*', default=[])

    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # first, extract all of the predictions which exceed the threshold
    msg = "Reading Bayes factor information"
    logger.info(msg)
    
    bayes_factors = bed_utils.read_bed(args.bayes_factors)

    if len(args.filtered_orf_types) > 0:
        filtered_orf_types_str = ','.join(args.filtered_orf_types)
        msg = "Filtering these ORF types: {}".format(filtered_orf_types_str)
        logger.info(msg)

        m_orf_types = bayes_factors['orf_type'].isin(args.filtered_orf_types)
        bayes_factors = bayes_factors[~m_orf_types]

    msg = "Identifying ORFs which meet the prediction thresholds"
    logger.info(msg)

    all_orfs, predicted_orfs = ribo_utils.get_predicted_orfs(
        bayes_factors,
        min_signal=args.min_profile,
        min_length=args.min_length,
        min_bf_mean=args.min_bf_mean, 
        max_bf_var=args.max_bf_var, 
        min_bf_likelihood=args.min_bf_likelihood,
        chisq_alpha=args.chisq_significance_level,
        select_longest_by_stop=args.select_longest_by_stop,
        use_chi_square=args.chi_square_only
    )
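
    # A comment-only sketch of the docstring's selection logic (the real logic
    # lives in ribo_utils.get_predicted_orfs; the column names are assumptions):
    #
    #   m_translated = all_orfs['bayes_factor_mean'] > args.min_bf_mean
    #   if args.max_bf_var is not None:
    #       m_translated &= all_orfs['bayes_factor_var'] < args.max_bf_var
    #   if args.min_bf_likelihood is not None:
    #       # P(bf > min_bf_mean), estimated from the posterior samples
    #       m_translated &= likelihood > args.min_bf_likelihood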

    msg = "Number of selected ORFs: {}".format(len(predicted_orfs))
    logger.info(msg)

    if args.select_best_overlapping:

        msg = "Finding overlapping ORFs"
        logger.info(msg)

        merged_intervals = bed_utils.merge_all_intervals(predicted_orfs)

        msg = "Selecting best among overlapping ORFs"
        logger.info(msg)

        predicted_orfs = parallel.apply_iter_simple(
            merged_intervals['merged_ids'], 
            get_best_overlapping_orf, 
            predicted_orfs, 
            progress_bar=True
        )

        predicted_orfs = pd.DataFrame(predicted_orfs)

    msg = "Sorting selected ORFs"
    logger.info(msg)

    predicted_orfs = bed_utils.sort(predicted_orfs)

    msg = "Writing selected ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(predicted_orfs, args.predicted_orfs)

    # now get the sequences
    msg = "Extracting predicted ORFs DNA sequence"
    logger.info(msg)

    split_exons = True
    transcript_sequences = bed_utils.get_all_bed_sequences(
        predicted_orfs, 
        args.fasta, 
        split_exons
    )

    fastx_utils.write_fasta(transcript_sequences,
                            args.predicted_dna_sequences,
                            compress=False)

    # translate the remaining ORFs into protein sequences
    msg = "Converting predicted ORF sequences to amino acids"
    logger.info(msg)

    records = fastx_utils.get_read_iterator(args.predicted_dna_sequences)
    protein_records = {
        r[0]: Bio.Seq.translate(r[1]) for r in records
    }
    
    fastx_utils.write_fasta(
        protein_records.items(), 
        args.predicted_protein_sequences, 
        compress=False
    )
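
# Bio.Seq.translate also accepts plain strings, so the dictionary comprehension
# above maps each record id to its amino-acid sequence, e.g.:
#
#   >>> import Bio.Seq
#   >>> Bio.Seq.translate("ATGGCCTAA")
#   'MA*'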

# Example 5

def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='''Extract the ORFs from the given transcripts and
        write them as a BED12+ file. Additional fields, 'orf_len' and 'orf_num', give the length
        of each ORF and its index (used to write the ORF profiles). A third additional field
        records duplicate ORFs arising from transcript variants.''')

    parser.add_argument('transcripts_bed',
                        help='''The BED12 file containing the 
        transcript information.''')

    parser.add_argument('transcripts_fasta',
                        help='''The fasta file containing the 
        spliced transcript sequences.''')

    parser.add_argument('out', help='''The output (BED12+ gz) file.''')

    parser.add_argument('--start-codons',
                        help='''A list of codons which will be treated 
        as start codons when extracting the ORFs.''',
                        nargs='+',
                        default=default_start_codons)

    parser.add_argument('--stop-codons',
                        help='''A list of codons which will be treated 
        as stop codons when extracting the ORFs.''',
                        nargs='+',
                        default=default_stop_codons)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    # check if we want to use slurm
    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    msg = "Compiling start and stop codon regular expressions"
    logger.info(msg)

    start_codons_re = '|'.join(args.start_codons)
    stop_codons_re = '|'.join(args.stop_codons)

    start_codons_re = re.compile(start_codons_re)
    stop_codons_re = re.compile(stop_codons_re)
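    # with the canonical genetic code, these presumably compile to
    # re.compile('ATG') and re.compile('TAA|TAG|TGA')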

    msg = "Reading transcripts bed file"
    logger.info(msg)
    transcripts_bed = bed_utils.read_bed(args.transcripts_bed)

    msg = "Creating the sequence iterator"
    logger.info(msg)

    transcripts_fasta = fastx_utils.get_read_iterator(args.transcripts_fasta)

    transcripts_iter = ((get_transcript(transcript_header,
                                        transcripts_bed), transcript_sequence)
                        for (transcript_header,
                             transcript_sequence) in transcripts_fasta)

    msg = "Finding all ORFs"
    logger.info(msg)

    orfs = parallel.apply_parallel_iter(transcripts_iter,
                                        args.num_cpus,
                                        get_orfs,
                                        start_codons_re,
                                        stop_codons_re,
                                        total=len(transcripts_bed),
                                        progress_bar=True)

    msg = "Joining ORFs in a large data frame"
    logger.info(msg)

    orfs = pd.concat(orfs)
    orfs.reset_index(drop=True, inplace=True)

    #  Duplicate ORFs (identical coordinates reached via different transcript
    #  variants) are collapsed; the choice of which copy to keep is arbitrary,
    #  but all matching transcripts are retained in the 'duplicates' field
    msg = "Marking and removing duplicate ORFs"
    logger.info(msg)

    groupby_duplicates = orfs.groupby(DUPLICATE_FIELDS,
                                      as_index=False).agg({'id': ','.join})
    orfs = orfs.merge(groupby_duplicates, how='left', on=DUPLICATE_FIELDS)
    orfs.drop_duplicates(subset=DUPLICATE_FIELDS, inplace=True, keep='first')
    orfs.rename(columns={'id_x': 'id', 'id_y': 'duplicates'}, inplace=True)

    msg = "Numbering remaining ORFs"
    logger.info(msg)

    orfs['orf_num'] = np.arange(len(orfs))

    msg = "Writing ORFs to disk"
    logger.info(msg)
    bed_utils.write_bed(orfs, args.out)
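
# A minimal sketch of the per-transcript scan that a `get_orfs`-style helper
# might perform (hypothetical; the real helper also maps transcript coordinates
# back to the genome). For each start codon, the next in-frame stop codon
# closes the ORF:
def find_orf_spans(sequence, start_codons_re, stop_codons_re):
    # note: for codon sets whose members can overlap in the sequence, a
    # lookahead pattern such as '(?=(ATG|CTG))' would be needed instead
    starts = [m.start() for m in start_codons_re.finditer(sequence)]
    stops = sorted(m.start() for m in stop_codons_re.finditer(sequence))

    orf_spans = []
    for start in starts:
        # the first downstream stop codon in the same reading frame
        in_frame_stops = (s for s in stops if s > start and (s - start) % 3 == 0)
        stop = next(in_frame_stops, None)
        if stop is not None:
            # include the stop codon itself in the span
            orf_spans.append((start, stop + 3))
    return orf_spans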

# Example 6

def main():
    global profiles_data, profiles_indices, profiles_indptr, profiles_shape
    global translated_models, untranslated_models
    global args

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="""This script uses Hamiltonian MCMC with Stan 
        to estimate translation parameters for a set of regions (presumably ORFs). Roughly, it takes 
        as input: (1) a set of regions (ORFs) and their corresponding profiles
                  (2) a "translated" model which gives the probability that a region is translated
                  (3) an "untranslated" model which gives the probability that a region is not translated.
        The script first smoothes the profiles using LOWESS. It then calculates both the Bayes' factor 
        (using the smoothed profile) and chi2 value (using the raw counts) for each ORF."""
    )

    parser.add_argument('profiles', help="The ORF profiles (counts) (mtx)")

    parser.add_argument(
        'regions',
        help="The regions (ORFs) for which predictions will be made (BED12+)")

    parser.add_argument('out',
                        help="The output file for the Bayes' factors (BED12+)")

    parser.add_argument('--chi-square-only',
                        help="""If this flag is present, then only the chi
        square test will be performed for each ORF. This can also be a way to get the counts within 
        each of the ORFs.""",
                        action='store_true')

    parser.add_argument('--translated-models',
                        help="The models to use as H_t (pkl)",
                        nargs='+')

    parser.add_argument('--untranslated-models',
                        help="The models to use as H_u (pkl)",
                        nargs='+')

    # filtering options
    parser.add_argument(
        '--orf-types',
        help="If values are given, then only ORFs with those types are processed.",
        nargs='*',
        default=translation_options['orf_types'])

    parser.add_argument('--orf-type-field', default=default_orf_type_field)

    parser.add_argument(
        '--min-length',
        help="ORFs with length less than this value will not be processed",
        type=int,
        default=translation_options['orf_min_length_pre'])

    parser.add_argument(
        '--max-length',
        help="ORFs with length greater than this value will not be processed",
        type=int,
        default=translation_options['orf_max_length_pre'])

    parser.add_argument(
        '--min-profile',
        help="""ORFs with profile sum (i.e., number of reads) less than this
        value will not be processed.""",
        type=float,
        default=translation_options['orf_min_profile_count_pre'])

    # smoothing options
    parser.add_argument('--fraction',
                        help="The fraction of signal to use in LOWESS",
                        type=float,
                        default=translation_options['smoothing_fraction'])

    parser.add_argument(
        '--reweighting-iterations',
        help="The number of reweighting "
        "iterations to use in LOWESS. "
        "Please see the statsmodels documentation for a "
        "detailed description of this parameter.",
        type=int,
        default=translation_options['smoothing_reweighting_iterations'])

    # MCMC options
    parser.add_argument('-s',
                        '--seed',
                        help="The random seeds to use for inference",
                        type=int,
                        default=translation_options['seed'])
    parser.add_argument('-c',
                        '--chains',
                        help="The number of MCMC chains to use",
                        type=int,
                        default=translation_options['chains'])
    parser.add_argument(
        '-i',
        '--iterations',
        help="The number of MCMC iterations to use for each chain",
        type=int,
        default=translation_options['translation_iterations'])

    # behavior options
    parser.add_argument(
        '--num-orfs',
        help="If n>0, then only this many ORFs will be processed",
        type=int,
        default=0)

    parser.add_argument('--orf-num-field', default=default_orf_num_field)

    parser.add_argument('--do-not-compress',
                        help="If this flag is given, the output will not be "
                        "compressed; otherwise it is written in gzip format",
                        action='store_true')

    parser.add_argument('-g',
                        '--num-groups',
                        help="The number of groups into which to split "
                        "the ORFs. More groups means the progress bar is "
                        "updated more frequently but incurs more overhead "
                        "because of the parallel calls.",
                        type=int,
                        default=default_num_groups)

    slurm.add_sbatch_options(parser)
    logging_utils.add_logging_options(parser)
    args = parser.parse_args()
    logging_utils.update_logging(args)

    if args.use_slurm:
        cmd = ' '.join(sys.argv)
        slurm.check_sbatch(cmd, args=args)
        return

    # read in the regions and apply the filters
    msg = "Reading and filtering ORFs"
    logger.info(msg)
    regions = bed_utils.read_bed(args.regions)

    # by default, keep everything
    m_filters = np.array([True] * len(regions))

    if len(args.orf_types) > 0:
        m_orf_type = regions[args.orf_type_field].isin(args.orf_types)
        m_filters = m_orf_type & m_filters

    # min length
    if args.min_length > 0:
        m_min_length = regions['orf_len'] >= args.min_length
        m_filters = m_min_length & m_filters

    # max length
    if args.max_length > 0:
        m_max_length = regions['orf_len'] <= args.max_length
        m_filters = m_max_length & m_filters

    # min profile
    profiles = scipy.io.mmread(args.profiles).tocsr()
    profiles_sums = profiles.sum(axis=1)
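    # profiles.sum(axis=1) returns an (n, 1) matrix, so np.where yields
    # (row, col) index arrays; the row indices are the ORF numbers to keep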
    good_orf_nums = np.where(profiles_sums >= args.min_profile)
    good_orf_nums = set(good_orf_nums[0])
    m_profile = regions['orf_num'].isin(good_orf_nums)
    m_filters = m_profile & m_filters

    regions = regions[m_filters]

    if args.num_orfs > 0:
        regions = regions.head(args.num_orfs)

    regions = regions.reset_index(drop=True)

    msg = "Number of regions after filtering: {}".format(len(regions))
    logger.info(msg)

    logger.debug("Reading models")
    translated_models = [
        pickle.load(open(tm, 'rb')) for tm in args.translated_models
    ]
    untranslated_models = [
        pickle.load(open(bm, 'rb')) for bm in args.untranslated_models
    ]

    profiles_data = multiprocessing.RawArray(ctypes.c_double,
                                             profiles.data.flat)
    profiles_indices = multiprocessing.RawArray(ctypes.c_int, profiles.indices)
    profiles_indptr = multiprocessing.RawArray(ctypes.c_int, profiles.indptr)
    profiles_shape = multiprocessing.RawArray(ctypes.c_int, profiles.shape)
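
    # In each worker, the shared profiles can presumably be rebuilt without
    # copying along these lines (dtypes must match the RawArray ctypes):
    #
    #   data = np.frombuffer(profiles_data)                      # float64
    #   indices = np.frombuffer(profiles_indices, dtype=np.int32)
    #   indptr = np.frombuffer(profiles_indptr, dtype=np.int32)
    #   shape = np.frombuffer(profiles_shape, dtype=np.int32)
    #   profiles = scipy.sparse.csr_matrix((data, indices, indptr), shape=shape)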

    with suppress_stdout_stderr():

        bfs_l = parallel.apply_parallel_split(regions,
                                              args.num_cpus,
                                              get_all_bayes_factors_args,
                                              num_groups=args.num_groups,
                                              progress_bar=True,
                                              backend='multiprocessing')

    bfs = pd.concat(bfs_l)

    # write the results as a bed12+ file
    bed_utils.write_bed(bfs, args.out)