Example #1
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script rejoins annotations which were split with the "
        "split-long-chromosomes script. Importantly, the \"max-size\" parameter here "
        "must match the \"max-parameter\" used for that script.")

    parser.add_argument(
        'bed',
        help="A BED file using the annotations from a GTF file "
        "created by split-long-chromosomes")
    parser.add_argument(
        'out', help="A BED file in which the chromosomes have been rejoined")
    parser.add_argument('--max-size',
                        help="The largest allowed size (in bp) for a "
                        "chromosome",
                        type=int,
                        default=default_max_size)

    parser.add_argument('--num-procs',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_procs)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading BED"
    logging.info(msg)

    bed = bio.read_bed(args.bed)

    msg = "Updating BED coordinates"
    logging.info(msg)

    original_coordinates = parallel.apply_parallel(bed,
                                                   args.num_procs,
                                                   get_original_coordinates,
                                                   args.max_size,
                                                   progress_bar=True)

    original_coordinates_df = pd.DataFrame(original_coordinates)

    bed['seqname'] = original_coordinates_df['seqname']
    bed['start'] = original_coordinates_df['start'].astype(int)
    bed['end'] = original_coordinates_df['end'].astype(int)
    bed['thick_start'] = original_coordinates_df['thick_start'].astype(int)
    bed['thick_end'] = original_coordinates_df['thick_end'].astype(int)

    msg = "Writing updated BED to disk"
    logging.info(msg)

    bio.write_bed(bed, args.out)
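
The helper get_original_coordinates is not shown in this excerpt. A minimal sketch of what it might do, assuming (hypothetically) that split-long-chromosomes names each piece by appending a 0-based part index to the seqname (e.g. "chr1_0", "chr1_1"), so that each original coordinate is the local coordinate plus part_index * max_size:

def get_original_coordinates(row, max_size):
    # Hypothetical reconstruction: the real helper ships alongside
    # split-long-chromosomes and may use a different naming scheme.
    seqname, part = row['seqname'].rsplit('_', 1)
    offset = int(part) * max_size
    return {
        'seqname': seqname,
        'start': row['start'] + offset,
        'end': row['end'] + offset,
        'thick_start': row['thick_start'] + offset,
        'thick_end': row['thick_end'] + offset,
    }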
Example #2
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script reorders sequences in a fasta file to be in the same "
        "order as the sequences in the STAR transcriptInfo.tab file.")
    parser.add_argument('transcript_info',
                        help="The STAR transcriptInfo.txt file, which "
                        "contains the desired sequence order")
    parser.add_argument(
        'fasta',
        help="The fasta file containing the sequences. The index "
        "should already be created, and the keys for the index must exactly match the "
        "sequence names in the transcript_info file.")
    parser.add_argument(
        'out', help="The output fasta file with the sequences rearranged")
    parser.add_argument('--compress',
                        help="If this flag is present, then the output "
                        "will be gzipped",
                        action='store_true')

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading STAR transcript file"
    logging.info(msg)
    transcript_info = bio.read_star_tr_file(args.transcript_info)

    msg = "Reading transcript fasta file"
    logging.info(msg)
    fasta = bio.get_fasta_dict(args.fasta)
    trans_id_mapping = {x.split()[0]: x for x in fasta.keys()}

    # open the output file
    if args.compress:
        out = gzip.open(args.out, 'wt')
    else:
        out = open(args.out, 'w')

    for i in tqdm.trange(len(transcript_info)):
        trans_id = transcript_info.iloc[i]['ID']
        fasta_id = trans_id_mapping[trans_id]

        header = ">{}\n".format(fasta_id)
        out.write(header)
        seq = str(fasta[fasta_id])

        wrapped_seq = textwrap.fill(seq)
        out.write(wrapped_seq)
        out.write("\n")

    out.close()
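
Two details above are worth noting: trans_id_mapping exists because fasta headers often carry a description after the identifier, while STAR's transcriptInfo file stores only the bare ID, and textwrap.fill wraps each sequence at the textwrap default of 70 characters per line. A small illustration of the mapping, with made-up identifiers:

# hypothetical fasta keys; only the first whitespace-separated token is the ID
fasta_keys = ["ENST0001 gene=ABC loc=chr1", "ENST0002 gene=XYZ loc=chr2"]

trans_id_mapping = {x.split()[0]: x for x in fasta_keys}

# a bare STAR ID now recovers the full fasta key
assert trans_id_mapping['ENST0001'] == "ENST0001 gene=ABC loc=chr1"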
Example #3
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script matches QTI-seq peaks from (Gao et al., 2015) to ORFs "
        "based on genomic coordinates.")
    parser.add_argument('orfs', help="The ORFs (BED12+) file")
    parser.add_argument('qti_peaks', help="The QTI-seq peak (BED6) file")
    parser.add_argument('out', help="The augmented ORFs (BED12+) file")

    parser.add_argument('--output-prefix', help="A string to prefix before all of the "
        "fields related to the closest QTI-seq peak (if there is one)",
        default=default_output_prefix)

    parser.add_argument('--seqname-prefix', help="If present, this string is prepended "
        "to all of the ORF seqnames. It is then removed again in the final output.", 
        default=default_seqname_prefix)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    programs = ['closestBed']
    utils.check_programs_exist(programs)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bio.read_bed(args.orfs)

    # we need to keep a copy that we use later for output
    orfs_copy = orfs.copy()

    # for matching qti-seq peaks, we only want to consider the start position of each ORF

    # for forward strand ORFs, replace orf_genomic_end with orf_genomic_start
    msg = "Updating genomic positions to consider only start codon"
    logger.info(msg)
    m_forward = orfs['strand'] == '+'
    orfs.loc[m_forward, 'end'] = orfs.loc[m_forward, 'start'] + 1

    # for reverse ORFs, replace orf_genomic_start with orf_genomic_end
    m_reverse = orfs['strand'] == '-'
    orfs.loc[m_reverse, 'start'] = orfs.loc[m_reverse, 'end'] - 1

    # join together the orf start positions, correct the seqname and sort for bedtools
    msg = "Converting ORF data frame to pybedtools"
    logger.info(msg)

    orfs['seqname'] = args.seqname_prefix + orfs['seqname']
    orfs = orfs.sort_values(['seqname', 'start'])
    orfs_bed = pybedtools.BedTool.from_dataframe(orfs)

    msg = "Reading QTI peaks"
    qti_bed_df = bio.read_bed(args.qti_peaks)
    qti_bed_df.columns = ["{}_{}".format(args.output_prefix, c) for c in qti_bed_df.columns]

    # and convert to BED
    msg = "Converting QTI peaks data frame to pybedtools"
    logger.info(msg)

    chr_field = '{}_chr'.format(args.output_prefix)
    start_field = '{}_start'.format(args.output_prefix)
    qti_bed_df = qti_bed_df.sort_values([chr_field, start_field])
    qti_bed = pybedtools.BedTool.from_dataframe(qti_bed_df)

    msg = "Finding closest QTI peak for all ORFs"
    logger.info(msg)
    # s means to consider strandedness
    # D means to report the distance
    # a means to report upstream positions (relative to orfs_start) as negative
    closest_bed = orfs_bed.closest(qti_bed, s=True, D="a")

    # convert back to a df for cleanup
    msg = "Converting closest results back to data frame"
    logger.info(msg)
    
    peak_distance_field = '{}_peak_distance'.format(args.output_prefix)
    closest_bed_fields = list(orfs.columns) + list(qti_bed_df.columns) + [peak_distance_field]
    closest_df = closest_bed.to_dataframe(names=closest_bed_fields, index_col=False)

    # now join the relevant fields back to the original ORF data frame
    fields_to_join = list(qti_bed_df.columns) + [peak_distance_field, 'id']
    closest_df = closest_df[fields_to_join]

    msg = "Joining closest results to original ORFs"
    logger.info(msg)
    orf_qti_df = pd.merge(orfs_copy, closest_df, on='id', how='left')
    orf_qti_df = orf_qti_df.sort_values(['seqname', 'start'])

    # and write this out as a bed12+ file
    msg = "Writing joined BED12+ file to disk"
    logger.info(msg)

    bio.write_bed(orf_qti_df, args.out)
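
The strand-aware update above collapses each ORF to the single genomic base of its start codon: on the forward strand the interval becomes [start, start + 1), on the reverse strand [end - 1, end), since BED coordinates are half-open. A toy check with plain pandas:

import pandas as pd

orfs = pd.DataFrame({
    'seqname': ['chr1', 'chr1'],
    'start': [100, 200],
    'end': [190, 290],
    'strand': ['+', '-'],
})

m_forward = orfs['strand'] == '+'
orfs.loc[m_forward, 'end'] = orfs.loc[m_forward, 'start'] + 1

m_reverse = orfs['strand'] == '-'
orfs.loc[m_reverse, 'start'] = orfs.loc[m_reverse, 'end'] - 1

print(orfs[['start', 'end']].values.tolist())  # [[100, 101], [289, 290]]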
Example #4
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a plot showing the fraction of predicted ORFs "
        "which have a set amount of peptide coverage.")
    parser.add_argument(
        'rpbp_peptide_matches',
        help="The (csv) file containing the peptides "
        "matching to each ORF predicted as translated using Rp-Bp (produced by "
        "get-orf-peptide-matches)")
    parser.add_argument(
        'rpchi_peptide_matches',
        help="The (csv) file containing the peptides "
        "matching to each ORF predicted as translated using Rp-chi (produced by "
        "get-orf-peptide-matches)")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('-l',
                        '--min-length',
                        help="The minimum length for ORFs (in "
                        "nucleotides) to consider in the analyis",
                        type=int,
                        default=default_min_length)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--title', default=default_title)
    parser.add_argument('--fontsize', type=int, default=default_fontsize)
    parser.add_argument('--note-fontsize',
                        type=int,
                        default=default_note_fontsize)
    parser.add_argument('--line-width', type=int, default=default_line_width)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading predictions"
    logging.info(msg)
    rpbp_peptide_matches = pd.read_csv(args.rpbp_peptide_matches)
    rpchi_peptide_matches = pd.read_csv(args.rpchi_peptide_matches)

    if args.min_length > 0:
        msg = "Filtering predictions by: length > {}".format(args.min_length)
        logging.warning(msg)

        # multiply by 3 because the orf sequences are amino acid sequences
        bf_lengths = rpbp_peptide_matches['orf_sequence'].str.len() * 3
        m_bf_length = bf_lengths > args.min_length
        rpbp_peptide_matches = rpbp_peptide_matches[m_bf_length]

        chisq_lengths = rpchi_peptide_matches['orf_sequence'].str.len() * 3
        m_chisq_length = chisq_lengths > args.min_length
        rpchi_peptide_matches = rpchi_peptide_matches[m_chisq_length]

    msg = "Calculating Rp-Bp coverage"
    logging.info(msg)
    bf_coverage = parallel.apply_parallel(rpbp_peptide_matches,
                                          args.num_cpus,
                                          get_orf_coverage,
                                          progress_bar=True)
    bf_coverage = pd.DataFrame(bf_coverage)

    msg = "Calculating Rp-chi coverage"
    logging.info(msg)
    chisq_coverage = parallel.apply_parallel(rpchi_peptide_matches,
                                             args.num_cpus,
                                             get_orf_coverage,
                                             progress_bar=True)
    chisq_coverage = pd.DataFrame(chisq_coverage)

    msg = "Creating image"
    logging.info(msg)

    # plot the empirical distribution of ORF lengths
    hist_min = 0
    hist_max = 1.1
    hist_step = 0.05
    hist_range = (hist_min, hist_max)
    hist_bins = np.arange(hist_min, hist_max, hist_step)

    bf_covered_hist, b = np.histogram(bf_coverage['coverage'],
                                      bins=hist_bins,
                                      range=hist_range,
                                      density=True)

    chisq_covered_hist, b = np.histogram(chisq_coverage['coverage'],
                                         bins=hist_bins,
                                         range=hist_range,
                                         density=True)

    # now, normalize the histograms
    bf_covered_hist = bf_covered_hist / np.sum(bf_covered_hist)
    chisq_covered_hist = chisq_covered_hist / np.sum(chisq_covered_hist)

    # multiply by 100 to give actual percentages
    bf_covered_hist = 100 * bf_covered_hist
    chisq_covered_hist = 100 * chisq_covered_hist

    hist_bins = 100 * hist_bins

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth

    x = np.arange(len(bf_covered_hist))

    bf_label = r'\textsc{Rp-Bp}'
    ax.plot(x,
            bf_covered_hist,
            color=cm(0.1),
            label=bf_label,
            linewidth=args.line_width,
            linestyle='--',
            marker='^')

    chisq_label = r'\textsc{Rp-$\chi^2$}'
    ax.plot(x,
            chisq_covered_hist,
            color=cm(0.3),
            label=chisq_label,
            linewidth=args.line_width,
            linestyle='-.',
            marker='D')

    ax.set_xlabel(r'Peptide Coverage (\%)', fontsize=args.fontsize)
    ax.set_ylabel(r'\% of predicted ORFs', fontsize=args.fontsize)

    if args.title is not None and len(args.title) > 0:
        ax.set_title(args.title, fontsize=args.fontsize)

    # only show every 20% on the x-axis
    ax.set_xticks(x[::4])
    ax.set_xticklabels(hist_bins[::4])

    def my_formatter_fun(x, p):
        return "${:d}$".format(20 * p)

    ax.get_xaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(my_formatter_fun))

    # hide the "0" tick label
    yticks = ax.yaxis.get_major_ticks()
    yticks[0].label1.set_visible(False)

    ax.set_xlim((0, len(bf_covered_hist) - 1))
    ax.set_ylim((0, 10))

    ax.legend(loc='upper right', fontsize=args.fontsize)
    ax.tick_params(axis='both', which='major', labelsize=args.note_fontsize)

    fig.savefig(args.out, bbox_inches='tight')
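
One subtlety in the histogram handling: with density=True, np.histogram returns values that integrate to 1 over the bin widths rather than summing to 1, so the explicit division by the sum is what turns them into per-bin fractions before the conversion to percentages. A quick numeric check:

import numpy as np

data = np.array([0.1, 0.12, 0.3, 0.55, 0.56, 0.9])
hist, _ = np.histogram(data, bins=np.arange(0, 1.1, 0.05), density=True)

print(hist.sum())          # ~20, i.e. 1 / bin_width, not 1
hist = hist / np.sum(hist)
print((100 * hist).sum())  # 100.0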
Example #5
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script matches QTI-seq peaks from (Gao et al., 2015) to ORFs "
        "based on genomic coordinates.")
    parser.add_argument('orfs', help="The ORFs (BED12+) file")
    parser.add_argument('qti_peaks', help="The QTI-seq peak (BED6) file")
    parser.add_argument('out', help="The augmented ORFs (BED12+) file")

    parser.add_argument(
        '--output-prefix',
        help="A string to prefix before all of the "
        "fields related to the closest QTI-seq peak (if there is one)",
        default=default_output_prefix)

    parser.add_argument(
        '--seqname-prefix',
        help="If present, this string is prepended "
        "to all of the ORF seqnames. It is then removed again in the final output.",
        default=default_seqname_prefix)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    programs = ['closestBed']
    utils.check_programs_exist(programs)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bio.read_bed(args.orfs)

    # we need to keep a copy that we use later for output
    orfs_copy = orfs.copy()

    # for matching qti-seq peaks, we only want to consider the start position of each ORF

    # for forward strand ORFs, replace orf_genomic_end with orf_genomic_start
    msg = "Updating genomic positions to consider only start codon"
    logger.info(msg)
    m_forward = orfs['strand'] == '+'
    orfs.loc[m_forward, 'end'] = orfs.loc[m_forward, 'start'] + 1

    # for reverse ORFs, replace orf_genomic_start with orf_genomic_end
    m_reverse = orfs['strand'] == '-'
    orfs.loc[m_reverse, 'start'] = orfs.loc[m_reverse, 'end'] - 1

    # join together the orf start positions, correct the seqname and sort for bedtools
    msg = "Converting ORF data frame to pybedtools"
    logger.info(msg)

    orfs['seqname'] = args.seqname_prefix + orfs['seqname']
    orfs = orfs.sort_values(['seqname', 'start'])
    orfs_bed = pybedtools.BedTool.from_dataframe(orfs)

    msg = "Reading QTI peaks"
    qti_bed_df = bio.read_bed(args.qti_peaks)
    qti_bed_df.columns = [
        "{}_{}".format(args.output_prefix, c) for c in qti_bed_df.columns
    ]

    # and convert to BED
    msg = "Converting QTI peaks data frame to pybedtools"
    logger.info(msg)

    chr_field = '{}_chr'.format(args.output_prefix)
    start_field = '{}_start'.format(args.output_prefix)
    qti_bed_df = qti_bed_df.sort_values([chr_field, start_field])
    qti_bed = pybedtools.BedTool.from_dataframe(qti_bed_df)

    msg = "Finding closest QTI peak for all ORFs"
    logger.info(msg)
    # s means to consider strandedness
    # D means to report the distance
    # a means to report upstream positions (relative to orfs_start) as negative
    closest_bed = orfs_bed.closest(qti_bed, s=True, D="a")

    # convert back to a df for cleanup
    msg = "Converting closest results back to data frame"
    logger.info(msg)

    peak_distance_field = '{}_peak_distance'.format(args.output_prefix)
    closest_bed_fields = list(orfs.columns) + list(
        qti_bed_df.columns) + [peak_distance_field]
    closest_df = closest_bed.to_dataframe(names=closest_bed_fields,
                                          index_col=False)

    # now join the relevant fields back to the original ORF data frame
    fields_to_join = list(qti_bed_df.columns) + [peak_distance_field, 'id']
    closest_df = closest_df[fields_to_join]

    msg = "Joining closest results to original ORFs"
    logger.info(msg)
    orf_qti_df = pd.merge(orfs_copy, closest_df, on='id', how='left')
    orf_qti_df = orf_qti_df.sort_values(['seqname', 'start'])

    # and write this out as a bed12+ file
    msg = "Writing joined BED12+ file to disk"
    logger.info(msg)

    bio.write_bed(orf_qti_df, args.out)
Example #6
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script plots the (log) Bayes factor against the estimated "
        "RPKM for all ORFs. All relevant values will be clipped according to the "
        "specified arguments for viewing.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of the dataset or replicate to plot")
    parser.add_argument('out', help="The output image file")

    parser.add_argument('-p', '--use-predictions', help="If this flag is present, then "
        "the \"predicted ORFs\" files will be used. Otherwise, all ORFs in the dataset "
        "will be visualized.", action='store_true')
    parser.add_argument('-r', '--is-replicate', help="If the name corresponds to one "
        "of the replicates, this flag must be used to ensure the filenames are "
        "handled correctly.", action='store_true')

    parser.add_argument('--title', default=default_title)

    parser.add_argument('--min-rpkm', type=float, default=default_min_rpkm)
    parser.add_argument('--max-rpkm', type=float, default=default_max_rpkm)
    parser.add_argument('--min-bf', type=float, default=default_min_bf)
    parser.add_argument('--max-bf', type=float, default=default_max_bf)
    
    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    note = config.get('note', None)

    if args.is_replicate:
        lengths = None
        offsets = None
    else:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config, args.name)
        
    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # we will need these to get the appropriate log BFs
    if args.use_predictions:
        bayes_factors = filenames.get_riboseq_predicted_orfs(config['riboseq_data'], args.name,
            length=lengths, offset=offsets, is_unique=True, note=note, is_smooth=True,
            fraction=fraction, reweighting_iterations=reweighting_iterations)
    else:
        bayes_factors = filenames.get_riboseq_bayes_factors(config['riboseq_data'], args.name,
            length=lengths, offset=offsets, is_unique=True, note=note, is_smooth=True,
            fraction=fraction, reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors):
        msg = ("Could not find the Bayes factor file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(bayes_factors))
        logger.warning(msg)
        return

    msg = "Reading Bayes factors"
    logger.info(msg)
    bayes_factors = bio.read_bed(bayes_factors)

    # we need these to get the raw counts for calculating RPKM

    # we always need all of the counts, so no need to check which ORFs
    rpchi_pvalues = filenames.get_riboseq_bayes_factors(config['riboseq_data'], args.name, 
        length=lengths, offset=offsets, is_unique=True, note=note, is_smooth=False)

    
    if not os.path.exists(rpchi_pvalues):
        msg = ("Could not find the Rp-chi pvalues file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(rpchi_pvalues))
        logger.warning(msg)
        return

    msg = "Reading Rp-chi pvalues"
    logger.info(msg)
    rpchi_pvalues = bio.read_bed(rpchi_pvalues)

    msg = "Calculating RPKM values"
    logger.info(msg)

    # we approximate the number of mapping reads as the sum across all ORFs.
    # this double-counts some reads
    num_reads = np.sum(rpchi_pvalues['profile_sum'])
    all_rpkm = (1e6 * rpchi_pvalues['x_1_sum']) / (rpchi_pvalues['orf_len'] * num_reads)

    # only include things that have some reads in the visualization
    m_rpkm = all_rpkm > 0

    msg = "Creating plot"
    logger.info(msg)

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth

    for i, orf_label in enumerate(ribo_utils.orf_type_labels):
        
        orf_types = ribo_utils.orf_type_labels_mapping[orf_label]
        m_type = bayes_factors['orf_type'].isin(orf_types)

        # now, pull out the RPKMs
        if args.use_predictions:
            # if we are using predictions, we have to filter and join
            orf_ids = bayes_factors.loc[m_rpkm & m_type, 'id']
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type, 'bayes_factor_mean'])

            m_ids = rpchi_pvalues['id'].isin(orf_ids)
            rpkm = np.array(all_rpkm[m_ids])

        else:
            # otherwise, the data frames match, so we can just use the masks
            rpkm = np.array(all_rpkm[m_rpkm & m_type])
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type, 'bayes_factor_mean'])
        
        rpkm = np.clip(rpkm, args.min_rpkm, args.max_rpkm)
        bfs = np.clip(bfs, args.min_bf, args.max_bf)
        
        color = i / len(ribo_utils.orf_type_labels)
        color = cm(color)
        
        label = "{} ({})".format(orf_label, len(rpkm))

        ax.scatter(rpkm, bfs, label=label, color=color, edgecolor='k')


    ax.set_ylim((args.min_bf * 1.5, args.max_bf * 1.5))
    ax.set_xlim((args.min_rpkm * 1.5, args.max_rpkm * 1.25))

    ax.set_yscale('symlog')
    ax.set_xscale('symlog')

    ax.set_xlabel('RPKM')
    ax.set_ylabel('log BF')

    lgd = ax.legend(loc='center right', bbox_to_anchor=(1.5, 0.5))

    if len(args.title) > 0:
        ax.set_title(args.title)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd,))
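
The clipping keeps extreme values inside the plotting window, and the symlog scales tolerate the zeros and negative log Bayes factors that an ordinary log axis would reject. A minimal illustration of the clipping step, with made-up limits standing in for args.min_bf and args.max_bf:

import numpy as np

min_bf, max_bf = -10.0, 1000.0  # hypothetical limits

bfs = np.array([-250.0, -3.2, 0.0, 42.0, 1e6])
print(np.clip(bfs, min_bf, max_bf))  # [ -10.    -3.2    0.    42.  1000. ]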
Example #7
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a plot showing the fraction of predicted ORFs "
        "which have a set amount of peptide coverage.")
    parser.add_argument('rpbp_peptide_matches', help="The (csv) file containing the peptides "
        "matching to each ORF predicted as translated using Rp-Bp (produced by "
        "get-orf-peptide-matches)")
    parser.add_argument('rpchi_peptide_matches', help="The (csv) file containing the peptides "
        "matching to each ORF predicted as translated using Rp-chi (produced by "
        "get-orf-peptide-matches)")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('-l', '--min-length', help="The minimum length for ORFs (in "
        "nucleotides) to consider in the analyis", type=int, default=default_min_length)

    parser.add_argument('-p', '--num-cpus', help="The number of processors to use", type=int,
        default=default_num_cpus)

    parser.add_argument('--title', default=default_title)
    parser.add_argument('--fontsize', type=int, default=default_fontsize)
    parser.add_argument('--note-fontsize', type=int, default=default_note_fontsize)
    parser.add_argument('--line-width', type=int, default=default_line_width)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading predictions"
    logging.info(msg)
    rpbp_peptide_matches = pd.read_csv(args.rpbp_peptide_matches)
    rpchi_peptide_matches = pd.read_csv(args.rpchi_peptide_matches)

    if args.min_length > 0:
        msg = "Filtering predictions by: length > {}".format(args.min_length)
        logging.warning(msg)

        # multiply by 3 because the orf sequences are amino acid sequences
        bf_lengths = rpbp_peptide_matches['orf_sequence'].str.len() * 3
        m_bf_length = bf_lengths > args.min_length
        rpbp_peptide_matches = rpbp_peptide_matches[m_bf_length]


        chisq_lengths = rpchi_peptide_matches['orf_sequence'].str.len() * 3
        m_chisq_length = chisq_lengths > args.min_length
        rpchi_peptide_matches = rpchi_peptide_matches[m_chisq_length]
        
    msg = "Calculating Rp-Bp coverage"
    logging.info(msg)
    bf_coverage = parallel.apply_parallel(rpbp_peptide_matches, 
                                            args.num_cpus, 
                                            get_orf_coverage, 
                                            progress_bar=True)
    bf_coverage = pd.DataFrame(bf_coverage)

    msg = "Calculating Rp-chi coverage"
    logging.info(msg)
    chisq_coverage = parallel.apply_parallel(rpchi_peptide_matches, 
                                                args.num_cpus,
                                                get_orf_coverage, 
                                                progress_bar=True)
    chisq_coverage = pd.DataFrame(chisq_coverage)

    msg = "Creating image"
    logging.info(msg)

    # plot the empirical distribution of ORF lengths
    hist_min = 0
    hist_max = 1.1
    hist_step = 0.05
    hist_range = (hist_min, hist_max)
    hist_bins = np.arange(hist_min, hist_max, hist_step)

    bf_covered_hist, b = np.histogram(bf_coverage['coverage'], 
                                        bins=hist_bins, 
                                        range=hist_range, 
                                        density=True)
    
    chisq_covered_hist, b = np.histogram(chisq_coverage['coverage'], 
                                            bins=hist_bins, 
                                            range=hist_range, 
                                            density=True)

    # now, normalize the histograms
    bf_covered_hist = bf_covered_hist / np.sum(bf_covered_hist)
    chisq_covered_hist = chisq_covered_hist / np.sum(chisq_covered_hist)

    # multiply by 100 to give actual percentages
    bf_covered_hist = 100 * bf_covered_hist
    chisq_covered_hist = 100 * chisq_covered_hist

    hist_bins = 100*hist_bins

    fig, ax = plt.subplots(figsize=(10,5))

    cm = plt.cm.gist_earth

    x = np.arange(len(bf_covered_hist))

    bf_label = r'\textsc{Rp-Bp}'
    ax.plot(x, 
            bf_covered_hist, 
            color=cm(0.1), 
            label=bf_label, 
            linewidth=args.line_width, 
            linestyle='--', 
            marker='^')

    chisq_label = r'\textsc{Rp-$\chi^2$}'
    ax.plot(x, 
            chisq_covered_hist, 
            color=cm(0.3), 
            label=chisq_label, 
            linewidth=args.line_width,
            linestyle='-.',
            marker='D')

    ax.set_xlabel(r'Peptide Coverage (\%)', fontsize=args.fontsize)
    ax.set_ylabel(r'\% of predicted ORFs', fontsize=args.fontsize)
    
    if args.title is not None and len(args.title) > 0:
        ax.set_title(args.title, fontsize=args.fontsize)

    # only show every 20% on the x-axis
    ax.set_xticks(x[::4])
    ax.set_xticklabels(hist_bins[::4])

    def my_formatter_fun(x, p):
        return "${:d}$".format(20*p)
    ax.get_xaxis().set_major_formatter(matplotlib.ticker.FuncFormatter(my_formatter_fun))

    # hide the "0" tick label
    yticks = ax.yaxis.get_major_ticks()
    yticks[0].label1.set_visible(False)

    ax.set_xlim((0, len(bf_covered_hist)-1))
    ax.set_ylim((0,10))

    ax.legend(loc='upper right', fontsize=args.fontsize)
    ax.tick_params(axis='both', which='major', labelsize=args.note_fontsize)

    fig.savefig(args.out, bbox_inches='tight')
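
A matplotlib FuncFormatter callback receives the tick value and the tick's index among the visible ticks, so my_formatter_fun above relabels tick positions rather than data values: with a tick at every fourth 5% bin, position p corresponds to 20 * p percent. A standalone sketch of that protocol:

import matplotlib.ticker

def my_formatter_fun(x, p):
    # x is the tick value, p its index among the visible ticks
    return "${:d}$".format(20 * p)

formatter = matplotlib.ticker.FuncFormatter(my_formatter_fun)
print(formatter(8, 2))  # $40$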
Example #8
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts reads of various types from a processed dataset "
        "to create an \"interesting\" test dataset.\n\nN.B. This script is not "
        "particularly efficient and is not intended for external use.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of the dataset to use to create the test data")
    parser.add_argument('out', help="The output (fasta.gz) which contains reads of various "
        "types, subject to the other parameters")

    parser.add_argument('-r', '--reference', help="The name of the reference sequence (chromosome) "
        "from which aligned reads will be extracted", default=default_reference)

    parser.add_argument('-m', '--max-reads', help="At most <max_reads> reads of each type "
        "will be included in the final output", type=int, default=default_max_reads)
    
    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    note = config.get('note', None)
    
    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    ###
    msg = "Reading alignments from BAM file"
    logging.info(msg)

    bam_file = filenames.get_riboseq_bam(config['riboseq_data'], 
        args.name, is_unique=is_unique, note=note)
    bam = pysam.AlignmentFile(bam_file)
    
    num_alignments = bam.count(reference=args.reference)
    # count() iterates the file itself, so create the fetch iterator afterwards
    alignments = bam.fetch(reference=args.reference)
    alignment_qnames = {get_first_token(a.qname) for a in alignments}

    ###
    msg = "Extracting a similar number of rRNA reads"
    logging.info(msg)

    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'], args.name, 
        note=note)
        
    rrna = bio.get_read_iterator(with_rrna, is_fasta=False)
    rrna = itertools.islice(rrna, num_alignments)
    rrna_qnames = {get_first_token(read[0]) for read in rrna}

    ###
    msg = "Extracting a similar number of reads which do not uniquely map to the genome"
    logging.info(msg)

    # first, pull out the qnames of all alignments
    all_alignments = bam.fetch()
    all_alignment_qnames = {get_first_token(a.qname) for a in all_alignments}

    # iterate over all reads which passed the rRNA and quality filtering
    without_rrna_file = filenames.get_without_rrna_fastq(config['riboseq_data'], 
        args.name, note=note)
    without_rrna = bio.get_read_iterator(without_rrna_file, is_fasta=False)
    without_rrna_qnames = {get_first_token(read[0]) for read in without_rrna}

    no_mapping_qnames = without_rrna_qnames - all_alignment_qnames

    ###
    msg = "Extracting a similar number of reads which are filtered due to quality issues"
    logging.info(msg)

    # first, pull in all the reads and their names

    msg = "Reading all reads into a dictionary"
    logging.debug(msg)

    raw_data_file = config['riboseq_samples'][args.name]
    raw_data = bio.get_fasta_dict(raw_data_file, is_fasta=False, key_fn=get_first_token)
    raw_data_qnames = set(raw_data.keys())

    msg = "Reading quality scores into dictionary"
    logging.debug(msg)

    raw_data_qual = bio.get_fastq_qual_dict(raw_data_file, key_fn=get_first_token)

    # now, the reads which _did_ pass quality filtering
    msg = "Reading reads which pass quality filtering into a set"
    logging.debug(msg)

    without_adapters_file = filenames.get_without_adapters_fastq(config['riboseq_data'], args.name, note=note)
    without_adapters = bio.get_read_iterator(without_adapters_file, is_fasta=False)
    without_adapters_qnames = {get_first_token(read[0]) for read in without_adapters}

    # and pull out the qnames of the reads which did not pass quality filtering
    filtered_reads_qnames = raw_data_qnames - without_adapters_qnames

    ###
    msg = "Constructing the set of reads to output"
    logging.info(msg)
        
    alignment_raw_data = {qname: raw_data[qname] 
        for qname in itertools.islice(alignment_qnames, args.max_reads)}

    rrna_raw_data = {qname: raw_data[qname] 
        for qname in itertools.islice(rrna_qnames, args.max_reads)}

    no_mapping_raw_data = {qname: raw_data[qname] 
        for qname in itertools.islice(no_mapping_qnames, args.max_reads)}

    filtered_reads_raw_data = {qname: raw_data[qname] 
        for qname in itertools.islice(filtered_reads_qnames, args.max_reads)}

    out_raw_data = alignment_raw_data
    out_raw_data.update(rrna_raw_data)
    out_raw_data.update(no_mapping_raw_data)
    out_raw_data.update(filtered_reads_raw_data)

    ###
    msg = "Writing sequences to disk"
    logging.info(msg)

    bio.write_fastq(out_raw_data, raw_data_qual, args.out, progress_bar=True)
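
get_first_token is defined outside this excerpt; given how it is applied to read names and fastq headers, it is presumably just the first whitespace-separated token. A one-line sketch under that assumption:

# hypothetical reconstruction: read names such as "SRR1234.5 length=28"
# are keyed by their first whitespace-separated token
def get_first_token(s):
    return s.split()[0]

assert get_first_token("SRR1234.5 length=28") == "SRR1234.5"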
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script plots the (log) Bayes factor against the estimated "
        "RPKM for all ORFs. All relevant values will be clipped according to the "
        "specified arguments for viewing.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name',
                        help="The name of the dataset or replicate to plot")
    parser.add_argument('out', help="The output image file")

    parser.add_argument(
        '-p',
        '--use-predictions',
        help="If this flag is present, then "
        "the \"predicted ORFs\" files will be used. Otherwise, all ORFs in the dataset "
        "will be visualized.",
        action='store_true')
    parser.add_argument(
        '-r',
        '--is-replicate',
        help="If the name corresponds to one "
        "of the replicates, this flag must be used to ensure the filenames are "
        "handled correctly.",
        action='store_true')

    parser.add_argument('--title', default=default_title)

    parser.add_argument('--min-rpkm', type=float, default=default_min_rpkm)
    parser.add_argument('--max-rpkm', type=float, default=default_max_rpkm)
    parser.add_argument('--min-bf', type=float, default=default_min_bf)
    parser.add_argument('--max-bf', type=float, default=default_max_bf)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    note = config.get('note', None)

    if args.is_replicate:
        lengths = None
        offsets = None
    else:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # we will need these to get the appropriate log BFs
    if args.use_predictions:
        bayes_factors = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=True,
            note=note,
            is_smooth=True,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations)
    else:
        bayes_factors = filenames.get_riboseq_bayes_factors(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=True,
            note=note,
            is_smooth=True,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors):
        msg = (
            "Could not find the Bayes factor file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(bayes_factors))
        logger.warning(msg)
        return

    msg = "Reading Bayes factors"
    logger.info(msg)
    bayes_factors = bio.read_bed(bayes_factors)

    # we need these to get the raw counts for calculating RPKM

    # we always need all of the counts, so no need to check which ORFs
    rpchi_pvalues = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
                                                        args.name,
                                                        length=lengths,
                                                        offset=offsets,
                                                        is_unique=True,
                                                        note=note,
                                                        is_smooth=False)

    if not os.path.exists(rpchi_pvalues):
        msg = (
            "Could not find the Rp-chi pvalues file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(rpchi_pvalues))
        logger.warning(msg)
        return

    msg = "Reading Rp-chi pvalues"
    logger.info(msg)
    rpchi_pvalues = bio.read_bed(rpchi_pvalues)

    msg = "Calculating RPKM values"
    logger.info(msg)

    # we approximate the number of mapping reads as the sum across all ORFs.
    # this double-counts some reads
    num_reads = np.sum(rpchi_pvalues['profile_sum'])
    all_rpkm = (1e6 * rpchi_pvalues['x_1_sum']) / (rpchi_pvalues['orf_len'] *
                                                   num_reads)

    # only include things that have some reads in the visualization
    m_rpkm = all_rpkm > 0

    msg = "Creating plot"
    logger.info(msg)

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth

    for i, orf_label in enumerate(ribo_utils.orf_type_labels):

        orf_types = ribo_utils.orf_type_labels_mapping[orf_label]
        m_type = bayes_factors['orf_type'].isin(orf_types)

        # now, pull out the RPKMs
        if args.use_predictions:
            # if we are using predictions, we have to filter and join
            orf_ids = bayes_factors.loc[m_rpkm & m_type, 'id']
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type,
                                             'bayes_factor_mean'])

            m_ids = rpchi_pvalues['id'].isin(orf_ids)
            rpkm = np.array(all_rpkm[m_ids])

        else:
            # otherwise, the data frames match, so we can just use the masks
            rpkm = np.array(all_rpkm[m_rpkm & m_type])
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type,
                                             'bayes_factor_mean'])

        rpkm = np.clip(rpkm, args.min_rpkm, args.max_rpkm)
        bfs = np.clip(bfs, args.min_bf, args.max_bf)

        color = i / len(ribo_utils.orf_type_labels)
        color = cm(color)

        label = "{} ({})".format(orf_label, len(rpkm))

        ax.scatter(rpkm, bfs, label=label, color=color, edgecolor='k')

    ax.set_ylim((args.min_bf * 1.5, args.max_bf * 1.5))
    ax.set_xlim((args.min_rpkm * 1.5, args.max_rpkm * 1.25))

    ax.set_yscale('symlog')
    ax.set_xscale('symlog')

    ax.set_xlabel('RPKM')
    ax.set_ylabel('log BF')

    lgd = ax.legend(loc='center right', bbox_to_anchor=(1.5, 0.5))

    if len(args.title) > 0:
        ax.set_title(args.title)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd, ))
Example #10
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document.")
    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('-l', '--min-orf-length', help="The minimum length for ORFs (in "
        "nucleotides) to consider in the analyis", type=int, default=default_min_orf_length)


    parser.add_argument('--num-cpus', help="The number of processors to use for counting "
        "the matching peptides coverage for each predicted ORF.",
        type=int, default=default_num_cpus)
    parser.add_argument('--overwrite', help="If this flag is present, existing files will "
        "be overwritten.", action='store_true')
        
    parser.add_argument('--note', help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.", default=default_note)


    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = ['create-orf-peptide-coverage-line-graph']
    utils.check_programs_exist(programs)
    
    required_keys = [
        'riboseq_data',
        'riboseq_samples'
    ]
    utils.check_keys_exist(config, required_keys)

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    project_name = config.get("project_name", default_project_name)
    title = "Proteomics analysis for {}".format(project_name)
    abstract = "This document shows the results of proteomics analysis."

    header = latex.get_header_text(title, abstract)
    footer = latex.get_footer_text()

    tex_file = os.path.join(args.out, "proteomics-report.tex")

    with open(tex_file, 'w') as out:

        out.write(header)
        out.write("\n")

        title = "ORF peptide coverage"
        latex.section(out, title)


        for name, data in config['riboseq_samples'].items():
            msg = "Processing sample: {}".format(name)
            logging.info(msg)

            logging.debug("overwrite: {}".format(args.overwrite))

            create_figures(args.config, config, name, args)

            title = name
            latex.subsection(out, title)

            try:
                lengths, offsets = riboutils.ribo_utils.get_periodic_lengths_and_offsets(
                    config, name, is_unique=is_unique)
            except FileNotFoundError:
                msg = "Could not parse out lengths and offsets for sample: {}. Skipping".format(name)
                logging.error(msg)
                continue
            
            peptide_coverage_line_graph = filenames.get_peptide_coverage_line_graph(config['riboseq_data'], 
                name, length=lengths, offset=offsets, is_unique=is_unique, note=out_note_str)

            if os.path.exists(peptide_coverage_line_graph):
                latex.begin_figure(out)
                latex.write_graphics(out, peptide_coverage_line_graph, width=0.9)
                latex.end_figure(out)

            else:
                text = "Problem creating ORF peptide coverage plot"
                out.write(text)
                out.write("\n")

            latex.clearpage(out)

        out.write(footer)

    
    os.chdir(args.out)
    cmd = "pdflatex -shell-escape proteomics-report"
    utils.check_call(cmd)
    utils.check_call(cmd) # call again to fix references
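
The latex helper module used throughout this example is assumed rather than shown; judging from the calls above, its functions are thin wrappers that write LaTeX fragments to an open file handle. A hedged sketch of two of them:

# hypothetical reconstructions; the real helpers may escape special
# characters, add labels, or emit different graphics options
def section(out, title):
    out.write("\\section{{{}}}\n".format(title))

def write_graphics(out, path, width=1.0):
    out.write("\\includegraphics[width={}\\textwidth]{{{}}}\n".format(width, path))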