import argparse
import itertools
import logging
import os

import matplotlib
import matplotlib.ticker
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools
import pysam
import yaml

# the helper modules below come from the rpbp/riboutils codebase these
# examples belong to; the exact import paths are assumptions, and the
# module-level default_* constants referenced in the examples are defined
# in the original scripts
import misc.bio as bio
import misc.latex as latex
import misc.parallel as parallel
import misc.utils as utils
import riboutils.ribo_utils as ribo_utils
import riboutils.ribo_filenames as filenames

logger = logging.getLogger(__name__)


def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script performs a permutation test to estimate the"
        "significance of the number of micropeptides which have a nearby QTI-"
        "seq peak. Specifically, this script writes the number of micropeptides "
        "with a nearby QTI-seq peak found during each permutation to a file.")

    parser.add_argument('bf', help="The Bayes factor file from "
        "estimate-orfs-bayes-factors")
    parser.add_argument('qti_seq_peaks', help="The QTI-seq peaks in BED format. "
        "This should be the output of qtipeaks2bed.py, not the Supplemental "
        "Files distributed with the paper")
    parser.add_argument('out', help="The output (csv.gz) file. Each line will "
        "contain a permutation number and the number of micropeptides with a "
        "QTI-seq peak.")

    parser.add_argument('--max-length', help="The maximum length to consider an "
        "ORF as translated. This is used to keep only \"micropeptides\".",
        type=int, default=default_max_length)

    
    parser.add_argument('-n', '--num-random-samples', help="The number of random samples "
        "to draw for the permutation test.", type=int, default=default_num_random_samples)
    
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use "
        "for parallelizing the sampling", type=int, default=default_num_cpus)

    parser.add_argument('-g', '--num-groups', help="The number of groups to use for "
        "performing the permutation tests. More groups means the progress bar is "
        "updated more frequently but incurs more overhead because of the parallel "
        "calls.", type=int, default=default_num_groups)    

    parser.add_argument('--seed', help="The random seed", type=int, default=default_seed)

    parser.add_argument('--min-length', help="The minimum length to predict an ORF "
        "as translated", type=int, default=default_min_length)

    parser.add_argument('--min-profile', help="ORFs with profile sum (i.e., number "
        "of reads) less than this value will not be processed.", type=float, 
        default=default_min_profile)

    
    parser.add_argument('--min-bf-mean', help="The minimum Bayes factor mean to predict "
        "an ORF as translated (use --help for more details)",
        type=float, default=default_min_bf_mean)

    parser.add_argument('--max-bf-var', help="The maximum Bayes factor variance to predict "
        "an ORF as translated (use --help for more details)",
        type=float, default=default_max_bf_var)

    parser.add_argument('--min-bf-likelihood', help="If given, then this is taken as a "
        "threshold on the likelihood of translation (use --help for more details)",
        type=float, default=default_min_bf_likelihood)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)
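

# get_orf_coverage is used by the next example but is not shown in this
# listing. A minimal sketch, assuming each row has the 'orf_sequence' column
# used by the filtering below, plus 'id' and a semicolon-separated
# 'peptide_matches' column (the latter two are assumptions); the original
# helper may differ.
def get_orf_coverage(row):
    orf_sequence = row['orf_sequence']
    covered = np.zeros(len(orf_sequence), dtype=bool)

    if isinstance(row['peptide_matches'], str):
        for peptide in row['peptide_matches'].split(';'):
            # mark every occurrence of the peptide within the ORF
            start = orf_sequence.find(peptide)
            while start >= 0:
                covered[start:start + len(peptide)] = True
                start = orf_sequence.find(peptide, start + 1)

    # the fraction of ORF positions covered by at least one peptide
    return pd.Series({'id': row['id'],
                      'coverage': covered.sum() / len(orf_sequence)})
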
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script creates a plot showing the fraction of predicted ORFs "
        "which have a set amount of peptide coverage.")
    parser.add_argument(
        'rpbp_peptide_matches',
        help="The (csv) file containing the peptides "
        "matching to each ORF predicted as translated using Rp-Bp (produced by "
        "get-orf-peptide-matches)")
    parser.add_argument(
        'rpchi_peptide_matches',
        help="The (csv) file containing the peptides "
        "matching to each ORF predicted as translated using Rp-chi (produced by "
        "get-orf-peptide-matches)")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('-l',
                        '--min-length',
                        help="The minimum length for ORFs (in "
                        "nucleotides) to consider in the analyis",
                        type=int,
                        default=default_min_length)

    parser.add_argument('-p',
                        '--num-cpus',
                        help="The number of processors to use",
                        type=int,
                        default=default_num_cpus)

    parser.add_argument('--title', default=default_title)
    parser.add_argument('--fontsize', type=int, default=default_fontsize)
    parser.add_argument('--note-fontsize',
                        type=int,
                        default=default_note_fontsize)
    parser.add_argument('--line-width', type=int, default=default_line_width)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading predictions"
    logging.info(msg)
    rpbp_peptide_matches = pd.read_csv(args.rpbp_peptide_matches)
    rpchi_peptide_matches = pd.read_csv(args.rpchi_peptide_matches)

    if args.min_length > 0:
        msg = "Filtering predictions by: length > {}".format(args.min_length)
        logging.warning(msg)

        # multiply by 3 because the orf sequences are amino acid sequences
        bf_lengths = rpbp_peptide_matches['orf_sequence'].str.len() * 3
        m_bf_length = bf_lengths > args.min_length
        rpbp_peptide_matches = rpbp_peptide_matches[m_bf_length]

        chisq_lengths = rpchi_peptide_matches['orf_sequence'].str.len() * 3
        m_chisq_length = chisq_lengths > args.min_length
        rpchi_peptide_matches = rpchi_peptide_matches[m_chisq_length]

    msg = "Calculating Rp-Bp coverage"
    logging.info(msg)
    bf_coverage = parallel.apply_parallel(rpbp_peptide_matches,
                                          args.num_cpus,
                                          get_orf_coverage,
                                          progress_bar=True)
    bf_coverage = pd.DataFrame(bf_coverage)

    msg = "Calculating Rp-chi coverage"
    logging.info(msg)
    chisq_coverage = parallel.apply_parallel(rpchi_peptide_matches,
                                             args.num_cpus,
                                             get_orf_coverage,
                                             progress_bar=True)
    chisq_coverage = pd.DataFrame(chisq_coverage)

    msg = "Creating image"
    logging.info(msg)

    # plot the empirical distribution of ORF lengths
    hist_min = 0
    hist_max = 1.1
    hist_step = 0.05
    hist_range = (hist_min, hist_max)
    hist_bins = np.arange(hist_min, hist_max, hist_step)

    # N.B. the explicit bin edges take precedence here, so the `range`
    # argument is effectively ignored by np.histogram
    bf_covered_hist, b = np.histogram(bf_coverage['coverage'],
                                      bins=hist_bins,
                                      range=hist_range,
                                      density=True)

    chisq_covered_hist, b = np.histogram(chisq_coverage['coverage'],
                                         bins=hist_bins,
                                         range=hist_range,
                                         density=True)

    # now, normalize the histograms; density=True normalizes by bin width,
    # so divide by the sum to obtain fractions which add up to one
    bf_covered_hist = bf_covered_hist / np.sum(bf_covered_hist)
    chisq_covered_hist = chisq_covered_hist / np.sum(chisq_covered_hist)

    # multiply by 100 to give actual percentages
    bf_covered_hist = 100 * bf_covered_hist
    chisq_covered_hist = 100 * chisq_covered_hist

    hist_bins = 100 * hist_bins

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth

    x = np.arange(len(bf_covered_hist))

    bf_label = r'\textsc{Rp-Bp}'
    ax.plot(x,
            bf_covered_hist,
            color=cm(0.1),
            label=bf_label,
            linewidth=args.line_width,
            linestyle='--',
            marker='^')

    chisq_label = r'\textsc{Rp-$\chi^2$}'
    ax.plot(x,
            chisq_covered_hist,
            color=cm(0.3),
            label=chisq_label,
            linewidth=args.line_width,
            linestyle='-.',
            marker='D')

    ax.set_xlabel(r'Peptide Coverage (\%)', fontsize=args.fontsize)
    ax.set_ylabel(r'\% of predicted ORFs', fontsize=args.fontsize)

    if args.title is not None and len(args.title) > 0:
        ax.set_title(args.title, fontsize=args.fontsize)

    # only show every 20% on the x-axis
    ax.set_xticks(x[::4])
    ax.set_xticklabels(hist_bins[::4])

    # the major formatter below overrides the labels set above: visible tick
    # position p corresponds to 20*p percent coverage
    def my_formatter_fun(x, p):
        return "${:d}$".format(20 * p)

    ax.get_xaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(my_formatter_fun))

    # hide the "0" tick label
    yticks = ax.yaxis.get_major_ticks()
    yticks[0].label1.set_visible(False)

    ax.set_xlim((0, len(bf_covered_hist) - 1))
    ax.set_ylim((0, 10))

    ax.legend(loc='upper right', fontsize=args.fontsize)
    ax.tick_params(axis='both', which='major', labelsize=args.note_fontsize)

    fig.savefig(args.out, bbox_inches='tight')
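

# get_first_token is used throughout the next example but is not shown in
# this listing. A minimal sketch, assuming it strips everything after the
# first whitespace so that fastq read names match the qnames in the BAM file:
def get_first_token(s):
    return s.split()[0]
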
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script extracts reads of various types from a processed dataset "
        "to create an \"interesting\" test dataset.\n\nN.B. This script is not "
        "particularly efficient and is not intended for external use.")
    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument(
        'name', help="The name of the dataset to use to create the test data")
    parser.add_argument(
        'out',
        help="The output (fasta.gz) which contains reads of various "
        "types, subject to the other parameters")

    parser.add_argument('-r',
                        '--reference',
                        help="The name of the reference sequence (chromosome) "
                        "from which aligned reads will be extracted",
                        default=default_reference)

    parser.add_argument('-m',
                        '--max-reads',
                        help="At most <max_reads> reads of each type "
                        "will be included in the final output",
                        type=int,
                        default=default_max_reads)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    note = config.get('note', None)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    ###
    msg = "Reading alignments from BAM file"
    logging.info(msg)

    bam_file = filenames.get_riboseq_bam(config['riboseq_data'],
                                         args.name,
                                         is_unique=is_unique,
                                         note=note)
    bam = pysam.AlignmentFile(bam_file)

    # count before fetching: pysam keeps a single iterator per file handle,
    # so counting after creating the fetch iterator would invalidate it
    num_alignments = bam.count(reference=args.reference)
    alignments = bam.fetch(reference=args.reference)
    alignment_qnames = {get_first_token(a.qname) for a in alignments}

    ###
    msg = "Extracting a similar number of rRNA reads"
    logging.info(msg)

    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'],
                                              args.name,
                                              note=note)

    rrna = bio.get_read_iterator(with_rrna, is_fasta=False)
    rrna = itertools.islice(rrna, num_alignments)
    rrna_qnames = {get_first_token(read[0]) for read in rrna}

    ###
    msg = "Extracting a similar number of reads which do not uniquely map to the genome"
    logging.info(msg)

    # first, pull out the qnames of all alignments
    all_alignments = bam.fetch()
    all_alignment_qnames = {get_first_token(a.qname) for a in all_alignments}

    # iterate over all reads which passed the rRNA and quality filtering
    without_rrna_file = filenames.get_without_rrna_fastq(
        config['riboseq_data'], args.name, note=note)
    without_rrna = bio.get_read_iterator(without_rrna_file, is_fasta=False)
    without_rrna_qnames = {get_first_token(read[0]) for read in without_rrna}

    no_mapping_qnames = without_rrna_qnames - all_alignment_qnames

    ###
    msg = "Extracting a similar number of reads which are filtered due to quality issues"
    logging.info(msg)

    # first, pull in all the reads and their names

    msg = "Reading all reads into a dictionary"
    logging.debug(msg)

    raw_data_file = config['riboseq_samples'][args.name]
    raw_data = bio.get_fasta_dict(raw_data_file,
                                  is_fasta=False,
                                  key_fn=get_first_token)
    raw_data_qnames = set(raw_data.keys())

    msg = "Reading quality scores into dictionary"
    logging.debug(msg)

    raw_data_qual = bio.get_fastq_qual_dict(raw_data_file,
                                            key_fn=get_first_token)

    # now, the reads which _did_ pass quality filtering
    msg = "Reading reads which pass quality filtering into a set"
    logging.debug(msg)

    without_adapters_file = filenames.get_without_adapters_fastq(
        config['riboseq_data'], args.name, note=note)
    without_adapters = bio.get_read_iterator(without_adapters_file,
                                             is_fasta=False)
    without_adapters_qnames = {
        get_first_token(read[0])
        for read in without_adapters
    }

    # and pull out the qnames of the reads which did not pass quality filtering
    filtered_reads_qnames = raw_data_qnames - without_adapters_qnames

    ###
    msg = "Constructing the set of reads to output"
    logging.info(msg)

    alignment_raw_data = {
        qname: raw_data[qname]
        for qname in itertools.islice(alignment_qnames, args.max_reads)
    }

    rrna_raw_data = {
        qname: raw_data[qname]
        for qname in itertools.islice(rrna_qnames, args.max_reads)
    }

    no_mapping_raw_data = {
        qname: raw_data[qname]
        for qname in itertools.islice(no_mapping_qnames, args.max_reads)
    }

    filtered_reads_raw_data = {
        qname: raw_data[qname]
        for qname in itertools.islice(filtered_reads_qnames, args.max_reads)
    }

    out_raw_data = dict(alignment_raw_data)  # copy so the original is not mutated
    out_raw_data.update(rrna_raw_data)
    out_raw_data.update(no_mapping_raw_data)
    out_raw_data.update(filtered_reads_raw_data)

    ###
    msg = "Writing sequences to disk"
    logging.info(msg)

    bio.write_fastq(out_raw_data, raw_data_qual, args.out, progress_bar=True)
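

# A hypothetical invocation of the example above (the script and file names
# are assumptions, not taken from the original project):
#
#   python create-test-dataset.py config.yaml my-sample test-reads.fasta.gz \
#       --reference chrI --max-reads 100
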
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document.")
    parser.add_argument('config',
                        help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('-l',
                        '--min-orf-length',
                        help="The minimum length for ORFs (in "
                        "nucleotides) to consider in the analyis",
                        type=int,
                        default=default_min_orf_length)

    parser.add_argument(
        '--num-cpus',
        help="The number of processors to use for counting "
        "the peptide coverage of each predicted ORF.",
        type=int,
        default=default_num_cpus)
    parser.add_argument('--overwrite',
                        help="If this flag is present, existing files will "
                        "be overwritten.",
                        action='store_true')

    parser.add_argument(
        '--note',
        help="If this option is given, it will be used in the "
        "filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)

    # keep multimappers?
    is_unique = 'keep_riboseq_multimappers' not in config

    programs = ['create-orf-peptide-coverage-line-graph']
    utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    # make sure the output directory exists
    os.makedirs(args.out, exist_ok=True)

    note_str = config.get('note', None)
    out_note_str = note_str

    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    project_name = config.get("project_name", default_project_name)
    title = "Proteomics analysis for {}".format(project_name)
    abstract = "This document shows the results of proteomics analysis."

    header = latex.get_header_text(title, abstract)
    footer = latex.get_footer_text()

    tex_file = os.path.join(args.out, "proteomics-report.tex")

    with open(tex_file, 'w') as out:

        out.write(header)
        out.write("\n")

        title = "ORF peptide coverage"
        latex.section(out, title)

        for name, data in config['riboseq_samples'].items():
            msg = "Processing sample: {}".format(name)
            logging.info(msg)

            logging.debug("overwrite: {}".format(args.overwrite))

            create_figures(args.config, config, name, args)

            title = name
            latex.subsection(out, title)

            try:
                lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
                    config, name, is_unique=is_unique)
            except FileNotFoundError:
                msg = "Could not parse out lengths and offsets for sample: {}. Skipping".format(
                    name)
                logging.error(msg)
                # skip this sample rather than aborting the whole report
                continue

            peptide_coverage_line_graph = filenames.get_peptide_coverage_line_graph(
                config['riboseq_data'],
                name,
                length=lengths,
                offset=offsets,
                is_unique=is_unique,
                note=out_note_str)

            if os.path.exists(peptide_coverage_line_graph):
                latex.begin_figure(out)
                latex.write_graphics(out,
                                     peptide_coverage_line_graph,
                                     width=0.9)
                latex.end_figure(out)

            else:
                text = "Problem creating ORF peptide coverage plot"
                out.write(text)
                out.write("\n")

            latex.clearpage(out)

        out.write(footer)

    os.chdir(args.out)
    cmd = "pdflatex -shell-escape proteomics-report"
    utils.check_call(cmd)
    utils.check_call(cmd)  # call again to fix references
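

# A hypothetical invocation of the report example above (the script name is
# an assumption). pdflatex runs twice so that cross-references resolve:
#
#   python create-proteomics-report.py config.yaml /path/to/report-dir \
#       --min-orf-length 60 --num-cpus 4 --overwrite
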
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=
        "This script plots the (log) Bayes factor against the estimated "
        "RPKM for all ORFs. All relevant values will be clipped according to the "
        "specified arguments for viewing.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name',
                        help="The name of the dataset or replicate to plot")
    parser.add_argument('out', help="The output image file")

    parser.add_argument(
        '-p',
        '--use-predictions',
        help="If this flag is present, then "
        "the \"predicted ORFs\" files will be used. Otherwise, all ORFs in the dataset "
        "will be visualized.",
        action='store_true')
    parser.add_argument(
        '-r',
        '--is-replicate',
        help="If the name corresponds to one "
        "of the replicates, this flag must be used to ensure the filenames are "
        "handled correctly.",
        action='store_true')

    parser.add_argument('--title', default=default_title)

    parser.add_argument('--min-rpkm', type=float, default=default_min_rpkm)
    parser.add_argument('--max-rpkm', type=float, default=default_max_rpkm)
    parser.add_argument('--min-bf', type=float, default=default_min_bf)
    parser.add_argument('--max-bf', type=float, default=default_max_bf)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config), Loader=yaml.FullLoader)
    note = config.get('note', None)

    if args.is_replicate:
        lengths = None
        offsets = None
    else:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(
            config, args.name)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations',
                                        None)

    # we will need these to get the appropriate log BFs
    if args.use_predictions:
        bayes_factors = filenames.get_riboseq_predicted_orfs(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=True,
            note=note,
            is_smooth=True,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations)
    else:
        bayes_factors = filenames.get_riboseq_bayes_factors(
            config['riboseq_data'],
            args.name,
            length=lengths,
            offset=offsets,
            is_unique=True,
            note=note,
            is_smooth=True,
            fraction=fraction,
            reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors):
        msg = (
            "Could not find the Bayes factor file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(bayes_factors))
        logger.warning(msg)
        return

    msg = "Reading Bayes factors"
    logger.info(msg)
    bayes_factors = bio.read_bed(bayes_factors)

    # we need these to get the raw counts for calculating RPKM

    # we always need all of the counts, so there is no need to restrict to
    # the predicted ORFs
    rpchi_pvalues = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
                                                        args.name,
                                                        length=lengths,
                                                        offset=offsets,
                                                        is_unique=True,
                                                        note=note,
                                                        is_smooth=False)

    if not os.path.exists(rpchi_pvalues):
        msg = (
            "Could not find the Rp-chi pvalues file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(rpchi_pvalues))
        logger.warning(msg)
        return

    msg = "Reading Rp-chi pvalues"
    logger.info(msg)
    rpchi_pvalues = bio.read_bed(rpchi_pvalues)

    msg = "Calculating RPKM values"
    logger.info(msg)

    # we approximate the number of mapped reads as the sum across all ORFs.
    # this double-counts some reads. N.B. the per-kilobase factor (1e3) of
    # standard RPKM is omitted here; since values are only shown on a symlog
    # axis, this affects only the scale
    num_reads = np.sum(rpchi_pvalues['profile_sum'])
    all_rpkm = (1e6 * rpchi_pvalues['x_1_sum']) / (rpchi_pvalues['orf_len'] *
                                                   num_reads)

    # only include things that have some reads in the visualization
    m_rpkm = all_rpkm > 0

    msg = "Creating plot"
    logger.info(msg)

    fig, ax = plt.subplots(figsize=(10, 5))

    cm = plt.cm.gist_earth

    for i, orf_label in enumerate(ribo_utils.orf_type_labels):

        orf_types = ribo_utils.orf_type_labels_mapping[orf_label]
        m_type = bayes_factors['orf_type'].isin(orf_types)

        # now, pull out the RPKMs
        if args.use_predictions:
            # if we are using predictions, we have to filter and join
            orf_ids = bayes_factors.loc[m_rpkm & m_type, 'id']
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type,
                                             'bayes_factor_mean'])

            m_ids = rpchi_pvalues['id'].isin(orf_ids)
            rpkm = np.array(all_rpkm[m_ids])

        else:
            # otherwise, the data frames match, so we can just use the masks
            rpkm = np.array(all_rpkm[m_rpkm & m_type])
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type,
                                             'bayes_factor_mean'])

        rpkm = np.clip(rpkm, args.min_rpkm, args.max_rpkm)
        bfs = np.clip(bfs, args.min_bf, args.max_bf)

        color = i / len(ribo_utils.orf_type_labels)
        color = cm(color)

        label = "{} ({})".format(orf_label, len(rpkm))

        ax.scatter(rpkm, bfs, label=label, color=color, edgecolor='k')

    ax.set_ylim((args.min_bf * 1.5, args.max_bf * 1.5))
    ax.set_xlim((args.min_rpkm * 1.5, args.max_rpkm * 1.25))

    ax.set_yscale('symlog')
    ax.set_xscale('symlog')

    ax.set_xlabel('RPKM')
    ax.set_ylabel('log BF')

    lgd = ax.legend(loc='center right', bbox_to_anchor=(1.5, 0.5))

    if args.title is not None and len(args.title) > 0:
        ax.set_title(args.title)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd, ))
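

# A hypothetical invocation of the scatter plot example above (the script
# name is an assumption). The symlog axes keep zero and negative log BFs
# visible while still compressing large values:
#
#   python create-bf-rpkm-scatter-plot.py config.yaml my-sample bf-rpkm.pdf \
#       --min-rpkm 0 --max-rpkm 100 --min-bf -10000 --max-bf 10000
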
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script matches QTI-seq peaks from (Gao et al., 2015) to ORFs "
        "based on genomic coordinates.")
    parser.add_argument('orfs', help="The ORFs (BED12+) file")
    parser.add_argument('qti_peaks', help="The QTI-seq peaks (BED6) file")
    parser.add_argument('out', help="The augmented ORFs (BED12+) file")

    parser.add_argument('--output-prefix', help="A string to prefix before all of the "
        "fields related to the closest QTI-seq peak (if there is one)",
        default=default_output_prefix)

    parser.add_argument('--seqname-prefix', help="If present, this string is prepended "
        "to all of the ORF seqnames. It is then removed again in the final output.", 
        default=default_seqname_prefix)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    programs = ['closestBed']
    utils.check_programs_exist(programs)

    msg = "Reading ORFs"
    logger.info(msg)

    orfs = bio.read_bed(args.orfs)

    # we need to keep a copy that we use later for output
    orfs_copy = orfs.copy()

    # for matching qti-seq peaks, we only want to consider the start position of each ORF

    # for forward strand ORFs, replace orf_genomic_end with orf_genomic_start
    msg = "Updating genomic positions to consider only start codon"
    logger.info(msg)
    m_forward = orfs['strand'] == '+'
    orfs.loc[m_forward, 'end'] = orfs.loc[m_forward, 'start'] + 1

    # for reverse ORFs, replace orf_genomic_start with orf_genomic_end
    m_reverse = orfs['strand'] == '-'
    orfs.loc[m_reverse, 'start'] = orfs.loc[m_reverse, 'end'] - 1

    # correct the seqnames and sort for bedtools
    msg = "Converting ORF data frame to pybedtools"
    logger.info(msg)

    orfs['seqname'] = args.seqname_prefix + orfs['seqname']
    orfs = orfs.sort_values(['seqname', 'start'])
    orfs_bed = pybedtools.BedTool.from_dataframe(orfs)

    msg = "Reading QTI peaks"
    qti_bed_df = bio.read_bed(args.qti_peaks)
    qti_bed_df.columns = ["{}_{}".format(args.output_prefix, c) for c in qti_bed_df.columns]

    # and convert to BED
    msg = "Converting QTI peaks data frame to pybedtools"
    logger.info(msg)

    chr_field = '{}_chr'.format(args.output_prefix)
    start_field = '{}_start'.format(args.output_prefix)
    qti_bed_df = qti_bed_df.sort_values([chr_field, start_field])
    qti_bed = pybedtools.BedTool.from_dataframe(qti_bed_df)

    msg = "Finding closest QTI peak for all ORFs"
    logger.info(msg)
    # s means to consider strandedness
    # D means to report the distance
    # a means to report upstream positions (relative to orfs_start) as negative
    closest_bed = orfs_bed.closest(qti_bed, s=True, D="a")

    # convert back to a data frame for cleanup
    msg = "Converting closest results back to data frame"
    logger.info(msg)
    
    peak_distance_field = '{}_peak_distance'.format(args.output_prefix)
    closest_bed_fields = list(orfs.columns) + list(qti_bed_df.columns) + [peak_distance_field]
    closest_df = closest_bed.to_dataframe(names=closest_bed_fields, index_col=False)

    # now join the relevant fields back to the original ORF data frame
    fields_to_join = list(qti_bed_df.columns) + [peak_distance_field, 'id']
    closest_df = closest_df[fields_to_join]

    msg = "Joining closest results to original ORFs"
    logger.info(msg)
    orf_qti_df = pd.merge(orfs_copy, closest_df, on='id', how='left')
    orf_qti_df = orf_qti_df.sort_values(['seqname', 'start'])

    # and write this out as a bed12+ file
    msg = "Writing joined BED12+ file to disk"
    logger.info(msg)

    bio.write_bed(orf_qti_df, args.out)
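

# A toy sanity check of the closest call above, with hypothetical
# coordinates: with s=True and D="a", the nearer same-strand peak is
# reported with a signed distance, negative meaning it lies upstream of the
# ORF start (bookended features count as distance 1 in bedtools).
def _demo_closest():
    orf_starts = pybedtools.BedTool("chr1\t100\t101\torf1\t0\t+\n",
                                    from_string=True)
    peaks = pybedtools.BedTool("chr1\t90\t91\tpeak1\t0\t+\n"
                               "chr1\t150\t151\tpeak2\t0\t+\n",
                               from_string=True)
    # expected: orf1 matched to peak1 with distance -10
    print(orf_starts.closest(peaks, s=True, D="a"))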