def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script rejoins annotations which were split with the "
        "split-long-chromosomes script. Importantly, the \"--max-size\" parameter "
        "here must match the \"--max-size\" parameter used for that script.")

    parser.add_argument('bed', help="A BED file using the annotations from a GTF file "
        "created by split-long-chromosomes")
    parser.add_argument('out', help="A BED file in which the chromosomes have been rejoined")

    parser.add_argument('--max-size', help="The largest allowed size (in bp) for a "
        "chromosome", type=int, default=default_max_size)
    parser.add_argument('--num-procs', help="The number of processors to use",
        type=int, default=default_num_procs)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading BED"
    logging.info(msg)
    bed = bio.read_bed(args.bed)

    msg = "Updating BED coordinates"
    logging.info(msg)
    original_coordinates = parallel.apply_parallel(bed, args.num_procs,
        get_original_coordinates, args.max_size, progress_bar=True)

    original_coordinates_df = pd.DataFrame(original_coordinates)

    bed['seqname'] = original_coordinates_df['seqname']
    bed['start'] = original_coordinates_df['start'].astype(int)
    bed['end'] = original_coordinates_df['end'].astype(int)
    bed['thick_start'] = original_coordinates_df['thick_start'].astype(int)
    bed['thick_end'] = original_coordinates_df['thick_end'].astype(int)

    msg = "Writing updated BED to disk"
    logging.info(msg)
    bio.write_bed(bed, args.out)
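# get_original_coordinates is defined elsewhere in this script. A minimal
# sketch of its likely behavior is below, assuming split-long-chromosomes names
# the pieces "<seqname>_<part>" and offsets coordinates by part * max_size; the
# naming and offset scheme are assumptions, not taken from this file.
def _get_original_coordinates_sketch(row, max_size):
    # "chr1_2" -> ("chr1", 2); the part index is assumed to be the last token
    seqname, part = row['seqname'].rsplit('_', 1)
    offset = int(part) * max_size  # start of this piece in the original chromosome
    return {
        'seqname': seqname,
        'start': row['start'] + offset,
        'end': row['end'] + offset,
        'thick_start': row['thick_start'] + offset,
        'thick_end': row['thick_end'] + offset,
    }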
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script reorders sequences in a fasta file to be in the same "
        "order as the sequences in the STAR transcriptInfo.tab file.")

    parser.add_argument('transcript_info', help="The STAR transcriptInfo.tab file, "
        "which contains the desired sequence order")
    parser.add_argument('fasta', help="The fasta file containing the sequences. The "
        "index should already be created, and the keys for the index must exactly "
        "match the sequence names in the transcript_info file.")
    parser.add_argument('out', help="The output fasta file with the sequences rearranged")

    parser.add_argument('--compress', help="If this flag is present, then the output "
        "will be gzipped", action='store_true')

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading STAR transcript file"
    logging.info(msg)
    transcript_info = bio.read_star_tr_file(args.transcript_info)

    msg = "Reading transcript fasta file"
    logging.info(msg)
    fasta = bio.get_fasta_dict(args.fasta)

    # map from the bare transcript identifier to the full fasta header
    trans_id_mapping = {x.split()[0]: x for x in fasta.keys()}

    # open the output file
    if args.compress:
        out = gzip.open(args.out, 'wt')
    else:
        out = open(args.out, 'w')

    for i in tqdm.trange(len(transcript_info)):
        trans_id = transcript_info.iloc[i]['ID']
        fasta_id = trans_id_mapping[trans_id]

        header = ">{}\n".format(fasta_id)
        out.write(header)

        seq = str(fasta[fasta_id])
        wrapped_seq = textwrap.fill(seq)
        out.write(wrapped_seq)
        out.write("\n")

    out.close()
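# A tiny illustration of the trans_id_mapping lookup above: fasta headers often
# carry extra metadata after the first whitespace token, while the STAR
# transcriptInfo.tab file stores only the bare identifier. The example keys
# below are made up for illustration.
def _trans_id_mapping_example():
    fasta_keys = ["ENST00000335137.4 cdna chromosome:GRCh38:7", "ENST00000417324.1 cdna"]
    mapping = {k.split()[0]: k for k in fasta_keys}
    assert mapping["ENST00000417324.1"] == "ENST00000417324.1 cdna"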
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script matches QTI-seq peaks from (Gao et al., 2015) to ORFs "
        "based on genomic coordinates.")

    parser.add_argument('orfs', help="The ORFs (BED12+) file")
    parser.add_argument('qti_peaks', help="The QTI-seq peak (BED6) files")
    parser.add_argument('out', help="The augmented ORFs (BED12+) file")

    parser.add_argument('--output-prefix', help="A string to prefix before all of the "
        "fields related to the closest QTI-seq peak (if there is one)",
        default=default_output_prefix)
    parser.add_argument('--seqname-prefix', help="If present, this string is prepended "
        "to all of the ORF seqnames. It is then removed again in the final output.",
        default=default_seqname_prefix)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    programs = ['closestBed']
    utils.check_programs_exist(programs)

    msg = "Reading ORFs"
    logger.info(msg)
    orfs = bio.read_bed(args.orfs)

    # we need to keep a copy that we use later for output
    orfs_copy = orfs.copy()

    # for matching QTI-seq peaks, we only want to consider the start position of
    # each ORF. for forward strand ORFs, replace orf_genomic_end with orf_genomic_start
    msg = "Updating genomic positions to consider only start codon"
    logger.info(msg)

    m_forward = orfs['strand'] == '+'
    orfs.loc[m_forward, 'end'] = orfs.loc[m_forward, 'start'] + 1

    # for reverse strand ORFs, replace orf_genomic_start with orf_genomic_end
    m_reverse = orfs['strand'] == '-'
    orfs.loc[m_reverse, 'start'] = orfs.loc[m_reverse, 'end'] - 1

    # correct the seqnames and sort for bedtools
    msg = "Converting ORF data frame to pybedtools"
    logger.info(msg)

    orfs['seqname'] = args.seqname_prefix + orfs['seqname']
    orfs = orfs.sort_values(['seqname', 'start'])
    orfs_bed = pybedtools.BedTool.from_dataframe(orfs)

    msg = "Reading QTI peaks"
    logger.info(msg)

    qti_bed_df = bio.read_bed(args.qti_peaks)
    qti_bed_df.columns = ["{}_{}".format(args.output_prefix, c)
        for c in qti_bed_df.columns]

    # and convert to bed
    msg = "Converting QTI peaks data frame to pybedtools"
    logger.info(msg)

    chr_field = '{}_chr'.format(args.output_prefix)
    start_field = '{}_start'.format(args.output_prefix)
    qti_bed_df = qti_bed_df.sort_values([chr_field, start_field])
    qti_bed = pybedtools.BedTool.from_dataframe(qti_bed_df)

    msg = "Finding closest QTI peak for all ORFs"
    logger.info(msg)

    # s means to consider strandedness
    # D means to report the distance
    # a means to report upstream positions (relative to orfs_bed) as negative
    closest_bed = orfs_bed.closest(qti_bed, s=True, D="a")

    # convert back to a df for clean up
    msg = "Converting closest results back to data frame"
    logger.info(msg)

    peak_distance_field = '{}_peak_distance'.format(args.output_prefix)
    closest_bed_fields = (list(orfs.columns) + list(qti_bed_df.columns)
        + [peak_distance_field])
    closest_df = closest_bed.to_dataframe(names=closest_bed_fields, index_col=False)

    # now join the relevant fields back to the original ORF data frame
    fields_to_join = list(qti_bed_df.columns) + [peak_distance_field, 'id']
    closest_df = closest_df[fields_to_join]

    msg = "Joining closest results to original ORFs"
    logger.info(msg)

    orf_qti_df = pd.merge(orfs_copy, closest_df, on='id', how='left')
    orf_qti_df = orf_qti_df.sort_values(['seqname', 'start'])

    # and write this out as a BED12+ file
    msg = "Writing joined BED12+ file to disk"
    logger.info(msg)
    bio.write_bed(orf_qti_df, args.out)
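# A minimal, self-contained sketch of the strand-aware closest operation used
# above, with made-up intervals (an illustration, not part of the pipeline).
# s=True (-s) restricts matches to peaks on the same strand, and D="a" (-D a)
# appends a signed distance column in which peaks upstream of the ORF start
# are reported as negative.
def _closest_example():
    import pybedtools
    orf_start = pybedtools.BedTool("chr1 100 101 orf1 0 +", from_string=True)
    peak = pybedtools.BedTool("chr1 90 95 peak1 0 +", from_string=True)
    closest = orf_start.closest(peak, s=True, D="a")
    print(closest)  # the final column is the signed distance (negative: upstream)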
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a plot showing the fraction of predicted ORFs "
        "which have a set amount of peptide coverage.")

    parser.add_argument('rpbp_peptide_matches', help="The (csv) file containing the "
        "peptides matching to each ORF predicted as translated using Rp-Bp (produced "
        "by get-orf-peptide-matches)")
    parser.add_argument('rpchi_peptide_matches', help="The (csv) file containing the "
        "peptides matching to each ORF predicted as translated using Rp-chi (produced "
        "by get-orf-peptide-matches)")
    parser.add_argument('out', help="The output (image) file")

    parser.add_argument('-l', '--min-length', help="The minimum length for ORFs (in "
        "nucleotides) to consider in the analysis", type=int,
        default=default_min_length)
    parser.add_argument('-p', '--num-cpus', help="The number of processors to use",
        type=int, default=default_num_cpus)

    parser.add_argument('--title', default=default_title)
    parser.add_argument('--fontsize', type=int, default=default_fontsize)
    parser.add_argument('--note-fontsize', type=int, default=default_note_fontsize)
    parser.add_argument('--line-width', type=int, default=default_line_width)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    msg = "Reading predictions"
    logging.info(msg)

    rpbp_peptide_matches = pd.read_csv(args.rpbp_peptide_matches)
    rpchi_peptide_matches = pd.read_csv(args.rpchi_peptide_matches)

    if args.min_length > 0:
        msg = "Filtering predictions by: length > {}".format(args.min_length)
        logging.warning(msg)

        # multiply by 3 because the ORF sequences are amino acid sequences
        bf_lengths = rpbp_peptide_matches['orf_sequence'].str.len() * 3
        m_bf_length = bf_lengths > args.min_length
        rpbp_peptide_matches = rpbp_peptide_matches[m_bf_length]

        chisq_lengths = rpchi_peptide_matches['orf_sequence'].str.len() * 3
        m_chisq_length = chisq_lengths > args.min_length
        rpchi_peptide_matches = rpchi_peptide_matches[m_chisq_length]

    msg = "Calculating Rp-Bp coverage"
    logging.info(msg)
    bf_coverage = parallel.apply_parallel(rpbp_peptide_matches, args.num_cpus,
        get_orf_coverage, progress_bar=True)
    bf_coverage = pd.DataFrame(bf_coverage)

    msg = "Calculating Rp-chi coverage"
    logging.info(msg)
    chisq_coverage = parallel.apply_parallel(rpchi_peptide_matches, args.num_cpus,
        get_orf_coverage, progress_bar=True)
    chisq_coverage = pd.DataFrame(chisq_coverage)

    msg = "Creating image"
    logging.info(msg)

    # plot the empirical distribution of peptide coverage
    hist_min = 0
    hist_max = 1.1
    hist_step = 0.05
    hist_range = (hist_min, hist_max)
    hist_bins = np.arange(hist_min, hist_max, hist_step)

    bf_covered_hist, b = np.histogram(bf_coverage['coverage'], bins=hist_bins,
        range=hist_range, density=True)
    chisq_covered_hist, b = np.histogram(chisq_coverage['coverage'], bins=hist_bins,
        range=hist_range, density=True)

    # now, normalize the histograms
    bf_covered_hist = bf_covered_hist / np.sum(bf_covered_hist)
    chisq_covered_hist = chisq_covered_hist / np.sum(chisq_covered_hist)

    # multiply by 100 to give actual percentages
    bf_covered_hist = 100 * bf_covered_hist
    chisq_covered_hist = 100 * chisq_covered_hist
    hist_bins = 100 * hist_bins

    fig, ax = plt.subplots(figsize=(10, 5))
    cm = plt.cm.gist_earth

    x = np.arange(len(bf_covered_hist))

    bf_label = r'\textsc{Rp-Bp}'
    ax.plot(x, bf_covered_hist, color=cm(0.1), label=bf_label,
        linewidth=args.line_width, linestyle='--', marker='^')

    chisq_label = r'\textsc{Rp-$\chi^2$}'
    ax.plot(x, chisq_covered_hist, color=cm(0.3), label=chisq_label,
        linewidth=args.line_width, linestyle='-.', marker='D')

    ax.set_xlabel(r'Peptide Coverage (\%)', fontsize=args.fontsize)
    ax.set_ylabel(r'\% of predicted ORFs', fontsize=args.fontsize)

    if args.title is not None and len(args.title) > 0:
        ax.set_title(args.title, fontsize=args.fontsize)

    # only show every 20% on the x-axis
    ax.set_xticks(x[::4])
    ax.set_xticklabels(hist_bins[::4])

    def my_formatter_fun(x, p):
        return "${:d}$".format(20 * p)

    ax.get_xaxis().set_major_formatter(
        matplotlib.ticker.FuncFormatter(my_formatter_fun))

    # hide the "0" tick label
    yticks = ax.yaxis.get_major_ticks()
    yticks[0].label1.set_visible(False)

    ax.set_xlim((0, len(bf_covered_hist) - 1))
    ax.set_ylim((0, 10))

    ax.legend(loc='upper right', fontsize=args.fontsize)
    ax.tick_params(axis='both', which='major', labelsize=args.note_fontsize)

    fig.savefig(args.out, bbox_inches='tight')
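# A standalone illustration of the histogram normalization above: density=True
# yields a probability density, so the bar heights are renormalized to sum to
# one and then scaled to percentages. The sample values are made up.
def _coverage_hist_example():
    import numpy as np
    values = np.array([0.1, 0.2, 0.2, 0.8, 1.0])
    hist, _ = np.histogram(values, bins=np.arange(0, 1.1, 0.05), density=True)
    hist = 100 * hist / np.sum(hist)  # percent of ORFs per coverage bin
    assert abs(np.sum(hist) - 100) < 1e-9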
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script plots the (log) Bayes factor against the estimated "
        "RPKM for all ORFs. All relevant values will be clipped according to the "
        "specified arguments for viewing.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of the dataset or replicate to plot")
    parser.add_argument('out', help="The output image file")

    parser.add_argument('-p', '--use-predictions', help="If this flag is present, then "
        "the \"predicted ORFs\" files will be used. Otherwise, all ORFs in the dataset "
        "will be visualized.", action='store_true')
    parser.add_argument('-r', '--is-replicate', help="If the name corresponds to one "
        "of the replicates, this flag must be used to ensure the filenames are "
        "handled correctly.", action='store_true')

    parser.add_argument('--title', default=default_title)
    parser.add_argument('--min-rpkm', type=float, default=default_min_rpkm)
    parser.add_argument('--max-rpkm', type=float, default=default_max_rpkm)
    parser.add_argument('--min-bf', type=float, default=default_min_bf)
    parser.add_argument('--max-bf', type=float, default=default_max_bf)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config))
    note = config.get('note', None)

    if args.is_replicate:
        lengths = None
        offsets = None
    else:
        lengths, offsets = ribo_utils.get_periodic_lengths_and_offsets(config, args.name)

    fraction = config.get('smoothing_fraction', None)
    reweighting_iterations = config.get('smoothing_reweighting_iterations', None)

    # we will need these to get the appropriate log BFs
    if args.use_predictions:
        bayes_factors = filenames.get_riboseq_predicted_orfs(config['riboseq_data'],
            args.name, length=lengths, offset=offsets, is_unique=True, note=note,
            is_smooth=True, fraction=fraction,
            reweighting_iterations=reweighting_iterations)
    else:
        bayes_factors = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
            args.name, length=lengths, offset=offsets, is_unique=True, note=note,
            is_smooth=True, fraction=fraction,
            reweighting_iterations=reweighting_iterations)

    if not os.path.exists(bayes_factors):
        msg = ("Could not find the Bayes factor file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(bayes_factors))
        logger.warning(msg)
        return

    msg = "Reading Bayes factors"
    logger.info(msg)
    bayes_factors = bio.read_bed(bayes_factors)

    # we need these to get the raw counts for calculating RPKM
    # we always need all of the counts, so no need to check which ORFs
    rpchi_pvalues = filenames.get_riboseq_bayes_factors(config['riboseq_data'],
        args.name, length=lengths, offset=offsets, is_unique=True, note=note,
        is_smooth=False)

    if not os.path.exists(rpchi_pvalues):
        msg = ("Could not find the Rp-chi pvalues file: {}\nIf this is for a particular "
            "sample and the --merge-replicates option was used, this is not a problem. "
            "Will not create this scatter plot".format(rpchi_pvalues))
        logger.warning(msg)
        return

    msg = "Reading Rp-chi pvalues"
    logger.info(msg)
    rpchi_pvalues = bio.read_bed(rpchi_pvalues)

    msg = "Calculating RPKM values"
    logger.info(msg)

    # we approximate the number of mapping reads as the sum across all ORFs.
    # this double-counts some reads
    num_reads = np.sum(rpchi_pvalues['profile_sum'])

    all_rpkm = (1e6 * rpchi_pvalues['x_1_sum']) / (rpchi_pvalues['orf_len'] * num_reads)

    # only include things that have some reads in the visualization
    m_rpkm = all_rpkm > 0

    msg = "Creating plot"
    logger.info(msg)

    fig, ax = plt.subplots(figsize=(10, 5))
    cm = plt.cm.gist_earth

    for i, orf_label in enumerate(ribo_utils.orf_type_labels):
        orf_types = ribo_utils.orf_type_labels_mapping[orf_label]
        m_type = bayes_factors['orf_type'].isin(orf_types)

        # now, pull out the RPKMs
        if args.use_predictions:
            # if we are using predictions, we have to filter and join
            orf_ids = bayes_factors.loc[m_rpkm & m_type, 'id']
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type, 'bayes_factor_mean'])

            m_ids = rpchi_pvalues['id'].isin(orf_ids)
            rpkm = np.array(all_rpkm[m_ids])
        else:
            # otherwise, the data frames match, so we can just use the masks
            rpkm = np.array(all_rpkm[m_rpkm & m_type])
            bfs = np.array(bayes_factors.loc[m_rpkm & m_type, 'bayes_factor_mean'])

        rpkm = np.clip(rpkm, args.min_rpkm, args.max_rpkm)
        bfs = np.clip(bfs, args.min_bf, args.max_bf)

        color = i / len(ribo_utils.orf_type_labels)
        color = cm(color)

        label = "{} ({})".format(orf_label, len(rpkm))
        ax.scatter(rpkm, bfs, label=label, color=color, edgecolor='k')

    ax.set_ylim((args.min_bf * 1.5, args.max_bf * 1.5))
    ax.set_xlim((args.min_rpkm * 1.5, args.max_rpkm * 1.25))

    ax.set_yscale('symlog')
    ax.set_xscale('symlog')

    ax.set_xlabel('RPKM')
    ax.set_ylabel('log BF')

    lgd = ax.legend(loc='center right', bbox_to_anchor=(1.5, 0.5))

    if len(args.title) > 0:
        ax.set_title(args.title)

    fig.savefig(args.out, bbox_inches='tight', bbox_extra_artists=(lgd,))
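# A minimal sketch of the clip-then-symlog pattern used in the plotting loop
# above. Clipping pins extreme values to the plot boundaries rather than
# dropping them, and the symlog scale keeps zero and negative log BFs
# plottable. The data and limits here are made up.
def _clip_symlog_example():
    import numpy as np
    import matplotlib.pyplot as plt
    rng = np.random.default_rng(0)
    rpkm = np.clip(rng.lognormal(0, 3, 200), 0.1, 1e4)  # RPKM-like values
    bfs = np.clip(rng.normal(0, 20, 200), -100, 1000)   # log-BF-like values
    fig, ax = plt.subplots()
    ax.scatter(rpkm, bfs, edgecolor='k')
    ax.set_xscale('symlog')
    ax.set_yscale('symlog')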
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script extracts reads of various types from a processed "
        "dataset to create an \"interesting\" test dataset.\n\nN.B. This script is "
        "not particularly efficient and is not intended for external use.")

    parser.add_argument('config', help="The (yaml) config file")
    parser.add_argument('name', help="The name of the dataset to use to create the test data")
    parser.add_argument('out', help="The output (fastq.gz) which contains reads of "
        "various types, subject to the other parameters")

    parser.add_argument('-r', '--reference', help="The name of the reference sequence "
        "(chromosome) from which aligned reads will be extracted",
        default=default_reference)
    parser.add_argument('-m', '--max-reads', help="At most <max_reads> reads of each "
        "type will be included in the final output", type=int,
        default=default_max_reads)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config))
    note = config.get('note', None)

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    ###
    msg = "Reading alignments from BAM file"
    logging.info(msg)

    bam_file = filenames.get_riboseq_bam(config['riboseq_data'], args.name,
        is_unique=is_unique, note=note)
    bam = pysam.AlignmentFile(bam_file)
    alignments = bam.fetch(reference=args.reference)
    num_alignments = bam.count(reference=args.reference)
    alignment_qnames = {get_first_token(a.qname) for a in alignments}

    ###
    msg = "Extracting a similar number of rRNA reads"
    logging.info(msg)

    with_rrna = filenames.get_with_rrna_fastq(config['riboseq_data'], args.name,
        note=note)
    rrna = bio.get_read_iterator(with_rrna, is_fasta=False)
    rrna = itertools.islice(rrna, num_alignments)
    rrna_qnames = {get_first_token(read[0]) for read in rrna}

    ###
    msg = "Extracting a similar number of reads which do not uniquely map to the genome"
    logging.info(msg)

    # first, pull out the qnames of all alignments
    all_alignments = bam.fetch()
    all_alignment_qnames = {get_first_token(a.qname) for a in all_alignments}

    # iterate over all reads which passed the rRNA and quality filtering
    without_rrna_file = filenames.get_without_rrna_fastq(config['riboseq_data'],
        args.name, note=note)
    without_rrna = bio.get_read_iterator(without_rrna_file, is_fasta=False)
    without_rrna_qnames = {get_first_token(read[0]) for read in without_rrna}

    no_mapping_qnames = without_rrna_qnames - all_alignment_qnames

    ###
    msg = "Extracting a similar number of reads which are filtered due to quality issues"
    logging.info(msg)

    # first, pull in all the reads and their names
    msg = "Reading all reads into a dictionary"
    logging.debug(msg)

    raw_data_file = config['riboseq_samples'][args.name]
    raw_data = bio.get_fasta_dict(raw_data_file, is_fasta=False,
        key_fn=get_first_token)
    raw_data_qnames = set(raw_data.keys())

    msg = "Reading quality scores into dictionary"
    logging.debug(msg)

    raw_data_qual = bio.get_fastq_qual_dict(raw_data_file, key_fn=get_first_token)

    # now, the reads which _did_ pass quality filtering
    msg = "Reading reads which pass quality filtering into a set"
    logging.debug(msg)

    without_adapters_file = filenames.get_without_adapters_fastq(
        config['riboseq_data'], args.name, note=note)
    without_adapters = bio.get_read_iterator(without_adapters_file, is_fasta=False)
    without_adapters_qnames = {get_first_token(read[0]) for read in without_adapters}

    # and pull out the qnames of the reads which did not pass quality filtering
    filtered_reads_qnames = raw_data_qnames - without_adapters_qnames

    ###
    msg = "Constructing the set of reads to output"
    logging.info(msg)

    alignment_raw_data = {qname: raw_data[qname]
        for qname in itertools.islice(alignment_qnames, args.max_reads)}
    rrna_raw_data = {qname: raw_data[qname]
        for qname in itertools.islice(rrna_qnames, args.max_reads)}
    no_mapping_raw_data = {qname: raw_data[qname]
        for qname in itertools.islice(no_mapping_qnames, args.max_reads)}
    filtered_reads_raw_data = {qname: raw_data[qname]
        for qname in itertools.islice(filtered_reads_qnames, args.max_reads)}

    out_raw_data = alignment_raw_data
    out_raw_data.update(rrna_raw_data)
    out_raw_data.update(no_mapping_raw_data)
    out_raw_data.update(filtered_reads_raw_data)

    ###
    msg = "Writing sequences to disk"
    logging.info(msg)
    bio.write_fastq(out_raw_data, raw_data_qual, args.out, progress_bar=True)
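# get_first_token is a small helper defined elsewhere in this script. A likely
# one-line equivalent is sketched below (an assumption, not the actual
# definition): fastq qnames may carry comments after whitespace, and only the
# identifier itself is wanted for the set operations above.
def _get_first_token_sketch(s):
    return s.split()[0]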
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="This script creates a simple latex document.")

    parser.add_argument('config', help="The (yaml) config file for the project")
    parser.add_argument('out', help="The path for the output files")

    parser.add_argument('-l', '--min-orf-length', help="The minimum length for ORFs "
        "(in nucleotides) to consider in the analysis", type=int,
        default=default_min_orf_length)
    parser.add_argument('--num-cpus', help="The number of processors to use for "
        "counting the matching peptides coverage for each predicted ORF.", type=int,
        default=default_num_cpus)
    parser.add_argument('--overwrite', help="If this flag is present, existing files "
        "will be overwritten.", action='store_true')
    parser.add_argument('--note', help="If this option is given, it will be used in "
        "the filenames.\n\nN.B. This REPLACES the note in the config file.",
        default=default_note)

    utils.add_logging_options(parser)
    args = parser.parse_args()
    utils.update_logging(args)

    config = yaml.load(open(args.config))

    # keep multimappers?
    is_unique = not ('keep_riboseq_multimappers' in config)

    programs = ['create-orf-peptide-coverage-line-graph']
    utils.check_programs_exist(programs)

    required_keys = ['riboseq_data', 'riboseq_samples']
    utils.check_keys_exist(config, required_keys)

    # make sure the path to the output file exists
    os.makedirs(args.out, exist_ok=True)

    note_str = config.get('note', None)
    out_note_str = note_str
    if args.note is not None and len(args.note) > 0:
        out_note_str = args.note

    project_name = config.get("project_name", default_project_name)
    title = "Proteomics analysis for {}".format(project_name)
    abstract = "This document shows the results of proteomics analysis."

    header = latex.get_header_text(title, abstract)
    footer = latex.get_footer_text()

    tex_file = os.path.join(args.out, "proteomics-report.tex")

    with open(tex_file, 'w') as out:
        out.write(header)
        out.write("\n")

        title = "ORF peptide coverage"
        latex.section(out, title)

        for name, data in config['riboseq_samples'].items():
            msg = "Processing sample: {}".format(name)
            logging.info(msg)

            logging.debug("overwrite: {}".format(args.overwrite))

            create_figures(args.config, config, name, args)

            title = name
            latex.subsection(out, title)

            try:
                lengths, offsets = riboutils.ribo_utils.get_periodic_lengths_and_offsets(
                    config, name, is_unique=is_unique)
            except FileNotFoundError:
                msg = ("Could not parse out lengths and offsets for sample: {}. "
                    "Skipping".format(name))
                logging.error(msg)
                continue

            peptide_coverage_line_graph = filenames.get_peptide_coverage_line_graph(
                config['riboseq_data'], name, length=lengths, offset=offsets,
                is_unique=is_unique, note=out_note_str)

            if os.path.exists(peptide_coverage_line_graph):
                latex.begin_figure(out)
                latex.write_graphics(out, peptide_coverage_line_graph, width=0.9)
                latex.end_figure(out)
            else:
                text = "Problem creating ORF peptide coverage plot"
                out.write(text)
                out.write("\n")

            latex.clearpage(out)

        out.write(footer)

    os.chdir(args.out)
    cmd = "pdflatex -shell-escape proteomics-report"
    utils.check_call(cmd)
    utils.check_call(cmd)  # call again to fix references
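# utils.check_call comes from the project's utility library. A rough sketch of
# an equivalent wrapper is below (an assumption, not the actual implementation).
# pdflatex is invoked twice above because the first pass only records
# cross-references; the second pass resolves them.
def _check_call_sketch(cmd):
    import shlex
    import subprocess
    subprocess.check_call(shlex.split(cmd))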