def draw_fraction_of_retained_pairs_per_tile_histogram(self, output_prefix):
    """
    Draw a histogram of the per-tile fraction of retained read pairs.

    The values are taken from self.get_fraction_of_retained_pairs_per_tile() and
    handed, together with output_prefix, to MatplotlibRoutines.percent_histogram
    with input_mode="fraction", 20 bins and png/svg extensions.

    :param output_prefix: forwarded unchanged as the second positional argument
                          of MatplotlibRoutines.percent_histogram
    :return: None
    """
    retained_fractions = self.get_fraction_of_retained_pairs_per_tile()

    # All plot settings are fixed; they are collected here to keep the call short.
    histogram_options = {
        "n_bins": 20,
        "title": "Distribution of retained pairs per tile",
        "xlabel": "Fraction of retained pairs",
        "ylabel": "Number of tiles",
        "label": None,
        "extensions": ("png", "svg"),
        "legend": None,
        "legend_location": "best",
        "input_mode": "fraction",
        "xmax": None,
        "xmin": None,
    }

    MatplotlibRoutines.percent_histogram(retained_fractions, output_prefix, **histogram_options)
AUGUSTUS.extract_CDS_annotations_from_output(final_gff, final_CDS_gff) print("Drawing histograms...") for stat_file in output_evidence_stats, output_supported_stats, \ output_swissprot_pfam_or_hints_supported_transcripts_longest_pep_evidence, \ output_swissprot_pfam_and_hints_supported_transcripts_longest_pep_evidence, \ output_swissprot_pfam_or_hints_supported_transcripts_evidence, \ output_swissprot_pfam_and_hints_supported_transcripts_evidence: MatplotlibRoutines.percent_histogram_from_file( stat_file, stat_file, data_type=None, column_list=(2, ), comments="#", n_bins=20, title="Transcript support by hints", extensions=("png", "svg"), legend_location="upper center", stats_as_legend=True) print("Creating final directories...") if args.pfam_db and args.swissprot_db: db_or_hints_dir = "supported_by_db_or_hints/" db_and_hints_dir = "supported_by_db_and_hints/" for directory in db_and_hints_dir, db_or_hints_dir: FileRoutines.safe_mkdir(directory) os.system("mv %s.supported.transcripts.swissprot_or_pfam_or_hints* %s" % (args.output, db_or_hints_dir)) os.system("mv %s.supported.transcripts.swissprot_or_pfam_and_hints* %s" %
if args.index is None: args.index = [None for i in range(0, len(args.input))] if args.max_value is None: args.max_value = [None for i in range(0, len(args.input))] MatplotlibRoutines.draw_tetra_histogram_with_two_logscaled_from_file( args.input, args.index, args.output_prefix, figsize=(10, 10), number_of_bins_list=args.number_of_bins, width_of_bins_list=args.width_of_bins, max_threshold_list=args.max_value, min_threshold_list=args.min_value, xlabel=args.xlabel, ylabel=args.ylabel, title_list=args.title_list, logbase=args.logbase, label_list=None, extensions=args.extensions, suptitle=None, separator=args.separator, share_y_axis=args.share_y_axis, share_x_axis=args.share_x_axis) """ Example: ~/Dropbox/MAVR/scripts/draw/draw_tetra_histogram_with_two_logscaled.py -i kirill.dn.ds.w.tab,solenodon.raw_alns.all.tab -d 3,3 -o dnds.ratio.log -l 'dN/dS' -y "Number of genes" -w 20 -n 0 -x 999 -t "11 species,4 species" """ """ if (args.number_of_bins is not None) and (args.width_of_bins is not None):
def handle_sanger_data(self, input_dir, output_prefix, outdir=None, read_subfolders=False,
                       min_mean_qual=0, min_median_qual=0, min_len=50):
    """
    Convert Sanger (ABI) traces to fastq/fasta, trim them and filter by length and quality.

    For every file selected by self.is_sanger_file under input_dir the raw record is
    written as fastq and fasta, trimmed with SeqIO.AbiIO._abi_trim, and a per-base
    quality bar plot is drawn. Trimmed records that pass the filters are written
    individually and collected into merged files. Statistics for all records and
    the ids of excluded records are written to "<output_prefix>.stats" and
    "<output_prefix>.excluded.ids"; a short summary is printed.

    :param input_dir: directory searched for Sanger files
    :param output_prefix: prefix of merged files (inside self.workdir) and of the
                          stats / excluded-ids files
    :param outdir: if set, replaces self.workdir before directories are initialised
    :param read_subfolders: passed as `recursive` to self.make_list_of_path_to_files
    :param min_mean_qual: minimum mean phred quality of the trimmed read; 0 disables the check
    :param min_median_qual: minimum median phred quality of the trimmed read; 0 disables the check
    :param min_len: minimum length of the trimmed read
    :return: None
    """
    if outdir:
        self.workdir = outdir

    self.init_dirs()

    sanger_filelist = self.make_list_of_path_to_files(input_dir,
                                                      expression=self.is_sanger_file,
                                                      recursive=read_subfolders,
                                                      return_absolute_paths=True)
    stat_dict = TwoLvlDict()
    record_dict = OrderedDict()
    trimmed_record_dict = OrderedDict()
    excluded_list = IdList()
    excluded_counter = 0
    low_quality_counter = 0
    too_short_counter = 0

    merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
    merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
    merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir, output_prefix)
    merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir, output_prefix)

    for filename in sanger_filelist:
        filename_list = self.split_filename(filename)

        record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir, filename_list[1])
        record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir, filename_list[1])
        record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (self.workdir, filename_list[1])

        record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (self.workdir, filename_list[1])
        record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (self.workdir, filename_list[1])
        record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (self.workdir,
                                                                                   filename_list[1])

        record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
        record_dict[record.id] = record
        SeqIO.write(record, record_raw_fastq, format="fastq")
        SeqIO.write(record, record_raw_fasta, format="fasta")

        # NOTE: _abi_trim is a private Biopython helper; kept because the original relies on it.
        trimmed_record = SeqIO.AbiIO._abi_trim(record)

        raw_qualities = record.letter_annotations["phred_quality"]
        trimmed_qualities = trimmed_record.letter_annotations["phred_quality"]
        trimmed_len = len(trimmed_record)
        trimmed_mean_qual = np.mean(trimmed_qualities)
        trimmed_median_qual = np.median(trimmed_qualities)

        # A list of pairs keeps the column order explicit on every Python version.
        stat_dict[record.id] = OrderedDict([
            ("raw_len", len(record)),
            ("raw_mean_qual", np.mean(raw_qualities)),
            ("raw_median_qual", np.median(raw_qualities)),
            ("trimmed_len", trimmed_len),
            ("trimmed_mean_qual", trimmed_mean_qual),
            ("trimmed_median_qual", trimmed_median_qual),
            ("retained", "-"),
        ])

        # Keyword "extentions" is spelled as in the original call.
        MatplotlibRoutines.draw_bar_plot(raw_qualities, record_raw_qual_plot_prefix,
                                         extentions=["png"], xlabel="Position", ylabel="Phred quality",
                                         title="Per base quality", min_value=None, max_value=None,
                                         new_figure=True, figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        # Each threshold is applied independently; a zero threshold disables its check.
        # Previously min_mean_qual was ignored whenever min_median_qual was 0.
        median_ok = (not min_median_qual) or (trimmed_median_qual >= min_median_qual)
        mean_ok = (not min_mean_qual) or (trimmed_mean_qual >= min_mean_qual)

        if trimmed_len < min_len:
            too_short_counter += 1
            retained = False
        elif not (median_ok and mean_ok):
            low_quality_counter += 1
            retained = False
        else:
            retained = True

        if not retained:
            # Previously this counter was never updated, so the summary always reported 0.
            excluded_counter += 1
            excluded_list.append(record.id)
            continue

        stat_dict[record.id]["retained"] = "+"

        SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
        SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

        # Figure width is derived from the raw read length, as in the original.
        MatplotlibRoutines.draw_bar_plot(trimmed_qualities, record_trimmed_qual_plot_prefix,
                                         extentions=["png"], xlabel="Position", ylabel="Phred quality",
                                         title="Per base quality", min_value=None, max_value=None,
                                         new_figure=True, figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        trimmed_record_dict[record.id] = trimmed_record

    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fasta, format="fasta")

    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fasta, format="fasta")

    excluded_list.write("%s.excluded.ids" % output_prefix)
    stat_dict.write(out_filename="%s.stats" % output_prefix)

    print("Excluded: %i" % excluded_counter)
    print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
    print("\tLow quality( median < %i or mean < %i ): %i" % (min_median_qual, min_mean_qual,
                                                            low_quality_counter))
"-e", "--extensions", action="store", dest="extensions", type=lambda x: x.split(","), default=["png", "svg"], help="Comma-separated list of extensions for histogram files") """ parser.add_argument("-l", "--xlabel", action="store", dest="xlabel", help="X label") parser.add_argument("-y", "--ylabel", action="store", dest="ylabel", help="Y label") """ parser.add_argument("-t", "--title", action="store", dest="title", help="Title of histogram") args = parser.parse_args() MatplotlibRoutines.venn_diagram_from_sets_from_files( args.id_file_a, args.id_file_b, set3_file=args.id_file_c, set_labels=args.set_labels, set_colors=args.set_colors, output_prefix=args.output_prefix, extensions=args.extensions, title=args.title)
dest="ylabel", help="Y label") parser.add_argument("-t", "--title", action="store", dest="title", help="Title of histogram") args = parser.parse_args() MatplotlibRoutines.draw_histogram_from_file(args.input_file, args.output_prefix, number_of_bins=args.number_of_bins, width_of_bins=args.width_of_bins, separator=args.separator, max_length=args.max_length, min_length=args.min_length, xlabel=args.xlabel, ylabel=args.ylabel, title=args.title, extensions=args.extensions, logbase=args.logbase) """ if (args.number_of_bins is not None) and (args.width_of_bins is not None): raise AttributeError("Options -w/--width_of_bins and -b/--number_of_bins mustn't be set simultaneously") lengths = np.fromfile(args.input_file, sep=args.separator) max_len = max(lengths) if args.max_length is None: args.max_length = max_len
def draw_general_stats_distributions(self, output_prefix, figsize=(12, 6), extensions=("png", "svg"),
                                     dpi=300, logscale_heatmaps=True):
    """
    Draw a 2 x 4 panel figure summarising self.general_stats_table.

    Column 0 of the table is plotted as two histograms (linear and log10 y axis),
    columns 3 and 2 divided by column 1 as a percent heatmap, and columns
    13, 5, 7, 9 and 11 (multiplied by 100) against column 0 as heatmaps.
    The combined figure is saved as "<output_prefix>.<ext>" for every extension.

    :param output_prefix: prefix for the combined figure and for per-panel outputs
    :param figsize: size of the combined figure, also forwarded to draw_heatmap
    :param extensions: extensions of the combined figure, also forwarded to draw_heatmap
    :param dpi: resolution of the combined figure
    :param logscale_heatmaps: forwarded as `logscaled` to all heatmap calls
    :return: None
    """
    nrows = 2
    ncols = 4
    percent_histogram_bin_number = 20
    sequences_xlabel = "N of sequences\nin alignment"

    min_seq_number_in_alignment = min(self.general_stats_table[:, 0])
    max_seq_number_in_alignment = max(self.general_stats_table[:, 0])
    seq_bin_number = max_seq_number_in_alignment - min_seq_number_in_alignment

    figure, ax_array = plt.subplots(nrows=nrows, ncols=ncols, figsize=figsize, squeeze=False, dpi=dpi)

    # Left column: sequence number per alignment, linear (top) and log-scaled (bottom)
    print("Histogram: distribution of sequence number in alignment")
    MatplotlibRoutines.draw_histogram(self.general_stats_table[:, 0],
                                      output_prefix="%s.seq_number_distibution" % output_prefix,
                                      number_of_bins=None, width_of_bins=1,
                                      max_threshold=None, min_threshold=None,
                                      xlabel=None, ylabel="N of alignments",
                                      title="Distribution of\nsequence numbers",
                                      extensions=("png",), ylogbase=None,
                                      subplot=ax_array[0, 0], suptitle=None,
                                      close_figure=False, save_histovalues_only=True)

    print("Histogram(logscaled): distribution of sequence number in alignment")
    MatplotlibRoutines.draw_histogram(self.general_stats_table[:, 0],
                                      output_prefix=None,
                                      number_of_bins=None, width_of_bins=1,
                                      max_threshold=None, min_threshold=None,
                                      xlabel=sequences_xlabel, ylabel="N of alignments",
                                      title=None,
                                      extensions=("png",), ylogbase=10,
                                      subplot=ax_array[1, 0], suptitle=None,
                                      close_figure=False)

    # Second column, top: longest vs shortest sequence, both as percent of column 1
    print("Heatmap: x: max_seq_len/aln_len, y: min_seq_len/aln_len")
    max_len_percent = self.general_stats_table[:, 3].astype(float) / self.general_stats_table[:, 1].astype(float) * 100
    min_len_percent = self.general_stats_table[:, 2].astype(float) / self.general_stats_table[:, 1].astype(float) * 100
    MatplotlibRoutines.draw_percent_heatmap(max_len_percent,
                                            min_len_percent,
                                            output_prefix="%s.min_max_seq_len" % output_prefix,
                                            xlabel="Max seq len, % of aln",
                                            ylabel="Min seq len, % of aln",
                                            title=None, figsize=(8, 8),
                                            minimum_counts_to_show=1,
                                            extensions=("png", "svg"),
                                            show_colorbar=True,
                                            bin_number=percent_histogram_bin_number,
                                            bin_width=None, bin_array=None,
                                            type="percent", add_max_value=True,
                                            subplot=ax_array[0, 1],
                                            header="#left_xedge\tleft_yedge\tvalue",
                                            save_histovalues_only=True,
                                            logscaled=logscale_heatmaps)

    # Remaining panels differ only in the table column, messages, labels and position.
    # Entries: (table column, panel position, message, output prefix template, xlabel, ylabel)
    heatmap_panels = (
        (13, (1, 1),
         "Heatmap: x: seq number, y: max_unique_pos_in_seq/aln_len",
         "%s.max_unique_pos_seq_number",
         sequences_xlabel, "Max uniq positions, % of seq"),
        (5, (0, 2),
         "Heatmap: x: seq number, y: max_unique_insertions_in_seq/aln_len",
         "%s.max_unique_insertions_seq_number",
         None, "Max uniq insertions, % of seq"),
        (7, (1, 2),
         "Heatmap: x: seq number, y: max_unique_gaps_in_seq/aln_len",
         "%s.max_unique_gaps_seq_number",
         sequences_xlabel, "Max uniq gaps, % of seq"),
        (9, (0, 3),
         "Heatmap: x: seq number, y: max_unique_leading_pos_in_seq/aln_len",
         "%s.max_unique_leading_pos_seq_number",
         None, "Max uniq leading pos, % of seq"),
        (11, (1, 3),
         "Heatmap: x: seq number, y: max_unique_trailing_pos_in_seq/aln_len",
         "%s.max_unique_trailing_pos_seq_number",
         sequences_xlabel, "Max uniq trailing pos, % of seq"),
    )

    for column, position, message, prefix_template, panel_xlabel, panel_ylabel in heatmap_panels:
        print(message)
        MatplotlibRoutines.draw_heatmap(self.general_stats_table[:, 0],
                                        100 * self.general_stats_table[:, column],
                                        output_prefix=prefix_template % output_prefix,
                                        xlabel=panel_xlabel,
                                        ylabel=panel_ylabel,
                                        title=None, figsize=figsize,
                                        minimum_counts_to_show=1,
                                        extensions=extensions,
                                        show_colorbar=True,
                                        bin_number=(seq_bin_number, percent_histogram_bin_number),
                                        bin_width=None, bin_array=None,
                                        min_x_value=min_seq_number_in_alignment,
                                        max_x_value=max_seq_number_in_alignment,
                                        min_y_value=0, max_y_value=100,
                                        add_max_value=True,
                                        subplot=ax_array[position],
                                        save_histovalues_only=True,
                                        header="#left_xedge\tleft_yedge\tcounts",
                                        logscaled=logscale_heatmaps)

    plt.tight_layout()

    for ext in extensions:
        plt.savefig("%s.%s" % (output_prefix, ext))
"-a", "--legend_location", action="store", dest="legend_location", default="upper center", help="Location of legend on histogram. Default - 'upper center'") parser.add_argument( "-m", "--input_mode", action="store", dest="input_mode", default="percent", help="Type of input data. Allowed: fraction, percent. Default - percent") args = parser.parse_args() MatplotlibRoutines.percent_histogram_from_file( args.input_file, args.output_prefix, data_type=args.data_type, column_list=args.columns_list, separator=args.separator, comments=args.comments_prefix, n_bins=args.number_of_bins, title=args.title, xlabel=args.xlabel, ylabel=args.ylabel, extensions=args.extensions, legend_location=args.legend_location, stats_as_legend=True, input_mode=args.input_mode)
# Count unique positions per sequence for every alignment given on the command line
for alignment_file in args.input:
    alignment_name_list = FileRoutines.split_filename(alignment_file)
    alignment_name = alignment_name_list[1]
    output_prefix = "%s/%s.unique_positions" % (args.output_dir, alignment_name)

    unique_position_dict[alignment_name] = \
        MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file(alignment_file,
                                                                                output_prefix,
                                                                                format=args.format,
                                                                                gap_symbol="-",
                                                                                return_mode="relative",
                                                                                verbose=False)

species_list = unique_position_dict.sl_keys()

# Regroup the values: one list per species, holding its value from every alignment
data_dict = OrderedDict()
for species in species_list:
    data_dict[species] = [unique_position_dict[alignment][species]
                          for alignment in unique_position_dict]

data_list = list(data_dict.values())

MatplotlibRoutines.extended_percent_histogram(data_list, args.histogram_output,
                                              input_mode="percent", label=species_list)
def test_roh_parameters(self, output_dir, output_prefix, input_vcf_file,
                        allow_noncanonical_chromosome_names=True,
                        keep_autoconverted_files=None,
                        window_length_in_kb=None,
                        min_homozygous_snps_per_window=(2, 51, 1),
                        min_homozygous_snps_in_roh=(2, 101, 1),
                        max_heterozygous_snps_per_window=(1, 11, 1),
                        max_heterozygous_snps=(1, 21, 1),
                        max_inverse_density_of_homozygous_snps_in_kb_per_snp=(50, 1000, 50),
                        ):
    """
    Run ROH detection over a grid of parameter values and plot the number of ROH found.

    Each of the five tuple parameters is expanded with range(*value). For every
    combination a directory "<output_dir>/<i>_<j>_<k>_<l>_<m>/" with a DESCRIPTION
    file is created, self.find_runs_of_homozygosity is run there and the number of
    entries of the resulting PLINKReport is stored. For every value of the inverse
    density parameter one PNG ("<output_dir>/pic/<value>.png") is written; its grid
    of heatmaps has one row per min_homozygous_snps_in_roh value and one column per
    max_heterozygous_snps value.

    :param output_dir: root directory for all runs and figures
    :param output_prefix: prefix of the files produced by every run
    :param input_vcf_file: forwarded to find_runs_of_homozygosity
    :param allow_noncanonical_chromosome_names: forwarded to find_runs_of_homozygosity
    :param keep_autoconverted_files: forwarded to find_runs_of_homozygosity
    :param window_length_in_kb: forwarded to find_runs_of_homozygosity
    :param min_homozygous_snps_per_window: range() arguments, first array axis
    :param min_homozygous_snps_in_roh: range() arguments, third array axis
    :param max_heterozygous_snps_per_window: range() arguments, second array axis
    :param max_heterozygous_snps: range() arguments, fourth array axis
    :param max_inverse_density_of_homozygous_snps_in_kb_per_snp: range() arguments, fifth array axis
    :return: None
    """
    self.safe_mkdir(output_dir)

    i_ticks = range(*min_homozygous_snps_per_window)
    j_ticks = range(*max_heterozygous_snps_per_window)
    k_ticks = range(*min_homozygous_snps_in_roh)
    l_ticks = range(*max_heterozygous_snps)
    m_ticks = range(*max_inverse_density_of_homozygous_snps_in_kb_per_snp)

    plink_report_dict = OrderedDict()
    roh_count_array = np.zeros((len(i_ticks), len(j_ticks), len(k_ticks), len(l_ticks), len(m_ticks)),
                               dtype=int)

    # enumerate() yields the array index together with the parameter value,
    # so no repeated .index() lookups are needed.
    for i_index, i in enumerate(i_ticks):
        plink_report_dict[i] = OrderedDict()
        for j_index, j in enumerate(j_ticks):
            plink_report_dict[i][j] = OrderedDict()
            for k_index, k in enumerate(k_ticks):
                plink_report_dict[i][j][k] = OrderedDict()
                for l_index, l_value in enumerate(l_ticks):
                    plink_report_dict[i][j][k][l_value] = OrderedDict()
                    for m_index, m in enumerate(m_ticks):
                        dir_name = "%s/%i_%i_%i_%i_%i/" % (output_dir, i, j, k, l_value, m)

                        description_text = "Minimum homozygous SNPs per window:\t%i\n" % i
                        description_text += "Minimum homozygous SNPs in ROh:\t%i\n" % k
                        description_text += "Maximum heterozygous SNPs per window:\t%i\n" % j
                        description_text += "Max heterozygous SNPs:\t%i\n" % l_value
                        description_text += "Max inverse density of homozygous SNPs(kb/SNP):\t%i\n" % m

                        self.safe_mkdir(dir_name,
                                        description_text=description_text,
                                        description_filename="DESCRIPTION")

                        self.find_runs_of_homozygosity("%s/%s" % (dir_name, output_prefix),
                                                       input_vcf_file=input_vcf_file,
                                                       allow_noncanonical_chromosome_names=allow_noncanonical_chromosome_names,
                                                       keep_autoconverted_files=keep_autoconverted_files,
                                                       roh_calling_method=None,
                                                       window_length_in_kb=window_length_in_kb,
                                                       min_homozygous_snps_per_window=i,
                                                       max_heterozygous_snps_per_window=j,
                                                       max_missing_snps_per_window=None,
                                                       max_inverse_density_of_homozygous_snps_in_kb_per_snp=m,
                                                       max_internal_gap_in_kb=None,
                                                       min_roh_length=None,
                                                       min_homozygous_snps_in_roh=k,
                                                       min_scanning_window_hit_rate=None,
                                                       generate_overlapping_segments=False,
                                                       max_heterozygous_snps=l_value,
                                                       min_concordance_across_jointly_homozygous_variants=None,
                                                       homozygous_verbose=False)

                        report = PLINKReport("%s/%s.hom" % (dir_name, output_prefix), report_type="ROH")
                        plink_report_dict[i][j][k][l_value][m] = report
                        roh_count_array[i_index, j_index, k_index, l_index, m_index] = len(report)

    figure_dir = "%s/pic/" % output_dir
    self.safe_mkdir(figure_dir)

    num_k_ticks = len(k_ticks)
    num_l_ticks = len(l_ticks)

    for m_index, m in enumerate(m_ticks):
        # squeeze=False guarantees a 2-D array of axes even for a single row or column.
        figure, subplot_array = plt.subplots(num_k_ticks, num_l_ticks, sharex=True, sharey=True,
                                             squeeze=False)
        plt.suptitle("Number of ROH depending on several parameters")

        for k_index in range(num_k_ticks):
            for l_index in range(num_l_ticks):
                # The last axis must be addressed by index, not by the parameter value.
                roh_counts = roh_count_array[:, :, k_index, l_index, m_index]

                # Column titles only on the top row, x labels only on the bottom row.
                title = "Max heterozygous SNPs: %i" % l_ticks[l_index] if k_index == 0 else None
                xlabel = "Min homozygous SNPs per window" if k_index == num_k_ticks - 1 else None

                MatplotlibRoutines.annotated_heatmap(roh_counts, i_ticks, j_ticks,
                                                     subplot=subplot_array[k_index][l_index],
                                                     title=title,
                                                     xlabel=xlabel,
                                                     ylabel=None)

        plt.savefig("%s/%i.png" % (figure_dir, m))
        # One figure is created per m value; close it so figures do not pile up in memory.
        plt.close(figure)
dest="ylabel", help="Y label") parser.add_argument("-t", "--title", action="store", dest="title", help="Title of histogram") args = parser.parse_args() MatplotlibRoutines.draw_heatmap_from_file( args.input_file, args.output_prefix, x_column=args.x_col, y_column=args.y_col, xlabel=args.xlabel, ylabel=args.ylabel, title=args.title, figsize=(8, 8), minimum_counts_to_show=args.min_counts_to_show, extensions=args.extensions, show_colorbar=not args.remove_colorbar, bin_number=args.number_of_bins, bin_width=args.width_of_bins, bin_array=args.array_of_bins, min_x_value=args.min_x, max_x_value=args.max_x, min_y_value=args.min_y, max_y_value=args.max_y, add_max_value=True)
def extract_proteins_from_output(self, augustus_output, protein_output, evidence_stats_file=None,
                                 supported_by_hints_file=None, complete_proteins_id_file=None,
                                 id_prefix="p."):
    """
    Extract protein sequences and hint-support statistics from an AUGUSTUS output file.

    Every "# protein sequence = [...]" block (possibly spanning several comment lines)
    is written to protein_output as a fasta record named id_prefix + transcript id,
    with the gene id and the presence of start/stop codons in the header.

    :param augustus_output: AUGUSTUS output file to parse
    :param protein_output: fasta file to write proteins to
    :param evidence_stats_file: if set, per-transcript support statistics are written here;
                                longest isoforms are then extracted and histograms drawn
    :param supported_by_hints_file: if set, the same statistics are written here, but only
                                    for transcripts with supported fraction > 0
    :param complete_proteins_id_file: if set, ids of proteins having both a start and
                                      a stop codon are written here
    :param id_prefix: prefix added to transcript ids
    :return: None
    """
    stats_header = ("#gene_id\ttranscript_id\tsupported_fraction\tcds_support\tintron_support\t"
                    "5'UTR_support\t3'UTR_support\tincompatible_hints_groups\tprotein_length\n")

    ev_fd = None
    sup_fd = None
    complete_fd = None

    try:
        if evidence_stats_file:
            ev_fd = open(evidence_stats_file, "w")
            ev_fd.write(stats_header)
        # Previously this branch was guarded by evidence_stats_file, so the file was
        # opened under the wrong condition (open(None) or an undefined handle later).
        if supported_by_hints_file:
            sup_fd = open(supported_by_hints_file, "w")
            sup_fd.write(stats_header)
        if complete_proteins_id_file:
            complete_fd = open(complete_proteins_id_file, "w")

        with open(protein_output, "w") as out_fd, open(augustus_output, "r") as in_fd:
            for line in in_fd:
                if line[:12] == "# start gene":
                    gene = line.strip().split()[-1]
                elif "\ttranscript\t" in line:
                    transcript_id = line.split("\t")[8].split(";")[0].split("=")[1]
                    start_presence = False
                    stop_presence = False
                elif "\tstart_codon\t" in line:
                    start_presence = True
                elif "\tstop_codon\t" in line:
                    stop_presence = True
                elif "# protein sequence" in line:
                    protein = line.strip().split("[")[-1]
                    if "]" in protein:
                        protein = protein.split("]")[0]
                    else:
                        # Sequence continues on following comment lines until the closing "]"
                        while True:
                            part = in_fd.readline().split()[-1]
                            if "]" in part:
                                protein += part.split("]")[0]
                                break
                            protein += part

                    if complete_fd is not None and start_presence and stop_presence:
                        complete_fd.write("%s%s\n" % (id_prefix, transcript_id))

                    out_fd.write(">%s%s\t gene=%s start_presence=%s stop_presence=%s\n" %
                                 (id_prefix, transcript_id, gene, str(start_presence), str(stop_presence)))
                    out_fd.write(protein)
                    protein_len = len(protein)
                    out_fd.write("\n")
                elif ev_fd is not None or sup_fd is not None:
                    if line[:17] == "# % of transcript":
                        supported_fraction = line.strip().split()[-1]
                        while True:
                            tmp_line = in_fd.readline()
                            if not tmp_line:
                                # End of file inside a support block: stop instead of looping forever
                                break
                            if tmp_line[:12] == "# CDS exons:":
                                cds_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:14] == "# CDS introns:":
                                introns_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:13] == "# 5'UTR exons":
                                five_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:13] == "# 3'UTR exons":
                                three_utr_support = tmp_line.strip().split()[-1]
                            elif tmp_line[:27] == "# incompatible hint groups:":
                                incompatible_hint_groups = tmp_line.strip().split()[-1]

                                # This is the last line of the block: emit the collected statistics
                                write_to_ev = ev_fd is not None
                                write_to_sup = (sup_fd is not None) and (float(supported_fraction) > 0)
                                if write_to_ev or write_to_sup:
                                    stats_line = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%i\n" % \
                                                 (gene, transcript_id, supported_fraction,
                                                  cds_support, introns_support, five_utr_support,
                                                  three_utr_support, incompatible_hint_groups,
                                                  protein_len)
                                    if write_to_ev:
                                        ev_fd.write(stats_line)
                                    if write_to_sup:
                                        sup_fd.write(stats_line)
                                break
    finally:
        # All three files must be flushed and closed before they are read back below.
        for fd in (ev_fd, sup_fd, complete_fd):
            if fd is not None:
                fd.close()

    if evidence_stats_file:
        self.extract_longest_isoforms(evidence_stats_file,
                                      "%s.longest_pep" % evidence_stats_file,
                                      minimum_supported_fraction=0)
        SequenceRoutines.extract_sequence_by_ids(protein_output,
                                                 "%s.longest_pep.ids" % evidence_stats_file,
                                                 "%s.longest_pep.pep" % evidence_stats_file)

    if supported_by_hints_file:
        supported_by_hints_longest_pep_evidence = "%s.longest_pep" % supported_by_hints_file
        supported_by_hints_longest_pep = "%s.longest_pep.pep" % supported_by_hints_file
        supported_by_hints_longest_pep_ids = "%s.longest_pep.ids" % supported_by_hints_file

        # The full evidence table is the preferred input (as before). When it was not
        # requested, fall back to the supported-only table, which has the same columns.
        self.extract_longest_isoforms(evidence_stats_file or supported_by_hints_file,
                                      supported_by_hints_longest_pep_evidence,
                                      minimum_supported_fraction=0.00001)
        SequenceRoutines.extract_sequence_by_ids(protein_output,
                                                 supported_by_hints_longest_pep_ids,
                                                 supported_by_hints_longest_pep)

    # Same files as before when they exist; entries derived from a file that was not
    # requested are skipped instead of being passed on as None / "None.longest_pep".
    evidence_files = []
    if evidence_stats_file:
        evidence_files.append(evidence_stats_file)
        if supported_by_hints_file:
            evidence_files.append("%s.longest_pep" % evidence_stats_file)
    if supported_by_hints_file:
        evidence_files.append("%s.longest_pep" % supported_by_hints_file)

    for evidence_file in evidence_files:
        print("Drawing transcript support distribution for %s" % evidence_file)
        MatplotlibRoutines.percent_histogram_from_file(evidence_file, evidence_file,
                                                       column_list=(2, ), separator=None,
                                                       comments="#", n_bins=20,
                                                       title="Transcript support by hints",
                                                       xlabel="%%", ylabel="Number",
                                                       extensions=["svg", "png"],
                                                       legend_location="upper center",
                                                       stats_as_legend=True)
def compare_multiple_genome_results(self, busco_file_list, output_prefix, label_list=None,
                                    black_scaffold_list=(), white_scaffold_list=()):
    """
    Compare BUSCO results of several genomes pairwise and, for 2-3 genomes, draw Venn diagrams.

    For every ordered pair of different labels and every pair of statuses from
    self.status_list the intersection of gene ids is written to an ".ids" file in
    the "pairwise_overlaps" directory, and a table of intersection sizes is
    written to the "pairwise_overlap_counts" directory. When two or three BUSCO
    tables are given, "<output_prefix>.venn.png" with a 2 x 2 grid of Venn
    diagrams (first four statuses) is saved.

    :param busco_file_list: BUSCO table files, one per genome
    :param output_prefix: prefix of output files
    :param label_list: labels of genomes; "A1", "A2", ... are used if not set
    :param black_scaffold_list: forwarded to BUSCOtable as black_list
    :param white_scaffold_list: forwarded to BUSCOtable as white_list
    :return: None
    """
    busco_table_dict = OrderedDict()
    gene_id_dict = OrderedDict()
    counts_dict = OrderedDict()

    output_directory = self.split_filename(output_prefix)[0]
    base_directory = output_directory if output_directory else "."
    pairwise_overlaps_dir = "%s/pairwise_overlaps/" % base_directory
    pairwise_overlap_counts_dir = "%s/pairwise_overlap_counts/" % base_directory

    self.safe_mkdir(pairwise_overlaps_dir)
    self.safe_mkdir(pairwise_overlap_counts_dir)

    if label_list:
        labels = label_list
    else:
        labels = ["A%i" % number for number in range(1, len(busco_file_list) + 1)]

    for busco_table, label in zip(busco_file_list, labels):
        busco_table_dict[label] = BUSCOtable(in_file=busco_table,
                                             black_list=black_scaffold_list,
                                             white_list=white_scaffold_list)
        gene_id_dict[label], counts_dict[label] = busco_table_dict[label].count_statuses()

    # TODO: draw piecharts
    # TODO: count overlaps

    pairwise_overlap_dict = OrderedDict()
    count_pairwise_overlap_dict = OrderedDict()

    for label1 in labels:
        for label2 in labels:
            if label1 == label2:
                continue

            overlap_id = "%s_vs_%s" % (label1, label2)
            pairwise_overlap_dict[overlap_id] = TwoLvlDict()
            count_pairwise_overlap_dict[overlap_id] = TwoLvlDict()

            for status1 in self.status_list:
                row_key = "%s@%s" % (label1, status1)
                pairwise_overlap_dict[overlap_id][row_key] = OrderedDict()
                count_pairwise_overlap_dict[overlap_id][row_key] = OrderedDict()

                for status2 in self.status_list:
                    column_key = "%s@%s" % (label2, status2)

                    shared_ids = IdSet(gene_id_dict[label1][status1] & gene_id_dict[label2][status2])
                    pairwise_overlap_dict[overlap_id][row_key][column_key] = shared_ids
                    count_pairwise_overlap_dict[overlap_id][row_key][column_key] = len(shared_ids)

                    shared_ids.write("%s/%s.%s_vs_%s.ids" % (pairwise_overlaps_dir, output_prefix,
                                                             row_key, column_key))

            count_pairwise_overlap_dict[overlap_id].write("%s/%s.overlap.%s.tsv" % (pairwise_overlap_counts_dir,
                                                                                    output_prefix,
                                                                                    overlap_id))

    # Venn diagrams are drawn only for two or three genomes
    if not (2 <= len(busco_file_list) <= 3):
        return

    fig, subplot_list = plt.subplots(2, 2, figsize=(6, 6))
    plt.suptitle("Overlaps for BUSCO categories between assemblies/genomes")

    third_label_present = len(labels) > 2

    # zip with range(4) limits the plot to the four panels of the 2 x 2 grid
    for index, status in zip(range(4), self.status_list):
        row, column = divmod(index, 2)
        plt.sca(subplot_list[row][column])
        plt.title(status)

        third_set = gene_id_dict[labels[2]][status] if third_label_present else None
        MatplotlibRoutines.venn_diagram_from_sets(gene_id_dict[labels[0]][status],
                                                  gene_id_dict[labels[1]][status],
                                                  set3=third_set,
                                                  set_labels=labels,
                                                  set_colors=["red", "yellow", "green"],
                                                  output_prefix=None,
                                                  extensions=("png",),
                                                  title=None)

    plt.savefig("%s.venn.png" % output_prefix)
    plt.close()
def get_feature_length_distribution_from_gff(self, input_gff, output_prefix, feature_list=None):
    """
    Collect lengths of GFF features, save them with summary statistics and draw histograms.

    Lengths (end - start + 1) are written to "<output_prefix>.len", the summary
    (selected features, their number and total length) to "<output_prefix>.stat"
    and is also printed. Three histograms are drawn with prefixes
    "<output_prefix>.all", "<output_prefix>.max_10000" and "<output_prefix>.max_1000".

    :param input_gff: GFF file; lines starting with "#" and blank lines are skipped
    :param output_prefix: prefix of all output files
    :param feature_list: feature type (str) or collection of feature types to use;
                         None selects all features
    :return: None
    """
    from RouToolPa.Routines import MatplotlibRoutines

    len_file = "%s.len" % output_prefix
    stat_file = "%s.stat" % output_prefix

    feature_length_list = []
    total_feature_length = 0
    feature_number = 0
    feature_type_set = set()

    single_feature_requested = isinstance(feature_list, str)

    with open(input_gff, "r") as in_fd:
        for line in in_fd:
            # Blank lines carry no columns and would otherwise raise IndexError below
            if line.startswith("#") or not line.strip():
                continue
            tmp = line.split("\t")
            feature = tmp[self.GFF_FEATURETYPE_COLUMN]
            # All feature types seen in the file are recorded, including filtered-out ones
            feature_type_set.add(feature)

            if feature_list is not None:
                if single_feature_requested:
                    if feature != feature_list:
                        continue
                elif feature not in feature_list:
                    continue

            start = int(tmp[self.GFF_START_COLUMN])
            end = int(tmp[self.GFF_END_COLUMN])

            feature_length = end - start + 1
            feature_number += 1
            feature_length_list.append(feature_length)
            total_feature_length += feature_length

    # A plain string must not go through ",".join(), which would split it into characters
    if not feature_list:
        selected_features = "all"
    elif single_feature_requested:
        selected_features = feature_list
    else:
        selected_features = ",".join(feature_list)

    stat_string = "Features\t%s\n" % selected_features
    stat_string += "Number of features\t%i\n" % feature_number
    stat_string += "Total length\t%i\n" % total_feature_length

    print(stat_string)

    with open(stat_file, "w") as stat_fd:
        stat_fd.write(stat_string)

    feature_length_list = np.array(feature_length_list)
    np.savetxt(len_file, feature_length_list, fmt='%i')

    # Name used in plot titles: a specific type only when it is unambiguous
    if len(feature_type_set) == 1:
        feature_name = next(iter(feature_type_set))
    elif single_feature_requested:
        feature_name = feature_list
    elif feature_list is not None and len(feature_list) == 1:
        feature_name = feature_list[0]
    else:
        feature_name = "feature"

    MatplotlibRoutines.draw_histogram(feature_length_list,
                                      output_prefix=output_prefix + ".all",
                                      xlabel="Feature length", ylabel="N of features",
                                      title="Distribution of %s lengths" % feature_name,
                                      ylogbase=10, xlogbase=10,
                                      bins_list=[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000],
                                      close_figure=True)

    MatplotlibRoutines.draw_histogram(feature_length_list,
                                      output_prefix=output_prefix + ".max_10000",
                                      width_of_bins=100, max_threshold=10000, min_threshold=1,
                                      xlabel="Feature length", ylabel="N of features",
                                      title="Distribution of %s lengths" % feature_name,
                                      close_figure=True)

    MatplotlibRoutines.draw_histogram(feature_length_list,
                                      output_prefix=output_prefix + ".max_1000",
                                      width_of_bins=10, max_threshold=1000, min_threshold=1,
                                      xlabel="Feature length", ylabel="N of features",
                                      title="Distribution of %s lengths" % feature_name,
                                      close_figure=True)