def combine_count_files(count_file_list, output_file, sample_name_list=None):
    if sample_name_list is not None:
        if len(count_file_list) != len(sample_name_list):
            raise ValueError("Number of count files doesn't match number of sample names")

    samples = zip(sample_name_list if sample_name_list else count_file_list,
                  count_file_list)

    count_table = TwoLvlDict()
    for sample, filename in samples:
        count_table[sample] = SynDict(filename=filename, header=False, separator="\t",
                                      allow_repeats_of_key=False, split_values=False,
                                      values_separator=",", key_index=0, value_index=1,
                                      close_after_if_file_object=False, expression=None,
                                      comments_prefix="__")
    count_table.write(output_file)
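# Hypothetical usage sketch (file names and sample labels below are illustrative
# assumptions, not taken from the original source): merge per-sample count files,
# e.g. HTSeq output, into a single two-level table.
#
# combine_count_files(["sampleA.htseq.count", "sampleB.htseq.count"],
#                     "all_samples.counts.tab",
#                     sample_name_list=["sampleA", "sampleB"])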
def results_extraction_listener(queue, output_file_prefix, selected_species_list=None):
    """Listens for messages on the queue and writes results to files."""
    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()

    error_fd = open("errors.err", "w")
    error_fd.write("#sample\terror_code\n")

    while 1:
        result = queue.get()
        if isinstance(result[1], int):
            error_fd.write("%s\t%i\n" % (result[0], result[1]))
            continue

        if result == 'finish':
            positive_selection_dict.write("%s.all" % output_file_prefix, absent_symbol=".")
            if selected_species_list:
                selected_species_positive_selection_dict.write("%s.selected_species" % output_file_prefix,
                                                               absent_symbol=".")
            # print positive_selection_dict.table_form(absent_symbol=".")
            break

        if result[1]:
            positive_selection_dict[result[0]] = result[1]
            if selected_species_list:
                for species in selected_species_list:
                    if species in result[1]:
                        if result[0] not in selected_species_positive_selection_dict:
                            selected_species_positive_selection_dict[result[0]] = {}
                        selected_species_positive_selection_dict[result[0]][species] = result[1][species]
def count_reads_and_bases(self, fastq_file_list, stat_file=None):
    fastq_list = [fastq_file_list] if isinstance(fastq_file_list, str) else fastq_file_list

    counts = TwoLvlDict()
    for fastq_file in fastq_list:
        counts[fastq_file] = OrderedDict()
        counts[fastq_file]["Reads"] = 0
        counts[fastq_file]["Bases"] = 0

    for fastq_file in fastq_list:
        with self.metaopen(fastq_file, "r") as fastq_fd:
            for line in fastq_fd:                                       # read name line
                counts[fastq_file]["Bases"] += len(fastq_fd.readline())  # sequence line
                counts[fastq_file]["Reads"] += 1
                fastq_fd.readline()                                      # "+" line
                fastq_fd.readline()                                      # quality line
        # to take into account "\n" at the end of each sequence line
        counts[fastq_file]["Bases"] = counts[fastq_file]["Bases"] - counts[fastq_file]["Reads"]

    counts.write()
    if stat_file:
        counts.write(stat_file)

    return counts
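# Hypothetical usage sketch (the `routines` instance name and file paths are
# illustrative assumptions): count reads and bases in one or several FASTQ files
# and additionally write the table to a stats file.
#
# counts = routines.count_reads_and_bases(["sampleA_1.fastq", "sampleA_2.fastq"],
#                                         stat_file="sampleA.read_base.counts")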
def count_locations(self, annotation_black_list=[],
                    allow_several_counts_of_record=False,
                    out_filename="location_counts.t", write=True,
                    count_dir="location_counts"):
    os.system("mkdir -p %s" % count_dir)
    regions_dict = self._split_regions()
    region_counts_dict = TwoLvlDict({})
    for region in regions_dict:
        count_locations_dict = {"igc": 0,
                                "unknown": 0}  # records without annotated location
        for record in regions_dict[region]:
            # check for presence of the "Loc" key before accessing it
            if ("Loc" not in record.description) or (not record.description["Loc"]):
                count_locations_dict["unknown"] += 1
                continue
            #print(record.description["Loc"])
            if allow_several_counts_of_record:
                for location in record.description["Loc"]:
                    if location in annotation_black_list:
                        continue
                    if location not in count_locations_dict:
                        count_locations_dict[location] = 1
                    else:
                        count_locations_dict[location] += 1
            else:
                full_location = []
                for location in record.description["Loc"]:
                    if location in annotation_black_list:
                        continue
                    full_location.append(location)
                if not full_location:
                    continue
                full_location.sort()
                full_location = "/".join(full_location)
                if full_location not in count_locations_dict:
                    count_locations_dict[full_location] = 1
                else:
                    count_locations_dict[full_location] += 1

        labels = []
        counts = []
        #colors = []
        for location in count_locations_dict:
            if count_locations_dict[location] == 0 or location in annotation_black_list:
                continue
            labels.append(location)
            counts.append(count_locations_dict[location])

        region_counts_dict[region] = OrderedDict([(label, count) for label, count in zip(labels, counts)])

    if write:
        region_counts_dict.write("%s/%s" % (count_dir, out_filename))

    return region_counts_dict
def write_stats(self, output_prefix):
    Ns_dict = TwoLvlDict()
    gaps_dict = TwoLvlDict()
    for record_id in self.records:
        Ns_dict[self.records[record_id].id] = self.records[record_id].N_counts
        gaps_dict[self.records[record_id].id] = self.records[record_id].gap_counts

    Ns_dict.write(out_filename="%s.N_counts" % output_prefix)
    gaps_dict.write(out_filename="%s.gaps_counts" % output_prefix)
def count_types(self, output_file=None, total_output_file=None, return_mode="chrom"):
    annotated_types = self.get_annotated_types()
    count_dict = TwoLvlDict()
    total_count_dict = OrderedDict()

    for type in annotated_types:
        total_count_dict[type] = OrderedDict()
        total_count_dict[type]["complete"] = 0
        total_count_dict[type]["partial"] = 0

    for chrom in self.records:
        count_dict[chrom] = OrderedDict()
        for type in annotated_types:
            count_dict[chrom][type] = 0

    for chrom in self.records:
        for record in self.records[chrom]:
            count_dict[chrom][record.type] += 1
            if record.partial:
                total_count_dict[record.type]["partial"] += 1
            else:
                total_count_dict[record.type]["complete"] += 1

    if output_file:
        count_dict.write(output_file)
    if total_output_file:
        with open(total_output_file, "w") as out_fd:
            out_fd.write("#rRNA\tComplete%s\tPartial%s\n" % ("(>%.2f of expected length)" % self.partial_threshold if self.partial_threshold else "",
                                                             "(<%.2f of expected length)" % self.partial_threshold if self.partial_threshold else ""))
            for type in total_count_dict:
                out_fd.write("%s\t%i\t%i\n" % (type,
                                               total_count_dict[type]["complete"],
                                               total_count_dict[type]["partial"]))

    if return_mode == "chrom":
        return count_dict
    elif return_mode == "total":
        return total_count_dict
    elif return_mode == "both":
        return count_dict, total_count_dict
    else:
        raise ValueError("Unknown return mode. Allowed variants: 'chrom', 'total', 'both'")
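# Hypothetical usage sketch (the `annotation` instance name and output paths are
# assumptions): per-chromosome and total counts of annotated rRNA types.
#
# chrom_counts, total_counts = annotation.count_types(output_file="rRNA.per_chrom.counts",
#                                                     total_output_file="rRNA.total.counts",
#                                                     return_mode="both")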
import sys
import argparse

from collections import OrderedDict

from RouToolPa.Collections.General import TwoLvlDict
from RouToolPa.Routines.File import check_path

parser = argparse.ArgumentParser()

parser.add_argument("-s", "--species_list", action="store", dest="species_list",
                    type=lambda s: s.split(","), required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir",
                    default="./", type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_stat_dict = TwoLvlDict()
for species in args.species_list:
    with open("%s%s/stat.t" % (args.species_dir, species), "r") as stat_fd:
        statistics = map(lambda s: s.strip().split("\t"), stat_fd.readlines())
        species_stat_dict[species] = OrderedDict(statistics)

species_stat_dict.write(out_fd)

if args.output != "stdout":
    out_fd.close()
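# Hypothetical invocation sketch (the script name, species names and directory layout
# are assumptions): each species is expected to have a <species_dir>/<species>/stat.t file.
#
# python combine_per_species_stats.py -s species_A,species_B -d per_species_stats/ -o combined_stat.t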
                cluster_3d_set.add(gene)

for cluster_3d_sub in PmCDA1_3d_sub_clusters:
    for variant in cluster_3d_sub:
        if "Genes" in variant.info_dict:
            for gene in variant.info_dict["Genes"]:
                cluster_3d_sub_set.add(gene)

print("PmCDA1 3d : %i" % len(cluster_3d_set))
print("PmCDA1 3d sub: %i" % len(cluster_3d_sub_set))

intersection = cluster_3d_set & cluster_3d_sub_set
print("Intersection: %i" % len(intersection))

n_intersection_genes = len(intersection)
n_cluster_3d_genes = len(cluster_3d_set)
n_cluster_3d_sub_genes = len(cluster_3d_sub_set)
#print intersection
#print cluster_3d_set
#print cluster_3d_sub_set

overlap_clusters_percent[size][power] = 100 * float(len(intersection)) / float(len(cluster_3d_set))
p_value = hypergeom(n_intersection_genes, totaly_genes, n_cluster_3d_genes, n_cluster_3d_sub_genes)

test_fd.write("%i\t%.2f\t%i\t%i\t%i\t%i\t%e\n" % (size, power, totaly_genes, n_cluster_3d_genes,
                                                  n_cluster_3d_sub_genes, n_intersection_genes, p_value))

overlap_clusters_percent.write("overlap_clusters_percent_genes.t")
test_fd.close()
number_of_bins = len(bins) - 1

# add zeroes to absent bins for all assemblies
for assembly in assembly_contig_cumulative_length:
    bin_number_difference = number_of_bins - len(assembly_contig_cumulative_length[assembly])
    if bin_number_difference > 0:
        assembly_contig_cumulative_length[assembly] += [0 for i in range(0, bin_number_difference)]
        assembly_contig_number_values[assembly] += [0 for i in range(0, bin_number_difference)]

assembly_N50_dict.write("%s.N50" % args.output_prefix)
assembly_L50.write("%s.L50" % args.output_prefix)
assembly_general_stats.write("%s.general" % args.output_prefix)
assembly_lengths.write("%s.lengths" % args.output_prefix)
#assembly_bins.write("%s.bins" % args.output_prefix)
#print(assembly_contig_cumulative_length)
#assembly_contig_cumulative_length.write("%s.cumulative_length" % args.output_prefix)
#assembly_contig_number_values.write("%s.contig_number_values" % args.output_prefix)

fig = plt.figure(figsize=(12, 6))
subplot_1 = plt.subplot(1, 2, 1)

plt.hist([assembly_length_array[assembly] for assembly in assembly_length_array],
         bins, label=assembly_length_array.keys())
def handle_sanger_data(self, input_dir, output_prefix, outdir=None,
                       read_subfolders=False, min_mean_qual=0,
                       min_median_qual=0, min_len=50):
    if outdir:
        self.workdir = outdir
    self.init_dirs()

    sanger_filelist = self.make_list_of_path_to_files(input_dir,
                                                      expression=self.is_sanger_file,
                                                      recursive=read_subfolders,
                                                      return_absolute_paths=True)
    stat_dict = TwoLvlDict()
    record_dict = OrderedDict()
    trimmed_record_dict = OrderedDict()
    excluded_list = IdList()

    excluded_counter = 0
    low_quality_counter = 0
    too_short_counter = 0

    merged_raw_fastq = "%s/%s.raw.fastq" % (self.workdir, output_prefix)
    merged_raw_fasta = "%s/%s.raw.fasta" % (self.workdir, output_prefix)
    merged_trimmed_fastq = "%s/%s.trimmed.fastq" % (self.workdir, output_prefix)
    merged_trimmed_fasta = "%s/%s.trimmed.fasta" % (self.workdir, output_prefix)

    for filename in sanger_filelist:
        filename_list = self.split_filename(filename)

        record_raw_fastq = "%s/fastq/raw/%s.raw.fastq" % (self.workdir, filename_list[1])
        record_raw_fasta = "%s/fasta/raw/%s.raw.fasta" % (self.workdir, filename_list[1])
        record_raw_qual_plot_prefix = "%s/qual_plot/raw/%s.raw.qual" % (self.workdir, filename_list[1])

        record_trimmed_fastq = "%s/fastq/trimmed/%s.trimmed.fastq" % (self.workdir, filename_list[1])
        record_trimmed_fasta = "%s/fasta/trimmed/%s.trimmed.fasta" % (self.workdir, filename_list[1])
        record_trimmed_qual_plot_prefix = "%s/qual_plot/trimmed/%s.trimmed.qual" % (self.workdir, filename_list[1])

        record = SeqIO.read(self.metaopen(filename, "rb"), format="abi")
        record_dict[record.id] = record

        SeqIO.write(record, record_raw_fastq, format="fastq")
        SeqIO.write(record, record_raw_fasta, format="fasta")

        trimmed_record = SeqIO.AbiIO._abi_trim(record)

        stat_dict[record.id] = OrderedDict({
            "raw_len": len(record),
            "raw_mean_qual": np.mean(record.letter_annotations["phred_quality"]),
            "raw_median_qual": np.median(record.letter_annotations["phred_quality"]),
            "trimmed_len": len(trimmed_record),
            "trimmed_mean_qual": np.mean(trimmed_record.letter_annotations["phred_quality"]),
            "trimmed_median_qual": np.median(trimmed_record.letter_annotations["phred_quality"]),
            "retained": "-",
        })

        MatplotlibRoutines.draw_bar_plot(record.letter_annotations["phred_quality"],
                                         record_raw_qual_plot_prefix,
                                         extentions=["png"], xlabel="Position",
                                         ylabel="Phred quality", title="Per base quality",
                                         min_value=None, max_value=None, new_figure=True,
                                         figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        if stat_dict[record.id]["trimmed_len"] >= min_len:
            if min_median_qual:
                if (stat_dict[record.id]["trimmed_median_qual"] >= min_median_qual) and \
                        (stat_dict[record.id]["trimmed_mean_qual"] >= min_mean_qual):
                    stat_dict[record.id]["retained"] = "+"
                else:
                    low_quality_counter += 1
            else:
                stat_dict[record.id]["retained"] = "+"
        else:
            too_short_counter += 1

        if stat_dict[record.id]["retained"] == "-":
            excluded_counter += 1
            excluded_list.append(record.id)
            continue

        SeqIO.write(trimmed_record, record_trimmed_fastq, format="fastq")
        SeqIO.write(trimmed_record, record_trimmed_fasta, format="fasta")

        MatplotlibRoutines.draw_bar_plot(trimmed_record.letter_annotations["phred_quality"],
                                         record_trimmed_qual_plot_prefix,
                                         extentions=["png"], xlabel="Position",
                                         ylabel="Phred quality", title="Per base quality",
                                         min_value=None, max_value=None, new_figure=True,
                                         figsize=(3 * (int(len(record) / 100) + 1), 3),
                                         close_figure=True)

        trimmed_record_dict[record.id] = trimmed_record

    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(record_dict), merged_raw_fasta, format="fasta")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fastq, format="fastq")
    SeqIO.write(self.record_from_dict_generator(trimmed_record_dict), merged_trimmed_fasta, format="fasta")

    excluded_list.write("%s.excluded.ids" % output_prefix)
    stat_dict.write(out_filename="%s.stats" % output_prefix)

    print("Excluded: %i" % excluded_counter)
    print("\tToo short( < %i ): %i" % (min_len, too_short_counter))
    print("\tLow quality( median < %i or mean < %i ): %i" % (min_median_qual, min_mean_qual, low_quality_counter))
filtered.write("%s/%s_adjusted_size_3+_power_0.05+.ccf" % (clustering_dir, sample))
filtered_out.write("%s/%s_adjusted_size_3+_power_less_0.05.ccf" % (clustering_dir, sample))
"""
if "HAP" not in sample:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.05+_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
                                additional_data=("Median", "Mean", "Power", "Homogeneity"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_less_0.05_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                additional_data=("Median", "Mean", "Power", "Homogeneity"))
else:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.05+_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
                                additional_data=("Median", "Mean", "Power"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_less_0.05_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                additional_data=("Median", "Mean", "Power"))
"""
filtered, filtered_out = filtered.filter(filter_by_power_10)

filtered.write("%s/%s_adjusted_size_3+_power_0.1+.ccf" % (clustering_dir, sample))
filtered_out.write("%s/%s_adjusted_size_3+_power_0.05+_less_0.1.ccf" % (clustering_dir, sample))
"""
if "HAP" not in sample:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.10+_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
                                additional_data=("Median", "Mean", "Power", "Homogeneity"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_0.05+_less_0.1_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                additional_data=("Median", "Mean", "Power", "Homogeneity"))
else:
    filtered.heatmap_statistics(filename="%s/%s_3+_power_0.10+_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
                                additional_data=("Median", "Mean", "Power"))
    #filtered_out.heatmap_statistics(filename="%s/%s_3+_power_0.05+_less_0.1_heatmap_statistics.svg" % (clustering_dir, sample_adjusted),
    #                                additional_data=("Median", "Mean", "Power"))
"""
statistics_dict.write(out_filename="%s/%s_mutation_count_statistics.t" % (clustering_dir, sample))
def filter(self, samples_directory, output_directory, adapter_fragment_file,
           trimmomatic_adapter_file, general_stat_file,
           samples_to_handle=None, threads=4, trimmomatic_dir="",
           coockiecutter_dir="", facut_dir="", mismatch_number=2,
           pe_reads_score=30, se_read_score=10, min_adapter_len=1,
           sliding_window_size=None, average_quality_threshold=15,
           base_quality="phred33", read_name_type="illumina",
           leading_base_quality_threshold=None,
           trailing_base_quality_threshold=None,
           crop_length=None, head_crop_length=None, min_len=50,
           remove_intermediate_files=False, skip_coockiecutter=False,
           retain_single_end_reads=True, input_is_se=False):

    Cookiecutter.path = coockiecutter_dir
    Trimmomatic.jar_path = trimmomatic_dir
    Trimmomatic.threads = threads
    FaCut.path = facut_dir

    self.safe_mkdir(output_directory)
    """
    merged_raw_dir = "%s/merged/" % output_directory
    filtered_dir = "%s/filtered/" % output_directory
    coockie_filtered_dir = "%s/coockiecutter/" % filtered_dir
    coockie_trimmomatic_filtered_dir = "%s/coockiecutter_trimmomatic/" % filtered_dir
    coockie_trimmomatic_quality_filtered_dir = "%s/coockiecutter_trimmomatic_quality/" % filtered_dir
    final_filtered_dir = "%s/final/" % filtered_dir
    filtering_stat_dir = "%s/filtered_stat/" % output_directory
    """
    sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(samples_directory)

    merged_raw_dir, filtered_dir, coockie_filtered_dir, \
        coockie_trimmomatic_filtered_dir, coockie_trimmomatic_quality_filtered_dir, \
        final_filtered_dir, filtering_stat_dir = self.prepare_filtering_directories(output_directory, sample_list)

    filtering_statistics = TwoLvlDict()

    for sample in sample_list:
        print("Handling sample %s" % sample)
        filtering_statistics[sample] = OrderedDict()

        merged_raw_sample_dir = "%s/%s/" % (merged_raw_dir, sample)
        #merged_forward_reads = "%s/%s_1.fq" % (merged_raw_sample_dir, sample)
        #merged_reverse_reads = "%s/%s_2.fq" % (merged_raw_sample_dir, sample)

        coockie_filtered_sample_dir = "%s/%s/" % (coockie_filtered_dir, sample)
        coockie_stats = "%s/%s.coockiecutter.stats" % (coockie_filtered_sample_dir, sample)

        coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (coockie_trimmomatic_filtered_dir, sample)
        coockie_trimmomatic_quality_filtered_sample_dir = "%s/%s/" % (coockie_trimmomatic_quality_filtered_dir, sample)

        final_filtered_sample_dir = "%s/%s/" % (final_filtered_dir, sample)
        filtering_stat_sample_dir = "%s/%s" % (filtering_stat_dir, sample)
        #"""
        print("\tMerging fastqs if necessary...")
        merged_forward_reads, merged_reverse_reads, merged_se_reads = \
            self.combine_fastq_files(samples_directory, sample, merged_raw_sample_dir,
                                     use_links_if_merge_not_necessary=True,
                                     input_is_se=input_is_se)
        if not skip_coockiecutter:
            print("\tFiltering by Cookiecutter")
            #"""
            Cookiecutter.rm_reads(adapter_fragment_file,
                                  merged_forward_reads if merged_forward_reads else merged_se_reads,
                                  coockie_stats,
                                  right_reads=merged_reverse_reads,
                                  out_dir=coockie_filtered_sample_dir,
                                  use_dust_filter=False, dust_cutoff=None, dust_window_size=None,
                                  use_N_filter=False, read_length_cutoff=None,
                                  polyGC_length_cutoff=None)
            #"""
            print("\tParsing Cookiecutter report...")
            coockiecutter_report = CoockiecutterReport(coockie_stats, input_is_se=input_is_se)

            filtering_statistics[sample]["raw_pairs"] = coockiecutter_report.input_pairs
            filtering_statistics[sample]["pairs_after_coockiecutter"] = coockiecutter_report.retained_pairs
            filtering_statistics[sample]["pairs_after_coockiecutter,%"] = float("%.2f" % (float(coockiecutter_report.retained_pairs) /
                                                                                          float(coockiecutter_report.input_pairs) * 100))

            os.system("cp %s %s" % (coockie_stats, filtering_stat_sample_dir))

        coockie_filtered_paired_forward_reads = "%s/%s_1.ok.fastq" % (coockie_filtered_sample_dir, sample)
        coockie_filtered_paired_reverse_reads = "%s/%s_2.ok.fastq" % (coockie_filtered_sample_dir, sample)
        coockie_filtered_paired_se_reads = ""
        coockie_filtered_se_reads = "%s/%s.se.ok.fastq" % (coockie_filtered_sample_dir, sample)
        # se reads produced by Coockiecutter are ignored now!!

        #coockie_trimmomatic_filtered_sample_dir = "%s/%s/" % (coockie_trimmomatic_filtered_dir, sample)
        trimmomatic_output_prefix = "%s/%s" % (coockie_trimmomatic_filtered_sample_dir, sample)
        trimmomatic_log = "%s.trimmomatic.log" % trimmomatic_output_prefix
        #"""
        if (merged_forward_reads is None) and (merged_reverse_reads is None):
            print("\tFiltering by Trimmomatic...")
            Trimmomatic.filter(merged_se_reads if skip_coockiecutter else coockie_filtered_se_reads,
                               trimmomatic_output_prefix, output_extension="fq",
                               right_reads=None,
                               adapters_file=trimmomatic_adapter_file,
                               mismatch_number=mismatch_number,
                               pe_reads_score=pe_reads_score,
                               se_read_score=se_read_score,
                               min_adapter_len=min_adapter_len,
                               sliding_window_size=sliding_window_size,
                               average_quality_threshold=average_quality_threshold,
                               leading_base_quality_threshold=leading_base_quality_threshold,
                               trailing_base_quality_threshold=trailing_base_quality_threshold,
                               crop_length=crop_length,
                               head_crop_length=head_crop_length,
                               min_length=min_len,
                               logfile=trimmomatic_log,
                               base_quality=base_quality)
        else:
            print("\tFiltering by Trimmomatic...")
            Trimmomatic.filter(merged_forward_reads if skip_coockiecutter else coockie_filtered_paired_forward_reads,
                               trimmomatic_output_prefix, output_extension="fq",
                               right_reads=merged_reverse_reads if skip_coockiecutter else coockie_filtered_paired_reverse_reads,
                               adapters_file=trimmomatic_adapter_file,
                               mismatch_number=mismatch_number,
                               pe_reads_score=pe_reads_score,
                               se_read_score=se_read_score,
                               min_adapter_len=min_adapter_len,
                               sliding_window_size=sliding_window_size,
                               average_quality_threshold=average_quality_threshold,
                               leading_base_quality_threshold=leading_base_quality_threshold,
                               trailing_base_quality_threshold=trailing_base_quality_threshold,
                               crop_length=crop_length,
                               head_crop_length=head_crop_length,
                               min_length=min_len,
                               logfile=trimmomatic_log,
                               base_quality=base_quality)
        #"""
        trimmomatic_report = TrimmomaticReport(trimmomatic_log, input_is_se=input_is_se)

        if skip_coockiecutter:
            filtering_statistics[sample]["raw_pairs"] = trimmomatic_report.stats["input"]

        filtering_statistics[sample]["pairs_after_trimmomatic"] = trimmomatic_report.stats["surviving"] if input_is_se else trimmomatic_report.stats["both_surviving"]
        filtering_statistics[sample]["pairs_after_trimmomatic,%"] = trimmomatic_report.stats["surviving,%"] if input_is_se else trimmomatic_report.stats["both_surviving,%"]

        if retain_single_end_reads and not input_is_se:
            filtering_statistics[sample]["forward_se_after_trimmomatic"] = trimmomatic_report.stats["forward_only_surviving"]
            filtering_statistics[sample]["forward_se_after_trimmomatic,%"] = trimmomatic_report.stats["forward_only_surviving,%"]
            filtering_statistics[sample]["reverse_se_after_trimmomatic"] = trimmomatic_report.stats["reverse_only_surviving"]
            filtering_statistics[sample]["reverse_se_after_trimmomatic,%"] = trimmomatic_report.stats["reverse_only_surviving,%"]

        os.system("cp %s %s" % (trimmomatic_log, filtering_stat_sample_dir))

        coockie_trimmomatic_filtered_paired_forward_reads = "%s/%s_1.pe.fq" % (coockie_trimmomatic_filtered_sample_dir, sample)
        coockie_trimmomatic_filtered_paired_reverse_reads = "%s/%s_2.pe.fq" % (coockie_trimmomatic_filtered_sample_dir, sample)
        coockie_trimmomatic_filtered_unpaired_forward_reads = "%s/%s_1.se.fq" % (coockie_trimmomatic_filtered_sample_dir, sample)
        coockie_trimmomatic_filtered_unpaired_reverse_reads = "%s/%s_2.se.fq" % (coockie_trimmomatic_filtered_sample_dir, sample)
        coockie_trimmomatic_filtered_se_reads = "%s/%s.se.fq" % (coockie_trimmomatic_filtered_sample_dir, sample)

        final_forward_reads = "%s/%s.final_1.fastq" % (final_filtered_sample_dir, sample)
        final_reverse_reads = "%s/%s.final_2.fastq" % (final_filtered_sample_dir, sample)
        final_forward_se_reads = "%s/%s.final_1.se.fastq" % (final_filtered_sample_dir, sample)
        final_reverse_se_reads = "%s/%s.final_2.se.fastq" % (final_filtered_sample_dir, sample)
        final_se_reads = "%s/%s.final.se.fastq" % (final_filtered_sample_dir, sample)

        if sliding_window_size is None:
            facut_pe_output_prefix = "%s/%s.pe" % (coockie_trimmomatic_quality_filtered_sample_dir, sample)
            facut_forward_se_output_prefix = "%s/%s.forward.se" % (coockie_trimmomatic_quality_filtered_sample_dir, sample)
            facut_reverse_se_output_prefix = "%s/%s.reverse.se" % (coockie_trimmomatic_quality_filtered_sample_dir, sample)

            facut_pe_stat_file = "%s.facut.stat" % facut_pe_output_prefix
            facut_forward_se_stat_file = "%s.facut.stat" % facut_forward_se_output_prefix
            facut_reverse_se_stat_file = "%s.facut.stat" % facut_reverse_se_output_prefix
            #"""
            FaCut.filter_by_mean_quality(average_quality_threshold, facut_pe_output_prefix,
                                         coockie_trimmomatic_filtered_paired_forward_reads,
                                         reverse_reads=coockie_trimmomatic_filtered_paired_reverse_reads,
                                         quality_type=base_quality,
                                         stat_file=facut_pe_stat_file,
                                         name_type=read_name_type)
            FaCut.filter_by_mean_quality(average_quality_threshold, facut_forward_se_output_prefix,
                                         coockie_trimmomatic_filtered_unpaired_forward_reads,
                                         quality_type=base_quality,
                                         stat_file=facut_forward_se_stat_file,
                                         name_type=read_name_type)
            FaCut.filter_by_mean_quality(average_quality_threshold, facut_reverse_se_output_prefix,
                                         coockie_trimmomatic_filtered_unpaired_reverse_reads,
                                         quality_type=base_quality,
                                         stat_file=facut_reverse_se_stat_file,
                                         name_type=read_name_type)
            #"""
            #if input_is_se:
            #else:
            facut_report = FaCutReport(facut_pe_stat_file)

            filtering_statistics[sample]["pairs_after_facut"] = facut_report.retained_pairs
            filtering_statistics[sample]["pairs_after_facut,%"] = float("%.2f" % (float(facut_report.retained_pairs) /
                                                                                  float(facut_report.input_pairs) * 100))
            filtering_statistics[sample]["retained_pairs_in_worst_tile,%"] = facut_report.minimum_retained_pairs_in_tiles_fraction * 100
            filtering_statistics[sample]["pairs_survived_after_filtration,%"] = float("%.2f" % (float(facut_report.retained_pairs) /
                                                                                                filtering_statistics[sample]["raw_pairs"] * 100))

            facut_filtered_forward_reads = "%s_1.pe.fq" % facut_pe_output_prefix
            facut_filtered_reverse_reads = "%s_2.pe.fq" % facut_pe_output_prefix
            facut_filtered_forward_se_reads = "%s.se.fq" % facut_forward_se_output_prefix
            facut_filtered_reverse_se_reads = "%s.se.fq" % facut_reverse_se_output_prefix

            os.system("cp %s %s" % (facut_pe_stat_file, filtering_stat_sample_dir))
            if retain_single_end_reads:
                os.system("cp %s %s" % (facut_forward_se_stat_file, filtering_stat_sample_dir))
                os.system("cp %s %s" % (facut_reverse_se_stat_file, filtering_stat_sample_dir))

            os.system("ln %s %s" % (facut_filtered_forward_reads, final_forward_reads))
            os.system("ln %s %s" % (facut_filtered_reverse_reads, final_reverse_reads))
            if retain_single_end_reads and not input_is_se:
                os.system("cat %s %s > %s" % (facut_filtered_forward_se_reads,
                                              facut_filtered_reverse_se_reads,
                                              final_forward_se_reads))
                #os.system("ln %s %s" % (facut_filtered_forward_se_reads, final_forward_se_reads))
                #os.system("ln %s %s" % (facut_filtered_reverse_se_reads, final_reverse_se_reads))
            if input_is_se:
                pass
                #os.system("ln %s %s" % (coockie_trimmomatic_filtered_se_reads, final_se_reads))
        else:
            os.system("ln %s %s" % (coockie_trimmomatic_filtered_paired_forward_reads, final_forward_reads))
            os.system("ln %s %s" % (coockie_trimmomatic_filtered_paired_reverse_reads, final_reverse_reads))
            if retain_single_end_reads and not input_is_se:
                os.system("cat %s %s > %s" % (coockie_trimmomatic_filtered_unpaired_forward_reads,
                                              coockie_trimmomatic_filtered_unpaired_reverse_reads,
                                              final_forward_se_reads))
                """
                os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_forward_reads, final_forward_se_reads))
                os.system("ln %s %s" % (coockie_trimmomatic_filtered_unpaired_reverse_reads, final_reverse_se_reads))
                """
            if input_is_se:
                os.system("ln %s %s" % (coockie_trimmomatic_filtered_se_reads, final_se_reads))

            filtering_statistics[sample]["pairs_survived_after_filtration,%"] = float("%.2f" % (float(trimmomatic_report.stats["surviving" if input_is_se else "both_surviving"]) /
                                                                                                filtering_statistics[sample]["raw_pairs"] * 100))
        print(filtering_statistics.table_form())

        if remove_intermediate_files:
            shutil.rmtree(merged_raw_sample_dir)
            shutil.rmtree(coockie_filtered_sample_dir)
            shutil.rmtree(coockie_trimmomatic_filtered_sample_dir)
            shutil.rmtree(coockie_trimmomatic_quality_filtered_sample_dir)

    if remove_intermediate_files:
        shutil.rmtree(coockie_filtered_dir)
        shutil.rmtree(coockie_trimmomatic_filtered_dir)
        shutil.rmtree(coockie_trimmomatic_quality_filtered_dir)
        shutil.rmtree(merged_raw_dir)

    filtering_statistics.write(general_stat_file, sort=False)
def draw_variant_window_densities(self, count_df, scaffold_length_dict, window_size, window_step, output_prefix,
                                  masking_dict=None, gap_fraction_threshold=0.4,
                                  record_style=None, ext_list=("svg", "png"),
                                  label_fontsize=13, left_offset=0.2, figure_width=12,
                                  figure_height_scale_factor=0.5, scaffold_synonym_dict=None,
                                  id_replacement_mode="partial", suptitle=None,
                                  density_multiplicator=1000,
                                  scaffold_black_list=[], sort_scaffolds=False,
                                  scaffold_ordered_list=None, scaffold_white_list=[],
                                  add_sample_name_to_labels=False,
                                  dist_between_scaffolds_scaling_factor=1,
                                  gap_color="grey", masked_color="grey", no_snp_color="white",
                                  colormap=None,
                                  colors=("#333a97", "#3d3795", "#5d3393", "#813193", "#9d2d7f",
                                          "#b82861", "#d33845", "#ea2e2e", "#f5ae27"),
                                  thresholds=(0.0, 0.1, 0.5, 0.75, 1.0, 1.25, 1.5, 2.0, 2.5),
                                  colormap_tuple_list=((0.0, "#333a97"), (0.1, "#3d3795"), (0.5, "#5d3393"),
                                                       (0.75, "#813193"), (1.0, "#9d2d7f"), (1.25, "#b82861"),
                                                       (1.5, "#d33845"), (2.0, "#ea2e2e"), (2.5, "#f5ae27"))):
    """ count_dict = {sample: {scaffold: }}"""
    if dist_between_scaffolds_scaling_factor < 1:
        raise ValueError("Scaling factor for distance between scaffolds has to be >= 1.0")

    final_scaffold_list = self.get_filtered_scaffold_list(count_df.index.get_level_values('CHROM').unique().to_list(),
                                                          scaffold_black_list=scaffold_black_list,
                                                          sort_scaffolds=sort_scaffolds,
                                                          scaffold_ordered_list=scaffold_ordered_list,
                                                          scaffold_white_list=scaffold_white_list)
    scaffold_number = len(final_scaffold_list)
    max_scaffold_length = max([scaffold_length_dict[scaf] for scaf in final_scaffold_list])
    #max_scaffold_length = max(scaffold_length_dict.values())
    window_number, sample_number = np.shape(count_df)

    figure = plt.figure(figsize=(figure_width, int(figure_height_scale_factor * scaffold_number * sample_number)))
    subplot = plt.subplot(1, 1, 1)

    subplot.get_yaxis().set_visible(False)
    #subplot.get_xaxis().set_visible(False)
    #axes.xaxis.set_major_formatter(x_formatter)

    #subplot.spines['bottom'].set_color('none')
    subplot.spines['right'].set_color('none')
    subplot.spines['left'].set_color('none')
    subplot.spines['top'].set_color('none')

    scaffold_height = 10
    dist_between_scaffolds = 5
    start_x = 0
    start_y = - dist_between_scaffolds

    label_line_y_shift = int(scaffold_height / 2)
    label_line_y_jump = int(scaffold_height / 2)

    #normalize_color_func = LinearSegmentedColormap.from_list("Densities_custom", colormap_tuple_list)
    #plt.register_cmap(cmap=colormap)
    #colormap = cm.get_cmap(name="plasma", lut=None)
    #normalize_colors = colors.BoundaryNorm(boundaries_for_colormap, len(boundaries_for_colormap) - 1) * int(256/(len(boundaries_for_colormap) - 1))
    #normalize_colors = colors.Normalize(vmin=boundaries_for_colormap[0], vmax=boundaries_for_colormap[-1])

    masked_windows_count_dict = TwoLvlDict()
    no_snps_windows_count_dict = TwoLvlDict()

    for sample in count_df:
        masked_windows_count_dict[sample] = OrderedDict()
        no_snps_windows_count_dict[sample] = OrderedDict()

    if colormap:
        cmap = plt.get_cmap(colormap, len(thresholds))

    masked_regions_fd = open("%s.masked_regions" % output_prefix, "w")
    masked_regions_fd.write("#scaffold\twindow\tmasked_position\tmasked_position,fraction\n")

    for scaffold in final_scaffold_list:
        sample_index = 0
        for sample in count_df:
            masked_windows_count_dict[sample][scaffold] = 0
            no_snps_windows_count_dict[sample][scaffold] = 0
            #if scaffold in scaffold_black_list:
            #    continue
            #print gap_coords_list, gap_len_list

            start_y += scaffold_height + dist_between_scaffolds * (dist_between_scaffolds_scaling_factor if sample_index == 0 else 1)
            label_y_start = label_line_y_shift + start_y
            gap_y_jump = label_y_start + label_line_y_jump
            prev_x = 0

            #figure.text(0, start_y, scaffold, rotation=0, fontweight="bold", transform=subplot.transAxes, fontsize=9,
            #            horizontalalignment='center',
            #            verticalalignment='center')

            if scaffold_synonym_dict:
                if id_replacement_mode == "exact":
                    if scaffold in scaffold_synonym_dict:
                        scaffold_label = scaffold_synonym_dict[scaffold]
                    else:
                        scaffold_label = scaffold
                        print("WARNING!!! Synonym for %s was not found" % scaffold)
                elif id_replacement_mode == "partial":
                    partial_syn_list = []
                    for partial_syn in scaffold_synonym_dict:
                        if partial_syn in scaffold:
                            partial_syn_list.append(partial_syn)
                    if len(partial_syn_list) > 1:
                        print("WARNING!!! More than one possible replacement for %s was found: %s. No replacement then." % (scaffold, ",".join(partial_syn_list)))
                        scaffold_label = scaffold
                    elif not partial_syn_list:
                        scaffold_label = scaffold
                        print("WARNING!!! Synonym for %s was not found" % scaffold)
                    else:
                        scaffold_label = scaffold_synonym_dict[partial_syn_list[0]]
                else:
                    raise ValueError("Unknown id replacement mode")
            else:
                scaffold_label = scaffold

            subplot.annotate(("%s (%s)" % (scaffold, sample)) if add_sample_name_to_labels else scaffold_label,
                             xy=(0, label_y_start), xycoords='data', fontsize=16,
                             xytext=(-15, 1.5 * label_line_y_shift), textcoords='offset points',
                             ha='right', va='top')

            if scaffold in count_df[sample]:
                for window_index in count_df.loc[scaffold].index:
                    window_start = window_index * window_step
                    window_end = window_start + window_size - 1  # TODO: check end coordinate

                    if masking_dict:
                        if scaffold in masking_dict:
                            unmasked_length = window_size - masking_dict[scaffold][window_index]
                            if unmasked_length > 0:
                                variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(unmasked_length)
                            else:
                                variant_density = None
                    else:
                        variant_density = float(count_df[sample].loc[scaffold, window_index] * density_multiplicator) / float(window_size)

                    if variant_density is None:
                        window_color = masked_color
                    else:
                        if colormap:
                            if variant_density <= thresholds[0]:
                                window_color = no_snp_color
                            else:
                                for threshold_index in range(0, len(thresholds) - 1):
                                    if thresholds[threshold_index] < variant_density <= thresholds[threshold_index + 1]:
                                        window_color = cmap(threshold_index)
                                        break
                                else:
                                    window_color = cmap(threshold_index + 1)
                        else:
                            if variant_density <= colormap_tuple_list[0][0]:
                                window_color = no_snp_color
                            else:
                                for lower_boundary, color in colormap_tuple_list:
                                    if variant_density <= lower_boundary:
                                        break
                                    if variant_density > lower_boundary:
                                        prev_color = color
                                    else:
                                        prev_color = color
                                window_color = prev_color

                    if masking_dict:
                        if scaffold in masking_dict:
                            if float(masking_dict[scaffold][window_index]) / float(window_size) > gap_fraction_threshold:
                                window_color = masked_color

                    #print scaffold
                    #print i, variant_density, window_color

                    if window_color == masked_color:
                        masked_windows_count_dict[sample][scaffold] += 1
                        masked_regions_fd.write("%s\t%i\t%i\t%f\n" % (scaffold, window_index,
                                                                      masking_dict[scaffold][window_index],
                                                                      float(masking_dict[scaffold][window_index]) / float(window_size)))
                    elif window_color == no_snp_color:
                        no_snps_windows_count_dict[sample][scaffold] += 1

                    window = Rectangle((window_start, start_y), window_size, scaffold_height, fill=True,
                                       edgecolor=None, facecolor=window_color, linewidth=0.0000000000001)
                    #print prev_x
                    #print gap_coords[0] - prev_x

                    subplot.add_patch(window)

            # draw_chromosome
            fragment = Rectangle((0, start_y), scaffold_length_dict[scaffold], scaffold_height, fill=False,
                                 edgecolor="black", facecolor=None, linewidth=0.5)
            subplot.add_patch(fragment)
            sample_index += 1

    legend_y_position = int(start_y / 2)
    legend_x_position = int(max_scaffold_length * 1.05)
    legend_element_side = scaffold_height

    square_y_pos = legend_y_position - legend_element_side

    for color, legend_label in zip((masked_color, no_snp_color), ("masked", "no SNPs")):
        square_y_pos += legend_element_side
        fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length / 64, legend_element_side,
                             fill=True, edgecolor="black", facecolor=color, linewidth=0.5)
        subplot.add_patch(fragment)
        subplot.annotate(legend_label,
                         xy=(legend_x_position + 2 * max_scaffold_length / 64, square_y_pos),
                         xycoords='data', fontsize=13,
                         xytext=(legend_x_position + 2 * max_scaffold_length / 64, square_y_pos),)

    if colormap:
        for i in range(0, len(thresholds)):
            square_y_pos += legend_element_side
            #print (colormap_tuple_list[i][1])
            fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length / 64, legend_element_side,
                                 fill=True, edgecolor="black", facecolor=cmap(i), linewidth=0.5)
            subplot.add_patch(fragment)
            if i == (len(thresholds) - 1):
                legend_element_label = "> %.2f" % thresholds[i]
            else:
                legend_element_label = "%.2f - %.2f" % (thresholds[i], thresholds[i + 1])
            subplot.annotate(legend_element_label,
                             xy=(legend_x_position + 2 * max_scaffold_length / 64, square_y_pos),
                             xycoords='data', fontsize=13,
                             xytext=(legend_x_position + 2 * max_scaffold_length / 64, square_y_pos),)
    else:
        for i in range(0, len(colormap_tuple_list)):
            square_y_pos += legend_element_side
            #print (colormap_tuple_list[i][1])
            fragment = Rectangle((legend_x_position, square_y_pos), max_scaffold_length / 64, legend_element_side,
                                 fill=True, edgecolor="black", facecolor=colormap_tuple_list[i][1], linewidth=0.5)
            subplot.add_patch(fragment)
            if i == (len(colormap_tuple_list) - 1):
                legend_element_label = "> %.2f" % colormap_tuple_list[i][0]
            else:
                legend_element_label = "%.2f - %.2f" % (colormap_tuple_list[i][0], colormap_tuple_list[i + 1][0])
            subplot.annotate(legend_element_label,
                             xy=(legend_x_position + 2 * max_scaffold_length / 64, square_y_pos),
                             xycoords='data', fontsize=13,
                             xytext=(legend_x_position + 2 * max_scaffold_length / 64, square_y_pos),)

    plt.xlim(xmin=0, xmax=int(1.2 * max_scaffold_length))
    plt.ylim(ymin=0, ymax=start_y + 2 * scaffold_height)
    #plt.colorbar(subplot)
    #plt.tight_layout()
    plt.subplots_adjust(left=left_offset, right=0.95)  # bottom=0.1, right=0.8, top=0.9
    if suptitle:
        plt.suptitle(suptitle)
    for extension in ext_list:
        plt.savefig("%s.%s" % (output_prefix, extension))
    plt.close()

    no_snps_windows_count_dict.write("%s.no_snps.windows.count" % output_prefix)
    masked_windows_count_dict.write("%s.masked.windows.count" % output_prefix)

    masked_regions_fd.close()
                    dest="split_values",
                    help="Split values. Default: False")
parser.add_argument("-s", "--value_separator", action="store", dest="value_separator", default=",",
                    help="Value separator. Default: ','")
parser.add_argument("-g", "--ignore_value_repeats", action="store_true", dest="ignore_value_repeats",
                    help="Ignore repeats of values (i.e. values that correspond to the same fl_key and sl_key) "
                         "and don't raise an exception. If set, the value from the first entry is stored. Default: False")

args = parser.parse_args()

combined_table = TwoLvlDict(input_file=args.files, absent_symbol=args.absent_symbol,
                            split_values=args.split_values, value_sep=args.value_separator,
                            ignore_value_repeats=args.ignore_value_repeats)
#print combined_table
combined_table.write(args.output, absent_symbol=args.absent_symbol,
                     close_after_if_file_object=False, sort=False)
def star_and_htseq(self, genome_dir, samples_directory, output_directory,
                   gff_for_htseq, count_table_file_prefix, genome_fasta=None,
                   samples_to_handle=None, genome_size=None, annotation_gtf=None,
                   feature_from_gtf_to_use_as_exon=None,
                   exon_tag_to_use_as_transcript_id=None,
                   exon_tag_to_use_as_gene_id=None,
                   length_of_sequences_flanking_junction=None,
                   junction_tab_file_list=None,
                   three_prime_trim=None, five_prime_trim=None,
                   adapter_seq_for_three_prime_clip=None,
                   max_mismatch_percent_for_adapter_trimming=None,
                   three_prime_trim_after_adapter_clip=None,
                   output_type="BAM", sort_bam=True,
                   max_memory_per_thread_for_bam_sorting="4G",
                   include_unmapped_reads_in_bam=True,
                   output_unmapped_reads=True, two_pass_mode=False,
                   star_dir=None, threads=1, max_intron_length=None,
                   stranded_rnaseq="yes", min_alignment_quality=10,
                   feature_type_for_htseq="exon",
                   feature_id_attribute_for_htseq="gene_id",
                   htseq_mode="union"):

    STAR.threads = threads
    STAR.path = star_dir

    if genome_fasta:
        STAR.index(genome_dir, genome_fasta, annotation_gtf=None,
                   junction_tab_file=None, sjdboverhang=None,
                   genomeSAindexNbases=None, genomeChrBinNbits=None,
                   genome_size=genome_size)

    sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(samples_directory)
    self.prepare_diff_expression_directories(output_directory, sample_list)

    alignment_dir = "%s/alignment/" % output_directory

    count_pe_table = TwoLvlDict()
    count_se_table = TwoLvlDict()
    count_all_table = TwoLvlDict()

    count_pe_table_file = "%s/%s.pe.tab" % (output_directory, count_table_file_prefix)
    count_se_table_file = "%s/%s.se.tab" % (output_directory, count_table_file_prefix)
    count_all_table_file = "%s/%s.all.tab" % (output_directory, count_table_file_prefix)

    for sample in sample_list:
        print("Handling %s" % sample)
        sample_dir = "%s/%s/" % (samples_directory, sample)
        alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
        alignment_sample_se_dir = "%s/se/" % alignment_sample_dir

        filetypes, forward_files, reverse_files, se_files = self.make_lists_forward_and_reverse_files(sample_dir)

        if se_files:
            self.safe_mkdir(alignment_sample_se_dir)

        print("\tAligning paired reads...")

        count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
        #"""
        STAR.align(genome_dir, forward_files, reverse_read_list=reverse_files,
                   annotation_gtf=annotation_gtf,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type, sort_bam=sort_bam,
                   max_memory_per_thread_for_bam_sorting=max_memory_per_thread_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=alignment_sample_dir,
                   two_pass_mode=two_pass_mode,
                   max_intron_length=max_intron_length)

        alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

        print("\tIndexing alignment file for paired reads...")
        os.system("samtools index %s" % alignment_file)

        print("\tCounting paired reads aligned to features...")

        HTSeq.count(alignment_file, gff_for_htseq, count_file, samtype="bam",
                    order="pos", stranded_rnaseq=stranded_rnaseq,
                    min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq,
                    feature_id_attribute=feature_id_attribute_for_htseq,
                    mode=htseq_mode, suppress_progres_report=False)
        #"""
        sample_counts = SynDict(filename=count_file, header=False, separator="\t",
                                allow_repeats_of_key=False, split_values=False,
                                values_separator=",", key_index=0, value_index=1,
                                close_after_if_file_object=False, expression=int,
                                comments_prefix="__")
        count_pe_table[sample] = sample_counts

        if se_files:
            print("\tAligning single reads...")
            count_se_file = "%s/%s.htseq.count" % (alignment_sample_se_dir, sample)
            #"""
            STAR.align(genome_dir, se_files, reverse_read_list=None,
                       annotation_gtf=annotation_gtf,
                       feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                       exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                       exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                       length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                       junction_tab_file_list=junction_tab_file_list,
                       three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                       adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                       max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                       three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                       output_type=output_type, sort_bam=sort_bam,
                       max_memory_per_thread_for_bam_sorting=max_memory_per_thread_for_bam_sorting,
                       include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                       output_unmapped_reads=output_unmapped_reads,
                       output_dir=alignment_sample_se_dir,
                       two_pass_mode=two_pass_mode,
                       max_intron_length=max_intron_length)

            alignment_se_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_se_dir

            print("\tIndexing alignment file for single reads...")
            os.system("samtools index %s" % alignment_se_file)

            print("\tCounting single reads aligned to features...")

            HTSeq.count(alignment_se_file, gff_for_htseq, count_se_file, samtype="bam",
                        order="pos", stranded_rnaseq=stranded_rnaseq,
                        min_alignment_quality=min_alignment_quality,
                        feature_type=feature_type_for_htseq,
                        feature_id_attribute=feature_id_attribute_for_htseq,
                        mode=htseq_mode, suppress_progres_report=False)
            #"""
            sample_se_counts = SynDict(filename=count_se_file, header=False, separator="\t",
                                       allow_repeats_of_key=False, split_values=False,
                                       values_separator=",", key_index=0, value_index=1,
                                       close_after_if_file_object=False, expression=int,
                                       comments_prefix="__")
            count_se_table[sample] = sample_se_counts
        else:
            count_se_table[sample] = SynDict()

        count_all_table[sample] = SynDict()

        if se_files:
            for gene_id in set(sample_counts.keys()) | set(sample_se_counts.keys()):
                if (gene_id in sample_counts) and (gene_id in sample_se_counts):
                    count_all_table[sample][gene_id] = sample_counts[gene_id] + sample_se_counts[gene_id]
                elif gene_id in sample_counts:
                    count_all_table[sample][gene_id] = sample_counts[gene_id]
                elif gene_id in sample_se_counts:
                    count_all_table[sample][gene_id] = sample_se_counts[gene_id]
        else:
            count_all_table[sample] = count_pe_table[sample]

    count_pe_table.write(count_pe_table_file)
    count_se_table.write(count_se_table_file)
    count_all_table.write(count_all_table_file)
                    dest="output", required=True,
                    help="File to write statistics")
parser.add_argument("-l", "--log_file", action="store", dest="log_file", default="trimmomatic.log",
                    help="Name of file with trimmomatic log. Default: trimmomatic.log")

args = parser.parse_args()

samples = sorted(args.samples.split(",") if args.samples else os.listdir(args.samples_dir))

present_samples = []
for sample in samples:
    if os.path.isdir(args.samples_dir + sample):
        present_samples.append(sample)

reports_dict = TwoLvlDict()

for sample in present_samples:
    print("Handling report from %s" % sample)
    sample_dir = "%s%s/" % (args.samples_dir, sample)
    trimmomatic_log = "%s%s" % (sample_dir, args.log_file)
    reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log)

reports_dict.write(args.output)
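# Hypothetical invocation sketch (the script name, option flags and paths are
# assumptions, not taken from the original source): collect per-sample trimmomatic
# logs from subdirectories of a samples directory into a single statistics table.
#
# python collect_trimmomatic_reports.py --samples_dir samples/ -o trimmomatic_reports.stats -l trimmomatic.log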
            cluster_3d_dict[cluster_3d.id]["intersection"].append(intersection)
            cluster_3d_dict[cluster_3d.id]["intersection % of main cluster"].append(intersection * 100 / cluster_3d.len)
            cluster_3d_dict[cluster_3d.id]["interscection % of clusters"].append(intersection * 100 / cluster_3d_sub.len)

        cluster_3d_dict[cluster_3d.id]["total_intersection"] = \
            sum(cluster_3d_dict[cluster_3d.id]["intersection % of main cluster"]) \
            if cluster_3d_dict[cluster_3d.id]["intersection % of main cluster"] else 0

cluster_3d_dict.write("intersection_PmCDA1_3d_sub_and_nonsub_%i+_%.2f+.t" % (size, power))

total_intersection = [cluster_3d_dict[cluster_id]["total_intersection"] for cluster_id in cluster_3d_dict]

print("Total %i" % len(total_intersection))
print("No intersection %i" % total_intersection.count(0))
print("Intersection %i" % (len(total_intersection) - total_intersection.count(0)))

figure = plt.figure(1, figsize=(5, 5), dpi=300)
subplot = plt.subplot(1, 1, 1)
plt.hist(total_intersection)
plt.xlabel("% of intersection")
plt.ylabel("N")
                    help="Directory with families of species")
"""
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")
"""
args = parser.parse_args()

# run after scripts/expansion/compare_cluster.py

# out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()
for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species))

species_syn_dict.write("families_all_species.t", absent_symbol=".")

not_assembled = species_syn_dict.filter_by_line(is_assembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")

assembled_ids = IdSet(species_syn_dict.sl_keys())
assembled_ids.write("assembled_families.ids")
not_assembled_ids = IdSet(not_assembled.sl_keys())
not_assembled_ids.write("non_assembled_families.ids")

"""
if args.output != "stdout":
    out_fd.close()
"""
        gene_name = tmp[10]
        substitution = tmp[8]
        if substitution == ".":
            # skip substitutions not in CDS
            continue
        if gene_alias_dict:
            if gene_name in gene_alias_dict:
                gene_name = gene_alias_dict[gene_name]
        if args.rem_nuc_sub:
            substitution = substitution.split("/")[0][2:]
        if args.convert_to_single_letter:
            ref_aa = seq1(substitution[:3])
            try:
                if substitution[-1] == "*":
                    alt_aa = "*"
                    pos = substitution[3:-1]
                else:
                    alt_aa = seq1(substitution[-3:])
                    pos = substitution[3:-3]
                substitution = "%s%s%s" % (ref_aa, pos, alt_aa)
            except:
                print(substitution, "aaa", filename, gene_name)
        if gene_name not in summary_dict[name]:
            summary_dict[name][gene_name] = [substitution]
        else:
            summary_dict[name][gene_name].append(substitution)

summary_dict.write(out_fd, absent_symbol=".")

if args.output != "stdout":
    out_fd.close()
with open("%s_test.t" % args.prefix, "w") as out_fd:
    for gene in gene_dict:
        for sub_feature in gene_dict[gene]:
            out_fd.write("%s\t%s\t%i\n" % (gene, sub_feature, gene_dict[gene][sub_feature]))

lengths_dict = get_feature_lengths(record_dict)
count_dict = TwoLvlDict({})
for record in lengths_dict:
    count_dict[record] = {}
    for feature_type in lengths_dict[record]:
        count_dict[record][feature_type] = len(lengths_dict[record][feature_type])
count_dict.write("%s_counts.t" % args.prefix)

total_lengths = get_total_feature_lengths(lengths_dict, out_filename="%s_feature_lengths.t" % args.prefix)

white_list = ["five_prime_UTR", "three_prime_UTR", "CDS", "ncRNA"]
collapsed_dict = feature_lengths_collapse_records(lengths_dict,
                                                  synonym_dict={"snoRNA": "ncRNA",
                                                                "snRNA": "ncRNA"})
for feature in collapsed_dict:
    collapsed_dict[feature] = np.array(collapsed_dict[feature])

bin_dict = {
"general_tree.nwk") if args.species_synonym_file: synonyms_dict = read_synonyms_dict(args.species_synonym_file, header=False, separator="\t", split_values=False) for node in cafe_report.general_data.tree.traverse(): if node.name in synonyms_dict: node.name = synonyms_dict[node.name] cafe_report.general_data.write_general_tree(general_trees_dir + "general_tree_latin.nwk") cafe_report.general_data.draw_expansion_contraction() cafe_report.general_data.draw_significant_expansion_contraction() """ with open(background_genes_dir + "background_genes.t", "w") as back_fd: with open(background_genes_dir + "background_genes_list.txt", "w") as back_list_fd: back_fd.write("#id\tfamaliy_p_value\tref_gene\n") for record in filtered_out_report: #print(record) if reference_genes_dict[record.id][0]: random_reference_gene = choice(reference_genes_dict[record.id][0]) back_list_fd.write(random_reference_gene + "\n") else: random_reference_gene = "." back_string = "%s\t%f\t%s\n" % (record.id, record.family_p_value, random_reference_gene) back_fd.write(back_string) """ statistics_dict.write(statistics_dir + "node_statistics.t", absent_symbol=".")