def count_locations(self, annotation_black_list=[], allow_several_counts_of_record=False,
                    out_filename="location_counts.t", write=True, count_dir="location_counts"):
    os.system("mkdir -p %s" % count_dir)
    regions_dict = self._split_regions()
    region_counts_dict = TwoLvlDict({})
    for region in regions_dict:
        count_locations_dict = {"igc": 0, "unknown": 0}
        for record in regions_dict[region]:
            # records without annotated location are counted as "unknown"
            if ("Loc" not in record.description) or (not record.description["Loc"]):
                count_locations_dict["unknown"] += 1
                continue
            #print(record.description["Loc"])
            if allow_several_counts_of_record:
                # count every location of the record separately
                for location in record.description["Loc"]:
                    if location in annotation_black_list:
                        continue
                    if location not in count_locations_dict:
                        count_locations_dict[location] = 1
                    else:
                        count_locations_dict[location] += 1
            else:
                # count the record once, under the combined (sorted) location key
                full_location = []
                for location in record.description["Loc"]:
                    if location in annotation_black_list:
                        continue
                    full_location.append(location)
                if not full_location:
                    continue
                full_location.sort()
                full_location = "/".join(full_location)
                if full_location not in count_locations_dict:
                    count_locations_dict[full_location] = 1
                else:
                    count_locations_dict[full_location] += 1

        labels = []
        counts = []
        #colors = []
        for location in count_locations_dict:
            if count_locations_dict[location] == 0 or location in annotation_black_list:
                continue
            labels.append(location)
            counts.append(count_locations_dict[location])
        region_counts_dict[region] = OrderedDict([(label, count) for label, count in zip(labels, counts)])

    if write:
        region_counts_dict.write("%s/%s" % (count_dir, out_filename))

    return region_counts_dict
def count_types(self, output_file=None, total_output_file=None, return_mode="chrom"):
    annotated_types = self.get_annotated_types()
    count_dict = TwoLvlDict()
    total_count_dict = OrderedDict()

    for type in annotated_types:
        total_count_dict[type] = OrderedDict()
        total_count_dict[type]["complete"] = 0
        total_count_dict[type]["partial"] = 0

    for chrom in self.records:
        count_dict[chrom] = OrderedDict()
        for type in annotated_types:
            count_dict[chrom][type] = 0

    for chrom in self.records:
        for record in self.records[chrom]:
            count_dict[chrom][record.type] += 1
            if record.partial:
                total_count_dict[record.type]["partial"] += 1
            else:
                total_count_dict[record.type]["complete"] += 1

    if output_file:
        count_dict.write(output_file)

    if total_output_file:
        with open(total_output_file, "w") as out_fd:
            out_fd.write("#rRNA\tComplete%s\tPartial%s\n"
                         % ("(>%.2f of expected length)" % self.partial_threshold if self.partial_threshold else "",
                            "(<%.2f of expected length)" % self.partial_threshold if self.partial_threshold else ""))
            for type in total_count_dict:
                out_fd.write("%s\t%i\t%i\n" % (type,
                                               total_count_dict[type]["complete"],
                                               total_count_dict[type]["partial"]))

    if return_mode == "chrom":
        return count_dict
    elif return_mode == "total":
        return total_count_dict
    elif return_mode == "both":
        return count_dict, total_count_dict
    else:
        raise ValueError("Unknown return mode. Allowed variants: 'chrom', 'total', 'both'")
def get_leaf_values(self, write=True):
    leaf_values_dict = TwoLvlDict()
    dN_dict = self._get_tree_dist_dict(self.dNtree)
    dS_dict = self._get_tree_dist_dict(self.dStree)
    W_dict = self._get_tree_dist_dict(self.Wtree)

    leaf_values_dict["dN"] = dN_dict
    leaf_values_dict["dS"] = dS_dict
    leaf_values_dict["W"] = W_dict

    if write:
        leaf_values_dict.write("leaf_values.t")

    return leaf_values_dict
def results_extraction_listener(queue, output_file_prefix, selected_species_list=None):
    """Listens for messages on the queue and writes them to file."""

    positive_selection_dict = TwoLvlDict()
    selected_species_positive_selection_dict = TwoLvlDict()

    error_fd = open("errors.err", "w")
    error_fd.write("#sample\terror_code\n")

    while 1:
        result = queue.get()

        if result == 'finish':
            positive_selection_dict.write("%s.all" % output_file_prefix, absent_symbol=".")
            if selected_species_list:
                selected_species_positive_selection_dict.write("%s.selected_species" % output_file_prefix,
                                                               absent_symbol=".")
            # print positive_selection_dict.table_form(absent_symbol=".")
            break

        # an integer in the second field is an error code for the sample
        if isinstance(result[1], int):
            error_fd.write("%s\t%i\n" % (result[0], result[1]))
            continue

        if result[1]:
            positive_selection_dict[result[0]] = result[1]
            if selected_species_list:
                for species in selected_species_list:
                    if species in result[1]:
                        if result[0] not in selected_species_positive_selection_dict:
                            selected_species_positive_selection_dict[result[0]] = {}
                        selected_species_positive_selection_dict[result[0]][species] = result[1][species]

    error_fd.close()
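# Usage sketch (not from the original code base): how a listener like
# results_extraction_listener() is typically wired up with multiprocessing.
# One pool slot runs the listener, the others run workers that put
# (sample, result_dict) tuples, or (sample, error_code) on failure, onto a
# shared Manager queue; the 'finish' sentinel makes the listener flush and exit.
# The worker body, sample names and output prefix below are hypothetical.
import multiprocessing as mp


def worker(sample, queue):
    # a real worker would run the per-sample analysis here
    queue.put((sample, {"species_A": 0.01}))


if __name__ == "__main__":
    manager = mp.Manager()
    queue = manager.Queue()
    pool = mp.Pool(processes=4)
    # start the listener first so it is ready to consume results
    pool.apply_async(results_extraction_listener, (queue, "positive_selection"))
    jobs = [pool.apply_async(worker, (sample, queue)) for sample in ("sample1", "sample2")]
    for job in jobs:
        job.get()
    queue.put('finish')  # sentinel: listener writes output tables and stops
    pool.close()
    pool.join()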
def combine_count_files(count_file_list, output_file, sample_name_list=None):
    if sample_name_list is not None:
        if len(count_file_list) != len(sample_name_list):
            raise ValueError("Number of count files doesn't match the number of sample names")

    samples = zip(sample_name_list if sample_name_list else count_file_list,
                  count_file_list)

    count_table = TwoLvlDict()
    for sample, filename in samples:
        count_table[sample] = SynDict(filename=filename, header=False, separator="\t",
                                      allow_repeats_of_key=False, split_values=False,
                                      values_separator=",", key_index=0, value_index=1,
                                      close_after_if_file_object=False, expression=None,
                                      comments_prefix="__")

    count_table.write(output_file)
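# Hypothetical invocation (file names and sample labels are placeholders):
# merges per-sample count files, one column per sample, into a single
# tab-separated table.
#
#   combine_count_files(["sample1.counts.tab", "sample2.counts.tab"],
#                       "combined_counts.tab",
#                       sample_name_list=["sample1", "sample2"])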
def get_general_stats(self):
    stat_dict = TwoLvlDict()
    for report_id in self:
        stat_dict[report_id] = OrderedDict()
        stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
        stat_dict[report_id]["pairs_without_adapters"] = self[report_id].retained_pairs
        stat_dict[report_id]["pairs_without_adapters_fraction"] = self[report_id].retained_pairs_fraction
    return stat_dict
def count_reads_and_bases(self, fastq_file_list, stat_file=None):
    fastq_list = [fastq_file_list] if isinstance(fastq_file_list, str) else fastq_file_list
    counts = TwoLvlDict()
    for fastq_file in fastq_list:
        counts[fastq_file] = OrderedDict()
        counts[fastq_file]["Reads"] = 0
        counts[fastq_file]["Bases"] = 0

    for fastq_file in fastq_list:
        with self.metaopen(fastq_file, "r") as fastq_fd:
            for line in fastq_fd:                                        # header line (@...)
                counts[fastq_file]["Bases"] += len(fastq_fd.next())      # sequence line
                counts[fastq_file]["Reads"] += 1
                fastq_fd.next()                                          # separator line (+)
                fastq_fd.next()                                          # quality line
        # each sequence line still carries the trailing "\n", so subtract one per read
        counts[fastq_file]["Bases"] = counts[fastq_file]["Bases"] - counts[fastq_file]["Reads"]

    counts.write()
    if stat_file:
        counts.write(stat_file)
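# Standalone sketch (not part of the class above): the same counting idea
# without the class wrapper, assuming a plain, uncompressed FASTQ
# ("reads.fastq" is a placeholder). Every record is exactly four lines and
# only the second line of each record contributes bases.
def count_reads_and_bases_plain(fastq_path):
    reads = 0
    bases = 0
    with open(fastq_path) as fd:
        for i, line in enumerate(fd):
            if i % 4 == 1:  # sequence line
                reads += 1
                bases += len(line.rstrip("\n"))
    return reads, bases

#print(count_reads_and_bases_plain("reads.fastq"))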
def get_general_stats(self):
    stat_dict = TwoLvlDict()
    for report_id in self:
        stat_dict[report_id] = OrderedDict()
        stat_dict[report_id]["machine_number"] = len(self[report_id].machine_id_list)
        stat_dict[report_id]["machine_ids"] = self[report_id].machine_id_list
        stat_dict[report_id]["flowcell_number"] = len(self[report_id].flowcell_id_list)
        stat_dict[report_id]["flowcell_ids"] = self[report_id].flowcell_id_list
        stat_dict[report_id]["lane_number"] = len(self[report_id].lane_table)
        stat_dict[report_id]["full_lane_ids"] = self[report_id].full_lane_id_list
        stat_dict[report_id]["short_lane_ids"] = self[report_id].short_lane_id_list
        stat_dict[report_id]["input_pairs"] = self[report_id].input_pairs
        stat_dict[report_id]["retained_pairs"] = self[report_id].retained_pairs
        stat_dict[report_id]["retained_pairs_fraction"] = self[report_id].retained_pairs_fraction
        stat_dict[report_id]["retained_forward_only"] = self[report_id].retained_forward_only
        stat_dict[report_id]["retained_reverse_only"] = self[report_id].retained_reverse_only
        stat_dict[report_id]["both_discarded"] = self[report_id].both_discarded
        stat_dict[report_id]["min_retained_pairs_in_tiles_fraction"] = self[report_id].minimum_retained_pairs_in_tiles_fraction
    return stat_dict
def get_results(samples_list, data_type):
    results = TwoLvlDict()
    for sample in samples_list:
        results[sample] = OrderedDict()
        filename = "%s/all_reads/%s_all_%s_coverage.tab" % (sample, sample, data_type)
        data = read_data(filename)
        if not data:
            print sample
            continue
        #print sample
        for gene in data:
            results[sample][gene] = data[gene]
        for proportions, name in zip([[1, 2], [2, 1], [1, 1]], ["1:2", "2:1", "1:1"]):
            chi_results = calculate_chi_squared(data, proportions)
            #print name
            results[sample][name + " Chi"] = chi_results[0]
            results[sample][name + " p-value"] = chi_results[1]
            #print chi_results
    return results
if args.labels_list is not None:
    if len(args.labels_list) != len(args.input_file_list):
        raise ValueError("Length of labels list is not equal to number of files with assemblies")

assemblies_dict = OrderedDict()

for i in range(0, len(args.input_file_list)):
    assembly_label = args.labels_list[i] if args.labels_list else "A%i" % (i + 1)
    tmp_index = "%s.tmp.idx" % assembly_label
    assemblies_dict[assembly_label] = SeqIO.index_db(tmp_index, args.input_file_list[i], format=args.format)

assembly_N50_dict = TwoLvlDict()
assembly_L50 = TwoLvlDict()
assembly_bins = []
assembly_contig_cumulative_length = OrderedDict()
assembly_contig_number_values = OrderedDict()
assembly_general_stats = TwoLvlDict()
assembly_length_array = OrderedDict()
assembly_lengths = TwoLvlDict()

for assembly in assemblies_dict:
    lengths_array, N50_dict, L50_dict, length_dict, total_length, longest_contig, Ns_number, bins, \
        contig_cumulative_length_values, contig_number_values = \
        SequenceRoutines.calculate_assembly_stats(assemblies_dict[assembly],
                                                  thresholds_list=args.thresholds,
                                                  seq_len_file="%s.%s.len" % (args.output_prefix, assembly))
    assembly_N50_dict[assembly] = N50_dict
    assembly_L50[assembly] = L50_dict
    assembly_contig_cumulative_length[
}

annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat",
                         "noncoding_exon", "intron", "repeat_region",
                         "telomere", "gene_cassette", "five_prime_UTR_intron"]

with open(args.annotations) as gff_fd:
    for record in GFF.parse(gff_fd):
        annotations_dict[record.id] = record

bad_region_dict = {}
with open(args.masking) as gff_fd:
    for record in GFF.parse(gff_fd):
        bad_region_dict[record.id] = record

statistics_dict = TwoLvlDict(OrderedDict({}))

print("Handling %s" % sample)
statistics_dict[sample] = OrderedDict({})
os.system("mkdir -p %s" % clustering_dir)

mutations = CollectionVCF(in_file=args.vcf_file if args.vcf_file else "%s.vcf" % args.sample_name,
                          from_file=True)
mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict)
mutations.set_location_flag(bad_region_dict, check_location, "BR")
mutations.check_by_ref_and_alt(ref_alt_variants["deaminases"],
import argparse

from Routines import FileRoutines
from CustomCollections.GeneralCollections import TwoLvlDict

parser = argparse.ArgumentParser()

parser.add_argument("-f", "--files", action="store", dest="files", required=True,
                    type=FileRoutines.make_list_of_path_to_files_from_string,
                    help="Comma-separated list of files/directories with tables")
parser.add_argument("-o", "--output", action="store", dest="output", required=True,
                    help="Output file with combined table.")
parser.add_argument("-a", "--absent_symbol", action="store", dest="absent_symbol", default=".",
                    help="Symbol to be treated as absent value")
parser.add_argument("-v", "--split_values", action="store_true", dest="split_values",
                    help="Split values. Default: False")
parser.add_argument("-s", "--value_separator", action="store", dest="value_separator", default=",",
                    help="Value separator. Default: ','")
parser.add_argument("-g", "--ignore_value_repeats", action="store_true", dest="ignore_value_repeats",
                    help="Ignore repeats of values (i.e. values that correspond to the same fl_key and sl_key) "
                         "and don't raise an exception. If set, the value from the first entry is stored. "
                         "Default: False")

args = parser.parse_args()

combined_table = TwoLvlDict(input_file=args.files, absent_symbol=args.absent_symbol,
                            split_values=args.split_values, value_sep=args.value_separator,
                            ignore_value_repeats=args.ignore_value_repeats)
#print combined_table
combined_table.write(args.output, absent_symbol=args.absent_symbol,
                     close_after_if_file_object=False, sort=False)
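# Assumed interface sketch (NOT the CustomCollections implementation). The
# scripts in this collection only rely on TwoLvlDict behaving like a nested
# dict with a write() method that dumps a table and fills absent cells with
# absent_symbol. The stand-in below sketches that assumed behaviour; the real
# class has many more options, and the column/row orientation used here is a
# guess for illustration only.
from collections import OrderedDict


class MiniTwoLvlDict(OrderedDict):
    def write(self, out_file, absent_symbol=".", separator="\t"):
        # collect the union of second-level keys, preserving first-seen order
        sl_keys = []
        for fl_key in self:
            for sl_key in self[fl_key]:
                if sl_key not in sl_keys:
                    sl_keys.append(sl_key)
        with open(out_file, "w") as out_fd:
            # first-level keys become columns, second-level keys become rows
            out_fd.write("#" + separator + separator.join(map(str, self.keys())) + "\n")
            for sl_key in sl_keys:
                row = [str(self[fl_key].get(sl_key, absent_symbol)) for fl_key in self]
                out_fd.write(str(sl_key) + separator + separator.join(row) + "\n")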
parser = argparse.ArgumentParser()

parser.add_argument("-s", "--species_list", action="store", dest="species_list", type=lambda s: s.split(","),
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir", default="./", type=check_path,
                    help="Directory with per-species data. Default: ./")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_syn_dict = TwoLvlDict()
for species in args.species_list:
    species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species))
species_syn_dict.write("families_all_species.t", absent_symbol=".")

nonassembled = species_syn_dict.filter_by_line(filter_nonassembled)
species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".")
nonassembled.write("not_assembled_families_in_all_species.t", absent_symbol=".")

complicated_families_dict = nonassembled.filter_by_line(filter_splited_to_several_fam)
complicated_families_dict.write("complicated_families.t", absent_symbol=".")

complicated_families_syn_dict = SynDict()
from CustomCollections.GeneralCollections import TwoLvlDict
from Parsers.CCF import CollectionCCF


def get_intersection_length(start1, end1, start2, end2):
    # no overlap if one interval ends before the other starts
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1
    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift


overlap_clusters_percent = TwoLvlDict({})
#size = 8
#power = 0.05
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True,
                                           input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_sub_clusters = CollectionCCF(from_file=True,
                                               input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.ccf" % (size, power, size, power))
        PmCDA1_3d_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))
        PmCDA1_3d_sub_clusters.write_gff("/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_sub1_3d_size_%i+_power_%.2f+_good.gff" % (size, power, size, power))

        #cluster_3d_dict = OrderedDict({})
        cluster_3d_dict = TwoLvlDict({})
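# Illustrative sanity checks (not part of the original script):
# get_intersection_length() treats intervals as 1-based and inclusive.
assert get_intersection_length(1, 5, 10, 20) == 0    # disjoint intervals
assert get_intersection_length(5, 10, 8, 20) == 3    # partial overlap: 8..10
assert get_intersection_length(12, 15, 10, 20) == 4  # first interval nested in second: 12..15
assert get_intersection_length(8, 25, 10, 20) == 11  # second interval nested in first: 10..20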
        gene_dict[feature.qualifiers["Name"][0]] = OrderedDict({})
            for sub_feature in feature.sub_features:
                gene_dict[feature.qualifiers["Name"][0]][sub_feature.type] = len(sub_feature)
        if feature.type in ("snoRNA", "ncRNA", "snRNA"):
            gene_dict[feature.qualifiers["Name"][0]] = OrderedDict({"ncRNA": len(feature)})

with open("%s_test.t" % args.prefix, "w") as out_fd:
    for gene in gene_dict:
        for sub_feature in gene_dict[gene]:
            out_fd.write("%s\t%s\t%i\n" % (gene, sub_feature, gene_dict[gene][sub_feature]))

lengths_dict = get_feature_lengths(record_dict)
count_dict = TwoLvlDict({})
for record in lengths_dict:
    count_dict[record] = {}
    for feature_type in lengths_dict[record]:
        count_dict[record][feature_type] = len(lengths_dict[record][feature_type])
count_dict.write("%s_counts.t" % args.prefix)

total_lengths = get_total_feature_lengths(lengths_dict, out_filename="%s_feature_lengths.t" % args.prefix)

white_list = ["five_prime_UTR", "three_prime_UTR", "CDS", "ncRNA"]
collapsed_dict = feature_lengths_collapse_records(lengths_dict,
                                                  synonym_dict={"snoRNA": "ncRNA",
    return -ss.hypergeom.sf(mmax, n, n1, n2) + ss.hypergeom.sf(mmin, n, n1, n2)


def get_intersection_length(start1, end1, start2, end2):
    if start1 - end2 > 0 or start2 - end1 > 0:
        return 0
    start_shift = start1 - start2
    start_coef_shift = 0 if start_shift < 0 else 1
    end_shift = end1 - end2
    end_coef_shift = 0 if end_shift > 0 else 1
    return (end2 - start2 + 1) - start_coef_shift * start_shift + end_coef_shift * end_shift


overlap_clusters_percent = TwoLvlDict({})

totaly_genes = 6074

test_fd = open("probability.t", "w")
test_fd.write("#size\tpower\ttotal\tPmCDA1_3d\tPmCDA1_sub_3d\tintersection\tp-value\n")
print([float(f) / float(100) for f in range(1, 11)])
for size in range(3, 11):
    overlap_clusters_percent[size] = {}
    for power in [float(f) / float(100) for f in range(1, 11)]:
        PmCDA1_3d_clusters = CollectionCCF(from_file=True,
                                           input_file="/media/mahajrod/d9e6e5ee-1bf7-4dba-934e-3f898d9611c8/Data/LAN2xx/combined_vcf/clusters/%i/%.2f/PmCDA1_3d_size_%i+_power_%.2f+_good.ccf"
                                                      % (size, power, size, power))
        PmCDA1_3d_sub_clusters = CollectionCCF(
                    required=True,
                    help="Comma-separated list of species")
parser.add_argument("-d", "--species_dir", action="store", dest="species_dir", default="./", type=check_path,
                    help="Directory with per species statistics")
parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout",
                    help="Output file. Default: stdout")

args = parser.parse_args()

out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w")

species_stat_dict = TwoLvlDict()

for species in args.species_list:
    with open("%s%s/stat.t" % (args.species_dir, species), "r") as stat_fd:
        statistics = map(lambda s: s.strip().split("\t"), stat_fd.readlines())
        species_stat_dict[species] = OrderedDict(statistics)

species_stat_dict.write(out_fd)

if args.output != "stdout":
    out_fd.close()
"--gap_symbol", action="store", dest="gap_symbol", default="-", help="Gap symbol. Default - '-'") parser.add_argument("-m", "--histogram_output", action="store", dest="histogram_output", required=True, help="File to write histogram") args = parser.parse_args() unique_position_dict = TwoLvlDict() FileRoutines.safe_mkdir(args.output_dir) for alignment_file in args.input: alignment_name_list = FileRoutines.split_filename(alignment_file) output_prefix = "%s/%s.unique_positions" % (args.output_dir, alignment_name_list[1]) unique_position_dict[alignment_name_list[ 1]] = MultipleAlignmentRoutines.count_unique_positions_per_sequence_from_file( alignment_file, output_prefix, format=args.format, gap_symbol="-", return_mode="relative",
}

annotation_black_list = ["gene", "region", "ARS", "long_terminal_repeat",
                         "noncoding_exon", "intron", "repeat_region",
                         "telomere", "gene_cassette", "five_prime_UTR_intron"]

with open(gff_file) as gff_fd:
    for record in GFF.parse(gff_fd):
        annotations_dict[record.id] = record

bad_region_dict = {}
with open(bad_regions_file) as gff_fd:
    for record in GFF.parse(gff_fd):
        bad_region_dict[record.id] = record

statistics_dict = TwoLvlDict(OrderedDict({}))

for sample_set_name in sample_set_names_list:
    print("Handling %s" % sample_set_name)
    statistics_dict[sample_set_name] = OrderedDict({})
    os.chdir(workdir)
    os.system("mkdir -p %s" % sample_set_name)
    os.chdir(sample_set_name)
    os.system("mkdir -p %s" % clustering_dir)
    mutations = CollectionVCF(in_file="../../%s_SNP.vcf" % sample_set_name, from_file=True)
    mutations.get_location(annotations_dict, use_synonym=True, synonym_dict=annotation_synonym_dict)
    mutations.set_location_flag(bad_region_dict, check_location, "BR")
dest="output", required=True, help="File to write statistics") parser.add_argument( "-l", "--log_file", action="store", dest="log_file", default="trimmomatic.log", help="Name of files with trimmomatic log. Default - trimmomatic.log") args = parser.parse_args() samples = sorted( args.samples.split(",") if args.samples else os.listdir(args.samples_dir)) present_samples = [] for sample in samples: if os.path.isdir(args.samples_dir + sample): present_samples.append(sample) reports_dict = TwoLvlDict() for sample in present_samples: print("Handling report from %s" % sample) sample_dir = "%s%s/" % (args.samples_dir, sample) trimmomatic_log = "%s/trimmomatic.log" % sample_dir reports_dict[sample] = Trimmomatic.parse_log(trimmomatic_log) reports_dict.write(args.output)
action="store", dest="species_dir", default="./", type=FileRoutines.check_path, help="Directory with families of species") """ parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file. Default: stdout") """ args = parser.parse_args() # run after scripts/expansion/compare_cluster.py # out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") species_syn_dict = TwoLvlDict() for species in args.species_list: species_syn_dict[species] = read_synonyms_dict("%s%s/all.t" % (args.species_dir, species)) species_syn_dict.write("families_all_species.t", absent_symbol=".") not_assembled = species_syn_dict.filter_by_line(is_assembled) species_syn_dict.write("correctly_assembled_families_species.t", absent_symbol=".") assembled_ids = IdSet(species_syn_dict.sl_keys()) assembled_ids.write("assembled_families.ids") not_assembled_ids = IdSet(not_assembled.sl_keys()) not_assembled_ids.write("non_assembled_families.ids")
def star_and_htseq(self, genome_dir, samples_directory, output_directory, gff_for_htseq, count_table_file,
                   genome_fasta=None, samples_to_handle=None, genome_size=None, annotation_gtf=None,
                   feature_from_gtf_to_use_as_exon=None, exon_tag_to_use_as_transcript_id=None,
                   exon_tag_to_use_as_gene_id=None, length_of_sequences_flanking_junction=None,
                   junction_tab_file_list=None, three_prime_trim=None, five_prime_trim=None,
                   adapter_seq_for_three_prime_clip=None, max_mismatch_percent_for_adapter_trimming=None,
                   three_prime_trim_after_adapter_clip=None, output_type="BAM", sort_bam=True,
                   max_memory_for_bam_sorting=None, include_unmapped_reads_in_bam=True,
                   output_unmapped_reads=True, two_pass_mode=False, star_dir=None, threads=1,
                   max_intron_length=None, stranded_rnaseq="yes", min_alignment_quality=10,
                   feature_type_for_htseq="exon", feature_id_attribute_for_htseq="gene_id",
                   htseq_mode="union"):

    STAR.threads = threads
    STAR.path = star_dir

    if genome_fasta:
        STAR.index(genome_dir, genome_fasta, annotation_gtf=None, junction_tab_file=None,
                   sjdboverhang=None, genomeSAindexNbases=None, genomeChrBinNbits=None,
                   genome_size=genome_size)

    sample_list = samples_to_handle if samples_to_handle else self.get_sample_list(samples_directory)
    self.prepare_diff_expression_directories(output_directory, sample_list)

    alignment_dir = "%s/alignment/" % output_directory
    count_table = TwoLvlDict()
    for sample in sample_list:
        print("Handling %s" % sample)
        sample_dir = "%s/%s/" % (samples_directory, sample)
        alignment_sample_dir = "%s/%s/" % (alignment_dir, sample)
        filetypes, forward_files, reverse_files = self.make_lists_forward_and_reverse_files(sample_dir)

        print "\tAligning reads..."
        STAR.align(genome_dir, forward_files, reverse_read_list=reverse_files, annotation_gtf=annotation_gtf,
                   feature_from_gtf_to_use_as_exon=feature_from_gtf_to_use_as_exon,
                   exon_tag_to_use_as_transcript_id=exon_tag_to_use_as_transcript_id,
                   exon_tag_to_use_as_gene_id=exon_tag_to_use_as_gene_id,
                   length_of_sequences_flanking_junction=length_of_sequences_flanking_junction,
                   junction_tab_file_list=junction_tab_file_list,
                   three_prime_trim=three_prime_trim, five_prime_trim=five_prime_trim,
                   adapter_seq_for_three_prime_clip=adapter_seq_for_three_prime_clip,
                   max_mismatch_percent_for_adapter_trimming=max_mismatch_percent_for_adapter_trimming,
                   three_prime_trim_after_adapter_clip=three_prime_trim_after_adapter_clip,
                   output_type=output_type, sort_bam=sort_bam,
                   max_memory_for_bam_sorting=max_memory_for_bam_sorting,
                   include_unmapped_reads_in_bam=include_unmapped_reads_in_bam,
                   output_unmapped_reads=output_unmapped_reads,
                   output_dir=alignment_sample_dir,
                   two_pass_mode=two_pass_mode, max_intron_length=max_intron_length)

        alignment_file = "%s/Aligned.sortedByCoord.out.bam" % alignment_sample_dir

        print "\tIndexing alignment file..."
        os.system("samtools index %s" % alignment_file)

        print "\tCounting reads aligned to features..."
        count_file = "%s/%s.htseq.count" % (alignment_sample_dir, sample)
        HTSeq.count(alignment_file, gff_for_htseq, count_file, samtype="bam", order="pos",
                    stranded_rnaseq=stranded_rnaseq, min_alignment_quality=min_alignment_quality,
                    feature_type=feature_type_for_htseq, feature_id_attribute=feature_id_attribute_for_htseq,
                    mode=htseq_mode, suppress_progres_report=False)

        sample_counts = SynDict()
        sample_counts.read(count_file, header=False, separator="\t", allow_repeats_of_key=False,
                           split_values=False, values_separator=",", key_index=0, value_index=1,
                           close_after_if_file_object=False, expression=None, comments_prefix="__")
        count_table[sample] = sample_counts

    count_table.write(count_table_file)
def get_taxa_genomes_summary(self, taxa, email, output_directory, output_prefix, max_ids_per_query=8000,
                             max_download_attempts=500, min_scaffold_n50=None, min_contig_n50=None,
                             max_scaffold_l50=None, max_contig_l50=None, max_contig_count=None,
                             max_scaffold_count=None, max_chromosome_count=None, min_chromosome_count=None,
                             max_unlocalized_scaffolds=None, max_unplaced_scaffolds=None,
                             max_total_length=None, min_total_length=None, max_ungapped_length=None,
                             min_ungapped_length=None, no_ambiguous_species=True):
    Entrez.email = email
    taxa_list = taxa if isinstance(taxa, Iterable) else [taxa]

    all_files_dir = "%s%s/" % (self.check_path(output_directory), "all")
    nonambiguous_species_all_dir = "%snonambiguous_species_all/" % self.check_path(output_directory)
    ambiguous_species_all_dir = "%s%s/" % (self.check_path(output_directory), "ambiguous_species_all")
    chromosome_lvl_dir = "%s%s/" % (self.check_path(output_directory), "chromosome_lvl")
    non_chromosome_lvl_dir = "%s%s/" % (self.check_path(output_directory), "nonchromosome_lvl")
    filtered_by_integrity_dir = "%s%s/" % (self.check_path(output_directory), "passed_integrity_filters")
    filtered_out_by_integrity_dir = "%s%s/" % (self.check_path(output_directory), "not_passed_integrity_filters")
    stat_dir = "%s%s/" % (self.check_path(output_directory), "stat")
    taxa_stat_dir = "%s%s/" % (self.check_path(output_directory), "taxa_stat")

    for subdir in (all_files_dir, chromosome_lvl_dir, non_chromosome_lvl_dir, stat_dir, taxa_stat_dir,
                   nonambiguous_species_all_dir, ambiguous_species_all_dir):
        self.save_mkdir(subdir)

    filter_by_integrity = min_scaffold_n50 or min_contig_n50 or max_scaffold_l50 or max_contig_l50 \
                          or max_contig_count or max_scaffold_count or max_chromosome_count \
                          or min_chromosome_count or max_unlocalized_scaffolds \
                          or max_unplaced_scaffolds or max_total_length or min_total_length \
                          or max_ungapped_length or min_ungapped_length

    if filter_by_integrity:
        for subdir in (filtered_by_integrity_dir, filtered_out_by_integrity_dir):
            self.save_mkdir(subdir)

    for taxon in taxa_list:
        search_term = "%s[Orgn]" % taxon
        attempt_counter = 1
        while True:
            try:
                summary = Entrez.read(Entrez.esearch(db="genome", term=search_term, retmax=10000, retmode="xml"))
                break
            except URLError:
                if attempt_counter > max_download_attempts:
                    raise URLError("Network problems. Maximum attempt number is exceeded")
                print "URLError. Retrying... Attempt %i" % attempt_counter
                attempt_counter += 1

        print "Found %s species" % summary["Count"]
        #print summary
        taxon_stat_file = "%s/%s.stat" % (taxa_stat_dir, taxon.replace(" ", "_"))
        taxon_stat_dict = TwoLvlDict()

        for species_id in summary["IdList"]:  #[167]
            print "Handling species id %s " % species_id

            species_stat_file = "%s/%s.stat" % (stat_dir, species_id)
            species_stat_dict = TwoLvlDict()
            species_stat_dict[species_id] = OrderedDict()
            taxon_stat_dict[species_id] = OrderedDict()

            for stat in "all", "chromosome_lvl", "non_chromosome_lvl":
                species_stat_dict[species_id][stat] = 0
                taxon_stat_dict[species_id][stat] = 0

            #species_summary = Entrez.read(Entrez.esummary(db="genome", id=species_id, retmax=10000, retmode="xml"))
            #print species_summary

            # get assemblies linked with genome of species
            attempt_counter = 1
            while True:
                try:
                    assembly_links = Entrez.read(Entrez.elink(dbfrom="genome", id=species_id, retmode="xml",
                                                              retmax=10000, linkname="genome_assembly"))
                    break
                except URLError:
                    if attempt_counter > max_download_attempts:
                        raise URLError("Network problems. Maximum attempt number is exceeded")
                    print "URLError. Retrying... Attempt %i" % attempt_counter
                    attempt_counter += 1

            assembly_number = len(assembly_links)
            #print links
            #print links[0]["LinkSetDb"][0]["Link"]

            if assembly_links:
                if "LinkSetDb" in assembly_links[0]:
                    if assembly_links[0]["LinkSetDb"]:
                        if "Link" in assembly_links[0]["LinkSetDb"][0]:
                            assembly_ids = [id_dict["Id"] for id_dict in assembly_links[0]["LinkSetDb"][0]["Link"]]
                        else:
                            continue
                    else:
                        continue
                else:
                    continue
            else:
                continue

            number_of_ids = len(assembly_ids)
            print "\tFound %i assemblies" % number_of_ids

            # download summaries in groups of at most max_ids_per_query ids
            id_group_edges = np.arange(0, number_of_ids + 1, max_ids_per_query)
            if id_group_edges[-1] != number_of_ids:
                id_group_edges = np.append(id_group_edges, number_of_ids)

            number_of_id_groups = len(id_group_edges) - 1

            #print len(assembly_links[0]["LinkSetDb"][0]["Link"])
            #print assembly_ids
            #print len(assembly_ids)
            #assembly_dict = TwoLvlDict()
            #assemblies_with_ambiguous_taxonomies = SynDict()
            #summaries = Entrez.read(Entrez.esummary(db="assembly", id=",".join(assembly_ids), retmode="xml"))

            summary_list = None
            for i in range(0, number_of_id_groups):
                print "\tDownloading summary about assemblies %i - %i" % (id_group_edges[i] + 1, id_group_edges[i+1])
                #print len(assembly_ids[id_group_edges[i]:id_group_edges[i+1]])
                summaries = Entrez.read(Entrez.esummary(db="assembly",
                                                        id=",".join(assembly_ids[id_group_edges[i]:id_group_edges[i+1]]),
                                                        retmode="xml"), validate=False)
                tmp_summary_list = AssemblySummaryList(entrez_summary_biopython=summaries)
                summary_list = (summary_list + tmp_summary_list) if summary_list else tmp_summary_list
            print "\tDownloaded %i" % len(summary_list)
            if len(summary_list) != number_of_ids:
                print "\tWARNING:Not all assemblies were downloaded"
                """
                print "\tFollowing assemblies were not downloaded(ids):%s" % ",".join(set())
                """

            if summary_list:
                species_stat_dict[species_id]["all"] = len(summary_list)
                taxon_stat_dict[species_id]["all"] = len(summary_list)

                output_file = "%s%s.genome.summary" % ((output_prefix + ".") if output_prefix else "", species_id)
                #summary_list[0]['SpeciesName'].replace(" ", "_"))
                all_output_file = "%s/%s" % (all_files_dir, output_file)
                chromosome_lvl_output_file = "%s/%s" % (chromosome_lvl_dir, output_file)
                non_chromosome_lvl_output_file = "%s/%s" % (non_chromosome_lvl_dir, output_file)
                nonambiguous_species_output_file = "%s/%s" % (nonambiguous_species_all_dir, output_file)
                ambiguous_species_output_file = "%s/%s" % (ambiguous_species_all_dir, output_file)

                chromosome_lvl_summary_list, non_chromosome_lvl_summary_list = summary_list.filter_non_chrom_level_genomes()

                filtered_by_integrity_file = "%s/%s" % (filtered_by_integrity_dir, output_file)
                filtered_out_by_integrity_file = "%s/%s" % (filtered_out_by_integrity_dir, output_file)

                species_stat_dict[species_id]["chromosome_lvl"] = len(chromosome_lvl_summary_list)
                taxon_stat_dict[species_id]["chromosome_lvl"] = len(chromosome_lvl_summary_list)
                species_stat_dict[species_id]["non_chromosome_lvl"] = len(non_chromosome_lvl_summary_list)
                taxon_stat_dict[species_id]["non_chromosome_lvl"] = len(non_chromosome_lvl_summary_list)

                print("\tChromosome level assemblies %i" % species_stat_dict[species_id]["chromosome_lvl"])
                print("\tNon chromosome level assemblies %i" % species_stat_dict[species_id]["non_chromosome_lvl"])

                if chromosome_lvl_summary_list:
                    chromosome_lvl_summary_list.write(chromosome_lvl_output_file)
                if non_chromosome_lvl_summary_list:
                    non_chromosome_lvl_summary_list.write(non_chromosome_lvl_output_file)

                nonambiguous_species_summary_list, ambiguous_species_summary_list = summary_list.filter_ambiguous_species()
                #print(len(nonambiguous_species_summary_list), len(ambiguous_species_summary_list))
                species_stat_dict[species_id]["nonambiguous_species"] = len(nonambiguous_species_summary_list)
                species_stat_dict[species_id]["ambiguous_species"] = len(ambiguous_species_summary_list)

                print "\tAmbiguous species %i" % species_stat_dict[species_id]["ambiguous_species"]

                if nonambiguous_species_summary_list:
                    nonambiguous_species_summary_list.write(nonambiguous_species_output_file)
                if ambiguous_species_summary_list:
                    ambiguous_species_summary_list.write(ambiguous_species_output_file)

                summary_list.write(all_output_file)

                if filter_by_integrity:
                    filtered_by_integrity, filtered_out_by_integrity = \
                        summary_list.filter_by_integrity(min_scaffold_n50=min_scaffold_n50,
                                                         min_contig_n50=min_contig_n50,
                                                         max_scaffold_l50=max_scaffold_l50,
                                                         max_contig_l50=max_contig_l50,
                                                         max_contig_count=max_contig_count,
                                                         max_scaffold_count=max_scaffold_count,
                                                         max_chromosome_count=max_chromosome_count,
                                                         min_chromosome_count=min_chromosome_count,
                                                         max_unlocalized_scaffolds=max_unlocalized_scaffolds,
                                                         max_unplaced_scaffolds=max_unplaced_scaffolds,
                                                         max_total_length=max_total_length,
                                                         min_total_length=min_total_length,
                                                         max_ungapped_length=max_ungapped_length,
                                                         min_ungapped_length=min_ungapped_length,
                                                         no_ambiguous_species=no_ambiguous_species)
                    species_stat_dict[species_id]["filtered_by_integrity"] = len(filtered_by_integrity)
                    species_stat_dict[species_id]["filtered_out_by_integrity"] = len(filtered_out_by_integrity)

                    if filtered_by_integrity:
                        filtered_by_integrity.write(filtered_by_integrity_file)
                    if filtered_out_by_integrity:
                        filtered_out_by_integrity.write(filtered_out_by_integrity_file)

                    print "\tPassed integrity filters %i" % species_stat_dict[species_id]["filtered_by_integrity"]

            species_stat_dict.write(species_stat_file)
            print "\n\n"
        taxon_stat_dict.write(taxon_stat_file)
        """
"--header", action="store_true", dest="header", help="Header is present in input file") parser.add_argument("-o", "--output_file", action="store", dest="output", default="stdout", help="Output file. Default: stdout") args = parser.parse_args() out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") species_syn_dict = TwoLvlDict() out_fd.write( "#family\tspecies_with_family\tspecies_with_errors\tspecies_with_correct_fam\terror_ratio\n" ) with open(args.input, "r") as in_fd: if args.header: in_fd.readline() for line in in_fd: species_with_errors = 0 species_with_fam = 0 tmp = line.strip().split("\t") family_name = tmp[0] for fam in tmp[1:]: if fam != ".": species_with_fam += 1 if "_" in fam:
#"PmCDA1_sub1_3d", #"PmCDA1_6d", "HAP_sub1", #"PmCDA1_sub1_6d", #"A1_3d", #"A1_6d", #"A3G_3d", #"AID_3d", #"AID_6d" ] power_limits = [f / 100 for f in range(1, 11)] size_limits = [i for i in range(3, 11)] os.chdir(workdir) for sample_set in sample_set_names_list: stat_dict = TwoLvlDict(OrderedDict({})) print("Handling %s" % sample_set) all_clusters = CollectionCCF(from_file=True, input_file=workdir + all_files_subdir + sample_set + all_files_suffix) if "HAP" not in sample_set: all_clusters.check_strandness() for min_size in size_limits: stat_dict[min_size] = OrderedDict({}) os.system("mkdir -p %i %i/all " % (min_size, min_size)) above_size_clusters, below_size_clusters = all_clusters.filter_by_expression( "record.size >= %i" % min_size) above_size_clusters.write( "%i/all/%s_size_%i+%s" % (min_size, sample_set, min_size, all_files_suffix)) stat_dict[min_size][0.00] = len(above_size_clusters)
parser.add_argument("-c", "--convert_aa_to_single_letter", action="store_true", dest="convert_to_single_letter", help="Convert aminoacids to single letters") args = parser.parse_args() args.input = make_list_of_path_to_files(args.input) gene_alias_dict = SynDict() if args.gene_alias_file: gene_alias_dict.read(args.gene_alias_file, split_values=False) out_fd = sys.stdout if args.output == "stdout" else open(args.output, "w") summary_dict = TwoLvlDict() for filename in args.input: directory, prefix, extension = split_filename(filename) if args.write_dir_path and args.write_ext: name = filename elif args.write_dir_path: name = (directory + prefix) if directory else prefix elif args.write_ext: name = prefix + extension else: name = prefix if args.suffix_to_remove in name: name = name.replace(args.suffix_to_remove, "") summary_dict[name] = OrderedDict() with open(filename, "r") as file_fd:
                              len(filtered_out_report.records))

if args.ref_species_gene_file:
    reference_genes_dict = {}
    with open(args.ref_species_gene_file, "r") as ref_fd:
        for line in ref_fd:
            gene_family_id, genes = line.strip().split("\t")
            genes = [] if genes == "." else genes.split(",")
            reference_genes_dict[gene_family_id] = [genes[:]]
            if genes:
                reference_genes_dict[gene_family_id].append(choice(genes))
            # print gene_family_id
            #print reference_genes_dict[gene_family_id]

node_header_list = features_list + ["reference_gene"]
delta_index = features_list.index("delta")

statistics_dict = TwoLvlDict({})
for node_id in node_values:
    statistics_dict[node_id] = OrderedDict({"lost": 0,
                                            "new": 0,
                                            "lost_ref_ann": 0,
                                            "new_ref_ann": 0})

for node_id in node_values:
    fd_list = []
    for directory in node_info_dir, node_ref_dir:
        for mode in "all", "new", "lost":
            fd_list.append(open(