def _check_deseq_args(self, arg_libs, conditions):
    """Test if the given arguments are sufficient.

    Quits with an error message if the library and condition lists
    differ in length, if the number of given libraries does not match
    the alignment statistics file, or if a known library is missing
    from the given library list.
    """
    if len(arg_libs) != len(conditions):
        self._write_err_msg_and_quit(
            "Error - The read library file list and condition list must "
            "have the same number of elements. You entered \n%s "
            "(= %s elements)\nand \n%s (= %s elements).\n" % (
                self._args.libs,
                len(arg_libs),
                self._args.conditions,
                len(conditions)))
    stats_reader = RawStatDataReader()
    alignment_stats = stats_reader.read(
        self._paths.read_alignments_stats_path)
    known_libs = list(alignment_stats.keys())
    if len(known_libs) != len(arg_libs):
        self._write_err_msg_and_quit(
            "The number of read libraries is lower or higher than "
            "expected. The following read libs are available: %s\nThe "
            "following read list string is suggested: \"%s\"\n"
            % (", ".join(known_libs), ",".join(known_libs)))
    # Every library from the statistics file must appear in the
    # user-supplied list.
    for known_lib in known_libs:
        if known_lib in arg_libs:
            continue
        self._write_err_msg_and_quit(
            "The library \"%s\" is not present in your list of "
            "libraries. Please add it.\n" % (known_lib))
def _write_alignment_stat_table(self):
    """Manage the creation of the mapping statistic output table."""
    reader = RawStatDataReader()
    processing_stats = reader.read(self._paths.read_processing_stats_path)
    final_stats = reader.read(self._paths.read_alignments_stats_path)
    # Primary-aligner and realigner statistics only exist when the
    # realignment step was requested.
    if self._args.realign:
        primary_stats = reader.read(
            self._paths.primary_read_aligner_stats_path)
        realign_stats = reader.read(
            self._paths.read_realigner_stats_path)
    else:
        primary_stats = None
        realign_stats = None
    stats_table = ReadAlignerStatsTable(
        processing_stats, final_stats, primary_stats, realign_stats,
        self._lib_names, self._paths.read_alignment_stats_table_path,
        self._args.paired_end)
    stats_table.write()
def create_coverage_files(self):
    """Create coverage files based on the read alignments.

    The coverages are calculated per replicon and the results are
    written to the output file. This might be slower but if all
    coverages are determined at once the data structure will become
    too large when working with large reference sequences.
    """
    self._test_folder_existance(self._paths.required_coverage_folders())
    alignment_stats = RawStatDataReader().read(
        self._paths.read_alignments_stats_path)
    lib_names = list(alignment_stats.keys())
    if self._was_paired_end_alignment(lib_names):
        self._paths.set_read_files_dep_file_lists_paired_end(
            self._paths.get_read_files(), lib_names)
    else:
        self._paths.set_read_files_dep_file_lists_single_end(
            self._paths.get_read_files(), lib_names)
    # Normalize either by all aligned reads or only by the uniquely
    # aligned ones, depending on the command line switch.
    if self._args.normalize_by_uniquely:
        aligned_counting = "no_of_uniquely_aligned_reads"
    else:
        aligned_counting = "no_of_aligned_reads"
    aligned_reads_by_lib = {
        read_file: round(attributes["stats_total"][aligned_counting])
        for read_file, attributes in alignment_stats.items()}
    min_no_of_aligned_reads = float(min(aligned_reads_by_lib.values()))
    # Generate the coverage files in parallel - one job per library.
    jobs = []
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=self._args.processes) as executor:
        for lib_name, bam_path in zip(
                lib_names, self._paths.read_alignment_bam_paths):
            jobs.append(executor.submit(
                self._create_coverage_files_for_lib,
                lib_name,
                bam_path,
                float(aligned_reads_by_lib[lib_name]),
                min_no_of_aligned_reads))
    # Evaluate thread outcome
    self._check_job_completeness(jobs)
def create_coverage_files(self):
    """Create coverage files based on the read alignments.

    The coverages are calculated per replicon and the results are
    written to the output file. This might be slower but if all
    coverages are determined at once the data structure will become
    too large when working with large reference sequences.
    """
    self._test_folder_existance(self._paths.required_coverage_folders())
    raw_stat_data_reader = RawStatDataReader()
    alignment_stats = [raw_stat_data_reader.read(
        self._paths.read_alignments_stats_path)]
    lib_names = list(alignment_stats[0].keys())
    was_paired_end_alignment = self._was_paired_end_alignment(lib_names)
    if not was_paired_end_alignment:
        self._paths.set_read_files_dep_file_lists_single_end(
            self._paths.get_read_files(), lib_names)
    else:
        self._paths.set_read_files_dep_file_lists_paired_end(
            self._paths.get_read_files(), lib_names)
    # Get number of aligned or number of uniquely aligned reads
    if not self._args.normalize_by_uniquely:
        aligned_counting = "no_of_aligned_reads"
    else:
        aligned_counting = "no_of_uniquely_aligned_reads"
    # Dict comprehension instead of dict([...]) (flake8-comprehensions
    # C404) - same mapping, clearer and faster.
    read_files_aligned_read_freq = {
        read_file: round(attributes["stats_total"][aligned_counting])
        for read_file, attributes in alignment_stats[0].items()}
    min_no_of_aligned_reads = float(min(
        read_files_aligned_read_freq.values()))
    # Run the generation of coverage in parallel
    jobs = []
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=self._args.processes) as executor:
        for lib_name, bam_path in zip(
                lib_names, self._paths.read_alignment_bam_paths):
            no_of_aligned_reads = float(
                read_files_aligned_read_freq[lib_name])
            jobs.append(executor.submit(
                self._create_coverage_files_for_lib, lib_name, bam_path,
                no_of_aligned_reads, min_no_of_aligned_reads))
    # Evaluate thread outcome
    self._check_job_completeness(jobs)
def quantify_gene_wise(self):
    """Manage the counting of aligned reads per gene."""
    self._test_folder_existance(self._paths.required_gene_quanti_folders())
    # Count splitting can be switched off via command line flags.
    norm_by_alignment_freq = not self._args.no_count_split_by_alignment_no
    norm_by_overlap_freq = not self._args.no_count_splitting_by_gene_no
    alignment_stats = RawStatDataReader().read(
        self._paths.read_alignments_stats_path)
    lib_names = sorted(alignment_stats.keys())
    annotation_files = self._paths.get_annotation_files()
    self._paths.set_annotation_paths(annotation_files)
    if self._was_paired_end_alignment(lib_names):
        self._paths.set_read_files_dep_file_lists_paired_end(
            self._paths.get_read_files(), lib_names)
    else:
        self._paths.set_read_files_dep_file_lists_single_end(
            self._paths.get_read_files(), lib_names)
    # One quantification job per library, run in parallel.
    jobs = []
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=self._args.processes) as executor:
        for lib_name, read_alignment_path in zip(
                lib_names, self._paths.read_alignment_bam_paths):
            jobs.append(executor.submit(
                self._quantify_gene_wise, lib_name, read_alignment_path,
                norm_by_alignment_freq, norm_by_overlap_freq,
                annotation_files))
    # Evaluate thread outcome
    self._check_job_completeness(jobs)
    self._gene_quanti_create_overview(
        annotation_files, self._paths.annotation_paths, lib_names)
def _check_deseq_args(self, arg_libs, conditions):
    """Test if the given arguments are sufficient.

    Quits with an error message on the first inconsistency between the
    given libraries/conditions and the alignment statistics file.
    """
    no_of_libs = len(arg_libs)
    no_of_conditions = len(conditions)
    if no_of_libs != no_of_conditions:
        self._write_err_msg_and_quit(
            "Error - The read library file list and condition list must "
            "have the same number of elements. You entered \n%s "
            "(= %s elements)\nand \n%s (= %s elements).\n"
            % (self._args.libs, no_of_libs, self._args.conditions,
               no_of_conditions))
    alignment_stats = RawStatDataReader().read(
        self._paths.read_alignments_stats_path)
    lib_names = list(alignment_stats.keys())
    if len(lib_names) != no_of_libs:
        available_libs = ", ".join(lib_names)
        suggested_string = ",".join(lib_names)
        self._write_err_msg_and_quit(
            "The number of read libraries is lower or higher than "
            "expected. The following read libs are available: %s\nThe "
            "following read list string is suggested: \"%s\"\n"
            % (available_libs, suggested_string))
    for lib in lib_names:
        if lib not in arg_libs:
            self._write_err_msg_and_quit(
                "The library \"%s\" is not present in your list of "
                "libraries. Please add it.\n" % (lib))
def quantify_gene_wise(self):
    """Manage the counting of aligned reads per gene."""
    self._test_folder_existance(
        self._paths.required_gene_quanti_folders())
    # The two command line flags disable the corresponding count
    # splitting; by default both normalizations are applied.
    split_by_alignment = not self._args.no_count_split_by_alignment_no
    split_by_gene = not self._args.no_count_splitting_by_gene_no
    reader = RawStatDataReader()
    alignment_stats = reader.read(self._paths.read_alignments_stats_path)
    lib_names = sorted(alignment_stats.keys())
    annotation_files = self._paths.get_annotation_files()
    self._paths.set_annotation_paths(annotation_files)
    read_files = self._paths.get_read_files()
    if self._was_paired_end_alignment(lib_names):
        self._paths.set_read_files_dep_file_lists_paired_end(
            read_files, lib_names)
    else:
        self._paths.set_read_files_dep_file_lists_single_end(
            read_files, lib_names)
    jobs = []
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=self._args.processes) as executor:
        for lib_name, bam_path in zip(
                lib_names, self._paths.read_alignment_bam_paths):
            jobs.append(executor.submit(
                self._quantify_gene_wise, lib_name, bam_path,
                split_by_alignment, split_by_gene, annotation_files))
    # Make sure every worker finished without raising.
    self._check_job_completeness(jobs)
    self._gene_quanti_create_overview(
        annotation_files, self._paths.annotation_paths, lib_names)