def _bam2stats(self): def __get_base_alignment_stats(string: str): d = {} # SamTools stats file columns: ID, stat, value, comment for line_list in Utilities.string_to_2d_array(string): if len(line_list) < 3 or line_list[0] != "SN": continue d[re.sub(":$", "", line_list[1])] = line_list[2] if len(d) == 0: logging.critical("Bad alignment: no SAMTools stats to extract!") return {} try: out = {"total_reads": d["raw total sequences"], "mapped_reads": d["reads mapped"], "total_bp": d["total length"], "mapped_bp": d["bases mapped"]} except KeyError: return {} return {"sample_{}".format(k): int(out[k]) for k in out} Utilities.batch_remove(self._pk.samtools_stats_file_name, self._pk.samtools_stats_log_file_name) s = subprocess.getoutput("samtools stats {a} 2> {b}".format(a=self._pk.samtools_sorted_file_name, b=self._pk.samtools_stats_log_file_name)) Utilities.dump_string(string=s, file=self._pk.samtools_stats_file_name) logging.info("Saved SAMTools total coverage statistics: '{}'".format(self._pk.samtools_stats_file_name)) self._samtools_stats_dict = __get_base_alignment_stats(s) del s
def __init__(self, path_keeper: PathsKeeper, threads_number: int): self._pk = path_keeper self._threads_number = threads_number Utilities.batch_remove(self._pk.mapped_reads_file_name, self._pk.samtools_converted_file_name, self._pk.samtools_sorted_file_name, self._pk.unmapped_reads_file_name, *self._pk.pairwise_unmapped_reads_files_list, self._pk.aligner_log_file_name)
def _bam2idxstats(self): Utilities.batch_remove(self._pk.samtools_idxstats_file_name, self._pk.samtools_idxstats_log_file_name) s = subprocess.getoutput("samtools idxstats {a} 2> {b}".format(a=self._pk.samtools_sorted_file_name, b=self._pk.samtools_idxstats_log_file_name)) Utilities.dump_string(string=s, file=self._pk.samtools_idxstats_file_name) logging.info("Saved SAMTools mapped reads statistics: '{}'".format(self._pk.samtools_idxstats_file_name)) self._samtools_idxstats_df = pd.DataFrame(Utilities.string_to_2d_array(s), columns=[self._index_column, "id_bp", "id_mapped_reads", "id_unmapped_reads"]) del s
def run(self): subprocess.getoutput("rm -f {}*".format(self._pk.samtools_sorted_file_name)) Utilities.batch_remove(self._pk.aligner_log_file_name) bwt_cmd_string = " ".join(self._get_cmd()) pipeline = """{a} 2> {b} | \ samtools view - -bu -@ {c} | \ samtools sort - -@ {c} -o {d}""".format(a=bwt_cmd_string, b=self._pk.aligner_log_file_name, c=self._threads_number, d=self._pk.samtools_sorted_file_name) logging.debug("Started alignment pipeline with arguments: '{}'".format(pipeline)) s = subprocess.getoutput(pipeline) logging.info("Completed alignment pipeline with arguments: '{a}' and output:\n{b}\n".format(a=pipeline, b=s))
def _bam2histogram(self): Utilities.batch_remove(self._pk.bedtools_histogram_file_name, self._pk.genomeCoverageBed_log_file_name) s = subprocess.getoutput("genomeCoverageBed -ibam {a} 2> {b}".format(a=self._pk.samtools_sorted_file_name, b=self._pk.genomeCoverageBed_log_file_name)) # GenomeCoverageBed details: https://bedtools.readthedocs.io/en/stable/content/tools/genomecov.html # Cannot be converted to DataFrame before stacking Utilities.dump_string(string=s, file=self._pk.bedtools_histogram_file_name) self._bedtools_histogram_2d_array = Utilities.string_to_2d_array(s) if len(self._bedtools_histogram_2d_array) == 0: logging.critical("Bad alignment: no BEDTools coverage histogram to save!") logging.info("Saved BEDTools coverage histogram data: '{}'".format(self._pk.bedtools_histogram_file_name)) del s
def _sam2bam2sorted_bam(self): subprocess.getoutput("rm -f {}*".format(self._pk.samtools_sorted_file_name)) Utilities.batch_remove(self._pk.samtools_converted_log_file_name) # SamTools details: http://www.htslib.org/doc/samtools.html # Avoiding self._pk.samtools_converted_file_name s = subprocess.getoutput("samtools view -bu -@ 1 {a} | \ samtools sort - -o -@ 1 {b}".format(a=self._pk.mapped_reads_file_name, b=self._pk.samtools_sorted_file_name)) Utilities.dump_string(string=s, file=self._pk.samtools_converted_log_file_name) logging.info("Sorted SAM file: '{}'".format(self._pk.samtools_sorted_file_name)) del s
def _reference2statistics(self): Utilities.batch_remove(self._pk.final_coverage_file_name) stats_dict = self._samtools_stats_dict if len(stats_dict) == 0: logging.critical("Bad alignment: empty SAMTools stats: '{}'".format(self._pk.samtools_stats_file_name)) return if len(self._stacked_coverages_df) == 0: logging.critical("Bad alignment: empty stacked BEDTools coverage: '{}'".format(self._pk.stacked_coverage_file_name)) return chunk_size = 10 ** 6 reader = pd.read_table(self._pk.bedtools_genome_file, sep='\t', header="infer", names=[self._index_column, "id_bp"], chunksize=chunk_size) for chunk_number, reference_df in enumerate(reader): genomes_coverages_df = reference_df.merge(self._stacked_coverages_df.loc[:, [self._index_column] + [i for i in list(self._stacked_coverages_df) if i not in list(reference_df)]], on=self._index_column, how="left") genomes_coverages_df = genomes_coverages_df[~genomes_coverages_df[self._index_column].isin(["*", "genome"])] if self._non_zero_bool: genomes_coverages_df = genomes_coverages_df[genomes_coverages_df.id_coverage_breadth.notnull()] else: genomes_coverages_df = genomes_coverages_df.fillna(0) genomes_coverages_df["id_total_relative_abundance"] = (10 ** 12) * genomes_coverages_df["id_mapped_bp"].astype(int) / (genomes_coverages_df["id_bp"].astype(int) * int(stats_dict["sample_total_bp"])) genomes_coverages_df["id_mapped_relative_abundance"] = (10 ** 12) * genomes_coverages_df["id_mapped_bp"].astype(int) / (genomes_coverages_df["id_bp"].astype(int) * int(stats_dict["sample_mapped_bp"])) # MRA details: http://www.ibmc.msk.ru/content/thesisDocs/TyakhtAV_thesis.pdf (p.63) genomes_coverages_df["sample_total_reads"] = stats_dict["sample_total_reads"] genomes_coverages_df["sample_mapped_reads"] = stats_dict["sample_mapped_reads"] genomes_coverages_df["sample_total_bp"] = stats_dict["sample_total_bp"] genomes_coverages_df["sample_mapped_bp"] = stats_dict["sample_mapped_bp"] genomes_coverages_df["sample_average_total_reads_bp"] = float(stats_dict["sample_total_reads"]) / float(stats_dict["sample_total_bp"]) genomes_coverages_df["sample_average_mapped_reads_bp"] = float(stats_dict["sample_mapped_reads"]) / float(stats_dict["sample_total_bp"]) genomes_coverages_df["sample_mapped_reads_to_total_reads"] = float(stats_dict["sample_mapped_reads"]) / float(stats_dict["sample_total_reads"]) genomes_coverages_df = genomes_coverages_df.merge(self._samtools_idxstats_df.loc[:, [self._index_column] + [i for i in list(self._samtools_idxstats_df) if i not in list(genomes_coverages_df)]], on=self._index_column, how="left") genomes_coverages_df["id_mapped_reads_per_million_sample_total_reads"] = genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 6) / int(stats_dict["sample_total_reads"]) genomes_coverages_df["id_mapped_reads_per_million_sample_mapped_reads"] = genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 6) / int(stats_dict["sample_mapped_reads"]) # RPM details: https://www.biostars.org/p/273537/ genomes_coverages_df["id_mapped_reads_per_kbp_per_million_sample_total_reads"] = genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 9) / (int(stats_dict["sample_total_reads"]) * genomes_coverages_df["id_bp"]) genomes_coverages_df["id_mapped_reads_per_kbp_per_million_sample_mapped_reads"] = genomes_coverages_df["id_mapped_reads"].astype(int) * (10 ** 9) / (int(stats_dict["sample_mapped_reads"]) * genomes_coverages_df["id_bp"]) # RPKM details: https://www.biostars.org/p/273537/ for int_column in ["id_bp", "id_coverage_breadth", "id_mapped_bp", "id_maximal_coverage_depth", "id_mapped_reads", "sample_total_reads", "sample_mapped_reads", "sample_total_bp", "sample_mapped_bp"]: genomes_coverages_df[int_column] = genomes_coverages_df[int_column].astype(int) genomes_coverages_df = genomes_coverages_df.loc[:, [i for i in list(genomes_coverages_df) if len(i.strip()) > 0]] if chunk_number == 0: genomes_coverages_df.to_csv(self._pk.final_coverage_file_name, sep='\t', header=True, index=False) else: with open(file=self._pk.final_coverage_file_name, mode="a", encoding="utf-8") as f: genomes_coverages_df.to_csv(f, sep='\t', header=False, index=False) logging.info("Processed chunk {} with size of {} lines".format(chunk_number, chunk_size)) logging.info("Finished processing coverage table: '{}'".format(self._pk.final_coverage_file_name))
def _stack_coverage(self): Utilities.batch_remove(self._pk.stacked_coverage_file_name) # genomecov file columns: reference sequence name, depth of coverage, breadth of coverage with that depth, sequence length, coverage ratio stacked_coverages_2d_array = [] row_processing_2d_array = [] counting_id = "" for row_list in self._bedtools_histogram_2d_array: if len(row_list) != 5: logging.warning("Cannot parse coverage histogram row '{a}' from file '{b}'".format(a=row_list, b=self._pk.bedtools_histogram_file_name)) continue reference_id, id_local_coverage_depth, id_local_coverage_breadth, id_bp, id_local_coverage_ratio = row_list if reference_id == 'genome' or '*' in reference_id: continue if reference_id == counting_id and int(id_local_coverage_depth) > 0: row_processing_2d_array.append(row_list) else: if len(row_processing_2d_array) > 0: # output file columns: reference sequence name, maximal depth of coverage, total breadth of coverage, sequence length, coverage ratio, total mapped bases id_maximal_coverage_depth = max([int(i[1]) for i in row_processing_2d_array]) id_coverage_breadth = sum([int(i[2]) for i in row_processing_2d_array]) id_bp = int(row_processing_2d_array[0][3]) id_coverage_breadth_to_id_bp = sum([float(i[4]) for i in row_processing_2d_array]) id_mapped_bp = sum([int(i[1]) * int(i[2]) for i in row_processing_2d_array]) stacked_coverages_2d_array.append([counting_id, id_maximal_coverage_depth, id_coverage_breadth, id_bp, id_coverage_breadth_to_id_bp, id_mapped_bp]) row_processing_2d_array = [] counting_id = reference_id if len(stacked_coverages_2d_array) == 0: logging.critical("Bad alignment: no coverage to stack!") return self._stacked_coverages_df = pd.DataFrame(stacked_coverages_2d_array, columns=[self._index_column, "id_maximal_coverage_depth", "id_coverage_breadth", "id_bp", "id_coverage_breadth_to_id_bp", "id_mapped_bp"]) self._stacked_coverages_df.to_csv(self._pk.stacked_coverage_file_name, sep='\t', index=False) logging.info("Stacked BEDTools coverage: '{}'".format(self._pk.stacked_coverage_file_name)) del self._bedtools_histogram_2d_array, stacked_coverages_2d_array gc.collect()
def _index_bam(self): Utilities.batch_remove(self._pk.samtools_index_file_name, self._pk.samtools_index_log_file_name) s = subprocess.getoutput("samtools index {}".format(self._pk.samtools_sorted_file_name)) Utilities.dump_string(string=s, file=self._pk.samtools_index_log_file_name) logging.info("Indexed BAM file: '{}'".format(self._pk.samtools_index_file_name)) del s