def __run_vital_rule__(self, sample_path, keep_in_temp=True):
    """Run a parabam ``Stat`` pass over ``sample_path`` and return its stats path.

    A per-read callback reports template-length and base-quality figures,
    which parabam merges according to the blueprint built below.

    Args:
        sample_path: Input file handed to ``parabam.Stat.run``.
        keep_in_temp: Forwarded unchanged to ``parabam.Stat``.

    Returns:
        The value stored at ``["global"]["stats"]`` in the mapping returned
        by ``parabam.Stat.run``.
    """

    def rule(read, constants, master):
        """Return the statistics contributed by a single read."""
        collected = {}

        # NOTE(review): the original source was collapsed onto one line; the
        # conditional is assumed to cover only the three template-length keys.
        qualifies = read.is_read1 and read.is_proper_pair and read.mapq > 38
        if qualifies:
            template_span = abs(read.template_length)
            collected["sum"] = template_span
            collected["power_2"] = template_span ** 2
            collected["N"] = 1

        collected["read_len"] = len(read.seq)

        # Quality figures are computed on the raw character codes; no phred
        # offset is subtracted at this stage.
        qual_codes = [ord(symbol) for symbol in read.qual]
        lowest_code = min(qual_codes)
        highest_code = max(qual_codes)
        mean_code = np.mean(qual_codes)

        collected["qual_sum"] = mean_code
        collected["qual_power_2"] = mean_code ** 2
        collected["qual_N"] = 1
        collected["min_qual"] = lowest_code
        collected["max_qual"] = highest_code
        return collected

    # (name, initial value, parabam store method) for each tracked statistic.
    blueprint_spec = (
        ("sum", 0, "cumu"),
        ("power_2", 0, "cumu"),
        ("N", 0, "cumu"),
        ("read_len", 0, "max"),
        ("min_qual", 999, "min"),
        ("max_qual", 0, "max"),
        ("qual_sum", 0, "cumu"),
        ("qual_power_2", 0, "cumu"),
        ("qual_N", 0, "cumu"),
    )
    structures = {
        name: {"data": initial, "store_method": method}
        for name, initial, method in blueprint_spec
    }

    stat_options = {
        "temp_dir": self.temp_dir,
        "total_procs": self._total_procs,
        "task_size": 10000,
        "keep_in_temp": keep_in_temp,
    }
    stat_interface = parabam.Stat(**stat_options)

    return stat_interface.run(
        input_paths=[sample_path],
        constants={},
        rule=rule,
        struc_blueprint=structures,
    )["global"]["stats"]
def run_read_stat_rule(self, path, vital_stats, keep_in_temp=True):
    """Run a paired parabam ``Stat`` pass over ``path`` and return its output.

    Each group of reads is converted to simple reads via
    ``SimpleReadFactory``. The callback then reports a 2x6 summary array and
    two count matrices indexed by (locus count, integer average quality).

    Args:
        path: Input file handed to ``parabam.Stat.run``; also the key used
            to pick the result out of the returned mapping.
        vital_stats: Mapping that must provide ``"phred_offset"``,
            ``"max_qual"`` and ``"read_len"``; it is also passed to
            ``SimpleReadFactory``.
        keep_in_temp: Forwarded unchanged to ``parabam.Stat``.

    Returns:
        The entry for ``path`` in the mapping returned by
        ``parabam.Stat.run``.
    """
    simple_read_factory = SimpleReadFactory(vital_stats, trim_reads=self._trim)
    phred_offset = vital_stats["phred_offset"]

    # One column per offset-corrected quality value from 0 up to the maximum.
    quality_bins = vital_stats["max_qual"] - phred_offset + 1
    matrix_shape = (vital_stats["read_len"] + 1, quality_bins)

    def summarise_pair(ordered_reads):
        """Return the six-value summary row for reads in the given order."""
        lead, mate = ordered_reads[0], ordered_reads[1]
        return [
            len(lead.mima_loci),
            int(lead.five_prime),
            len(mate.mima_loci),
            int(mate.five_prime),
            lead.avg_qual,
            mate.avg_qual,
        ]

    def rule(reads, constants, master):
        """Build the summary array and count matrices for one read group."""
        simple_reads = [
            simple_read_factory.get_simple_read(raw_read) for raw_read in reads
        ]

        # Row 0 describes the reads in their given order, row 1 reversed.
        pair_table = np.zeros((2, 6))
        orderings = (simple_reads, simple_reads[::-1])
        for row_index, ordering in enumerate(orderings):
            pair_table[row_index, :] = summarise_pair(ordering)

        random_counts = np.zeros(matrix_shape)
        mima_counts = np.zeros(matrix_shape)

        for simple_read in simple_reads:
            mima_counts[simple_read.n_loci, int(simple_read.avg_qual)] += 1

            sample_size = len(simple_read.mima_loci)
            if sample_size <= 0:
                continue

            # Draw as many random quality characters as there are loci and
            # bin their offset-corrected mean alongside the observed counts.
            drawn_quals = np.random.choice(list(simple_read.qual), sample_size)
            corrected = [ord(symbol) - phred_offset for symbol in drawn_quals]
            drawn_mean = np.mean(corrected)
            random_counts[int(sample_size), int(drawn_mean)] += 1

        return {
            "read_array": np.array(pair_table),
            "random_counts": random_counts,
            "mima_counts": mima_counts,
        }

    structures = {
        "read_array": {"data": np.zeros((2, 6)), "store_method": "vstack"},
        "mima_counts": {"data": np.zeros(matrix_shape), "store_method": "cumu"},
        "random_counts": {"data": np.zeros(matrix_shape), "store_method": "cumu"},
    }

    stat_interface = parabam.Stat(
        temp_dir=self.temp_dir,
        pair_process=True,
        total_procs=self._total_procs,
        task_size=self._task_size,
        keep_in_temp=keep_in_temp,
        verbose=0,
    )
    results_by_input = stat_interface.run(
        input_paths=[path],
        constants={},
        rule=rule,
        struc_blueprint=structures,
    )
    return results_by_input[path]