예제 #1
0
  def __run_vital_rule__(self, sample_path, keep_in_temp=True):
    """Gather insert-size, read-length and base-quality statistics for a sample.

    A ``parabam.Stat`` job is run over ``sample_path`` with a per-read
    ``rule`` callback. For every read the callback reports:

    * ``sum`` / ``power_2`` / ``N``: the absolute template length, its
      square and a count of one. Only reported when the read is read1 of a
      proper pair with ``mapq`` above 38.
    * ``read_len``: ``len(read.seq)``.
    * ``qual_sum`` / ``qual_power_2`` / ``qual_N``: the mean of the raw
      ``ord()`` values of ``read.qual`` (no phred offset removed), the
      square of that mean, and a count of one.
    * ``min_qual`` / ``max_qual``: the smallest and largest raw ``ord()``
      value in ``read.qual``.

    The blueprint pairs each statistic with a starting value and a
    ``store_method`` ("cumu", "max" or "min").
    NOTE(review): presumably these mean running sum / maximum / minimum
    inside parabam -- confirm against the parabam documentation.

    Args:
      sample_path: path handed to parabam as the only input path.
      keep_in_temp: forwarded unchanged to ``parabam.Stat``.

    Returns:
      The ``["global"]["stats"]`` entry of the mapping returned by
      ``parabam.Stat.run``.
    """

    def rule(read, constants, master):
      # Per-read callback: returns this read's contribution to each statistic.
      per_read = {}

      # Insert-size figures come only from read1 of a proper pair, mapq > 38.
      if read.is_read1 and read.is_proper_pair and read.mapq > 38:
        template_span = abs(read.template_length)
        per_read["sum"] = template_span
        per_read["power_2"] = template_span ** 2
        per_read["N"] = 1

      per_read["read_len"] = len(read.seq)

      # Quality characters are used as raw code points here.
      qual_ords = list(map(ord, read.qual))
      lowest = min(qual_ords)
      highest = max(qual_ords)
      mean_qual = np.mean(qual_ords)

      per_read["qual_sum"] = mean_qual
      per_read["qual_power_2"] = mean_qual ** 2
      per_read["qual_N"] = 1

      per_read["min_qual"] = lowest
      per_read["max_qual"] = highest

      return per_read

    def slot(initial, store_method):
      # One blueprint entry: starting value plus how parabam should store it.
      return {"data": initial, "store_method": store_method}

    blueprint = {
      "sum": slot(0, "cumu"),
      "power_2": slot(0, "cumu"),
      "N": slot(0, "cumu"),
      "read_len": slot(0, "max"),
      # 999 starts the minimum above any value the rule is expected to report.
      "min_qual": slot(999, "min"),
      "max_qual": slot(0, "max"),
      "qual_sum": slot(0, "cumu"),
      "qual_power_2": slot(0, "cumu"),
      "qual_N": slot(0, "cumu"),
    }

    runner = parabam.Stat(
      temp_dir=self.temp_dir,
      total_procs=self._total_procs,
      task_size=10000,
      keep_in_temp=keep_in_temp,
    )

    results = runner.run(
      input_paths=[sample_path],
      constants={},
      rule=rule,
      struc_blueprint=blueprint,
    )

    return results["global"]["stats"]
예제 #2
0
    def run_read_stat_rule(self, path, vital_stats, keep_in_temp=True):
        """Run the paired-read statistics job over ``path``.

        Each read handed to the rule is converted with a
        ``SimpleReadFactory`` built from ``vital_stats`` and ``self._trim``.
        The rule then returns three arrays:

        * ``read_array``: a 2x6 float array. Row 0 describes the reads in
          the order given, row 1 the same reads in reverse order. The
          columns are ``len(mima_loci)`` and ``int(five_prime)`` of the
          first read, the same two values for the second read, then the
          ``avg_qual`` of the first and of the second read.
        * ``mima_counts``: incremented at ``[n_loci, int(avg_qual)]`` for
          every read.
        * ``random_counts``: for every read with at least one entry in
          ``mima_loci``, that many quality characters are drawn with
          ``np.random.choice``; the cell
          ``[len(mima_loci), int(mean(ord(q) - phred_offset))]`` is
          incremented.

        Both count matrices have ``vital_stats["read_len"] + 1`` rows and
        ``(vital_stats["max_qual"] - phred_offset) + 1`` columns.

        Args:
            path: input path handed to parabam; also the key used to pick
                the result out of the mapping parabam returns.
            vital_stats: mapping providing ``phred_offset``, ``max_qual``
                and ``read_len``; also passed to ``SimpleReadFactory``.
            keep_in_temp: forwarded unchanged to ``parabam.Stat``.

        Returns:
            ``out_paths[path]`` from ``parabam.Stat.run``.
        """
        read_factory = SimpleReadFactory(vital_stats, trim_reads=self._trim)
        phred_offset = vital_stats["phred_offset"]

        qual_bins = (vital_stats["max_qual"] - phred_offset) + 1
        matrix_shape = (vital_stats["read_len"] + 1, qual_bins)

        def summarise(ordered_reads):
            # Six-value summary of the first two reads, in the order given.
            lead = ordered_reads[0]
            mate = ordered_reads[1]
            lead_loci = len(lead.mima_loci)
            lead_five_prime = int(lead.five_prime)
            mate_loci = len(mate.mima_loci)
            mate_five_prime = int(mate.five_prime)
            return [
                lead_loci,
                lead_five_prime,
                mate_loci,
                mate_five_prime,
                lead.avg_qual,
                mate.avg_qual,
            ]

        def rule(reads, constants, master):
            simple_reads = list(map(read_factory.get_simple_read, reads))

            # Row 0: reads as given; row 1: the same reads reversed.
            pair_rows = np.zeros((2, 6))
            pair_rows[0, :] = summarise(simple_reads)
            pair_rows[1, :] = summarise(simple_reads[::-1])

            random_counts = np.zeros(matrix_shape)
            mima_counts = np.zeros(matrix_shape)

            for simple_read in simple_reads:
                observed_qual = int(simple_read.avg_qual)
                mima_counts[simple_read.n_loci, observed_qual] += 1

                n_draws = len(simple_read.mima_loci)
                if n_draws <= 0:
                    # Nothing to sample for reads without mima loci.
                    continue

                # Draw as many quality characters as the read has mima loci
                # and bin the offset-corrected mean of that random sample.
                drawn = np.random.choice(list(simple_read.qual), n_draws)
                drawn_scores = [ord(symbol) - phred_offset for symbol in drawn]
                drawn_mean = np.mean(drawn_scores)

                random_counts[int(n_draws), int(drawn_mean)] += 1

            return {
                "read_array": np.array(pair_rows),
                "random_counts": random_counts,
                "mima_counts": mima_counts,
            }

        blueprint = {}
        blueprint["read_array"] = {
            "data": np.zeros((2, 6)),
            "store_method": "vstack",
        }
        blueprint["mima_counts"] = {
            "data": np.zeros(matrix_shape),
            "store_method": "cumu",
        }
        blueprint["random_counts"] = {
            "data": np.zeros(matrix_shape),
            "store_method": "cumu",
        }

        runner = parabam.Stat(
            temp_dir=self.temp_dir,
            pair_process=True,
            total_procs=self._total_procs,
            task_size=self._task_size,
            keep_in_temp=keep_in_temp,
            verbose=0,
        )

        results = runner.run(
            input_paths=[path],
            constants={},
            rule=rule,
            struc_blueprint=blueprint,
        )

        return results[path]