Example #1
    def estimate(self):
        # This is a temporary solution that requires R and the
        # ShadowRegression package. Eventually, this will be
        # replaced with a pure-python implementation.
        import csv
        import os
        import subprocess
        import tempfile

        # Materialize the four paths in a tuple (not a generator), so the
        # cleanup loop in the 'finally' block can iterate over them again.
        tempfiles = tuple(tempfile.mkstemp()[1] for _ in range(4))
        read_counts, per_read, per_cycle, script_file = tempfiles
        try:
            # write counts to a file
            with open(read_counts, 'wt') as out:
                self._write_read_counts(out)

            # execute R script
            script = SHADOW_REGRESSION_R_SCRIPT.format(reads=read_counts,
                                                       method=self.method,
                                                       per_read=per_read,
                                                       per_cycle=per_cycle)
            with open(script_file, 'wt') as out:
                out.write(script)
            proc = subprocess.Popen(
                [self.rscript_exe, "--vanilla", script_file],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            with proc:
                stdout, stderr = proc.communicate()
                if proc.returncode != 0:
                    raise AtroposError(
                        "R script failed: rc={}; stdout={}; stderr={}".format(
                            proc.returncode, stdout, stderr))

            # read the results
            with open(per_read, 'rt') as i:
                reader = csv.reader(i, delimiter="\t")
                per_read_error = dict(reader)
                if len(per_read_error) != 4:
                    raise AtroposError("Invalid output from R script")
            with open(per_cycle, 'rt') as i:
                reader = csv.reader(i, delimiter="\t")
                per_cycle_error = list(row[0:3] for row in reader)
                if not per_cycle_error:
                    raise AtroposError("Invalid output from R script")

            return (per_read_error["error rate"],
                    dict(per_read=per_read_error, per_cycle=per_cycle_error))
        finally:
            for path in tempfiles:
                os.remove(path)
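
One subtlety above: `tempfile.mkstemp` also returns an open file descriptor, which the snippet discards. Below is a minimal sketch of the same create-use-delete lifecycle that closes the descriptor explicitly; `make_temp_paths` is a hypothetical helper for illustration, not part of Atropos:

    import os
    import tempfile

    def make_temp_paths(count):
        """Create temporary file paths, closing the descriptors that
        mkstemp opens so they are not leaked."""
        paths = []
        for _ in range(count):
            fd, path = tempfile.mkstemp()
            os.close(fd)  # mkstemp returns an open descriptor; close it
            paths.append(path)
        return paths

    paths = make_temp_paths(4)
    try:
        pass  # write inputs, run the external tool, read outputs
    finally:
        for path in paths:
            if os.path.exists(path):
                os.remove(path)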
Example #2
    def __call__(self, read1, read2):
        len1 = len(read1.sequence)
        len2 = len(read2.sequence)
        min_overlap = self.min_overlap
        # A min_overlap <= 1 is interpreted as a fraction of the shorter
        # read's length rather than as an absolute base count.
        if min_overlap <= 1:
            min_overlap = max(2, round(self.min_overlap * min(len1, len2)))

        if len1 < min_overlap or len2 < min_overlap:
            return (read1, read2)

        insert_matched = read1.insert_overlap and read2.insert_overlap

        if insert_matched:
            # If we've already determined that there is an insert overlap
            # with a 3' overhang, we can constrain our alignment
            aflags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
        else:
            aflags = SEMIGLOBAL
        # align read1 to read2 reverse-complement to be compatible with
        # InsertAligner
        read2_rc = reverse_complement(read2.sequence)
        aligner = Aligner(read2_rc, self.error_rate, aflags)
        alignment = aligner.locate(read1.sequence)

        if alignment:
            r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment
            if matches >= min_overlap:
                # Only correct errors if we haven't already done correction in
                # the InsertAligner
                if (self.mismatch_action and errors > 0 and not insert_matched
                        and read1.corrected == 0 and read2.corrected == 0):
                    self.correct_errors(read1, read2, alignment)

                if r2_start == 0 and r2_stop == len2:
                    # r2 is fully contained in r1
                    pass
                elif r1_start == 0 and r1_stop == len1:
                    # r1 is fully contained in r2
                    read1.sequence = read2_rc
                    read1.qualities = "".join(reversed(read2.qualities))
                elif r1_start > 0:
                    read1.sequence += read2_rc[r2_stop:]
                    if read1.qualities and read2.qualities:
                        read1.qualities += "".join(reversed(
                            read2.qualities))[r2_stop:]
                elif r2_start > 0:
                    read1.sequence = read2_rc + read1.sequence[r1_stop:]
                    if read1.qualities and read2.qualities:
                        read1.qualities = ("".join(reversed(read2.qualities)) +
                                           read1.qualities[r1_stop:])
                else:
                    raise AtroposError(
                        "Invalid alignment while trying to merge read "
                        "{}: {}".format(read1.name,
                                        ",".join(str(i) for i in alignment)))

                read1.merged = True
                read2 = None

        return (read1, read2)
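
The merge relies on `reverse_complement` and on reversing the quality string so both reads are expressed on the same strand. Atropos has its own implementation; the stand-in below is a minimal sketch assuming plain ACGTN sequences:

    _COMPLEMENT = str.maketrans("ACGTNacgtn", "TGCANtgcan")

    def reverse_complement(seq):
        """Return the reverse complement of a DNA sequence."""
        return seq.translate(_COMPLEMENT)[::-1]

    # Qualities have no complement; they are simply reversed so that
    # position i of the flipped read still lines up with quality i.
    seq = "ACCGT"
    quals = "IIJJF"
    print(reverse_complement(seq))   # ACGGT
    print(quals[::-1])               # FJJII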
Example #3
    def summary_fail_callback():
        """Raise AtroposError naming the workers that did not report
        summaries.
        """
        # Closure over 'self' from the enclosing method (see Example #6).
        missing_summaries = (
            set(range(1, self.threads)) - self.seen_summaries)
        raise AtroposError(
            "Missing summaries from processes {}".format(
                ",".join(str(summ) for summ in missing_summaries)))
Example #4
    def handle_records(self, context, records):
        """Handle a sequence of records.

        Args:
            context: The pipeline context (dict).
            records: The sequence of records.
        """
        for idx, record in enumerate(records):
            try:
                self.handle_record(context, record)
            except Exception as err:
                raise AtroposError(
                    "An error occurred at record {} of batch {}".format(
                        idx, context['index'])) from err
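
`raise ... from err` preserves the original exception as `__cause__`, so the record/batch context is added without hiding the root cause. A small self-contained illustration, using a stand-in for the Atropos error class:

    class AtroposError(Exception):
        """Stand-in for the Atropos base error class."""

    def handle_record(record):
        raise ValueError("bad quality string")

    try:
        try:
            handle_record("rec-0")
        except Exception as err:
            raise AtroposError(
                "An error occurred at record 0 of batch 7") from err
    except AtroposError as wrapped:
        print(wrapped)            # An error occurred at record 0 of batch 7
        print(wrapped.__cause__)  # bad quality string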
Example #5
    def _iter(self, sam):
        # Zipping the iterator with itself consumes records two at a time,
        # yielding consecutive (read1, read2) pairs from a name-sorted file.
        for reads in zip(sam, sam):
            if reads[0].query_name != reads[1].query_name:
                raise AtroposError(
                    "Consecutive reads {}, {} in paired-end SAM/BAM file do "
                    "not have the same name; make sure your file is "
                    "name-sorted and does not contain any "
                    "secondary/supplementary alignments.".format(
                        reads[0].query_name, reads[1].query_name))

            if reads[0].is_read1:
                assert reads[1].is_read2
            else:
                assert reads[1].is_read1
                reads = (reads[1], reads[0])

            yield tuple(self._as_sequence(r) for r in reads)
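
`zip(sam, sam)` works because both arguments are the same iterator: each output tuple draws two consecutive records, matching the (read1, read2) layout of a name-sorted paired-end file. The idiom in isolation:

    records = iter(["r1/1", "r1/2", "r2/1", "r2/2"])
    print(list(zip(records, records)))
    # [('r1/1', 'r1/2'), ('r2/1', 'r2/2')]
    # Note: zip silently drops an unpaired trailing record.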
Example #6
    def __call__(self):
        # Start worker processes, reserve a thread for the reader process,
        # which we will get back after it completes
        worker_args = (
            self.input_queue, self.pipeline, self.summary_queue, self.timeout)
        self.worker_processes = launch_workers(self.threads - 1, worker_args)

        self.num_batches = enqueue_all(
            self.command_runner.iterator(), self.input_queue, self.timeout,
            self.ensure_alive)

        logging.getLogger().debug(
            "Main loop complete; saw %d batches", self.num_batches)

        # Tell the worker processes no more input is coming
        enqueue_all(
            (None,) * self.threads, self.input_queue, self.timeout,
            self.ensure_alive)

        self.after_enqueue()

        # Now that the reader process is done, it essentially
        # frees up another thread to use for a worker
        self.worker_processes.extend(
            launch_workers(1, worker_args, offset=self.threads-1))

        # Wait for all summaries to be available on queue
        def summary_timeout_callback():
            """Ensure that workers are still alive.
            """
            try:
                ensure_processes(
                    self.worker_processes,
                    "Workers are still alive and haven't returned summaries: {}",
                    alive=False)
            except Exception as err:
                logging.getLogger().error(err)

        wait_on(
            self.summary_queue.full,
            wait_message="Waiting on worker summaries {}",
            timeout=self.timeout,
            wait=True,
            timeout_callback=summary_timeout_callback)

        # Process summary information from worker processes
        logging.getLogger().debug(
            "Processing summary information from worker processes")

        self.seen_summaries = set()
        self.seen_batches = set()

        def summary_fail_callback():
            """Raise AtroposError naming the workers that did not report
            summaries.
            """
            missing_summaries = (
                set(range(1, self.threads)) - self.seen_summaries)
            raise AtroposError(
                "Missing summaries from processes {}".format(
                    ",".join(str(summ) for summ in missing_summaries)))

        for _ in range(1, self.threads+1):
            batch = dequeue(
                self.summary_queue, fail_callback=summary_fail_callback)
            worker_index, worker_batches, worker_summary = batch
            if worker_summary is None:
                raise MulticoreError(
                    "Worker process {} died unexpectedly".format(worker_index))
            elif (
                    'exception' in worker_summary and
                    worker_summary['exception'] is not None):
                raise AtroposError(
                    "Worker process {} died unexpectedly".format(worker_index),
                    worker_summary['exception'])
            else:
                logging.getLogger().debug(
                    "Processing summary for worker %d", worker_index)
            self.seen_summaries.add(worker_index)
            self.seen_batches |= worker_batches
            self.command_runner.summary.merge(worker_summary)

        # Check if any batches were missed
        if self.num_batches > 0:
            missing_batches = (
                set(range(1, self.num_batches+1)) - self.seen_batches)
            if len(missing_batches) > 0:
                raise AtroposError(
                    "Workers did not process batches {}".format(
                        ",".join(str(batch) for batch in missing_batches)))

        self.finish()
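
The `(None,) * self.threads` enqueue above is the classic poison-pill shutdown: one sentinel per worker, and each worker exits its loop when it dequeues one. A minimal sketch of that protocol using `multiprocessing` directly; the names here are illustrative, not the Atropos API:

    import multiprocessing as mp

    def worker(input_queue, summary_queue, index):
        processed = 0
        while True:
            batch = input_queue.get()
            if batch is None:  # poison pill: no more input is coming
                break
            processed += 1
        summary_queue.put((index, processed))

    if __name__ == "__main__":
        threads = 4
        input_queue = mp.Queue()
        summary_queue = mp.Queue()
        workers = [
            mp.Process(target=worker, args=(input_queue, summary_queue, i))
            for i in range(1, threads + 1)]
        for proc in workers:
            proc.start()
        for batch in range(10):
            input_queue.put(batch)
        for _ in range(threads):  # one sentinel per worker
            input_queue.put(None)
        for _ in range(threads):
            print(summary_queue.get())
        for proc in workers:
            proc.join()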