def estimate(self):
    # This is a temporary solution that requires R and the
    # ShadowRegression package. Eventually, this will be
    # replaced with a pure-python implementation.
    import csv
    import os
    import subprocess
    import tempfile
    # Materialize the paths in a tuple; a generator would be exhausted
    # after the unpacking below, leaving nothing to iterate over in the
    # finally block and thus leaking the temp files.
    tempfiles = tuple(tempfile.mkstemp()[1] for _ in range(4))
    read_counts, per_read, per_cycle, script_file = tempfiles
    try:
        # write counts to a file
        with open(read_counts, 'wt') as out:
            self._write_read_counts(out)

        # execute R script
        script = SHADOW_REGRESSION_R_SCRIPT.format(
            reads=read_counts, method=self.method, per_read=per_read,
            per_cycle=per_cycle)
        with open(script_file, 'wt') as out:
            out.write(script)
        proc = subprocess.Popen(
            [self.rscript_exe, "--vanilla", script_file],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        with proc:
            stdout, stderr = proc.communicate()
            if proc.returncode != 0:
                raise AtroposError(
                    "R script failed: rc={}; stdout={}; stderr={}".format(
                        proc.returncode, stdout, stderr))

        # read the results
        with open(per_read, 'rt') as i:
            reader = csv.reader(i, delimiter="\t")
            per_read_error = dict(reader)
            if len(per_read_error) != 4:
                raise AtroposError("Invalid output from R script")

        with open(per_cycle, 'rt') as i:
            reader = csv.reader(i, delimiter="\t")
            per_cycle_error = list(row[0:3] for row in reader)
            if not per_cycle_error:
                raise AtroposError("Invalid output from R script")

        return (
            per_read_error["error rate"],
            dict(per_read=per_read_error, per_cycle=per_cycle_error))
    finally:
        for path in tempfiles:
            os.remove(path)
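# For reference, the per-read output parsed above is a two-column,
# tab-delimited table that csv.reader turns into key/value pairs. Below
# is a minimal, self-contained sketch of that parsing step; the file
# contents are invented for illustration, and only the "error rate" key
# is actually assumed by estimate().
import csv
import io

per_read_text = (
    "error rate\t0.0021\n"
    "total reads\t100000\n"
    "total errors\t210\n"
    "method\tsub\n")

reader = csv.reader(io.StringIO(per_read_text), delimiter="\t")
per_read_error = dict(reader)  # {'error rate': '0.0021', ...}
assert len(per_read_error) == 4
print(per_read_error["error rate"])  # values are strings, not floats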
def __call__(self, read1, read2):
    len1 = len(read1.sequence)
    len2 = len(read2.sequence)
    min_overlap = self.min_overlap
    if min_overlap <= 1:
        min_overlap = max(2, round(self.min_overlap * min(len1, len2)))
    if len1 < min_overlap or len2 < min_overlap:
        return (read1, read2)

    insert_matched = read1.insert_overlap and read2.insert_overlap

    if insert_matched:
        # If we've already determined that there is an insert overlap
        # with a 3' overhang, we can constrain our alignment
        aflags = START_WITHIN_SEQ1 | STOP_WITHIN_SEQ2
    else:
        aflags = SEMIGLOBAL
    # align read1 to read2 reverse-complement to be compatible with
    # InsertAligner
    read2_rc = reverse_complement(read2.sequence)
    aligner = Aligner(read2_rc, self.error_rate, aflags)
    alignment = aligner.locate(read1.sequence)

    if alignment:
        r2_start, r2_stop, r1_start, r1_stop, matches, errors = alignment
        if matches >= min_overlap:
            # Only correct errors if we haven't already done correction in
            # the InsertAligner
            if (self.mismatch_action and errors > 0 and
                    not insert_matched and
                    read1.corrected == 0 and read2.corrected == 0):
                self.correct_errors(read1, read2, alignment)

            if r2_start == 0 and r2_stop == len2:
                # r2 is fully contained in r1
                pass
            elif r1_start == 0 and r1_stop == len1:
                # r1 is fully contained in r2
                read1.sequence = read2_rc
                read1.qualities = "".join(reversed(read2.qualities))
            elif r1_start > 0:
                read1.sequence += read2_rc[r2_stop:]
                if read1.qualities and read2.qualities:
                    read1.qualities += "".join(
                        reversed(read2.qualities))[r2_stop:]
            elif r2_start > 0:
                read1.sequence = read2_rc + read1.sequence[r1_stop:]
                if read1.qualities and read2.qualities:
                    read1.qualities = (
                        "".join(reversed(read2.qualities)) +
                        read1.qualities[r1_stop:])
            else:
                raise AtroposError(
                    "Invalid alignment while trying to merge read "
                    "{}: {}".format(
                        read1.name,
                        ",".join(str(i) for i in alignment)))
            read1.merged = True
            read2 = None
    return (read1, read2)
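# To make the stitching arithmetic concrete, here is a small standalone
# sketch of the r1_start > 0 case (read1's 3' end overlaps the start of
# read2's reverse complement). The reverse_complement helper and the
# alignment coordinates are invented for illustration; they are not the
# Atropos implementations.
def reverse_complement(seq):
    comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
    return "".join(comp[base] for base in reversed(seq))

read1_seq = "ACGTACGTAA"
read2_seq = "GGGGTTACGT"
read2_rc = reverse_complement(read2_seq)  # "ACGTAACCCC"

# Suppose the aligner reported read2_rc[0:6] matching read1_seq[4:10].
r2_start, r2_stop = 0, 6
r1_start, r1_stop = 4, 10

# The merged read is read1 plus the unaligned 3' tail of read2_rc.
merged = read1_seq + read2_rc[r2_stop:]
print(merged)  # ACGTACGTAACCCC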
def handle_records(self, context, records):
    """Handle a sequence of records.

    Args:
        context: The pipeline context (dict).
        records: The sequence of records.
    """
    for idx, record in enumerate(records):
        try:
            self.handle_record(context, record)
        except Exception as err:
            raise AtroposError(
                "An error occurred at record {} of batch {}".format(
                    idx, context['index'])) from err
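# handle_records wraps any per-record failure in an AtroposError that
# carries the record and batch indices, chaining the original exception
# with `from err` so its traceback is preserved. A standalone sketch of
# the same pattern (BatchError and the toy records are invented for
# illustration):
class BatchError(Exception):
    pass

def process_one(record):
    if record < 0:  # stand-in for real per-record processing
        raise ValueError("negative record")

def process_batch(batch_index, records):
    for idx, record in enumerate(records):
        try:
            process_one(record)
        except Exception as err:
            # Chain the original exception so its traceback survives.
            raise BatchError(
                "An error occurred at record {} of batch {}".format(
                    idx, batch_index)) from err

process_batch(0, [1, 2, 3])      # succeeds
# process_batch(1, [1, -2, 3])   # raises BatchError from ValueError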
def _iter(self, sam):
    # zip(sam, sam) pulls two consecutive records from the same iterator,
    # pairing mate 1 with mate 2 in a name-sorted file.
    for reads in zip(sam, sam):
        if reads[0].query_name != reads[1].query_name:
            raise AtroposError(
                "Consecutive reads {}, {} in paired-end SAM/BAM file do "
                "not have the same name; make sure your file is "
                "name-sorted and does not contain any "
                "secondary/supplementary alignments.".format(
                    reads[0].query_name, reads[1].query_name))
        if reads[0].is_read1:
            assert reads[1].is_read2
        else:
            assert reads[1].is_read1
            reads = (reads[1], reads[0])
        yield tuple(self._as_sequence(r) for r in reads)
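# The pairing above relies on zip() consuming the same iterator twice
# per step, so records come out in non-overlapping consecutive pairs.
# A standalone illustration of the idiom with toy read names:
records = iter(["readA/1", "readA/2", "readB/1", "readB/2"])

# Both arguments are the same iterator; each zip step advances it twice,
# yielding ("readA/1", "readA/2") and then ("readB/1", "readB/2").
for first, second in zip(records, records):
    print(first, second)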
def __call__(self):
    # Start worker processes, reserve a thread for the reader process,
    # which we will get back after it completes
    worker_args = (
        self.input_queue, self.pipeline, self.summary_queue, self.timeout)
    self.worker_processes = launch_workers(self.threads - 1, worker_args)

    self.num_batches = enqueue_all(
        self.command_runner.iterator(), self.input_queue, self.timeout,
        self.ensure_alive)

    logging.getLogger().debug(
        "Main loop complete; saw %d batches", self.num_batches)

    # Tell the worker processes no more input is coming
    enqueue_all(
        (None,) * self.threads, self.input_queue, self.timeout,
        self.ensure_alive)

    self.after_enqueue()

    # Now that the reader process is done, it essentially
    # frees up another thread to use for a worker
    self.worker_processes.extend(
        launch_workers(1, worker_args, offset=self.threads-1))

    # Wait for all summaries to be available on queue
    def summary_timeout_callback():
        """Ensure that workers are still alive.
        """
        try:
            ensure_processes(
                self.worker_processes,
                "Workers are still alive and haven't returned summaries: {}",
                alive=False)
        except Exception as err:
            logging.getLogger().error(err)

    wait_on(
        self.summary_queue.full,
        wait_message="Waiting on worker summaries {}",
        timeout=self.timeout,
        wait=True,
        timeout_callback=summary_timeout_callback)

    # Process summary information from worker processes
    logging.getLogger().debug(
        "Processing summary information from worker processes")

    self.seen_summaries = set()
    self.seen_batches = set()

    def summary_fail_callback():
        """Raises AtroposError with workers that did not report summaries.
        """
        missing_summaries = (
            set(range(1, self.threads)) - self.seen_summaries)
        raise AtroposError(
            "Missing summaries from processes {}".format(
                ",".join(str(summ) for summ in missing_summaries)))

    for _ in range(1, self.threads+1):
        batch = dequeue(
            self.summary_queue, fail_callback=summary_fail_callback)
        worker_index, worker_batches, worker_summary = batch
        if worker_summary is None:
            raise MulticoreError(
                "Worker process {} died unexpectedly".format(worker_index))
        elif (
                'exception' in worker_summary and
                worker_summary['exception'] is not None):
            raise AtroposError(
                "Worker process {} died unexpectedly".format(worker_index),
                worker_summary['exception'])
        else:
            logging.getLogger().debug(
                "Processing summary for worker %d", worker_index)
            self.seen_summaries.add(worker_index)
            self.seen_batches |= worker_batches
            self.command_runner.summary.merge(worker_summary)

    # Check if any batches were missed
    if self.num_batches > 0:
        missing_batches = (
            set(range(1, self.num_batches+1)) - self.seen_batches)
        if len(missing_batches) > 0:
            raise AtroposError(
                "Workers did not process batches {}".format(
                    ",".join(str(batch) for batch in missing_batches)))

    self.finish()
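# The control flow above is a queue-based fan-out/fan-in: enqueue work,
# enqueue one None sentinel per worker, then collect exactly one summary
# per worker. A minimal standalone sketch of that shape using the
# standard multiprocessing module; the names here are illustrative, not
# the Atropos helpers (launch_workers, enqueue_all, dequeue, etc.).
import multiprocessing as mp

def worker(worker_index, input_queue, summary_queue):
    processed = 0
    while True:
        batch = input_queue.get()
        if batch is None:  # sentinel: no more input is coming
            break
        processed += 1
    summary_queue.put((worker_index, processed))

def main():
    n_workers = 3
    input_queue = mp.Queue()
    summary_queue = mp.Queue()
    procs = [
        mp.Process(target=worker, args=(i, input_queue, summary_queue))
        for i in range(n_workers)]
    for proc in procs:
        proc.start()
    for batch in range(10):
        input_queue.put(batch)
    for _ in range(n_workers):  # one sentinel per worker
        input_queue.put(None)
    # Collect exactly one summary per worker before joining.
    summaries = [summary_queue.get() for _ in range(n_workers)]
    for proc in procs:
        proc.join()
    print(summaries)  # e.g. [(0, 4), (2, 3), (1, 3)]

if __name__ == "__main__":
    main()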