def sam_string_to_aligned_segment(sam_string, header=None):
    """Convert a correctly formatted SAM string into a pysam AlignedSegment.

    :param sam_string: correctly formatted SAM string
    :param header: AlignmentHeader object; when falsy, a minimal single-reference
        header is synthesised from the RNAME column of the SAM string
    :return: AlignedSegment
    """
    if not header:
        # RNAME is the third tab-separated SAM column; reference length is a
        # generous placeholder since only the name matters for parsing.
        reference_name = sam_string.split("\t")[2]
        header = AlignmentHeader.from_references([reference_name], [100000000])
    return AlignedSegment.fromstring(sam_string, header)
def run(self):
    """Consume basecall results and write them out, one record per read.

    For each ``(read, res)`` pair from ``self.iterator`` this emits either a
    FASTQ record (``self.mode == 'wfq'``) or an AlignedSegment to
    ``self.output``, appends a per-read row to the run summary, and records
    ``(read_id, samples)`` in ``self.log``. Duplex reads are pairs, so their
    sample counts and read ids are combined. Reads with an empty sequence are
    skipped with a warning.
    """
    with CSVLogger(summary_file(), sep='\t') as summary:
        for read, res in self.iterator:

            seq = res['sequence']
            qstring = res.get('qstring', '*')
            # Only derive the mean qscore from the quality string when the
            # basecaller did not supply one: dict.get evaluates its default
            # eagerly, so the original computed it for every read (including
            # reads where qstring is the '*' placeholder).
            mean_qscore = res.get('mean_qscore')
            if mean_qscore is None:
                mean_qscore = mean_qscore_from_qstring(qstring)
            mapping = res.get('mapping', False)
            mods_tags = res.get('mods', [])

            if self.duplex:
                # duplex reads arrive as a (template, complement) pair
                samples = len(read[0].signal) + len(read[1].signal)
                read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
            else:
                samples = len(read.signal)
                read_id = read.read_id

            tags = [
                f'RG:Z:{read.run_id}_{self.group_key}',
                f'qs:i:{round(mean_qscore)}',
                *read.tagdata(),
                *mods_tags,
            ]

            if len(seq):
                if self.mode == 'wfq':
                    write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
                else:
                    self.output.write(
                        AlignedSegment.fromstring(
                            sam_record(read_id, seq, qstring, mapping, tags=tags),
                            self.output.header
                        )
                    )
                if self.duplex:
                    summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping))
                else:
                    summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                self.log.append((read_id, samples))
            else:
                # logger.warn is a deprecated alias of logger.warning
                logger.warning("> skipping empty sequence %s", read_id)
def run(self):
    """Filter basecalled reads against the reference and write CTC training data.

    Reads from ``self.iterator`` are kept only if they basecalled to a
    non-empty sequence, mapped, and pass the accuracy/coverage thresholds
    with no 'N' in the covered reference span. Kept reads are also written
    as SAM records to ``self.output`` and rows in the run summary. The
    surviving (signal chunk, encoded reference) pairs are shuffled and saved
    as chunks.npy / references.npy / reference_lengths.npy.
    """
    chunks = []
    targets = []
    lengths = []

    with CSVLogger(summary_file(), sep='\t') as summary:
        for read, ctc_data in self.iterator:

            seq = ctc_data['sequence']
            qstring = ctc_data['qstring']
            mean_qscore = ctc_data.get('mean_qscore', mean_qscore_from_qstring(qstring))
            mapping = ctc_data.get('mapping', False)

            self.log.append((read.read_id, len(read.signal)))

            if len(seq) == 0 or mapping is None:
                continue

            # query coverage: fraction of the basecall covered by the alignment
            cov = (mapping.q_en - mapping.q_st) / len(seq)
            # alignment accuracy: matching bases over alignment block length
            acc = mapping.mlen / mapping.blen
            refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)

            if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                continue

            self.output.write(
                AlignedSegment.fromstring(
                    sam_record(read.read_id, seq, qstring, mapping),
                    self.output.header
                )
            )
            summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

            if mapping.strand == -1:
                refseq = mappy.revcomp(refseq)

            # encode A/C/G/T (codepoints 65/67/71/84) as labels 1-4 for CTC
            target = [int(x) for x in refseq.translate({65: '1', 67: '2', 71: '3', 84: '4'})]
            targets.append(target)
            chunks.append(read.signal)
            lengths.append(len(target))

    if len(chunks) == 0:
        sys.stderr.write("> no suitable ctc data to write\n")
        return

    chunks = np.array(chunks, dtype=np.float16)
    # pad targets to a fixed-width matrix (0 = padding, labels are 1-4)
    targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
    for idx, target in enumerate(targets):
        targets_[idx, :len(target)] = target
    lengths = np.array(lengths, dtype=np.uint16)

    # shuffle; the same permutation is applied to all three arrays and to the
    # summary CSV so rows stay aligned with their chunks
    indices = np.random.permutation(typical_indices(lengths))

    chunks = chunks[indices]
    targets_ = targets_[indices]
    lengths = lengths[indices]

    summary = pd.read_csv(summary_file(), sep='\t')
    summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

    # when stdout is redirected, write next to the redirect target instead of cwd
    output_directory = '.' if sys.stdout.isatty() else dirname(realpath('/dev/fd/1'))
    np.save(os.path.join(output_directory, "chunks.npy"), chunks)
    np.save(os.path.join(output_directory, "references.npy"), targets_)
    np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths)

    sys.stderr.write("> written ctc training data\n")
    sys.stderr.write("  - chunks.npy with shape (%s)\n" % ','.join(map(str, chunks.shape)))
    sys.stderr.write("  - references.npy with shape (%s)\n" % ','.join(map(str, targets_.shape)))
    sys.stderr.write("  - reference_lengths.npy shape (%s)\n" % ','.join(map(str, lengths.shape)))
def write(self, fields):
    """Join *fields* into a tab-separated SAM line and emit it via the writer.

    :param fields: iterable of SAM column values; each is converted with str()
    """
    sam_line = '\t'.join(str(field) for field in fields)
    self.writer.write(AlignedSegment.fromstring(sam_line, self.writer.header))
def create_bam_record(sam_line):
    """Parse a single SAM alignment line into a BamRecord using a fixed test header.

    :param sam_line: one tab-separated SAM alignment line against chr1_subsampled
    :return: BamRecord wrapping the parsed pysam AlignedSegment
    """
    # SAM header fields must be TAB-delimited (SAM spec); space-separated
    # fields make pysam misparse the @SQ/@PG lines.
    sam_header = AlignmentHeader.from_text(
        "@SQ\tSN:chr1_subsampled\tLN:2755\n"
        "@PG\tID:dummy\tPN:dummy\tVN:dummy\tCL:dummy"
    )
    return BamRecord(AlignedSegment.fromstring(sam_line, sam_header))