예제 #1
0
def sam_string_to_aligned_segment(sam_string, header=None):
    """Build a pysam AlignedSegment from a single SAM-formatted line.

    When no (truthy) header is supplied, a minimal header is generated that
    contains one reference: the third tab-separated field of ``sam_string``,
    registered with a length of 100000000.

    :param sam_string: correctly formatted SAM string
    :param header: AlignmentHeader object, optional

    :return AlignedSegment
    """
    if header:
        return AlignedSegment.fromstring(sam_string, header)

    # No usable header: fabricate one from the reference-name column.
    make_header = AlignmentHeader.from_references
    reference_names = [sam_string.split("\t")[2]]
    reference_lengths = [100000000]
    fallback_header = make_header(reference_names, reference_lengths)

    return AlignedSegment.fromstring(sam_string, fallback_header)
예제 #2
0
    def run(self):
        """Write every read from ``self.iterator`` to the output and summary.

        Each item of ``self.iterator`` is a ``(read, res)`` pair where ``res``
        is a dict holding at least ``'sequence'``. Reads with a non-empty
        sequence are written as FASTQ when ``self.mode == 'wfq'`` and as an
        alignment record on ``self.output`` otherwise; a summary row and a
        ``(read_id, samples)`` entry in ``self.log`` are recorded for each.
        Reads with an empty sequence are skipped with a warning.
        """
        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, res in self.iterator:

                seq = res['sequence']
                qstring = res.get('qstring', '*')
                # The fallback score is computed eagerly, even when
                # 'mean_qscore' is already present in the result.
                mean_qscore = res.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = res.get('mapping', False)
                mods_tags = res.get('mods', [])

                if self.duplex:
                    # Duplex reads are indexed as a pair; combine both halves.
                    samples = len(read[0].signal) + len(read[1].signal)
                    read_id = '%s;%s' % (read[0].read_id, read[1].read_id)
                else:
                    samples = len(read.signal)
                    read_id = read.read_id

                # NOTE(review): `read` is indexed as a pair above when
                # self.duplex is set, yet run_id/tagdata() are accessed on it
                # directly here -- confirm the duplex container exposes them.
                tags = [
                    f'RG:Z:{read.run_id}_{self.group_key}',
                    f'qs:i:{round(mean_qscore)}',
                    *read.tagdata(),
                    *mods_tags,
                ]

                if not len(seq):
                    # Logger.warn is a deprecated alias; use warning().
                    logger.warning("> skipping empty sequence %s", read_id)
                    continue

                if self.mode == 'wfq':
                    write_fastq(read_id, seq, qstring, fd=self.fd, tags=tags)
                else:
                    self.output.write(
                        AlignedSegment.fromstring(
                            sam_record(read_id, seq, qstring, mapping, tags=tags),
                            self.output.header
                        )
                    )

                if self.duplex:
                    summary.append(duplex_summary_row(read[0], read[1], len(seq), mean_qscore, alignment=mapping))
                else:
                    summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                self.log.append((read_id, samples))
예제 #3
0
    def run(self):
        """Filter aligned reads and save them as CTC training data.

        Each item of ``self.iterator`` is a ``(read, ctc_data)`` pair. A read
        is kept only if it has a non-empty sequence, has a mapping, meets
        ``self.min_accuracy`` and ``self.min_coverage``, and its reference
        sequence contains no ``'N'``. Kept reads are written to
        ``self.output`` and to the summary file, and their signal and
        integer-encoded reference are collected.

        After the loop the collected data is subset and shuffled with the
        indices from ``typical_indices``, the summary file is rewritten with
        the same rows, and ``chunks.npy``, ``references.npy`` and
        ``reference_lengths.npy`` are saved. If no read was kept, a message is
        written to stderr and nothing is saved.
        """

        chunks = []
        targets = []
        lengths = []

        with CSVLogger(summary_file(), sep='\t') as summary:
            for read, ctc_data in self.iterator:

                seq = ctc_data['sequence']
                qstring = ctc_data['qstring']
                mean_qscore = ctc_data.get('mean_qscore', mean_qscore_from_qstring(qstring))
                mapping = ctc_data.get('mapping', False)

                # Every read is logged, including the ones filtered out below.
                self.log.append((read.read_id, len(read.signal)))

                # 'mapping' falls back to False when the key is absent, so a
                # None-only check would let it through and fail on the
                # attribute access below; skip both "no mapping" spellings.
                if len(seq) == 0 or mapping is None or mapping is False:
                    continue

                cov = (mapping.q_en - mapping.q_st) / len(seq)
                acc = mapping.mlen / mapping.blen
                refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)

                if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                    continue

                self.output.write(
                    AlignedSegment.fromstring(
                        sam_record(read.read_id, seq, qstring, mapping),
                        self.output.header
                    )
                )
                summary.append(summary_row(read, len(seq), mean_qscore, alignment=mapping))

                if mapping.strand == -1:
                    refseq = mappy.revcomp(refseq)

                # Encode A, C, G, T (code points 65, 67, 71, 84) as 1..4.
                target = [int(x) for x in refseq.translate({65: '1', 67: '2', 71: '3', 84: '4'})]
                targets.append(target)
                chunks.append(read.signal)
                lengths.append(len(target))

        if len(chunks) == 0:
            sys.stderr.write("> no suitable ctc data to write\n")
            return

        chunks = np.array(chunks, dtype=np.float16)
        # Targets are zero-padded on the right up to the longest target.
        targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
        for idx, target in enumerate(targets):
            targets_[idx, :len(target)] = target
        lengths = np.array(lengths, dtype=np.uint16)
        indices = np.random.permutation(typical_indices(lengths))

        chunks = chunks[indices]
        targets_ = targets_[indices]
        lengths = lengths[indices]

        # Re-read the summary written above and keep only the selected rows,
        # in the same order as the saved arrays.
        summary = pd.read_csv(summary_file(), sep='\t')
        summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

        # Save next to the file stdout is redirected to, or in the current
        # directory when stdout is a terminal.
        output_directory = '.' if sys.stdout.isatty() else dirname(realpath('/dev/fd/1'))
        np.save(os.path.join(output_directory, "chunks.npy"), chunks)
        np.save(os.path.join(output_directory, "references.npy"), targets_)
        np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths)

        sys.stderr.write("> written ctc training data\n")
        sys.stderr.write("  - chunks.npy with shape (%s)\n" % ','.join(map(str, chunks.shape)))
        sys.stderr.write("  - references.npy with shape (%s)\n" % ','.join(map(str, targets_.shape)))
        sys.stderr.write("  - reference_lengths.npy shape (%s)\n" % ','.join(map(str, lengths.shape)))
예제 #4
0
 def write(self, fields):
     """Write one record built from *fields* to ``self.writer``.

     Each field is converted with ``str`` and the results are joined with
     tabs; the resulting line is parsed into an AlignedSegment against the
     writer's header and then written.
     """
     sam_text = '\t'.join(str(field) for field in fields)
     record = AlignedSegment.fromstring(sam_text, self.writer.header)
     self.writer.write(record)
예제 #5
0
 def create_bam_record(sam_line):
     """Wrap a SAM-formatted line in a BamRecord.

     The line is parsed against a fixed header that declares a single
     reference, ``chr1_subsampled`` of length 2755, plus a dummy program
     entry.
     """
     # The header literals contain literal tab characters between fields.
     header_text = ("@SQ	SN:chr1_subsampled	LN:2755\n"
                    "@PG	ID:dummy	PN:dummy	VN:dummy	CL:dummy")
     dummy_header = AlignmentHeader.from_text(header_text)
     segment = AlignedSegment.fromstring(sam_line, dummy_header)
     return BamRecord(segment)