def map(self, name, seq, qual):
    """Align one read and yield a SAM-like tuple per alignment hit.

    ``U`` bases are replaced by ``T`` before mapping with ``self.aligner``.
    When the aligner reports no hit, a single unmapped record is yielded.
    Otherwise every hit yields
    ``(name, flag, contig, 1-based start, mapq, cigar, '*', 0, 0, seq, qual,
    'NM:i:<NM>')``. The CIGAR is padded with soft clips for the unaligned
    read ends; for hits that are not on the forward strand the sequence is
    reverse-complemented, the quality string reversed and the clips swapped.
    """
    seq = seq.replace('U', 'T')
    hits = list(self.aligner.map(seq))
    if not hits:
        yield (name, int(FUNMAP), '*', 0, 0, '*', '*', 0, 0, seq, qual)
        return

    for rank, hit in enumerate(hits):
        # Every hit after the first is flagged secondary; a first hit that is
        # not primary is flagged supplementary.
        if rank > 0:
            flag = int(FSECONDARY)
        elif not hit.is_primary:
            flag = int(FSUPPLEMENTARY)
        else:
            flag = 0

        clip_start = '{}S'.format(hit.q_st) if hit.q_st > 0 else ''
        clip_end = ''
        if hit.q_en < len(seq):
            clip_end = '{}S'.format(len(seq) - hit.q_en)

        if hit.strand > 0:
            out_seq, out_qual = seq, qual
            cigar = clip_start + hit.cigar_str + clip_end
        else:
            out_seq = mappy.revcomp(seq)
            out_qual = qual[::-1]
            flag |= FREVERSE
            # The record is written in reference orientation, so the read's
            # end clip comes first.
            cigar = clip_end + hit.cigar_str + clip_start

        yield (name, flag, hit.ctg, hit.r_st + 1, hit.mapq, cigar, '*', 0, 0,
               out_seq, out_qual, 'NM:i:{}'.format(hit.NM))
def extract_fastq(input_f, ref_f, mode=0):
    """Read signal and basecall from a fast5 file and fetch the aligned reference.

    Args:
        input_f: fast5 file to open with ``h5py.File`` (it is also
            concatenated into the log messages, so a path string is expected).
        ref_f: file name of the reference.
        mode: 0-dna, 1-rna, -1-rna 180mV. In both RNA modes the raw signal
            is reversed.

    Returns:
        ``(raw_signal, raw_seq, ref_seq)``. ``raw_seq`` is the stored FASTQ
        entry as bytes. ``ref_seq`` is the reference sequence under the hit
        with the highest mapping quality (reverse-complemented for
        reverse-strand hits), or ``None`` when the read could not be mapped.
    """
    with h5py.File(input_f, 'r') as input_fh:
        # ``dataset[()]`` replaces ``dataset.value``, which h5py 3 removed.
        raw_signal = list(input_fh['/Raw/Reads'].values())[0]['Signal'][()]
        raw_seq = input_fh[
            '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'][()]

    # Build the index once; the second line of the FASTQ entry is the sequence.
    ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
    aligns = list(ref.map(raw_seq.split(b'\n')[1]))

    # Keep the first hit with the highest mapping quality.
    align = None
    maxmapq = -np.inf
    for aln in aligns:
        if aln.mapq > maxmapq:
            maxmapq = aln.mapq
            align = aln

    ref_seq = None
    if align is None:
        print("FAIL MAPPING " + input_f)
    else:
        ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
        if align.strand == -1:
            ref_seq = mappy.revcomp(ref_seq)

    if (mode == 1) or (mode == -1):
        raw_signal = raw_signal[::-1]

    if ref_seq is None:
        print(input_f)
        print(aligns)
    return raw_signal, raw_seq, ref_seq
def sam_record(read_id, sequence, qstring, mapping, tags=None, sep='\t'):
    """Format a single SAM record as a string.

    Args:
        read_id: query name.
        sequence: basecalled sequence in read orientation.
        qstring: quality string in read orientation.
        mapping: alignment object exposing ``q_st``, ``q_en``, ``cigar_str``,
            ``strand``, ``ctg``, ``r_st``, ``mapq``, ``NM`` and ``MD``; a
            falsy value produces an unmapped record (flag 4).
        tags: optional iterable of extra fields appended to the record.
        sep: field separator.

    Returns:
        The record fields joined by ``sep`` (no trailing newline).
    """
    if mapping:
        forward = mapping.strand == +1
        tail = len(sequence) - mapping.q_en
        softclip = [
            '%sS' % mapping.q_st if mapping.q_st else '',
            mapping.cigar_str,
            '%sS' % tail if tail else '',
        ]
        record = [
            read_id,
            0 if forward else 16,
            mapping.ctg,
            mapping.r_st + 1,
            mapping.mapq,
            ''.join(softclip if forward else softclip[::-1]),
            '*', 0, 0,
            sequence if forward else mappy.revcomp(sequence),
            # SAM stores SEQ and QUAL in reference orientation: when the
            # sequence is reverse-complemented the qualities must be reversed
            # so each score stays attached to its base.
            qstring if forward else qstring[::-1],
            'NM:i:%s' % mapping.NM,
            'MD:Z:%s' % mapping.MD,
        ]
    else:
        record = [
            read_id, 4, '*', 0, 0, '*', '*', 0, 0, sequence, qstring, 'NM:i:0'
        ]

    if tags is not None:
        record.extend(tags)

    return sep.join(map(str, record))
def write_sam(read_id, sequence, qstring, mapping, fd=sys.stdout, unaligned=False, sep='\t'):
    """Write one SAM record, followed by a newline, to ``fd`` and flush it.

    Args:
        read_id: query name.
        sequence: basecalled sequence in read orientation.
        qstring: quality string in read orientation.
        mapping: alignment object exposing ``q_st``, ``q_en``, ``cigar_str``,
            ``strand``, ``ctg``, ``r_st``, ``mapq`` and ``NM``; ignored when
            ``unaligned`` is true.
        fd: writable text stream.
        unaligned: write an unmapped record (flag 4) instead of an alignment.
        sep: field separator.
    """
    if unaligned:
        fields = [
            read_id, 4, '*', 0, 0, '*', '*', 0, 0, sequence, qstring, 'NM:i:0'
        ]
    else:
        forward = mapping.strand == +1
        tail = len(sequence) - mapping.q_en
        softclip = [
            '%sS' % mapping.q_st if mapping.q_st else '',
            mapping.cigar_str,
            '%sS' % tail if tail else '',
        ]
        fields = [
            read_id,
            0 if forward else 16,
            mapping.ctg,
            mapping.r_st + 1,
            mapping.mapq,
            ''.join(softclip if forward else softclip[::-1]),
            '*', 0, 0,
            sequence if forward else revcomp(sequence),
            # QUAL must follow SEQ into reference orientation: reverse it
            # whenever the sequence is reverse-complemented.
            qstring if forward else qstring[::-1],
            'NM:i:%s' % mapping.NM,
        ]
    fd.write("%s\n" % sep.join(map(str, fields)))
    fd.flush()
def __init__(self, fn):
    """Load a primer table and index the primers by sequence prefix.

    ``fn`` is a tab-separated file with six columns per line: amplicon,
    name, sequence, left flag, forward flag and position. Each primer is
    registered under its sequence and under its reverse complement (the
    fifth ``Primer`` argument is ``True`` for the former and ``False`` for
    the latter). ``self.min`` / ``self.max`` track the shortest / longest
    primer length seen (``self.min`` starts at 100), and every key in
    ``self.seqs`` is truncated to ``self.min`` characters.
    """
    self.name = fn
    self.min = 100
    self.max = 0
    self.seqs = {}
    by_full_sequence = {}
    # Use a context manager so the primer file is closed deterministically.
    with open(fn) as handle:
        for line in handle:
            amplicon, name, seq, left, forward, pos = line.strip().split("\t")
            pos = int(pos)
            forward = forward.lower() in ["t", "+", "forward", "true"]
            left = left.lower() in ["left", "true", "t"]
            length = len(seq)
            if length > self.max:
                self.max = length
            if length < self.min:
                self.min = length
            by_full_sequence[seq] = Primer(
                amplicon, name, left, forward, True, pos, length
            )
            by_full_sequence[mp.revcomp(seq)] = Primer(
                amplicon, name, left, forward, False, pos, length
            )
    # Keys are cut to the shortest primer length so all lookups share one
    # prefix size; later entries win if two prefixes collide.
    for full_seq, primer in by_full_sequence.items():
        self.seqs[full_seq[: self.min]] = primer
def extract_fa(fa, rep1, rep2, known_rep1_5_fa, known_rep2_5_fa, novel_rep1_5_fa, novel_rep2_5_fa, known_rep1_3_fa, known_rep2_3_fa, novel_rep1_3_fa, novel_rep2_3_fa):
    """Write splice-site flanking sequences of back-splice junctions (BSJs).

    BSJs are read from the two replicate tables ``rep1`` and ``rep2``
    (lines starting with ``#`` are skipped; columns are looked up through
    the module-level ``idx`` mapping). Each junction is classified as
    'Known' or 'Novel' from its ``isKnownBSJ`` column and by the number of
    replicates it occurs in (1 or 2). For every junction a 5' and a 3'
    FASTA record is cut from ``fa`` and written to the output file matching
    its category and replicate count. Minus-strand sequences are
    reverse-complemented.

    Junctions whose strand is neither '+' nor '-' are reported and skipped.
    """
    bsj_dict = dd(lambda: dd(lambda: 0))
    for rep, fn in enumerate([rep1, rep2], start=1):
        with open(fn) as fp:
            for line in fp:
                if line.startswith('#'):
                    continue
                ele = line.rsplit()
                bsj = (ele[idx['chrom']], ele[idx['startCoor0base']],
                       ele[idx['endCoor']], ele[idx['canoBSJMotif']][0])
                is_known = ele[idx['isKnownBSJ']]
                bsj_dict[bsj]['Cate'] = 'Known' if 'True' in is_known else 'Novel'
                bsj_dict[bsj][rep] = 1

    record = '>{}:{}-{} {}\n{}\n'
    with open(known_rep1_5_fa, 'w') as k1_5, \
            open(known_rep2_5_fa, 'w') as k2_5, \
            open(novel_rep1_5_fa, 'w') as n1_5, \
            open(novel_rep2_5_fa, 'w') as n2_5, \
            open(known_rep1_3_fa, 'w') as k1_3, \
            open(known_rep2_3_fa, 'w') as k2_3, \
            open(novel_rep1_3_fa, 'w') as n1_3, \
            open(novel_rep2_3_fa, 'w') as n2_3:
        # (category, replicate count) -> (5' output, 3' output)
        outputs = {
            ('Known', 1): (k1_5, k1_3),
            ('Known', 2): (k2_5, k2_3),
            ('Novel', 1): (n1_5, n1_3),
            ('Novel', 2): (n2_5, n2_3),
        }
        for bsj in bsj_dict:
            (chrom, start, end, strand) = bsj
            start, end = int(start), int(end)
            ref_seq = fa[chrom]
            if strand == '+':
                seq = ref_seq[end-3:end+6].seq.upper()
                _5_str = record.format(chrom, end-2, end+6, strand, seq)
                seq = ref_seq[start-20:start+3].seq.upper()
                _3_str = record.format(chrom, start-19, start+3, strand, seq)
            elif strand == '-':
                seq = ref_seq[start-6:start+3].seq.upper()
                _5_str = record.format(chrom, start-5, start+3, strand, mp.revcomp(seq))
                seq = ref_seq[end-3:end+20].seq.upper()
                _3_str = record.format(chrom, end-2, end+20, strand, mp.revcomp(seq))
            else:
                print('Unexpected strand: {}'.format(strand))
                # Nothing was built for this junction; skip it instead of
                # writing the previous junction's records (or failing on
                # undefined names for the first junction).
                continue

            cate = bsj_dict[bsj]['Cate']
            rep_cnt = bsj_dict[bsj][1] + bsj_dict[bsj][2]
            if cate not in ('Known', 'Novel'):
                print('Unexpected cate: {}'.format(cate))
            elif (cate, rep_cnt) not in outputs:
                print('Unexpected rep_cnt: {}'.format(rep_cnt))
            else:
                out_5, out_3 = outputs[(cate, rep_cnt)]
                out_5.write(_5_str)
                out_3.write(_3_str)
def match_single(fq, primersets):
    """Yield ``(read, matches)`` for every record of a single-end file.

    Each record from ``mp.fastx_read`` (comments included) is wrapped in
    ``Read``. ``matches`` maps every primer set's ``name`` to a ``Matched``
    built from the primer-set match on the read sequence and the match on
    its reverse complement; the same read object fills both read slots.
    """
    for record in mp.fastx_read(fq, read_comment=True):
        read = Read(*record)
        reverse_seq = mp.revcomp(read.seq)
        matches = {
            pset.name: Matched(read, pset.match(read.seq),
                               read, pset.match(reverse_seq))
            for pset in primersets
        }
        yield read, matches
def get_seqs(ref_fa, chrom, strand, up_site, down_site):
    """Return names and sequences of the 5' and 3' splice-site windows.

    A 9-base 5' window and a 23-base 3' window are sliced from
    ``ref_fa[chrom]`` and upper-cased. Any strand other than '+' is handled
    as the minus strand: the windows are taken around the opposite sites and
    reverse-complemented. When the 5' window does not carry ``GT`` at offsets
    3-4 or the 3' window does not carry ``AG`` at offsets -5..-4, the 5' name
    is returned as an empty string.

    Returns:
        ``(5' name, 5' sequence, 3' name, 3' sequence)``.
    """
    contig = ref_fa[chrom]
    label = '{}:{}-{} {}'

    def window(begin, stop):
        return contig[begin:stop].seq.upper()

    if strand == '+':
        donor_name = label.format(chrom, up_site - 2, up_site + 6, strand)
        donor_seq = window(up_site - 3, up_site + 6)
        acceptor_name = label.format(chrom, down_site - 20, down_site + 2, strand)
        acceptor_seq = window(down_site - 21, down_site + 2)
    else:
        # minus strand: the donor sits at the downstream site
        donor_name = label.format(chrom, down_site - 6, down_site + 2, strand)
        donor_seq = mp.revcomp(window(down_site - 7, down_site + 2))
        acceptor_name = label.format(chrom, up_site - 2, up_site + 20, strand)
        acceptor_seq = mp.revcomp(window(up_site - 3, up_site + 20))

    canonical = donor_seq[3:5] == 'GT' and acceptor_seq[-5:-3] == 'AG'
    if not canonical:
        donor_name = ''
    return donor_name, donor_seq, acceptor_name, acceptor_seq
def _align(dna_pred):
    """Map a predicted sequence and return the aligned pieces.

    Args:
        dna_pred: predicted DNA sequence.

    Returns:
        ``(dna_pred, dna_true, dna_cigar)``: the query cut to its aligned
        span (reverse-complemented for reverse-strand hits), the reference
        cut to ``[r_st:r_en]`` and the CIGAR string of the mapping.

    Raises:
        Exception: if ``get_mapping`` returns ``None``.
    """
    mapped = get_mapping(dna_pred)
    # Identity test: ``None`` is a singleton and ``==`` could be overridden.
    if mapped is None:
        raise Exception("Unable to map prediction.")

    dna_cigar = mapped.cigar_str
    dna_true = get_reference(mapped.ctg)[mapped.r_st:mapped.r_en]
    dna_pred = dna_pred[mapped.q_st:mapped.q_en]
    if mapped.strand == -1:
        dna_pred = mp.revcomp(dna_pred)
    return dna_pred, dna_true, dna_cigar
def update_transcript(a, h):
    """Rebuild the spliced reference sequence covered by an alignment.

    The reference span ``a.seq(h.ctg, h.r_st, h.r_en)`` is walked with the
    operations of the ``cs`` string ``h.cs``:

    * ``:N`` / ``=SEQ`` (match), ``*xy`` (mismatch) and ``-seq`` (bases
      present only in the reference) copy the corresponding reference bases.
    * ``+seq`` (bases present only in the query) consumes no reference.
    * ``~xxNyy`` (intron) skips ``N`` reference bases without copying them.

    See the cs tag description in the minimap2 manual:
    https://lh3.github.io/minimap2/minimap2.html

    Args:
        a: aligner/index object providing ``seq(name, start, end)``.
        h: hit providing ``ctg``, ``r_st``, ``r_en``, ``cs`` and ``strand``.

    Returns:
        The concatenated reference bases, reverse-complemented when
        ``h.strand == -1``.

    Raises:
        Exception: if a cs group with an unknown leading character is found.
    """
    # Reference subsequence under the alignment; ``idx`` is an offset into it.
    s = a.seq(h.ctg, h.r_st, h.r_en)
    cs_regex = re.compile(r'(=[ACGTN]+|:[0-9]+|\*[acgtn][acgtn]|\-[acgtn]+|\+[acgtn]+|~[acgtn]{2}[0-9]+[acgtn]{2})')

    pieces = []
    idx = 0
    for m in cs_regex.findall(h.cs):
        op = m[0]
        if op == ':':
            # short-form match: the payload is the match length
            step = int(m[1:])
        elif op == '=':
            # long-form match: the payload is the matched sequence itself
            step = len(m) - 1
        elif op == '*':
            # single-base mismatch: keep the reference base
            step = 1
        elif op == '-':
            # bases missing from the query: keep the reference bases
            step = len(m) - 1
        elif op == '+':
            # bases only in the query: nothing to take from the reference
            continue
        elif op == '~':
            # '~ag123ag': 123 is the full length of the skipped reference
            # segment; the two 2-base splice signals are its first and last
            # bases, so they are already counted in that length.
            idx += int(m[3:-2])
            continue
        else:
            raise Exception("ERROR: failed to match cs string group '{}'".format(m))
        pieces.append(s[idx:idx + step])
        idx += step

    my_transcript = ''.join(pieces)
    # Report the transcript in the orientation of the query.
    if h.strand == -1:
        my_transcript = mp.revcomp(my_transcript)
    return my_transcript
def align_mappy(dir_in, file_out, file_fasta):
    """Align every read in ``dir_in`` and tabulate CG counts of the primary hit.

    For each read file the FASTQ is mapped against ``file_fasta`` with the
    ``map-ont`` preset. For the first primary hit, the number of ``CG``
    occurrences is counted in the aligned reference span and in the aligned
    query span (reverse-complemented for reverse-strand hits). Reads without
    a primary hit get an empty row. The table is sorted by read id and
    written to ``file_out`` as CSV; the mean read length is printed.

    Raises:
        Exception: if the aligner index cannot be loaded or built.
    """
    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")

    # Parse the reference once and key it by record id, so each hit is cut
    # from the contig it actually aligned to instead of re-reading the FASTA
    # per read and ending up with its last record.
    references = SeqIO.to_dict(SeqIO.parse(file_fasta, 'fasta'))

    reads = get_files(dir_in)
    files_fastq = {}
    data = []
    for read in tqdm(reads):
        with h5py.File(read, 'r', libver='latest') as fd:
            no_alignment = True
            fastq = read_fastq(fd)
            files_fastq[fastq.id] = len(fastq.seq)
            for hit in a.map(fastq.seq):  # Traverse alignments
                if not hit.is_primary:
                    continue
                # Reference
                ref = references[hit.ctg].seq[hit.r_st:hit.r_en]
                r_CG_num = len(re.findall(r'(CG)', str(ref)))
                # Query
                query = fastq.seq[hit.q_st:hit.q_en]
                if hit.strand == -1:
                    query = mp.revcomp(query)
                q_CG_num = len(re.findall(r'(CG)', str(query)))
                no_alignment = False
                data.append([
                    fastq.id, hit.r_st, hit.r_en, hit.q_st, hit.q_en,
                    r_CG_num, q_CG_num, hit.cigar_str
                ])
                break
            if no_alignment:
                data.append([fastq.id, '', '', '', '', 0, 0, ''])

    data = pd.DataFrame(data, columns=[
        'read_id', 'r_st', 'r_en', 'q_st', 'q_en', 'r_CG_num', 'q_CG_num',
        'cigar_str'
    ])
    data.sort_values('read_id', inplace=True)
    data.to_csv(file_out, index=False)
    # Guard the average against an empty input directory.
    if files_fastq:
        print("Average length of fastq files:",
              sum(files_fastq.values()) / len(files_fastq.values()))
def process(self) -> Optional[List[ResegmentationData]]:
    """Build resegmentation windows around the relevant motif positions.

    The basecalled sequence is aligned; ``None`` is returned when there is
    no alignment or no relevant motif position. Otherwise the signal
    intervals are derived from the sequence-to-raw mapping (insertions and
    deletions resolved) and, for every motif position whose
    ``self.window``-wide neighbourhood lies inside the aligned reference
    span, a ``ResegmentationData`` is created from the reference position,
    the event intervals, their lengths and the reference bases
    (reverse-complemented unless ``alignment.strand == 1``).
    """
    read, called = self.basecall_data
    alignment = self.align(called.seq)
    if not alignment:
        return None

    motif_offsets = self._get_relevant_motif_positions(alignment)
    if not motif_offsets:
        return None

    seq_to_raw = sequence_to_raw(read, called)
    signal_intervals, deletion_idx = CustomProcessor.resolve_insertions(
        alignment, seq_to_raw)
    signal_intervals = CustomProcessor.resolve_deletions(
        signal_intervals, deletion_idx)

    collected = []
    for offset in motif_offsets:
        first, last = offset - self.window, offset + self.window
        aligned_len = alignment.r_en - alignment.r_st
        # Skip motifs whose window would leave the aligned span.
        if first < 0 or last >= aligned_len:
            continue

        if alignment.strand == 1:
            position = alignment.r_st + offset
        else:
            position = alignment.r_en - 1 - offset

        event_intervals = signal_intervals[first:last + 1]
        event_lens = np.array([iv.end - iv.start for iv in event_intervals])

        reference = get_reference(self.reference_file, alignment.ctg)
        region = reference[position - self.window:position + self.window + 1]
        if alignment.strand == 1:
            bases = region
        else:
            bases = mappy.revcomp(region)

        assert len(event_intervals) == len(event_lens) == len(bases)
        collected.append(
            ResegmentationData(position, event_intervals, event_lens, bases))
    return collected
def analyse_CG(dir_in, file_out, file_fasta):
    """Align every read in ``dir_in`` and tabulate alignment / CG statistics.

    For each read the first primary hit (mapped with the ``map-ont`` preset
    and ``cs=True``) provides the operation counts from ``count_matches``,
    the normalized reference/query pair from ``normalize`` and the CG counts
    from ``count_CG``. Reads without a primary hit keep zero counts, an
    alignment length of 0 and a mapq of 0. The table is sorted by read id
    and written to ``file_out`` as CSV.

    Raises:
        Exception: if the aligner index cannot be loaded or built.
    """
    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")

    # Parse the reference once and key it by record id, so each hit is cut
    # from the contig it actually aligned to instead of re-reading the FASTA
    # per read and ending up with its last record.
    references = SeqIO.to_dict(SeqIO.parse(file_fasta, 'fasta'))

    reads = get_files(dir_in)
    data = []
    for read in tqdm(reads):
        with h5py.File(read, 'r', libver='latest') as fd:
            matches = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
            CG_cnt = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
            fastq = read_fastq(fd)
            ref = ''
            mapq = 0
            for hit in a.map(fastq.seq, cs=True):  # Traverse alignments
                if not hit.is_primary:
                    continue
                # Alignment
                matches = count_matches(hit.cs)
                # Reference
                ref = references[hit.ctg].seq[hit.r_st:hit.r_en]
                # Query
                query = fastq.seq[hit.q_st:hit.q_en]
                if hit.strand == -1:
                    query = mp.revcomp(query)
                # Normalize
                ref, query = normalize(ref, query, hit.cigar_str)
                # Analyse CG motif
                CG_cnt = count_CG(ref, query)
                mapq = hit.mapq
                break
            data.append([
                fastq.id, len(ref),
                matches['M'], matches['X'], matches['D'], matches['I'],
                CG_cnt['M'], CG_cnt['X'], CG_cnt['D'], CG_cnt['I'],
                mapq
            ])

    data = pd.DataFrame(data, columns=[
        'read_id', 'alignment_len', 'M', 'X', 'D', 'I',
        'M_CG', 'X_CG', 'D_CG', 'I_CG', 'mapq'
    ])
    data.sort_values('read_id', inplace=True)
    data.to_csv(file_out, index=False)
def _get_mod_likelihoods(self, read):
    """Return per-base modification likelihoods of ``read`` as a numpy array.

    The parameter `read` is an AlignedSegment from pysam and must contain
    the full sequence of the read, i.e. no secondary alignments or hard
    clips.

    The stored observations for the read are looked up in ``self.db`` by
    query name; there must be exactly one observation per ``C`` of the
    sequence in read orientation.

    Returns:
        A ``uint8`` array of length ``read.query_length`` holding the
        observations at the ``C`` positions of the read-orientation sequence
        and 0 elsewhere, or ``None`` when no data is stored for the read or
        the number of observations does not match the number of ``C`` bases.
    """
    import numpy as np
    import logging as log
    import mappy

    meth = self.db.get(read.query_name)
    if meth is None:
        log.info(f"Couldn't find modification data for {read.query_name}")
        return None

    # The read from mapping is given in the reference orientation but the
    # extraction was done in the read orientation.
    if read.is_reverse:
        read_sequence = mappy.revcomp(read.query_sequence)
    else:
        read_sequence = read.query_sequence

    # Explicit check instead of assert/except: asserts are stripped under
    # ``python -O``, which would let a mismatch reach the array assignment.
    c_count = read_sequence.count("C")
    if len(meth) != c_count:
        log.info(
            "Unexpected number of methylation observations {} instead of {}.\n Read {}"
            .format(len(meth), c_count, read.tostring()))
        log.info(
            "Query length: {} len(read sequence): {} Inferred read length: {}"
            .format(read.query_length, len(read_sequence),
                    read.infer_read_length()))
        return None

    # Tabulate methylation values to an array
    meth_like = np.zeros(read.query_length, np.uint8)
    # ``np.frombuffer`` on the encoded bytes replaces the deprecated
    # binary mode of ``np.fromstring``.
    read_sequence_arr = np.frombuffer(read_sequence.encode("ascii"), dtype="|S1")
    mod_indices = np.where(read_sequence_arr == b"C")[0]
    # 255*P(Data | base modified, base called): likelihood of modification
    # for each base of the called sequence (read_sequence).
    meth_like[mod_indices] = meth
    return meth_like
def extract_fastq(input_f, ref_f, mode=0, trans_start=None, alignment=True):
    """Read signal, basecall and events from a fast5 file, optionally aligning.

    Args:
        input_f: fast5 file to open with ``h5py.File`` (it is also
            concatenated into the log messages, so a path string is expected).
        ref_f: file name of the reference.
        mode: 0-dna, 1-rna, -1-rna 180mV. For non-zero modes the signal,
            sequence and events come from ``_decap``; for modes 1 and -1 the
            returned raw signal is reversed.
        trans_start: start position of the transcription (required when
            ``mode`` is not 0).
        alignment: whether to align the read against the reference.

    Returns:
        ``(raw_signal, raw_seq, ref_seq, decap_event)``. ``ref_seq`` is the
        reference sequence under the hit with the highest mapping quality
        (reverse-complemented for reverse-strand hits), or ``None`` when
        alignment is disabled or the read could not be mapped.
    """
    with h5py.File(input_f, 'r') as input_fh:
        raw_entry = list(input_fh['/Raw/Reads'].values())[0]
        # ``dataset[()]`` replaces ``dataset.value``, which h5py 3 removed.
        raw_signal = raw_entry['Signal'][()]
        raw_seq = input_fh[BASECALL_ENTRY + '/BaseCalled_template/Fastq'][()]
        if mode != 0:
            assert trans_start is not None
            raw_signal, raw_seq, decap_event = _decap(
                input_fh, trans_start, raw_signal, raw_seq)
        else:
            decap_event = input_fh[BASECALL_ENTRY
                                   + '/BaseCalled_template/Events'][()]

    align = None
    ref_seq = None
    if alignment:
        ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
        # The second line of the FASTQ entry is the sequence.
        aligns = ref.map(raw_seq.split(b'\n')[1])
        # Keep the first hit with the highest mapping quality.
        maxmapq = -np.inf
        for aln in aligns:
            if aln.mapq > maxmapq:
                maxmapq = aln.mapq
                align = aln
        if align is None:
            print("FAIL MAPPING " + input_f)
        else:
            ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
            if align.strand == -1:
                ref_seq = mappy.revcomp(ref_seq)

    if (mode == 1) or (mode == -1):
        raw_signal = raw_signal[::-1]
    if ref_seq is None and alignment:
        print("No Reference sequence found in %s" % (input_f))
    return raw_signal, raw_seq, ref_seq, decap_event
def get_motif_positions(reference_file: str, motif: str, index: int) -> Dict[str, Tuple[Set[int], Set[int]]]:
    """Locate a motif on both strands of every record in a FASTA file.

    ``motif`` is used as a case-insensitive regular expression. For each
    match, ``index`` is added to the match start to pick the base of
    interest. Matches on the reverse complement are converted back to
    forward-strand coordinates.

    Returns:
        A dict mapping each record id to ``(forward positions, reverse
        positions)``, both as sets of 0-based forward-strand coordinates.
    """
    records = SeqIO.to_dict(SeqIO.parse(reference_file, 'fasta'))
    found = dict()
    for contig_name in records:
        sequence = str(records[contig_name].seq)
        last_index = len(sequence) - 1

        # Forward strand
        forward = {
            hit.start() + index
            for hit in re.finditer(motif, sequence, re.I)
        }

        # Reverse strand: offset x on the reverse complement corresponds to
        # forward coordinate len - 1 - x.
        reverse = {
            last_index - (hit.start() + index)
            for hit in re.finditer(motif, mappy.revcomp(sequence), re.I)
        }

        found[contig_name] = forward, reverse
    return found
def test_write_sequence_coverage_minimap_hits_reversed(projects, sequence_report):
    """A contig whose second half is reverse-complemented yields a reversed hit.

    The contig joins reference bases 1001-1100 with the reverse complement
    of bases 2001-2100; the second minimap hit is expected to be reported
    with descending reference coordinates (2100 -> 2001).
    """
    hxb2_name = 'HIV1-B-FR-K03455-seed'
    reference = projects.getReference(hxb2_name)
    forward_part = reference[1000:1100]
    reversed_part = revcomp(reference[2000:2100])
    contig_seq = forward_part + reversed_part
    expected_minimap_hits = (
        'contig,ref_name,start,end,ref_start,ref_end\n'
        '1-my-contig,HIV1-B-FR-K03455-seed,1,100,1001,1100\n'
        '1-my-contig,HIV1-B-FR-K03455-seed,101,200,2100,2001\n'
    )
    hits_file = StringIO()

    sequence_report.projects = projects
    sequence_report.write_genome_coverage_header(StringIO())
    sequence_report.write_minimap_hits_header(hits_file)
    sequence_report.write_sequence_coverage_counts('1-my-contig',
                                                   hxb2_name,
                                                   contig_seq)

    assert hits_file.getvalue() == expected_minimap_hits
def custom_processor(basecall_data: Tuple[ReadData, CalledReadData], aligner: mappy.Aligner, reference_file: str, motif_positions: Dict[str, Tuple[Set[int], Set[int]]], mapq: int, window: int) -> ResegmentationData:
    """Build resegmentation windows around the relevant motif positions.

    The basecalled sequence is aligned with ``align``; ``None`` is returned
    when there is no alignment or no relevant motif position. Otherwise the
    signal intervals are derived from the sequence-to-raw mapping
    (insertions and deletions resolved) and, for every motif position whose
    ``window``-wide neighbourhood lies inside the aligned reference span, a
    ``ResegmentationData`` is created from the reference position, the
    event intervals, their lengths and the reference bases
    (reverse-complemented unless ``alignment.strand == 1``).

    NOTE(review): the function returns a list of ``ResegmentationData`` or
    ``None``, although the annotation names a single ``ResegmentationData``.
    """
    read, called = basecall_data
    alignment = align(aligner, called.seq, mapq)
    if not alignment:
        return None

    motif_offsets = get_relevant_motif_positions(motif_positions, alignment)
    if not motif_offsets:
        return None

    seq_to_raw = sequence_to_raw(read, called)
    signal_intervals, deletion_idx = resolve_insertions(alignment, seq_to_raw)
    signal_intervals = resolve_deletions(signal_intervals, deletion_idx)

    collected = []
    for offset in motif_offsets:
        first, last = offset - window, offset + window
        aligned_len = alignment.r_en - alignment.r_st
        # Skip motifs whose window would leave the aligned span.
        if first < 0 or last >= aligned_len:
            continue

        if alignment.strand == 1:
            position = alignment.r_st + offset
        else:
            position = alignment.r_en - 1 - offset

        event_intervals = signal_intervals[first:last + 1]
        event_lens = np.array([iv.end - iv.start for iv in event_intervals])

        reference = get_reference(reference_file, alignment.ctg)
        region = reference[position - window:position + window + 1]
        if alignment.strand == 1:
            bases = region
        else:
            bases = mappy.revcomp(region)

        assert len(event_intervals) == len(event_lens) == len(bases)
        collected.append(
            ResegmentationData(position, event_intervals, event_lens, bases))
    return collected
def run(self):
    """Consume decoded chunks from the queue and write CTC training data.

    Jobs ``(chunks, predictions)`` are read from ``self.queue`` until a
    ``None`` sentinel arrives. Each prediction is decoded with
    ``self.model``, the sequence is mapped with ``self.aligner`` and the
    first hit whose reference span contains no ``N`` supplies the target
    (reverse-complemented for reverse-strand hits). Chunks whose accuracy
    and coverage both exceed ``self.min_accuracy`` are kept, filtered with
    ``filter_chunks`` and saved as ``chunks.npy``, ``chunk_lengths.npy``,
    ``references.npy`` and ``reference_lengths.npy`` in the current
    directory when stdout is a terminal, otherwise in the directory stdout
    is redirected to. Nothing is written if no chunk qualifies.
    """
    chunks = []
    targets = []
    target_lens = []

    while True:
        job = self.queue.get()
        if job is None:
            break
        chunks_, predictions = job

        # convert logprobs to probs
        predictions = np.exp(predictions.astype(np.float32))

        for chunk, pred in zip(chunks_, predictions):
            try:
                sequence = self.model.decode(pred)
            except Exception:
                # Best effort: skip chunks that fail to decode, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue

            if not sequence:
                continue

            for mapping in self.aligner.map(sequence):
                cov = (mapping.q_en - mapping.q_st) / len(sequence)
                acc = mapping.mlen / mapping.blen
                # r_st / r_en are 0-based half-open, which is what
                # ``Aligner.seq`` expects; adding 1 would drop the first
                # aligned reference base.
                refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)
                if 'N' in refseq:
                    continue
                if mapping.strand == -1:
                    refseq = revcomp(refseq)
                break
            else:
                # no usable mapping for this chunk
                continue

            # NOTE(review): coverage is compared against min_accuracy as well.
            if acc > self.min_accuracy and cov > self.min_accuracy:
                chunks.append(chunk.squeeze())
                # encode A, C, G, T as 1, 2, 3, 4
                targets.append([
                    int(x) for x in refseq.translate(
                        {65: '1', 67: '2', 71: '3', 84: '4'})
                ])
                target_lens.append(len(refseq))

    if len(chunks) == 0:
        return

    chunks = np.array(chunks, dtype=np.float32)
    chunk_lens = np.full(chunks.shape[0], chunks.shape[1], dtype=np.int16)

    # zero-padded target matrix
    targets_ = np.zeros((chunks.shape[0], max(target_lens)), dtype=np.uint8)
    for idx, target in enumerate(targets):
        targets_[idx, :len(target)] = target

    target_lens = np.array(target_lens, dtype=np.uint16)

    training = ChunkDataSet(chunks, chunk_lens, targets_, target_lens)
    training = filter_chunks(training)

    output_directory = '.' if sys.stdout.isatty() else dirname(
        realpath('/dev/fd/1'))
    np.save(os.path.join(output_directory, "chunks.npy"),
            training.chunks.squeeze(1))
    np.save(os.path.join(output_directory, "chunk_lengths.npy"),
            training.chunk_lengths)
    np.save(os.path.join(output_directory, "references.npy"),
            training.targets)
    np.save(os.path.join(output_directory, "reference_lengths.npy"),
            training.target_lengths)

    sys.stderr.write("> written ctc training data\n")
    sys.stderr.write(" - chunks.npy with shape (%s)\n" %
                     ','.join(map(str, training.chunks.squeeze(1).shape)))
    sys.stderr.write(" - chunk_lengths.npy with shape (%s)\n" %
                     ','.join(map(str, training.chunk_lengths.shape)))
    sys.stderr.write(" - references.npy with shape (%s)\n" %
                     ','.join(map(str, training.targets.shape)))
    sys.stderr.write(" - reference_lengths.npy shape (%s)\n" %
                     ','.join(map(str, training.target_lengths.shape)))
def run(self):
    """Write SAM records, a summary and CTC training data for mapped reads.

    For every ``(read, ctc_data)`` from ``self.iterator`` the read is logged
    and then skipped unless it has a non-empty sequence and a mapping whose
    accuracy and coverage reach ``self.min_accuracy`` /
    ``self.min_coverage`` and whose reference span contains no ``N``.
    Accepted reads are written with ``write_sam`` to ``self.fd``, added to
    the summary file, and their signal / encoded reference become a
    training example. The examples are restricted and shuffled via
    ``typical_indices``, the summary file is rewritten in the same order,
    and ``chunks.npy``, ``references.npy`` and ``reference_lengths.npy`` are
    saved in the current directory when stdout is a terminal, otherwise in
    the directory stdout is redirected to.
    """
    chunks = []
    targets = []
    lengths = []

    with CSVLogger(summary_file(), sep='\t') as summary:
        for read, ctc_data in self.iterator:
            seq = ctc_data['sequence']
            qstring = ctc_data['qstring']
            mean_qscore = ctc_data['mean_qscore']
            mapping = ctc_data.get('mapping', False)

            self.log.append((read.read_id, len(read.signal)))

            # Skip unmapped reads: ``mapping`` is None when the aligner
            # found nothing and False when the key is absent altogether.
            if len(seq) == 0 or not mapping:
                continue

            cov = (mapping.q_en - mapping.q_st) / len(seq)
            acc = mapping.mlen / mapping.blen
            refseq = self.aligner.seq(mapping.ctg, mapping.r_st, mapping.r_en)

            if acc < self.min_accuracy or cov < self.min_coverage or 'N' in refseq:
                continue

            write_sam(read.read_id, seq, qstring, mapping, fd=self.fd,
                      unaligned=False)
            summary.append(
                summary_row(read, len(seq), mean_qscore, alignment=mapping))

            if mapping.strand == -1:
                refseq = revcomp(refseq)

            # encode A, C, G, T as 1, 2, 3, 4
            target = [
                int(x) for x in refseq.translate(
                    {65: '1', 67: '2', 71: '3', 84: '4'})
            ]
            targets.append(target)
            chunks.append(read.signal)
            lengths.append(len(target))

    if len(chunks) == 0:
        sys.stderr.write("> no suitable ctc data to write\n")
        return

    chunks = np.array(chunks, dtype=np.float16)
    # zero-padded target matrix
    targets_ = np.zeros((chunks.shape[0], max(lengths)), dtype=np.uint8)
    for idx, target in enumerate(targets):
        targets_[idx, :len(target)] = target
    lengths = np.array(lengths, dtype=np.uint16)

    indices = np.random.permutation(typical_indices(lengths))

    chunks = chunks[indices]
    targets_ = targets_[indices]
    lengths = lengths[indices]

    # Keep the summary rows in the same order as the saved arrays.
    summary = pd.read_csv(summary_file(), sep='\t')
    summary.iloc[indices].to_csv(summary_file(), sep='\t', index=False)

    output_directory = '.' if sys.stdout.isatty() else dirname(
        realpath('/dev/fd/1'))
    np.save(os.path.join(output_directory, "chunks.npy"), chunks)
    np.save(os.path.join(output_directory, "references.npy"), targets_)
    np.save(os.path.join(output_directory, "reference_lengths.npy"), lengths)

    sys.stderr.write("> written ctc training data\n")
    sys.stderr.write(" - chunks.npy with shape (%s)\n" %
                     ','.join(map(str, chunks.shape)))
    sys.stderr.write(" - references.npy with shape (%s)\n" %
                     ','.join(map(str, targets_.shape)))
    sys.stderr.write(" - reference_lengths.npy shape (%s)\n" %
                     ','.join(map(str, lengths.shape)))
def _strandsim_log(level, text):
    """Print ``text`` prefixed with the current timestamp and ``level``."""
    now = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    print('[' + now + '][' + level + '] ' + text)


def _pipe_to_sorted_bam(sam_cmd, bam_cmd, bam_path):
    """Pipe the output of ``sam_cmd`` into ``bam_cmd`` with stdout on ``bam_path``.

    The producer is run inside a ``Popen`` context so its pipe is closed and
    the process is waited for; stderr of both commands is discarded.
    """
    with subprocess.Popen(sam_cmd, stderr=subprocess.DEVNULL,
                          stdout=subprocess.PIPE) as producer:
        with open(bam_path, 'wb') as bout:
            subprocess.run(bam_cmd, stdin=producer.stdout,
                           stderr=subprocess.DEVNULL, stdout=bout)


def StrandSim(w, c):
    '''
    Perform first part of strand-seq simulations and re-align to the original haplotype

    ``w`` describes the region (``chrom``, ``start``, ``end``) and ``c``
    carries the simulation settings. Steps, as performed below:

    1. Extract the region from the haplotype FASTA ``c.ffile`` and simulate
       paired-end reads from it with ``wgsim``.
    2. Rename the reads and map them back to the haplotype with minimap2,
       sorting with samtools.
    3. Split the alignments into Watson and Crick read pairs with ``WR`` /
       ``CR`` (restricted by the SCE intervals selected for the current
       cell/haplotype, if any) and optionally mix in noise.
    4. Write Watson and Crick FASTQ files and map both to ``c.REF``,
       producing ``<r_number>.W.srt.bam`` and ``<r_number>.C.srt.bam`` in
       ``c.haplodir``.

    If the chromosome is missing from the haplotype FASTA a warning is
    printed and nothing is simulated. Intermediate files are removed.
    '''
    hfa = pyfaidx.Fasta(c.ffile)
    if w.chrom not in hfa.keys():
        _strandsim_log('Warning', 'Chromosome ' + w.chrom + ' not found in '
                       + c.ffile + '. Skipped simulation')
        return

    _strandsim_log('Message', 'Preparing simulation from ' + c.ffile
                   + '. Haplotype ' + str(c.hapnumber))
    chr_ = hfa[w.chrom]
    seq_ = chr_[w.start - 1:w.end].seq
    tmpfa = os.path.abspath(c.haplodir + '/' + 'htmp.fa')
    region = w.chrom + '_' + str(w.start) + '_' + str(w.end)
    with open(tmpfa, 'w') as tmpfout:  # write temporary fa for sampling reads
        tmpfout.write('>' + region + '\n'
                      + '\n'.join(re.findall('.{1,60}', seq_)) + '\n')

    Ns = seq_.count('N')  # normalize coverage on Ns
    # halved for paired-end sequencing
    Nreads = round(((c.regioncoverage * (len(seq_) - Ns)) / c.length) / 2)
    mate1h = os.path.abspath(c.haplodir + '/hr1.tmp.fq')
    mate2h = os.path.abspath(c.haplodir + '/hr2.tmp.fq')
    hapcov = Nreads * c.length * 2 / ((w.end - w.start) - Ns)
    _strandsim_log('Message',
                   'Simulated coverage for this region will be ' + str(hapcov))
    _strandsim_log('Message', 'Simulating')
    wgsim.core(r1=mate1h, r2=mate2h, ref=tmpfa, err_rate=c.error,
               mut_rate=c.mutation, indel_frac=c.indels,
               indel_ext=c.extindels, N=Nreads, dist=c.distance,
               stdev=c.stdev, size_l=c.length, size_r=c.length, max_n=0.05,
               is_hap=0, is_fixed=0, seed=0)
    os.remove(tmpfa)

    # Re-write the simulated reads with cell/haplotype-specific names.
    mate1hnew = os.path.abspath(c.haplodir + '/hr1.fq')
    mate2hnew = os.path.abspath(c.haplodir + '/hr2.fq')
    name_prefix = '@c' + str(c.singlecellnum) + 'h' + str(c.hapnumber) + 'fh_'
    with open(mate1hnew, 'w') as out1, open(mate2hnew, 'w') as out2:
        for (name1, seq1, qual1), (name2, seq2, qual2) in zip(
                mp.fastx_read(mate1h), mp.fastx_read(mate2h)):
            read1 = [name_prefix + name1, seq1, '+', qual1]
            read2 = [name_prefix + name2, seq2, '+', qual2]
            out1.write('\n'.join(read1) + '\n')
            out2.write('\n'.join(read2) + '\n')
    os.remove(mate1h)
    os.remove(mate2h)

    _strandsim_log('Message',
                   'Mapping simulated reads to the corresponding haplotype')
    BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.srt.bam')
    sam_cmd = [
        'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
        '-t', str(c.threads), c.ffile, mate1hnew, mate2hnew
    ]
    bam_cmd = [
        'samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM
    ]
    _pipe_to_sorted_bam(sam_cmd, bam_cmd, BAM)
    os.remove(mate1hnew)
    os.remove(mate2hnew)

    # now re-parse BAM file to keep only Watson/Crick reads
    # Watson reads: read1 forward, read2 reverse
    # Crick reads: read2 forward, read1 reverse
    ivf = None
    if len(c.sce_bedregion) != 0:
        sce_string = ''
        for s in c.sce_bedregion:
            if s[3] == c.cellid and s[4] == c.hapid:
                sce_string += s.chrom + '\t' + str(s.start) + '\t' + str(
                    s.end) + '\n'
        if sce_string != '':
            sce_fromscratch = pybedtools.BedTool(sce_string.rstrip(),
                                                 from_string=True)
            # intervals where to perform SCE events
            ivf = sce_fromscratch.as_intervalfile()
            _strandsim_log(
                'Message',
                'Detected one ore more SCE event for current cell/haplotype')

    _strandsim_log('Message',
                   'Extracting Watson (R1F,R2R) and Crick (R1R,R2F) reads')
    save = pysam.set_verbosity(0)
    # until-eof consumes the bamfile
    bamstrand = pysam.AlignmentFile(BAM, 'rb', require_index=False)
    pysam.set_verbosity(save)
    Wreads = list(WR(bamstrand, ivf))
    bamstrand.close()
    save = pysam.set_verbosity(0)
    # re-open for second round
    bamstrand = pysam.AlignmentFile(BAM, 'rb', require_index=False)
    pysam.set_verbosity(save)
    Creads = list(CR(bamstrand, ivf))
    bamstrand.close()
    os.remove(BAM)

    if c.noise > 0:
        _strandsim_log('Message', 'Adding noise to strands')
        CtoW = random.sample(Creads, round(len(Wreads) / 100 * c.noise))
        Wreads += CtoW
        # NOTE(review): this samples from Wreads after the Crick noise was
        # appended to it — confirm that is intended.
        WtoC = random.sample(Wreads, round(len(Creads) / 100 * c.noise))
        Creads += WtoC

    _strandsim_log('Message', 'Writing Watson and Crick FASTQ')
    w1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w1.fq')
    w2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.w2.fq')
    c1 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c1.fq')
    c2 = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.c2.fq')
    qual = '2' * c.length
    with open(w1, 'w') as wout1, open(w2, 'w') as wout2:
        for r1, r2 in Wreads:
            if r1.get_tag('OS') == 'W':  # this is true W
                read1 = ['@' + r1.query_name, r1.query_sequence, '+', qual]
                read2 = ['@' + r2.query_name,
                         mp.revcomp(r2.query_sequence), '+', qual]
            else:  # write to Watson, but is Crick
                read1 = ['@' + r1.query_name,
                         mp.revcomp(r1.query_sequence), '+', qual]
                read2 = ['@' + r2.query_name, r2.query_sequence, '+', qual]
            wout1.write('\n'.join(read1) + '\n')
            wout2.write('\n'.join(read2) + '\n')
    with open(c1, 'w') as cout1, open(c2, 'w') as cout2:
        for r1, r2 in Creads:
            if r1.get_tag('OS') == 'C':  # this is true C
                read1 = ['@' + r1.query_name,
                         mp.revcomp(r1.query_sequence), '+', qual]
                read2 = ['@' + r2.query_name, r2.query_sequence, '+', qual]
            else:  # write to Crick, but is Watson
                read1 = ['@' + r1.query_name, r1.query_sequence, '+', qual]
                read2 = ['@' + r2.query_name,
                         mp.revcomp(r2.query_sequence), '+', qual]
            cout1.write('\n'.join(read1) + '\n')
            cout2.write('\n'.join(read2) + '\n')

    _strandsim_log('Message',
                   'Mapping Watson and Crick reads to the original reference')
    for strand_label, fq1, fq2 in (('W', w1, w2), ('C', c1, c2)):
        BAM = os.path.abspath(c.haplodir + '/' + str(c.r_number) + '.'
                              + strand_label + '.srt.bam')
        sam_cmd = [
            'minimap2', '-ax', 'sr', '--MD', '--cs', '-Y', '--sam-hit-only',
            '-t', str(c.threads), '-R', '@RG\\tID:illumina\\tSM:strand',
            c.REF, fq1, fq2
        ]
        bam_cmd = [
            'samtools', 'sort', '-@', str(round(c.threads / 2)), '-o', BAM
        ]
        _pipe_to_sorted_bam(sam_cmd, bam_cmd, BAM)
        os.remove(fq1)
        os.remove(fq2)
def determine_consensus(name, fasta, fastq_reads_full, fastq_reads_partial, counter):
    '''Build a polished consensus sequence from one read's subreads.

    Steps, in order:
      1. Subsample the FASTQ subreads and write them (renamed with a running
         suffix) to ``<temp_folder>/<counter>_subsampled.fastq``.
      2. Pick up to 20 FASTA subreads, orient them against the first picked
         one with minimap2 (mappy) and build a POA consensus.
      3. Map the subsampled FASTQ reads to that consensus, write the primary
         hits to a PAF-like overlap file and polish with racon.
      4. Polish the racon result with medaka.

    Relies on names defined elsewhere in the file: ``temp_folder``,
    ``subsample``, ``poa_aligner``, ``racon``, ``medaka``, ``read_fasta``,
    ``mm`` and ``np``.

    Args:
        name: not used by the body; kept so existing callers keep working.
        fasta: mapping of subread name -> sequence.
        fastq_reads_full: list of subread records indexable as
            (name, sequence, quality, fourth_field).
            NOTE(review): the fourth field looks like a length - confirm
            against the caller.
        fastq_reads_partial: list of records with the same layout; only used
            when there are fewer than ``subsample`` full subreads.
        counter: string used to build the per-read temporary file names.

    Returns:
        The last sequence found in medaka's ``consensus.fasta``; if that file
        yields no records, the racon-polished sequence (or the POA consensus
        when racon produced nothing). An empty string when there are no FASTQ
        or no FASTA subreads to work with.
    '''
    corrected_consensus = ''
    fasta_reads = list(fasta.items())

    work_prefix = temp_folder + '/' + counter
    out_Fq = work_prefix + '_subsampled.fastq'
    poa_cons = temp_folder + '/consensus.' + counter + '.fasta'
    output_cons = temp_folder + '/corrected_consensus.' + counter + '.fasta'
    overlap = temp_folder + '/overlaps.' + counter + '.paf'

    # The original opened these three files up front (and leaked the handles
    # when there was nothing to do). Keep creating/truncating them so the
    # on-disk result is the same, but never hold a handle open.
    # '<counter>.fasta' was never written to; whether anything downstream
    # expects it is not visible here, so it is still created.
    for placeholder in (work_prefix + '.fasta', out_Fq, overlap):
        with open(placeholder, 'w'):
            pass

    fastq_reads = fastq_reads_full + fastq_reads_partial
    if not fastq_reads:
        return corrected_consensus

    # With few full-length subreads use everything (full + partial);
    # otherwise draw `subsample` full-length subreads without replacement.
    if len(fastq_reads_full) < subsample:
        subsample_fastq_reads = fastq_reads
    else:
        indeces = np.random.choice(np.arange(0, len(fastq_reads_full)),
                                   min(len(fastq_reads_full), subsample),
                                   replace=False)
        subsample_fastq_reads = [fastq_reads_full[index] for index in indeces]

    # Give every subsampled read a unique name by appending a running number.
    numbered_reads = []
    with open(out_Fq, 'w') as out:
        for subread_counter, read in enumerate(subsample_fastq_reads, start=1):
            numbered_name = read[0] + '_' + str(subread_counter)
            out.write('@' + numbered_name + '\n' + read[1] + '\n+\n' + read[2] + '\n')
            numbered_reads.append((numbered_name, read[1], read[2], read[3]))

    if not fasta_reads:
        # Nothing to seed the consensus with (previously an IndexError).
        return corrected_consensus

    indeces = np.random.choice(np.arange(0, len(fasta_reads)),
                               min(len(fasta_reads), 20),
                               replace=False)
    subsample_fasta_reads = [fasta_reads[index] for index in indeces]

    # Orient every sampled subread like the first one so the POA sees a
    # single strand.
    first = subsample_fasta_reads[0][1]
    sequences = []
    mm_align = mm.Aligner(seq=first, preset='map-ont')
    for _, sequence in subsample_fasta_reads:
        for hit in mm_align.map(sequence):
            if hit.is_primary:
                if hit.strand == 1:
                    sequences.append(sequence)
                elif hit.strand == -1:
                    sequences.append(mm.revcomp(sequence))

    if not sequences:
        # No primary alignment at all: fall back to the seed subread instead
        # of failing on sequences[0].
        consensus_sequence = first
    elif len(sequences) <= 2:
        consensus_sequence = sequences[0]
    else:
        # Only run the POA when its result is actually used.
        res = poa_aligner.msa(sequences, out_cons=True, out_msa=False)
        consensus_sequence = res.cons_seq[0] if res.cons_seq else sequences[0]

    with open(poa_cons, 'w') as out_cons_file:
        out_cons_file.write('>Consensus\n' + consensus_sequence + '\n')

    # Overlaps of the subsampled reads against the POA consensus, for racon.
    mm_align = mm.Aligner(seq=consensus_sequence, preset='map-ont')
    with open(overlap, 'w') as overlap_fh:
        for read_name, sequence, _qual, _fourth in numbered_reads:
            for hit in mm_align.map(sequence):
                if hit.is_primary:
                    overlap_fh.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                            read_name, str(len(sequence)), hit.q_st, hit.q_en,
                            hit.strand, 'Consensus', hit.ctg_len, hit.r_st,
                            hit.r_en, hit.mlen, hit.blen, hit.mapq))

    os.system('%s -q 5 -t 1 --no-trimming %s %s %s >%s 2>./racon_messages.txt'
              % (racon, out_Fq, overlap, poa_cons, output_cons))

    reads = read_fasta(output_cons)
    if len(reads) == 0:
        print('racon no')
        reads = read_fasta(poa_cons)

    # Rewrite the draft under a fixed header; this file is medaka's input.
    with open(output_cons, 'w') as forMedaka:
        for read in reads:
            corrected_consensus = reads[read]
            forMedaka.write('>Corrected_Consensus\n' + corrected_consensus + '\n')

    medaka_dir = work_prefix
    os.makedirs(medaka_dir, exist_ok=True)
    os.system('%s -f -i %s -d %s -o %s > %s_medaka_messages.txt 2>&1'
              % (medaka, out_Fq, output_cons, medaka_dir, medaka_dir))

    reads = read_fasta(medaka_dir + '/consensus.fasta')
    for read in reads:
        # If medaka wrote no record, the racon/POA sequence assigned above is
        # returned unchanged.
        corrected_consensus = reads[read]
    return corrected_consensus
def _write_consensus_record(name, sequence, plus, minus, direction, trim,
                            out, out3, out5, out10X):
    """Write one oriented consensus read and its flanking sequences.

    ``plus``/``minus`` are the positions of the single '+' and '-' adapter
    hits in ``sequence``. ``out10X`` is None when barcode output is disabled.
    """
    seq = sequence[plus:minus]
    ada = sequence[max(plus - 40, 0):minus + 40]
    name += '_' + str(len(seq))
    if direction == '+':
        out.write('>%s\n%s\n' % (name, seq if trim else ada))
        out5.write('>%s\n%s\n' % (name, mm.revcomp(sequence[:plus])))
        out3.write('>%s\n%s\n' % (name, sequence[minus:]))
        if out10X is not None:
            # Clamp so an adapter closer than 40 bases to the read start does
            # not turn into a negative (wrap-around) slice start.
            out10X.write('>%s\n%splus\n' % (
                name, mm.revcomp(sequence[max(minus - 40, 0):minus])))
    elif direction == '-':
        out.write('>%s\n%s\n' % (name, mm.revcomp(seq if trim else ada)))
        # NOTE(review): this flank extends 40 bases past the adapter position
        # while the '+' branch stops at it - kept as in the original; confirm
        # the asymmetry is intended.
        out3.write('>%s\n%s\n' % (name, mm.revcomp(sequence[:plus + 40])))
        out5.write('>%s\n%s\n' % (name, sequence[minus:]))
        if out10X is not None:
            out10X.write('>%s\n%sminus\n' % (name, sequence[plus:plus + 40]))


def write_fasta_file(args, path, adapter_dict, reads, seq_to_idx, idx_to_seq):
    """Write full-length consensus reads, optionally demultiplexed by index.

    A read is kept only if it has exactly one named adapter hit on each of
    '+' and '-' and the '-' hit lies after the '+' hit. Unless
    ``args.undirectional`` is set, the two hits must also name different
    adapters; the read is '+' when the '+' hit is '5Prime_adapter'.

    When ``seq_to_idx`` is non-empty (oligo-dT multiplexing), the 20-base
    windows around both adapter positions are written to
    ``R2C2_oligodT_multiplexing.tsv`` and matched with ``match_index``; the
    read is appended to the files in ``<path><index name>/`` (or
    ``<path>no_index_found/``). Existing ``<path><idx>`` directories for the
    known indexes are removed first. Otherwise all reads go to the three
    FASTA files directly under ``path``.

    Args:
        args: namespace providing ``undirectional``, ``barcoded``, ``trim``
            and ``threads`` (a progress bar is shown only when threads == 1).
        path: output prefix; it is concatenated with file names as-is, so it
            is expected to end with a path separator.
        adapter_dict: read name -> {'+': [...], '-': [...]} where each entry
            has the adapter name at index 0 ('-' meaning no adapter) and its
            position in the read at index 2.
        reads: mapping of read name -> consensus sequence.
        seq_to_idx: index lookup passed to ``match_index``; empty/None
            disables demultiplexing.
        idx_to_seq: container of valid index names.
    """
    from contextlib import ExitStack

    undirectional = args.undirectional
    barcoded = args.barcoded
    trim = args.trim
    odT = bool(seq_to_idx)

    consensus_fn = 'R2C2_full_length_consensus_reads.fasta'
    left_fn = 'R2C2_full_length_consensus_reads_left_splint.fasta'
    right_fn = 'R2C2_full_length_consensus_reads_right_splint.fasta'

    # ExitStack closes every long-lived handle even if a read raises midway.
    with ExitStack() as stack:
        out10X = None
        if barcoded:
            out10X = stack.enter_context(open(
                path + 'R2C2_full_length_consensus_reads_10X_sequences.fasta', 'w'))
        if odT:
            outdT = stack.enter_context(
                open(path + 'R2C2_oligodT_multiplexing.tsv', 'w'))
            # Per-index files are opened in append mode below, so stale
            # output from a previous run has to go first.
            for idx in idx_to_seq:
                if os.path.exists(path + idx):
                    shutil.rmtree(path + idx)
        else:
            out = stack.enter_context(open(path + consensus_fn, 'w'))
            out3 = stack.enter_context(open(path + left_fn, 'w'))
            out5 = stack.enter_context(open(path + right_fn, 'w'))

        records = tqdm(reads.items()) if args.threads == 1 else reads.items()
        for name, sequence in records:
            # Only reads with exactly one real hit per strand are used, so
            # the position sort the original did first cannot change the
            # outcome and is omitted.
            plus_hits = [hit for hit in adapter_dict[name]['+'] if hit[0] != '-']
            minus_hits = [hit for hit in adapter_dict[name]['-'] if hit[0] != '-']
            if len(plus_hits) != 1 or len(minus_hits) != 1:
                continue
            plus_name, plus = plus_hits[0][0], plus_hits[0][2]
            minus_name, minus = minus_hits[0][0], minus_hits[0][2]
            if minus <= plus:
                continue

            if undirectional:
                direction = '+'
            elif plus_name != minus_name:
                direction = '+' if plus_name == '5Prime_adapter' else '-'
            else:
                continue

            if not odT:
                _write_consensus_record(name, sequence, plus, minus, direction,
                                        trim, out, out3, out5, out10X)
                continue

            # Clamp the window starts: a negative start would silently wrap
            # to the end of the read and yield an empty/wrong window.
            forward_window = sequence[max(plus - 4, 0):plus + 16]
            reverse_window = mm.revcomp(sequence[max(minus - 16, 0):minus + 4])
            outdT.write('%s\t%s\t%s\n' % (name, reverse_window, forward_window))

            forward_index = match_index(forward_window, seq_to_idx)
            reverse_index = match_index(reverse_window, seq_to_idx)
            forward_known = forward_index in idx_to_seq
            reverse_known = reverse_index in idx_to_seq

            # An index on exactly one side decides both bin and orientation;
            # none or both sides leaves the read unassigned.
            idx_name = 'no_index_found'
            if forward_known and not reverse_known:
                direction, idx_name = '-', forward_index
            elif reverse_known and not forward_known:
                direction, idx_name = '+', reverse_index

            demux_path = path + idx_name + '/'
            if not os.path.isdir(demux_path):
                os.mkdir(demux_path)
            with open(demux_path + consensus_fn, 'a+') as demux_out, \
                    open(demux_path + left_fn, 'a+') as demux_out3, \
                    open(demux_path + right_fn, 'a+') as demux_out5:
                _write_consensus_record(name, sequence, plus, minus, direction,
                                        trim, demux_out, demux_out3,
                                        demux_out5, out10X)
def main(args):
    """Run the consensus-calling pipeline over ``args.reads``.

    Creates the output and ``tmp/`` directories, counts reads at or above
    ``args.lencutoff``, runs ``preprocess`` to assign splints, writes the
    read statistics to ``c3poa.log``, dispatches the reads to
    ``analyze_reads`` in batches of ``args.groupSize`` on a process pool, and
    finally concatenates and removes the per-batch outputs for every adapter.

    Args:
        args: namespace providing ``out_path``, ``config``, ``reads``,
            ``lencutoff``, ``splint_file``, ``numThreads`` and ``groupSize``.
            ``out_path`` gets a trailing '/' appended in place if missing.
    """
    import sys

    if not args.out_path.endswith('/'):
        args.out_path += '/'
    os.makedirs(args.out_path, exist_ok=True)

    def percent(part, whole):
        # An input without any reads must not crash the log output.
        return (part / whole) * 100 if whole else 0.0

    # `with` guarantees the log is closed even if preprocessing raises.
    with open(args.out_path + 'c3poa.log', 'w+') as log_file:
        if args.config:
            progs = configReader(args.out_path, args.config)
            racon = progs['racon']
            blat = progs['blat']
        else:
            racon = 'racon'
            blat = 'blat'

        tmp_dir = args.out_path + 'tmp/'
        os.makedirs(tmp_dir, exist_ok=True)

        # First pass over the reads: count them and seed the adapter table.
        total_reads, short_reads = 0, 0
        tmp_adapter_dict = {}
        for read in mm.fastx_read(args.reads, read_comment=False):
            if len(read[1]) < args.lencutoff:
                short_reads += 1
                continue
            tmp_adapter_dict[read[0]] = [[None, 1, None]]  # [adapter, matches, strand]
            total_reads += 1

        adapter_dict, adapter_set, no_splint = preprocess(
            blat, args, tmp_dir, tmp_adapter_dict, total_reads)

        for adapter in adapter_set:
            os.makedirs(args.out_path + adapter, exist_ok=True)

        all_reads = total_reads + short_reads
        print('C3POa version:', VERSION, file=log_file)
        print('No splint reads:', no_splint,
              '({:.2f}%)'.format(percent(no_splint, all_reads)), file=log_file)
        print('Under len cutoff:', short_reads,
              '({:.2f}%)'.format(percent(short_reads, all_reads)), file=log_file)
        print('Total thrown away reads:', short_reads + no_splint,
              '({:.2f}%)'.format(percent(short_reads + no_splint, all_reads)),
              file=log_file)
        print('Total reads:', all_reads, file=log_file)

    # Each splint is stored as [forward sequence, reverse complement].
    splint_dict = {}
    for splint in mm.fastx_read(args.splint_file, read_comment=False):
        splint_dict[splint[0]] = [splint[1], mm.revcomp(splint[1])]

    pool = mp.Pool(args.numThreads, maxtasksperchild=1)
    # One progress tick per batch (ceiling division: a trailing partial batch
    # counts, an exact multiple does not add a phantom batch).
    batch_total = (total_reads + args.groupSize - 1) // args.groupSize
    pbar = tqdm(total=batch_total, desc='Calling consensi')

    def report_failure(err):
        # Without an error callback a failing batch vanishes silently.
        print('Consensus batch failed: {!r}'.format(err), file=sys.stderr)
        pbar.update(1)

    def submit(batch, iteration):
        pool.apply_async(
            analyze_reads,
            args=(args, batch, splint_dict, adapter_dict, adapter_set,
                  iteration, racon),
            callback=lambda _: pbar.update(1),
            error_callback=report_failure)

    iteration, tmp_reads = 1, []
    try:
        for read in mm.fastx_read(args.reads, read_comment=False):
            if len(read[1]) < args.lencutoff:
                continue
            tmp_reads.append(read)
            if len(tmp_reads) == args.groupSize:
                submit(tmp_reads, iteration)
                iteration += 1
                tmp_reads = []  # rebind: the submitted list must stay intact
                gc.collect()
        if tmp_reads:
            # Trailing partial batch. Previously, an input with fewer reads
            # than groupSize never reached the dispatch target, so nothing
            # was processed at all.
            submit(tmp_reads, iteration)
    finally:
        pool.close()
        pool.join()
        pbar.close()

    for adapter in adapter_set:
        cat_files(args.out_path + adapter, '/tmp*/R2C2_Consensus.fasta',
                  args.out_path + adapter + '/R2C2_Consensus.fasta',
                  'Catting consensus reads')
        cat_files(args.out_path + adapter, '/tmp*/subreads.fastq',
                  args.out_path + adapter + '/R2C2_Subreads.fastq',
                  'Catting subreads')
        remove_files(args.out_path + adapter, '/tmp*')
#print(orientation) if args.debug: for k, v in natsorted(flags.items()): print(k, v) Complete = basename + '_complete_facs.fasta' Unassigned = basename + '_unassigned_facs.fasta' complete_count = 0 the_rest = 0 with open(Complete, 'w') as comp: with open(Unassigned, 'w') as bad: for seq in mappy.fastx_read(os.path.abspath(args.assembly), read_comment=True): if seq[0] in completeFACs: if seq[0] in orientation: if orientation[seq[0]] == -1: FinalSeq = mappy.revcomp(seq[1]) else: FinalSeq = seq[1] else: FinalSeq = seq[1] comments = seq[3].split(' ') facname = completeFACs[seq[0]][0] complete_count += len(completeFACs[seq[0]]) comp.write('>{:};organism={:};{:};\n{:}\n'.format( '|'.join(completeFACs[seq[0]]), inputDict[facname]['organism'], ';'.join(comments), FinalSeq)) else: the_rest += 1 bad.write('>{:} {:}\n{:}\n'.format(seq[0], seq[3], seq[1])) print('[{:}] Found {:,} full-length FACs corresponing to {:} unique sequences'.