# Shared imports for the snippets below. Each snippet was excerpted from a
# different project, so project-local helpers (get_output_var, process_reads2,
# alignment2, SAMBAMWriter, chromosome_matcher, grouper, smart_open, _decap,
# BASECALL_ENTRY, medaka, megalodon's mh/backends/variants/mapping, ...) are
# referenced as in the originals and are not importable here; in the megalodon
# snippets `logging` refers to megalodon's logging module, not the stdlib one.
import getopt
import logging
import math
import os
import pathlib
import random
import sys
from collections import defaultdict
from itertools import chain
from multiprocessing.pool import ThreadPool
from os import path
from tempfile import NamedTemporaryFile

import h5py
import mappy
import mappy as mp
import matplotlib.pyplot as plt
import numpy as np
import pyfastx
from Bio import Align, SeqIO
from tqdm import tqdm


def extract_fastq(input_f, ref_f, mode=0):
    """
    Args:
        input_f: input fast5 file name
        ref_f: file name of the reference
        mode: 0-dna, 1-rna, -1-rna 180mV
    """
    with h5py.File(input_f, 'r') as input_fh:
        # h5py >= 3 removed Dataset.value; index with [()] instead
        raw_signal = list(input_fh['/Raw/Reads'].values())[0]['Signal'][()]
        raw_seq = input_fh[
            '/Analyses/Basecall_1D_000/BaseCalled_template/Fastq'][()]
        # map only the sequence line of the FASTQ record, keeping the hit
        # with the highest mapping quality among best_n=5 candidates
        ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
        aligns = ref.map(raw_seq.split(b'\n')[1])
        maxmapq = -np.inf
        align = None
        for aln in aligns:
            if aln.mapq > maxmapq:
                maxmapq = aln.mapq
                align = aln
        if align is None:
            # bail out instead of dereferencing None below
            raise ValueError("FAIL MAPPING " + input_f)
        if align.strand == -1:
            ref_seq = mappy.revcomp(
                ref.seq(align.ctg, start=align.r_st, end=align.r_en))
        else:
            ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
        # RNA is sequenced 3'->5', so reverse the signal
        if (mode == 1) or (mode == -1):
            raw_signal = raw_signal[::-1]
        if ref_seq is None:
            print("No reference sequence found in " + input_f)
            print(aligns)
    return raw_signal, raw_seq, ref_seq
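# A minimal usage sketch for extract_fastq() above; 'read.fast5' and
# 'ref.fa' are hypothetical paths, and the fast5 is assumed to carry
# basecalls under /Analyses/Basecall_1D_000.
signal, fastq, ref_seq = extract_fastq('read.fast5', 'ref.fa', mode=0)
print(len(signal), len(ref_seq))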
def parse_target(fn):
    target = defaultdict(dict)
    with open(fn, 'r') as f:
        for line in f:
            (name, left, donor, right, total, cut,
             in_s, in_e, fa) = line.rstrip().split()
            target_type = 'gDNA' if name.endswith('gDNA') else 'donor'
            left, right = int(left), int(right)
            total, cut = int(total), int(cut)
            in_s, in_e = int(in_s), int(in_e)
            donor = int(donor)
            target[name]['left_bond'] = left
            target[name]['right_bond'] = right
            target[name]['donor'] = donor
            target[name]['total'] = total
            if target_type == 'gDNA':
                target[name]['cut_left'] = cut - 10
                target[name]['cut_right'] = cut + 10
                interval = [[0, left, 'L'],
                            [left, cut - 25, 'LH'],
                            [cut - 25, cut + 25, 'C'],
                            [cut + 25, left + donor, 'RH'],
                            [left + donor, total, 'R']]
                target[name]['fa'] = mp.Aligner(fa, preset='map-pb')
            else:
                interval = [[0, in_s, 'LH'],
                            [in_s, in_e, 'I'],
                            [in_e, total, 'RH']]
                target[name]['fa'] = mp.Aligner(fa, preset='sr')
            target[name]['interval'] = interval
            target[name]['rev_interval'] = [
                [total - x[1], total - x[0], x[2]] for x in interval][::-1]
    return target
def main(argv):
    opts, args = getopt.getopt(argv[1:], "x:n:m:k:w:r:c")
    if len(args) < 2:
        print("Usage: minimap2.py [options] <ref.fa>|<ref.mmi> <query.fq>")
        print("Options:")
        print("  -x STR      preset: sr, map-pb, map-ont, asm5, asm10 or splice")
        print("  -n INT      minimum number of minimizers")
        print("  -m INT      minimum chaining score")
        print("  -k INT      k-mer length")
        print("  -w INT      minimizer window length")
        print("  -r INT      band width")
        print("  -c          output the cs tag")
        sys.exit(1)
    preset = min_cnt = min_sc = k = w = bw = None
    out_cs = False
    for opt, arg in opts:
        if opt == '-x':
            preset = arg
        elif opt == '-n':
            min_cnt = int(arg)
        elif opt == '-m':
            min_sc = int(arg)
        elif opt == '-r':
            bw = int(arg)
        elif opt == '-k':
            k = int(arg)
        elif opt == '-w':
            w = int(arg)
        elif opt == '-c':
            out_cs = True
    a = mp.Aligner(args[0], preset=preset, min_cnt=min_cnt,
                   min_chain_score=min_sc, k=k, w=w, bw=bw)
    if not a:
        raise Exception(
            "ERROR: failed to load/build index file '{}'".format(args[0]))
    for name, seq, qual in mp.fastx_read(args[1]):  # read one sequence
        for h in a.map(seq, cs=out_cs):  # traverse hits
            print('{}\t{}\t{}'.format(name, len(seq), h))
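# A hedged driver sketch for main() above, mirroring the command line
# `python minimap2.py -x map-ont ref.fa reads.fq` (paths hypothetical):
if __name__ == '__main__':
    main(['minimap2.py', '-x', 'map-ont', 'ref.fa', 'reads.fq'])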
def getIndex(reference, thread):
    """
    Find the reference sequence and build an index.
    Returns a mappy aligner; by default only the best alignment is kept
    and 2 threads are used.
    """
    if reference:
        reffa = reference
    else:
        reffa = path.join(
            path.dirname(path.abspath(path.dirname(__file__))), "reference.fa")
    if not path.isfile(reffa):
        logging.error("Could not find reference.fa")
        sys.exit("ERROR: Could not find reference.fa! "
                 "Program exits due to a reference.fa problem.")
    if thread is None:
        # check whether the thread argument is specified; default to 2 threads
        thread = 2
    # invoke the minimap2 API (Heng Li); keep only the best alignment
    aligner = mp.Aligner(reffa, preset="map-ont", best_n=1,
                         n_threads=int(thread))
    if not aligner:
        logging.error("Failed to load/build index")
        raise Exception("ERROR: failed to load/build index! "
                        "Program exits due to a mappy problem.")
    return aligner
def runsingle(reffile, reads1, reads2, fname, distance, cut_site,
              min_len, output_type):
    """Run the single-processor version of PAtChER."""
    print("Loading in Reference")
    reference = mp.Aligner(reffile, preset="sr")
    if not reference:
        raise Exception("ERROR: failed to load/build index file")
    print("Done.")
    sambam_output = SAMBAMWriter(fname, reference, output_type)
    print("Running Alignment")
    while True:
        try:
            # Read is a class from process_reads2.py
            read1 = process_reads2.Read(next(reads1))
            read1.split_read(cut_site, min_len)
            read1.qual_trim(10, 10)
            read2 = process_reads2.Read(next(reads2))
            read2.split_read(cut_site, min_len)
            read2.qual_trim(10, 10)
            # read1 and read2 could be None at this stage; .seq is part of mappy
            if read1.seq and read2.seq:
                res = alignment2.map_reads(reference, read1, read2, distance)
                if res:
                    sambam_output.process_output(res, read1, read2, distance)
        except StopIteration:
            break
def process_chunk(virtuals, ref):
    if ref is not None:
        if isinstance(ref, str):
            Al = mappy.Aligner(ref, preset="map-ont")
        else:
            Al = ref
    else:
        Al = None
    res = {}
    if virtuals is None or len(virtuals) == 0:
        return res
    for block in virtuals:
        if block is None:
            continue
        virtual, k = block
        if virtual is not None:
            try:
                res[k] = virtual_h5_to_processing(virtual, Al)
            except IndexError as err:
                # record the error message; the dict is fresh, so the
                # count starts at 1
                error = {}
                msg = err.args[0] if len(err.args) > 0 else "IndexError"
                error[msg] = 1
                res[k] = [[], [], error]
    return res
def get_mapping(pred):
    try:
        # note: this builds the index from reference_file on every call
        aligner = mp.Aligner(reference_file)
        return next(aligner.map(pred))
    except Exception as e:
        print(e)
        return None
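# get_mapping() above rebuilds the minimap2 index from reference_file on
# every call, which is costly for large references. A cached variant under
# the same assumption (reference_file is a module-level path):
from functools import lru_cache


@lru_cache(maxsize=1)
def _cached_aligner(ref_path):
    return mp.Aligner(ref_path)


def get_mapping_cached(pred):
    try:
        return next(_cached_aligner(reference_file).map(pred))
    except StopIteration:  # no hit for this sequence
        return None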
def align_to_chromosomes(teloes):
    aligner = mp.Aligner('../../GRCh38_latest_genomic.fna.gz', preset='map-ont')
    if not aligner:
        raise Exception("ERROR: failed to load/build index")
    chromosome_dict = {}
    chromosome_graph = {}
    for i in range(1, NUM_CHROMOSOMES + 1):
        chromosome_dict[str(i)] = [[0, 0, 0], [0, 0, 0]]
        chromosome_graph[str(i)] = [[], []]
    for name in ('X', 'Y', 'Unknown'):
        chromosome_dict[name] = [[0, 0, 0], [0, 0, 0]]
        chromosome_graph[name] = [[], []]
    chromo_count = [[0, 0], [0, 0]]
    for telo in teloes:
        first_print = True
        aligns_dict = {}
        for to_align in telo.non_telomeric_parts:
            if len(to_align) < MIN_ALIGNMENT_LENGTH:
                continue
            for hit in aligner.map(to_align):
                if hit.is_primary:
                    aligns_dict[hit.mlen] = hit
                    if first_print:
                        print(telo.rec_num)
                        first_print = False
                    print(hit.ctg + " : " + str(hit.r_st) + " - "
                          + str(hit.r_en) + " strand: " + str(hit.strand)
                          + " blen: " + str(hit.blen) + " mlen: "
                          + str(hit.mlen) + " NM: " + str(hit.NM))
        if aligns_dict:
            # keep the primary hit with the most matching bases
            best = aligns_dict[max(aligns_dict)]
            chromosome_matcher(best, chromosome_dict, chromosome_graph,
                               telo.longest_telomere_len, chromo_count)
    for j in chromosome_dict:
        # report only chromosomes that accumulated any counts
        if any(any(side) for side in chromosome_dict[j]):
            print(j + " " + str(chromosome_dict[j]))
    for chromosome in chromosome_graph:
        plt.axhline(y=1, color='b', linestyle='-')
        plt.axhline(y=0, color='b', linestyle='-')
        plt.axhline(y=8, color='b', linestyle='-')
        plt.axhline(y=-7, color='b', linestyle='-')
        for dot in chromosome_graph[chromosome][1]:
            plt.plot(dot, 0, 'ro')
        for dot in chromosome_graph[chromosome][0]:
            plt.plot(dot, 1, 'ro')
        plt.savefig('Chromosome_' + chromosome + '.jpg')
        plt.close()
    print("Average telo length on edges - "
          + str(chromo_count[0][1] / chromo_count[0][0]) + "\n")
    print("Average telo length in center - "
          + str(chromo_count[1][1] / chromo_count[1][0]) + "\n")
def _main(args):
    try:
        mh.mkdir(args.guppy_logs_output_directory, False)
    except mh.MegaError:
        LOGGER.warning(
            "Guppy logs output directory exists. Potentially overwriting "
            + "guppy logs.")
    logging.init_logger(args.guppy_logs_output_directory)
    # add required attributes for loading guppy, but not valid options for
    # this script.
    args.do_not_use_guppy_server = False
    args.output_directory = args.guppy_logs_output_directory
    args.outputs = [mh.PR_VAR_NAME]
    LOGGER.info("Loading model.")
    backend_params = backends.parse_backend_params(args)
    with backends.ModelInfo(backend_params, args.processes) as model_info:
        LOGGER.info("Loading reference.")
        aligner = mappy.Aligner(str(args.reference), preset="map-ont", best_n=1)
        process_all_reads(
            args.fast5s_dir,
            not args.not_recursive,
            args.num_reads,
            args.read_ids_filename,
            model_info,
            aligner,
            args.processes,
            args.output,
            args.suppress_progress,
            args.compute_false_reference_scores,
        )
def getFlankAligner(ref, ctg, start, stop, **kwargs):
    tmpRef = NamedTemporaryFile(mode='w', delete=False)
    for side, seq in zip(['L', 'R'], getFlanks(ref, ctg, start, stop, **kwargs)):
        tmpRef.write(f'>{"_".join([str(ctg), side])}\n{seq}\n')
    tmpRef.close()
    aligner = mp.Aligner(tmpRef.name, preset='sr')
    return aligner, tmpRef
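# getFlankAligner() hands back the NamedTemporaryFile so the caller can
# remove it (it is created with delete=False). A usage sketch; `ref` must
# be whatever getFlanks() expects, and the coordinates and query sequence
# are hypothetical:
flank_aligner, tmp_fa = getFlankAligner(ref, 'ctg1', 1000, 2000)
try:
    flank_hits = list(flank_aligner.map('ACGTACGTACGT' * 10))
finally:
    os.unlink(tmp_fa.name)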
def align_contigs(**kwargs):
    if 'infile_fasta' in kwargs:
        infile = kwargs['infile_fasta']
    if 'out' in kwargs:
        outfile = kwargs['out']
    if 'genome' in kwargs:
        genome = kwargs['genome']
    if 'preset' in kwargs:
        preset = kwargs['preset']
    if 'nthreads' in kwargs:
        nthreads = kwargs['nthreads']
    a = mp.Aligner(str(genome), preset=preset, n_threads=nthreads)
    if not a:
        raise Exception("ERROR: failed to load/build index")
    outfile = open(outfile, 'w')
    outfile.write(
        "read\tchr\tpos\tr_st\tr_en\tq_st\tq_en\tq_len\tprimary\tstrand\t"
        "cs\tcigstr\tcigtup\n")
    for name, seq, qual in mp.fastx_read(infile):
        seq_len = len(seq)
        print(name)
        for hit in a.map(seq, cs=True):
            # 13 columns to match the header above
            outfile.write(
                "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    name, hit.ctg, hit.r_st, hit.r_st, hit.r_en, hit.q_st,
                    hit.q_en, seq_len, hit.is_primary, hit.strand, hit.cs,
                    hit.cigar_str, hit.cigar))
    outfile.close()
def get_relation(seq1, seq2):
    len1, len2 = len(seq1), len(seq2)
    if 0.9 * len1 > len2 or 1.1 * len1 < len2:
        return '', 0, 0
    min_len = min(len1, len2)
    a = mp.Aligner(seq=seq1)
    f_iden = r_iden = 0
    f_strand = r_strand = 0
    for h in a.map(seq2):
        # only use the primary alignment
        if not h.is_primary:
            continue
        f_iden = h.mlen / (min_len + 0.0)
        f_strand = h.strand
        break
    rseq2 = seq2[::-1]
    for h in a.map(rseq2):
        # only use the primary alignment
        if not h.is_primary:
            continue
        r_iden = h.mlen / (min_len + 0.0)
        r_strand = h.strand
        break
    if max(f_iden, r_iden) < 0.8:
        return 'NA', f_iden, r_iden
    if f_iden > r_iden:
        res = 'ID' if f_strand == 1 else 'RC'  # identical / reverse complement
    else:
        res = 'R' if r_strand == 1 else 'C'    # reversed / complement
    return res, f_iden, r_iden
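# A quick sanity sketch for get_relation(), which returns a
# (label, forward_identity, reverse_identity) triple. The input is
# synthetic; on short or low-complexity sequences mappy may find no
# primary hit, in which case the label is 'NA'.
seq = 'ACGTTGCAAGGCTTAGCCGATCGAT' * 40
print(get_relation(seq, seq))              # expected ('ID', ~1.0, ...)
print(get_relation(seq, mp.revcomp(seq)))  # expected ('RC', ...)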
def filter_fastq(TE, R1, R2, out_fastq):
    '''Filter reads with a single read aligning to a given sequence'''
    reference = mp.Aligner(TE, preset="sr")  # load or build index
    if not reference:
        raise Exception("ERROR: failed to load/build index")
    out_fastq = open(out_fastq, "w")
    iterator1 = SeqIO.parse(R1, "fastq")
    iterator2 = SeqIO.parse(R2, "fastq")
    for r1 in iterator1:
        r2 = next(iterator2)
        r1_maps = list(reference.map(r1.seq))
        hits1 = [get_output_var(x) for x in r1_maps]
        for hit in hits1:
            # discard hits covering less than 95% of the read
            if hit["blen"] < 0.95 * len(r1.seq):
                r1_maps = []
        r2_maps = list(reference.map(r2.seq))
        hits2 = [get_output_var(x) for x in r2_maps]
        for hix in hits2:
            if hix["blen"] < 0.95 * len(r2.seq):
                r2_maps = []
        # write out the mate of the read that aligned
        if (len(r1_maps) >= 1) and not (len(r2_maps) >= 1):
            SeqIO.write(r2, out_fastq, 'fastq')
        elif not (len(r1_maps) >= 1) and (len(r2_maps) >= 1):
            SeqIO.write(r1, out_fastq, 'fastq')
    out_fastq.close()
def get_minimap_cigar(genome, sequence, preset='map-ont', cigar_string=True):
    """Get the alignment between a genome and a sequence.

    :param genome: path to a fasta genome
    :param sequence: sequence to align
    :param preset: sr for single-end short reads; map-pb for PacBio
        read-to-reference mapping; map-ont for Oxford Nanopore read mapping;
        splice for long-read spliced alignment; asm5 for assembly-to-assembly
        alignment; asm10 for full genome alignment of closely related species.
    :param cigar_string: if True return the normal cigar string; if False
        return an array of shape (n_cigar, 2), where the two numbers give
        the length and the operator of each CIGAR operation.
    """
    assert os.path.exists(genome), "Genome path does not exist: {}".format(genome)
    assert preset in ["sr", "map-pb", "map-ont", "splice", "asm5", "asm10"]
    assert len(sequence) > 60, "minimap does not find alignments for short reads"
    a = mp.Aligner(genome, preset=preset)  # load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")
    for hit in a.map(sequence):
        if hit.is_primary:
            print(hit)
            if cigar_string:
                return str(hit.cigar_str)
            else:
                return hit.cigar
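# When cigar_string=False, get_minimap_cigar() returns mappy's tuple form:
# a list of (length, operator) pairs with operators encoded as in BAM
# (0=M, 1=I, 2=D, 3=N, 4=S, ...). A small helper to render that form:
def cigar_tuples_to_str(cigar_tuples):
    ops = 'MIDNSHP=X'  # SAM/BAM operator codes 0-8
    return ''.join('{}{}'.format(length, ops[op])
                   for length, op in cigar_tuples)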
def get_mp_error_rate(ref_seq, read_seq):
    a = mp.Aligner(seq=read_seq)
    error = -1
    for h in a.map(ref_seq):
        if not h.is_primary:
            continue
        # NM (mismatches and gaps) over the total aligned length
        error = h.NM / (h.NM + h.mlen + 0.0)
        break
    return error
def __init__(self, index):
    self.index = index
    if self.index:
        self.mapper = mp.Aligner(self.index, preset="map-ont")
        self.initialised = True
    else:
        self.mapper = None
        self.initialised = False
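# Usage sketch for the wrapper above; the class name Mapper is hypothetical
# since only __init__ is shown. The initialised flag lets callers skip
# mapping until an index has been supplied.
m = Mapper('ref.mmi')  # or Mapper(None) for an uninitialised instance
if m.initialised:
    for hit in m.mapper.map('ACGT' * 30):
        print(hit.ctg, hit.r_st, hit.r_en)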
def read_in_contigs_as_reference():
    global contigs_as_reference
    contigs_format = guess_fileformat(arguments.contigs)
    if contigs_format == "fasta":
        import mappy
        contigs_as_reference = mappy.Aligner(arguments.contigs)
    else:
        raise ValueError("Contigs are in weird format, I refuse to cooperate.")
def __init__(self, model_config, test_config, model_filepath):
    self._generator = get_generator(model_config, test_config, kind="testing")
    self._reads = test_config['reads']
    self._batch_size = test_config['batch_size']
    self._aligner = mp.Aligner("../useful_files/zymo-ref-uniq_2019-03-15.fa")
    self._with_assembler = (
        model_config['encoder_max_length'] == test_config['stride'])
    self._model_file_path = model_filepath
    self._result_dic = self._get_result_dic(self._model_file_path)
def _main(args):
    logging.init_logger()
    LOGGER.info("Loading reference")
    aligner = mappy.Aligner(str(args.reference), preset="map-ont", best_n=1)
    LOGGER.info("Loading variants")
    var_data = variants.VarInfo(
        args.in_vcf, aligner, args.max_indel_size, keep_var_fp_open=True)
    contigs = var_data.variants_idx.header.contigs.values()
    LOGGER.info("Atomizing variants")
    with open(args.out_vcf, "w") as out_vars:
        # preprocess contigs to set contig lengths for VCF header
        ctg_lens = {}
        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            if len(chrm_seq) != ctg.length:
                LOGGER.warning(
                    ("Mismatched contig lengths ({}) between reference ({}) "
                     "and input VCF ({}); using length from reference").format(
                        ctg.name, len(chrm_seq), ctg.length))
            ctg_lens[ctg.name] = len(chrm_seq)
        out_vars.write("\n".join(
            HEADER
            + [CONTIG_HEADER_LINE.format(ctg, ctg_len)
               for ctg, ctg_len in ctg_lens.items()]
            + [variants.CONTEXT_BASE_MI_LINE,
               COMMAND_HEADER_LINE.format(" ".join(sys.argv)),
               FIELDS_LINE]) + "\n")
        for ctg in contigs:
            chrm_seq = aligner.seq(ctg.name)
            map_pos = mapping.MAP_POS(
                chrm=ctg.name,
                strand=None,
                start=0,
                end=len(chrm_seq),
                q_trim_start=None,
                q_trim_end=None,
            )
            for var in var_data.fetch_read_variants(
                    map_pos, mh.seq_to_int(chrm_seq)):
                out_vars.write(RECORD_LINE.format(
                    chrm=ctg.name,
                    pos=var.ref_start + 1,
                    rid=var.id,
                    ref=var.ref,
                    alts=",".join(var.alts),
                    info=variants.HAS_CONTEXT_BASE_TAG
                    if var.has_context_base else ".",
                ))
    LOGGER.info("Indexing output variant file")
    variants.index_variants(args.out_vcf)
def process_unique_one(reference, hits, read1, read2, distance):
    """Process pairs where exactly one read of the pair maps uniquely."""
    if len(hits[0]) == 1 and len(hits[1]) > 0:
        indx = 0
        seq = read2.seq
        out = [[get_output_var(hits[0][0]), "u"], []]
    elif len(hits[1]) == 1 and len(hits[0]) > 0:
        indx = 1
        seq = read1.seq
        out = [[], [get_output_var(hits[1][0]), "u"]]
    else:
        # neither read maps uniquely; nothing to rescue here
        return None
    # Get reference sequence around the unique hit
    refseq = reference.seq(hits[indx][0].ctg,
                           hits[indx][0].r_st - distance,
                           hits[indx][0].r_en + distance)
    if refseq:
        local_reference = mp.Aligner(seq=refseq, preset="sr", n_threads=1)
        new_hits = []
        for hit in local_reference.map(seq):  # traverse alignments
            if hit.mlen / (len(seq) * 1.0) > 0.8:
                new_hits.append(hit)
        # Need to fix hit.ctg and hit.r_st???
        if len(new_hits) == 1:
            if indx == 0:
                out[1] = [get_output_var(new_hits[0]), "r"]
                out[1][0]["ctg"] = out[0][0]["ctg"]
            else:
                out[0] = [get_output_var(new_hits[0]), "r"]
                out[0][0]["ctg"] = out[1][0]["ctg"]
        elif len(new_hits) > 0:
            # weight each candidate by its distance from the expected position
            distance_list = []
            for new_hit in new_hits:
                if new_hit.r_st >= distance:
                    distance_list.append(new_hit.r_st - distance)
                else:
                    distance_list.append(distance - new_hit.r_en)
            cumulative_probability = 0
            probability_list = []
            for dist in distance_list:
                scale_probability = math.exp(-0.8 * dist / 50 - 0.6618)
                cumulative_probability += scale_probability
                probability_list.append(cumulative_probability)
            # sample one candidate in proportion to its weight; pindex ends
            # at the index of the selected hit
            selected_probability = random.random() * cumulative_probability
            pindex = 0
            while selected_probability >= probability_list[pindex]:
                pindex += 1
            if indx == 0:
                out[1] = [get_output_var(new_hits[pindex]), "p"]
                out[1][0]["ctg"] = out[0][0]["ctg"]
            else:
                out[0] = [get_output_var(new_hits[pindex]), "p"]
                out[0][0]["ctg"] = out[1][0]["ctg"]
    return out
def __init__(self, reference, preset=None):
    self.kwargs = {'fn_idx_in': reference, 'best_n': 1}
    if preset:
        self.kwargs['preset'] = preset
    else:
        # custom scoring (A, B, o, e, O, E); previously tried
        # (2, 5, 5, 4, 56, 0) and (1, 2, 2, 1, 32, 0)
        self.kwargs['scoring'] = (1, 2, 2, 1, 18, 0)
    self._aligner = mp.Aligner(**self.kwargs)
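# The scoring tuple above is mappy's (A, B, o, e, O, E): match score,
# mismatch penalty, gap open/extend, and long-gap open/extend, mirroring
# minimap2's -A/-B/-O/-E options. The same override without the wrapper
# (path hypothetical):
custom_aligner = mp.Aligner('ref.fa', best_n=1, scoring=(1, 2, 2, 1, 18, 0))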
def prep(args):
    if pathlib.Path(args.fast5_dir).is_dir():
        fast5s = find_all_fast5s(args.fast5_dir)
    else:
        fast5s = [args.fast5_dir]
    read_seqs = load_fastq(args.fastq)
    albacore_barcodes = load_albacore_barcodes_from_sequencing_summary(
        args.sequencing_summary)
    # For the ligation kit we need to align to the reference (but not for
    # the rapid kit).
    if args.kit == 'EXP-NBD103_start' or args.kit == 'EXP-NBD103_end':
        mappy_aligner = mp.Aligner(args.ref)
    else:
        mappy_aligner = None
    read_count = 0
    for fast5_file in fast5s:
        try:
            read_id, signal = get_read_id_and_signal(fast5_file)
        except KeyError:
            continue
        if read_id not in read_seqs:
            continue
        print('', file=sys.stderr)
        print(fast5_file, file=sys.stderr)
        print('  read ID: {}'.format(read_id), file=sys.stderr)
        if albacore_barcodes is not None:
            try:
                albacore_barcode = albacore_barcodes[read_id]
            except KeyError:
                albacore_barcode = None
        else:
            albacore_barcode = None
        if args.kit == 'EXP-NBD103_start':
            prep_native_read_start(signal, read_seqs[read_id], mappy_aligner,
                                   args.signal_size, albacore_barcode)
        elif args.kit == 'EXP-NBD103_end':
            prep_native_read_end(signal, read_seqs[read_id], mappy_aligner,
                                 args.signal_size, albacore_barcode)
        elif args.kit == 'SQK-RBK004_start':
            prep_rapid_read_start()
        read_count += 1
        if args.read_limit is not None and read_count >= args.read_limit:
            break
    print('', file=sys.stderr)
def extract_fastq(input_f, ref_f, mode=0, trans_start=None):
    """
    Args:
        input_f: input fast5 file name
        ref_f: file name of the reference
        mode: 0-dna, 1-rna, -1-rna 180mV
        trans_start: start position of the transcription (required in RNA mode).
    """
    with h5py.File(input_f, 'r') as input_fh:
        raw_entry = list(input_fh['/Raw/Reads'].values())[0]
        raw_signal = raw_entry['Signal'][()]
        raw_seq = input_fh[BASECALL_ENTRY + '/BaseCalled_template/Fastq'][()]
        if mode != 0:
            assert trans_start is not None
            raw_signal, raw_seq, decap_event = _decap(
                input_fh, trans_start, raw_signal, raw_seq)
        else:
            decap_event = input_fh[
                BASECALL_ENTRY + '/BaseCalled_template/Events'][()]
        # map only the sequence line of the FASTQ record, keeping the hit
        # with the highest mapping quality
        ref = mappy.Aligner(ref_f, preset="map-ont", best_n=5)
        aligns = ref.map(raw_seq.split(b'\n')[1])
        maxmapq = -np.inf
        align = None
        for aln in aligns:
            if aln.mapq > maxmapq:
                maxmapq = aln.mapq
                align = aln
        if align is None:
            raise ValueError("FAIL MAPPING " + input_f)
        if align.strand == -1:
            ref_seq = mappy.revcomp(
                ref.seq(align.ctg, start=align.r_st, end=align.r_en))
        else:
            ref_seq = ref.seq(align.ctg, start=align.r_st, end=align.r_en)
        if (mode == 1) or (mode == -1):
            raw_signal = raw_signal[::-1]
        if ref_seq is None:
            print(aligns)
            raise ValueError("No reference sequence found in %s" % input_f)
    return raw_signal, raw_seq, ref_seq, decap_event
def test(config, experiment_name, new_testing=False):
    if new_testing:
        discard_existing_testing(experiment_name)
    model = get_trained_model(config, experiment_name)
    controller = TestingController(config, experiment_name, model, new_testing)
    for bacteria in config['testing']['bacteria']:
        name = bacteria['name']
        generator = data_api.get_raw_generator(config, bacteria['data'])
        aligner = mp.Aligner(bacteria['reference'])
        controller.test(name, generator, aligner)
def align_before_after(output_dir, sv, query_seq, ref_seq_1, ref_seq_2):
    # within the length limit: use a global pairwise alignment score
    if not sv.is_third_fil:
        aligner = Align.PairwiseAligner()
        aligner.mode = 'global'
        aligner.match_score = 1
        aligner.mismatch_score = -1
        aligner.open_gap_score = -1
        aligner.extend_gap_score = -0.5
        alignment_beforeSV = aligner.score(query_seq, ref_seq_1)
        alignment_afterSV = aligner.score(query_seq, ref_seq_2)
    else:
        # long sequences: index the query with mappy and map both references
        h = open(output_dir + "tmp_query.fasta", "w")
        h.write('>' + str(sv.idx) + "\n")
        h.write(query_seq + "\n")
        h.close()
        aligner = mappy.Aligner(fn_idx_in=output_dir + "tmp_query.fasta")
        aligner_beforeSV = aligner.map(ref_seq_1, seq2=None, cs=False, MD=False)
        aligner_afterSV = aligner.map(ref_seq_2, seq2=None, cs=False, MD=False)
        try:
            agt_before = next(aligner_beforeSV)
        except StopIteration:
            os.remove(output_dir + "tmp_query.fasta")
            return None, None
        try:
            agt_after = next(aligner_afterSV)
        except StopIteration:
            os.remove(output_dir + "tmp_query.fasta")
            return None, None
        alignment_beforeSV = len(query_seq) - (len(ref_seq_1) - agt_before.mlen)
        alignment_afterSV = len(query_seq) - (len(ref_seq_2) - agt_after.mlen)
        os.remove(output_dir + "tmp_query.fasta")
    return alignment_beforeSV, alignment_afterSV
def remove_by_alignment(fq, ref, out, mapq, preset, human_out, threads, logger):
    fout = smart_open(filename=out, mode="w")
    hout = smart_open(filename=human_out, mode="w") if human_out else None
    logger.info(f"Starting to map reads against: {ref}")
    logger.info(f"Initiating aligner: {ref}")
    aligner = mp.Aligner(str(ref), preset=preset, n_threads=threads)
    logger.info(f"Opening file handle: {fq}")
    if fq:
        reads = mp.fastx_read(str(fq))
    else:
        reads = None  # PE
    if reads is None:
        # paired-end input is not handled in this snippet
        logger.error("No single-end read file supplied")
        return
    ref_maps = 0
    total_reads = 0
    logger.info(f"Filtering mapped reads [Q >= {mapq}]")
    # sets keep the membership checks O(1)
    human = set()
    not_human = set()
    for name, seq, qual in reads:
        for aln in aligner.map(seq):
            if aln.mapq >= mapq:
                ref_maps += 1
                if name not in human:
                    human.add(name)
                    if hout is not None:
                        hout.write(f"@{name}\n{seq}\n+\n{qual}\n")
        if name not in human:
            fout.write(f"@{name}\n{seq}\n+\n{qual}\n")
            not_human.add(name)
        total_reads += 1
    fout.close()
    if hout is not None:
        hout.close()
    logger.info(f"Computed {ref_maps} mappings against reference: {ref}")
    logger.info(f"Recovered {len(not_human)} / {total_reads} reads from {fq}")
def create_index(reference_file):
    aligner = mp.Aligner(reference_file, best_n=1)
    for name, seq, qual in mp.fastx_read(reference_file, read_comment=False):
        reference_names.append(name)
        reference_lengths[name] = len(seq)
    if not aligner:
        raise Exception("ERROR: failed to load/build index file '{}'".format(
            reference_file))
    return aligner
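# Usage sketch for create_index() above; reference_names and
# reference_lengths are assumed to be module-level containers, and the
# path is hypothetical.
reference_names, reference_lengths = [], {}
idx = create_index('ref.fa')
for hit in idx.map('ACGT' * 30):
    print(hit.ctg, reference_lengths[hit.ctg])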
def hdf_to_sam_worker(reference, fname):
    """Extract and align basecall and methylation data from `.fast5`.

    :param reference: `.fasta` file containing reference sequence(s).
    :param fname: `.fast5` file containing read data.
    """
    logger = medaka.common.get_named_logger('ModExtract')
    logger.info("Processing {}.".format(fname))
    results = list()
    aligner = mappy.Aligner(reference, preset='map-ont')
    with get_fast5_file(fname, mode="r") as f5:
        reads = list(f5.get_read_ids())
        logger.info("Found {} reads for {}.".format(len(reads), fname))
        for read_id in reads:
            read = f5.get_read(read_id)
            tool = Basecall1DTools(read)
            name, sequence, qstring = tool.get_called_sequence(
                'template', fastq=False)
            try:
                align = next(aligner.map(sequence, MD=True, cs=True))
            except StopIteration:
                continue
            else:
                if align.strand == +1:
                    flag = '0'
                    seq = sequence
                else:
                    flag = '16'
                    seq = medaka.common.reverse_complement(sequence)
                rname = align.ctg
                pos = str(align.r_st + 1)
                mapq = str(align.mapq)
                # soft-clip the unaligned read ends
                clip = ['' if x == 0 else '{}S'.format(x)
                        for x in (align.q_st, len(sequence) - align.q_en)]
                if align.strand == -1:
                    clip = clip[::-1]
                cigar = clip[0] + align.cigar_str + clip[1]
                NM = 'NM:i:' + str(align.NM)
                latest = read.get_latest_analysis('Basecall_1D')
                mod_base = read.get_analysis_dataset(latest, MODBASEPATH)
                mod_base = mod_base.view(dtype=MODTYPE)
                mA = 'MA:B:C,{}'.format(','.join(
                    str(x) for x in mod_base['6mA'].reshape(-1)))
                mC = 'MC:B:C,{}'.format(','.join(
                    str(x) for x in mod_base['5mC'].reshape(-1)))
                results.append('\t'.join(
                    (read_id, flag, rname, pos, mapq, cigar, '*', '0', '0',
                     seq, qstring, NM, mA, mC)))
    return results
def is_mt(seq, rnr=False):
    is_chrM = 'not_MT'
    chrom_path = '/stor/work/Lambowitz/ref/hg19'
    if rnr:
        genome = chrom_path + '/new_genes/mt_rnr.fa'
    else:
        genome = chrom_path + '/genome/chrM.minimap2_idx'
    aligner = mappy.Aligner(genome, preset='sr')
    if list(aligner.map(seq)):
        is_chrM = 'is_MT'
    return is_chrM
def generate_coverage(read1, read2, mapping, ref, pwid=0.95, ncpu=1,
                      chunk_size=500000, quiet=False):
    if not quiet:
        print("Building index and data structures...")
    seq_cov = {}
    for name, seq in pyfastx.Fasta(ref, build_index=False):
        seq_cov[name] = np.zeros(len(seq), dtype=int)
    # estimate the mean read length to derive the alignment thresholds
    nreads = 0
    read_len = 0
    for r in mp.fastx_read(read1):
        nreads += 1
        read_len += len(r[1])
    read_len /= nreads
    min_chain_score = int(0.9 * read_len)
    min_mis_match = int(read_len - pwid * read_len)
    a = mp.Aligner(ref, preset='sr', n_threads=ncpu, best_n=1000,
                   min_chain_score=min_chain_score)  # load or build index
    if not a:
        raise Exception("ERROR: failed to load/build index")

    def mpile(seqs):
        if seqs is None:
            return []
        thrbuf = mp.ThreadBuffer()
        hits = []
        chrom = None
        for hit in a.map(seqs[1], buf=thrbuf):
            if (hit.NM <= min_mis_match) and ('S' not in hit.cigar_str) \
                    and ('H' not in hit.cigar_str):
                if chrom is None:
                    chrom = mapping[hit.ctg]
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                elif mapping[hit.ctg] == chrom:
                    hits.append((hit.ctg, hit.r_st - 1, hit.r_en))
                else:
                    break
        return hits

    if not quiet:
        print("Aligning reads...")
    pool = ThreadPool(ncpu)
    for reads in tqdm(grouper(chain(mp.fastx_read(read1),
                                    mp.fastx_read(read2)), chunk_size),
                      total=int(1 + 2 * nreads / chunk_size), disable=quiet):
        hits = pool.map(mpile, reads)
        for hit in chain.from_iterable(hits):
            if hit is None:
                continue
            seq_cov[hit[0]][hit[1]:hit[2]] += 1
    # close the pool and wait for the work to finish
    pool.close()
    pool.join()
    return seq_cov
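# Usage sketch for generate_coverage(); paths are hypothetical, `mapping`
# is assumed to map each contig name to a grouping key (identity here,
# matching how mpile() compares chromosomes), and the function also relies
# on a project-local grouper() chunking helper.
contig_map = {name: name
              for name, _ in pyfastx.Fasta('ref.fa', build_index=False)}
cov = generate_coverage('reads_1.fq.gz', 'reads_2.fq.gz', contig_map,
                        'ref.fa', pwid=0.95, ncpu=4)
print({ctg: arr.mean() for ctg, arr in cov.items()})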