def run(self):
    """Producer loop: enumerate every fast5 file in the watched directory,
    push each (read_id, raw_data) pair onto the queue, and signal the end
    of the stream with a single None sentinel.
    """
    fast5_files = glob("%s/*fast5" % self.directory)
    progress = tqdm(fast5_files, ascii=True, ncols=100)
    for fast5_path in progress:
        # get_raw_data yields (read_id, raw_data) pairs per file.
        for read_id, raw_data in get_raw_data(fast5_path):
            self.queue.put((read_id, raw_data))
    # Sentinel so the consumer knows no more reads are coming.
    self.queue.put(None)
def run(self):
    """Feed every read from each fast5 file in self.directory into the
    queue, then terminate the stream with a None sentinel.

    The progress bar is transient (leave=False), so it is erased once
    the directory scan completes.
    """
    file_list = glob("%s/*fast5" % self.directory)
    for fast5_path in tqdm(file_list, ascii=True, ncols=100, leave=False):
        for item in get_raw_data(fast5_path):
            self.queue.put(item)
    # End-of-stream marker for the consumer side.
    self.queue.put(None)
def basecall(rank, total_gpu, args, input_files):
    """Basecall the shard of fast5 files assigned to this distributed rank.

    Loads the model onto the GPU identified by ``rank`` (used directly as
    the CUDA device id), wraps it in DistributedDataParallel, runs inference
    on every read in ``input_files[rank]``, and writes per-read kmer means
    to an HDF5 file and called sequences to a FASTA file, both named
    ``{prefix}_{device_id}`` under ``args.output_directory``.

    Args:
        rank: Process rank in the distributed group; doubles as CUDA device id.
        total_gpu: Total number of processes/GPUs participating.
        args: Parsed CLI options (model_directory, device, weights, half,
            output_directory, prefix, kmer_length, beamsize).
        input_files: Per-rank lists of fast5 paths; this process consumes
            ``input_files[rank]``.
    """
    setup(rank, total_gpu)
    device_id = rank
    sys.stderr.write("INFO: LOADING MODEL ON DEVICE: {}\n".format(device_id))
    model = load_model(args.model_directory, args.device,
                       weights=int(args.weights), half=args.half)
    alphabet = model.alphabet
    torch.cuda.set_device(device_id)
    model.to(device_id)
    model.eval()
    model = DDP(model, device_ids=[device_id])
    sys.stderr.write("INFO: LOADED MODEL ON DEVICE: {}\n".format(device_id))

    samples = 0     # total raw signal samples seen
    num_reads = 0   # total reads decoded
    num_files = 0   # fast5 files completed (for progress reporting)
    dtype = np.float16 if args.half else np.float32
    my_files = input_files[rank]
    sys.stderr.write('No of files:{}, index: {}'.format(len(my_files), rank))

    hdf5_path = '{}/{}_{}.hdf5'.format(args.output_directory, args.prefix, device_id)
    fasta_path = '{}/{}_{}.fasta'.format(args.output_directory, args.prefix, device_id)

    t0 = time.perf_counter()
    sys.stderr.write("STARTING INFERENCE\n")
    st = time.time()

    # Context managers guarantee both output files are flushed and closed
    # even if decoding raises; the original code leaked both handles.
    with h5py.File(hdf5_path, 'w') as hdf5_file, open(fasta_path, 'w') as fasta_file:
        hdf5_file.create_group('Reads')
        reads = hdf5_file['Reads']
        with torch.no_grad():
            for fast5 in my_files:
                for read_id, raw_data in get_raw_data(fast5):
                    num_reads += 1
                    samples += len(raw_data)
                    signal_data = raw_data
                    # Add batch and channel dims: (N,) -> (1, 1, N).
                    raw_data = raw_data[np.newaxis, np.newaxis, :].astype(dtype)
                    gpu_data = torch.tensor(raw_data).to(args.device)
                    posteriors = model(gpu_data).exp().cpu().numpy().squeeze()
                    sequence, means = decode_revised(posteriors, alphabet, signal_data,
                                                     args.kmer_length, args.beamsize)
                    if len(means) > 0:
                        reads.create_group(read_id)
                        reads[read_id]['means'] = means
                        fasta_file.write(">%s\n" % read_id)
                        fasta_file.write("%s\n" % os.linesep.join(wrap(sequence, 100)))
                num_files += 1
                ct = time.time()
                # Report files completed out of THIS rank's shard. The original
                # compared read count against len(input_files), i.e. the number
                # of ranks — a meaningless ratio — and the literal was broken
                # across a physical line.
                sys.stderr.write("\nINFO: FINISHED PROCESSING: {}/{} FILES. "
                                 "DEVICE: {} ELAPSED TIME: {}".format(
                                     num_files, len(my_files), device_id, ct - st))

    t1 = time.perf_counter()
    sys.stderr.write("INFO: TOTAL READS: %s\n" % num_reads)
    sys.stderr.write("INFO: TOTAL DURATION %.1E\n" % (t1 - t0))
    # Fixed: the original divided num_reads by elapsed time while labelling
    # the metric "SAMPLES PER SECOND"; use the samples counter it maintained.
    sys.stderr.write("INFO: SAMPLES PER SECOND %.1E\n" % (samples / (t1 - t0)))
    sys.stderr.write("DONE\n")
    cleanup()