Example #1
File: io.py Project: lcerdeira/bonito
 def run(self):
     # Enqueue every read from every fast5 file in the directory,
     # then post a None sentinel to signal end of stream.
     for fast5 in tqdm(glob("%s/*fast5" % self.directory),
                       ascii=True,
                       ncols=100):
         for read_id, raw_data in get_raw_data(fast5):
             self.queue.put((read_id, raw_data))
     self.queue.put(None)
Example #2
File: io.py Project: TimD1/bonito
 def run(self):
     # Same producer as Example #1, except the progress bar is cleared on
     # completion (leave=False) and the queue carries whole read objects
     # rather than (read_id, raw_data) tuples.
     for fast5 in tqdm(glob("%s/*fast5" % self.directory),
                       ascii=True,
                       ncols=100,
                       leave=False):
         for read in get_raw_data(fast5):
             self.queue.put(read)
     self.queue.put(None)
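
Both examples implement the producer half of a producer/consumer pipeline: reads are pushed onto a shared queue and a final None acts as the end-of-stream sentinel. A minimal consumer sketch, assuming a multiprocessing queue carrying the (read_id, raw_data) tuples of Example #1 (consume is hypothetical, not bonito code):

import multiprocessing as mp

def consume(queue):
    # Drain the queue until the None sentinel posted by run() arrives.
    while True:
        item = queue.get()
        if item is None:
            break
        read_id, raw_data = item
        print(read_id, len(raw_data))  # placeholder for real basecalling work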
Example #3
# Imports assumed by this example; setup, cleanup, load_model, get_raw_data
# and decode_revised come from the surrounding project.
import os
import sys
import time
from textwrap import wrap

import h5py
import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

def basecall(rank, total_gpu, args, input_files):
    # One process per GPU: initialize the distributed process group.
    setup(rank, total_gpu)

    device_id = rank
    sys.stderr.write("INFO: LOADING MODEL ON DEVICE: {}\n".format(device_id))
    model = load_model(args.model_directory, args.device, weights=int(args.weights), half=args.half)
    alphabet = model.alphabet
    torch.cuda.set_device(device_id)
    model.to(device_id)
    model.eval()
    # Wrap the model for distributed use, pinned to this rank's device.
    model = DDP(model, device_ids=[device_id])
    sys.stderr.write("INFO: LOADED MODEL ON DEVICE: {}\n".format(device_id))

    samples = 0
    num_reads = 0
    num_files = 0
    max_read_size = 1e9  # kept from the original; not used below
    dtype = np.float16 if args.half else np.float32
    sys.stderr.write("INFO: NO. OF FILES: {}, RANK: {}\n".format(len(input_files[rank]), rank))
    # Per-device outputs: per-read kmer means in HDF5, called sequences in FASTA.
    hdf5_file = h5py.File('{}/{}_{}.hdf5'.format(args.output_directory, args.prefix, device_id), 'w')
    hdf5_file.create_group('Reads')
    reads = hdf5_file['Reads']
    fasta_file = open('{}/{}_{}.fasta'.format(args.output_directory, args.prefix, device_id), 'w')

    t0 = time.perf_counter()
    sys.stderr.write("STARTING INFERENCE\n")
    with torch.no_grad():
        for fast5 in input_files[device_id]:
            num_files += 1
            for read_id, raw_data in get_raw_data(fast5):
                num_reads += 1
                samples += len(raw_data)
                signal_data = raw_data

                # Shape the signal to (batch=1, channel=1, length) and run the network.
                raw_data = raw_data[np.newaxis, np.newaxis, :].astype(dtype)
                gpu_data = torch.tensor(raw_data).to(args.device)
                posteriors = model(gpu_data).exp().cpu().numpy().squeeze()

                sequence, means = decode_revised(posteriors, alphabet, signal_data, args.kmer_length, args.beamsize)
                if len(means) > 0:
                    # Store the per-kmer signal means for this read.
                    reads.create_group(read_id)
                    reads[read_id]['means'] = means
                fasta_file.write(">%s\n" % read_id)
                fasta_file.write("%s\n" % os.linesep.join(wrap(sequence, 100)))

            ct = time.perf_counter()
            sys.stderr.write("\nINFO: FINISHED PROCESSING: {}/{} FILES. DEVICE: {} ELAPSED TIME: {:.1f}s\n".format(
                num_files, len(input_files[device_id]), device_id, ct - t0))

    t1 = time.perf_counter()
    sys.stderr.write("INFO: TOTAL READS: %s\n" % num_reads)
    sys.stderr.write("INFO: TOTAL DURATION %.1E\n" % (t1 - t0))
    sys.stderr.write("INFO: SAMPLES PER SECOND %.1E\n" % (samples / (t1 - t0)))
    sys.stderr.write("DONE\n")

    # Flush the per-device outputs, then tear down the process group.
    hdf5_file.close()
    fasta_file.close()
    cleanup()
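
basecall's (rank, total_gpu, args, input_files) signature follows the torch.multiprocessing.spawn convention, where the rank is passed as the first argument automatically. A sketch of a possible launcher, assuming setup()/cleanup() create and destroy the process group; the round-robin file split is an illustration, not the project's actual scheme:

import torch
import torch.multiprocessing as mp

def main(args, all_files):
    world_size = torch.cuda.device_count()
    # Split the fast5 files so input_files[rank] holds each device's share.
    input_files = [all_files[i::world_size] for i in range(world_size)]
    mp.spawn(basecall, args=(world_size, args, input_files), nprocs=world_size, join=True)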