예제 #1
0
def prepare_input_file(in_out, **kwargs):
    """Create one training-data NetCDF file from a collection of fast5 reads.

    :param in_out: 3-tuple ``(path, in_list, output)``. ``path`` is given to
        ``iterate_fast5`` as its first positional argument, ``in_list`` is
        passed as its ``strand_list`` argument, and ``output`` is passed on as
        the ``netcdf_file`` to create.
    :param kwargs: forwarded unchanged to
        ``make_currennt_training_input_multi``.

    :returns: whatever ``make_currennt_training_input_multi`` returns.
    """
    path, in_list, output = in_out

    # The parenthesised single-argument form prints the same text on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print("Creating training data NetCDF: {}".format(output))
    # Materialise the iterator so a concrete list of files is handed on.
    fast5_files = list(iterate_fast5(path, paths=True, strand_list=in_list))
    return make_currennt_training_input_multi(fast5_files=fast5_files,
                                              netcdf_file=output,
                                              **kwargs)
예제 #2
0
def main():
    """Command-line entry point for 2D basecalling.

    Parses the command line (showing help when no arguments are given),
    runs ``process_read_2d`` over the input fast5 files through
    ``tang_imap`` and writes template, complement and 2D basecalls through
    three ``FastaWrite`` writers. When ``args.output_prefix`` is set the
    outputs are named ``<prefix>_<section>.<ext>``; otherwise all three use
    the ``'-'`` placeholder (presumably stdout -- confirm against
    ``FastaWrite``). A timing summary is written to stderr at the end.

    A read without a template result is skipped entirely; a read without a
    complement result contributes only its template call; 2D output is
    written only when template, complement and 2D results are all present.
    ``n_reads``, ``n_bases`` and ``n_events`` count template calls only.
    """
    if len(sys.argv) == 1:
        sys.argv.append("-h")
    args = get_parser().parse_args()

    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s', datefmt='%H:%M:%S', level=logging.DEBUG)
    if not args.debug:
        # logging.disable() takes a numeric level. Passing the string 'root'
        # only silenced logging through Python 2's str/int ordering and raises
        # on Python 3; CRITICAL suppresses every standard level explicitly.
        logging.disable(logging.CRITICAL)
    logging.info('Starting 2D basecalling.')

    modelfiles = {
        'template': os.path.abspath(args.template_model),
        'complement': os.path.abspath(args.complement_model)
    }

    #TODO: handle case where there are pre-existing files.
    if args.watch is not None:
        # An optional component
        from nanonet.watcher import Fast5Watcher
        fast5_files = Fast5Watcher(args.input, timeout=args.watch)
    else:
        fast5_files = iterate_fast5(args.input, paths=True, strand_list=args.strand_list, limit=args.limit, sort_by_size=None)

    fix_args = [
        modelfiles
    ]
    fix_kwargs = {a: getattr(args, a) for a in (
        'min_len', 'max_len', 'section',
        'event_detect', 'fast_decode',
        'write_events', 'opencl_2d', 'ed_params',
        'sloika_model'
    )}

    # Define worker functions
    mapper = tang_imap(
        process_read_2d, fast5_files,
        fix_args=fix_args, fix_kwargs=fix_kwargs,
        threads=args.jobs, unordered=True
    )

    def write_call(writer, read_name, sequence, qualities):
        """Write one basecall, including qualities only for fastq output."""
        if args.fastq:
            writer.write(read_name, sequence, qualities)
        else:
            writer.write(read_name, sequence)

    def rate(count, seconds):
        """Return thousands of items per second, or 0.0 if no time elapsed."""
        return count / 1000.0 / seconds if seconds else 0.0

    # Off we go
    n_reads = 0
    n_bases = 0
    n_events = 0
    n_bases_2d = 0
    # Accumulated seconds for: network, decoding, 2D calling.
    timings = [0.0, 0.0, 0.0]
    t0 = now()
    sections = ('template', 'complement', '2d')
    if args.output_prefix is not None:
        ext = 'fastq' if args.fastq else 'fasta'
        filenames = ['{}_{}.{}'.format(args.output_prefix, x, ext) for x in sections]
    else:
        filenames = ['-'] * 3

    with FastaWrite(filenames[0], args.fastq) as fasta_temp, FastaWrite(filenames[1], args.fastq) as fasta_comp, FastaWrite(filenames[2], args.fastq) as fasta_2d:
        for result in mapper:
            if result['template'] is None:
                continue
            data, elapsed = result['template']
            fname, basecall, _, n_ev = data
            basecall, quality = basecall
            name, _ = short_names(fname)
            write_call(fasta_temp, name, basecall, quality)
            n_reads += 1
            n_bases += len(basecall)
            n_events += n_ev
            # Template timings carry two entries; pad so the 2D slot is
            # left untouched by the element-wise sum.
            timings = [x + y for x, y in zip(timings, elapsed + (0.0,))]

            if result['complement'] is None:
                continue
            data, elapsed = result['complement']
            _, basecall, _, _ = data
            basecall, quality = basecall
            write_call(fasta_comp, name, basecall, quality)

            if result['2d'] is None:
                continue
            basecall, time_2d = result['2d']
            basecall, quality = basecall
            write_call(fasta_2d, name, basecall, quality)
            n_bases_2d += len(basecall)
            timings[2] += time_2d
    t1 = now()

    sys.stderr.write('Processed {} reads in {}s (wall time)\n'.format(n_reads, t1 - t0))
    if n_reads > 0:
        network, decoding, call_2d = timings
        sys.stderr.write(
            '1D Run network: {:6.2f} ({:6.3f} kb/s, {:6.3f} kev/s)\n'
            '1D Decoding:    {:6.2f} ({:6.3f} kb/s, {:6.3f} kev/s)\n'
            '2D calling:     {:6.2f} ({:6.3f} kb/s)\n'
            .format(
                network, rate(n_bases, network), rate(n_events, network),
                decoding, rate(n_bases, decoding), rate(n_events, decoding),
                call_2d, rate(n_bases_2d, call_2d)
            )
        )
예제 #3
0
def main():
    """Command-line entry point for 1D basecalling.

    Parses the command line (showing help when no arguments are given),
    optionally lists OpenCL platforms and exits, then basecalls the input
    fast5 files and writes the calls through ``FastaWrite`` to
    ``args.output``. Work is dispatched in one of three ways:

    * no ``args.platforms``: CPU only, via ``tang_imap`` with
      ``args.jobs`` threads;
    * exactly one worker overall (a single OpenCL device with CPU workers
      excluded): files are grouped into lists and the per-group results
      are flattened;
    * otherwise: a ``JobQueue`` shared between all workers.

    A summary of reads, bases, events and timings is written to stderr.

    :raises ImportError: if OpenCL platforms were requested but ``cl`` is
        ``None``.
    """
    if len(sys.argv) == 1:
        sys.argv.append("-h")
    args = get_parser().parse_args()

    if args.list_platforms:
        list_opencl_platforms()
        sys.exit(0)

    modelfile = os.path.abspath(args.model)
    if args.section is None:
        try:
            args.section = np.load(modelfile).item().meta['section']
        except Exception:
            # Deliberately broad: any failure to read the section from the
            # model is reported the same way. A bare ``except:`` would also
            # swallow KeyboardInterrupt and SystemExit.
            sys.stderr.write(
                "No 'section' found in modelfile, try specifying --section.\n")
            sys.exit(1)

    #TODO: handle case where there are pre-existing files.
    if args.watch is not None:
        # An optional component
        from nanonet.watcher import Fast5Watcher
        initial_jobs = iterate_fast5(args.input, paths=True)
        fast5_files = Fast5Watcher(args.input,
                                   timeout=args.watch,
                                   initial_jobs=initial_jobs)
    else:
        sort_by_size = 'desc' if args.platforms is not None else None
        fast5_files = iterate_fast5(args.input,
                                    paths=True,
                                    strand_list=args.strand_list,
                                    limit=args.limit,
                                    sort_by_size=sort_by_size)

    fix_args = [modelfile]
    fix_kwargs = {
        a: getattr(args, a)
        for a in ('min_len', 'max_len', 'section', 'event_detect',
                  'fast_decode', 'write_events', 'ed_params', 'sloika_model')
    }

    # Define worker functions: each entry is (callable, files-per-call),
    # with None as the files-per-call for CPU workers.
    workers = []
    if not args.exc_opencl:
        cpu_function = partial(process_read, *fix_args, **fix_kwargs)
        workers.extend([(cpu_function, None)] * args.jobs)
    if args.platforms is not None:
        if cl is None:
            raise ImportError('pyopencl is not installed, install with pip.')
        for platform in args.platforms:
            # Each platform specification has the form vendor:device_id:n_files
            vendor, device_id, n_files = platform.split(':')
            pa = ProcessAttr(use_opencl=True,
                             vendor=vendor,
                             device_id=int(device_id))
            fargs = fix_args + [pa]
            opencl_function = partial(process_read_opencl, *fargs,
                                      **fix_kwargs)
            workers.append((opencl_function, int(n_files)))

    # Select how to spread load
    if args.platforms is None:
        # just CPU
        if not workers:
            # CPU workers were excluded (or zero jobs requested) and no
            # OpenCL platform was given: fail clearly instead of IndexError.
            sys.stderr.write(
                'No workers configured: enable CPU workers (with at least '
                'one job) or specify an OpenCL platform.\n')
            sys.exit(1)
        worker, _ = workers[0]
        mapper = tang_imap(worker,
                           fast5_files,
                           threads=args.jobs,
                           unordered=True)
    elif len(workers) == 1:
        # single opencl device
        #    need to wrap files in lists, and unwrap results
        worker, n_files = workers[0]
        fast5_files = group_by_list(fast5_files, [n_files])
        # Lazy generator rather than itertools.imap, which is Python 2 only.
        mapper = itertools.chain.from_iterable(
            worker(file_group) for file_group in fast5_files)
    else:
        # Heterogeneous compute
        mapper = JobQueue(fast5_files, workers)

    def rate(count, seconds):
        """Return thousands of items per second, or 0.0 if no time elapsed."""
        return count / 1000.0 / seconds if seconds else 0.0

    # Off we go
    n_reads = 0
    n_bases = 0
    n_events = 0
    # Accumulated seconds for: network, decoding.
    timings = [0.0, 0.0]
    t0 = now()
    with FastaWrite(args.output, args.fastq) as fasta:
        for result in mapper:
            if result is None:
                continue
            data, elapsed = result
            fname, call_data, _, n_ev = data
            name, _ = short_names(fname)
            basecall, quality = call_data
            if args.fastq:
                fasta.write(name, basecall, quality)
            else:
                fasta.write(name, basecall)
            n_reads += 1
            n_bases += len(basecall)
            n_events += n_ev
            timings = [x + y for x, y in zip(timings, elapsed)]
    t1 = now()
    sys.stderr.write(
        'Basecalled {} reads ({} bases, {} events) in {}s (wall time)\n'.
        format(n_reads, n_bases, n_events, t1 - t0))
    if n_reads > 0:
        network, decoding = timings
        sys.stderr.write(
            'Run network: {:6.2f} ({:6.3f} kb/s, {:6.3f} kev/s)\n'
            'Decoding:    {:6.2f} ({:6.3f} kb/s, {:6.3f} kev/s)\n'.format(
                network,
                rate(n_bases, network),
                rate(n_events, network),
                decoding,
                rate(n_bases, decoding),
                rate(n_events, decoding),
            ))