示例#1
0
def main(args):
    """
    Basecall a directory of reads and write the results.

    Loads the model named by ``args.model_directory``, optionally builds an
    aligner from ``args.reference``, feeds the reads through the model's own
    ``basecall`` function and hands the resulting iterator to a writer that is
    started and joined here. Progress and a throughput summary go to stderr.

    :param args: parsed command-line namespace. Attributes read here:
        ``save_ctc``, ``reference``, ``model_directory``, ``device``,
        ``weights``, ``reads_directory``, ``recursive``, ``read_ids``,
        ``skip``, ``fastq``, ``ctc_min_coverage`` and ``ctc_min_accuracy``.
    :raises SystemExit: with status 1 when CTC output is requested without a
        reference, or when the aligner object is falsy after construction.
    """
    # CTC training data is filtered using the aligner, so refuse to start
    # without a reference before doing any expensive loading.
    if args.save_ctc and not args.reference:
        sys.stderr.write("> a reference is needed to output ctc training data\n")
        # sys.exit rather than the interactive-only `exit` builtin from `site`.
        sys.exit(1)

    sys.stderr.write("> loading model\n")
    model = load_model(args.model_directory, args.device, weights=int(args.weights))

    aligner = None
    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        # A falsy aligner is treated as "index could not be loaded or built".
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)

    reads = get_reads(
        args.reads_directory, n_proc=8, recursive=args.recursive,
        read_ids=column_to_set(args.read_ids), skip=args.skip,
    )

    # The basecall entry point is looked up from the model directory.
    basecall = load_symbol(args.model_directory, "basecall")

    if args.save_ctc:
        # Reads with fewer than 3600 signal samples are dropped; the rest are
        # split into chunks which are basecalled individually.
        reads = (
            chunk for read in reads if len(read.signal) >= 3600 for chunk in read_chunks(read)
        )
        basecalls = basecall(model, reads, aligner=aligner, qscores=args.fastq, batchsize=64)
        writer = CTCWriter(
            tqdm(basecalls, desc="> calling", unit=" reads", leave=False),
            aligner, args.ctc_min_coverage, args.ctc_min_accuracy
        )
    else:
        basecalls = basecall(model, reads, aligner=aligner, qscores=args.fastq)
        writer = Writer(
            tqdm(basecalls, desc="> calling", unit=" reads", leave=False), aligner, fastq=args.fastq
        )

    # Time only the start/join of the writer, which consumes the lazy pipeline.
    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0

    # writer.log holds (read_id, num_samples) pairs; only the counts are summed.
    num_samples = sum(samples for _, samples in writer.log)

    sys.stderr.write("> completed reads: %s\n" % len(writer.log))
    sys.stderr.write("> duration: %s\n" % timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (num_samples / duration))
    sys.stderr.write("> done\n")
示例#2
0
def main(args):
    """
    Basecall a directory of reads, optionally calling modified bases and
    aligning, and write the results.

    Steps, in order: validate the CTC/reference combination, initialise via
    ``init(args.seed, args.device)``, download the model if it is a known
    model name that is not yet present in ``__models__``, load the model,
    optionally load a modified-base model and an aligner, pick the output
    format, read the inputs, run ``basecall`` and drive a writer to
    completion. Progress and a throughput summary go to stderr.

    :param args: parsed command-line namespace. Attributes read here:
        ``seed``, ``device``, ``model_directory``, ``weights``, ``chunksize``,
        ``overlap``, ``batchsize``, ``quantize``, ``verbose``,
        ``modified_base_model``, ``modified_bases``, ``reference``,
        ``save_ctc``, ``reads_directory``, ``recursive``, ``read_ids``,
        ``skip``, ``max_reads`` and ``revcomp``.
    :raises SystemExit: with status 1 when CTC output is requested without a
        reference, when the model cannot be found, when the aligner object is
        falsy after construction, or when a ``.mmi`` reference is combined
        with cram output.
    """
    # Fail fast: this check depends only on the arguments, so run it before
    # any download, model load or index build.
    if args.save_ctc and not args.reference:
        sys.stderr.write(
            "> a reference is needed to output ctc training data\n")
        # sys.exit rather than the interactive-only `exit` builtin from `site`.
        sys.exit(1)

    init(args.seed, args.device)

    # Known model name that is not yet in the local models directory.
    if args.model_directory in models and args.model_directory not in os.listdir(
            __models__):
        sys.stderr.write("> downloading model\n")
        File(__models__, models[args.model_directory]).download()

    sys.stderr.write(f"> loading model {args.model_directory}\n")
    try:
        model = load_model(
            args.model_directory,
            args.device,
            weights=int(args.weights),
            chunksize=args.chunksize,
            overlap=args.overlap,
            batchsize=args.batchsize,
            quantize=args.quantize,
            use_koi=True,
        )
    except FileNotFoundError:
        sys.stderr.write(f"> error: failed to load {args.model_directory}\n")
        sys.stderr.write("> available models:\n")
        # Distinct loop name so the `model` result variable is never shadowed.
        for available in sorted(models):
            sys.stderr.write(f" - {available}\n")
        sys.exit(1)

    # Basecaller settings are read from the loaded model's config from here on.
    basecaller_cfg = model.config["basecaller"]

    if args.verbose:
        sys.stderr.write(
            f"> model basecaller params: {basecaller_cfg}\n")

    # The basecall entry point is looked up from the model directory.
    basecall = load_symbol(args.model_directory, "basecall")

    mods_model = None
    if args.modified_base_model is not None or args.modified_bases is not None:
        sys.stderr.write("> loading modified base model\n")
        mods_model = load_mods_model(args.modified_bases, args.model_directory,
                                     args.modified_base_model)
        sys.stderr.write(f"> {mods_model[1]['alphabet_str']}\n")

    aligner = None
    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map', best_n=1)
        # A falsy aligner is treated as "index could not be loaded or built".
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            sys.exit(1)

    fmt = biofmt(aligned=args.reference is not None)

    if args.reference and args.reference.endswith(
            ".mmi") and fmt.name == "cram":
        sys.stderr.write(
            "> error: reference cannot be a .mmi when outputting cram\n")
        sys.exit(1)
    elif args.reference and fmt.name == "fastq":
        sys.stderr.write(
            f"> warning: did you really want {fmt.aligned} {fmt.name}?\n")
    else:
        sys.stderr.write(f"> outputting {fmt.aligned} {fmt.name}\n")

    # Read groups are only gathered for non-fastq output.
    if fmt.name != 'fastq':
        groups = get_read_groups(args.reads_directory,
                                 args.model_directory,
                                 n_proc=8,
                                 recursive=args.recursive,
                                 read_ids=column_to_set(args.read_ids),
                                 skip=args.skip,
                                 cancel=process_cancel())
    else:
        groups = []

    reads = get_reads(args.reads_directory,
                      n_proc=8,
                      recursive=args.recursive,
                      read_ids=column_to_set(args.read_ids),
                      skip=args.skip,
                      cancel=process_cancel())

    if args.max_reads:
        reads = take(reads, args.max_reads)

    if args.save_ctc:
        # CTC mode basecalls individual chunks rather than whole reads.
        reads = (chunk for read in reads for chunk in read_chunks(
            read,
            chunksize=basecaller_cfg["chunksize"],
            overlap=basecaller_cfg["overlap"]))
        ResultsWriter = CTCWriter
    else:
        ResultsWriter = Writer

    results = basecall(model,
                       reads,
                       reverse=args.revcomp,
                       batchsize=basecaller_cfg["batchsize"],
                       chunksize=basecaller_cfg["chunksize"],
                       overlap=basecaller_cfg["overlap"])

    # Optional post-processing stages wrap the basecall results in turn.
    if mods_model is not None:
        results = process_itemmap(partial(call_mods, mods_model), results)
    if aligner:
        results = align_map(aligner, results, n_thread=os.cpu_count())

    writer = ResultsWriter(
        fmt.mode,
        tqdm(results, desc="> calling", unit=" reads", leave=False),
        aligner=aligner,
        group_key=args.model_directory,
        ref_fn=args.reference,
        groups=groups,
    )

    # Time only the start/join of the writer, which consumes the pipeline.
    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0

    # writer.log holds (read_id, num_samples) pairs; only the counts are summed.
    num_samples = sum(samples for _, samples in writer.log)

    sys.stderr.write("> completed reads: %s\n" % len(writer.log))
    sys.stderr.write("> duration: %s\n" %
                     timedelta(seconds=np.round(duration)))
    sys.stderr.write("> samples per second %.1E\n" % (num_samples / duration))
    sys.stderr.write("> done\n")
示例#3
0
def main(args):
    """
    Duplex-call template/complement read pairs and write the results.

    The pairs come from one of two sources:

    * ``args.summary`` - a tab-separated summary table from which follow-on
      strands are detected with ``find_follow_on``; or
    * ``args.pairs`` - a delimited file of ``read_1``/``read_2`` ids, resolved
      to fast5 files through a read-id index that is either loaded from
      ``args.index`` or built from ``*.fast5`` files in
      ``args.reads_directory``.

    :param args: parsed command-line namespace. Attributes read here:
        ``model``, ``device``, ``reference``, ``summary``, ``reads_directory``,
        ``max_reads``, ``index``, ``save_index``, ``pairs`` and ``sep``.
    :raises SystemExit: with status 1 when the aligner object is falsy after
        construction or when no pairs remain after filtering.
    """
    sys.stderr.write("> loading model\n")
    model = load_model(args.model, args.device)

    aligner = None
    if args.reference:
        sys.stderr.write("> loading reference\n")
        aligner = Aligner(args.reference, preset='ont-map')
        # A falsy aligner is treated as "index could not be loaded or built".
        if not aligner:
            sys.stderr.write("> failed to load/build index\n")
            # sys.exit rather than the interactive-only `exit` builtin.
            sys.exit(1)

    if args.summary:
        sys.stderr.write("> finding follow on strands\n")
        # `sep` must be passed by keyword: pandas >= 2.0 makes every
        # read_csv argument after the path keyword-only.
        pairs = pd.read_csv(args.summary, sep='\t', low_memory=False)
        pairs = pairs[pairs.sequence_length_template.gt(0)]

        # Normalise alternative column names to the ones used below.
        if 'filename' in pairs.columns:
            pairs = pairs.rename(columns={'filename': 'filename_fast5'})
        if 'alignment_strand_coverage' in pairs.columns:
            pairs = pairs.rename(
                columns={'alignment_strand_coverage': 'alignment_coverage'})

        # Keep only rows whose fast5 file exists under the reads directory.
        valid_fast5s = [
            f for f in pairs.filename_fast5.unique()
            if ((args.reads_directory / Path(f)).exists())
        ]
        pairs = pairs[pairs.filename_fast5.isin(valid_fast5s)]
        pairs = find_follow_on(pairs)
        sys.stderr.write("> found %s follow strands in summary\n" %
                         (len(pairs) // 2))

        # NOTE(review): head() counts rows, not pairs, and rows are split
        # alternately below, so an odd max_reads leaves one more template row
        # than complement rows - confirm `call` tolerates that.
        if args.max_reads > 0:
            pairs = pairs.head(args.max_reads)

        # Even rows are taken as templates, odd rows as complements.
        temp_reads = pairs.iloc[0::2]
        comp_reads = pairs.iloc[1::2]
    else:
        if args.index is not None:
            sys.stderr.write("> loading read index\n")
            # Context manager so the index file handle is closed promptly.
            with open(args.index, 'r') as index_file:
                index = json.load(index_file)
        else:
            sys.stderr.write("> building read index\n")
            files = list(glob(os.path.join(args.reads_directory, '*.fast5')))
            index = build_index(files, n_proc=8)
            if args.save_index:
                with open('bonito-read-id.idx', 'w') as f:
                    json.dump(index, f)

        pairs = pd.read_csv(args.pairs,
                            sep=args.sep,
                            names=['read_1', 'read_2'])
        if args.max_reads > 0:
            pairs = pairs.head(args.max_reads)

        # Map each read id to its file; ids missing from the index become
        # null and the whole pair is dropped.
        pairs['file_1'] = pairs['read_1'].apply(index.get)
        pairs['file_2'] = pairs['read_2'].apply(index.get)
        pairs = pairs.dropna().reset_index()

        temp_reads = pairs[['read_1',
                            'file_1']].rename(columns={
                                'read_1': 'read_id',
                                'file_1': 'filename_fast5'
                            })
        comp_reads = pairs[['read_2',
                            'file_2']].rename(columns={
                                'read_2': 'read_id',
                                'file_2': 'filename_fast5'
                            })

    if len(pairs) == 0:
        print("> no matched pairs found in given directory", file=sys.stderr)
        sys.exit(1)

    # Construct and discard one batch with output suppressed; see
    # https://github.com/clara-parabricks/GenomeWorks/issues/648
    with devnull():
        CudaPoaBatch(1000, 1000, 3724032)

    basecalls = call(model,
                     args.reads_directory,
                     temp_reads,
                     comp_reads,
                     aligner=aligner)
    writer = Writer(tqdm(basecalls,
                         desc="> calling",
                         unit=" reads",
                         leave=False),
                    aligner,
                    duplex=True)

    # Time only the start/join of the writer, which consumes the pipeline.
    t0 = perf_counter()
    writer.start()
    writer.join()
    duration = perf_counter() - t0

    # writer.log holds (read_id, num_samples) pairs; only the counts are summed.
    num_samples = sum(samples for _, samples in writer.log)

    print("> duration: %s" % timedelta(seconds=np.round(duration)),
          file=sys.stderr)
    print("> samples per second %.1E" % (num_samples / duration),
          file=sys.stderr)