def get_parser():
    """Build the argument parser for mapping a sequence to a current trace.

    Returns:
        :argparse:`ArgumentParser`: configured parser for the squiggle
        mapping tool.
    """
    parser = argparse.ArgumentParser(
        description='Map sequence to current trace using squiggle ' +
        'predictor model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(
        parser, "limit jobs output recursive version".split())
    parser.add_argument(
        '--back_prob', default=1e-15, metavar='probability',
        type=proportion, help='Probability of backwards move')
    parser.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing subset')
    parser.add_argument(
        '--localpen', default=None, type=Maybe(NonNegative(float)),
        help='Penalty for staying in start and end states, or None to ' +
        'disable them')
    parser.add_argument(
        '--minscore', default=None, type=Maybe(NonNegative(float)),
        help='Minimum score for matching')
    parser.add_argument(
        '--trim', default=(200, 10), nargs=2, type=NonNegative(int),
        metavar=('beginning', 'end'),
        help='Number of samples to trim off start and end')
    parser.add_argument('model', action=FileExists, help='Model file')
    parser.add_argument('references', action=FileExists, help='Fasta file')
    parser.add_argument('read_dir', action=FileExists,
                        help='Directory for fast5 reads')
    # Bug fix: the parser was constructed but never returned, so callers
    # received None. All sibling get_parser() functions return the parser.
    return parser
def get_parser():
    """Construct the command-line parser for flip-flop network training.

    Returns:
        :argparse:`ArgumentParser`: parser exposing all training options.
    """
    p = argparse.ArgumentParser(
        description='Train a flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Options shared with the other taiyaki command-line tools.
    common = ("adam alphabet device eps limit niteration outdir overwrite "
              "quiet save_every version weight_decay")
    add_common_command_args(p, common.split())

    p.add_argument(
        '--batch_size', default=128, metavar='chunks', type=Positive(int),
        help='Number of chunks to run in parallel')
    p.add_argument(
        '--gradient_cap_fraction', default=0.05, metavar='f',
        type=Maybe(NonNegative(float)),
        help='Cap L2 norm of gradient so that a fraction f of gradients '
             'are capped. Use --gradient_cap_fraction None for no capping.')
    p.add_argument(
        '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float),
        help='Initial learning rate')
    p.add_argument(
        '--size', default=96, metavar='neurons', type=Positive(int),
        help='Base layer size for model')
    p.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    p.add_argument(
        '--stride', default=2, metavar='samples', type=Positive(int),
        help='Stride for model')
    p.add_argument(
        '--winlen', default=19, type=Positive(int),
        help='Length of window over data')

    # Positional arguments.
    p.add_argument('model', action=FileExists,
                   help='File to read python model description from')
    p.add_argument('chunks', action=FileExists,
                   help='file containing chunks')
    p.add_argument('reference', action=FileExists,
                   help='file containing fasta reference')
    return p
def get_parser():
    """Return an argument parser for dumping a model as JSON.

    Returns:
        :argparse:`ArgumentParser`: configured parser.
    """
    p = argparse.ArgumentParser(
        description='Dump JSON representation of model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(p, ["output"])
    p.add_argument('model', action=FileExists, help='Model checkpoint')
    return p
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description="Prepare data for model training and save to hdf5 file " +
        "by remapping with flip-flop model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(
        parser, ('alphabet input_folder input_strand_list jobs limit ' +
                 'overwrite recursive version').split())
    parser.add_argument('--localpen', metavar='penalty', default=0.0,
                        type=float, help='Penalty for local mapping')
    parser.add_argument(
        '--max_read_length', metavar='bases', default=None, type=Maybe(int),
        help='Don\'t attempt remapping for reads longer than this')
    parser.add_argument('--mod', nargs=3,
                        metavar=('mod_base', 'canonical_base',
                                 'mod_long_name'),
                        default=[], action='append',
                        help='Modified base description')
    # Fixed typos in user-facing help text: "in order access any read
    # potentailly" -> "in order to access any read, potentially".
    parser.add_argument(
        '--batch_format', action='store_true',
        help='Output batched mapped signal file format. This can ' +
        'significantly improve I/O performance and use less ' +
        'disk space. An entire batch must be loaded into memory in order ' +
        'to access any read, potentially increasing RAM requirements.')
    parser.add_argument('input_per_read_params', action=FileExists,
                        help='Input per read parameter .tsv file')
    parser.add_argument('output', help='Output HDF5 file')
    parser.add_argument('model', action=FileExists,
                        help='Taiyaki model file')
    parser.add_argument(
        'references', action=FileExists,
        help='Single fasta file containing references for each read')
    return parser
def get_parser():
    """Build the command-line parser for the basecalling tool.

    Returns:
        :argparse:`ArgumentParser`: configured parser.
    """
    p = argparse.ArgumentParser(
        description="Basecall reads using a taiyaki model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(
        p, ("alphabet device input_folder input_strand_list jobs limit "
            "output quiet recursive version").split())

    # NOTE(review): type=(int, bool) is not a normal argparse callable; it
    # appears to be interpreted by the custom ParseToNamedTuple action.
    p.add_argument(
        '--beam', default=None, metavar=('width', 'guided'), nargs=2,
        type=(int, bool), action=ParseToNamedTuple,
        help='Use beam search decoding')
    p.add_argument(
        "--chunk_size", type=Positive(int), metavar="blocks",
        default=basecall_helpers._DEFAULT_CHUNK_SIZE,
        help="Size of signal chunks sent to GPU is chunk_size * model stride")
    p.add_argument(
        '--fastq', default=False, action=AutoBool,
        help='Write output in fastq format (default is fasta)')
    p.add_argument(
        "--max_concurrent_chunks", type=Positive(int), default=128,
        help="Maximum number of chunks to call at once. "
             "Lower values will consume less (GPU) RAM.")
    p.add_argument(
        "--overlap", type=NonNegative(int), metavar="blocks",
        default=basecall_helpers._DEFAULT_OVERLAP,
        help="Overlap between signal chunks sent to GPU")
    p.add_argument(
        '--posterior', default=True, action=AutoBool,
        help='Use posterior-viterbi decoding')
    p.add_argument(
        "--qscore_offset", type=float, default=0.0,
        help="Offset to apply to q scores in fastq (after scale)")
    p.add_argument(
        "--qscore_scale", type=float, default=1.0,
        help="Scaling factor to apply to q scores in fastq")
    p.add_argument(
        '--reverse', default=False, action=AutoBool,
        help='Reverse sequences in output')
    p.add_argument(
        '--scaling', action=FileExists, default=None,
        help='Path to TSV containing per-read scaling params')
    p.add_argument(
        '--temperature', default=1.0, type=float,
        help='Scaling factor applied to network outputs before decoding')
    p.add_argument(
        "model", action=FileExists,
        help="Model checkpoint file to use for basecalling")
    return p
def get_parser():
    """Return the parser for upgrading a mapped-signal HDF5 file.

    Returns:
        :argparse:`ArgumentParser`: configured parser.
    """
    p = argparse.ArgumentParser(
        description='Upgrade mapped signal HDF5 file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(p, ['version'])
    p.add_argument('input', action=FileExists,
                   help='Mapped signal to read from')
    p.add_argument('output', action=FileAbsent,
                   help='Name for output upgraded mapped signal file')
    return p
def get_parser():
    """Return the argument parser for this tool.

    Returns:
        :argparse:`ArgumentParser`: configured parser.
    """
    p = argparse.ArgumentParser()
    common = ("input_folder input_strand_list limit output "
              "recursive version jobs")
    add_common_command_args(p, common.split())
    p.add_argument('--trim', default=(200, 50), nargs=2,
                   type=NonNegative(int), metavar=('beginning', 'end'),
                   help='Number of samples to trim off start and end')
    return p
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    p = argparse.ArgumentParser(
        description='Predict squiggle from sequence',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(p, "output version".split())
    p.add_argument('model', action=FileExists, help='Model file')
    p.add_argument('input', action=FileExists, help='Fasta file')
    return p
def get_parser():
    """Build the parser for extracting per-read references from SAM/BAM.

    Returns:
        :argparse:`ArgumentParser`: configured parser.
    """
    p = argparse.ArgumentParser(
        description='Extract reference sequence for each read from a SAM ' +
        'alignment file',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(p, ["output"])
    p.add_argument('--complement', default=False, action=AutoBool,
                   help='Complement all reference sequences')
    p.add_argument('--input_strand_list', default=None, action=FileExists,
                   help='Strand summary file containing subset')
    p.add_argument(
        '--min_coverage', metavar='proportion', default=0.6, type=proportion,
        help='Ignore reads with alignments shorter than min_coverage * '
             'read length')
    p.add_argument(
        '--pad', type=int, default=0,
        help='Number of bases by which to pad reference sequence')
    p.add_argument('--reverse', default=False, action=AutoBool,
                   help='Reverse all reference sequences (for RNA)')
    p.add_argument(
        'reference', action=FileExists,
        help="Genomic references that reads were aligned against")
    p.add_argument(
        'input', metavar='input.sam', nargs='+',
        help="SAM or BAM file(s) containing read alignments to reference")
    return p
COMPATIBLE_LAYERS = set(( 'convolution', 'LSTM', 'GruMod', 'reverse', 'GlobalNormTwoState', 'GlobalNormTwoStateCatMod')) parser = argparse.ArgumentParser( description='Convert JSON representation of model to pytorch checkpoint ' + 'for use within taiyaki/megalodon.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) add_common_command_args( parser, ["output"]) parser.add_argument( 'json_model', action=FileExists, help='JSON model with params') def set_params_gru(layer, params_name, jsn_params, layer_params): # convert from guppy format back to pytorch format if re.search('weight_ih', params_name) and 'iW' in jsn_params: jsn_layer_params = torch.Tensor(np.concatenate([ jsn_params['iW'][1], jsn_params['iW'][0], jsn_params['iW'][2]])) elif re.search('weight_hh', params_name) and 'sW' in jsn_params: jsn_layer_params = torch.Tensor(np.concatenate([ jsn_params['sW'][1], jsn_params['sW'][0], jsn_params['sW'][2]])) elif re.search('bias_ih', params_name) and 'b' in jsn_params: jsn_layer_params = torch.Tensor(np.concatenate([
import time import torch from torch.optim.lr_scheduler import CosineAnnealingLR from taiyaki import ctc, flipflopfings, helpers from taiyaki.cmdargs import FileExists, Positive from taiyaki.common_cmdargs import add_common_command_args # This is here, not in main to allow documentation to be built parser = argparse.ArgumentParser( description='Train a flip-flop neural network', formatter_class=argparse.ArgumentDefaultsHelpFormatter) add_common_command_args(parser, """adam alphabet device eps limit niteration outdir overwrite quiet save_every version""".split()) parser.add_argument('--batch_size', default=128, metavar='chunks', type=Positive(int), help='Number of chunks to run in parallel') parser.add_argument( '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float), help='Initial learning rate') parser.add_argument('--size', default=96, metavar='neurons', type=Positive(int), help='Base layer size for model') parser.add_argument('--seed', default=None, metavar='integer', type=Positive(int), help='Set random number seed') parser.add_argument('--stride', default=2, metavar='samples', type=Positive(int), help='Stride for model') parser.add_argument('--winlen', default=19, type=Positive(int), help='Length of window over data') parser.add_argument('model', action=FileExists,
import torch

from taiyaki import (alphabet, chunk_selection, constants, ctc,
                     flipflopfings, helpers, layers, mapped_signal_files,
                     maths, optim)
from taiyaki.cmdargs import (AutoBool, FileExists, Maybe, NonNegative,
                             Positive)
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.constants import DOTROWLENGTH


# Parser is built at module level (not in main) so documentation tooling
# can introspect it.
parser = argparse.ArgumentParser(
    description='Train flip-flop neural network',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

add_common_command_args(
    parser,
    ("adam device eps filter_max_dwell filter_mean_dwell limit "
     "lr_cosine_iters niteration outdir overwrite quiet save_every "
     "sample_nreads_before_filtering version weight_decay").split())

parser.add_argument(
    '--chunk_len_min', default=2000, metavar='samples', type=Positive(int),
    help='Min length of each chunk in samples'
         ' (chunk lengths are random between min and max)')
parser.add_argument(
    '--chunk_len_max', default=4000, metavar='samples', type=Positive(int),
    help='Max length of each chunk in samples '
         '(chunk lengths are random between min and max)')
import numpy as np import os import sys from taiyaki import alphabet, bio, fast5utils, helpers, prepare_mapping_funcs from taiyaki.cmdargs import FileExists, Maybe from taiyaki.common_cmdargs import add_common_command_args from taiyaki.iterators import imap_mp program_description = "Prepare data for model training and save to hdf5 file by remapping with flip-flop model" parser = argparse.ArgumentParser( description=program_description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) add_common_command_args( parser, 'alphabet input_folder input_strand_list jobs limit overwrite recursive version' .split()) parser.add_argument('--localpen', metavar='penalty', default=0.0, type=float, help='Penalty for local mapping') parser.add_argument('--max_read_length', metavar='bases', default=None, type=Maybe(int), help='Don\'t attempt remapping for reads longer than this') parser.add_argument('--mod', nargs=3, metavar=('base', 'canonical', 'name'),
from taiyaki import (chunk_selection, ctc, flipflopfings, helpers, mapped_signal_files, optim) from taiyaki import __version__ from taiyaki.cmdargs import FileExists, Positive from taiyaki.common_cmdargs import add_common_command_args from taiyaki.constants import DOTROWLENGTH # This is here, not in main to allow documentation to be built parser = argparse.ArgumentParser( description='Train a flip-flop neural network', formatter_class=argparse.ArgumentDefaultsHelpFormatter) add_common_command_args( parser, """adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell limit lr_cosine_iters niteration overwrite quiet save_every sample_nreads_before_filtering version weight_decay""" .split()) parser.add_argument( '--chunk_len_min', default=2000, metavar='samples', type=Positive(int), help= 'Min length of each chunk in samples (chunk lengths are random between min and max)' ) parser.add_argument( '--chunk_len_max', default=4000, metavar='samples',
#!/usr/bin/env python3 import argparse from Bio import SeqIO import numpy as np import torch from taiyaki import helpers, squiggle_match from taiyaki.cmdargs import FileExists from taiyaki.common_cmdargs import add_common_command_args parser = argparse.ArgumentParser( description='Predict squiggle from sequence', formatter_class=argparse.ArgumentDefaultsHelpFormatter) add_common_command_args(parser, "output version".split()) parser.add_argument('model', action=FileExists, help='Model file') parser.add_argument('input', action=FileExists, help='Fasta file') def main(): args = parser.parse_args() predict_squiggle = helpers.load_model(args.model) with helpers.open_file_or_stdout(args.output) as fh: for seq in SeqIO.parse(args.input, 'fasta'): seqstr = str(seq.seq) embedded_seq_numpy = np.expand_dims( squiggle_match.embed_sequence(seqstr), axis=1) embedded_seq_torch = torch.tensor(embedded_seq_numpy,
import os import sys from ont_fast5_api import fast5_interface from taiyaki.cmdargs import NonNegative from taiyaki.common_cmdargs import add_common_command_args import taiyaki.fast5utils as fast5utils from taiyaki.helpers import open_file_or_stdout from taiyaki.iterators import imap_mp from taiyaki.maths import med_mad from taiyaki.signal import Signal parser = argparse.ArgumentParser() add_common_command_args( parser, 'input_folder input_strand_list limit output recursive version jobs'.split( )) parser.add_argument('--trim', default=(200, 50), nargs=2, type=NonNegative(int), metavar=('beginning', 'end'), help='Number of samples to trim off start and end') def one_read_shift_scale(read_tuple): read_filename, read_id = read_tuple try:
#!/usr/bin/env python3 import argparse from taiyaki import common_cmdargs, fast5utils, helpers, squiggle_match from taiyaki.cmdargs import (display_version_and_exit, FileExists, Maybe, NonNegative, Positive, proportion) from taiyaki.iterators import imap_mp from taiyaki import __version__ parser = argparse.ArgumentParser( description='Map sequence to current trace using squiggle predictor model', formatter_class=argparse.ArgumentDefaultsHelpFormatter) common_cmdargs.add_common_command_args(parser, "limit jobs version".split()) parser.add_argument('--back_prob', default=1e-15, metavar='probability', type=proportion, help='Probability of backwards move') parser.add_argument('--input_strand_list', default=None, action=FileExists, help='Strand summary file containing subset') parser.add_argument( '--localpen', default=None, type=Maybe(NonNegative(float)), help='Penalty for staying in start and end states, or None to disable them' ) parser.add_argument('--minscore',
#!/usr/bin/env python3
import argparse

from taiyaki import fast5utils, helpers, squiggle_match
from taiyaki.cmdargs import FileExists, Maybe, NonNegative, proportion
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.iterators import imap_mp


# Command-line interface for mapping sequences onto raw current traces.
parser = argparse.ArgumentParser(
    description='Map sequence to current trace using squiggle predictor model',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

add_common_command_args(parser, "limit jobs output recursive version".split())

parser.add_argument(
    '--back_prob', default=1e-15, metavar='probability', type=proportion,
    help='Probability of backwards move')
parser.add_argument(
    '--input_strand_list', default=None, action=FileExists,
    help='Strand summary file containing subset')
parser.add_argument(
    '--localpen', default=None, type=Maybe(NonNegative(float)),
    help='Penalty for staying in start and end states, or None to disable them')
parser.add_argument(
    '--minscore', default=None, type=Maybe(NonNegative(float)),
    help='Minimum score for matching')
parser.add_argument(
    '--trim', default=(200, 10), nargs=2, type=NonNegative(int),
    metavar=('beginning', 'end'),
    help='Number of samples to trim off start and end')

parser.add_argument('model', action=FileExists, help='Model file')
parser.add_argument('references', action=FileExists, help='Fasta file')
parser.add_argument('read_dir', action=FileExists,
                    help='Directory for fast5 reads')
from taiyaki.decode import flipflop_make_trans, flipflop_viterbi from taiyaki.flipflopfings import extract_mod_weights, nstate_flipflop, path_to_str from taiyaki.helpers import (guess_model_stride, load_model, open_file_or_stdout, Progress) from taiyaki.maths import med_mad from taiyaki.prepare_mapping_funcs import get_per_read_params_dict_from_tsv from taiyaki.signal import Signal STITCH_BEFORE_VITERBI = False parser = argparse.ArgumentParser( description="Basecall reads using a taiyaki model", formatter_class=argparse.ArgumentDefaultsHelpFormatter) add_common_command_args( parser, """alphabet device input_folder input_strand_list limit output quiet recursive version""".split()) parser.add_argument( "--chunk_size", type=Positive(int), metavar="blocks", default=basecall_helpers._DEFAULT_CHUNK_SIZE, help="Size of signal chunks sent to GPU is chunk_size * model stride") parser.add_argument('--fastq', default=False, action=AutoBool, help='Write output in fastq format (default is fasta)') parser.add_argument("--max_concurrent_chunks", type=Positive(int), default=128,
#!/usr/bin/env python3 # Combine mapped-read files in HDF5 format into a single file import argparse from taiyaki import mapped_signal_files from taiyaki.cmdargs import Positive from taiyaki.common_cmdargs import add_common_command_args parser = argparse.ArgumentParser( description='Combine HDF5 mapped-read files into a single file') add_common_command_args(parser, ['version']) parser.add_argument('output', help='Output filename') parser.add_argument('input', nargs='+', help='One or more input files') #To convert to any new mapped read format (e.g. mapped_signal_files.SQL) #we should be able to just change MAPPED_READ_CLASS to equal the new class. MAPPED_READ_CLASS = mapped_signal_files.HDF5Reader MAPPED_WRITE_CLASS = mapped_signal_files.HDF5Writer def main(): args = parser.parse_args() with MAPPED_READ_CLASS(args.input[0]) as hin: # Copy alphabet and modification information from first file in_alphabet, in_collapse_alphabet, in_mod_long_names \ = hin.get_alphabet_information() args.alphabet = in_alphabet args.collapse_alphabet = in_collapse_alphabet args.mod_long_names = in_mod_long_names
#!/usr/bin/env python
import argparse

from taiyaki.iterators import imap_mp
import os
import sys
from taiyaki.cmdargs import FileExists
import taiyaki.common_cmdargs as common_cmdargs
from taiyaki import fast5utils, helpers, prepare_mapping_funcs, variables


program_description = ("Prepare data for model training and save to hdf5 "
                       "file by remapping with flip-flop model")

parser = argparse.ArgumentParser(
    description=program_description,
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

common_cmdargs.add_common_command_args(
    parser,
    ('device input_folder input_strand_list jobs limit overwrite '
     'version').split())

# The default alphabet is stored as bytes; decode once for CLI defaults.
default_alphabet_str = variables.DEFAULT_ALPHABET.decode("utf-8")
parser.add_argument(
    '--alphabet', default=default_alphabet_str,
    help='Alphabet for basecalling. Defaults to ' + default_alphabet_str)
parser.add_argument(
    '--collapse_alphabet', default=default_alphabet_str,
    help='Collapsed alphabet for basecalling. Defaults to ' +
    default_alphabet_str)

parser.add_argument('input_per_read_params', action=FileExists,
                    help='Input per read parameter .tsv file')
parser.add_argument('output', help='Output HDF5 file')
parser.add_argument('model', action=FileExists, help='Taiyaki model file')
def get_parser():
    """Build the parser for training an ionic-current-level predictor.

    Returns:
        :argparse:`ArgumentParser`: configured parser.
    """
    parser = argparse.ArgumentParser(
        description='Train a model to predict ionic current levels ' +
        'from sequence',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    add_common_command_args(
        parser, """adam device eps filter_max_dwell filter_mean_dwell limit
        niteration outdir overwrite quiet reverse save_every
        sample_nreads_before_filtering version weight_decay""".split())
    parser.add_argument('--batch_size', default=100, metavar='chunks',
                        type=Positive(int),
                        help='Number of chunks to run in parallel')
    parser.add_argument('--back_prob', default=1e-15, metavar='probability',
                        type=proportion,
                        help='Probability of backwards move')
    parser.add_argument('--depth', metavar='layers', default=4,
                        type=Positive(int),
                        help='Number of residual convolution layers')
    parser.add_argument(
        '--drop_slip', default=5, type=Maybe(Positive(int)),
        metavar='length',
        help='Drop chunks with slips greater than given length (None = off)')
    parser.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio', type=float,
        help='Drop chunks with small ratio of signal length to bases * ' +
        'model stride, which would restrict potential CTC paths.')
    parser.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    parser.add_argument('--full_filter_status', default=False,
                        action=AutoBool,
                        help='Output full chunk filtering statistics. ' +
                        'Default: only proportion of filtered chunks.')
    # Bug fix: the help string was broken by a raw newline inside a
    # single-quoted literal (a syntax error); reconstructed as the intended
    # concatenated string.
    parser.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in ' +
        'file are ignored.')
    parser.add_argument(
        '--lr_decay', default=5000, metavar='n', type=Positive(float),
        help='Learning rate for batch i is lr_max / (1.0 + i / n)')
    parser.add_argument('--lr_max', default=1.0e-4, metavar='rate',
                        type=Positive(float),
                        help='Max (and starting) learning rate')
    parser.add_argument('--sd', default=0.5, metavar='value',
                        type=Positive(float),
                        help='Standard deviation to initialise with')
    parser.add_argument('--seed', default=None, metavar='integer',
                        type=Positive(int),
                        help='Set random number seed')
    parser.add_argument('--size', metavar='n', default=32,
                        type=Positive(int),
                        help='Size of layers in convolution network')
    parser.add_argument('--target_len', metavar='n', default=300,
                        type=Positive(int),
                        help='Target length of sequence')
    parser.add_argument('--winlen', metavar='n', default=9,
                        type=Positive(int),
                        help='Window for convolution network')
    parser.add_argument('input', action=FileExists,
                        help='HDF5 file containing mapped reads')
    return parser