예제 #1
0
def get_parser():
    """Build the argument parser for mapping sequences to current traces.

    Returns:
        :argparse:`ArgumentParser` : the configured argparser object
    """
    parser = argparse.ArgumentParser(
        description='Map sequence to current trace using squiggle ' +
        'predictor model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Options shared with other scripts in the package.
    add_common_command_args(
        parser, "limit jobs output recursive version".split())

    parser.add_argument(
        '--back_prob', default=1e-15, metavar='probability',
        type=proportion, help='Probability of backwards move')
    parser.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing subset')
    parser.add_argument(
        '--localpen', default=None, type=Maybe(NonNegative(float)),
        help='Penalty for staying in start and end states, or None to ' +
        'disable them')
    parser.add_argument(
        '--minscore', default=None, type=Maybe(NonNegative(float)),
        help='Minimum score for matching')
    parser.add_argument(
        '--trim', default=(200, 10), nargs=2, type=NonNegative(int),
        metavar=('beginning', 'end'),
        help='Number of samples to trim off start and end')

    parser.add_argument(
        'model', action=FileExists, help='Model file')
    parser.add_argument(
        'references', action=FileExists, help='Fasta file')
    parser.add_argument(
        'read_dir', action=FileExists,
        help='Directory for fast5 reads')

    # Fix: the parser was constructed but never returned, so callers of
    # get_parser() received None.
    return parser
예제 #2
0
def get_parser():
    """Create the command-line parser for flip-flop network training."""
    parser = argparse.ArgumentParser(
        description='Train a flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Options shared with other scripts in the package.
    shared = ('adam alphabet device eps limit niteration outdir '
              'overwrite quiet save_every version weight_decay')
    add_common_command_args(parser, shared.split())

    # Script-specific optional arguments.
    parser.add_argument(
        '--batch_size', default=128, metavar='chunks', type=Positive(int),
        help='Number of chunks to run in parallel')
    parser.add_argument(
        '--gradient_cap_fraction', default=0.05, metavar='f',
        type=Maybe(NonNegative(float)),
        help='Cap L2 norm of gradient so that a fraction f of gradients '
             'are capped. Use --gradient_cap_fraction None for no capping.')
    parser.add_argument(
        '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float),
        help='Initial learning rate')
    parser.add_argument(
        '--size', default=96, metavar='neurons', type=Positive(int),
        help='Base layer size for model')
    parser.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    parser.add_argument(
        '--stride', default=2, metavar='samples', type=Positive(int),
        help='Stride for model')
    parser.add_argument(
        '--winlen', default=19, type=Positive(int),
        help='Length of window over data')

    # Positional arguments: input files (existence checked by FileExists).
    parser.add_argument(
        'model', action=FileExists,
        help='File to read python model description from')
    parser.add_argument(
        'chunks', action=FileExists, help='file containing chunks')
    parser.add_argument(
        'reference', action=FileExists,
        help='file containing fasta reference')

    return parser
예제 #3
0
def get_parser():
    """Get argparser object.

    Returns:
        :argparse:`ArgumentParser` : the argparser object
    """
    parser = argparse.ArgumentParser(
        description="Prepare data for model training and save to hdf5 file " +
        "by remapping with flip-flop model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Options shared with other scripts in the package.
    add_common_command_args(
        parser, ('alphabet input_folder input_strand_list jobs limit ' +
                 'overwrite recursive version').split())

    parser.add_argument('--localpen',
                        metavar='penalty',
                        default=0.0,
                        type=float,
                        help='Penalty for local mapping')
    parser.add_argument(
        '--max_read_length',
        metavar='bases',
        default=None,
        type=Maybe(int),
        help='Don\'t attempt remapping for reads longer than this')
    # 'append' action: --mod may be given multiple times, once per
    # modified base; each occurrence contributes a 3-element list.
    parser.add_argument('--mod',
                        nargs=3,
                        metavar=('mod_base', 'canonical_base',
                                 'mod_long_name'),
                        default=[],
                        action='append',
                        help='Modified base description')
    # Fix: help text was garbled ("in order access any read potentailly");
    # corrected wording and spelling of the user-facing message.
    parser.add_argument(
        '--batch_format',
        action='store_true',
        help='Output batched mapped signal file format. This can ' +
        'significantly improve I/O performance and use less ' +
        'disk space. An entire batch must be loaded into memory in order ' +
        'to access any read, potentially increasing RAM requirements.')

    parser.add_argument('input_per_read_params',
                        action=FileExists,
                        help='Input per read parameter .tsv file')
    parser.add_argument('output', help='Output HDF5 file')
    parser.add_argument('model', action=FileExists, help='Taiyaki model file')
    parser.add_argument(
        'references',
        action=FileExists,
        help='Single fasta file containing references for each read')

    return parser
예제 #4
0
def get_parser():
    """Build the parser for the alignment-accuracy reporting script."""
    parser = argparse.ArgumentParser(
        description='Align reads to reference and output accuracy statistics',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        '--coverage', metavar='proportion', default=0.6, type=proportion,
        help='Minimum coverage')
    parser.add_argument(
        '--data_name', default=None, type=Maybe(str),
        help="Data name. If not set file name is used.")
    parser.add_argument(
        '--figure_format', default="png", help="Figure file format.")
    # AutoBool options: accept yes/no style toggles.
    parser.add_argument(
        '--show_median', default=False, action=AutoBool,
        help='Show median in a histogram plot')
    parser.add_argument(
        '--output_text', default=True, action=AutoBool,
        help='Output per-read text report.')
    parser.add_argument(
        '--output_plot', default=True, action=AutoBool,
        help='Output accuracy distribution plot(s).')
    parser.add_argument(
        '--quantiles', type=int, default=DEFAULT_QUANTILES, nargs='+',
        help='Quantiles to report in summary. Default: %(default)s')

    # Positional: one or more alignment files to analyse.
    parser.add_argument(
        'files', metavar='input', nargs='+',
        help="One or more alignment files in SAM/BAM/CRAM format.")

    return parser
예제 #5
0
def get_parser():
    """Build the parser for merging mapped-signal files.

    Returns:
        :argparse:`ArgumentParser` : the configured argparser object
    """
    parser = argparse.ArgumentParser(
        description='Combine mapped-signal files into a single file. ' +
        'Checks that alphabets are compatible.')
    parser.add_argument('output', help='Output filename')

    # 'append' action: --input is required and may be repeated, once per
    # source file; each occurrence is a [filename, num_reads] pair.
    parser.add_argument(
        '--input',
        required=True,
        nargs=2,
        action='append',
        metavar=('mapped_signal_file', 'num_reads'),
        help='Mapped signal filename and the number of reads to merge from ' +
        'this file. Specify "None" to merge all reads from a file.')
    parser.add_argument(
        '--load_in_mem',
        action=AutoBool,
        default=True,
        help='Load each input file into memory before processing. ' +
        'Potentially large increase in speed but also increased memory usage')
    parser.add_argument(
        '--seed',
        type=Maybe(NonNegative(int)),
        default=None,
        help='Seed for randomly selected reads when limits are set ' +
        '(default random seed)')
    parser.add_argument(
        '--allow_mod_merge',
        action='store_true',
        help='Allow merging of data sets with different modified bases. ' +
        'While alphabets may differ, incompatible alphabets are not allowed ' +
        '(e.g. same single letter code used for different canonical bases).')
    # Fix: help text was garbled ("in order access any read potentailly");
    # corrected wording and spelling of the user-facing message.
    parser.add_argument(
        '--batch_format',
        action='store_true',
        help='Output batched mapped signal file format. This can ' +
        'significantly improve I/O performance and use less ' +
        'disk space. An entire batch must be loaded into memory in order ' +
        'to access any read, potentially increasing RAM requirements.')

    return parser
예제 #6
0
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

add_common_command_args(
    parser,
    'alphabet input_folder input_strand_list jobs limit overwrite recursive version'
    .split())

parser.add_argument('--localpen',
                    metavar='penalty',
                    default=0.0,
                    type=float,
                    help='Penalty for local mapping')
parser.add_argument('--max_read_length',
                    metavar='bases',
                    default=None,
                    type=Maybe(int),
                    help='Don\'t attempt remapping for reads longer than this')
parser.add_argument('--mod',
                    nargs=3,
                    metavar=('base', 'canonical', 'name'),
                    default=[],
                    action='append',
                    help='Modified base description')
parser.add_argument('input_per_read_params',
                    action=FileExists,
                    help='Input per read parameter .tsv file')
parser.add_argument('output', help='Output HDF5 file')
parser.add_argument('model', action=FileExists, help='Taiyaki model file')
parser.add_argument(
    'references',
    action=FileExists,
예제 #7
0
def add_common_command_args(parser, arglist):
    """Add package-wide shared command line arguments to a parser.

    Given an argparse parser object and a list of keys such as
    ``['input_strand_list', 'jobs']``, add the corresponding command
    line arguments to the parser. Keys in ``arglist`` that are not
    recognised here are silently ignored.

    Not all command line args used in the package are included in this
    func: only those that are used by more than one script and which
    have the same defaults.

    Some args are positional and some are optional.
    The optional ones are listed first below.

    Args:
        parser: :argparse:`ArgumentParser` to add arguments to
            (modified in place).
        arglist: iterable of argument-name keys selecting which
            arguments to add.
    """

    ############################################################################
    #
    # Optional arguments
    #
    ############################################################################

    if 'adam' in arglist:
        parser.add_argument(
            '--adam',
            nargs=2,
            metavar=('beta1', 'beta2'),
            default=[0.9, 0.999],
            type=NonNegative(float),
            help=
            'Parameters beta1, beta2 for Exponential Decay Adaptive Momentum')

    if 'alphabet' in arglist:
        parser.add_argument('--alphabet',
                            default=DEFAULT_ALPHABET,
                            help='Canonical base alphabet')

    if 'device' in arglist:
        parser.add_argument(
            '--device',
            default='cpu',
            action=DeviceAction,
            help=
            'Integer specifying which GPU to use, or "cpu" to use CPU only. '
            'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
            'or "cuda2" (use GPU 2).')
    if 'eps' in arglist:
        parser.add_argument('--eps',
                            default=1e-6,
                            metavar='adjustment',
                            type=Positive(float),
                            help='Small value to stabilise optimiser')

    if 'filter_max_dwell' in arglist:
        parser.add_argument(
            '--filter_max_dwell',
            default=10.0,
            metavar='multiple',
            # Maybe(...) allows an explicit "None" to disable the filter
            type=Maybe(Positive(float)),
            help=
            'Drop chunks with max dwell more than multiple of median (over chunks)'
        )

    if 'filter_mean_dwell' in arglist:
        parser.add_argument(
            '--filter_mean_dwell',
            default=3.0,
            metavar='radius',
            type=Maybe(Positive(float)),
            help=
            'Drop chunks with mean dwell more than radius deviations from the median (over chunks)'
        )

    if 'input_strand_list' in arglist:
        parser.add_argument(
            '--input_strand_list',
            default=None,
            action=FileExists,
            help=
            'Strand list TSV file with columns filename_fast5 or read_id or both'
        )

    if 'jobs' in arglist:
        parser.add_argument(
            '--jobs',
            default=1,
            metavar='n',
            type=Positive(int),
            help='Number of threads to use when processing data')

    if 'limit' in arglist:
        parser.add_argument('--limit',
                            default=None,
                            type=Maybe(Positive(int)),
                            help='Limit number of reads to process')

    if 'niteration' in arglist:
        parser.add_argument('--niteration',
                            metavar='batches',
                            type=Positive(int),
                            default=50000,
                            help='Maximum number of batches to train for')

    if 'outdir' in arglist:
        parser.add_argument('--outdir',
                            default='training',
                            help='Output directory, created when run.')

    if 'output' in arglist:
        parser.add_argument('--output',
                            default=None,
                            metavar='filename',
                            action=FileAbsent,
                            help='Write output to file')

    if 'overwrite' in arglist:
        parser.add_argument('--overwrite',
                            default=False,
                            action=AutoBool,
                            help='Whether to overwrite any output files')

    if 'quiet' in arglist:
        parser.add_argument('--quiet',
                            default=False,
                            action=AutoBool,
                            help="Don't print progress information to stdout")

    if 'recursive' in arglist:
        parser.add_argument('--recursive',
                            default=True,
                            action=AutoBool,
                            help='Search for fast5s recursively within ' +
                            'input_folder. Otherwise only search first level.')

    if 'sample_nreads_before_filtering' in arglist:
        parser.add_argument(
            '--sample_nreads_before_filtering',
            metavar='n',
            type=NonNegative(int),
            default=1000,
            help=
            'Sample n reads to decide on bounds for filtering before training. Set to 0 to do all.'
        )

    if 'save_every' in arglist:
        parser.add_argument('--save_every',
                            metavar='x',
                            type=Positive(int),
                            default=5000,
                            help='Save model every x batches')

    if 'version' in arglist:
        # nargs=0: the custom action prints version info and exits, so
        # the option consumes no values.
        parser.add_argument('--version',
                            nargs=0,
                            action=display_version_and_exit,
                            metavar=__version__,
                            help='Display version information.')

    if 'weight_decay' in arglist:
        parser.add_argument(
            '--weight_decay',
            default=0.0,
            metavar='penalty',
            type=NonNegative(float),
            help='Adam weight decay (L2 normalisation penalty)')

    ############################################################################
    #
    # Positional arguments
    #
    ############################################################################

    if 'input_folder' in arglist:
        parser.add_argument(
            'input_folder',
            action=FileExists,
            help='Directory containing single or multi-read fast5 files')
예제 #8
0
def get_train_flipflop_parser():
    """Build the argument parser for flip-flop neural network training.

    Arguments are organised into argparse groups: model, training, data,
    compute, output, modified-base and miscellaneous, followed by the
    positional model-description and mapped-reads input files.

    Returns:
        :argparse:`ArgumentParser` : the configured argparser object
    """
    parser = argparse.ArgumentParser(
        description='Train flip-flop neural network',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    mdl_grp = parser.add_argument_group('Model Arguments')
    mdl_grp.add_argument(
        '--size', default=384, metavar='neurons',
        type=Positive(int), help='Base layer size for model')
    mdl_grp.add_argument(
        '--stride', default=5, metavar='samples',
        type=Positive(int), help='Stride for model')
    mdl_grp.add_argument(
        '--winlen', default=19, type=Positive(int),
        help='Length of window over data')

    trn_grp = parser.add_argument_group('Training Arguments')
    trn_grp.add_argument(
        '--adam', nargs=2, metavar=('beta1', 'beta2'),
        default=[0.9, 0.999], type=NonNegative(float),
        help='Parameters beta1, beta2 for Exponential Decay ' +
        'Adaptive Momentum')
    trn_grp.add_argument(
        '--eps', default=1e-6, metavar='adjustment',
        type=Positive(float), help='Small value to stabilise optimiser')
    trn_grp.add_argument(
        '--niteration', metavar='batches', type=Positive(int),
        default=150000, help='Maximum number of batches to train for')
    trn_grp.add_argument(
        '--weight_decay', default=0.01, metavar='penalty',
        type=NonNegative(float),
        help='Adam weight decay (L2 normalisation penalty)')
    trn_grp.add_argument(
        # Maybe(...) allows an explicit "None" to disable clipping
        '--gradient_clip_num_mads', default=0, metavar='num_MADs',
        type=Maybe(NonNegative(float)),
        help='Clip gradients (by value) at num_MADs above the median of ' +
        'the last 1000 parameter gradient maximums. Gradient threshold ' +
        'values are computed for each parameter group independently. Use ' +
        '"--gradient_clip_num_mads None" for no clipping.')
    trn_grp.add_argument(
        '--lr_max', default=4.0e-3, metavar='rate', type=Positive(float),
        help='Max learning rate, reached at --warmup_batches iterations.')
    trn_grp.add_argument(
        '--lr_min', default=1.0e-4, metavar='rate', type=Positive(float),
        help='Min (starting and final) learning rate')
    trn_grp.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    trn_grp.add_argument(
        # ParseToNamedTuple packs the three values into a named tuple
        '--sharpen', default=(1.0, 1.0, 25000), nargs=3,
        metavar=('min', 'max', 'niter'), action=ParseToNamedTuple,
        type=(Positive(float), Positive(float), Positive(int)),
        help='Increase sharpening factor linearly from "min" to ' +
        '"max" over "niter" iterations')
    trn_grp.add_argument(
        '--warmup_batches', type=int, default=200,
        help='Over first n batches, increase learning rate like cosine.')
    trn_grp.add_argument(
        '--lr_warmup', metavar='rate', type=Positive(float),
        help='Start learning rate for warmup. Defaults to lr_min.')
    trn_grp.add_argument(
        '--min_momentum', type=Positive(float),
        help='Min momentum in cycling. default = Adam beta1, no cycling')

    data_grp = parser.add_argument_group('Data Arguments')
    data_grp.add_argument(
        '--filter_max_dwell', default=10.0, metavar='multiple',
        type=Maybe(Positive(float)),
        help='Drop chunks with max dwell more than multiple of median ' +
        '(over chunks)')
    data_grp.add_argument(
        '--filter_mean_dwell', default=3.0, metavar='radius',
        type=Maybe(Positive(float)),
        help='Drop chunks with mean dwell more than radius deviations ' +
        'from the median (over chunks)')
    data_grp.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    data_grp.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio',
        type=Bounded(float, lower=1.0),
        help='Drop chunks with small ratio of signal length to bases * ' +
        'model stride, which would restrict potential CTC paths. Must be ' +
        'greater than 1.0.')
    data_grp.add_argument(
        '--limit', default=None, type=Maybe(Positive(int)),
        help='Limit number of reads to process')
    data_grp.add_argument(
        '--reverse', default=False, action=AutoBool,
        help='Reverse input sequence and current')
    data_grp.add_argument(
        '--sample_nreads_before_filtering', metavar='n',
        type=NonNegative(int), default=100000,
        help='Sample n reads to decide on bounds for filtering before ' +
        'training. Set to 0 to do all.')
    data_grp.add_argument(
        '--chunk_len_min', default=3000, metavar='samples', type=Positive(int),
        help='Min length of each chunk in samples (chunk lengths are ' +
        'random between min and max)')
    data_grp.add_argument(
        '--chunk_len_max', default=8000, metavar='samples', type=Positive(int),
        help='Max length of each chunk in samples (chunk lengths are ' +
        'random between min and max)')
    data_grp.add_argument(
        '--include_reporting_strands', default=False, action=AutoBool,
        help='Include reporting strands in training. Default: Hold ' +
        'training strands out of training.')
    data_grp.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in ' +
        'file are ignored.')
    data_grp.add_argument(
        '--min_sub_batch_size', default=128, metavar='chunks',
        type=Positive(int),
        help='Number of chunks to run in parallel per sub-batch for ' +
        'chunk_len = chunk_len_max. Actual length of sub-batch used is ' +
        '(min_sub_batch_size * chunk_len_max / chunk_len).')
    data_grp.add_argument(
        '--reporting_percent_reads', default=1, metavar='sub_batches',
        type=Positive(float),
        help='Percent of reads to use for std loss reporting')
    data_grp.add_argument(
        '--reporting_strand_list', action=FileExists,
        help='Strand summary file containing column read_id. All other ' +
        'fields are ignored. If not provided reporting strands will be ' +
        'randomly selected.')
    data_grp.add_argument(
        '--reporting_sub_batches', default=100, metavar='sub_batches',
        type=Positive(int),
        help='Number of sub-batches to use for std loss reporting')
    data_grp.add_argument(
        '--standardize', default=True, action=AutoBool,
        help='Standardize currents for each read')
    data_grp.add_argument(
        '--sub_batches', default=1, metavar='sub_batches', type=Positive(int),
        help='Number of sub-batches per batch')

    cmp_grp = parser.add_argument_group('Compute Arguments')
    cmp_grp.add_argument(
        '--device', default='cpu', action=DeviceAction,
        help='Integer specifying which GPU to use, or "cpu" to use CPU only. '
        'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
        'or "cuda2" (use GPU 2).')
    # Argument local_rank is used only by when the script is run in multi-GPU
    # mode using torch.distributed.launch. See the README.
    cmp_grp.add_argument(
        '--local_rank', type=int, default=None, help=argparse.SUPPRESS)

    out_grp = parser.add_argument_group('Output Arguments')
    out_grp.add_argument(
        '--full_filter_status', default=False, action=AutoBool,
        help='Output full chunk filtering statistics. Default: only ' +
        'proportion of filtered chunks.')
    out_grp.add_argument(
        '--outdir', default='training',
        help='Output directory, created when run.')
    out_grp.add_argument(
        '--overwrite', default=False, action=AutoBool,
        help='Whether to overwrite any output files')
    out_grp.add_argument(
        '--quiet', default=False, action=AutoBool,
        help="Don't print progress information to stdout")
    out_grp.add_argument(
        '--save_every', metavar='x', type=Positive(int), default=2500,
        help='Save model every x batches')

    mod_grp = parser.add_argument_group('Modified Base Arguments')
    mod_grp.add_argument(
        '--mod_factor', default=(8.0, 1.0, 50000), nargs=3,
        metavar=('start', 'final', 'niter'), action=ParseToNamedTuple,
        type=(Positive(float), Positive(float), Positive(int)),
        help='Relative weight applied to modified base transitions in ' +
        'loss/gradient compared to canonical transitions. Larger values ' +
        'increase the effective modified base learning rate. Scale factor ' +
        'linearly from "start" to "final" over first "niter" iterations')
    mod_grp.add_argument(
        '--mod_prior_factor', type=float,
        help='Exponential factor applied to prior mod weights estimated ' +
        'from training data. Intended to balance modified base scores. ' +
        'Default: no mod prior')
    mod_grp.add_argument(
        '--num_mod_weight_reads', type=int, default=5000,
        help='Number of reads to sample to compute the modified base prior ' +
        'weights from the training data.')

    misc_grp = parser.add_argument_group('Miscellaneous  Arguments')
    misc_grp.add_argument(
        # nargs=0: the custom action prints version info and exits
        '--version', nargs=0, action=display_version_and_exit,
        metavar=__version__,
        help='Display version information.')

    parser.add_argument(
        'model', action=FileExists,
        help='File to read python model (or checkpoint) from')
    parser.add_argument(
        'input', action=FileExists,
        help='file containing mapped reads')

    return parser
예제 #9
0
                    type=Positive(int),
                    help='Number of chunks to run in parallel')
parser.add_argument('--back_prob',
                    default=1e-15,
                    metavar='probability',
                    type=proportion,
                    help='Probability of backwards move')
parser.add_argument('--depth',
                    metavar='layers',
                    default=4,
                    type=Positive(int),
                    help='Number of residual convolution layers')
parser.add_argument(
    '--drop_slip',
    default=5,
    type=Maybe(Positive(int)),
    metavar='length',
    help='Drop chunks with slips greater than given length (None = off)')
parser.add_argument(
    '--input_strand_list',
    default=None,
    action=FileExists,
    help=
    'Strand summary file containing column read_id. Filenames in file are ignored.'
)
parser.add_argument('--lr_decay',
                    default=5000,
                    metavar='n',
                    type=Positive(float),
                    help='Learning rate for batch i is lr_max / (1.0 + i / n)')
parser.add_argument('--lr_max',
예제 #10
0
                    ' (chunk lengths are random between min and max)')
parser.add_argument('--chunk_len_max',
                    default=4000,
                    metavar='samples',
                    type=Positive(int),
                    help='Max length of each chunk in samples ' +
                    '(chunk lengths are random between min and max)')
parser.add_argument('--full_filter_status',
                    default=False,
                    action=AutoBool,
                    help='Output full chunk filtering statistics. ' +
                    'Default: only proportion of filtered chunks.')
parser.add_argument('--gradient_cap_fraction',
                    default=0.05,
                    metavar='f',
                    type=Maybe(NonNegative(float)),
                    help='Cap L2 norm of gradient so that a fraction f of ' +
                    'gradients are capped. ' +
                    'Use --gradient_cap_fraction None for no capping.')
parser.add_argument('--input_strand_list',
                    default=None,
                    action=FileExists,
                    help='Strand summary file containing column read_id. ' +
                    'Filenames in file are ignored.')
#Argument local_rank is used only by when the script is run in multi-GPU
#mode using torch.distributed.launch. See the README.
parser.add_argument('--local_rank',
                    type=int,
                    default=None,
                    help=argparse.SUPPRESS)
parser.add_argument('--lr_cosine_iters',
예제 #11
0
def add_common_command_args(parser, arglist):
    """Given an argparse parser object and a list of keys such as
    ['input_strand_list', 'jobs'], add these command line args
    to the parser.

    Note that not all command line args used in the package are
    included in this func: only those that are used by more than
    one script and which have the same defaults.

    Also note that some args are positional and some are optional.
    The optional ones are listed first below.

    Args:
        parser: `argparse.ArgumentParser` to add arguments to (modified
            in place).
        arglist: iterable of argument names (without leading '--')
            selecting which of the known arguments to add.
    """

    ############################################################################
    #
    # Optional arguments
    #
    ############################################################################

    if 'adam' in arglist:
        # The tuple passed as `type` is consumed by the ParseToNamedTuple
        # action, which applies one converter per supplied value.
        parser.add_argument(
            '--adam',
            nargs=3,
            metavar=('rate', 'decay1', 'decay2'),
            default=(1e-3, 0.9, 0.999),
            type=(NonNegative(float), NonNegative(float), NonNegative(float)),
            action=ParseToNamedTuple,
            # Fixed help-text typo: 'Momementum' -> 'Momentum'
            help='Parameters for Exponential Decay Adaptive Momentum')

    if 'chunk_logging_threshold' in arglist:
        parser.add_argument(
            '--chunk_logging_threshold',
            default=10.0,
            metavar='multiple',
            type=NonNegative(float),
            help=
            'If loss > (threshold * smoothed loss) for a batch, then log chunks to '
            +
            'output/chunklog.tsv. Set to zero to log all, including rejected chunks'
        )

    if 'device' in arglist:
        parser.add_argument(
            '--device',
            default='cpu',
            action=DeviceAction,
            help=
            'Integer specifying which GPU to use, or "cpu" to use CPU only. '
            'Other accepted formats: "cuda" (use default GPU), "cuda:2" '
            'or "cuda2" (use GPU 2).')

    if 'filter_max_dwell' in arglist:
        parser.add_argument(
            '--filter_max_dwell',
            default=10.0,
            metavar='multiple',
            type=Maybe(Positive(float)),
            help=
            'Drop chunks with max dwell more than multiple of median (over chunks)'
        )

    if 'filter_mean_dwell' in arglist:
        parser.add_argument(
            '--filter_mean_dwell',
            default=3.0,
            metavar='radius',
            type=Maybe(Positive(float)),
            help=
            'Drop chunks with mean dwell more than radius deviations from the median (over chunks)'
        )

    if 'input_strand_list' in arglist:
        parser.add_argument('--input_strand_list',
                            default=None,
                            action=FileExists,
                            help='Strand summary file containing subset')

    if 'jobs' in arglist:
        parser.add_argument(
            '--jobs',
            default=1,
            metavar='n',
            type=Positive(int),
            help='Number of threads to use when processing data')

    if 'limit' in arglist:
        parser.add_argument('--limit',
                            default=None,
                            type=Maybe(Positive(int)),
                            help='Limit number of reads to process')

    if 'lrdecay' in arglist:
        parser.add_argument(
            '--lrdecay',
            default=5000,
            metavar='n',
            type=Positive(float),
            help='Learning rate for batch i is adam.rate / (1.0 + i / n)')

    if 'niteration' in arglist:
        parser.add_argument('--niteration',
                            metavar='batches',
                            type=Positive(int),
                            default=50000,
                            help='Maximum number of batches to train for')

    if 'overwrite' in arglist:
        parser.add_argument('--overwrite',
                            default=False,
                            action=AutoBool,
                            help='Whether to overwrite any output files')

    if 'quiet' in arglist:
        parser.add_argument('--quiet',
                            default=False,
                            action=AutoBool,
                            help="Don't print progress information to stdout")

    if 'sample_nreads_before_filtering' in arglist:
        parser.add_argument(
            '--sample_nreads_before_filtering',
            metavar='n',
            type=NonNegative(int),
            default=1000,
            help=
            'Sample n reads to decide on bounds for filtering before training. Set to 0 to do all.'
        )

    if 'save_every' in arglist:
        parser.add_argument('--save_every',
                            metavar='x',
                            type=Positive(int),
                            default=5000,
                            help='Save model every x batches')

    if 'version' in arglist:
        # nargs=0 makes the action fire without consuming a value; the
        # action prints version information and exits immediately.
        parser.add_argument('--version',
                            nargs=0,
                            action=display_version_and_exit,
                            metavar=__version__,
                            help='Display version information.')

    if 'weight_decay' in arglist:
        parser.add_argument(
            '--weight_decay',
            default=0.0,
            metavar='penalty',
            type=NonNegative(float),
            help='Adam weight decay (L2 normalisation penalty)')

    ############################################################################
    #
    # Positional arguments
    #
    ############################################################################

    if 'input_folder' in arglist:
        parser.add_argument(
            'input_folder',
            action=FileExists,
            help='Directory containing single-read fast5 files')
예제 #12
0

# Command-line interface for the squiggle-predictor training script.
parser = argparse.ArgumentParser(
    description='Train a model to predict ionic current levels from sequence',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Options shared with other scripts in the package.
add_common_command_args(parser, (
    'adam chunk_logging_threshold device filter_max_dwell filter_mean_dwell '
    'limit niteration overwrite quiet save_every '
    'sample_nreads_before_filtering version weight_decay').split())

# Script-specific options.
parser.add_argument(
    '--batch_size', default=100, metavar='chunks', type=Positive(int),
    help='Number of chunks to run in parallel')
parser.add_argument(
    '--back_prob', default=1e-15, metavar='probability', type=proportion,
    help='Probability of backwards move')
parser.add_argument(
    '--depth', metavar='layers', default=4, type=Positive(int),
    help='Number of residual convolution layers')
parser.add_argument(
    '--drop_slip', default=5, type=Maybe(Positive(int)), metavar='length',
    help='Drop chunks with slips greater than given length (None = off)')
parser.add_argument(
    '--input_strand_list', default=None, action=FileExists,
    help='Strand summary file containing column read_id. Filenames in file are ignored.')
parser.add_argument(
    '--lr_decay', default=5000, metavar='n', type=Positive(float),
    help='Learning rate for batch i is lr_max / (1.0 + i / n)')
parser.add_argument(
    '--lr_max', default=1.0e-4, metavar='rate', type=Positive(float),
    help='Max (and starting) learning rate')
parser.add_argument(
    '--sd', default=0.5, metavar='value', type=Positive(float),
    help='Standard deviation to initialise with')
parser.add_argument(
    '--seed', default=None, metavar='integer', type=Positive(int),
    help='Set random number seed')
parser.add_argument(
    '--size', metavar='n', default=32, type=Positive(int),
    help='Size of layers in convolution network')
parser.add_argument('--target_len', metavar='n', default=300, type=Positive(int),
예제 #13
0
from taiyaki.common_cmdargs import add_common_command_args
from taiyaki.iterators import imap_mp


# Command-line interface for mapping sequence to current trace.
parser = argparse.ArgumentParser(
    description='Map sequence to current trace using squiggle predictor model',
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Options shared with other scripts in the package.
add_common_command_args(parser, 'limit jobs output recursive version'.split())

# Optional script-specific arguments.
parser.add_argument(
    '--back_prob', default=1e-15, metavar='probability', type=proportion,
    help='Probability of backwards move')
parser.add_argument(
    '--input_strand_list', default=None, action=FileExists,
    help='Strand summary file containing subset')
parser.add_argument(
    '--localpen', default=None, type=Maybe(NonNegative(float)),
    help='Penalty for staying in start and end states, or None to disable them')
parser.add_argument(
    '--minscore', default=None, type=Maybe(NonNegative(float)),
    help='Minimum score for matching')
parser.add_argument(
    '--trim', default=(200, 10), nargs=2, type=NonNegative(int),
    metavar=('beginning', 'end'),
    help='Number of samples to trim off start and end')

# Positional arguments.
parser.add_argument('model', action=FileExists, help='Model file')
parser.add_argument('references', action=FileExists, help='Fasta file')
parser.add_argument('read_dir', action=FileExists, help='Directory for fast5 reads')


def main():
    args = parser.parse_args()

    worker_kwarg_names = ['back_prob', 'localpen', 'minscore', 'trim']
예제 #14
0
def get_parser():
    """Build and return the argument parser for squiggle-model training.

    Returns:
        `argparse.ArgumentParser` configured with both the package-wide
        common arguments and the options specific to this script.
    """
    parser = argparse.ArgumentParser(
        description='Train a model to predict ionic current levels from sequence',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Options shared with other scripts in the package.
    add_common_command_args(parser, (
        'adam device eps filter_max_dwell filter_mean_dwell limit '
        'niteration outdir overwrite quiet reverse save_every '
        'sample_nreads_before_filtering version weight_decay').split())

    # Training and chunk-filtering options.
    parser.add_argument(
        '--batch_size', default=100, metavar='chunks', type=Positive(int),
        help='Number of chunks to run in parallel')
    parser.add_argument(
        '--back_prob', default=1e-15, metavar='probability', type=proportion,
        help='Probability of backwards move')
    parser.add_argument(
        '--depth', metavar='layers', default=4, type=Positive(int),
        help='Number of residual convolution layers')
    parser.add_argument(
        '--drop_slip', default=5, type=Maybe(Positive(int)), metavar='length',
        help='Drop chunks with slips greater than given length (None = off)')
    parser.add_argument(
        '--filter_path_buffer', default=1.1, metavar='ratio', type=float,
        help='Drop chunks with small ratio of signal length to bases * '
             'model stride, which would restrict potential CTC paths.')
    parser.add_argument(
        '--filter_min_pass_fraction', default=0.5, metavar='fraction',
        type=Maybe(Positive(float)),
        help='Halt if fraction of chunks passing tests is less than this')
    parser.add_argument(
        '--full_filter_status', default=False, action=AutoBool,
        help='Output full chunk filtering statistics. '
             'Default: only proportion of filtered chunks.')
    parser.add_argument(
        '--input_strand_list', default=None, action=FileExists,
        help='Strand summary file containing column read_id. Filenames in '
             'file are ignored.')

    # Learning-rate schedule and network-architecture options.
    parser.add_argument(
        '--lr_decay', default=5000, metavar='n', type=Positive(float),
        help='Learning rate for batch i is lr_max / (1.0 + i / n)')
    parser.add_argument(
        '--lr_max', default=1.0e-4, metavar='rate', type=Positive(float),
        help='Max (and starting) learning rate')
    parser.add_argument(
        '--sd', default=0.5, metavar='value', type=Positive(float),
        help='Standard deviation to initialise with')
    parser.add_argument(
        '--seed', default=None, metavar='integer', type=Positive(int),
        help='Set random number seed')
    parser.add_argument(
        '--size', metavar='n', default=32, type=Positive(int),
        help='Size of layers in convolution network')
    parser.add_argument(
        '--target_len', metavar='n', default=300, type=Positive(int),
        help='Target length of sequence')
    parser.add_argument(
        '--winlen', metavar='n', default=9, type=Positive(int),
        help='Window for convolution network')

    # Positional argument.
    parser.add_argument(
        'input', action=FileExists,
        help='HDF5 file containing mapped reads')

    return parser