def parse_arguments():
    """Parse the command line and start logging for the mutation check.

    The arguments are read from ``sys.argv`` by ``parse_args()``, so argparse
    itself terminates the process on ``--help`` or when a required option is
    missing.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name (``log_level``,
        ``log``, ``verbose``, ``fasta``, ``mutations``, ``bed``,
        ``unmapped``).
    """
    cli = argparse.ArgumentParser(
        description='Checks mutations to see what strand they are reported '
                    'on and for unmapped mutations.')

    # logging options
    cli.add_argument(
        '-ll', '--log-level', type=str, action='store', default='',
        help='Write a log file (--log-level=DEBUG for debug mode, '
             '--log-level=INFO for info mode)')
    cli.add_argument(
        '-l', '--log', type=str, action='store', default='stdout',
        help='Path to log file. (Default: stdout)')
    cli.add_argument(
        '-v', '--verbose', action='store_true', default=False,
        help='Flag for more verbose log output')

    # program options: all are mandatory string paths, registered in the
    # order they should be listed by --help
    required_paths = (
        ('-f', '--fasta', 'Human genome FASTA file'),
        ('-m', '--mutations',
         'Text file specifying mutations in the format required for permutation test'),
        ('-b', '--bed', 'BED file of reference transcripts'),
        ('-u', '--unmapped',
         'Save mutations that could not be found on the reference transcript'),
    )
    for short_flag, long_flag, description in required_paths:
        cli.add_argument(short_flag, long_flag,
                         type=str, required=True,
                         help=description)
    parsed = cli.parse_args()

    # Pick the log destination: an explicit --log value wins; a log level
    # without a log path passes '' (presumably utils.start_logging then
    # auto-names the file); with neither, log output goes to os.devnull.
    if parsed.log:
        destination = parsed.log
    elif parsed.log_level:
        destination = ''
    else:
        destination = os.devnull
    utils.start_logging(log_file=destination,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    return vars(parsed)
def parse_arguments():
    """Parse the command line for the gene sequence extraction script.

    Arguments are read from ``sys.argv``; argparse exits the process on
    ``--help`` or when a required option is missing. After parsing, logging
    is started through ``utils.start_logging`` and the invoking command line
    is written to the log.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name (``log_level``,
        ``log``, ``verbose``, ``input``, ``bed``, ``output``).
    """
    cli = argparse.ArgumentParser(
        description='Extracts gene sequences from a genomic FASTA file')

    # logging options
    cli.add_argument(
        '-ll', '--log-level', type=str, action='store', default='',
        help='Write a log file (--log-level=DEBUG for debug mode, '
             '--log-level=INFO for info mode)')
    cli.add_argument(
        '-l', '--log', type=str, action='store', default='',
        help='Path to log file. (accepts stdout)')
    cli.add_argument(
        '-v', '--verbose', action='store_true', default=False,
        help='Flag for more verbose log output')

    # program options: mandatory string paths, in --help order
    required_paths = (
        ('-i', '--input', 'Human genome FASTA file'),
        ('-b', '--bed', 'BED file annotation of genes'),
        ('-o', '--output', 'Output a single FASTA file with gene sequences'),
    )
    for short_flag, long_flag, description in required_paths:
        cli.add_argument(short_flag, long_flag,
                         type=str, required=True,
                         help=description)
    parsed = cli.parse_args()

    # An explicit --log path wins; a log level alone passes '' (presumably
    # utils.start_logging then auto-names the file); with neither option the
    # log output is sent to os.devnull.
    if parsed.log:
        destination = parsed.log
    elif parsed.log_level:
        destination = ''
    else:
        destination = os.devnull
    utils.start_logging(log_file=destination,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    # record the command exactly as the user entered it
    invocation = ' '.join(sys.argv)
    logger.info('Command: {0}'.format(invocation))

    return vars(parsed)
def parse_arguments():
    """Parse the command line for the randomization-based oncogene/TSG test.

    Arguments are read from ``sys.argv``; argparse exits the process on
    ``--help`` or when a required option (``--input``, ``--mutations``,
    ``--bed``, ``--output``) is missing. After parsing, logging is started
    through ``utils.start_logging`` and the invoking command is logged.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is set but no ``--genome``
        path was supplied.
    """
    cli = argparse.ArgumentParser(
        description='Performs a randomization-based test on the oncogene and TSG score')

    # Shared help text for the --context option.
    context_help = (
        'Number of DNA bases to use as context. 0 indicates no context. '
        '1 indicates only use the mutated base.  1.5 indicates using '
        'the base context used in CHASM '
        '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
        '2 indicates using the mutated base and the upstream base. '
        '3 indicates using the mutated base and both the upstream '
        'and downstream bases. (Default: 1.5)')

    # Each entry is (option strings, add_argument keyword settings). The
    # entries are registered in this order, which is also the --help order.
    option_table = (
        # logging arguments
        (('-ll', '--log-level'),
         dict(type=str, action='store', default='',
              help='Write a log file (--log-level=DEBUG for debug mode, '
                   '--log-level=INFO for info mode)')),
        (('-l', '--log'),
         dict(type=str, action='store', default='',
              help='Path to log file. (accepts "stdout")')),
        (('-v', '--verbose'),
         dict(action='store_true', default=False,
              help='Flag for more verbose log output')),

        # program arguments
        (('-i', '--input'),
         dict(type=str, required=True,
              help='gene FASTA file from extract_gene_seq.py script')),
        (('-m', '--mutations'),
         dict(type=str, required=True,
              help='DNA mutations file')),
        (('-b', '--bed'),
         dict(type=str, required=True,
              help='BED file annotation of genes')),
        (('-s', '--score-dir'),
         dict(type=str, default=None,
              help='Directory containing score information in pickle files (Default: None).')),
        (('-ng', '--neighbor-graph-dir'),
         dict(type=str, default=None,
              help='Directory containing neighbor graph information in pickle files (Default: None).')),
        (('-p', '--processes'),
         dict(type=int, default=0,
              help='Number of processes to use. 0 indicates using a single '
                   'process without using a multiprocessing pool '
                   '(more means Faster, default: 0).')),
        (('-n', '--num-iterations'),
         dict(type=int, default=10000,
              help='Number of iterations for null model. p-value precision '
                   'increases with more iterations, however this will also '
                   'increase the run time (Default: 10000).')),
        (('-sc', '--stop-criteria'),
         dict(type=int, default=1000,
              help='Number of iterations more significant then the observed statistic '
                   'to stop further computations. This decreases compute time spent in resolving '
                   'p-values for non-significant genes. (Default: 1000).')),
        (('-k', '--kind'),
         dict(type=str, default='oncogene',
              help='Kind of permutation test to perform ("oncogene" or "tsg"). "position-based" permutation '
                   'test is intended to find oncogenes using position based statistics. '
                   'The "deleterious" permutation test is intended to find tumor '
                   'suppressor genes. (Default: oncogene)')),
        (('-c', '--context'),
         dict(type=float, default=1.5, help=context_help)),
        (('-u', '--use-unmapped'),
         dict(action='store_true', default=False,
              help='Use mutations that are not mapped to the the single reference '
                   'transcript for a gene specified in the bed file indicated by '
                   'the -b option.')),
        (('-g', '--genome'),
         dict(type=str, default='',
              help='Path to the genome fasta file. Required if --use-unmapped flag '
                   'is used. (Default: None)')),
        (('--unique',),
         dict(action='store_true', default=False,
              help='Only keep unique mutations for each tumor sample.'
                   'Mutations reproted from heterogeneous sources may contain'
                   ' duplicates, e.g. a tumor sample was sequenced twice.')),
        (('-r', '--recurrent'),
         dict(type=int, default=3,
              help='Minimum number of mutations at a position for it to be '
                   'considered a recurrently mutated position (Default: 3).')),
        (('-f', '--fraction'),
         dict(type=float, default=.02,
              help='Fraction of total mutations in a gene. This define the '
                   'minimumm number of mutations for a position to be defined '
                   'as recurrently mutated (Defaul: .02).')),
        (('-d', '--deleterious'),
         dict(type=int, default=1,
              help='Perform tsg permutation test if gene has '
                   'at least a user specified number of deleterious mutations (default: 1)')),
        (('-t', '--tsg-score'),
         dict(type=float, default=1.01,
              help='Maximum TSG score to allow gene to be tested for oncogene '
                   'permutation test. Values greater than one indicate all '
                   'genes will be tested (Default: 1.01).')),
        (('-dp', '--deleterious-pseudo-count'),
         dict(type=int, default=0,
              help='Deleterious mutation pseudo-count for null distribution '
                   'statistics. (Default: 0)')),
        (('-rp', '--recurrent-pseudo-count'),
         dict(type=int, default=0,
              help='Recurrent missense mutation pseudo-count for null distribution '
                   'statistics. (Default: 0)')),
        (('-seed', '--seed'),
         dict(type=int, default=None,
              help='Specify the seed for the pseudo random number generator. '
                   'By default, the seed is randomly chosen based. The seed will '
                   'be used for the permutation test monte carlo simulations.')),
        (('-o', '--output'),
         dict(type=str, required=True,
              help='Output of probabilistic 20/20 results')),
    )
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)
    parsed = cli.parse_args()

    # An explicit --log path wins; a log level alone passes '' (presumably
    # utils.start_logging then auto-names the file); with neither option the
    # log output is sent to os.devnull.
    if parsed.log:
        destination = parsed.log
    elif parsed.log_level:
        destination = ''
    else:
        destination = os.devnull
    utils.start_logging(log_file=destination,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    # --use-unmapped is only usable together with a genome FASTA path
    options = vars(parsed)
    genome_missing = options['use_unmapped'] and not options['genome']
    if genome_missing:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the command exactly as the user entered it
    invocation = ' '.join(sys.argv)
    logger.info('Command: {0}'.format(invocation))
    return options
# Example #4
# 0
def parse_arguments():
    """Parse the command line for the oncogene / TSG statistical test.

    The CLI has global logging options followed by a mandatory sub-command
    (``oncogene`` or ``tsg``), each with its own "Major" and "Advanced"
    option groups. Arguments are read from ``sys.argv``. After parsing,
    logging is started through ``utils.start_logging`` and the package
    version and invoking command are logged.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name. The chosen
        sub-command is stored under ``'kind'``.

    Raises
    ------
    SystemExit
        Raised by argparse (status 2) when the sub-command or a required
        option is missing, and with status 1 when ``--use-unmapped`` is set
        but no ``--genome`` path was supplied.
    """
    # make a parser
    info = 'Performs a statistical test for oncogene, TSG, or driver gene'
    parent_parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parent_parser.add_argument(
        '-ll',
        '--log-level',
        type=str,
        action='store',
        default='',
        help='Write a log file (--log-level=DEBUG for debug mode, '
        '--log-level=INFO for info mode)')
    parent_parser.add_argument('-l',
                               '--log',
                               type=str,
                               action='store',
                               default='stdout',
                               help='Path to log file. (accepts "stdout")')
    parent_parser.add_argument('-v',
                               '--verbose',
                               action='store_true',
                               default=False,
                               help='Flag for more verbose log output')

    # add subparsers
    subparsers = parent_parser.add_subparsers(title='Driver Gene Type',
                                              dest='kind')
    # Python 3 treats sub-commands as optional by default. Without one, none
    # of the sub-command options (e.g. use_unmapped) exist on the namespace
    # and the validation below would die with a KeyError, so make argparse
    # reject the command line with a proper usage error instead.
    subparsers.required = True
    parser_og = subparsers.add_parser(
        'oncogene',
        help='Find statistically significant oncogene-like genes.',
        description='Find statsitically significant oncogene-like genes. '
        'Evaluates clustering of missense mutations and high in '
        'silico pathogenicity scores for missense mutations.')
    help_info = 'Find statistically significant Tumor Suppressor-like genes.'
    parser_tsg = subparsers.add_parser(
        'tsg',
        help=help_info,
        description=help_info + ' Evaluates for a higher proportion '
        'of inactivating mutations than expected.')

    # program arguments
    for parser in (parser_og, parser_tsg):
        # group of parameters
        major_parser = parser.add_argument_group(title='Major options')
        advance_parser = parser.add_argument_group(title='Advanced options')

        # options shared by both sub-commands
        help_str = 'gene FASTA file from extract_gene_seq.py script'
        major_parser.add_argument('-i',
                                  '--input',
                                  type=str,
                                  required=True,
                                  help=help_str)
        help_str = (
            'DNA mutations file (MAF file). Columns can be in any order, '
            'but should contain the correct column header names.')
        major_parser.add_argument('-m',
                                  '--mutations',
                                  type=str,
                                  required=True,
                                  help=help_str)
        help_str = 'BED file annotation of genes'
        major_parser.add_argument('-b',
                                  '--bed',
                                  type=str,
                                  required=True,
                                  help=help_str)
        help_str = (
            'Number of processes to use for parallelization. 0 indicates using a single '
            'process without using a multiprocessing pool '
            '(more means Faster, default: 0).')
        major_parser.add_argument('-p',
                                  '--processes',
                                  type=int,
                                  default=0,
                                  help=help_str)
        help_str = ('Number of iterations for null model. p-value precision '
                    'increases with more iterations, however this will also '
                    'increase the run time (Default: 100,000).')
        major_parser.add_argument('-n',
                                  '--num-iterations',
                                  type=int,
                                  default=100000,
                                  help=help_str)
        help_str = (
            'Number of iterations more significant then the observed statistic '
            'to stop further computations. This decreases compute time spent in resolving '
            'p-values for non-significant genes. (Default: 1000).')
        advance_parser.add_argument('-sc',
                                    '--stop-criteria',
                                    type=int,
                                    default=1000,
                                    help=help_str)
        help_str = (
            'Number of DNA bases to use as context. 0 indicates no context. '
            '1 indicates only use the mutated base.  1.5 indicates using '
            'the base context used in CHASM '
            '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
            '2 indicates using the mutated base and the upstream base. '
            '3 indicates using the mutated base and both the upstream '
            'and downstream bases. (Default: 1.5)')
        major_parser.add_argument('-c',
                                  '--context',
                                  type=float,
                                  default=1.5,
                                  help=help_str)

        if parser is parser_og:
            # oncogene-only options
            help_str = 'Directory containing VEST score information in pickle files (Default: None).'
            major_parser.add_argument('-s',
                                      '--score-dir',
                                      type=str,
                                      default=None,
                                      help=help_str)
            help_str = (
                'Minimum number of mutations at a position for it to be '
                'considered a recurrently mutated position (Default: 3).')
            advance_parser.add_argument('-r',
                                        '--recurrent',
                                        type=int,
                                        default=3,
                                        help=help_str)
            help_str = (
                'Fraction of total mutations in a gene. This define the '
                'minimumm number of mutations for a position to be defined '
                'as recurrently mutated (Defaul: .02).')
            advance_parser.add_argument('-f',
                                        '--fraction',
                                        type=float,
                                        default=.02,
                                        help=help_str)
        else:
            # tsg-only options
            help_str = (
                'Perform tsg randomization-based test if gene has '
                'at least a user specified number of deleterious mutations (default: 1)'
            )
            advance_parser.add_argument('-d',
                                        '--deleterious',
                                        type=int,
                                        default=1,
                                        help=help_str)

        # remaining options shared by both sub-commands
        help_str = ('Only keep unique mutations for each tumor sample. '
                    'Mutations reported from heterogeneous sources may contain'
                    ' duplicates, e.g. a tumor sample was sequenced twice.')
        advance_parser.add_argument('--unique',
                                    action='store_true',
                                    default=False,
                                    help=help_str)
        help_str = (
            'Use mutations that are not mapped to the the single reference '
            'transcript for a gene specified in the bed file indicated by '
            'the -b option.')
        advance_parser.add_argument('-u',
                                    '--use-unmapped',
                                    action='store_true',
                                    default=False,
                                    help=help_str)
        help_str = (
            'Path to the genome fasta file. Required if --use-unmapped flag '
            'is used. (Default: None)')
        advance_parser.add_argument('-g',
                                    '--genome',
                                    type=str,
                                    default='',
                                    help=help_str)
        # The seed defaults to the fixed value 101 (not a random seed), and
        # the help text states exactly that.
        help_str = ('Specify the seed for the pseudo random number generator. '
                    'The seed will be used for the monte carlo simulations '
                    '(Default: 101).')
        advance_parser.add_argument('-seed',
                                    '--seed',
                                    type=int,
                                    default=101,
                                    help=help_str)
        help_str = 'Output text file of probabilistic 20/20 results'
        major_parser.add_argument('-o',
                                  '--output',
                                  type=str,
                                  required=True,
                                  help=help_str)
    args = parent_parser.parse_args()

    # handle logging: an explicit --log path wins; a log level alone passes
    # '' (auto-name the log file); with neither, log to os.devnull
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''
    else:
        log_file = os.devnull
    utils.start_logging(log_file=log_file,
                        log_level=args.log_level,
                        verbose=args.verbose)  # start logging

    # --use-unmapped is only usable together with a genome FASTA path
    opts = vars(args)
    if opts['use_unmapped'] and not opts['genome']:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # log user entered command
    logger.info('Version: {0}'.format(prob2020.__version__))
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts
def parse_arguments():
    """Parse the command line for the non-silent mutation ratio simulation.

    Arguments are read from ``sys.argv``; argparse exits the process on
    ``--help`` or when a required option (``--input``, ``--mutations``,
    ``--bed``, ``--output``) is missing. After parsing, logging is started
    through ``utils.start_logging`` and the invoking command is logged.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name.

    Raises
    ------
    SystemExit
        With status 1 when ``--use-unmapped`` is set but no ``--genome``
        path was supplied.
    """
    cli = argparse.ArgumentParser(
        description='Simulates the non-silent mutation ratio by randomly permuting mutations')

    # Each entry is (option strings, add_argument keyword settings). The
    # entries are registered in this order, which is also the --help order.
    option_table = (
        # logging arguments (this script has no --verbose flag)
        (('-ll', '--log-level'),
         dict(type=str, action='store', default='',
              help='Write a log file (--log-level=DEBUG for debug mode, '
                   '--log-level=INFO for info mode)')),
        (('-l', '--log'),
         dict(type=str, action='store', default='',
              help='Path to log file. (accepts "stdout")')),

        # program arguments
        (('-i', '--input'),
         dict(type=str, required=True,
              help='gene FASTA file from extract_gene_seq.py script')),
        (('-m', '--mutations'),
         dict(type=str, required=True,
              help='DNA mutations file')),
        (('-b', '--bed'),
         dict(type=str, required=True,
              help='BED file annotation of genes')),
        (('-p', '--processes'),
         dict(type=int, default=0,
              help='Number of processes to use. 0 indicates using a single '
                   'process without using a multiprocessing pool '
                   '(more means Faster, default: 0).')),
        (('-n', '--num-permutations'),
         dict(type=int, default=10000,
              help='Number of permutations for null model. p-value precision '
                   'increases with more permutations (Default: 10000).')),
        (('-c', '--context'),
         dict(type=float, default=1.5,
              help='Number of DNA bases to use as context. 0 indicates no context. '
                   '1 indicates only use the mutated base.  1.5 indicates using '
                   'the base context used in CHASM '
                   '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                   '2 indicates using the mutated base and the upstream base. '
                   '3 indicates using the mutated base and both the upstream '
                   'and downstream bases. (Default: 1.5)')),
        (('-s', '--score-dir'),
         dict(type=str, default=None,
              help='Directory containing score information in pickle files (Default: None).')),
        (('-bs', '--by-sample'),
         dict(action='store_true',
              help='Report counts for observed mutations stratified by the tumor sample')),
        (('-u', '--use-unmapped'),
         dict(action='store_true', default=False,
              help='Use mutations that are not mapped to the the single reference '
                   'transcript for a gene specified in the bed file indicated by '
                   'the -b option.')),
        (('-g', '--genome'),
         dict(type=str, default='',
              help='Path to the genome fasta file. Required if --use-unmapped flag '
                   'is used. (Default: None)')),
        (('-oo', '--observed-output'),
         dict(type=str, default=None,
              help='Output text file of observed results (optional).')),
        (('-o', '--output'),
         dict(type=str, required=True,
              help='Output text file of simulation results')),
    )
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)
    parsed = cli.parse_args()

    # An explicit --log path wins; a log level alone passes '' (presumably
    # utils.start_logging then auto-names the file); with neither option the
    # log output is sent to os.devnull.
    if parsed.log:
        destination = parsed.log
    elif parsed.log_level:
        destination = ''
    else:
        destination = os.devnull
    utils.start_logging(log_file=destination,
                        log_level=parsed.log_level)

    # --use-unmapped is only usable together with a genome FASTA path
    options = vars(parsed)
    genome_missing = options['use_unmapped'] and not options['genome']
    if genome_missing:
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the command exactly as the user entered it
    invocation = ' '.join(sys.argv)
    logger.info('Command: {0}'.format(invocation))
    return options
# Example #6
# 0
def parse_arguments():
    """Parse the command line and start logging for the mutation check.

    Arguments are read from ``sys.argv``; argparse exits the process on
    ``--help`` or when one of the required options (``--fasta``,
    ``--mutations``, ``--bed``, ``--unmapped``) is missing.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name.
    """
    cli = argparse.ArgumentParser(
        description='Checks mutations to see what strand they are reported '
                    'on and for unmapped mutations.')

    # Each entry is (option strings, add_argument keyword settings). The
    # entries are registered in this order, which is also the --help order.
    option_table = (
        # logging arguments
        (('-ll', '--log-level'),
         dict(type=str, action='store', default='',
              help='Write a log file (--log-level=DEBUG for debug mode, '
                   '--log-level=INFO for info mode)')),
        (('-l', '--log'),
         dict(type=str, action='store', default='stdout',
              help='Path to log file. (Default: stdout)')),
        (('-v', '--verbose'),
         dict(action='store_true', default=False,
              help='Flag for more verbose log output')),

        # program arguments
        (('-f', '--fasta'),
         dict(type=str, required=True,
              help='Human genome FASTA file')),
        (('-m', '--mutations'),
         dict(type=str, required=True,
              help='Text file specifying mutations in the format required for permutation test')),
        (('-b', '--bed'),
         dict(type=str, required=True,
              help='BED file of reference transcripts')),
        (('-u', '--unmapped'),
         dict(type=str, required=True,
              help='Save mutations that could not be found on the reference transcript')),
    )
    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)
    parsed = cli.parse_args()

    # An explicit --log value wins; a log level without a log path passes ''
    # (presumably utils.start_logging then auto-names the file); with
    # neither, log output goes to os.devnull.
    if parsed.log:
        destination = parsed.log
    elif parsed.log_level:
        destination = ''
    else:
        destination = os.devnull
    utils.start_logging(log_file=destination,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    return vars(parsed)
# Example #7
# 0
def parse_arguments():
    """Parse the command line for the simulate/summarize mutation script.

    Builds the argparse interface, parses ``sys.argv``, starts logging
    through ``utils.start_logging`` and verifies that a genome FASTA was
    supplied whenever ``--use-unmapped`` is requested.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name.

    Raises
    ------
    SystemExit
        Raised by argparse for an invalid command line, or with status 1
        when ``--use-unmapped`` is set without ``--genome``.
    """
    cli = argparse.ArgumentParser(
        description='Either simulates or summarizes observed mutation data.')

    # ---- logging options ----
    cli.add_argument(
        '-ll', '--log-level',
        type=str, action='store', default='',
        help=('Write a log file (--log-level=DEBUG for debug mode, '
              '--log-level=INFO for info mode)'))
    cli.add_argument(
        '-l', '--log',
        type=str, action='store', default='stdout',
        help='Path to log file. (accepts "stdout")')
    cli.add_argument(
        '-v', '--verbose',
        action='store_true', default=False,
        help='Flag for more verbose log output')

    # ---- input files ----
    cli.add_argument(
        '-i', '--input',
        type=str, required=True,
        help='gene FASTA file from extract_gene_seq script')
    cli.add_argument(
        '-m', '--mutations',
        type=str, required=True,
        help='DNA mutations file (MAF file)')
    cli.add_argument(
        '-b', '--bed',
        type=str, required=True,
        help='BED file annotation of genes')
    cli.add_argument(
        '-s', '--score-dir',
        type=str, default=None,
        help=('Directory containing pre-compute score information in '
              'for VEST and evolutionary conservation in pickle format (Default: None).'))

    # ---- simulation settings ----
    cli.add_argument(
        '-p', '--processes',
        type=int, default=0,
        help=('Number of processes to use. 0 indicates using a single '
              'process without using a multiprocessing pool '
              '(more means Faster, default: 0).'))
    cli.add_argument(
        '-n', '--num-iterations',
        type=int, default=0,
        help=('Number of iterations for null model simulations. If zero is '
              'specified then output represents a result from actually observed mutations (provided by -m parameter), '
              'otherwise results will be from simulated mutations. (Default: 0).'))
    cli.add_argument(
        '-c', '--context',
        type=float, default=1.5,
        help=('Number of DNA bases to use as context. 0 indicates no context. '
              '1 indicates only use the mutated base.  1.5 indicates using '
              'the base context used in CHASM '
              '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
              '2 indicates using the mutated base and the upstream base. '
              '3 indicates using the mutated base and both the upstream '
              'and downstream bases. (Default: 1.5)'))

    # ---- output format: exactly one of the two flags must be given ----
    output_format = cli.add_mutually_exclusive_group(required=True)
    output_format.add_argument(
        '--summary',
        action='store_true',
        help=('Flag for saving results as summarized '
              'features used (Default: True).'))
    output_format.add_argument(
        '--maf',
        action='store_true',
        help=('Flag for saving results in MAF format '
              '(Default: False).'))

    # ---- handling of unmapped mutations ----
    cli.add_argument(
        '-u', '--use-unmapped',
        action='store_true', default=False,
        help=('Use mutations that are not mapped to the the single reference '
              'transcript for a gene specified in the bed file indicated by '
              'the -b option.'))
    cli.add_argument(
        '-g', '--genome',
        type=str, default='',
        help=('Path to the genome fasta file. Required if --use-unmapped flag '
              'is used. (Default: None)'))

    # ---- recurrence thresholds and miscellaneous ----
    cli.add_argument(
        '-r', '--recurrent',
        type=int, default=3,
        help=('Minimum number of mutations at a position for it to be '
              'considered a recurrently mutated position (Default: 3).'))
    cli.add_argument(
        '-f', '--fraction',
        type=float, default=.02,
        help=('Fraction of total mutations in a gene. This define the '
              'minimumm number of mutations for a position to be defined '
              'as recurrently mutated (Default: .02).'))
    cli.add_argument(
        '--unique',
        action='store_true', default=False,
        help=('Only keep unique mutations for each tumor sample.'
              'Mutations reproted from heterogeneous sources may contain'
              ' duplicates, e.g. a tumor sample was sequenced twice.'))
    cli.add_argument(
        '-seed', '--seed',
        type=int, default=101,
        help=('Specify the seed for the pseudo random number generator. '
              'By default, the seed is randomly chosen based. The seed will '
              'be used for the monte carlo simulations (Default: 101).'))
    cli.add_argument(
        '-o', '--output',
        type=str, required=True,
        help='Output text file of results')

    parsed = cli.parse_args()

    # Choose the log destination: an explicit --log value wins, a bare
    # --log-level falls back to an auto-named file, otherwise discard.
    if parsed.log:
        log_target = parsed.log
    elif parsed.log_level:
        log_target = ''  # auto-name the log file
    else:
        log_target = os.devnull
    utils.start_logging(log_file=log_target,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    options = vars(parsed)
    if options['use_unmapped'] and not options['genome']:
        # unmapped mutations can only be resolved against a genome FASTA
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the exact invocation in the log
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return options
def parse_arguments():
    """Parse the command line for the non-silent mutation ratio simulation.

    Registers every option from a table, parses ``sys.argv``, starts logging
    through ``utils.start_logging`` and verifies that a genome FASTA was
    supplied whenever ``--use-unmapped`` is requested.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name.

    Raises
    ------
    SystemExit
        Raised by argparse for an invalid command line, or with status 1
        when ``--use-unmapped`` is set without ``--genome``.
    """
    cli = argparse.ArgumentParser(
        description='Simulates the non-silent mutation ratio by randomly permuting mutations')

    # Each entry is (option strings, keyword settings). Entries are
    # registered in listing order so the generated help keeps its layout.
    option_table = [
        # logging arguments
        (('-ll', '--log-level'),
         dict(type=str, action='store', default='',
              help='Write a log file (--log-level=DEBUG for debug mode, '
                   '--log-level=INFO for info mode)')),
        (('-l', '--log'),
         dict(type=str, action='store', default='',
              help='Path to log file. (accepts "stdout")')),
        # program arguments
        (('-i', '--input'),
         dict(type=str, required=True,
              help='gene FASTA file from extract_gene_seq.py script')),
        (('-m', '--mutations'),
         dict(type=str, required=True,
              help='DNA mutations file')),
        (('-b', '--bed'),
         dict(type=str, required=True,
              help='BED file annotation of genes')),
        (('-p', '--processes'),
         dict(type=int, default=0,
              help='Number of processes to use. 0 indicates using a single '
                   'process without using a multiprocessing pool '
                   '(more means Faster, default: 0).')),
        (('-n', '--num-permutations'),
         dict(type=int, default=10000,
              help='Number of permutations for null model. p-value precision '
                   'increases with more permutations (Default: 10000).')),
        (('-c', '--context'),
         dict(type=float, default=1.5,
              help='Number of DNA bases to use as context. 0 indicates no context. '
                   '1 indicates only use the mutated base.  1.5 indicates using '
                   'the base context used in CHASM '
                   '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                   '2 indicates using the mutated base and the upstream base. '
                   '3 indicates using the mutated base and both the upstream '
                   'and downstream bases. (Default: 1.5)')),
        (('-s', '--score-dir'),
         dict(type=str, default=None,
              help='Directory containing score information in pickle files (Default: None).')),
        (('-bs', '--by-sample'),
         dict(action='store_true',
              help='Report counts for observed mutations stratified by the tumor sample')),
        (('-u', '--use-unmapped'),
         dict(action='store_true', default=False,
              help='Use mutations that are not mapped to the the single reference '
                   'transcript for a gene specified in the bed file indicated by '
                   'the -b option.')),
        (('-g', '--genome'),
         dict(type=str, default='',
              help='Path to the genome fasta file. Required if --use-unmapped flag '
                   'is used. (Default: None)')),
        (('-oo', '--observed-output'),
         dict(type=str, default=None,
              help='Output text file of observed results (optional).')),
        (('-o', '--output'),
         dict(type=str, required=True,
              help='Output text file of simulation results')),
    ]
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    parsed = cli.parse_args()

    # With neither --log nor --log-level the log output is discarded; a bare
    # --log-level leaves the path empty so the log file is auto-named.
    if not (parsed.log_level or parsed.log):
        log_path = os.devnull
    else:
        log_path = parsed.log if parsed.log else ''
    utils.start_logging(log_file=log_path, log_level=parsed.log_level)

    options = vars(parsed)
    if options['use_unmapped'] and not options['genome']:
        # unmapped mutations can only be resolved against a genome FASTA
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the exact invocation in the log
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return options
def parse_arguments():
    """Parse the command line for the oncogene/TSG randomization test.

    Builds the argparse interface, parses ``sys.argv``, starts logging
    through ``utils.start_logging`` and verifies that a genome FASTA was
    supplied whenever ``--use-unmapped`` is requested.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name.

    Raises
    ------
    SystemExit
        Raised by argparse for an invalid command line, or with status 1
        when ``--use-unmapped`` is set without ``--genome``.
    """
    cli = argparse.ArgumentParser(
        description='Performs a randomization-based test on the oncogene and TSG score')
    add_option = cli.add_argument

    # logging options
    add_option('-ll', '--log-level', type=str, action='store', default='',
               help='Write a log file (--log-level=DEBUG for debug mode, '
                    '--log-level=INFO for info mode)')
    add_option('-l', '--log', type=str, action='store', default='',
               help='Path to log file. (accepts "stdout")')
    add_option('-v', '--verbose', action='store_true', default=False,
               help='Flag for more verbose log output')

    # input files and pre-computed resources
    add_option('-i', '--input', type=str, required=True,
               help='gene FASTA file from extract_gene_seq.py script')
    add_option('-m', '--mutations', type=str, required=True,
               help='DNA mutations file')
    add_option('-b', '--bed', type=str, required=True,
               help='BED file annotation of genes')
    add_option('-s', '--score-dir', type=str, default=None,
               help='Directory containing score information in pickle files (Default: None).')
    add_option('-ng', '--neighbor-graph-dir', type=str, default=None,
               help='Directory containing neighbor graph information in pickle files (Default: None).')

    # test configuration
    add_option('-p', '--processes', type=int, default=0,
               help='Number of processes to use. 0 indicates using a single '
                    'process without using a multiprocessing pool '
                    '(more means Faster, default: 0).')
    add_option('-n', '--num-iterations', type=int, default=10000,
               help='Number of iterations for null model. p-value precision '
                    'increases with more iterations, however this will also '
                    'increase the run time (Default: 10000).')
    add_option('-sc', '--stop-criteria', type=int, default=1000,
               help='Number of iterations more significant then the observed statistic '
                    'to stop further computations. This decreases compute time spent in resolving '
                    'p-values for non-significant genes. (Default: 1000).')
    add_option('-k', '--kind', type=str, default='oncogene',
               help='Kind of permutation test to perform ("oncogene" or "tsg"). "position-based" permutation '
                    'test is intended to find oncogenes using position based statistics. '
                    'The "deleterious" permutation test is intended to find tumor '
                    'suppressor genes. (Default: oncogene)')
    add_option('-c', '--context', type=float, default=1.5,
               help='Number of DNA bases to use as context. 0 indicates no context. '
                    '1 indicates only use the mutated base.  1.5 indicates using '
                    'the base context used in CHASM '
                    '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                    '2 indicates using the mutated base and the upstream base. '
                    '3 indicates using the mutated base and both the upstream '
                    'and downstream bases. (Default: 1.5)')

    # mutation filtering
    add_option('-u', '--use-unmapped', action='store_true', default=False,
               help='Use mutations that are not mapped to the the single reference '
                    'transcript for a gene specified in the bed file indicated by '
                    'the -b option.')
    add_option('-g', '--genome', type=str, default='',
               help='Path to the genome fasta file. Required if --use-unmapped flag '
                    'is used. (Default: None)')
    add_option('--unique', action='store_true', default=False,
               help='Only keep unique mutations for each tumor sample.'
                    'Mutations reproted from heterogeneous sources may contain'
                    ' duplicates, e.g. a tumor sample was sequenced twice.')

    # statistic thresholds and pseudo-counts
    add_option('-r', '--recurrent', type=int, default=3,
               help='Minimum number of mutations at a position for it to be '
                    'considered a recurrently mutated position (Default: 3).')
    add_option('-f', '--fraction', type=float, default=.02,
               help='Fraction of total mutations in a gene. This define the '
                    'minimumm number of mutations for a position to be defined '
                    'as recurrently mutated (Defaul: .02).')
    add_option('-d', '--deleterious', type=int, default=1,
               help='Perform tsg permutation test if gene has '
                    'at least a user specified number of deleterious mutations (default: 1)')
    add_option('-t', '--tsg-score', type=float, default=1.01,
               help='Maximum TSG score to allow gene to be tested for oncogene '
                    'permutation test. Values greater than one indicate all '
                    'genes will be tested (Default: 1.01).')
    add_option('-dp', '--deleterious-pseudo-count', type=int, default=0,
               help='Deleterious mutation pseudo-count for null distribution '
                    'statistics. (Default: 0)')
    add_option('-rp', '--recurrent-pseudo-count', type=int, default=0,
               help='Recurrent missense mutation pseudo-count for null distribution '
                    'statistics. (Default: 0)')
    add_option('-seed', '--seed', type=int, default=None,
               help='Specify the seed for the pseudo random number generator. '
                    'By default, the seed is randomly chosen based. The seed will '
                    'be used for the permutation test monte carlo simulations.')
    add_option('-o', '--output', type=str, required=True,
               help='Output of probabilistic 20/20 results')

    parsed = cli.parse_args()

    # An explicit --log path wins; a bare --log-level yields '' (auto-named
    # log file); with neither, log output goes to the null device.
    log_target = parsed.log or ('' if parsed.log_level else os.devnull)
    utils.start_logging(log_file=log_target,
                        log_level=parsed.log_level,
                        verbose=parsed.verbose)

    options = vars(parsed)
    if options['use_unmapped'] and not options['genome']:
        # unmapped mutations can only be resolved against a genome FASTA
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # record the exact invocation in the log
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return options
def parse_arguments():
    """Parse the command line for the driver gene statistical tests.

    The top-level parser carries the logging options and dispatches to one
    of three sub-commands (``oncogene``, ``tsg`` or ``hotmaps1d``); the
    chosen name is stored under the ``kind`` key. After parsing, logging is
    started through ``utils.start_logging`` and the ``--use-unmapped`` /
    ``--genome`` combination is validated.

    Returns
    -------
    dict
        Parsed options keyed by argparse destination name, including
        ``kind`` for the selected sub-command.

    Raises
    ------
    SystemExit
        With status 2 when argparse rejects the command line (including a
        missing sub-command), and with status 1 when ``--use-unmapped`` is
        set without ``--genome``.
    """
    # make a parser
    info = 'Performs a statistical test for oncogene, TSG, or driver gene'
    parent_parser = argparse.ArgumentParser(description=info)

    # logging arguments
    parent_parser.add_argument('-ll', '--log-level',
                               type=str,
                               action='store',
                               default='',
                               help='Write a log file (--log-level=DEBUG for debug mode, '
                               '--log-level=INFO for info mode)')
    parent_parser.add_argument('-l', '--log',
                               type=str,
                               action='store',
                               default='stdout',
                               help='Path to log file. (accepts "stdout")')
    parent_parser.add_argument('-v', '--verbose',
                               action='store_true',
                               default=False,
                               help='Flag for more verbose log output')

    # add subparsers
    subparsers = parent_parser.add_subparsers(title='Driver Gene Type', dest='kind')
    # Python 3 treats sub-commands as optional by default. Without one the
    # namespace has no 'use_unmapped' entry and the validation further down
    # would fail with a KeyError, so demand a sub-command up front. Setting
    # the attribute (rather than a keyword) also works on older Pythons.
    subparsers.required = True
    parser_og = subparsers.add_parser('oncogene',
                                      help='Find statistically significant oncogene-like genes.',
                                      description='Find statsitically significant oncogene-like genes. '
                                      'Evaluates clustering of missense mutations and high in '
                                      'silico pathogenicity scores for missense mutations.')
    help_info = 'Find statistically significant Tumor Suppressor-like genes.'
    parser_tsg = subparsers.add_parser('tsg',
                                       help=help_info,
                                       description=help_info + ' Evaluates for a higher proportion '
                                       'of inactivating mutations than expected.')
    help_info = 'Find codons with significant clustering of missense mutations in sequence.'
    parser_hotmaps = subparsers.add_parser('hotmaps1d',
                                           help=help_info,
                                           description=help_info + ' Evaluates for a higher ammount of '
                                           'clustering of missense mutations.')
    # NOTE: a 'protein' sub-command (3D clustering on protein structure) was
    # only ever sketched in commented-out code and is not registered here.

    # program arguments: shared options are added to every sub-command,
    # with a few sub-command specific ones selected by name
    kind_parsers = (('oncogene', parser_og),
                    ('tsg', parser_tsg),
                    ('hotmaps1d', parser_hotmaps))
    for kind, parser in kind_parsers:
        # group of parameters
        major_parser = parser.add_argument_group(title='Major options')
        advance_parser = parser.add_argument_group(title='Advanced options')

        # set the CLI params
        help_str = 'gene FASTA file from extract_gene_seq.py script'
        major_parser.add_argument('-i', '--input',
                                  type=str, required=True,
                                  help=help_str)
        help_str = ('DNA mutations file (MAF file). Columns can be in any order, '
                    'but should contain the correct column header names.')
        major_parser.add_argument('-m', '--mutations',
                                  type=str, required=True,
                                  help=help_str)
        help_str = 'BED file annotation of genes'
        major_parser.add_argument('-b', '--bed',
                                  type=str, required=True,
                                  help=help_str)
        help_str = ('Number of processes to use for parallelization. 0 indicates using a single '
                    'process without using a multiprocessing pool '
                    '(more means Faster, default: 0).')
        major_parser.add_argument('-p', '--processes',
                                  type=int, default=0,
                                  help=help_str)
        help_str = ('Number of iterations for null model. p-value precision '
                    'increases with more iterations, however this will also '
                    'increase the run time (Default: 100,000).')
        major_parser.add_argument('-n', '--num-iterations',
                                  type=int, default=100000,
                                  help=help_str)
        help_str = ('Number of iterations more significant then the observed statistic '
                    'to stop further computations. This decreases compute time spent in resolving '
                    'p-values for non-significant genes. (Default: 1000).')
        advance_parser.add_argument('-sc', '--stop-criteria',
                                    type=int, default=1000,
                                    help=help_str)
        help_str = ('Number of DNA bases to use as context. 0 indicates no context. '
                    '1 indicates only use the mutated base.  1.5 indicates using '
                    'the base context used in CHASM '
                    '(http://wiki.chasmsoftware.org/index.php/CHASM_Overview). '
                    '2 indicates using the mutated base and the upstream base. '
                    '3 indicates using the mutated base and both the upstream '
                    'and downstream bases. (Default: 1.5)')
        major_parser.add_argument('-c', '--context',
                                  type=float, default=1.5,
                                  help=help_str)
        if kind == 'oncogene':
            help_str = 'Directory containing VEST score information in pickle files (Default: None).'
            major_parser.add_argument('-s', '--score-dir',
                                      type=str, default=None,
                                      help=help_str)
            help_str = ('Minimum number of mutations at a position for it to be '
                        'considered a recurrently mutated position (Default: 3).')
            advance_parser.add_argument('-r', '--recurrent',
                                        type=int, default=3,
                                        help=help_str)
            help_str = ('Fraction of total mutations in a gene. This define the '
                        'minimumm number of mutations for a position to be defined '
                        'as recurrently mutated (Defaul: .02).')
            advance_parser.add_argument('-f', '--fraction',
                                        type=float, default=.02,
                                        help=help_str)
        elif kind == 'tsg':
            help_str = ('Perform tsg randomization-based test if gene has '
                        'at least a user specified number of deleterious mutations (default: 1)')
            advance_parser.add_argument('-d', '--deleterious',
                                        type=int, default=1,
                                        help=help_str)
        elif kind == 'hotmaps1d':
            help_str = ('Sequence window size for HotMAPS 1D algorithm '
                        'by number of codons (Default: 3)')
            # the window is deliberately kept as a string here, as before
            advance_parser.add_argument('-w', '--window',
                                        type=str, default='3',
                                        help=help_str)
            help_str = ('Flag for reporting index (row number, starts at zero) in associated mutation file')
            advance_parser.add_argument('-r', '--report-index',
                                        action='store_true', default=False,
                                        help=help_str)
            help_str = ('Path to directory to save empirical null distribution')
            advance_parser.add_argument('-nd', '--null-distr-dir',
                                        type=str,
                                        help=help_str)
        help_str = ('Only keep unique mutations for each tumor sample. '
                    'Mutations reported from heterogeneous sources may contain'
                    ' duplicates, e.g. a tumor sample was sequenced twice.')
        advance_parser.add_argument('--unique',
                                    action='store_true',
                                    default=False,
                                    help=help_str)
        help_str = ('Use mutations that are not mapped to the the single reference '
                    'transcript for a gene specified in the bed file indicated by '
                    'the -b option.')
        advance_parser.add_argument('-u', '--use-unmapped',
                                    action='store_true',
                                    default=False,
                                    help=help_str)
        help_str = ('Path to the genome fasta file. Required if --use-unmapped flag '
                    'is used. (Default: None)')
        advance_parser.add_argument('-g', '--genome',
                                    type=str, default='',
                                    help=help_str)
        help_str = ('Specify the seed for the pseudo random number generator. '
                    'By default, the seed is randomly chosen. The seed will '
                    'be used for the monte carlo simulations (Default: 101).')
        advance_parser.add_argument('-seed', '--seed',
                                    type=int, default=101,
                                    help=help_str)
        help_str = 'Output text file of probabilistic 20/20 results'
        major_parser.add_argument('-o', '--output',
                                  type=str, required=True,
                                  help=help_str)
    args = parent_parser.parse_args()

    # handle logging: an explicit --log path wins, a bare --log-level
    # auto-names the file, and with neither the output is discarded
    if args.log:
        log_file = args.log
    elif args.log_level:
        log_file = ''  # auto-name the log file
    else:
        log_file = os.devnull
    utils.start_logging(log_file=log_file,
                        log_level=args.log_level,
                        verbose=args.verbose)  # start logging

    opts = vars(args)
    if opts['use_unmapped'] and not opts['genome']:
        # unmapped mutations can only be resolved against a genome FASTA
        print('You must specify a genome fasta with -g if you set the '
              '--use-unmapped flag to true.')
        sys.exit(1)

    # log the package version and the user entered command
    logger.info('Version: {0}'.format(prob2020.__version__))
    logger.info('Command: {0}'.format(' '.join(sys.argv)))
    return opts