Example #1
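# Test-fixture setup: resolves bundled .cfg files via package_paths, seeds a
# per-test RNG from the shared GLOBAL_RNG, and stages an output directory on
# the test's temporary file system.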
def setUp(self):
    self.set_up()
    self.cfg_path = package_paths.data_path('4pairs_1locus.cfg')
    self.cfg_path2 = package_paths.data_path('4pairs_1locus_maxt5.cfg')
    self.np_new_cfg = package_paths.data_path('negros_panay_new.cfg')
    self.np_new_sps_cfg = package_paths.data_path(
        'negros_panay_new_subs_per_site.cfg')
    self.np_cfg = package_paths.data_path('negros_panay_timescale.cfg')
    self.np_sps_cfg = package_paths.data_path(
        'negros_panay_timescale_subs_per_site.cfg')
    self.seed = GLOBAL_RNG.randint(1, 999999999)
    self.rng = random.Random()
    self.rng.seed(self.seed)
    self.output_dir = self.get_test_subdir(prefix='dmc-test-')
    self.output_prefix = self.temp_fs.token_id
Example #3
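# CLI that converts between the concentration parameter of a Dirichlet
# process and the expected number of categories it yields for N elements,
# optionally estimating category-count probabilities by simulation.
# Note the Python 2 idioms (e.g., dict.iteritems()).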
def main():
    keys_to_print = {
        'ncats': 'expected number of categories',
        'concentration': 'concentration parameter',
    }
    parameter_options = ['concentration', 'ncats']
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='report version and exit')
    parser.add_argument('parameter',
                        choices=parameter_options,
                        nargs=1,
                        help=('The parameter provided. The two options are:\n'
                              '`concentration`: the concentration parameter\n'
                              '\tof the Dirichlet process.\n'
                              '`ncats`: the expected (mean) number of\n'
                              '\tcategories for the Dirichlet\n'
                              '\tprocess.\n'
                              'You provide one of these two parameters along\n'
                              'with the number of elements (taxon pairs),\n'
                              'and this program calculates and returns the\n'
                              'other one accordingly.'))
    parser.add_argument('parameter_value',
                        metavar='X',
                        type=argparse_utils.arg_is_positive_float,
                        help=('Value of the parameter'))
    parser.add_argument('--shape',
                        required=False,
                        type=argparse_utils.arg_is_positive_float,
                        help=('Shape parameter of a gamma hyperprior on the\n'
                              'concentration parameter of the Dirichlet\n'
                              'process. If provided, the program will\n'
                              'calculate a corresponding scale parameter\n'
                              'for the gamma hyperprior such that the\n'
                              'mean of the gamma hyperprior is equal to\n'
                              'the reported concentration parameter and the\n'
                              'prior expectation for the number of\n'
                              'categories is equal to `ncats`.'))
    parser.add_argument(
        '--reps',
        action='store',
        type=int,
        required=False,
        help=('The number of simulation replicates to use for\n'
              'estimating the probability of the number of categories.\n'
              'By default, no simulations are run and the probabilities\n'
              'are not estimated or reported.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The\n'
              'default is the number of CPUs available on the machine.\n'
              'This option is only used if `--reps` is specified.'))
    parser.add_argument('--seed',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument(
        'num_elements',
        metavar='N',
        type=argparse_utils.arg_is_nonnegative_int,
        help='Number of elements (i.e., number of taxon pairs).')

    args = parser.parse_args()

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("WARNING")

    from pymsbayes.teams import DppSimTeam
    from pymsbayes.utils import GLOBAL_RNG, probability
    from pymsbayes.utils.stats import Partition

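    # '0' * num_elements builds a Partition with every element in a single
    # subset; it is used here only for its DPP helper methods.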
    p = Partition('0' * args.num_elements)

    results = dict(zip(parameter_options, [None for k in parameter_options]))
    args.parameter = args.parameter[0]
    if args.parameter == 'concentration':
        results['concentration'] = args.parameter_value
        results['ncats'] = p.get_dpp_expected_num_cats(args.parameter_value)

    elif args.parameter == 'ncats':
        if args.parameter_value > args.num_elements:
            sys.stderr.write(
                'ERROR: `ncats` cannot be greater than the number '
                'of elements\n')
            sys.exit(1)
        elif args.parameter_value < 1.0:
            sys.stderr.write('ERROR: `ncats` cannot be less than 1\n')
            sys.exit(1)
        results['ncats'] = args.parameter_value
        results['concentration'] = p.get_dpp_concentration(
            args.parameter_value)

    else:
        raise Exception('parameter option {0} is not valid'.format(
            args.parameter))

    alpha = results['concentration']
    if args.shape:
        results['shape'] = args.shape
        results['scale'] = results['concentration'] / args.shape
        parameter_options.extend(['shape', 'scale'])
        alpha = probability.GammaDistribution(shape=results['shape'],
                                              scale=results['scale'])

    sys.stdout.write('number of elements = {0}\n'.format(args.num_elements))
    for key in parameter_options:
        sys.stdout.write('{0} = {1}\n'.format(keys_to_print.get(key, key),
                                              results[key]))

    if args.reps:
        sys.stderr.write(
            '\nStarting simulations to estimate probabilities...\n')
        if not args.seed:
            args.seed = random.randint(1, 999999999)
        sys.stderr.write('Using seed {0}\n\n'.format(args.seed))
        GLOBAL_RNG.seed(args.seed)

        sim_team = DppSimTeam(alpha=alpha,
                              num_elements=args.num_elements,
                              base_distribution=None,
                              num_samples=args.reps,
                              num_processors=args.np)
        sim_team.start()

        sys.stderr.write(
            'Estimated probabilities of the number of categories:\n')
        for k, prob in sim_team.psi_probs.iteritems():
            sys.stdout.write('\tp(ncats = {0}) = {1:.4f} (n = {2})\n'.format(
                k, prob, p.number_of_partitions_into_k_subsets(k)))
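
For reference, a minimal sketch of the conversion this tool wraps, assuming
pymsbayes is importable (the element count and target value below are
illustrative):

from pymsbayes.utils.stats import Partition

p = Partition('0' * 10)  # ten elements (taxon pairs), all in one subset
alpha = p.get_dpp_concentration(3.0)        # concentration with E[ncats] = 3
ncats = p.get_dpp_expected_num_cats(alpha)  # round-trips to ~3.0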
Example #5

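# Plot-regeneration driver for the Philippines and Negros-Panay analyses;
# the create_* helper functions it calls are defined earlier in the
# original script (not shown in this fragment).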
def main_cli():
    create_plots( 
            dpp_config_path = project_util.PHILIPPINES_DPP_CFG,
            uniform_config_path = project_util.PHILIPPINES_UNIFORM_CFG,
            old_config_path = project_util.PHILIPPINES_OLD_CFG,
            dpp_info_path = project_util.PHILIPPINES_DPP_INFO,
            dpp_simple_info_path = project_util.PHILIPPINES_DPP_SIMPLE_INFO,
            dpp_inform_info_path = project_util.PHILIPPINES_DPP_INFORM_INFO,
            uniform_info_path = project_util.PHILIPPINES_UNIFORM_INFO,
            old_info_path = project_util.PHILIPPINES_OLD_INFO,
            out_dir = project_util.PLOT_DIR)
    create_negros_panay_plots(
            config_path = project_util.NEGROS_PANAY_CFG,
            ordered_info_path = project_util.NP_DPP_ORDERED_INFO,
            unordered_info_path = project_util.NP_DPP_UNORDERED_INFO,
            out_dir = project_util.PLOT_DIR)
    create_time_plot(
            config_path = project_util.NEGROS_PANAY_CFG,
            info_path = project_util.NP_DPP_ORDERED_INFO,
            out_dir = project_util.PLOT_DIR)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        seed = int(sys.argv[1])
        GLOBAL_RNG.seed(seed)
    main_cli()

Example #8
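# CLI that simulates samples from the prior of an msBayes model (with the
# divergence-model prior forced to 'constrained' and psi fixed at the number
# of taxon pairs) and plots summary statistics against divergence time to
# assess saturation. Python 2 idioms appear here as well (e.g., dict.keys()
# returning a list).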
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

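    # Use at most one worker per sample, then split the samples evenly
    # across workers; the last worker also takes the remainder.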
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(stats_by_time, sep='\t', header=header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Example #9
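# Variant that samples from the prior as configured (no constraint on the
# divergence model) and renders a histogram, annotated with the mean and
# 95% interval, for each summary statistic matching the given prefixes.
# Note the Python 2 idioms (e.g., dict.iteritems()).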
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=args.config,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(sample, sep='\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                                         'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'], summary['qi_95'][0], summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                                       normed=True,
                                       bins=20,
                                       histtype='bar',
                                       align='mid',
                                       orientation='vertical',
                                       zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
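                # Declutter the axes: when there are eight or more tick
                # labels, blank out every other one.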
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                                            labels=xtick_labels,
                                            horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots=[hist],
                                              num_columns=1,
                                              label_schema=None,
                                              title=stat,
                                              title_size=14.0,
                                              title_top=False,
                                              y_title='Density',
                                              y_title_position=0.001,
                                              y_title_size=14.0,
                                              height=4.0,
                                              width=6.0,
                                              auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(stat_pattern,
                                             ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
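
The tick-thinning and annotation logic above is generic matplotlib bookkeeping wrapped in pymsbayes' plotting classes. As a minimal sketch of the same idea using only matplotlib and numpy (the sample data and the output file name here are made up for illustration; `density=True` is the modern spelling of the `normed` argument used above):

import numpy as np
import matplotlib
matplotlib.use('Agg')  # render without a display
import matplotlib.pyplot as plt

values = np.random.gamma(shape=2.0, scale=0.5, size=1000)  # stand-in sample
mean = values.mean()
lower, upper = np.percentile(values, [2.5, 97.5])

fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(values, bins=20, density=True, histtype='bar', align='mid', zorder=0)
ax.set_title('mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(mean, lower, upper),
             fontsize=12, loc='right')
ax.set_ylabel('Density')

# Blank every other x-tick label once the axis gets crowded.
ax.set_xticks(ax.get_xticks())  # freeze tick positions before relabeling
labels = ['{0:g}'.format(t) for t in ax.get_xticks()]
if len(labels) >= 8:
    for i in range(1, len(labels), 2):
        labels[i] = ''
ax.set_xticklabels(labels)

fig.savefig('plot-example.pdf')
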
Example #10
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'configs',
        metavar='CONFIG-PATH',
        type=arg_is_config,
        nargs='+',
        help=('msBayes config file paths for which to estimate prior '
              'probabilities.'))
    parser.add_argument('-n',
                        '--num-prior-samples',
                        action='store',
                        type=int,
                        default=1000,
                        help=('The number of prior samples to simulate for '
                              'probability estimates.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-d',
        '--dispersion-threshold',
        action='store',
        type=float,
        default=0.01,
        help=('The threshold for the dispersion index of divergence '
              'times. The estimated prior probability that the '
              'dispersion index is less than this threshold will '
              'be reported for each config.'))
    parser.add_argument(
        '-c',
        '--cv-threshold',
        action='store',
        type=float,
        default=0.01,
        help=('The threshold for the coefficient of variation (CV) of '
              'divergence times. The estimated prior probability that the '
              'CV is less than this threshold will '
              'be reported for each config.'))
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import GLOBAL_RNG

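    # If no seed was given, draw one and log it so the run can be reproduced.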
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    log.info('Using seed {0}'.format(args.seed))
    GLOBAL_RNG.seed(args.seed)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    prob_estimator_team = ModelProbabilityEstimatorTeam(
        config_paths=args.configs,
        num_samples=args.num_prior_samples,
        omega_threshold=args.dispersion_threshold,
        cv_threshold=args.cv_threshold,
        num_processors=args.np)
    prob_estimator_team.start()

    for path in args.configs:
        sys.stdout.write('Prior probabilities for model {0}:\n'.format(path))
        for k, p in prob_estimator_team.psi_probs[path].iteritems():
            sys.stdout.write('\tnum of divergence events = {0}: {1}\n'.format(
                k, p))
        sys.stdout.write('\tdispersion of div times < {0}: {1}\n'.format(
            args.dispersion_threshold, prob_estimator_team.omega_probs[path]))
        sys.stdout.write('\tCV of div times < {0}: {1}\n'.format(
            args.cv_threshold, prob_estimator_team.cv_probs[path]))

    stop_time = datetime.datetime.now()
    log.info('[run_stats]')
    log.info('\tstart_time = {0}'.format(str(start_time)))
    log.info('\tstop_time = {0}'.format(str(stop_time)))
    log.info('\ttotal_duration = {0}'.format(str(stop_time - start_time)))
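
The reported probabilities come from prior simulation inside ModelProbabilityEstimatorTeam. Conceptually, the dispersion-index threshold works like the stand-alone sketch below, which estimates the prior probability that the dispersion index (variance over mean) of a vector of divergence times falls under a threshold; the exponential prior on times is an arbitrary stand-in for the model's actual prior, and the CV version is analogous with sd/mean:

import random

def dispersion_index(times):
    # Var/mean of divergence times; zero when all times are identical.
    mean = sum(times) / float(len(times))
    if mean == 0.0:
        return 0.0
    var = sum((t - mean) ** 2 for t in times) / float(len(times))
    return var / mean

def prob_dispersion_below(num_pairs, threshold, num_samples, rng):
    hits = 0
    for _ in range(num_samples):
        times = [rng.expovariate(1.0) for _ in range(num_pairs)]
        if dispersion_index(times) < threshold:
            hits += 1
    return hits / float(num_samples)

rng = random.Random(12345)
print(prob_dispersion_below(num_pairs=4, threshold=0.01,
                            num_samples=10000, rng=rng))
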
Example #11
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('config',
            metavar='CONFIG-FILE',
            type = argparse_utils.arg_is_config,
            help = ('msBayes config file used to estimate the posterior '
                    'sample.'))
    parser.add_argument('posterior_sample_path',
            metavar='POSTERIOR-SAMPLE-FILE',
            type=argparse_utils.arg_is_file,
            help = ('Path to posterior sample file (i.e., '
                    '`*-posterior-sample.txt`).'))
    parser.add_argument('-e', '--expression',
            dest = 'expressions',
            action = 'append',
            metavar = 'TAXON-INDEX-EXPRESSION',
            type = str,
            required = True,
            help = ('A conditional expression of divergence times based on '
                    'the taxon-pair indices for which to calculate the '
                    'posterior probability of being true. Indices correspond '
                    'to the order that pairs of taxa appear in the sample '
                    'table of the config, starting at 0 for the first '
                    'taxon-pair to appear in the table (starting from the '
                    'top). E.g., `-e "0 == 3 == 4"` would request the '
                    'proportion of times the 1st, 4th, and 5th taxon-pairs '
                    '(in order of appearance in the sample table of the '
                    'config) share the same divergence time in the '
                    'posterior sample, whereas `-e "0 > 1"` would request the '
                    'proportion of times the 1st taxon-pair diverged '
                    'further back in time than the 2nd taxon-pair in the '
                    'posterior sample.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities; prior probabilities and Bayes '
                    'factors will be reported. The default is to only report '
                    'posterior probabilities.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel for '
                    'prior simulations. The default is the number of CPUs '
                    'available on the machine. This option is only relevant '
                    'if the number of prior samples is specified using the '
                    '`-n` argument.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('Random number seed to use for simulations. This option '
                    'is only relevant if the number of prior samples is '
                    'specified using the `-n` argument.'))
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)

    evaluators = []
    for exp in args.expressions:
        evaluators.append(stats.ListConditionEvaluator(exp,
                index_labels = cfg.taxa))

    div_models = sumresults.get_partitions_from_posterior_sample_file(
            args.posterior_sample_path)

    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(
                config_paths = [args.config],
                num_samples = args.num_prior_samples,
                num_processors = args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression,
                e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                    args.config].get_condition_frequency(e)
            bf = ((prob_shared_div / (1 - prob_shared_div)) /
                    (prior_prob / (1 - prior_prob)))
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 * math.log(bf)))
        sys.stdout.write('\n')
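
The Bayes factor printed above is just the posterior odds divided by the prior odds of the condition being true. A small, hypothetical helper (not part of pymsbayes) makes the formula and its edge cases explicit:

import math

def bayes_factor(posterior_prob, prior_prob):
    # BF = posterior odds / prior odds; undefined when either
    # probability is exactly 0 or 1.
    for p in (posterior_prob, prior_prob):
        if not (0.0 < p < 1.0):
            raise ValueError('probabilities must be strictly between 0 and 1')
    return ((posterior_prob / (1.0 - posterior_prob)) /
            (prior_prob / (1.0 - prior_prob)))

bf = bayes_factor(0.9, 0.5)
print('Bayes factor = {0}'.format(bf))
print('2ln(Bayes factor) = {0}'.format(2 * math.log(bf)))
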
Example #12
def main_cli(argv = sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs = '+',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('One or more msBayes config files to be used to either '
                    'calculate or simulate observed summary statistics. If '
                    'used in combination with `-r` each config will be used to '
                    'simulate pseudo-observed data. If analyzing real data, do '
                    'not use the `-r` option, and the fasta files specified '
                    'within the config must exist and contain the sequence '
                    'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs = '+',
            type = argparse_utils.arg_is_path,
            required = True,
            help = ('One or more config files to be used to generate prior '
                    'samples. If more than one config is specified, they '
                    'should be separated by spaces. '
                    'This option can also be used to specify the path to a '
                    'directory containing the prior samples and summary '
                    'statistic means and standard deviations generated by a '
                    'previous run using the `generate-samples-only` option. '
                    'These files should be found in the directory '
                    '`pymsbayes-output/prior-stats-summaries`. The '
                    '`pymsbayes-output/model-key.txt` file also needs to be '
                    'present.'
                    ' If specifying this directory, it should be the only '
                    'argument (i.e., no other directories or config files can '
                    'be provided).'))
    parser.add_argument('-r', '--reps',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('This option has two effects. First, it signifies that '
                    'the analysis will be simulation based (i.e., no real '
                    'data will be used). Second, it specifies how many '
                    'simulation replicates to perform (i.e., how many data '
                    'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000000,
            help = ('The number of prior samples to simulate for each prior '
                    'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action = 'store_true',
            help = ('Only generate samples from models as requested. I.e., '
                    'No analyses are performed to approximate posteriors. '
                    'This option can be useful if you want the prior samples '
                    'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of posterior samples desired for each '
                    'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples desired to use for '
                    'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('--staging-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage prior files. This option '
                    'can be useful on clusters to speed up I/O while '
                    'generating prior samples. You can designate a local temp '
                    'directory on a compute node to avoid constant writing to '
                    'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action = 'store',
            type = float,
            help = ('Smoothing parameter for the posterior kernel density '
                    'estimation. This option is used for the `glm` '
                    'regression method. The default is 2 / '
                    '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of equally spaced quantiles at which to '
                    'evaluate the GLM-estimated posterior density. '
                    'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('Suggested frequency (in number of prior samples) for '
                    'running regression and reporting current results. '
                    'Default: 0 (only report final results). '
                    'If a value is given, it may be adjusted so that the '
                    'reporting frequency is a multiple of the multi-processed '
                    'batch size.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action = 'store_true',
            help = ('If multiple prior models are specified, by default a '
                    'global estimate is performed averaging over all models. '
                    'This option prevents the global estimation (i.e., only '
                    'inferences for each model are made).'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress large results files.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action = 'store',
            type = str,
            default = '',
            help = ('Prefix to use at beginning of output files. The default '
                    'is no prefix.'))
    parser.add_argument('--data-key-path',
            action = 'store',
            type = argparse_utils.arg_is_file,
            help = ('The path to a `data-key.txt` file generated by a previous '
                    'run. This file should be found in the directory '
                    '`pymsbayes-output/data-key.txt`. This option '
                    'will override the `-o`/`--observed-configs` option, and '
                    'is intended to be used in combination with the '
                    '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The simulation index at which to begin analyses. Must be '
                    'used in combination with either the number of simulation '
                    'replicates (`-r`/`--reps`) or the `--data-key-path` '
                    'option, and must be a non-negative '
                    'integer that is less than the number of simulation '
                    'replicates. This option can be useful if an analysis '
                    'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The observed config index at which to begin analyses. '
                    'Can be used in combination with the `--data-key-path` '
                    'option to restart long-running, multi-observed-config '
                    'analyses.'))
    parser.add_argument('--dry-run',
            action = 'store_true',
            help = 'Do not run analyses; only process settings.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples
    
    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain the necessary prior and summary '
                    'files'.format(previous_prior_dir))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique') 
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(
                args.stat_prefixes,
                ignore_case=True)
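    # Default KDE bandwidth shrinks as more posterior samples are retained.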
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
            args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
            args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
            args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
            args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
            ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path) 
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg,
                os.path.dirname(info_path))))

    abc_team = ABCTeam(
            temp_fs = temp_fs,
            observed_stats_files = observed_paths,
            num_taxon_pairs = num_taxon_pairs,
            config_paths = args.prior_configs,
            previous_prior_dir = previous_prior_dir,
            num_prior_samples = args.num_prior_samples,
            num_processors = args.np,
            num_standardizing_samples = args.num_standardizing_samples,
            num_posterior_samples = args.num_posterior_samples,
            num_posterior_density_quantiles = args.num_posterior_quantiles,
            batch_size = args.prior_batch_size,
            output_dir = base_dir,
            output_prefix = args.output_prefix,
            prior_temp_dir = args.staging_dir,
            rng = GLOBAL_RNG,
            report_parameters = True,
            stat_patterns = stat_patterns,
            eureject_exe_path = eureject_path,
            abctoolbox_exe_path = abctb_path,
            msbayes_exe_path = None,
            abctoolbox_bandwidth = args.bandwidth,
            omega_threshold = 0.01,
            cv_threshold = 0.01,
            compress = args.compress,
            reporting_frequency = args.reporting_frequency,
            keep_temps = args.keep_temps,
            global_estimate_only = False,
            global_estimate = not args.no_global_estimate,
            generate_prior_samples_only = args.generate_samples_only,
            start_from_simulation_index = args.start_from_simulation_index,
            start_from_observed_index = args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.observed_stats_paths[i],
                os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.models[i],
                os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace() # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statistics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent = obs_temp_dir,
                prefix = 'observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                        temp_fs = observed_temp_fs,
                        config_path = cfg,
                        output_path = observed_paths[i],
                        schema = 'abctoolbox',
                        stat_patterns = stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(
                workers = obs_workers,
                num_processors = args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # Re-read the reference config if obsSumStats.pl modified it; if
            # every config was updated the same way, they still match.
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                        args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg,
                        None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = observed_batch_size,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = remainder,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                workers = msbayes_workers,
                num_processors = args.np)

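            # Group finished workers by the observed-config index in their tag.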
            workers = dict(zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                        observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception('The number of observed simulations ({0}) '
                            'generated for observed config {1!r} and output to '
                            'file {2!r} does not match the number of reps '
                            '({3})'.format(lc, args.observed_configs[i],
                                observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
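
When simulating pseudo-observed data, the code above splits args.reps replicates across args.np processes: each worker gets an equal batch, plus one extra worker for any remainder. Assuming long_division behaves like Python's built-in divmod (quotient and remainder), the batching arithmetic reduces to this stand-alone sketch:

def batch_sizes(num_reps, num_processors):
    # One batch per worker, plus a remainder batch when the
    # division is not exact.
    if num_reps <= num_processors:
        return [1] * num_reps
    batch_size, remainder = divmod(num_reps, num_processors)
    sizes = [batch_size] * num_processors
    if remainder > 0:
        sizes.append(remainder)
    return sizes

assert batch_sizes(10, 4) == [2, 2, 2, 2, 2]
assert batch_sizes(3, 8) == [1, 1, 1]
assert sum(batch_sizes(1000, 7)) == 1000
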
Example #13
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('div_model_path',
                        metavar='DIV-MODEL-RESULTS-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Path to divergence model results file (i.e., '
                              '`*-div-model-results.txt`).'))
    parser.add_argument(
        '-i',
        '--taxon-indices',
        nargs='+',
        type=argparse_utils.arg_is_positive_int,
        required=True,
        help=('Two or more space-separated indices of taxa for which to '
              'calculate the probability of them co-diverging. Indices '
              'correspond to the line in the sample table of the config, '
              'starting at 1 for the first line of the table. At least '
              'two indices are required.'))
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file to be used to estimate prior '
              'probability via simulations. If provided, the '
              'posterior probability, prior probability, and Bayes '
              'factor are reported. If not provided, only the posterior '
              'probability is reported.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities. Only used if a config file is '
              'provided with the `-c` argument.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if a config file is provided using the `-c` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if a config file is provided using the '
              '`-c` argument.'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
        div_model_results_path=args.div_model_path)
    for i in args.taxon_indices:
        if ((i < 1) or (i > div_models.npairs)):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    args.taxon_indices = [i - 1 for i in args.taxon_indices]
    prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices)

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
            config_paths=[args.config],
            num_samples=args.num_prior_samples,
            num_processors=args.np)
        prob_estimator_team.start()
        prior_prob = prob_estimator_team.shared_div_probs[args.config][len(
            args.taxon_indices)]
        bf = ((prob_shared_div / (1 - prob_shared_div)) / (prior_prob /
                                                           (1 - prior_prob)))

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 * math.log(bf)))
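
Under the hood, prob_of_shared_divergence amounts to counting how often the chosen taxon pairs fall into the same divergence-time class across posterior samples. A simplified, hypothetical version that works directly on sampled divergence-time vectors:

def prob_shared_divergence(sampled_time_vectors, taxon_indices):
    # Fraction of posterior samples in which all chosen taxon
    # pairs have identical divergence times.
    shared = 0
    for times in sampled_time_vectors:
        focal = [times[i] for i in taxon_indices]
        if len(set(focal)) == 1:
            shared += 1
    return shared / float(len(sampled_time_vectors))

samples = [
    (0.5, 0.5, 1.2),
    (0.5, 0.7, 1.2),
    (0.3, 0.3, 0.3),
]
print(prob_shared_divergence(samples, [0, 1]))  # 2 of 3 samples -> 0.666...
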
Example #14
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('info_path',
            metavar='PYMSBAYES-INFO-FILE',
            type=argparse_utils.arg_is_file,
            help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 100000,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities.'))
    parser.add_argument('-i', '--sample-index',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('The prior-sample index of results to be summarized. '
                    'Output files should have a consistent schema. For '
                    'example, a results file for divergence models might look '
                    'something like '
                    '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
                    'the prior-sample index is "1000000". The default is to '
                    'use the largest prior-sample index, which is probably '
                    'what you want.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output plots will be written. '
                    'The default is to use the directory of the pymsbayes info '
                    'file.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-m', '--mu',
            action = 'store',
            type = argparse_utils.arg_is_positive_float,
            default = None,
            help = ('The mutation rate with which to scale time to units of '
                    'generations. By default, time is not scaled to '
                    'generations.'))
    parser.add_argument('--extension',
            action = 'store',
            type = str,
            default = 'pdf',
            help = ('The file format extension of the plots (e.g., "pdf", '
                    '"png"). The default is pdf.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error(
                '`matplotlib` could not be imported, so plots cannot be\n'
                'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from simulation-based analysis, '
                'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            result_indices = results.get_result_indices(obs_idx, prior_idx, 1)
            result_idx = max(result_indices)
            result_path_prefix = '{0}{1}-'.format(
                    results.get_result_path_prefix(obs_idx, prior_idx, 1),
                    result_idx)
            result_dir = os.path.dirname(result_path_prefix)
            out_prefix = os.path.join(args.output_dir, os.path.basename(
                    result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(result_path_prefix,
                    'posterior-summary')
            div_model_path = get_result_path(result_path_prefix,
                    'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except AttributeError:
                        # fall back for configs that parameterize theta as d_theta
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

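            # A sort index of 0 keeps divergence times in taxon order (unsorted),
            # so per-pair marginal times and ordered div models are meaningful.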
            if results.sort_index == 0:
                #plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping marginal times plot...'.format(
                                    result_path_prefix,
                                    'posterior-summary'))
                else:
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = plotting.get_marginal_divergence_time_plot(
                            config_path = config_path,
                            posterior_summary_path = posterior_summary_path,
                            labels = None,
                            estimate = 'median',
                            interval = 'HPD_95_interval',
                            time_multiplier = time_multiplier,
                            horizontal = True,
                            label_dimension = label_dimension,
                            measure_dimension = 8.0,
                            label_size = 12.0,
                            measure_tick_label_size = 12.0,
                            measure_axis_label = 'Divergence time',
                            measure_axis_label_size = 14.0,
                            label_axis_label = 'Taxon pair',
                            label_axis_label_size = 14.0,
                            usetex = False)
                    marginal_times_path = '{0}{1}'.format(out_prefix,
                            'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                #plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping ordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    if prior_cfg.npairs < 4:
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                            div_model_results_path = div_model_path,
                            config_path = config_path,
                            num_top_models = 10,
                            time_multiplier = time_multiplier,
                            height = height,
                            width = width,
                            plot_label_schema = 'uppercase',
                            plot_label_offset = 0,
                            plot_label_size = 12.0,
                            y_title = 'Divergence time',
                            y_title_size = 14.0,
                            y_tick_label_size = 10.0,
                            right_text_size = 10.0,
                            margin_left = margin_left,
                            margin_bottom = 0.0,
                            margin_right = 1,
                            margin_top = margin_top,
                            padding_between_vertical = padding_between_vertical,
                            tab = 0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(out_prefix,
                            'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            else:
                #plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping unordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.UnorderedDivergenceModelPlotGrid(
                            div_model_results_path = div_model_path,
                            num_top_models = 10,
                            time_multiplier = time_multiplier,
                            height = 10.0,
                            width = width,
                            data_label_size = 10.0,
                            plot_label_schema = 'uppercase',
                            plot_label_offset = 0,
                            plot_label_size = 12.0,
                            y_title = 'Divergence time',
                            y_title_size = 14.0,
                            y_tick_label_size = 10.0,
                            right_text_size = 10.0,
                            margin_left = 0.03,
                            margin_bottom = 0.0,
                            margin_right = 1,
                            margin_top = 0.99,
                            padding_between_vertical = 0.8,
                            tab = 0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(out_prefix,
                            'unordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            #plot ndiv plot
            psi_path = get_result_path(result_path_prefix,
                    'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                        'Skipping number of divergences plot...'.format(
                                result_path_prefix,
                                'psi-results'))
            else:
                width = (0.25 * prior_cfg.npairs) + 0.55
                if width < 2.8:
                    width = 2.8
                num_div_summary = plotting.NumberOfDivergencesSummary(
                        config_path = results.prior_index_to_config[prior_idx],
                        psi_results_path = psi_path,
                        posterior_summary_path = posterior_summary_path,
                        num_prior_samples = args.num_prior_samples,
                        num_processors = args.np)
                num_div_summary.create_plot(
                        plot_label_size = 10.0,
                        right_text_size = 10.0,
                        x_label_size = 10.0,
                        y_label_size = 10.0,
                        xtick_label_size = 10.0,
                        ytick_label_size = 8.0,
                        height = 6.0,
                        width = width,
                        margin_bottom = 0.0,
                        margin_left = 0.0,
                        margin_top = 0.97,
                        margin_right = 1.0,
                        padding_between_vertical = 1.0)
                num_div_plot_path = '{0}{1}'.format(out_prefix,
                        'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)

                bf_plot_path = '{0}{1}'.format(out_prefix,
                        ('number-of-divergences-bayes-factors-only.' +
                                args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)
                
                num_div_bf_path = '{0}{1}'.format(out_prefix,
                        'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(n,
                                num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
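
The time_multiplier logic above rescales divergence times with the mutation rate: when the config already expresses time in substitutions per site, dividing by mu yields generations; otherwise times are assumed to be scaled by theta and are first multiplied by the mean theta. A stand-alone restatement of that rule (the argument names are illustrative):

def time_multiplier(mu, time_in_subs_per_site, mean_theta=None):
    # Factor converting the config's time units into generations.
    if mu is None:
        return 1.0  # leave times in the config's native units
    if time_in_subs_per_site:
        return 1.0 / mu
    if mean_theta is None:
        raise ValueError('mean theta is required for theta-scaled times')
    return mean_theta / mu

print(time_multiplier(mu=1e-8, time_in_subs_per_site=True))   # ~1e8
print(time_multiplier(mu=1e-8, time_in_subs_per_site=False,
                      mean_theta=0.01))                       # ~1e6
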
Example #15
def main_cli(argv=sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o',
        '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p',
        '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The '
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r',
        '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested; i.e., '
              'no analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        default=10000,
                        help=('The number of prior samples desired to use for '
                              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b',
        '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernel density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
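    # with the default `--num-posterior-samples` of 1000, the default
    # bandwidth works out to 2 / 1000 = 0.002 (computed after arg parsing)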
    parser.add_argument(
        '-q',
        '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=argparse_utils.arg_is_nonnegative_int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress large results files.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file can be found at '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from-simulation-index` option to restart an '
              'analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a nonnegative '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses.'))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help='Do not run analyses; only process settings')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

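    # cap the standardizing samples at the number of prior samples that
    # will actually be simulated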
    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(
            os.path.join(previous_prior_dir, '*-prior-sample.txt'))
        previous_sums = glob.glob(
            os.path.join(previous_prior_dir, '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(previous_prior_dir))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError(
                    'prior config {0!r} is not a file'.format(path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(args.stat_prefixes,
                                                   ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(
            os.path.join(base_dir, 'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(', '.join(
        [p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, os.path.dirname(info_path))))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.observed_stats_paths[i],
                            os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.models[i],
                               os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statistics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs,
                                              config_path=cfg,
                                              output_path=observed_paths[i],
                                              schema='abctoolbox',
                                              stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(workers=obs_workers,
                                              num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # use a fresh reference config; if every config was updated,
            # the sample tables will still match
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
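            # split the requested replicates evenly across the available
            # processors; any remainder is assigned to one extra worker
            # per config below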
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(
                    args.reps, args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=observed_batch_size,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=remainder,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(workers=msbayes_workers,
                                                  num_processors=args.np)

            workers = dict(
                zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, args.observed_configs[i],
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
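
Because `main_cli` accepts an `argv` list, the script can also be driven programmatically. A hypothetical invocation (the config file names are placeholders, not files shipped with pymsbayes):

if __name__ == '__main__':
    # simulate and analyze 10 pseudo-observed data sets under
    # observed.cfg, drawing priors from two candidate models
    main_cli([
        '-o', 'observed.cfg',
        '-p', 'model1.cfg', 'model2.cfg',
        '-r', '10',
        '--seed', '1234',
    ])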
Example #16
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'config',
        metavar='CONFIG-FILE',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file used to estimate the posterior '
              'sample.'))
    parser.add_argument('posterior_sample_path',
                        metavar='POSTERIOR-SAMPLE-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Path to posterior sample file (i.e., '
                              '`*-posterior-sample.txt`).'))
    parser.add_argument(
        '-e',
        '--expression',
        dest='expressions',
        action='append',
        metavar='TAXON-INDEX-EXPRESSION',
        type=str,
        required=True,
        help=('A conditional expression of divergence times based on '
              'the taxon-pair indices for which to calculate the '
              'posterior probability of being true. Indices correspond '
              'to the order that pairs of taxa appear in the sample '
              'table of the config, starting at 0 for the first '
              'taxon-pair to appear in the table (starting from the '
              'top). E.g., `-e "0 == 3 == 4"` would request the '
              'proportion of times the 1st, 4th, and 5th taxon-pairs '
              '(in order of appearance in the sample table of the '
              'config) share the same divergence time in the '
              'posterior sample, whereas `-e "0 > 1"` would request the '
              'proportion of times the 1st taxon-pair diverged '
              'further back in time than the 2nd taxon-pair in the '
              'posterior sample.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities; prior probabilities and Bayes '
              'factors will be reported. The default is to only report '
              'posterior probabilities.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if the number of prior samples is specified using the '
              '`-n` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if the number of prior samples is '
              'specified using the `-n` argument.'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)

    evaluators = []
    for exp in args.expressions:
        evaluators.append(
            stats.ListConditionEvaluator(exp, index_labels=cfg.taxa))

    div_models = sumresults.get_partitions_from_posterior_sample_file(
        args.posterior_sample_path)

    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(config_paths=[args.config],
                                         num_samples=args.num_prior_samples,
                                         num_processors=args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression, e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write(
            'posterior probability = {0}\n'.format(prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                args.config].get_condition_frequency(e)
            bf = ((prob_shared_div / (1 - prob_shared_div)) /
                  (prior_prob / (1 - prior_prob)))
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 *
                                                                math.log(bf)))
        sys.stdout.write('\n')
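
The Bayes factor printed above is simply the posterior odds divided by the prior odds. A worked example of that arithmetic with made-up probabilities:

import math

posterior_prob = 0.9  # hypothetical posterior probability of the expression
prior_prob = 0.5      # hypothetical prior probability of the expression

bf = ((posterior_prob / (1 - posterior_prob)) /
      (prior_prob / (1 - prior_prob)))  # (0.9/0.1) / (0.5/0.5) = 9.0
two_ln_bf = 2 * math.log(bf)            # 2 * ln(9) ~= 4.394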
Example #17
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('info_path',
                        metavar='PYMSBAYES-INFO-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities.'))
    parser.add_argument(
        '-i',
        '--sample-index',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The prior-sample index of results to be summarized. '
              'Output files should have a consistent schema. For '
              'example, a results file for divergence models might look '
              'something like '
              '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
              'the prior-sample index is "1000000". The default is to '
              'use the largest prior-sample index, which is probably '
              'what you want.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output plots will be written. '
              'The default is to use the directory of the pymsbayes info '
              'file.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-m',
        '--mu',
        action='store',
        type=argparse_utils.arg_is_positive_float,
        default=None,
        help=('The mutation rate with which to scale time to units of '
              'generations. By default, time is not scaled to '
              'generations.'))
    parser.add_argument(
        '--extension',
        action='store',
        type=str,
        default='pdf',
        help=('The file format extension of the plots (e.g., "pdf", '
              '"png"). The default is pdf.'))
    parser.add_argument('--seed',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error('`matplotlib` could not be imported, so plots cannot be\n'
                  'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from simulation-based analysis, '
                  'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            result_indices = results.get_result_indices(obs_idx, prior_idx, 1)
            result_idx = max(result_indices)
            result_path_prefix = '{0}{1}-'.format(
                results.get_result_path_prefix(obs_idx, prior_idx, 1),
                result_idx)
            result_dir = os.path.dirname(result_path_prefix)
            out_prefix = os.path.join(args.output_dir,
                                      os.path.basename(result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(result_path_prefix,
                                                     'posterior-summary')
            div_model_path = get_result_path(result_path_prefix,
                                             'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except:
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

            if results.sort_index == 0:
                # plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping marginal times plot...'.format(
                                    result_path_prefix, 'posterior-summary'))
                else:
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = plotting.get_marginal_divergence_time_plot(
                        config_path=config_path,
                        posterior_summary_path=posterior_summary_path,
                        labels=None,
                        estimate='median',
                        interval='HPD_95_interval',
                        time_multiplier=time_multiplier,
                        horizontal=True,
                        label_dimension=label_dimension,
                        measure_dimension=8.0,
                        label_size=12.0,
                        measure_tick_label_size=12.0,
                        measure_axis_label='Divergence time',
                        measure_axis_label_size=14.0,
                        label_axis_label='Taxon pair',
                        label_axis_label_size=14.0,
                        usetex=False)
                    marginal_times_path = '{0}{1}'.format(
                        out_prefix,
                        'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                # plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping ordered div model plot...'.format(
                                    result_path_prefix, 'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    if prior_cfg.npairs < 4:
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                        div_model_results_path=div_model_path,
                        config_path=config_path,
                        num_top_models=10,
                        time_multiplier=time_multiplier,
                        height=height,
                        width=width,
                        plot_label_schema='uppercase',
                        plot_label_offset=0,
                        plot_label_size=12.0,
                        y_title='Divergence time',
                        y_title_size=14.0,
                        y_tick_label_size=10.0,
                        right_text_size=10.0,
                        margin_left=margin_left,
                        margin_bottom=0.0,
                        margin_right=1,
                        margin_top=margin_top,
                        padding_between_vertical=padding_between_vertical,
                        tab=0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix, 'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            else:
                # plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping unordered div model plot...'.format(
                                    result_path_prefix, 'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.UnorderedDivergenceModelPlotGrid(
                        div_model_results_path=div_model_path,
                        num_top_models=10,
                        time_multiplier=time_multiplier,
                        height=10.0,
                        width=width,
                        data_label_size=10.0,
                        plot_label_schema='uppercase',
                        plot_label_offset=0,
                        plot_label_size=12.0,
                        y_title='Divergence time',
                        y_title_size=14.0,
                        y_tick_label_size=10.0,
                        right_text_size=10.0,
                        margin_left=0.03,
                        margin_bottom=0.0,
                        margin_right=1,
                        margin_top=0.99,
                        padding_between_vertical=0.8,
                        tab=0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix, 'unordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            # plot number-of-divergences summary
            psi_path = get_result_path(result_path_prefix, 'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping number of divergences plot...'.format(
                                result_path_prefix, 'psi-results'))
            else:
                width = (0.25 * prior_cfg.npairs) + 0.55
                if width < 2.8:
                    width = 2.8
                num_div_summary = plotting.NumberOfDivergencesSummary(
                    config_path=results.prior_index_to_config[prior_idx],
                    psi_results_path=psi_path,
                    posterior_summary_path=posterior_summary_path,
                    num_prior_samples=args.num_prior_samples,
                    num_processors=args.np)
                num_div_summary.create_plot(plot_label_size=10.0,
                                            right_text_size=10.0,
                                            x_label_size=10.0,
                                            y_label_size=10.0,
                                            xtick_label_size=10.0,
                                            ytick_label_size=8.0,
                                            height=6.0,
                                            width=width,
                                            margin_bottom=0.0,
                                            margin_left=0.0,
                                            margin_top=0.97,
                                            margin_right=1.0,
                                            padding_between_vertical=1.0)
                num_div_plot_path = '{0}{1}'.format(
                    out_prefix, 'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)

                bf_plot_path = '{0}{1}'.format(
                    out_prefix, ('number-of-divergences-bayes-factors-only.' +
                                 args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)

                num_div_bf_path = '{0}{1}'.format(
                    out_prefix, 'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(
                            n, num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
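
A small sketch of the `time_multiplier` arithmetic used above, with made-up values for the mutation rate and the mean of the theta prior:

mu = 1e-8          # hypothetical per-site, per-generation mutation rate
mean_theta = 0.01  # hypothetical mean of the theta prior

# config times already in expected substitutions per site: divide by mu
multiplier_subs_per_site = 1.0 / mu    # 1e8 generations per time unit

# otherwise, scale by the mean theta as in the script above
multiplier_default = mean_theta / mu   # 1e6 generations per time unit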
Example #18
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('div_model_path',
            metavar='DIV-MODEL-RESULTS-FILE',
            type=argparse_utils.arg_is_file,
            help = ('Path to divergence model results file (i.e., '
                    '`*-div-model-results.txt`).'))
    parser.add_argument('-i', '--taxon-indices',
            nargs = '+',
            type = argparse_utils.arg_is_positive_int,
            required = True,
            help = ('Two or more space-separated indices of taxa for which to '
                    'calculate the probability that they co-diverged. Indices '
                    'correspond to lines in the sample table of the config, '
                    'starting at 1 for the first line of the table. At least '
                    'two indices are required.'))
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            help = ('msBayes config file to be used to estimate prior '
                    'probability via simulations. If provided, the '
                    'posterior probability, prior probability, and Bayes '
                    'factor are reported. If not provided, only the posterior '
                    'probability is reported.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 100000,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities. Only used if a config file is '
                    'provided with the `-c` argument.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel for '
                    'prior simulations. The default is the number of CPUs '
                    'available on the machine. This option is only relevant '
                    'if a config file is provided using the `-c` argument.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('Random number seed to use for simulations. This option '
                    'is only relevant if a config file is provided using the '
                    '`-c` argument.'))
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
            div_model_results_path = args.div_model_path)
    for i in args.taxon_indices:
        if ((i < 1) or (i > div_models.npairs)):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    args.taxon_indices = [i - 1 for i in args.taxon_indices]
    prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices)

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
                config_paths = [args.config],
                num_samples = args.num_prior_samples,
                num_processors = args.np)
        prob_estimator_team.start()
        prior_prob = prob_estimator_team.shared_div_probs[args.config][
                len(args.taxon_indices)]
        bf = ((prob_shared_div / (1 - prob_shared_div)) /
                (prior_prob / (1 - prior_prob)))

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 * math.log(bf)))
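
Note that the odds-ratio arithmetic above divides by `1 - probability`, so it breaks down when either probability is exactly 0 or 1. A guarded variant (a sketch, not part of pymsbayes):

def bayes_factor(posterior_prob, prior_prob):
    # posterior odds over prior odds; undefined at probabilities of 0 or 1
    for p in (posterior_prob, prior_prob):
        if not (0.0 < p < 1.0):
            raise ValueError(
                'probabilities must be strictly between 0 and 1; '
                'got {0}'.format(p))
    return ((posterior_prob / (1.0 - posterior_prob)) /
            (prior_prob / (1.0 - prior_prob)))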