Пример #1
0
def main_cli(argv = sys.argv):
    """Command-line entry point: write result summaries for an analysis.

    Parses the command line, configures logging, loads the results described
    by the given "pymsbayes-info.txt" file, and writes the result summaries
    unless a summary file is already present. With ``--plot``, the plots are
    then created via ``create_plots``.

    Parameters
    ----------
    argv : list of str
        Arguments to parse. When this is equal to ``sys.argv`` (the default),
        ``argparse`` reads the process arguments itself; otherwise the list
        is handed to the parser as is.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)

    parser.add_argument('info_path',
            metavar='PYMSBAYES-INFO-PATH',
            type=argparse_utils.arg_is_file,
            help=('Path to the "pymsbayes-info.txt" file.'))
    parser.add_argument('--plot',
            action = 'store_true',
            help = 'Create plots from result summaries.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    # Imported late so the logging level is configured before loggers are
    # requested.
    from pymsbayes.utils.messaging import LoggingControl

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        # --debug is applied last, so it wins over --quiet.
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.utils import sumresults

    results = sumresults.DMCSimulationResults(args.info_path)
    # Materialize the keys: a `dict.keys()` view cannot be indexed on
    # Python 3, and a plain list behaves the same on Python 2.
    prior_indices = list(results.prior_index_to_config.keys())
    observed_indices = list(results.observed_index_to_path.keys())
    # The presence of one summary file is used as the signal that the
    # summaries were already written.
    test_path = results.get_result_summary_path(
            observed_indices[0],
            prior_indices[0])
    if os.path.exists(test_path):
        log.warning('summary files already exists; skipping summaries!')
    else:
        results.write_result_summaries(
                prior_indices = prior_indices,
                include_tau_exclusion_info = False)
    if args.plot:
        create_plots(args.info_path)
Пример #2
0
def main_cli(argv=sys.argv):
    """Command-line entry point: write result summaries for an analysis.

    Parses the command line, configures logging, loads the results described
    by the given "pymsbayes-info.txt" file, and writes the result summaries
    unless a summary file is already present. With ``--plot``, the plots are
    then created via ``create_plots``.

    Parameters
    ----------
    argv : list of str
        Arguments to parse. When this is equal to ``sys.argv`` (the default),
        ``argparse`` reads the process arguments itself; otherwise the list
        is handed to the parser as is.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)

    parser.add_argument('info_path',
                        metavar='PYMSBAYES-INFO-PATH',
                        type=argparse_utils.arg_is_file,
                        help=('Path to the "pymsbayes-info.txt" file.'))
    parser.add_argument('--plot',
                        action='store_true',
                        help='Create plots from result summaries.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    # Imported late so the logging level is configured before loggers are
    # requested.
    from pymsbayes.utils.messaging import LoggingControl

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        # --debug is applied last, so it wins over --quiet.
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.utils import sumresults

    results = sumresults.DMCSimulationResults(args.info_path)
    # `dict.keys()` is a non-indexable view on Python 3; copying it into a
    # list keeps the `[0]` lookups below working on both Python 2 and 3.
    prior_indices = list(results.prior_index_to_config.keys())
    observed_indices = list(results.observed_index_to_path.keys())
    # One summary path is probed to decide whether summaries already exist.
    test_path = results.get_result_summary_path(
        observed_indices[0], prior_indices[0])
    if os.path.exists(test_path):
        log.warning('summary files already exists; skipping summaries!')
    else:
        results.write_result_summaries(prior_indices=prior_indices,
                                       include_tau_exclusion_info=False)
    if args.plot:
        create_plots(args.info_path)
Пример #3
0
def main_cli():
    """Command-line entry point: plot the results of a pymsbayes analysis.

    Parses the command line, configures logging, seeds the global random
    number generator, and then, for every combination of observed and prior
    config index listed in the info file, writes the following into a
    ``plots`` sub-directory of the output directory:

    * when ``results.sort_index`` is 0: a marginal divergence-time plot and
      a grid of the top ordered divergence models;
    * otherwise: a grid of the top unordered divergence models;
    * in both cases: the number-of-divergences plot, its Bayes-factor-only
      variant, and a tab-delimited table of the Bayes factors.

    A plot whose input result file cannot be found is skipped with a
    warning. The process exits with status 1 when `matplotlib` is
    unavailable or when the results have more than one simulation replicate.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('info_path',
                        metavar='PYMSBAYES-INFO-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities.'))
    # NOTE(review): `args.sample_index` is never read below; the largest
    # result index is always used. Confirm whether this option should be
    # honored.
    parser.add_argument(
        '-i',
        '--sample-index',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The prior-sample index of results to be summarized. '
              'Output files should have a consistent schema. For '
              'example, a results file for divergence models might look '
              'something like '
              '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
              'the prior-sample index is "1000000". The default is to '
              'use the largest prior-sample index, which is probably '
              'what you want.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output plots will be written. '
              'The default is to use the directory of the pymsbayes info '
              'file.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-m',
        '--mu',
        action='store',
        type=argparse_utils.arg_is_positive_float,
        default=None,
        help=('The mutation rate with which to scale time to units of '
              'generations. By default, time is not scaled to '
              'generations.'))
    parser.add_argument(
        '--extension',
        action='store',
        type=str,
        default='pdf',
        help=('The file format extension of the plots (e.g., "pdf", '
              '"png"). The default is pdf.'))
    parser.add_argument('--seed',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        # --debug is applied last, so it wins over --quiet.
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error('`matplotlib` could not be imported, so plots can not be\n'
                  'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    # Plots always go into a `plots` directory beneath the chosen (or
    # default) output directory.
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from simulation-based analysis, '
                  'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            result_indices = results.get_result_indices(obs_idx, prior_idx, 1)
            try:
                result_idx = max(result_indices)
            except ValueError:
                # `max` raises ValueError for an empty sequence; treat that
                # like any other missing result: warn and move on.
                log.warning('Could not find any results for observed index '
                            '{0} and prior index {1}; skipping...'.format(
                                obs_idx, prior_idx))
                continue
            result_path_prefix = '{0}{1}-'.format(
                results.get_result_path_prefix(obs_idx, prior_idx, 1),
                result_idx)
            out_prefix = os.path.join(args.output_dir,
                                      os.path.basename(result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(result_path_prefix,
                                                     'posterior-summary')
            div_model_path = get_result_path(result_path_prefix,
                                             'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]

            # Factor used to rescale time when a mutation rate was given.
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    # Fall back to `d_theta` when the mean of `theta` cannot
                    # be read. `Exception` (rather than a bare `except`)
                    # keeps KeyboardInterrupt/SystemExit from being
                    # swallowed here.
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except Exception:
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

            if results.sort_index == 0:
                #plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping marginal times plot...'.format(
                                    result_path_prefix, 'posterior-summary'))
                else:
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = plotting.get_marginal_divergence_time_plot(
                        config_path=config_path,
                        posterior_summary_path=posterior_summary_path,
                        labels=None,
                        estimate='median',
                        interval='HPD_95_interval',
                        time_multiplier=time_multiplier,
                        horizontal=True,
                        label_dimension=label_dimension,
                        measure_dimension=8.0,
                        label_size=12.0,
                        measure_tick_label_size=12.0,
                        measure_axis_label='Divergence time',
                        measure_axis_label_size=14.0,
                        label_axis_label='Taxon pair',
                        label_axis_label_size=14.0,
                        usetex=False)
                    marginal_times_path = '{0}{1}'.format(
                        out_prefix,
                        'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                #plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping ordered div model plot...'.format(
                                    result_path_prefix, 'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    # Different layout values are used for fewer than four
                    # taxon pairs.
                    if prior_cfg.npairs < 4:
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                        div_model_results_path=div_model_path,
                        config_path=config_path,
                        num_top_models=10,
                        time_multiplier=time_multiplier,
                        height=height,
                        width=width,
                        plot_label_schema='uppercase',
                        plot_label_offset=0,
                        plot_label_size=12.0,
                        y_title='Divergence time',
                        y_title_size=14.0,
                        y_tick_label_size=10.0,
                        right_text_size=10.0,
                        margin_left=margin_left,
                        margin_bottom=0.0,
                        margin_right=1,
                        margin_top=margin_top,
                        padding_between_vertical=padding_between_vertical,
                        tab=0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix, 'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            else:
                #plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping unordered div model plot...'.format(
                                    result_path_prefix, 'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.UnorderedDivergenceModelPlotGrid(
                        div_model_results_path=div_model_path,
                        num_top_models=10,
                        time_multiplier=time_multiplier,
                        height=10.0,
                        width=width,
                        data_label_size=10.0,
                        plot_label_schema='uppercase',
                        plot_label_offset=0,
                        plot_label_size=12.0,
                        y_title='Divergence time',
                        y_title_size=14.0,
                        y_tick_label_size=10.0,
                        right_text_size=10.0,
                        margin_left=0.03,
                        margin_bottom=0.0,
                        margin_right=1,
                        margin_top=0.99,
                        padding_between_vertical=0.8,
                        tab=0.08)
                    plot = div_model_plot.create_grid()
                    # NOTE(review): the unordered plot is saved under the
                    # 'ordered-div-models' name; this looks like a
                    # copy/paste slip but is kept because existing
                    # workflows may rely on the file name. Confirm.
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix, 'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            #plot ndiv plot
            psi_path = get_result_path(result_path_prefix, 'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping number of divergences plot...'.format(
                                result_path_prefix, 'psi-results'))
            else:
                # Width grows with the number of taxon pairs, with a floor
                # of 2.8.
                width = (0.25 * prior_cfg.npairs) + 0.55
                if width < 2.8:
                    width = 2.8
                num_div_summary = plotting.NumberOfDivergencesSummary(
                    config_path=results.prior_index_to_config[prior_idx],
                    psi_results_path=psi_path,
                    posterior_summary_path=posterior_summary_path,
                    num_prior_samples=args.num_prior_samples,
                    num_processors=args.np)
                num_div_summary.create_plot(plot_label_size=10.0,
                                            right_text_size=10.0,
                                            x_label_size=10.0,
                                            y_label_size=10.0,
                                            xtick_label_size=10.0,
                                            ytick_label_size=8.0,
                                            height=6.0,
                                            width=width,
                                            margin_bottom=0.0,
                                            margin_left=0.0,
                                            margin_top=0.97,
                                            margin_right=1.0,
                                            padding_between_vertical=1.0)
                num_div_plot_path = '{0}{1}'.format(
                    out_prefix, 'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)

                bf_plot_path = '{0}{1}'.format(
                    out_prefix, ('number-of-divergences-bayes-factors-only.' +
                                 args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)

                # Also write the Bayes factors as a two-column table.
                num_div_bf_path = '{0}{1}'.format(
                    out_prefix, 'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(
                            n, num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
def _odds(probability):
    """Return the odds ``p / (1 - p)``, or infinity when ``p`` is 1 or more."""
    if probability >= 1.0:
        return float('inf')
    return probability / (1.0 - probability)


def _bayes_factor(posterior_prob, prior_prob):
    """Return the ratio of posterior odds to prior odds without raising.

    The result is ``inf`` when the prior odds are zero and the posterior
    odds are not, and ``nan`` when the ratio is undefined (0/0 or inf/inf).
    """
    posterior_odds = _odds(posterior_prob)
    prior_odds = _odds(prior_prob)
    if math.isinf(posterior_odds) and math.isinf(prior_odds):
        return float('nan')
    if prior_odds == 0.0:
        if posterior_odds > 0.0:
            return float('inf')
        return float('nan')
    return posterior_odds / prior_odds


def _two_ln(bayes_factor):
    """Return ``2 * ln(bayes_factor)``; ``-inf`` for 0 and ``nan`` for ``nan``."""
    if math.isnan(bayes_factor):
        return bayes_factor
    if bayes_factor == 0.0:
        return float('-inf')
    return 2 * math.log(bayes_factor)


def main_cli():
    """Command-line entry point: posterior probability of divergence conditions.

    Parses the command line, configures logging, and seeds the global random
    number generator. For each ``-e`` expression it writes to standard
    output the frequency with which the expression holds in the posterior
    sample. When ``-n`` is given, prior samples are simulated first and the
    prior probability, the Bayes factor, and 2ln(Bayes factor) are written
    as well.

    Bayes factors that would need a division by zero or the logarithm of
    zero are reported as ``inf``, ``-inf`` or ``nan`` instead of aborting
    the run.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'config',
        metavar='CONFIG-FILE',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file used to estimate the posterior '
              'sample.'))
    parser.add_argument('posterior_sample_path',
                        metavar='POSTERIOR-SAMPLE-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Path to posterior sample file (i.e., '
                              '`*-posterior-sample.txt`).'))
    parser.add_argument(
        '-e',
        '--expression',
        dest='expressions',
        action='append',
        metavar='TAXON-INDEX-EXPRESSION',
        type=str,
        required=True,
        help=('A conditional expression of divergence times based on '
              'the taxon-pair indices for which to calculate the '
              'posterior probability of being true. Indices correspond '
              'to the order that pairs of taxa appear in the sample '
              'table of the config, starting at 0 for the first '
              'taxon-pair to appear in the table (starting from the '
              'top). E.g., `-e "0 == 3 == 4"` would request the '
              'proportion of times the 1st, 4th, and 5th taxon-pairs '
              '(in order of appearance in the sample table of the '
              'config) share the same divergence time in the '
              'posterior sample, whereas `-e "0 > 1"` would request the '
              'proportion of times the 1st taxon-pair diverged '
              'further back in time than the 2nd taxon-pair in the '
              'posterior sample.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities; prior probabilities and Bayes '
              'factors will be reported. The default is to only report '
              'posterior probabilities.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if the number of prior samples is specified using the '
              '`-n` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if the number of prior samples is '
              'specified using the `-n` argument.'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        # --debug is applied last, so it wins over --quiet.
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)

    # One evaluator per -e expression, labelled with the taxa of the config.
    evaluators = [
        stats.ListConditionEvaluator(exp, index_labels=cfg.taxa)
        for exp in args.expressions]

    div_models = sumresults.get_partitions_from_posterior_sample_file(
        args.posterior_sample_path)

    # Prior simulations are only run when -n was given.
    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(config_paths=[args.config],
                                         num_samples=args.num_prior_samples,
                                         num_processors=args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression, e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write(
            'posterior probability = {0}\n'.format(prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                args.config].get_condition_frequency(e)
            # The helpers avoid ZeroDivisionError / math domain errors when
            # a probability is exactly 0 or 1.
            bf = _bayes_factor(prob_shared_div, prior_prob)
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(_two_ln(bf)))
        sys.stdout.write('\n')
Пример #5
0
def main_cli():
    """Command-line entry point: simulate prior samples for a saturation plot.

    Parses the command line, configures logging, and seeds the global random
    number generator. A copy of the config with a 'constrained' divergence
    model prior and ``psi`` fixed at the number of taxon pairs is written to
    a temporary file, and prior samples are simulated from it by ``--np``
    workers. The resulting stats-by-time matrix is written to
    ``stats-by-time.txt`` (optionally gzipped) in the output directory and,
    when `matplotlib` is available, plotted to ``saturation-plot.pdf``.
    Run settings and timings are recorded in ``pymsbayes-info.txt``.

    Raises
    ------
    Exception
        If a requested stat prefix is not among the simulated statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        # --debug is applied last, so it wins over --quiet.
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize the prefixes: trailing dots are stripped here and a single
    # dot is appended again when building the patterns.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Write a modified copy of the config: 'constrained' divergence model
    # prior with psi fixed at the number of taxon pairs.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never use more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes the remainder of the division.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list: a `dict.keys()` view has no `remove` on
    # Python 3, and a list behaves the same on Python 2.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    # Close the handle even if writing a row fails.
    try:
        for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                   'pi.net': r'$\pi_{net}$',
                   'wattTheta': r'$\theta_W$',
                   'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Пример #6
0
def main_cli():
    """Simulate prior samples from an msBayes config and plot each statistic.

    Command-line entry point. The steps are:

    1. Parse the command line and set up logging.
    2. Split ``--num-prior-samples`` across at most ``--np``
       ``MsBayesWorker`` instances and run them through ``Manager``.
    3. Merge the workers' prior files and write them to
       ``prior-sample.txt`` in the output directory (``.gz`` is appended
       to the name when ``--compress`` is given).
    4. For every simulated column that matches one of the requested
       statistic patterns, save a histogram to ``plot-<stat>.pdf``.
    5. Record run statistics in ``pymsbayes-info.txt`` and purge the
       temporary directory unless ``--keep-temps`` is given.

    Raises:
        SystemExit: on invalid command-line arguments (via argparse), or
            with status 1 when matplotlib is not available.
        Exception: if a requested statistic pattern matches none of the
            simulated columns.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    # Reject counts below 1 up front: they would create no workers in the
    # batching loop below and hand a zero divisor to long_division.
    if args.num_prior_samples < 1:
        parser.error('-n/--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    # --debug is applied last so it wins when combined with --quiet.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    # Project imports are deferred until after the logging level is set.
    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    # Output defaults to the directory that holds the config file.
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalise prefixes so each one ends in exactly one '.' before the
    # patterns are built.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    # Compare against None so that an explicit `--seed 0` is honoured.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to draw.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes whatever does not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    # Close the handle even when a write fails part-way through.
    try:
        for row in dict_line_iter(sample, sep = '\t'):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        # Clean up staged files before bailing out so this early exit does
        # not leave the temporary directory behind.
        if not args.keep_temps:
            log.debug('purging temps...')
            temp_fs.purge()
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        # `.items()` works on both Python 2 and Python 3 dictionaries.
        for stat, values in sample.items():
            if not stat_pattern.match(stat):
                continue
            found = True
            float_values = [float(v) for v in values]
            plot_path = os.path.join(args.output_dir,
                    'plot-{0}.pdf'.format(stat))
            summary = stats.get_summary(float_values)
            s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'],
                    summary['qi_95'][0],
                    summary['qi_95'][1])
            hd = plotting.HistData(x = float_values,
                    normed = True,
                    bins = 20,
                    histtype = 'bar',
                    align = 'mid',
                    orientation = 'vertical',
                    zorder = 0)
            hist = plotting.ScatterPlot(hist_data_list = [hd],
                    right_text = s)
            hist.left_text_size = 12.0
            hist.right_text_size = 12.0
            xticks = list(hist.ax.get_xticks())
            xtick_labels = list(xticks)
            yticks = list(hist.ax.get_yticks())
            ytick_labels = list(yticks)
            # With eight or more ticks, blank every other label to keep
            # the axis readable; the tick marks themselves are kept.
            if len(xtick_labels) >= 8:
                for i in range(1, len(xtick_labels), 2):
                    xtick_labels[i] = ''
            if len(ytick_labels) >= 8:
                for i in range(1, len(ytick_labels), 2):
                    ytick_labels[i] = ''
            xticks_obj = plotting.Ticks(ticks = xticks,
                    labels = xtick_labels,
                    horizontalalignment = 'center')
            yticks_obj = plotting.Ticks(ticks = yticks,
                    labels = ytick_labels)
            hist.xticks_obj = xticks_obj
            hist.yticks_obj = yticks_obj

            plot_grid = plotting.PlotGrid(subplots = [hist],
                    num_columns = 1,
                    label_schema = None,
                    title = stat,
                    title_size = 14.0,
                    title_top = False,
                    y_title = 'Density',
                    y_title_position = 0.001,
                    y_title_size = 14.0,
                    height = 4.0,
                    width = 6.0,
                    auto_height = False)
            plot_grid.auto_adjust_margins = False
            plot_grid.margin_left = 0.04
            plot_grid.margin_bottom = 0.04
            plot_grid.margin_right = 1.0
            plot_grid.margin_top = 0.97
            plot_grid.reset_figure()
            plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Пример #7
0
def main_cli():
    """Simulate prior samples and draw a saturation plot of stats by time.

    Command-line entry point. The steps are:

    1. Parse the command line and set up logging.
    2. Load the msBayes config, set ``div_model_prior`` to
       ``'constrained'`` and ``psi`` to
       ``DiscreteUniformDistribution(npairs, npairs)``, and write the
       modified config to a temporary file that the workers use.
    3. Split ``--num-prior-samples`` across at most ``--np``
       ``MsBayesWorker`` instances and run them through ``Manager``.
    4. Write the ``PRI.t`` column plus the requested statistics to
       ``stats-by-time.txt`` in the output directory (``.gz`` is appended
       to the name when ``--compress`` is given).
    5. If matplotlib is available, save ``saturation-plot.pdf``;
       otherwise log a warning that points at the data file.
    6. Record run statistics in ``pymsbayes-info.txt`` and purge the
       temporary directory unless ``--keep-temps`` is given.

    Raises:
        SystemExit: on invalid command-line arguments (via argparse).
        Exception: if a requested statistic prefix is not among the
            simulated statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    # Reject counts below 1 up front: they would create no workers in the
    # batching loop below and hand a zero divisor to long_division.
    if args.num_prior_samples < 1:
        parser.error('-n/--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    # --debug is applied last so it wins when combined with --quiet.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    # Project imports are deferred until after the logging level is set.
    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    # Output defaults to the directory that holds the config file.
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalise prefixes so each one ends in exactly one '.' before the
    # patterns are built.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    # Compare against None so that an explicit `--seed 0` is honoured.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Simulate from a modified copy of the config; the user's file is left
    # untouched and the copy lives in the temp directory.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to draw.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes whatever does not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list: on Python 3 `keys()` is a view without
    # `remove`, and a copy also leaves `stats_by_time` itself unmodified.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    # Close the handle even when a write fails part-way through.
    try:
        for row in dict_line_iter(stats_by_time, sep='\t', header=header):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        # Math-text axis labels for the default statistics.
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Пример #8
0
def main_cli():
    """Estimate and print prior probabilities for msBayes configs.

    Command-line entry point. Parses the arguments, seeds ``GLOBAL_RNG``,
    runs a ``ModelProbabilityEstimatorTeam`` over the given config paths,
    and writes the following to standard output for each config:

    * the estimated probability of each number of divergence events
      (from the team's ``psi_probs``),
    * the estimated probability that the dispersion of divergence times
      is below ``--dispersion-threshold`` (``omega_probs``), and
    * the estimated probability that the CV of divergence times is below
      ``--cv-threshold`` (``cv_probs``).

    Start time, stop time and duration are logged at INFO level.

    Raises:
        SystemExit: on invalid command-line arguments (via argparse).
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'configs',
        metavar='CONFIG-PATH',
        type=arg_is_config,
        nargs='+',
        help=('msBayes config file paths for which to estimate prior '
              'probabilities.'))
    parser.add_argument('-n',
                        '--num-prior-samples',
                        action='store',
                        type=int,
                        default=1000,
                        help=('The number of prior samples to simulate for '
                              'probability estimates.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-d',
        '--dispersion-threshold',
        action='store',
        type=float,
        default=0.01,
        help=('The threshold for the dispersion index of divergence '
              'times. The estimated prior probability that the '
              'dispersion index is less than this threshold will '
              'be reported for each config.'))
    parser.add_argument(
        '-c',
        '--cv-threshold',
        action='store',
        type=float,
        default=0.01,
        help=('The threshold for the coefficient of variation (CV) of '
              'divergence times. The estimated prior probability that the '
              'CV is less than this threshold will '
              'be reported for each config.'))
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    # --debug is applied last so it wins when combined with --quiet.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    # Project imports are deferred until after the logging level is set.
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import GLOBAL_RNG

    # Compare against None so that an explicit `--seed 0` is honoured.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    log.info('Using seed {0}'.format(args.seed))
    GLOBAL_RNG.seed(args.seed)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    prob_estimator_team = ModelProbabilityEstimatorTeam(
        config_paths=args.configs,
        num_samples=args.num_prior_samples,
        omega_threshold=args.dispersion_threshold,
        cv_threshold=args.cv_threshold,
        num_processors=args.np)
    prob_estimator_team.start()

    # Results are indexed by config path; report them in the order the
    # paths were given on the command line.
    for path in args.configs:
        sys.stdout.write('Prior probabilities for model {0}:\n'.format(path))
        # `.items()` works on both Python 2 and Python 3 dictionaries.
        for k, p in prob_estimator_team.psi_probs[path].items():
            sys.stdout.write('\tnum of divergence events = {0}: {1}\n'.format(
                k, p))
        sys.stdout.write('\tdispersion of div times < {0}: {1}\n'.format(
            args.dispersion_threshold, prob_estimator_team.omega_probs[path]))
        sys.stdout.write('\tCV of div times < {0}: {1}\n'.format(
            args.cv_threshold, prob_estimator_team.cv_probs[path]))

    stop_time = datetime.datetime.now()
    log.info('[run_stats]')
    log.info('\tstart_time = {0}'.format(str(start_time)))
    log.info('\tstop_time = {0}'.format(str(stop_time)))
    log.info('\ttotal_duration = {0}'.format(str(stop_time - start_time)))
Пример #9
0
def main_cli(argv = sys.argv):
    """
    Command-line entry point that sets up and runs an ABC analysis.

    The function proceeds in the following stages:

    1.  Parse the command-line options and configure the logging level.
    2.  Vet the `-p`/`--prior-configs` option, which is either a list of
        config files or a single directory of previously generated prior
        samples.
    3.  Create the `pymsbayes-results` output directory and the temporary
        file system, and record all settings in `pymsbayes-info.txt`.
    4.  Check that the sample tables of all observed and prior configs
        match the sample table of the first observed config.
    5.  Unless `--dry-run` or `--data-key-path` was given, either calculate
        the observed summary statistics from the sequence data (when
        `--reps` is 0) or simulate `--reps` pseudo-observed data sets from
        each observed config.
    6.  Unless `--dry-run` was given, run the `ABCTeam`; then record the run
        times and purge temporary files (unless `--keep-temps` was given).

    Parameters
    ----------
    argv : list of str
        The arguments to parse. If it is equal to `sys.argv` (the default),
        the parser reads the command line of the current process itself.
        Otherwise, the list is handed to `parse_args` as is, so it must
        contain only the arguments (i.e., no leading program name).

    Raises
    ------
    ValueError
        If paths to observed or prior configs are not unique, if a prior
        config is not a file, or if the directory of previous priors lacks
        the necessary prior-sample and summary files.
    errors.SampleTableError
        If the sample tables of the configs differ.
    Exception
        If the number of simulated observed data sets does not match the
        number of replicates requested.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs = '+',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('One or more msBayes config files to be used to either '
                    'calculate or simulate observed summary statistics. If '
                    'used in combination with `-r` each config will be used to '
                    'simulate pseudo-observed data. If analyzing real data, do '
                    'not use the `-r` option, and the fasta files specified '
                    'within the config must exist and contain the sequence '
                    'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs = '+',
            type = argparse_utils.arg_is_path,
            required = True,
            help = ('One or more config files to be used to generate prior '
                    'samples. If more than one config is specified, they '
                    'should be separated by spaces. '
                    'This option can also be used to specify the path to a '
                    'directory containing the prior samples and summary '
                    'statistic means and standard deviations generated by a '
                    'previous run using the `generate-samples-only` option. '
                    'These files should be found in the directory '
                    '`pymsbayes-output/prior-stats-summaries`. The '
                    '`pymsbayes-output/model-key.txt` also needs to be present.'
                    ' If specifying this directory, it should be the only '
                    'argument (i.e., no other directories or config files can '
                    'be provided).'))
    parser.add_argument('-r', '--reps',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('This option has two effects. First, it signifies that '
                    'the analysis will be simulation based (i.e., no real '
                    'data will be used). Second, it specifies how many '
                    'simulation replicates to perform (i.e., how many data '
                    'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000000,
            help = ('The number of prior samples to simulate for each prior '
                    'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action = 'store_true',
            help = ('Only generate samples from models as requested. I.e., '
                    'No analyses are performed to approximate posteriors. '
                    'This option can be useful if you want the prior samples '
                    'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of posterior samples desired for each '
                    'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples desired to use for '
                    'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('--staging-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage prior files. This option '
                    'can be useful on clusters to speed up I/O while '
                    'generating prior samples. You can designate a local temp '
                    'directory on a compute node to avoid constant writing to '
                    'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action = 'store',
            type = float,
            help = ('Smoothing parameter for the posterior kernal density '
                    'estimation. This option is used for the `glm` '
                    'regression method. The default is 2 / '
                    '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of equally spaced quantiles at which to '
                    'evaluate the GLM-estimated posterior density. '
                    'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('Suggested frequency (in number of prior samples) for '
                    'running regression and reporting current results. '
                    'Default: 0 (only report final results). '
                    'If a value is given, it may be adjusted so that the '
                    'reporting frequency is a multiple of the multi-processed '
                    'batch size.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action = 'store_true',
            help = ('If multiple prior models are specified, by default a '
                    'global estimate is performed averaging over all models. '
                    'This option prevents the global estimation (i.e., only '
                    'inferences for each model are made).'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress large results files.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action = 'store',
            type = str,
            default = '',
            help = ('Prefix to use at beginning of output files. The default '
                    'is no prefix.'))
    parser.add_argument('--data-key-path',
            action = 'store',
            type = argparse_utils.arg_is_file,
            help = ('The path to a `data-key.txt` file generated by a previous '
                    'run. This file should be found in the directory '
                    '`pymsbayes-output/data-key.txt`. This option '
                    'will override the `-o`/`--observed-configs` option, and '
                    'is intended to be used in combination with the '
                    '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The simulation index at which to begin analyses. Must be '
                    'used in combination with either the number of simulation '
                    'replicates (`-r`/`--reps`) or the `--data-key-path` '
                    'option, and must be a positive '
                    'integer that is less than the number of simulation '
                    'replicates. This option can be useful if an analysis '
                    'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The observed config index at which to begin analyses. '
                    'Can be used in combination with the `--data-key-path` '
                    'option to restart long-running, multi-observed-config '
                    'analyses'))
    parser.add_argument('--dry-run',
            action = 'store_true',
            help = 'Do not run analyses; only process settings')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    # With the default argument, let argparse read the command line itself
    # (it skips the program name); an explicit list is parsed as given.
    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    # `--debug` is checked last, so it wins if both flags are given
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    # cannot standardize with more samples than will be simulated
    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        # A lone directory means "reuse priors from a previous run"; it is
        # popped so that `args.prior_configs` is left empty for the ABCTeam.
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            # Report the popped directory; `args.prior_configs` is empty
            # here, so indexing it would raise an IndexError instead.
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain necessary prior and summary '
                    'files'.format(previous_prior_dir))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        # make sure every prefix ends with the '.' separator
        args.stat_prefixes = [
                p if p.endswith('.') else p + '.'
                for p in args.stat_prefixes]
        stat_patterns = get_patterns_from_prefixes(
                args.stat_prefixes,
                ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        # restarting: use the observed stats files listed in the data key
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
            args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
            args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
            args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
            args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
            ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg,
                os.path.dirname(info_path))))

    abc_team = ABCTeam(
            temp_fs = temp_fs,
            observed_stats_files = observed_paths,
            num_taxon_pairs = num_taxon_pairs,
            config_paths = args.prior_configs,
            previous_prior_dir = previous_prior_dir,
            num_prior_samples = args.num_prior_samples,
            num_processors = args.np,
            num_standardizing_samples = args.num_standardizing_samples,
            num_posterior_samples = args.num_posterior_samples,
            num_posterior_density_quantiles = args.num_posterior_quantiles,
            batch_size = args.prior_batch_size,
            output_dir = base_dir,
            output_prefix = args.output_prefix,
            prior_temp_dir = args.staging_dir,
            rng = GLOBAL_RNG,
            report_parameters = True,
            stat_patterns = stat_patterns,
            eureject_exe_path = eureject_path,
            abctoolbox_exe_path = abctb_path,
            msbayes_exe_path = None,
            abctoolbox_bandwidth = args.bandwidth,
            omega_threshold = 0.01,
            cv_threshold = 0.01,
            compress = args.compress,
            reporting_frequency = args.reporting_frequency,
            keep_temps = args.keep_temps,
            global_estimate_only = False,
            global_estimate = not args.no_global_estimate,
            generate_prior_samples_only = args.generate_samples_only,
            start_from_simulation_index = args.start_from_simulation_index,
            start_from_observed_index = args.start_from_observed_index)

    # two-way mapping between model indices and prior config paths
    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.observed_stats_paths[i],
                os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.models[i],
                os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace() # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent = obs_temp_dir,
                prefix = 'observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                        temp_fs = observed_temp_fs,
                        config_path = cfg,
                        output_path = observed_paths[i],
                        schema = 'abctoolbox',
                        stat_patterns = stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(
                workers = obs_workers,
                num_processors = args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min(args.reps, args.np)
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                        args.np)
            # One equally sized batch per worker, plus one extra worker for
            # any leftover replicates, so the sizes sum to `args.reps`.
            sample_sizes = [observed_batch_size] * num_observed_workers
            if remainder > 0:
                sample_sizes.append(remainder)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg,
                        None)
                for sample_size in sample_sizes:
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = sample_size,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = 'abctoolbox',
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                workers = msbayes_workers,
                num_processors = args.np)

            # group the finished workers by the observed config (tag) that
            # they simulated from
            workers = dict(zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                        observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        # clean up both temp file systems before bailing
                        # out, as is done for the sample-table failure
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise Exception('The number of observed simulations ({0}) '
                            'generated for observed config {1!r} and output to '
                            'file {2!r} does not match the number of reps '
                            '({3})'.format(lc, args.observed_configs[i],
                                observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Пример #10
0
def main_cli():
    """
    Command-line entry point for reporting the posterior probability of
    divergence-time expressions.

    Each `-e`/`--expression` is evaluated against the divergence models
    parsed from the posterior sample file, and its frequency is written to
    standard output as the posterior probability. If `-n` is given, that
    many prior samples are simulated from the config, and the prior
    probability, Bayes factor, and 2ln(Bayes factor) are reported as well.

    The Bayes factor is the ratio of posterior odds to prior odds. When a
    probability sits on a boundary (0 or 1), the odds are zero or infinite;
    in those cases `inf`, `-inf`, or `nan` is reported rather than raising
    an arithmetic error.

    Arguments are read from the command line of the current process.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('config',
            metavar='CONFIG-FILE',
            type = argparse_utils.arg_is_config,
            help = ('msBayes config file used to estimate the posterior '
                    'sample.'))
    parser.add_argument('posterior_sample_path',
            metavar='POSTERIOR-SAMPLE-FILE',
            type=argparse_utils.arg_is_file,
            help = ('Path to posterior sample file (i.e., '
                    '`*-posterior-sample.txt`).'))
    parser.add_argument('-e', '--expression',
            dest = 'expressions',
            action = 'append',
            metavar = 'TAXON-INDEX-EXPRESSION',
            type = str,
            required = True,
            help = ('A conditional expression of divergence times based on '
                    'the taxon-pair indices for which to calculate the '
                    'posterior probability of being true. Indices correspond '
                    'to the order that pairs of taxa appear in the sample '
                    'table of the config, starting at 0 for the first '
                    'taxon-pair to appear in the table (starting from the '
                    'top). E.g., `-e "0 == 3 == 4"` would request the '
                    'proportion of times the 1st, 4th, and 5th taxon-pairs '
                    '(in order of appearance in the sample table of the '
                    'config) share the same divergence time in the '
                    'posterior sample, whereas `-e "0 > 1"` would request the '
                    'proportion of times the the 1st taxon-pair diverged '
                    'further back in time than the 2nd taxon-pair in the '
                    'posterior sample.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities; prior probabilities and Bayes '
                    'factors will be reported. The default is to only report '
                    'posterior probabilities.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel for '
                    'prior simulations. The default is the number of CPUs '
                    'available on the machine. This option is only relevant '
                    'if the number of prior samples is specified using the '
                    '`-n` argument.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('Random number seed to use for simulations. This option '
                    'is only relevant if the number of prior samples is '
                    'specified using the `-n` argument.'))
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    # `--debug` is checked last, so it wins if both flags are given
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)

    # one evaluator per requested expression, labelled with the config's taxa
    evaluators = []
    for exp in args.expressions:
        evaluators.append(stats.ListConditionEvaluator(exp,
                index_labels = cfg.taxa))

    div_models = sumresults.get_partitions_from_posterior_sample_file(
            args.posterior_sample_path)

    # prior simulations are only run when `-n` was provided
    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(
                config_paths = [args.config],
                num_samples = args.num_prior_samples,
                num_processors = args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression,
                e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                    args.config].get_condition_frequency(e)
            bf = _bayes_factor_from_probabilities(prob_shared_div, prior_prob)
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(
                    _two_ln_bayes_factor(bf)))
        sys.stdout.write('\n')


def _probability_to_odds(prob):
    """
    Return the odds `prob / (1 - prob)`; infinite when `prob` is 1 or more.
    """
    prob = float(prob)
    if prob >= 1.0:
        return float('inf')
    return prob / (1.0 - prob)


def _bayes_factor_from_probabilities(posterior_prob, prior_prob):
    """
    Return the ratio of posterior odds to prior odds.

    Boundary cases that would otherwise raise ZeroDivisionError are mapped
    to `inf` (positive posterior odds over zero prior odds) or `nan` (zero
    over zero, or infinite over infinite).
    """
    posterior_odds = _probability_to_odds(posterior_prob)
    prior_odds = _probability_to_odds(prior_prob)
    if prior_odds == 0.0:
        if posterior_odds > 0.0:
            return float('inf')
        return float('nan')
    # inf / inf evaluates to nan, and finite / inf to 0.0, without raising
    return posterior_odds / prior_odds


def _two_ln_bayes_factor(bf):
    """
    Return `2 * ln(bf)`, using `-inf` for a Bayes factor of zero.
    """
    if bf == 0.0:
        # math.log(0) raises ValueError; the limit is negative infinity
        return float('-inf')
    return 2 * math.log(bf)
def main_cli():
    """Report the probability that a set of taxa share a divergence.

    Parses the command line, loads the divergence-model results file, and
    writes the posterior probability of the requested taxa co-diverging to
    standard output. If a config file is given with `-c`, the prior
    probability is obtained from a `ModelProbabilityEstimatorTeam`, and the
    prior probability, Bayes factor and 2ln(Bayes factor) are written too.

    Exits with status 1 if fewer than two taxon indices are supplied or if
    any index falls outside `1..div_models.npairs`.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('div_model_path',
                        metavar='DIV-MODEL-RESULTS-FILE',
                        type=argparse_utils.arg_is_file,
                        help=('Path to divergence model results file (i.e., '
                              '`*-div-model-results.txt`).'))
    parser.add_argument(
        '-i',
        '--taxon-indices',
        nargs='+',
        type=argparse_utils.arg_is_positive_int,
        required=True,
        help=('Two or more space-separated indices of taxa for which to '
              'calculate the probability of them co-diverging. Indices '
              'correspond to the line in the sample table of the config, '
              'starting at 1 for the first line of the table. At least '
              'two indices are required.'))
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file to be used to estimate prior '
              'probability via simulations. If provided, the '
              'posterior and prior probability and bayes factor is '
              'reported. If not provided, only the posterior '
              'probability is reported.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        # The trailing space after "estimating" matters: adjacent string
        # literals are concatenated verbatim.
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities. Only used if a config file is '
              'provided with the `-c` argument.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if a config file is provided using the `-c` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if a config file is provided using the '
              '`-c` argument.'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Imports are deferred until after argument parsing, as in the other
    # command-line entry points of this file.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    # `--debug` is applied last, so it wins over `--quiet` if both are given.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    # `nargs='+'` only guarantees one index; co-divergence needs at least two.
    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
        div_model_results_path=args.div_model_path)
    for i in args.taxon_indices:
        if ((i < 1) or (i > div_models.npairs)):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    # Convert the user's 1-based indices to 0-based before the lookup.
    args.taxon_indices = [i - 1 for i in args.taxon_indices]
    prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices)

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
            config_paths=[args.config],
            num_samples=args.num_prior_samples,
            num_processors=args.np)
        prob_estimator_team.start()
        # The prior probability is looked up by config path and by the
        # number of taxa in the requested set.
        prior_prob = prob_estimator_team.shared_div_probs[args.config][len(
            args.taxon_indices)]
        # Bayes factor = posterior odds / prior odds.
        # NOTE(review): nothing guards against a probability of exactly 0 or
        # 1 here; that would presumably surface as a ZeroDivisionError, or a
        # ValueError from math.log below -- confirm whether it can occur.
        bf = ((prob_shared_div / (1 - prob_shared_div)) / (prior_prob /
                                                           (1 - prior_prob)))

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 * math.log(bf)))
Пример #12
0
def main_cli():
    """Create summary plots from the results of a pymsbayes analysis.

    Parses the command line, loads the results described by the
    `pymsbayes-info.txt` file, and, for every combination of observed and
    prior index, writes plots into a `plots` sub-directory of the output
    directory (by default the directory holding the info file):

    * when `results.sort_index` is 0: a marginal divergence-time plot and a
      grid of the top ordered divergence models;
    * otherwise: a grid of the top unordered divergence models;
    * in both cases: the number-of-divergences plot, its Bayes-factor-only
      variant, and a tab-delimited text file of the Bayes factors.

    A plot is skipped with a warning when its input results file cannot be
    found. The results used are those for the prior-sample index given with
    `-i/--sample-index`, or for the largest available index when the option
    is omitted.

    Exits with status 1 if `matplotlib` is unavailable or if the results
    contain more than one simulation replicate.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('info_path',
            metavar='PYMSBAYES-INFO-FILE',
            type=argparse_utils.arg_is_file,
            help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 100000,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities.'))
    parser.add_argument('-i', '--sample-index',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('The prior-sample index of results to be summarized. '
                    'Output files should have a consistent schema. For '
                    'example, a results file for divergence models might look '
                    'something like '
                    '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
                    'the prior-sample index is "1000000". The default is to '
                    'use the largest prior-sample index, which is probably '
                    'what you want.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output plots will be written. '
                    'The default is to use the directory of the pymsbayes info '
                    'file.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-m', '--mu',
            action = 'store',
            type = argparse_utils.arg_is_positive_float,
            default = None,
            help = ('The mutation rate with which to scale time to units of '
                    'generations. By default, time is not scaled to '
                    'generations.'))
    parser.add_argument('--extension',
            action = 'store',
            type = str,
            default = 'pdf',
            help = ('The file format extension of the plots (e.g., "pdf", '
                    '"png"). The default is pdf.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Imports are deferred until after argument parsing, as in the other
    # command-line entry points of this file.
    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    # `--debug` is applied last, so it wins over `--quiet` if both are given.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error(
                '`matplotlib` could not be imported, so plots can not be\n'
                'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    # All plots go into a `plots` directory beneath the chosen output dir.
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from simulation-based analysis, '
                'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            result_indices = results.get_result_indices(obs_idx, prior_idx, 1)
            # Honor `-i/--sample-index` when it is given; otherwise fall back
            # to the largest available index, as the option's help promises.
            # If the requested index has no results, the path lookups below
            # find nothing and the affected plots are skipped with a warning.
            if args.sample_index is None:
                result_idx = max(result_indices)
            else:
                result_idx = args.sample_index
            result_path_prefix = '{0}{1}-'.format(
                    results.get_result_path_prefix(obs_idx, prior_idx, 1),
                    result_idx)
            out_prefix = os.path.join(args.output_dir, os.path.basename(
                    result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(result_path_prefix,
                    'posterior-summary')
            div_model_path = get_result_path(result_path_prefix,
                    'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]

            # Time is only rescaled when a mutation rate was supplied.
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    # Fall back to `d_theta` only when `theta` (or its
                    # `mean`) is missing; any other error should propagate
                    # rather than be silently swallowed.
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except AttributeError:
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

            if results.sort_index == 0:
                #plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping marginal times plot...'.format(
                                    result_path_prefix,
                                    'posterior-summary'))
                else:
                    # The label axis grows with the number of taxon pairs.
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = plotting.get_marginal_divergence_time_plot(
                            config_path = config_path,
                            posterior_summary_path = posterior_summary_path,
                            labels = None,
                            estimate = 'median',
                            interval = 'HPD_95_interval',
                            time_multiplier = time_multiplier,
                            horizontal = True,
                            label_dimension = label_dimension,
                            measure_dimension = 8.0,
                            label_size = 12.0,
                            measure_tick_label_size = 12.0,
                            measure_axis_label = 'Divergence time',
                            measure_axis_label_size = 14.0,
                            label_axis_label = 'Taxon pair',
                            label_axis_label_size = 14.0,
                            usetex = False)
                    marginal_times_path = '{0}{1}'.format(out_prefix,
                            'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                #plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping ordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    # Layout is adjusted when there are fewer than four pairs.
                    if prior_cfg.npairs < 4:
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                            div_model_results_path = div_model_path,
                            config_path = config_path,
                            num_top_models = 10,
                            time_multiplier = time_multiplier,
                            height = height,
                            width = width,
                            plot_label_schema = 'uppercase',
                            plot_label_offset = 0,
                            plot_label_size = 12.0,
                            y_title = 'Divergence time',
                            y_title_size = 14.0,
                            y_tick_label_size = 10.0,
                            right_text_size = 10.0,
                            margin_left = margin_left,
                            margin_bottom = 0.0,
                            margin_right = 1,
                            margin_top = margin_top,
                            padding_between_vertical = padding_between_vertical,
                            tab = 0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(out_prefix,
                            'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            else:
                #plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping unordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.UnorderedDivergenceModelPlotGrid(
                            div_model_results_path = div_model_path,
                            num_top_models = 10,
                            time_multiplier = time_multiplier,
                            height = 10.0,
                            width = width,
                            data_label_size = 10.0,
                            plot_label_schema = 'uppercase',
                            plot_label_offset = 0,
                            plot_label_size = 12.0,
                            y_title = 'Divergence time',
                            y_title_size = 14.0,
                            y_tick_label_size = 10.0,
                            right_text_size = 10.0,
                            margin_left = 0.03,
                            margin_bottom = 0.0,
                            margin_right = 1,
                            margin_top = 0.99,
                            padding_between_vertical = 0.8,
                            tab = 0.08)
                    plot = div_model_plot.create_grid()
                    # NOTE(review): the unordered plot is saved under the
                    # "ordered-div-models" name. This looks like a copy-paste
                    # slip, but the filename is kept because downstream users
                    # may rely on it -- confirm before renaming.
                    div_model_plot_path = '{0}{1}'.format(out_prefix,
                            'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            #plot ndiv plot
            psi_path = get_result_path(result_path_prefix,
                    'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                        'Skipping number of divergences plot...'.format(
                                result_path_prefix,
                                'psi-results'))
            else:
                # Width scales with the number of pairs, with a floor of 2.8.
                width = max((0.25 * prior_cfg.npairs) + 0.55, 2.8)
                num_div_summary = plotting.NumberOfDivergencesSummary(
                        config_path = config_path,
                        psi_results_path = psi_path,
                        posterior_summary_path = posterior_summary_path,
                        num_prior_samples = args.num_prior_samples,
                        num_processors = args.np)
                num_div_summary.create_plot(
                        plot_label_size = 10.0,
                        right_text_size = 10.0,
                        x_label_size = 10.0,
                        y_label_size = 10.0,
                        xtick_label_size = 10.0,
                        ytick_label_size = 8.0,
                        height = 6.0,
                        width = width,
                        margin_bottom = 0.0,
                        margin_left = 0.0,
                        margin_top = 0.97,
                        margin_right = 1.0,
                        padding_between_vertical = 1.0)
                num_div_plot_path = '{0}{1}'.format(out_prefix,
                        'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)

                bf_plot_path = '{0}{1}'.format(out_prefix,
                        ('number-of-divergences-bayes-factors-only.' +
                                args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)

                # Also write the Bayes factors as a two-column, tab-delimited
                # table, one row per number of divergences.
                num_div_bf_path = '{0}{1}'.format(out_prefix,
                        'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(n,
                                num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
Пример #13
0
def main_cli(argv=sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o',
        '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p',
        '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The'
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r',
        '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested. I.e., '
              'No analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        default=10000,
                        help=('The number of prior samples desired to use for '
                              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b',
        '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernal density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
    parser.add_argument(
        '-q',
        '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=argparse_utils.arg_is_nonnegative_int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress large results files.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file should be found in the directory '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from` option to restart an analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a positive '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses'))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help='Do not run analyses; only process settings')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(
            os.path.join(previous_prior_dir, '*-prior-sample.txt'))
        previous_sums = glob.glob(
            os.path.join(previous_prior_dir, '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError(
                    'prior config {0!r} is not a file'.format(path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]'.format(base_dir))
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(args.stat_prefixes,
                                                   ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(
            os.path.join(base_dir, 'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(', '.join(
        [p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, os.path.dirname(info_path))))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.observed_stats_paths[i],
                            os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.models[i],
                               os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs,
                                              config_path=cfg,
                                              output_path=observed_paths[i],
                                              schema='abctoolbox',
                                              stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(workers=obs_workers,
                                              num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(
                    args.reps, args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=observed_batch_size,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=remainder,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(workers=msbayes_workers,
                                                  num_processors=args.np)

            workers = dict(
                zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, args.observed_configs[i],
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Пример #14
0
def main_cli():
    """Simulate prior samples from an msBayes config and plot each statistic.

    Command-line entry point.  Parses the arguments, records the run settings
    in ``pymsbayes-info.txt`` in the output directory, runs ``MsBayesWorker``
    instances in parallel to draw ``--num-prior-samples`` samples from
    ``--config``, writes the merged samples to ``prior-sample.txt``
    (``.gz`` appended when ``--compress`` is given) and saves one
    ``plot-<stat>.pdf`` histogram for every simulated statistic matched by
    the requested stat prefixes.

    Temporary files are purged on every exit path unless ``--keep-temps`` is
    given.

    Raises:
        SystemExit: with status 1 when ``matplotlib`` is unavailable (the
            sample file has already been written at that point); with
            argparse's usage error when ``--np`` or ``--num-prior-samples``
            is not positive.
        Exception: when a stat pattern matches none of the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    # Both values are plain ints on the command line; zero or negative values
    # cannot be split into per-process batches below, so reject them up front
    # with a usage error instead of failing mid-run.
    if args.num_prior_samples < 1:
        parser.error('--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    # --debug wins over --quiet because it is applied last.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')

    def purge_temps():
        # Single place for temp cleanup so every exit path honours
        # --keep-temps the same way.
        if not args.keep_temps:
            log.debug('purging temps...')
            temp_fs.purge()

    # Normalise the prefixes so each one ends with exactly one '.' before it
    # is turned into a pattern.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    # Compare against None so that an explicit `--seed 0` is honoured rather
    # than silently replaced by a random seed.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to draw.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes whatever is left over from the split.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=args.config,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path,
                                  'w',
                                  compresslevel=compress_level)
    # Close the stream even when a write fails part-way through.
    try:
        for row in dict_line_iter(sample, sep='\t'):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(sample_path))
        # The samples are already saved to `sample_path`; the temp files are
        # no longer needed, so do not leave them behind on this early exit.
        purge_temps()
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                                         'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'], summary['qi_95'][0], summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                                       normed=True,
                                       bins=20,
                                       histtype='bar',
                                       align='mid',
                                       orientation='vertical',
                                       zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = list(hist.ax.get_xticks())
                xtick_labels = list(xticks)
                yticks = list(hist.ax.get_yticks())
                ytick_labels = list(yticks)
                # With eight or more ticks, blank every second label so the
                # axis stays readable.
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                                            labels=xtick_labels,
                                            horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots=[hist],
                                              num_columns=1,
                                              label_schema=None,
                                              title=stat,
                                              title_size=14.0,
                                              title_top=False,
                                              y_title='Density',
                                              y_title_position=0.001,
                                              y_title_size=14.0,
                                              height=4.0,
                                              width=6.0,
                                              auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            # Clean up before failing, as the other CLI entry points in this
            # file do before raising.
            purge_temps()
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(stat_pattern,
                                             ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    purge_temps()
Пример #15
0
def _odds_from_probability(probability):
    """Return the odds ``p / (1 - p)``; infinite when ``p`` is 1 or more."""
    p = float(probability)
    if p >= 1.0:
        return float('inf')
    return p / (1.0 - p)


def _bayes_factor(posterior_prob, prior_prob):
    """Return posterior odds divided by prior odds without raising.

    Boundary probabilities give ``inf`` or ``nan`` instead of a
    ``ZeroDivisionError``: a zero prior with a non-zero posterior yields
    ``inf``; an undetermined ratio (0/0 or inf/inf) yields ``nan``.
    """
    posterior_odds = _odds_from_probability(posterior_prob)
    prior_odds = _odds_from_probability(prior_prob)
    if prior_odds == 0:
        if posterior_odds > 0:
            return float('inf')
        return float('nan')
    # inf / inf evaluates to nan for floats, so no extra branch is needed.
    return posterior_odds / prior_odds


def _two_ln(value):
    """Return ``2 * ln(value)``, mapping a value of 0 to negative infinity."""
    if value == 0:
        return float('-inf')
    return 2 * math.log(value)


def main_cli():
    """Report the probability that a set of taxa share a divergence event.

    Command-line entry point.  Reads a divergence-model results file,
    computes the posterior probability that the taxa given with
    ``--taxon-indices`` (1-based on the command line, converted to 0-based
    here) co-diverged, and writes it to stdout.  When ``--config`` is given,
    a ``ModelProbabilityEstimatorTeam`` is run on that config to obtain the
    prior probability, and the prior probability, the Bayes factor and
    ``2ln(Bayes factor)`` are written as well.

    Probabilities of exactly 0 or 1 no longer abort the run: the Bayes factor
    is then reported as ``inf``, ``0.0`` or ``nan`` as appropriate.

    Raises:
        SystemExit: with status 1 when fewer than two taxon indices are
            given or an index is outside ``1..npairs``.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('div_model_path',
            metavar='DIV-MODEL-RESULTS-FILE',
            type=argparse_utils.arg_is_file,
            help = ('Path to divergence model results file (i.e., '
                    '`*-div-model-results.txt`).'))
    parser.add_argument('-i', '--taxon-indices',
            nargs = '+',
            type = argparse_utils.arg_is_positive_int,
            required = True,
            help = ('Two or more space-separated indices of taxa for which to '
                    'calculate the probability of them co-diverging. Indices '
                    'correspond to the line in the sample table of the config, '
                    'starting at 1 for the first line of the table. At least '
                    'two indices are required.'))
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            help = ('msBayes config file to be used to estimate prior '
                    'probability via simulations. If provided, the '
                    'posterior and prior probability and bayes factor is '
                    'reported. If not provided, only the posterior '
                    'probability is reported.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 100000,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities. Only used if a config file is '
                    'provided with the `-c` argument.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel for '
                    'prior simulations. The default is the number of CPUs '
                    'available on the machine. This option is only relevant '
                    'if a config file is provided using the `-c` argument.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('Random number seed to use for simulations. This option '
                    'is only relevant if a config file is provided using the '
                    '`-c` argument.'))
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    # --debug wins over --quiet because it is applied last.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
            div_model_results_path = args.div_model_path)
    for i in args.taxon_indices:
        if ((i < 1) or (i > div_models.npairs)):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    # Command-line indices are 1-based; shift to 0-based for the lookup.
    args.taxon_indices = [i - 1 for i in args.taxon_indices]
    prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices)

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
                config_paths = [args.config],
                num_samples = args.num_prior_samples,
                num_processors = args.np)
        prob_estimator_team.start()
        prior_prob = prob_estimator_team.shared_div_probs[args.config][
                len(args.taxon_indices)]
        # Estimated probabilities can be exactly 0 or 1; the helper keeps
        # those cases from raising before any result has been printed.
        bf = _bayes_factor(prob_shared_div, prior_prob)

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(_two_ln(bf)))