def main_cli(argv = sys.argv):
    """
    Write result summaries for a pymsbayes analysis from the command line.

    The positional argument is the path to the analysis'
    "pymsbayes-info.txt" file, from which a `DMCSimulationResults` object is
    built. If the summary file for the first observed/prior index pair
    already exists, a warning is logged and no summaries are written;
    otherwise summaries are written for every prior index. With `--plot`,
    `create_plots` is called on the same info file afterwards.

    Arguments:
        argv: command-line arguments to parse. When it compares equal to
            `sys.argv` (the default), argparse reads the process arguments
            itself; otherwise `argv` is passed to the parser as given.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        'info_path',
        metavar='PYMSBAYES-INFO-PATH',
        type=argparse_utils.arg_is_file,
        help=('Path to the "pymsbayes-info.txt" file.'))
    parser.add_argument(
        '--plot',
        action='store_true',
        help='Create plots from result summaries.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import LoggingControl

    # --debug is applied last, so it wins when both flags are given.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.utils import sumresults

    results = sumresults.DMCSimulationResults(args.info_path)

    # Materialize the key views as lists: indexing `.keys()` directly only
    # works where it returns a list (Python 2).
    prior_indices = list(results.prior_index_to_config.keys())
    observed_indices = list(results.observed_index_to_path.keys())

    # The first observed/prior pair is used as a probe for whether the
    # summaries were already written by an earlier run.
    test_path = results.get_result_summary_path(
        observed_indices[0],
        prior_indices[0])
    if os.path.exists(test_path):
        log.warning('summary files already exists; skipping summaries!')
    else:
        results.write_result_summaries(
            prior_indices=prior_indices,
            include_tau_exclusion_info=False)

    if args.plot:
        create_plots(args.info_path)
def main_cli(argv=sys.argv):
    """
    Command-line entry point that summarizes pymsbayes results.

    Reads the "pymsbayes-info.txt" file named on the command line, and
    writes result summaries for all prior indices unless the summary for the
    first observed/prior index pair is already on disk (then only a warning
    is logged). When `--plot` is given, `create_plots` is run on the info
    file after the summary step.

    Arguments:
        argv: arguments to parse. If equal to `sys.argv` (the default) the
            parser reads the process arguments on its own; any other value
            is handed to `parse_args` unchanged.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument('info_path',
                        metavar='PYMSBAYES-INFO-PATH',
                        type=argparse_utils.arg_is_file,
                        help=('Path to the "pymsbayes-info.txt" file.'))
    parser.add_argument('--plot',
                        action='store_true',
                        help='Create plots from result summaries.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import LoggingControl

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        # Set last on purpose: debugging output overrides --quiet.
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.utils import sumresults

    results = sumresults.DMCSimulationResults(args.info_path)

    # `.keys()` is not subscriptable when it returns a view (Python 3), so
    # build real lists before indexing into them.
    prior_indices = list(results.prior_index_to_config.keys())
    first_observed_index = list(results.observed_index_to_path.keys())[0]

    # Existence of this one summary file is treated as "already summarized".
    test_path = results.get_result_summary_path(first_observed_index,
                                                prior_indices[0])
    if os.path.exists(test_path):
        log.warning('summary files already exists; skipping summaries!')
    else:
        results.write_result_summaries(prior_indices=prior_indices,
                                       include_tau_exclusion_info=False)

    if args.plot:
        create_plots(args.info_path)
def main_cli():
    """
    Create plots from the results of a pymsbayes analysis.

    Parses the command line, seeds the global random number generator, and
    loads the results described by the given `pymsbayes-info.txt` file.
    Plots are written to a `plots` subdirectory of the output directory
    (the info file's directory by default), which is created if missing.

    For every observed/prior index pair the following are produced:

    - when `results.sort_index` is 0: a marginal divergence-time plot and a
      grid of the top ordered divergence models; otherwise a grid of the
      top unordered divergence models;
    - a number-of-divergences plot, a Bayes-factor-only plot, and a
      tab-delimited table of `2ln(bf)` values per number of divergences.

    A plot is skipped with a warning when `get_result_path` finds no result
    file for it. The process exits with status 1 if `matplotlib` is not
    available or if the results contain more than one simulation replicate.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'info_path',
        metavar='PYMSBAYES-INFO-FILE',
        type=argparse_utils.arg_is_file,
        help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities.'))
    parser.add_argument(
        '-i', '--sample-index',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The prior-sample index of results to be summarized. '
              'Output files should have a consistent schema. For '
              'example, a results file for divergence models might look '
              'something like '
              '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
              'the prior-sample index is "1000000". The default is to '
              'use the largest prior-sample index, which is probably '
              'what you want.'))
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output plots will be written. '
              'The default is to use the directory of the pymsbayes info '
              'file.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-m', '--mu',
        action='store',
        type=argparse_utils.arg_is_positive_float,
        default=None,
        help=('The mutation rate with which to scale time to units of '
              'generations. By default, time is not scaled to '
              'generations.'))
    parser.add_argument(
        '--extension',
        action='store',
        type=str,
        default='pdf',
        help=('The file format extension of the plots (e.g., "pdf", '
              '"png"). The default is pdf.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error('`matplotlib` could not be imported, so plots can not be\n'
                  'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from simulation-based analysis, '
                  'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            # Honour -i/--sample-index when it is given; otherwise fall back
            # to the documented default of the largest prior-sample index.
            # If no result files exist for a requested index, the individual
            # plots below are skipped with a warning.
            if args.sample_index is not None:
                result_idx = args.sample_index
            else:
                result_indices = results.get_result_indices(
                    obs_idx, prior_idx, 1)
                result_idx = max(result_indices)
            result_path_prefix = '{0}{1}-'.format(
                results.get_result_path_prefix(obs_idx, prior_idx, 1),
                result_idx)
            out_prefix = os.path.join(args.output_dir,
                                      os.path.basename(result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(result_path_prefix,
                                                     'posterior-summary')
            div_model_path = get_result_path(result_path_prefix,
                                             'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]

            # Time is only rescaled when a mutation rate was supplied.
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    # Fall back to `d_theta` when `theta.mean` cannot be
                    # read. `Exception` (rather than a bare except) keeps
                    # KeyboardInterrupt/SystemExit from being swallowed.
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except Exception:
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

            if results.sort_index == 0:
                #plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping marginal times plot...'.format(
                                    result_path_prefix,
                                    'posterior-summary'))
                else:
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = \
                        plotting.get_marginal_divergence_time_plot(
                            config_path=config_path,
                            posterior_summary_path=posterior_summary_path,
                            labels=None,
                            estimate='median',
                            interval='HPD_95_interval',
                            time_multiplier=time_multiplier,
                            horizontal=True,
                            label_dimension=label_dimension,
                            measure_dimension=8.0,
                            label_size=12.0,
                            measure_tick_label_size=12.0,
                            measure_axis_label='Divergence time',
                            measure_axis_label_size=14.0,
                            label_axis_label='Taxon pair',
                            label_axis_label_size=14.0,
                            usetex=False)
                    marginal_times_path = '{0}{1}'.format(
                        out_prefix,
                        'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                #plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping ordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    # Layout adjustments for configs with few taxon pairs.
                    if prior_cfg.npairs < 4:
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                        div_model_results_path=div_model_path,
                        config_path=config_path,
                        num_top_models=10,
                        time_multiplier=time_multiplier,
                        height=height,
                        width=width,
                        plot_label_schema='uppercase',
                        plot_label_offset=0,
                        plot_label_size=12.0,
                        y_title='Divergence time',
                        y_title_size=14.0,
                        y_tick_label_size=10.0,
                        right_text_size=10.0,
                        margin_left=margin_left,
                        margin_bottom=0.0,
                        margin_right=1,
                        margin_top=margin_top,
                        padding_between_vertical=padding_between_vertical,
                        tab=0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix,
                        'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)
            else:
                #plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping unordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = \
                        plotting.UnorderedDivergenceModelPlotGrid(
                            div_model_results_path=div_model_path,
                            num_top_models=10,
                            time_multiplier=time_multiplier,
                            height=10.0,
                            width=width,
                            data_label_size=10.0,
                            plot_label_schema='uppercase',
                            plot_label_offset=0,
                            plot_label_size=12.0,
                            y_title='Divergence time',
                            y_title_size=14.0,
                            y_tick_label_size=10.0,
                            right_text_size=10.0,
                            margin_left=0.03,
                            margin_bottom=0.0,
                            margin_right=1,
                            margin_top=0.99,
                            padding_between_vertical=0.8,
                            tab=0.08)
                    plot = div_model_plot.create_grid()
                    # NOTE(review): the unordered plot is saved under the
                    # "ordered-div-models" name; kept as-is so existing
                    # output paths do not change -- confirm this is intended.
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix,
                        'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            #plot ndiv plot
            psi_path = get_result_path(result_path_prefix, 'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping number of divergences plot...'.format(
                                result_path_prefix,
                                'psi-results'))
            else:
                # Width grows with the number of pairs, with a 2.8 minimum.
                width = (0.25 * prior_cfg.npairs) + 0.55
                if width < 2.8:
                    width = 2.8
                num_div_summary = plotting.NumberOfDivergencesSummary(
                    config_path=results.prior_index_to_config[prior_idx],
                    psi_results_path=psi_path,
                    posterior_summary_path=posterior_summary_path,
                    num_prior_samples=args.num_prior_samples,
                    num_processors=args.np)
                num_div_summary.create_plot(
                    plot_label_size=10.0,
                    right_text_size=10.0,
                    x_label_size=10.0,
                    y_label_size=10.0,
                    xtick_label_size=10.0,
                    ytick_label_size=8.0,
                    height=6.0,
                    width=width,
                    margin_bottom=0.0,
                    margin_left=0.0,
                    margin_top=0.97,
                    margin_right=1.0,
                    padding_between_vertical=1.0)
                num_div_plot_path = '{0}{1}'.format(
                    out_prefix,
                    'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)
                bf_plot_path = '{0}{1}'.format(
                    out_prefix,
                    ('number-of-divergences-bayes-factors-only.'
                     + args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)
                num_div_bf_path = '{0}{1}'.format(
                    out_prefix,
                    'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(
                            n, num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
def _get_odds(probability):
    """Return `p / (1 - p)`, or positive infinity when `p` is 1 or more."""
    if probability >= 1:
        return float('inf')
    return probability / (1 - probability)


def _get_bayes_factor(posterior_prob, prior_prob):
    """
    Return the ratio of posterior odds to prior odds.

    The result is `inf` when only the posterior odds are infinite or only
    the prior odds are zero, and `nan` when the ratio is undefined (both
    odds infinite, or both zero).
    """
    posterior_odds = _get_odds(posterior_prob)
    prior_odds = _get_odds(prior_prob)
    if math.isinf(posterior_odds) and math.isinf(prior_odds):
        return float('nan')
    if prior_odds == 0:
        if posterior_odds > 0:
            return float('inf')
        return float('nan')
    return posterior_odds / prior_odds


def _get_2ln(bayes_factor):
    """Return `2 * ln(bayes_factor)`, using negative infinity for zero."""
    if bayes_factor == 0:
        return float('-inf')
    return 2 * math.log(bayes_factor)


def main_cli():
    """
    Report posterior probabilities of divergence-time expressions.

    For each `-e/--expression` given on the command line, the frequency
    with which the expression holds in the posterior sample file is written
    to standard output. When `-n/--num-prior-samples` is given, a
    `DivModelSimulatorTeam` is run to estimate the prior probability of each
    expression, and the prior probability, Bayes factor and 2ln(Bayes
    factor) are reported as well.

    Frequencies of exactly 0 or 1 no longer abort the run: the Bayes factor
    is then reported as `inf`, `0.0` or `nan`, and its log as `inf`, `-inf`
    or `nan`.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'config',
        metavar='CONFIG-FILE',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file used to estimate the posterior '
              'sample.'))
    parser.add_argument(
        'posterior_sample_path',
        metavar='POSTERIOR-SAMPLE-FILE',
        type=argparse_utils.arg_is_file,
        help=('Path to posterior sample file (i.e., '
              '`*-posterior-sample.txt`).'))
    parser.add_argument(
        '-e', '--expression',
        dest='expressions',
        action='append',
        metavar='TAXON-INDEX-EXPRESSION',
        type=str,
        required=True,
        help=('A conditional expression of divergence times based on '
              'the taxon-pair indices for which to calculate the '
              'posterior probability of being true. Indices correspond '
              'to the order that pairs of taxa appear in the sample '
              'table of the config, starting at 0 for the first '
              'taxon-pair to appear in the table (starting from the '
              'top). E.g., `-e "0 == 3 == 4"` would request the '
              'proportion of times the 1st, 4th, and 5th taxon-pairs '
              '(in order of appearance in the sample table of the '
              'config) share the same divergence time in the '
              'posterior sample, whereas `-e "0 > 1" would request the '
              'proportion of times the the 1st taxon-pair diverged '
              'further back in time than the 2nd taxon-pair in the '
              'posterior sample.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities; prior probabilities and Bayes '
              'factors will be reported. The default is to only report '
              'posterior probabilities.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if the number of prior samples is specified using the '
              '`-n` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if the number of prior samples is '
              'specified using the `-n` argument.'))
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)

    evaluators = [
        stats.ListConditionEvaluator(exp, index_labels=cfg.taxa)
        for exp in args.expressions]

    div_models = sumresults.get_partitions_from_posterior_sample_file(
        args.posterior_sample_path)

    # Prior simulations are only run when a prior sample size was requested.
    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(
            config_paths=[args.config],
            num_samples=args.num_prior_samples,
            num_processors=args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression, e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write(
            'posterior probability = {0}\n'.format(prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                args.config].get_condition_frequency(e)
            # Sample frequencies can be exactly 0 or 1; the helpers keep
            # those cases from raising ZeroDivisionError / ValueError.
            bf = _get_bayes_factor(prob_shared_div, prior_prob)
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write(
                '2ln(Bayes factor) = {0}\n'.format(_get_2ln(bf)))
    sys.stdout.write('\n')
def main_cli():
    """
    Simulate prior samples and plot summary statistics against time.

    A copy of the given msBayes config is written to a temporary file with
    its divergence model prior set to 'constrained' and `psi` fixed to the
    number of taxon pairs. `MsBayesWorker` instances then simulate
    `--num-prior-samples` samples in total, split across at most `--np`
    processes. The resulting stats-by-time matrix is written to
    `stats-by-time.txt` (gzip-suffixed with `--compress`) in the output
    directory, and, when `matplotlib` is available, a saturation plot is
    saved next to it as `saturation-plot.pdf`. Run settings and timings are
    recorded in `pymsbayes-info.txt`. Temporary files are purged unless
    `--keep-temps` is given.

    Raises:
        Exception: if a requested stat prefix is not among the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c', '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress plot data file.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    # Both values are used as divisor/loop bounds below; reject
    # non-positive values up front instead of failing with a division by
    # zero while splitting the samples across workers.
    if args.num_prior_samples < 1:
        parser.error('the number of prior samples must be a positive integer')
    if args.np < 1:
        parser.error('the number of processes must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize prefixes to have no trailing dot, then add exactly one back
    # when building the patterns.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes],
        ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Simulations use a modified copy of the config, written to a temp file.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
        MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
        ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes the samples left over by the division.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
            temp_fs=temp_fs,
            sample_size=sample_size,
            config_path=config_path,
            report_parameters=True,
            schema=schema,
            include_header=True,
            stat_patterns=stat_patterns,
            write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
        workers=workers,
        num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list so that 'PRI.t' can be removed regardless of
    # whether `.keys()` returns a list or a view.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
                                  compresslevel=compress_level)
    # Make sure the handle is closed even if a write fails part-way.
    try:
        for row in dict_line_iter(stats_by_time, sep='\t', header=header):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                    'pi.net': r'$\pi_{net}$',
                    'wattTheta': r'$\theta_W$',
                    'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(
            stats_by_time,
            x_key='PRI.t',
            y_keys=args.stat_prefixes,
            y_labels=y_labels,
            num_columns=2,
            vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """
    Simulate a prior sample and plot a histogram for each statistic.

    `MsBayesWorker` instances simulate `--num-prior-samples` samples in
    total from the given msBayes config, split across at most `--np`
    processes. The combined sample is written to `prior-sample.txt`
    (gzip-suffixed with `--compress`) in the output directory. For every
    column of the sample whose name matches one of the requested stat
    patterns, a density histogram annotated with the mean and `qi_95`
    interval is saved as `plot-<stat>.pdf`. Run settings and timings are
    recorded in `pymsbayes-info.txt`, and temporary files are purged unless
    `--keep-temps` is given.

    The process exits with status 1 (after writing the sample) when
    `matplotlib` is not available.

    Raises:
        Exception: if a stat pattern matches none of the simulated columns.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c', '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--sort-index',
        action='store',
        type=int,
        default=0,
        choices=range(12),
        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress plot data file.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    # Both values are used as divisor/loop bounds below; reject
    # non-positive values up front instead of failing with a division by
    # zero while splitting the samples across workers.
    if args.num_prior_samples < 1:
        parser.error('the number of prior samples must be a positive integer')
    if args.np < 1:
        parser.error('the number of processes must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize prefixes to have no trailing dot, then add exactly one back
    # when building the patterns.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes],
        ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
        MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
        ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes the samples left over by the division.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
            temp_fs=temp_fs,
            sample_size=sample_size,
            config_path=args.config,
            report_parameters=True,
            schema=schema,
            include_header=True,
            stat_patterns=stat_patterns,
            write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
        workers=workers,
        num_processors=args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
                                  compresslevel=compress_level)
    # Make sure the handle is closed even if a write fails part-way.
    try:
        for row in dict_line_iter(sample, sep='\t'):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        # `.items()` (rather than `.iteritems()`) works on both Python 2
        # and Python 3 dictionaries.
        for stat, values in sample.items():
            if not stat_pattern.match(stat):
                continue
            values = [float(v) for v in values]
            found = True
            plot_path = os.path.join(args.output_dir,
                                     'plot-{0}.pdf'.format(stat))
            summary = stats.get_summary(values)
            s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                summary['mean'],
                summary['qi_95'][0],
                summary['qi_95'][1])
            hd = plotting.HistData(
                x=values,
                normed=True,
                bins=20,
                histtype='bar',
                align='mid',
                orientation='vertical',
                zorder=0)
            hist = plotting.ScatterPlot(hist_data_list=[hd],
                                        right_text=s)
            hist.left_text_size = 12.0
            hist.right_text_size = 12.0
            xticks = list(hist.ax.get_xticks())
            xtick_labels = list(xticks)
            yticks = list(hist.ax.get_yticks())
            ytick_labels = list(yticks)
            # With eight or more ticks, blank every other label so the
            # labels do not crowd each other.
            if len(xtick_labels) >= 8:
                for i in range(1, len(xtick_labels), 2):
                    xtick_labels[i] = ''
            if len(ytick_labels) >= 8:
                for i in range(1, len(ytick_labels), 2):
                    ytick_labels[i] = ''
            xticks_obj = plotting.Ticks(ticks=xticks,
                                        labels=xtick_labels,
                                        horizontalalignment='center')
            yticks_obj = plotting.Ticks(ticks=yticks,
                                        labels=ytick_labels)
            hist.xticks_obj = xticks_obj
            hist.yticks_obj = yticks_obj

            plot_grid = plotting.PlotGrid(
                subplots=[hist],
                num_columns=1,
                label_schema=None,
                title=stat,
                title_size=14.0,
                title_top=False,
                y_title='Density',
                y_title_position=0.001,
                y_title_size=14.0,
                height=4.0,
                width=6.0,
                auto_height=False)
            plot_grid.auto_adjust_margins = False
            plot_grid.margin_left = 0.04
            plot_grid.margin_bottom = 0.04
            plot_grid.margin_right = 1.0
            plot_grid.margin_top = 0.97
            plot_grid.reset_figure()
            plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(stat_pattern,
                                             ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Simulate prior samples and plot summary statistics against time.

    Command-line entry point; options are parsed from ``sys.argv``.

    The msBayes config given with ``-c`` is loaded, its ``div_model_prior``
    is set to ``'constrained'`` and its ``psi`` to a discrete uniform
    distribution whose two bounds are both the number of taxon pairs, and the
    modified config is written to a temporary file.  The requested number of
    prior samples is divided among ``--np`` ``MsBayesWorker`` instances, the
    resulting prior files are parsed with ``get_stats_by_time`` and written
    as a tab-separated ``stats-by-time.txt`` (``.gz`` when ``--compress`` is
    given) in the output directory.  When matplotlib is available,
    ``saturation-plot.pdf`` is saved next to it.  Settings and run times are
    recorded in ``pymsbayes-info.txt``.

    Raises:
        SystemExit: via ``parser.error`` when ``-n`` or ``--np`` is not a
            positive integer (and for the usual argparse failures).
        Exception: if a requested statistic prefix is missing from the
            simulated statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c', '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress plot data file.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    # The batching below divides the samples among `np` workers, so both
    # counts must be at least one.
    if args.num_prior_samples < 1:
        parser.error('-n/--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    # Project imports are deferred until after argument parsing.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')

    # Normalise the prefixes so each one carries exactly one trailing '.'
    # when it is turned into a pattern, however the user typed it.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes],
        ignore_case=True)

    # Only pick a random seed when none was supplied; `--seed 0` is a valid
    # explicit request and must not be replaced.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    compress_level = None
    if args.compress:
        compress_level = 9

    # Simulate from a modified copy of the config, leaving the user's file
    # untouched.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
        MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
        ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also takes the samples left over by the division.
        if i == (args.np - 1):
            sample_size += remainder
        workers.append(MsBayesWorker(
            temp_fs=temp_fs,
            sample_size=sample_size,
            config_path=config_path,
            report_parameters=True,
            schema=schema,
            include_header=True,
            stat_patterns=stat_patterns,
            write_stats_file=False))

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list so the time column can be removed whether
    # `keys()` returns a list or a view.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            # Clean up before bailing out, unless the user asked to keep
            # the temporary files.
            if not args.keep_temps:
                temp_fs.purge()
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
                                  compresslevel=compress_level)
    try:
        for row in dict_line_iter(stats_by_time, sep='\t', header=header):
            out.write(row)
    finally:
        # Release the handle even if writing fails part-way through.
        if close:
            out.close()

    log.info('Creating plots...')
    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$',
        }
        spg = SaturationPlotGrid(
            stats_by_time,
            x_key='PRI.t',
            y_keys=args.stat_prefixes,
            y_labels=y_labels,
            num_columns=2,
            vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Estimate and print prior probabilities for msBayes configs.

    Command-line entry point; options are parsed from ``sys.argv``.

    A ``ModelProbabilityEstimatorTeam`` is started for the given config
    paths.  For every config, the team's ``psi_probs`` entries (labelled as
    numbers of divergence events), its ``omega_probs`` value (reported
    against ``--dispersion-threshold``) and its ``cv_probs`` value (reported
    against ``--cv-threshold``) are written to standard output.  The seed
    and run times are logged.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'configs',
        metavar='CONFIG-PATH',
        type=arg_is_config,
        nargs='+',
        help=('msBayes config file paths for which to estimate prior '
              'probabilities.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for '
              'probability estimates.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-d', '--dispersion-threshold',
        action='store',
        type=float,
        default=0.01,
        help=('The threshold for the dispersion index of divergence '
              'times. The estimated prior probability that the '
              'dispersion index is less than this threshold will '
              'be reported for each config.'))
    parser.add_argument(
        '-c', '--cv-threshold',
        action='store',
        type=float,
        default=0.01,
        help=('The threshold for the coefficient of variation (CV) of '
              'divergence times. The estimated prior probability that the '
              'CV is less than this threshold will '
              'be reported for each config.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Project imports are deferred until after argument parsing.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import GLOBAL_RNG

    # Only pick a random seed when none was supplied; `--seed 0` is a valid
    # explicit request and must not be replaced.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    log.info('Using seed {0}'.format(args.seed))
    GLOBAL_RNG.seed(args.seed)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    prob_estimator_team = ModelProbabilityEstimatorTeam(
        config_paths=args.configs,
        num_samples=args.num_prior_samples,
        omega_threshold=args.dispersion_threshold,
        cv_threshold=args.cv_threshold,
        num_processors=args.np)
    prob_estimator_team.start()

    write = sys.stdout.write
    for path in args.configs:
        write('Prior probabilities for model {0}:\n'.format(path))
        # Sort so the rows come out in a stable order rather than in
        # whatever order the mapping happens to iterate.
        for k, p in sorted(prob_estimator_team.psi_probs[path].items()):
            write('\tnum of divergence events = {0}: {1}\n'.format(k, p))
        write('\tdispersion of div times < {0}: {1}\n'.format(
            args.dispersion_threshold,
            prob_estimator_team.omega_probs[path]))
        write('\tCV of div times < {0}: {1}\n'.format(
            args.cv_threshold,
            prob_estimator_team.cv_probs[path]))

    stop_time = datetime.datetime.now()
    log.info('[run_stats]')
    log.info('\tstart_time = {0}'.format(str(start_time)))
    log.info('\tstop_time = {0}'.format(str(stop_time)))
    log.info('\ttotal_duration = {0}'.format(str(stop_time - start_time)))
def main_cli(argv=sys.argv):
    """Run an ABC analysis driven by msBayes observed and prior configs.

    Command-line entry point.  The function

    1. parses the options and records the settings in
       ``<output-dir>/pymsbayes-results/<prefix>pymsbayes-info.txt``;
    2. checks that every observed and prior config has the same sample
       table as the first observed config;
    3. obtains observed summary statistics: read from ``--data-key-path``,
       calculated with ``ObsSumStatsWorker`` when ``--reps`` is 0, or
       simulated with ``MsBayesWorker`` when ``--reps`` is positive;
    4. runs the ``ABCTeam`` unless ``--dry-run`` was given.

    Temporary files are purged on the handled error paths and at the end
    unless ``--keep-temps`` is given.

    Args:
        argv: Argument list.  When it compares equal to ``sys.argv`` (the
            default) argparse reads the process arguments itself; any other
            list is handed to ``parse_args`` as given.

    Raises:
        ValueError: observed or prior config paths are not unique, a prior
            config is not a file, or a directory given to ``-p`` lacks the
            prior-sample / means-and-std-devs files.
        errors.SampleTableError: sample tables differ between configs.
        Exception: the number of simulated observed data sets written for a
            config does not match ``--reps``.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o', '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p', '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The '
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r', '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested. I.e., '
              'No analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument(
        '--num-standardizing-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples desired to use for '
              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b', '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernel density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
    parser.add_argument(
        '-q', '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument(
        '--sort-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        choices=range(12),
        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress large results files.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file should be found in the directory '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from` option to restart an analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a positive '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses'))
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Do not run analyses; only process settings')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    # With the default argv let argparse read the process arguments itself;
    # an explicit list is parsed exactly as given.
    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    # Project imports are deferred until after argument parsing.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    # Cannot standardize with more samples than will be simulated.
    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        # A single directory means "reuse the priors of a previous run";
        # it is removed from the list of configs to simulate from.
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                                                 '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                                               '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            # Report the directory itself: `args.prior_configs` is empty
            # after the pop above, so indexing it here would raise
            # IndexError and mask this error.
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(previous_prior_dir))
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                    path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir

    info_path = os.path.join(base_dir,
                             args.output_prefix + 'pymsbayes-info.txt')
    info_dir = os.path.dirname(info_path)
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))

    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
        MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))

    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        # Make sure every prefix ends with '.' before building patterns.
        args.stat_prefixes = [
            prefix if prefix.endswith('.') else prefix + '.'
            for prefix in args.stat_prefixes]
        stat_patterns = get_patterns_from_prefixes(
            args.stat_prefixes,
            ignore_case=True)

    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)

    # Only pick a random seed when none was supplied; `--seed 0` is a valid
    # explicit request and must not be replaced.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                                               'observed-summary-stats'))
        observed_paths = [
            os.path.join(
                observed_dir,
                args.output_prefix + 'observed-{0}.txt'.format(i + 1))
            for i in range(len(args.observed_configs))]

    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
        args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
        ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, info_dir)))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    # Map each prior config path to its model key so simulated observed
    # data can be tagged with the model that generated it.
    configs_to_models = {}
    for k, v in abc_team.models.items():
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.keys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.observed_stats_paths[i], info_dir)))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.keys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.models[i], info_dir)))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statistics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                obs_workers.append(ObsSumStatsWorker(
                    temp_fs=observed_temp_fs,
                    config_path=cfg,
                    output_path=observed_paths[i],
                    schema='abctoolbox',
                    stat_patterns=stat_patterns))
            obs_workers = Manager.run_workers(
                workers=obs_workers,
                num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning(
                    ' The alignment lengths in config {0!r} have been '
                    'corrected for sites with *any* ambiguous bases and/or '
                    'gaps by obsSumStats.pl. '.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError(
                        ' The sample tables in configs {0!r} and {1!r} '
                        'differ because obsSumStats.pl modified alignment '
                        'lengths in config {2!r} to correct for sites in '
                        'the alignments with *any* ambiguous bases and/or '
                        'gaps. Please make sure the sample tables in all '
                        'configs will be the same after correcting '
                        'alignment lengths for sites that contain *any* '
                        'ambiguous bases and/or gaps. You can do this by '
                        'copying and pasting the sample table in {2!r} '
                        'that has been corrected by obsSumStats.pl into '
                        'the other configs that were not corrected. '.format(
                            ref_config_path, config, corrected_config))
        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                                                               args.np)
            # One full-size batch per worker, plus one extra worker for
            # whatever the division left over.
            sample_sizes = [observed_batch_size] * num_observed_workers
            if remainder > 0:
                sample_sizes.append(remainder)

            schema = 'abctoolbox'
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                for sample_size in sample_sizes:
                    msbayes_workers.append(MsBayesWorker(
                        temp_fs=observed_temp_fs,
                        sample_size=sample_size,
                        config_path=cfg,
                        model_index=observed_model_idx,
                        report_parameters=True,
                        schema=schema,
                        include_header=True,
                        stat_patterns=stat_patterns,
                        write_stats_file=False,
                        staging_dir=None,
                        tag=idx))

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                workers=msbayes_workers,
                num_processors=args.np)

            # Group the finished workers by the index of the observed
            # config they simulated from (stored in `tag`).
            workers = {i: [] for i in range(len(args.observed_configs))}
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i, observed_config in enumerate(args.observed_configs):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        # Purge the observed temps too; they live outside
                        # `temp_fs` when a staging directory is used.
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, observed_config,
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Report the probability of divergence-time expressions.

    Parses the command line, evaluates every `-e` expression against the
    posterior sample file, and writes the posterior probability of each
    expression to standard output. When `-n` is given, prior samples are
    simulated from the config and the prior probability, the Bayes factor,
    and 2ln(Bayes factor) are written as well.

    Probabilities estimated from a finite sample can be exactly 0 or 1; in
    those cases the Bayes factor is reported as `inf`, `0.0`, or `nan`
    (undefined) rather than aborting with a ZeroDivisionError/ValueError.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'config',
        metavar='CONFIG-FILE',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file used to estimate the posterior '
              'sample.'))
    parser.add_argument(
        'posterior_sample_path',
        metavar='POSTERIOR-SAMPLE-FILE',
        type=argparse_utils.arg_is_file,
        help=('Path to posterior sample file (i.e., '
              '`*-posterior-sample.txt`).'))
    parser.add_argument(
        '-e', '--expression',
        dest='expressions',
        action='append',
        metavar='TAXON-INDEX-EXPRESSION',
        type=str,
        required=True,
        help=('A conditional expression of divergence times based on '
              'the taxon-pair indices for which to calculate the '
              'posterior probability of being true. Indices correspond '
              'to the order that pairs of taxa appear in the sample '
              'table of the config, starting at 0 for the first '
              'taxon-pair to appear in the table (starting from the '
              'top). E.g., `-e "0 == 3 == 4"` would request the '
              'proportion of times the 1st, 4th, and 5th taxon-pairs '
              '(in order of appearance in the sample table of the '
              'config) share the same divergence time in the '
              'posterior sample, whereas `-e "0 > 1"` would request the '
              'proportion of times the 1st taxon-pair diverged '
              'further back in time than the 2nd taxon-pair in the '
              'posterior sample.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities; prior probabilities and Bayes '
              'factors will be reported. The default is to only report '
              'posterior probabilities.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if the number of prior samples is specified using the '
              '`-n` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if the number of prior samples is '
              'specified using the `-n` argument.'))
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # The logging level must be set before the other pymsbayes modules are
    # imported, which is why these imports live inside the function.
    from pymsbayes.utils.messaging import LoggingControl

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    def _odds(probability):
        """Return p / (1 - p), or infinity when p is 1."""
        if probability >= 1:
            return float('inf')
        return probability / (1.0 - probability)

    def _bayes_factor(posterior_probability, prior_probability):
        """Return the ratio of posterior odds to prior odds.

        Returns `inf` when the prior odds are zero but the posterior odds
        are not, and `nan` when the ratio is undefined (0/0 or inf/inf).
        """
        posterior_odds = _odds(posterior_probability)
        prior_odds = _odds(prior_probability)
        if prior_odds == 0:
            if posterior_odds == 0:
                return float('nan')
            return float('inf')
        # inf / inf evaluates to nan without raising
        return posterior_odds / prior_odds

    def _two_ln(value):
        """Return 2 * ln(value), mapping zero to negative infinity."""
        if value == 0:
            return float('-inf')
        return 2 * math.log(value)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)
    evaluators = [
        stats.ListConditionEvaluator(exp, index_labels=cfg.taxa)
        for exp in args.expressions]

    div_models = sumresults.get_partitions_from_posterior_sample_file(
        args.posterior_sample_path)

    # Prior simulations are only run when a prior sample size was requested.
    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(
            config_paths=[args.config],
            num_samples=args.num_prior_samples,
            num_processors=args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression, e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write('posterior probability = {0}\n'.format(
            prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                args.config].get_condition_frequency(e)
            bf = _bayes_factor(prob_shared_div, prior_prob)
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(
                _two_ln(bf)))
        sys.stdout.write('\n')
def main_cli():
    """Report the probability that a set of taxon pairs co-diverged.

    Parses the command line, reads the divergence-model results file, and
    writes the posterior probability that the taxa given with `-i` share a
    divergence to standard output. When a config is supplied with `-c`,
    prior samples are simulated and the prior probability, the Bayes
    factor, and 2ln(Bayes factor) are written as well.

    Exits with status 1 when fewer than two taxon indices are given or an
    index is outside 1..npairs. Probabilities of exactly 0 or 1 yield a
    Bayes factor of `inf`, `0.0`, or `nan` (undefined) rather than a
    ZeroDivisionError/ValueError.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'div_model_path',
        metavar='DIV-MODEL-RESULTS-FILE',
        type=argparse_utils.arg_is_file,
        help=('Path to divergence model results file (i.e., '
              '`*-div-model-results.txt`).'))
    parser.add_argument(
        '-i', '--taxon-indices',
        nargs='+',
        type=argparse_utils.arg_is_positive_int,
        required=True,
        help=('Two or more space-separated indices of taxa for which to '
              'calculate the probability of them co-diverging. Indices '
              'correspond to the line in the sample table of the config, '
              'starting at 1 for the first line of the table. At least '
              'two indices are required.'))
    parser.add_argument(
        '-c', '--config',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file to be used to estimate prior '
              'probability via simulations. If provided, the '
              'posterior and prior probability and bayes factor is '
              'reported. If not provided, only the posterior '
              'probability is reported.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        # a space was missing between "estimating" and "prior"
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities. Only used if a config file is '
              'provided with the `-c` argument.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if a config file is provided using the `-c` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if a config file is provided using the '
              '`-c` argument.'))
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # The logging level must be set before the other pymsbayes modules are
    # imported, which is why these imports live inside the function.
    from pymsbayes.utils.messaging import LoggingControl

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    def _odds(probability):
        """Return p / (1 - p), or infinity when p is 1."""
        if probability >= 1:
            return float('inf')
        return probability / (1.0 - probability)

    def _bayes_factor(posterior_probability, prior_probability):
        """Return the ratio of posterior odds to prior odds.

        Returns `inf` when the prior odds are zero but the posterior odds
        are not, and `nan` when the ratio is undefined (0/0 or inf/inf).
        """
        posterior_odds = _odds(posterior_probability)
        prior_odds = _odds(prior_probability)
        if prior_odds == 0:
            if posterior_odds == 0:
                return float('nan')
            return float('inf')
        # inf / inf evaluates to nan without raising
        return posterior_odds / prior_odds

    def _two_ln(value):
        """Return 2 * ln(value), mapping zero to negative infinity."""
        if value == 0:
            return float('-inf')
        return 2 * math.log(value)

    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
        div_model_results_path=args.div_model_path)

    # Indices are 1-based on the command line; validate before shifting.
    for i in args.taxon_indices:
        if (i < 1) or (i > div_models.npairs):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    args.taxon_indices = [i - 1 for i in args.taxon_indices]

    prob_shared_div = div_models.prob_of_shared_divergence(
        args.taxon_indices)

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
            config_paths=[args.config],
            num_samples=args.num_prior_samples,
            num_processors=args.np)
        prob_estimator_team.start()
        # prior probabilities are keyed by the number of taxa sharing the
        # divergence
        prior_prob = prob_estimator_team.shared_div_probs[args.config][
            len(args.taxon_indices)]
        bf = _bayes_factor(prob_shared_div, prior_prob)

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(_two_ln(bf)))
def main_cli():
    """Create summary plots from the results of a pymsbayes analysis.

    Parses the command line, loads the results described by the
    `pymsbayes-info.txt` file, and, for every combination of observed and
    prior index, writes the available plots (marginal divergence times,
    top divergence models, number of divergence events) plus a table of
    Bayes factors for the number of divergences into a `plots` directory
    under the output directory.

    The results for the prior-sample index given with `-i` are plotted;
    when `-i` is omitted the largest available index is used. Result files
    that cannot be found are skipped with a warning.

    Exits with status 1 when matplotlib is unavailable or when the results
    come from a simulation-based analysis (more than one replicate).
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'info_path',
        metavar='PYMSBAYES-INFO-FILE',
        type=argparse_utils.arg_is_file,
        help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities.'))
    parser.add_argument(
        '-i', '--sample-index',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('The prior-sample index of results to be summarized. '
              'Output files should have a consistent schema. For '
              'example, a results file for divergence models might look '
              'something like '
              '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
              'the prior-sample index is "1000000". The default is to '
              'use the largest prior-sample index, which is probably '
              'what you want.'))
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output plots will be written. '
              'The default is to use the directory of the pymsbayes info '
              'file.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-m', '--mu',
        action='store',
        type=argparse_utils.arg_is_positive_float,
        default=None,
        help=('The mutation rate with which to scale time to units of '
              'generations. By default, time is not scaled to '
              'generations.'))
    parser.add_argument(
        '--extension',
        action='store',
        type=str,
        default='pdf',
        help=('The file format extension of the plots (e.g., "pdf", '
              '"png"). The default is pdf.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # The logging level must be set before the other pymsbayes modules are
    # imported, which is why these imports live inside the function.
    from pymsbayes.utils.messaging import LoggingControl

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error(
            '`matplotlib` could not be imported, so plots can not be\n'
            'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from simulation-based analysis, '
                  'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            if args.sample_index is not None:
                # Honor the requested prior-sample index; if no result
                # files exist for it, each plot below is skipped with a
                # warning.
                result_idx = args.sample_index
            else:
                result_indices = results.get_result_indices(
                    obs_idx, prior_idx, 1)
                if not result_indices:
                    # max() of an empty sequence would raise ValueError
                    log.warning(
                        'Could not find any results for observed index '
                        '{0} and prior index {1}; Skipping...'.format(
                            obs_idx, prior_idx))
                    continue
                result_idx = max(result_indices)
            result_path_prefix = '{0}{1}-'.format(
                results.get_result_path_prefix(obs_idx, prior_idx, 1),
                result_idx)
            out_prefix = os.path.join(
                args.output_dir,
                os.path.basename(result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(
                result_path_prefix, 'posterior-summary')
            div_model_path = get_result_path(
                result_path_prefix, 'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]

            # Scale time to generations only when a mutation rate is given.
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    # Fall back to `d_theta` when the config has no usable
                    # `theta` prior; only the attribute lookup failure is
                    # caught so interrupts and real errors still propagate.
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except AttributeError:
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

            if results.sort_index == 0:
                # plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping marginal times plot...'.format(
                                    result_path_prefix,
                                    'posterior-summary'))
                else:
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = \
                        plotting.get_marginal_divergence_time_plot(
                            config_path=config_path,
                            posterior_summary_path=posterior_summary_path,
                            labels=None,
                            estimate='median',
                            interval='HPD_95_interval',
                            time_multiplier=time_multiplier,
                            horizontal=True,
                            label_dimension=label_dimension,
                            measure_dimension=8.0,
                            label_size=12.0,
                            measure_tick_label_size=12.0,
                            measure_axis_label='Divergence time',
                            measure_axis_label_size=14.0,
                            label_axis_label='Taxon pair',
                            label_axis_label_size=14.0,
                            usetex=False)
                    marginal_times_path = '{0}{1}'.format(
                        out_prefix,
                        'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                # plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping ordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    if prior_cfg.npairs < 4:
                        # layout is adjusted when there are few taxon pairs
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                        div_model_results_path=div_model_path,
                        config_path=config_path,
                        num_top_models=10,
                        time_multiplier=time_multiplier,
                        height=height,
                        width=width,
                        plot_label_schema='uppercase',
                        plot_label_offset=0,
                        plot_label_size=12.0,
                        y_title='Divergence time',
                        y_title_size=14.0,
                        y_tick_label_size=10.0,
                        right_text_size=10.0,
                        margin_left=margin_left,
                        margin_bottom=0.0,
                        margin_right=1,
                        margin_top=margin_top,
                        padding_between_vertical=padding_between_vertical,
                        tab=0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix,
                        'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)
            else:
                # plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                                'Skipping unordered div model plot...'.format(
                                    result_path_prefix,
                                    'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = \
                        plotting.UnorderedDivergenceModelPlotGrid(
                            div_model_results_path=div_model_path,
                            num_top_models=10,
                            time_multiplier=time_multiplier,
                            height=10.0,
                            width=width,
                            data_label_size=10.0,
                            plot_label_schema='uppercase',
                            plot_label_offset=0,
                            plot_label_size=12.0,
                            y_title='Divergence time',
                            y_title_size=14.0,
                            y_tick_label_size=10.0,
                            right_text_size=10.0,
                            margin_left=0.03,
                            margin_bottom=0.0,
                            margin_right=1,
                            margin_top=0.99,
                            padding_between_vertical=0.8,
                            tab=0.08)
                    plot = div_model_plot.create_grid()
                    # NOTE(review): the unordered plot is written under the
                    # "ordered-div-models" file name; this looks like a
                    # copy-paste slip but is kept so existing output names
                    # do not change -- confirm before renaming.
                    div_model_plot_path = '{0}{1}'.format(
                        out_prefix,
                        'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            # plot ndiv plot
            psi_path = get_result_path(result_path_prefix, 'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping number of divergences plot...'.format(
                                result_path_prefix,
                                'psi-results'))
            else:
                # enforce a minimum plot width of 2.8
                width = (0.25 * prior_cfg.npairs) + 0.55
                if width < 2.8:
                    width = 2.8
                num_div_summary = plotting.NumberOfDivergencesSummary(
                    config_path=results.prior_index_to_config[prior_idx],
                    psi_results_path=psi_path,
                    posterior_summary_path=posterior_summary_path,
                    num_prior_samples=args.num_prior_samples,
                    num_processors=args.np)
                num_div_summary.create_plot(
                    plot_label_size=10.0,
                    right_text_size=10.0,
                    x_label_size=10.0,
                    y_label_size=10.0,
                    xtick_label_size=10.0,
                    ytick_label_size=8.0,
                    height=6.0,
                    width=width,
                    margin_bottom=0.0,
                    margin_left=0.0,
                    margin_top=0.97,
                    margin_right=1.0,
                    padding_between_vertical=1.0)
                num_div_plot_path = '{0}{1}'.format(
                    out_prefix,
                    'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)
                bf_plot_path = '{0}{1}'.format(
                    out_prefix,
                    ('number-of-divergences-bayes-factors-only.' +
                     args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)
                # tab-delimited table of the Bayes factors, one row per
                # number of divergence events
                num_div_bf_path = '{0}{1}'.format(
                    out_prefix,
                    'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(
                            n, num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
def main_cli(argv=sys.argv): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser( description=description, formatter_class=argparse_utils.SmartHelpFormatter) parser.add_argument( '-o', '--observed-configs', nargs='+', type=argparse_utils.arg_is_config, required=True, help=('One or more msBayes config files to be used to either ' 'calculate or simulate observed summary statistics. If ' 'used in combination with `-r` each config will be used to ' 'simulate pseudo-observed data. If analyzing real data, do ' 'not use the `-r` option, and the fasta files specified ' 'within the config must exist and contain the sequence ' 'data.')) parser.add_argument( '-p', '--prior-configs', nargs='+', type=argparse_utils.arg_is_path, required=True, help=('One or more config files to be used to generate prior ' 'samples. If more than one config is specified, they ' 'should be separated by spaces. ' 'This option can also be used to specify the path to a ' 'directory containing the prior samples and summary ' 'statistic means and standard deviations generated by a ' 'previous run using the `generate-samples-only` option. ' 'These files should be found in the directory ' '`pymsbayes-output/prior-stats-summaries`. The' '`pymsbayes-output/model-key.txt` also needs to be present.' ' If specifying this directory, it should be the only ' 'argument (i.e., no other directories or config files can ' 'be provided).')) parser.add_argument( '-r', '--reps', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('This option has two effects. First, it signifies that ' 'the analysis will be simulation based (i.e., no real ' 'data will be used). 
Second, it specifies how many ' 'simulation replicates to perform (i.e., how many data ' 'sets to simulate and analyze).')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=1000000, help=('The number of prior samples to simulate for each prior ' 'config specified with `-p`.')) parser.add_argument( '--prior-batch-size', action='store', type=argparse_utils.arg_is_positive_int, default=10000, help=('The number of prior samples to simulate for each batch.')) parser.add_argument( '--generate-samples-only', action='store_true', help=('Only generate samples from models as requested. I.e., ' 'No analyses are performed to approximate posteriors. ' 'This option can be useful if you want the prior samples ' 'for other purposes.')) parser.add_argument( '--num-posterior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=1000, help=('The number of posterior samples desired for each ' 'analysis. Default: 1000.')) parser.add_argument('--num-standardizing-samples', action='store', type=argparse_utils.arg_is_positive_int, default=10000, help=('The number of prior samples desired to use for ' 'standardizing statistics. Default: 10000.')) parser.add_argument( '--np', action='store', type=argparse_utils.arg_is_positive_int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument( '--output-dir', action='store', type=argparse_utils.arg_is_dir, help=('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument( '--temp-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage files. 
The default is to ' 'use the output directory.')) parser.add_argument( '--staging-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage prior files. This option ' 'can be useful on clusters to speed up I/O while ' 'generating prior samples. You can designate a local temp ' 'directory on a compute node to avoid constant writing to ' 'a shared drive. The default is to use the `temp-dir`.')) parser.add_argument( '-s', '--stat-prefixes', nargs='*', type=str, help=('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi wattTheta pi.net tajD.denom`.')) parser.add_argument( '-b', '--bandwidth', action='store', type=float, help=('Smoothing parameter for the posterior kernal density ' 'estimation. This option is used for the `glm` ' 'regression method. The default is 2 / ' '`num-posterior-samples`.')) parser.add_argument( '-q', '--num-posterior-quantiles', action='store', type=argparse_utils.arg_is_positive_int, default=1000, help=('The number of equally spaced quantiles at which to ' 'evaluate the GLM-estimated posterior density. ' 'Default: 1000.')) parser.add_argument( '--reporting-frequency', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('Suggested frequency (in number of prior samples) for ' 'running regression and reporting current results. ' 'Default: 0 (only report final results). ' 'If a value is given, it may be adjusted so that the ' 'reporting frequency is a multiple of the multi-processed ' 'batch size.')) parser.add_argument('--sort-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, choices=range(12), help=argparse_utils.get_sort_index_help_message()) parser.add_argument( '--no-global-estimate', action='store_true', help=('If multiple prior models are specified, by default a ' 'global estimate is performed averaging over all models. 
' 'This option prevents the global estimation (i.e., only ' 'inferences for each model are made).')) parser.add_argument('--compress', action='store_true', help='Compress large results files.') parser.add_argument('--keep-temps', action='store_true', help='Keep all temporary files.') parser.add_argument('--seed', action='store', type=int, help='Random number seed to use for the analysis.') parser.add_argument( '--output-prefix', action='store', type=str, default='', help=('Prefix to use at beginning of output files. The default ' 'is no prefix.')) parser.add_argument( '--data-key-path', action='store', type=argparse_utils.arg_is_file, help=('The path to a `data-key.txt` file generated by a previous ' 'run. This file should be found in the directory ' '`pymsbayes-output/data-key.txt`. This option ' 'will override the `-o`/`--observed-configs` option, and ' 'is intended to be used in combination with the ' '`--start-from` option to restart an analysis.')) parser.add_argument( '--start-from-simulation-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('The simulation index at which to begin analyses. Must be ' 'used in combination with either the number of simulation ' 'replicates (`-r`/`--reps`) or the `--data-key-path` ' 'option, and must be a positive ' 'integer that is less than the number of simulation ' 'replicates. This option can be useful if an analysis ' 'needs to be restarted.')) parser.add_argument( '--start-from-observed-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('The observed config index at which to begin analyses. 
' 'Can be used in combination with the `--data-key-path` ' 'option to restart long-running, multi-observed-config ' 'analyses')) parser.add_argument('--dry-run', action='store_true', help='Do not run analyses; only process settings') parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') if argv == sys.argv: args = parser.parse_args() else: args = parser.parse_args(argv) ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import (MsBayesWorker, merge_prior_files, ObsSumStatsWorker) from pymsbayes.teams import ABCTeam from pymsbayes.utils.functions import (is_file, is_dir, long_division, mk_new_dir) from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS, PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS, line_count) from pymsbayes.utils import sumresults, errors from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.config import MsBayesConfig from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace, MSBAYES_SORT_INDEX, ToolPathManager) MSBAYES_SORT_INDEX.set_index(args.sort_index) if len(args.observed_configs) != len(set(args.observed_configs)): raise ValueError('All paths to observed config files must be unique') if args.num_standardizing_samples > args.num_prior_samples: args.num_standardizing_samples = args.num_prior_samples # get full paths to tools msbayes_path = 
ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') eureject_path = ToolPathManager.get_tool_full_path('eureject') abctb_path = ToolPathManager.get_tool_full_path('ABCestimator') # vet prior-configs option using_previous_priors = False previous_prior_dir = None if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])): previous_prior_dir = args.prior_configs.pop(0) previous_priors = glob.glob( os.path.join(previous_prior_dir, '*-prior-sample.txt')) previous_sums = glob.glob( os.path.join(previous_prior_dir, '*-means-and-std-devs.txt')) if (not previous_priors) or (not previous_sums): raise ValueError( 'directory {0!r} specified with `prior-configs` ' 'option does not contain necessary prior and summary ' 'files'.format(args.prior_configs[0])) using_previous_priors = True else: for path in args.prior_configs: if not is_file(path): raise ValueError( 'prior config {0!r} is not a file'.format(path)) if len(args.prior_configs) != len(set(args.prior_configs)): raise ValueError('All paths to prior config files must be unique') if not args.output_dir: args.output_dir = os.path.dirname(args.observed_configs[0]) base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results')) if not args.temp_dir: args.temp_dir = base_dir info_path = os.path.join(base_dir, args.output_prefix + \ 'pymsbayes-info.txt') info = InfoLogger(info_path) info.write('[pymsbayes]'.format(base_dir)) info.write('\tversion = {version}'.format(**_program_info)) info.write('\toutput_directory = {0}'.format(base_dir)) temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') base_temp_dir = temp_fs.base_dir info.write('\ttemp_directory = {0}'.format(base_temp_dir)) info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value())) info.write('\tsimulation_reps = {0}'.format(args.reps)) stat_patterns = DEFAULT_STAT_PATTERNS if args.stat_prefixes: for i in range(len(args.stat_prefixes)): if not 
args.stat_prefixes[i].endswith('.'): args.stat_prefixes[i] += '.' stat_patterns = get_patterns_from_prefixes(args.stat_prefixes, ignore_case=True) if not args.bandwidth: args.bandwidth = 2 / float(args.num_posterior_samples) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if args.data_key_path: observed_map = sumresults.parse_data_key_file(args.data_key_path) observed_paths = [observed_map[k] for k in sorted(observed_map.keys())] else: observed_dir = mk_new_dir( os.path.join(base_dir, 'observed-summary-stats')) observed_paths = [os.path.join(observed_dir, args.output_prefix + \ 'observed-{0}.txt'.format(i+1)) for i in range(len( args.observed_configs))] info.write('\tseed = {0}'.format(args.seed)) info.write('\tnum_processors = {0}'.format(args.np)) info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples)) info.write('\tnum_standardizing_samples = {0}'.format( args.num_standardizing_samples)) info.write('\tbandwidth = {0}'.format(args.bandwidth)) info.write('\tposterior_quantiles = {0}'.format( args.num_posterior_quantiles)) info.write('\tposterior_sample_size = {0}'.format( args.num_posterior_samples)) info.write('\tstat_patterns = {0}'.format(', '.join( [p.pattern for p in stat_patterns]))) # vet observed configs ref_config_path = args.observed_configs[0] ref_config = MsBayesConfig(ref_config_path) all_config_paths = [] num_taxon_pairs = ref_config.npairs assert num_taxon_pairs > 0 for config in args.observed_configs: all_config_paths.append(config) if not ref_config.equal_sample_table(config): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, config)) info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs)) info.write('\tdry_run = {0}'.format(args.dry_run)) info.write('\t[[tool_paths]]') info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path)) info.write('\t\tmsbayes = 
{0}'.format(msbayes_path)) info.write('\t\teureject = {0}'.format(eureject_path)) info.write('\t\tabcestimator = {0}'.format(abctb_path)) info.write('\t[[observed_configs]]') for i, cfg in enumerate(args.observed_configs): info.write('\t\t{0} = {1}'.format( i + 1, os.path.relpath(cfg, os.path.dirname(info_path)))) abc_team = ABCTeam( temp_fs=temp_fs, observed_stats_files=observed_paths, num_taxon_pairs=num_taxon_pairs, config_paths=args.prior_configs, previous_prior_dir=previous_prior_dir, num_prior_samples=args.num_prior_samples, num_processors=args.np, num_standardizing_samples=args.num_standardizing_samples, num_posterior_samples=args.num_posterior_samples, num_posterior_density_quantiles=args.num_posterior_quantiles, batch_size=args.prior_batch_size, output_dir=base_dir, output_prefix=args.output_prefix, prior_temp_dir=args.staging_dir, rng=GLOBAL_RNG, report_parameters=True, stat_patterns=stat_patterns, eureject_exe_path=eureject_path, abctoolbox_exe_path=abctb_path, msbayes_exe_path=None, abctoolbox_bandwidth=args.bandwidth, omega_threshold=0.01, cv_threshold=0.01, compress=args.compress, reporting_frequency=args.reporting_frequency, keep_temps=args.keep_temps, global_estimate_only=False, global_estimate=not args.no_global_estimate, generate_prior_samples_only=args.generate_samples_only, start_from_simulation_index=args.start_from_simulation_index, start_from_observed_index=args.start_from_observed_index) models_to_configs = {} configs_to_models = {} for k, v in abc_team.models.iteritems(): models_to_configs[k] = v configs_to_models[v] = k cfg = MsBayesConfig(v) all_config_paths.append(v) # vet prior configs if not ref_config.equal_sample_table(cfg): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, v)) info.write('\t[[observed_paths]]') for i in sorted(abc_team.observed_stats_paths.iterkeys()): info.write('\t\t{0} = 
{1}'.format( i, os.path.relpath(abc_team.observed_stats_paths[i], os.path.dirname(info_path)))) info.write('\t[[prior_configs]]') for i in sorted(abc_team.models.iterkeys()): info.write('\t\t{0} = {1}'.format( i, os.path.relpath(abc_team.models[i], os.path.dirname(info_path)))) ########################################################################## ## begin analysis --- get observed summary stats set_memory_trace() # start logging memory profile start_time = datetime.datetime.now() if args.data_key_path: log.info('Using provided summary statitics...') elif not args.dry_run: obs_temp_dir = base_temp_dir if args.staging_dir: obs_temp_dir = args.staging_dir observed_temp_fs = TempFileSystem(parent=obs_temp_dir, prefix='observed-temps-') if args.reps < 1: log.info('Calculating summary statistics from sequence data...') obs_workers = [] for i, cfg in enumerate(args.observed_configs): ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs, config_path=cfg, output_path=observed_paths[i], schema='abctoolbox', stat_patterns=stat_patterns) obs_workers.append(ss_worker) obs_workers = Manager.run_workers(workers=obs_workers, num_processors=args.np) # re-vet all configs to see if some were changed by obsSumStats.pl new_ref_config = ref_config ref_modified = False # new ref because if all configs were updated all is good if not ref_config.equal_sample_table(ref_config_path): ref_modified = True new_ref_config = MsBayesConfig(ref_config_path) log.warning(""" The alignment lengths in config {0!r} have been corrected for sites with *any* ambiguous bases and/or gaps by obsSumStats.pl. 
""".format(ref_config_path)) for config in all_config_paths: if not new_ref_config.equal_sample_table(config): corrected_config = config if ref_modified: corrected_config = ref_config_path if not args.keep_temps: observed_temp_fs.purge() temp_fs.purge() raise errors.SampleTableError(""" The sample tables in configs {0!r} and {1!r} differ because obsSumStats.pl modified alignment lengths in config {2!r} to correct for sites in the alignments with *any* ambiguous bases and/or gaps. Please make sure the sample tables in all configs will be the same after correcting alignment lengths for sites that contain *any* ambiguous bases and/or gaps. You can do this by copying and pasting the sample table in {2!r} that has been corrected by obsSumStats.pl into the other configs that were not corrected. """.format(ref_config_path, config, corrected_config)) else: log.info('Simulating summary statistics from observed configs...') num_observed_workers = min([args.reps, args.np]) if args.reps <= args.np: observed_batch_size = 1 remainder = 0 else: observed_batch_size, remainder = long_division( args.reps, args.np) msbayes_workers = [] for idx, cfg in enumerate(args.observed_configs): observed_model_idx = configs_to_models.get(cfg, None) schema = 'abctoolbox' for i in range(num_observed_workers): worker = MsBayesWorker(temp_fs=observed_temp_fs, sample_size=observed_batch_size, config_path=cfg, model_index=observed_model_idx, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False, staging_dir=None, tag=idx) msbayes_workers.append(worker) if remainder > 0: worker = MsBayesWorker(temp_fs=observed_temp_fs, sample_size=remainder, config_path=cfg, model_index=observed_model_idx, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False, staging_dir=None, tag=idx) msbayes_workers.append(worker) # run parallel msbayes processes msbayes_workers = 
Manager.run_workers(workers=msbayes_workers, num_processors=args.np) workers = dict( zip(range(len(args.observed_configs)), [[] for i in range(len(args.observed_configs))])) for w in msbayes_workers: workers[w.tag].append(w) # merge simulated observed data into one file for i in range(len(args.observed_configs)): merge_prior_files([w.prior_path for w in workers[i]], observed_paths[i]) lc = line_count(observed_paths[i], ignore_headers=True) if lc != args.reps: if not args.keep_temps: temp_fs.purge() raise Exception( 'The number of observed simulations ({0}) ' 'generated for observed config {1!r} and output to ' 'file {2!r} does not match the number of reps ' '({3})'.format(lc, args.observed_configs[i], observed_paths[i], args.reps)) if not args.keep_temps: log.debug('purging observed temps...') observed_temp_fs.purge() ########################################################################## ## Begin ABC analyses if not args.dry_run: abc_team.run() stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()
def main_cli():
    """Simulate prior samples from a config and plot summary statistics.

    Command-line entry point. It parses ``sys.argv``, simulates
    ``--num-prior-samples`` draws from the msBayes config using up to
    ``--np`` parallel ``MsBayesWorker`` processes, writes the pooled samples
    to ``prior-sample.txt`` (``.gz`` when ``--compress`` is given) in the
    output directory, and saves one histogram PDF (``plot-<stat>.pdf``) for
    every simulated statistic matching one of the ``--stat-prefixes``.
    Run settings and timing are recorded in ``pymsbayes-info.txt``.

    Raises:
        SystemExit: on invalid arguments (via ``parser.error``) or, with
            exit status 1, when matplotlib is not available.
        Exception: if a requested statistic pattern matches none of the
            simulated statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c', '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--sort-index',
        action='store',
        type=int,
        default=0,
        choices=range(12),
        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress plot data file.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    # Reject values that would otherwise surface later as a division by
    # zero when splitting the samples among the workers.
    if args.num_prior_samples < 1:
        parser.error('the number of prior samples must be at least 1')
    if args.np < 1:
        parser.error('the number of processes must be at least 1')

    ##########################################################################
    ## handle args

    # Project imports are deferred until the logging level is configured.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')

    # Normalize the prefixes so each ends in exactly one trailing dot.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes],
        ignore_case=True)

    # Compare with None so that an explicit `--seed 0` is honored.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])),
        log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker also simulates the leftover samples.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=args.config,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)

    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
                                  compresslevel=compress_level)
    try:
        for row in dict_line_iter(sample, sep='\t'):
            out.write(row)
    finally:
        # Release the handle even if a write fails part way through.
        if close:
            out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(sample_path))
        # Do not leave temporary files behind on this early exit.
        if not args.keep_temps:
            log.debug('purging temps...')
            temp_fs.purge()
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if not stat_pattern.match(stat):
                continue
            found = True
            values = [float(v) for v in values]
            plot_path = os.path.join(args.output_dir,
                                     'plot-{0}.pdf'.format(stat))
            summary = stats.get_summary(values)
            s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                summary['mean'],
                summary['qi_95'][0],
                summary['qi_95'][1])
            hd = plotting.HistData(x=values,
                                   normed=True,
                                   bins=20,
                                   histtype='bar',
                                   align='mid',
                                   orientation='vertical',
                                   zorder=0)
            hist = plotting.ScatterPlot(hist_data_list=[hd],
                                        right_text=s)
            hist.left_text_size = 12.0
            hist.right_text_size = 12.0
            xticks = list(hist.ax.get_xticks())
            xtick_labels = list(xticks)
            yticks = list(hist.ax.get_yticks())
            ytick_labels = list(yticks)
            # Blank every other label when an axis has many ticks.
            if len(xtick_labels) >= 8:
                for i in range(1, len(xtick_labels), 2):
                    xtick_labels[i] = ''
            if len(ytick_labels) >= 8:
                for i in range(1, len(ytick_labels), 2):
                    ytick_labels[i] = ''
            xticks_obj = plotting.Ticks(ticks=xticks,
                                        labels=xtick_labels,
                                        horizontalalignment='center')
            yticks_obj = plotting.Ticks(ticks=yticks,
                                        labels=ytick_labels)
            hist.xticks_obj = xticks_obj
            hist.yticks_obj = yticks_obj

            plot_grid = plotting.PlotGrid(subplots=[hist],
                                          num_columns=1,
                                          label_schema=None,
                                          title=stat,
                                          title_size=14.0,
                                          title_top=False,
                                          y_title='Density',
                                          y_title_position=0.001,
                                          y_title_size=14.0,
                                          height=4.0,
                                          width=6.0,
                                          auto_height=False)
            plot_grid.auto_adjust_margins = False
            plot_grid.margin_left = 0.04
            plot_grid.margin_bottom = 0.04
            plot_grid.margin_right = 1.0
            plot_grid.margin_top = 0.97
            plot_grid.reset_figure()
            plot_grid.savefig(plot_path)

        if not found:
            if not args.keep_temps:
                log.debug('purging temps...')
                temp_fs.purge()
            # Report the pattern text; the repr of a compiled pattern
            # object is not informative.
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(stat_pattern.pattern,
                                             ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Report the probability that a set of taxa share a divergence.

    Command-line entry point. It parses ``sys.argv``, reads a divergence
    model results file, and writes the posterior probability that the taxa
    given by ``--taxon-indices`` co-diverged to standard output. When a
    config file is supplied with ``-c``, the prior probability is estimated
    by simulation and the prior probability, Bayes factor and
    2ln(Bayes factor) are written as well.

    Probabilities of exactly 0 or 1 make the odds ratio degenerate. In
    those cases the Bayes factor is reported as ``inf``, ``0.0`` or ``nan``
    (with a warning) instead of aborting with an arithmetic error.

    Raises:
        SystemExit: with status 1 if fewer than two taxon indices are
            given or an index is out of bounds.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'div_model_path',
        metavar='DIV-MODEL-RESULTS-FILE',
        type=argparse_utils.arg_is_file,
        help=('Path to divergence model results file (i.e., '
              '`*-div-model-results.txt`).'))
    parser.add_argument(
        '-i', '--taxon-indices',
        nargs='+',
        type=argparse_utils.arg_is_positive_int,
        required=True,
        help=('Two or more space-separated indices of taxa for which to '
              'calculate the probability of them co-diverging. Indices '
              'correspond to the line in the sample table of the config, '
              'starting at 1 for the first line of the table. At least '
              'two indices are required.'))
    parser.add_argument(
        '-c', '--config',
        type=argparse_utils.arg_is_config,
        help=('msBayes config file to be used to estimate prior '
              'probability via simulations. If provided, the '
              'posterior and prior probability and bayes factor is '
              'reported. If not provided, only the posterior '
              'probability is reported.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=100000,
        help=('The number of prior samples to simulate for estimating '
              'prior probabilities. Only used if a config file is '
              'provided with the `-c` argument.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel for '
              'prior simulations. The default is the number of CPUs '
              'available on the machine. This option is only relevant '
              'if a config file is provided using the `-c` argument.'))
    parser.add_argument(
        '--seed',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        help=('Random number seed to use for simulations. This option '
              'is only relevant if a config file is provided using the '
              '`-c` argument.'))
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Project imports are deferred until the logging level is configured.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)

    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
        div_model_results_path=args.div_model_path)
    for i in args.taxon_indices:
        if (i < 1) or (i > div_models.npairs):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    # Convert the user's 1-based indices to 0-based.
    args.taxon_indices = [i - 1 for i in args.taxon_indices]
    prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices)

    def _odds(prob):
        # Odds p / (1 - p); infinite when the event is certain (p == 1).
        complement = 1 - prob
        if complement == 0:
            return float('inf')
        return prob / complement

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
            config_paths=[args.config],
            num_samples=args.num_prior_samples,
            num_processors=args.np)
        prob_estimator_team.start()
        prior_prob = prob_estimator_team.shared_div_probs[args.config][
            len(args.taxon_indices)]

        posterior_odds = _odds(prob_shared_div)
        prior_odds = _odds(prior_prob)
        if prior_odds == 0:
            # A prior probability of zero: the ratio is infinite unless the
            # posterior odds are zero too, in which case it is undefined.
            if posterior_odds > 0:
                bf = float('inf')
            else:
                bf = float('nan')
        else:
            # inf / inf evaluates to nan here without raising.
            bf = posterior_odds / prior_odds

        if bf > 0:
            two_ln_bf = 2 * math.log(bf)
        elif bf == 0:
            # log(0) would raise; the limit is negative infinity.
            two_ln_bf = float('-inf')
        else:
            # bf is nan (undefined ratio).
            two_ln_bf = float('nan')

        if (bf == 0) or math.isinf(bf) or math.isnan(bf):
            log.warning('The Bayes factor is degenerate because the '
                        'posterior and/or prior probability is exactly '
                        '0 or 1.')

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(two_ln_bf))