def setUp(self):
    self.set_up()
    self.cfg_path = package_paths.data_path('4pairs_1locus.cfg')
    self.cfg_path2 = package_paths.data_path('4pairs_1locus_maxt5.cfg')
    self.np_new_cfg = package_paths.data_path('negros_panay_new.cfg')
    self.np_new_sps_cfg = package_paths.data_path(
            'negros_panay_new_subs_per_site.cfg')
    self.np_cfg = package_paths.data_path('negros_panay_timescale.cfg')
    self.np_sps_cfg = package_paths.data_path(
            'negros_panay_timescale_subs_per_site.cfg')
    self.seed = GLOBAL_RNG.randint(1, 999999999)
    self.rng = random.Random()
    self.rng.seed(self.seed)
    self.output_dir = self.get_test_subdir(prefix='dmc-test-')
    self.output_prefix = self.temp_fs.token_id
def main():
    keys_to_print = {
        'ncats': 'expected number of categories',
        'concentration': 'concentration parameter',
    }
    parameter_options = ['concentration', 'ncats']
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
            description=description,
            formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='report version and exit')
    parser.add_argument('parameter',
            choices=parameter_options,
            nargs=1,
            help=('The parameter provided. The two options are:\n'
                  '`concentration`: the concentration parameter\n'
                  '\tof the Dirichlet process.\n'
                  '`ncats`: the expected (mean) number of\n'
                  '\tcategories for the Dirichlet\n'
                  '\tprocess.\n'
                  'You provide one of these two parameters along\n'
                  'with the number of elements (taxon pairs),\n'
                  'and this program calculates and returns the\n'
                  'other one accordingly.'))
    parser.add_argument('parameter_value',
            metavar='X',
            type=argparse_utils.arg_is_positive_float,
            help='Value of the parameter.')
    parser.add_argument('--shape',
            required=False,
            type=argparse_utils.arg_is_positive_float,
            help=('Shape parameter of a gamma hyperprior on the\n'
                  'concentration parameter of the Dirichlet\n'
                  'process. If provided, the program will\n'
                  'calculate a corresponding scale parameter\n'
                  'for the gamma hyperprior such that the\n'
                  'mean of the gamma hyperprior is equal to\n'
                  'the reported concentration parameter and the\n'
                  'prior expectation for the number of\n'
                  'categories is equal to `ncats`.'))
    parser.add_argument('--reps',
            action='store',
            type=int,
            required=False,
            help=('The number of simulation replicates to use for\n'
                  'estimating the probability of the number of categories.\n'
                  'By default, no simulations are run and the probabilities\n'
                  'are not estimated or reported.'))
    parser.add_argument('--np',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The\n'
                  'default is the number of CPUs available on the machine.\n'
                  'This option is only used if `--reps` is specified.'))
    parser.add_argument('--seed',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('num_elements',
            metavar='N',
            type=argparse_utils.arg_is_nonnegative_int,
            help='Number of elements (i.e., number of taxon pairs).')
    args = parser.parse_args()

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("WARNING")
    from pymsbayes.teams import DppSimTeam
    from pymsbayes.utils import GLOBAL_RNG, probability
    from pymsbayes.utils.stats import Partition

    p = Partition('0' * args.num_elements)
    results = dict(zip(parameter_options, [None for k in parameter_options]))
    args.parameter = args.parameter[0]
    if args.parameter == 'concentration':
        results['concentration'] = args.parameter_value
        results['ncats'] = p.get_dpp_expected_num_cats(args.parameter_value)
    elif args.parameter == 'ncats':
        if args.parameter_value > args.num_elements:
            sys.stderr.write('ERROR: `ncats` cannot be greater than the '
                    'number of elements\n')
            sys.exit(1)
        elif args.parameter_value < 1.0:
            sys.stderr.write('ERROR: `ncats` cannot be less than 1\n')
            sys.exit(1)
        results['ncats'] = args.parameter_value
        results['concentration'] = p.get_dpp_concentration(
                args.parameter_value)
    else:
        raise Exception('parameter option {0} is not valid'.format(
                args.parameter))
    alpha = results['concentration']
    if args.shape:
        results['shape'] = args.shape
        results['scale'] = results['concentration'] / args.shape
        parameter_options.extend(['shape', 'scale'])
        alpha = probability.GammaDistribution(
                shape=results['shape'],
                scale=results['scale'])
    sys.stdout.write('number of elements = {0}\n'.format(args.num_elements))
    for key in parameter_options:
        sys.stdout.write('{0} = {1}\n'.format(
                keys_to_print.get(key, key),
                results[key]))
    if args.reps:
        sys.stderr.write(
                '\nStarting simulations to estimate probabilities...\n')
        if not args.seed:
            args.seed = random.randint(1, 999999999)
        sys.stderr.write('Using seed {0}\n\n'.format(args.seed))
        GLOBAL_RNG.seed(args.seed)
        sim_team = DppSimTeam(
                alpha=alpha,
                num_elements=args.num_elements,
                base_distribution=None,
                num_samples=args.reps,
                num_processors=args.np)
        sim_team.start()
        sys.stderr.write(
                'Estimated probabilities of the number of categories:\n')
        for k, prob in sim_team.psi_probs.iteritems():
            sys.stdout.write('\tp(ncats = {0}) = {1:.4f} (n = {2})\n'.format(
                    k, prob, p.number_of_partitions_into_k_subsets(k)))
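
# A minimal, self-contained sketch (not pymsbayes code) of the relationship
# the tool above wraps: under a Dirichlet process with concentration alpha,
# the expected number of categories among n elements is
# sum_{i=1}^{n} alpha / (alpha + i - 1). `Partition.get_dpp_concentration`
# presumably inverts this relationship numerically; a simple bisection works
# because the expectation is monotonically increasing in alpha.

def _expected_num_cats_sketch(alpha, n):
    return sum(alpha / (alpha + i) for i in range(n))

def _concentration_for_ncats_sketch(ncats, n, lo=1e-9, hi=1e9):
    for _ in range(200):  # bisect until the interval is negligible
        mid = (lo + hi) / 2.0
        if _expected_num_cats_sketch(mid, n) < ncats:
            lo = mid
        else:
            hi = mid
    return (lo + hi) / 2.0

# For example, _concentration_for_ncats_sketch(3.0, 10) returns the alpha
# whose prior mean number of divergence-time categories across 10 taxon
# pairs is 3.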
            label_axis_label_size = 16.0)
    pg.savefig(os.path.join(out_dir, 'negros-panay-marginal-times.pdf'))

def main_cli():
    create_plots(
            dpp_config_path = project_util.PHILIPPINES_DPP_CFG,
            uniform_config_path = project_util.PHILIPPINES_UNIFORM_CFG,
            old_config_path = project_util.PHILIPPINES_OLD_CFG,
            dpp_info_path = project_util.PHILIPPINES_DPP_INFO,
            dpp_simple_info_path = project_util.PHILIPPINES_DPP_SIMPLE_INFO,
            dpp_inform_info_path = project_util.PHILIPPINES_DPP_INFORM_INFO,
            uniform_info_path = project_util.PHILIPPINES_UNIFORM_INFO,
            old_info_path = project_util.PHILIPPINES_OLD_INFO,
            out_dir = project_util.PLOT_DIR)
    create_negros_panay_plots(
            config_path = project_util.NEGROS_PANAY_CFG,
            ordered_info_path = project_util.NP_DPP_ORDERED_INFO,
            unordered_info_path = project_util.NP_DPP_UNORDERED_INFO,
            out_dir = project_util.PLOT_DIR)
    create_time_plot(
            config_path = project_util.NEGROS_PANAY_CFG,
            info_path = project_util.NP_DPP_ORDERED_INFO,
            out_dir = project_util.PLOT_DIR)

if __name__ == '__main__':
    if len(sys.argv) > 1:
        seed = int(sys.argv[1])
        GLOBAL_RNG.seed(seed)
    main_cli()
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))
    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')
    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # constrain all taxon pairs to a single divergence time so the simulated
    # stats can be plotted against that one time
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # divide the simulations among the worker processes; the last worker
    # absorbs any remainder
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot cannot be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                    'pi.net': r'$\pi_{net}$',
                    'wattTheta': r'$\theta_W$',
                    'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
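
# A small sketch (plain Python, no pymsbayes imports) of the batching idiom
# used above to divide prior simulations among worker processes: assuming
# `long_division` behaves like the built-in divmod, each of the `np` workers
# gets the quotient, and the last worker also absorbs the remainder, so the
# batch sizes always sum to the requested number of samples.

def _split_samples_sketch(num_samples, num_workers):
    batch_size, remainder = divmod(num_samples, num_workers)
    sizes = [batch_size] * num_workers
    sizes[-1] += remainder
    assert sum(sizes) == num_samples
    return sizes

# _split_samples_sketch(1000, 8) -> [125, 125, 125, 125, 125, 125, 125, 125]
# _split_samples_sketch(1002, 8) -> [125, 125, 125, 125, 125, 125, 125, 127]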
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to simulate prior '
                    'samples.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'plots.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))
    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'
    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)

    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(sample, sep = '\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot cannot be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x = values,
                        normed = True,
                        bins = 20,
                        histtype = 'bar',
                        align = 'mid',
                        orientation = 'vertical',
                        zorder = 0)
                hist = plotting.ScatterPlot(hist_data_list = [hd],
                        right_text = s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                # thin crowded axes by blanking every other tick label
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks = xticks,
                        labels = xtick_labels,
                        horizontalalignment = 'center')
                yticks_obj = plotting.Ticks(ticks = yticks,
                        labels = ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj
                plot_grid = plotting.PlotGrid(subplots = [hist],
                        num_columns = 1,
                        label_schema = None,
                        title = stat,
                        title_size = 14.0,
                        title_top = False,
                        y_title = 'Density',
                        y_title_position = 0.001,
                        y_title_size = 14.0,
                        height = 4.0,
                        width = 6.0,
                        auto_height = False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)
        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern.pattern,
                            ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
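
# A quick illustration of the tick-label thinning used above: when an axis
# would draw eight or more labels, every other label is blanked (the tick
# marks themselves are kept), halving the visual clutter.

_labels = ['0', '0.5', '1', '1.5', '2', '2.5', '3', '3.5']
if len(_labels) >= 8:
    for _i in range(1, len(_labels), 2):
        _labels[_i] = ''
# _labels is now ['0', '', '1', '', '2', '', '3', '']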
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('configs',
            metavar='CONFIG-PATH',
            type=arg_is_config,
            nargs='+',
            help=('msBayes config file paths for which to estimate prior '
                  'probabilities.'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=int,
            default=1000,
            help=('The number of prior samples to simulate for '
                  'probability estimates.'))
    parser.add_argument('--np',
            action='store',
            type=int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument('-d', '--dispersion-threshold',
            action='store',
            type=float,
            default=0.01,
            help=('The threshold for the dispersion index of divergence '
                  'times. The estimated prior probability that the '
                  'dispersion index is less than this threshold will '
                  'be reported for each config.'))
    parser.add_argument('-c', '--cv-threshold',
            action='store',
            type=float,
            default=0.01,
            help=('The threshold for the coefficient of variation (CV) of '
                  'divergence times. The estimated prior probability that '
                  'the CV is less than this threshold will be reported for '
                  'each config.'))
    parser.add_argument('--seed',
            action='store',
            type=int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    log.info('Using seed {0}'.format(args.seed))
    GLOBAL_RNG.seed(args.seed)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()
    prob_estimator_team = ModelProbabilityEstimatorTeam(
            config_paths=args.configs,
            num_samples=args.num_prior_samples,
            omega_threshold=args.dispersion_threshold,
            cv_threshold=args.cv_threshold,
            num_processors=args.np)
    prob_estimator_team.start()

    for path in args.configs:
        sys.stdout.write('Prior probabilities for model {0}:\n'.format(path))
        for k, p in prob_estimator_team.psi_probs[path].iteritems():
            sys.stdout.write('\tnum of divergence events = {0}: {1}\n'.format(
                    k, p))
        sys.stdout.write('\tdispersion of div times < {0}: {1}\n'.format(
                args.dispersion_threshold,
                prob_estimator_team.omega_probs[path]))
        sys.stdout.write('\tCV of div times < {0}: {1}\n'.format(
                args.cv_threshold,
                prob_estimator_team.cv_probs[path]))

    stop_time = datetime.datetime.now()
    log.info('[run_stats]')
    log.info('\tstart_time = {0}'.format(str(start_time)))
    log.info('\tstop_time = {0}'.format(str(stop_time)))
    log.info('\ttotal_duration = {0}'.format(str(stop_time - start_time)))
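
# A plain-Python sketch (not pymsbayes code) of the two clustering statistics
# thresholded above. For a set of divergence times, the dispersion index
# (omega) is the variance over the mean, and the coefficient of variation
# (CV) is the standard deviation over the mean; both approach zero as the
# times become identical (i.e., co-divergence). The sample (n - 1) variance
# is assumed here.

def _dispersion_index_sketch(times):
    n = float(len(times))
    mean = sum(times) / n
    var = sum((t - mean) ** 2 for t in times) / (n - 1)
    return var / mean

def _coefficient_of_variation_sketch(times):
    n = float(len(times))
    mean = sum(times) / n
    var = sum((t - mean) ** 2 for t in times) / (n - 1)
    return var ** 0.5 / mean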
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('config',
            metavar = 'CONFIG-FILE',
            type = argparse_utils.arg_is_config,
            help = ('msBayes config file used to estimate the posterior '
                    'sample.'))
    parser.add_argument('posterior_sample_path',
            metavar = 'POSTERIOR-SAMPLE-FILE',
            type = argparse_utils.arg_is_file,
            help = ('Path to posterior sample file (i.e., '
                    '`*-posterior-sample.txt`).'))
    parser.add_argument('-e', '--expression',
            dest = 'expressions',
            action = 'append',
            metavar = 'TAXON-INDEX-EXPRESSION',
            type = str,
            required = True,
            help = ('A conditional expression of divergence times based on '
                    'the taxon-pair indices for which to calculate the '
                    'posterior probability of being true. Indices correspond '
                    'to the order that pairs of taxa appear in the sample '
                    'table of the config, starting at 0 for the first '
                    'taxon-pair to appear in the table (starting from the '
                    'top). E.g., `-e "0 == 3 == 4"` would request the '
                    'proportion of times the 1st, 4th, and 5th taxon-pairs '
                    '(in order of appearance in the sample table of the '
                    'config) share the same divergence time in the '
                    'posterior sample, whereas `-e "0 > 1"` would request '
                    'the proportion of times the 1st taxon-pair diverged '
                    'further back in time than the 2nd taxon-pair in the '
                    'posterior sample.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('The number of prior samples to simulate for estimating '
                    'prior probabilities; prior probabilities and Bayes '
                    'factors will be reported. The default is to only report '
                    'posterior probabilities.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel for '
                    'prior simulations. The default is the number of CPUs '
                    'available on the machine. This option is only relevant '
                    'if the number of prior samples is specified using the '
                    '`-n` argument.'))
    parser.add_argument('--seed',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            help = ('Random number seed to use for simulations. This option '
                    'is only relevant if the number of prior samples is '
                    'specified using the `-n` argument.'))
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import DivModelSimulatorTeam
    from pymsbayes.utils import stats, sumresults, GLOBAL_RNG

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    cfg = config.MsBayesConfig(args.config)
    evaluators = []
    for exp in args.expressions:
        evaluators.append(stats.ListConditionEvaluator(exp,
                index_labels = cfg.taxa))

    div_models = sumresults.get_partitions_from_posterior_sample_file(
            args.posterior_sample_path)

    sim_team = None
    if args.num_prior_samples:
        sim_team = DivModelSimulatorTeam(
                config_paths = [args.config],
                num_samples = args.num_prior_samples,
                num_processors = args.np)
        sim_team.start()

    for e in evaluators:
        title = '{0} --- {1}:'.format(e.expression, e.pretty_expression)
        section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title))
        sys.stdout.write('{0}'.format(section_title))
        prob_shared_div = div_models.get_condition_frequency(e)
        sys.stdout.write('posterior probability = {0}\n'.format(
                prob_shared_div))
        if sim_team:
            prior_prob = sim_team.div_models[
                    args.config].get_condition_frequency(e)
            bf = ((prob_shared_div / (1 - prob_shared_div)) /
                    (prior_prob / (1 - prior_prob)))
            sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
            sys.stdout.write('Bayes factor = {0}\n'.format(bf))
            sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(
                    2 * math.log(bf)))
        sys.stdout.write('\n')
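
# A worked example of the Bayes-factor arithmetic above: the Bayes factor for
# an expression is its posterior odds divided by its prior odds. With an
# illustrative posterior probability of 0.8 and prior probability of 0.2:
#
#     BF = (0.8 / 0.2) / (0.2 / 0.8) = 4 / 0.25 = 16
#     2ln(BF) = 2 * ln(16) ~= 5.545
#
# Note the odds ratio is undefined when either probability is exactly 0 or 1.

_posterior, _prior = 0.8, 0.2  # illustrative values only
_bf = (_posterior / (1 - _posterior)) / (_prior / (1 - _prior))
assert abs(_bf - 16.0) < 1e-9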
def main_cli(argv = sys.argv): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser(description = description, formatter_class = argparse_utils.SmartHelpFormatter) parser.add_argument('-o', '--observed-configs', nargs = '+', type = argparse_utils.arg_is_config, required = True, help = ('One or more msBayes config files to be used to either ' 'calculate or simulate observed summary statistics. If ' 'used in combination with `-r` each config will be used to ' 'simulate pseudo-observed data. If analyzing real data, do ' 'not use the `-r` option, and the fasta files specified ' 'within the config must exist and contain the sequence ' 'data.')) parser.add_argument('-p', '--prior-configs', nargs = '+', type = argparse_utils.arg_is_path, required = True, help = ('One or more config files to be used to generate prior ' 'samples. If more than one config is specified, they ' 'should be separated by spaces. ' 'This option can also be used to specify the path to a ' 'directory containing the prior samples and summary ' 'statistic means and standard deviations generated by a ' 'previous run using the `generate-samples-only` option. ' 'These files should be found in the directory ' '`pymsbayes-output/prior-stats-summaries`. The' '`pymsbayes-output/model-key.txt` also needs to be present.' ' If specifying this directory, it should be the only ' 'argument (i.e., no other directories or config files can ' 'be provided).')) parser.add_argument('-r', '--reps', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('This option has two effects. First, it signifies that ' 'the analysis will be simulation based (i.e., no real ' 'data will be used). Second, it specifies how many ' 'simulation replicates to perform (i.e., how many data ' 'sets to simulate and analyze).')) parser.add_argument('-n', '--num-prior-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 1000000, help = ('The number of prior samples to simulate for each prior ' 'config specified with `-p`.')) parser.add_argument('--prior-batch-size', action = 'store', type = argparse_utils.arg_is_positive_int, default = 10000, help = ('The number of prior samples to simulate for each batch.')) parser.add_argument('--generate-samples-only', action = 'store_true', help = ('Only generate samples from models as requested. I.e., ' 'No analyses are performed to approximate posteriors. ' 'This option can be useful if you want the prior samples ' 'for other purposes.')) parser.add_argument('--num-posterior-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 1000, help = ('The number of posterior samples desired for each ' 'analysis. Default: 1000.')) parser.add_argument('--num-standardizing-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 10000, help = ('The number of prior samples desired to use for ' 'standardizing statistics. Default: 10000.')) parser.add_argument('--np', action = 'store', type = argparse_utils.arg_is_positive_int, default = multiprocessing.cpu_count(), help = ('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument('--output-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('The directory in which all output files will be written. 
' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument('--temp-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('A directory to temporarily stage files. The default is to ' 'use the output directory.')) parser.add_argument('--staging-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('A directory to temporarily stage prior files. This option ' 'can be useful on clusters to speed up I/O while ' 'generating prior samples. You can designate a local temp ' 'directory on a compute node to avoid constant writing to ' 'a shared drive. The default is to use the `temp-dir`.')) parser.add_argument('-s', '--stat-prefixes', nargs = '*', type = str, help = ('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi wattTheta pi.net tajD.denom`.')) parser.add_argument('-b', '--bandwidth', action = 'store', type = float, help = ('Smoothing parameter for the posterior kernal density ' 'estimation. This option is used for the `glm` ' 'regression method. The default is 2 / ' '`num-posterior-samples`.')) parser.add_argument('-q', '--num-posterior-quantiles', action = 'store', type = argparse_utils.arg_is_positive_int, default = 1000, help = ('The number of equally spaced quantiles at which to ' 'evaluate the GLM-estimated posterior density. ' 'Default: 1000.')) parser.add_argument('--reporting-frequency', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('Suggested frequency (in number of prior samples) for ' 'running regression and reporting current results. ' 'Default: 0 (only report final results). ' 'If a value is given, it may be adjusted so that the ' 'reporting frequency is a multiple of the multi-processed ' 'batch size.')) parser.add_argument('--sort-index', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, choices = range(12), help = argparse_utils.get_sort_index_help_message()) parser.add_argument('--no-global-estimate', action = 'store_true', help = ('If multiple prior models are specified, by default a ' 'global estimate is performed averaging over all models. ' 'This option prevents the global estimation (i.e., only ' 'inferences for each model are made).')) parser.add_argument('--compress', action = 'store_true', help = 'Compress large results files.') parser.add_argument('--keep-temps', action = 'store_true', help = 'Keep all temporary files.') parser.add_argument('--seed', action = 'store', type = int, help = 'Random number seed to use for the analysis.') parser.add_argument('--output-prefix', action = 'store', type = str, default = '', help = ('Prefix to use at beginning of output files. The default ' 'is no prefix.')) parser.add_argument('--data-key-path', action = 'store', type = argparse_utils.arg_is_file, help = ('The path to a `data-key.txt` file generated by a previous ' 'run. This file should be found in the directory ' '`pymsbayes-output/data-key.txt`. This option ' 'will override the `-o`/`--observed-configs` option, and ' 'is intended to be used in combination with the ' '`--start-from` option to restart an analysis.')) parser.add_argument('--start-from-simulation-index', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('The simulation index at which to begin analyses. 
Must be ' 'used in combination with either the number of simulation ' 'replicates (`-r`/`--reps`) or the `--data-key-path` ' 'option, and must be a positive ' 'integer that is less than the number of simulation ' 'replicates. This option can be useful if an analysis ' 'needs to be restarted.')) parser.add_argument('--start-from-observed-index', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('The observed config index at which to begin analyses. ' 'Can be used in combination with the `--data-key-path` ' 'option to restart long-running, multi-observed-config ' 'analyses.')) parser.add_argument('--dry-run', action = 'store_true', help = 'Do not run analyses; only process settings.') parser.add_argument('--version', action = 'version', version = '%(prog)s ' + _program_info['version'], help = 'Report version and exit.') parser.add_argument('--quiet', action = 'store_true', help = 'Run without verbose messaging.') parser.add_argument('--debug', action = 'store_true', help = 'Run in debugging mode.') if argv == sys.argv: args = parser.parse_args() else: args = parser.parse_args(argv) ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import (MsBayesWorker, merge_prior_files, ObsSumStatsWorker) from pymsbayes.teams import ABCTeam from pymsbayes.utils.functions import (is_file, is_dir, long_division, mk_new_dir) from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS, PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS, line_count) from pymsbayes.utils import sumresults, errors from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.config import MsBayesConfig from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace, MSBAYES_SORT_INDEX, ToolPathManager) MSBAYES_SORT_INDEX.set_index(args.sort_index) if len(args.observed_configs) != len(set(args.observed_configs)): raise ValueError('All paths to observed config files must be unique') if args.num_standardizing_samples > args.num_prior_samples: args.num_standardizing_samples = args.num_prior_samples # get full paths to tools msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') eureject_path = ToolPathManager.get_tool_full_path('eureject') abctb_path = ToolPathManager.get_tool_full_path('ABCestimator') # vet prior-configs option using_previous_priors = False previous_prior_dir = None if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])): previous_prior_dir = args.prior_configs.pop(0) previous_priors = glob.glob(os.path.join(previous_prior_dir, '*-prior-sample.txt')) previous_sums = glob.glob(os.path.join(previous_prior_dir, '*-means-and-std-devs.txt')) if (not previous_priors) or (not previous_sums): raise ValueError('directory {0!r} specified with `prior-configs` ' 'option does not contain necessary prior and summary ' 'files'.format(previous_prior_dir)) using_previous_priors = True else: for path in args.prior_configs: if not is_file(path): raise ValueError('prior config {0!r} is not a file'.format( path)) if len(args.prior_configs) != len(set(args.prior_configs)): raise 
ValueError('All paths to prior config files must be unique') if not args.output_dir: args.output_dir = os.path.dirname(args.observed_configs[0]) base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results')) if not args.temp_dir: args.temp_dir = base_dir info_path = os.path.join(base_dir, args.output_prefix + \ 'pymsbayes-info.txt') info = InfoLogger(info_path) info.write('[pymsbayes]') info.write('\tversion = {version}'.format(**_program_info)) info.write('\toutput_directory = {0}'.format(base_dir)) temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') base_temp_dir = temp_fs.base_dir info.write('\ttemp_directory = {0}'.format(base_temp_dir)) info.write('\tsort_index = {0}'.format( MSBAYES_SORT_INDEX.current_value())) info.write('\tsimulation_reps = {0}'.format(args.reps)) stat_patterns = DEFAULT_STAT_PATTERNS if args.stat_prefixes: for i in range(len(args.stat_prefixes)): if not args.stat_prefixes[i].endswith('.'): args.stat_prefixes[i] += '.' stat_patterns = get_patterns_from_prefixes( args.stat_prefixes, ignore_case=True) if not args.bandwidth: args.bandwidth = 2 / float(args.num_posterior_samples) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if args.data_key_path: observed_map = sumresults.parse_data_key_file(args.data_key_path) observed_paths = [observed_map[k] for k in sorted(observed_map.keys())] else: observed_dir = mk_new_dir(os.path.join(base_dir, 'observed-summary-stats')) observed_paths = [os.path.join(observed_dir, args.output_prefix + \ 'observed-{0}.txt'.format(i+1)) for i in range(len( args.observed_configs))] info.write('\tseed = {0}'.format(args.seed)) info.write('\tnum_processors = {0}'.format(args.np)) info.write('\tnum_prior_samples = {0}'.format( args.num_prior_samples)) info.write('\tnum_standardizing_samples = {0}'.format( args.num_standardizing_samples)) info.write('\tbandwidth = {0}'.format(args.bandwidth)) info.write('\tposterior_quantiles = {0}'.format( args.num_posterior_quantiles)) info.write('\tposterior_sample_size = {0}'.format( args.num_posterior_samples)) info.write('\tstat_patterns = {0}'.format( ', '.join([p.pattern for p in stat_patterns]))) # vet observed configs ref_config_path = args.observed_configs[0] ref_config = MsBayesConfig(ref_config_path) all_config_paths = [] num_taxon_pairs = ref_config.npairs assert num_taxon_pairs > 0 for config in args.observed_configs: all_config_paths.append(config) if not ref_config.equal_sample_table(config): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, config)) info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs)) info.write('\tdry_run = {0}'.format(args.dry_run)) info.write('\t[[tool_paths]]') info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path)) info.write('\t\tmsbayes = {0}'.format(msbayes_path)) info.write('\t\teureject = {0}'.format(eureject_path)) info.write('\t\tabcestimator = {0}'.format(abctb_path)) info.write('\t[[observed_configs]]') for i, cfg in enumerate(args.observed_configs): info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg, os.path.dirname(info_path)))) abc_team = ABCTeam( temp_fs = temp_fs, observed_stats_files = observed_paths, num_taxon_pairs = num_taxon_pairs, config_paths = args.prior_configs, previous_prior_dir = previous_prior_dir, num_prior_samples = args.num_prior_samples, num_processors = args.np, num_standardizing_samples = 
args.num_standardizing_samples, num_posterior_samples = args.num_posterior_samples, num_posterior_density_quantiles = args.num_posterior_quantiles, batch_size = args.prior_batch_size, output_dir = base_dir, output_prefix = args.output_prefix, prior_temp_dir = args.staging_dir, rng = GLOBAL_RNG, report_parameters = True, stat_patterns = stat_patterns, eureject_exe_path = eureject_path, abctoolbox_exe_path = abctb_path, msbayes_exe_path = None, abctoolbox_bandwidth = args.bandwidth, omega_threshold = 0.01, cv_threshold = 0.01, compress = args.compress, reporting_frequency = args.reporting_frequency, keep_temps = args.keep_temps, global_estimate_only = False, global_estimate = not args.no_global_estimate, generate_prior_samples_only = args.generate_samples_only, start_from_simulation_index = args.start_from_simulation_index, start_from_observed_index = args.start_from_observed_index) models_to_configs = {} configs_to_models = {} for k, v in abc_team.models.iteritems(): models_to_configs[k] = v configs_to_models[v] = k cfg = MsBayesConfig(v) all_config_paths.append(v) # vet prior configs if not ref_config.equal_sample_table(cfg): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, v)) info.write('\t[[observed_paths]]') for i in sorted(abc_team.observed_stats_paths.iterkeys()): info.write('\t\t{0} = {1}'.format(i, os.path.relpath( abc_team.observed_stats_paths[i], os.path.dirname(info_path)))) info.write('\t[[prior_configs]]') for i in sorted(abc_team.models.iterkeys()): info.write('\t\t{0} = {1}'.format(i, os.path.relpath( abc_team.models[i], os.path.dirname(info_path)))) ########################################################################## ## begin analysis --- get observed summary stats set_memory_trace() # start logging memory profile start_time = datetime.datetime.now() if args.data_key_path: log.info('Using provided summary statistics...') elif not args.dry_run: obs_temp_dir = base_temp_dir if args.staging_dir: obs_temp_dir = args.staging_dir observed_temp_fs = TempFileSystem(parent = obs_temp_dir, prefix = 'observed-temps-') if args.reps < 1: log.info('Calculating summary statistics from sequence data...') obs_workers = [] for i, cfg in enumerate(args.observed_configs): ss_worker = ObsSumStatsWorker( temp_fs = observed_temp_fs, config_path = cfg, output_path = observed_paths[i], schema = 'abctoolbox', stat_patterns = stat_patterns) obs_workers.append(ss_worker) obs_workers = Manager.run_workers( workers = obs_workers, num_processors = args.np) # re-vet all configs to see if some were changed by obsSumStats.pl new_ref_config = ref_config ref_modified = False # use a new reference config: if obsSumStats.pl updated all configs consistently, they will still match if not ref_config.equal_sample_table(ref_config_path): ref_modified = True new_ref_config = MsBayesConfig(ref_config_path) log.warning(""" The alignment lengths in config {0!r} have been corrected for sites with *any* ambiguous bases and/or gaps by obsSumStats.pl. """.format(ref_config_path)) for config in all_config_paths: if not new_ref_config.equal_sample_table(config): corrected_config = config if ref_modified: corrected_config = ref_config_path if not args.keep_temps: observed_temp_fs.purge() temp_fs.purge() raise errors.SampleTableError(""" The sample tables in configs {0!r} and {1!r} differ because obsSumStats.pl modified alignment lengths in config {2!r} to correct for sites in the alignments with *any* ambiguous bases and/or gaps. 
Please make sure the sample tables in all configs will be the same after correcting alignment lengths for sites that contain *any* ambiguous bases and/or gaps. You can do this by copying and pasting the sample table in {2!r} that has been corrected by obsSumStats.pl into the other configs that were not corrected. """.format(ref_config_path, config, corrected_config)) else: log.info('Simulating summary statistics from observed configs...') num_observed_workers = min([args.reps, args.np]) if args.reps <= args.np: observed_batch_size = 1 remainder = 0 else: observed_batch_size, remainder = long_division(args.reps, args.np) msbayes_workers = [] for idx, cfg in enumerate(args.observed_configs): observed_model_idx = configs_to_models.get(cfg, None) schema = 'abctoolbox' for i in range(num_observed_workers): worker = MsBayesWorker( temp_fs = observed_temp_fs, sample_size = observed_batch_size, config_path = cfg, model_index = observed_model_idx, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False, staging_dir = None, tag = idx) msbayes_workers.append(worker) if remainder > 0: worker = MsBayesWorker( temp_fs = observed_temp_fs, sample_size = remainder, config_path = cfg, model_index = observed_model_idx, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False, staging_dir = None, tag = idx) msbayes_workers.append(worker) # run parallel msbayes processes msbayes_workers = Manager.run_workers( workers = msbayes_workers, num_processors = args.np) workers = dict(zip(range(len(args.observed_configs)), [[] for i in range(len(args.observed_configs))])) for w in msbayes_workers: workers[w.tag].append(w) # merge simulated observed data into one file for i in range(len(args.observed_configs)): merge_prior_files([w.prior_path for w in workers[i]], observed_paths[i]) lc = line_count(observed_paths[i], ignore_headers=True) if lc != args.reps: if not args.keep_temps: temp_fs.purge() raise Exception('The number of observed simulations ({0}) ' 'generated for observed config {1!r} and output to ' 'file {2!r} does not match the number of reps ' '({3})'.format(lc, args.observed_configs[i], observed_paths[i], args.reps)) if not args.keep_temps: log.debug('purging observed temps...') observed_temp_fs.purge() ########################################################################## ## Begin ABC analyses if not args.dry_run: abc_team.run() stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()
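# Hypothetical example invocation of the CLI defined above (the script
# and file names below are illustrative assumptions, not paths shipped
# with the package):
#
#   dmc.py -o observed.cfg -p model1.cfg model2.cfg \
#       -n 1000000 --np 8 --output-dir results --seed 845225107
#
# This would draw one million prior samples under each of the two prior
# models, calculate observed summary statistics from the sequence data
# referenced in observed.cfg, and, because `--no-global-estimate` is not
# given, also report a model-averaged estimate across model1 and model2.
# Adding `-r 100` would instead simulate and analyze 100 pseudo-observed
# data sets per observed config.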
def main_cli(): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser(description=description) parser.add_argument('div_model_path', metavar='DIV-MODEL-RESULTS-FILE', type=argparse_utils.arg_is_file, help=('Path to divergence model results file (i.e., ' '`*-div-model-results.txt`).')) parser.add_argument( '-i', '--taxon-indices', nargs='+', type=argparse_utils.arg_is_positive_int, required=True, help=('Two or more space-separated indices of taxa for which to ' 'calculate the probability of them co-diverging. Indices ' 'correspond to the line in the sample table of the config, ' 'starting at 1 for the first line of the table. At least ' 'two indices are required.')) parser.add_argument( '-c', '--config', type=argparse_utils.arg_is_config, help=('msBayes config file to be used to estimate prior ' 'probability via simulations. If provided, the ' 'posterior probability, prior probability, and Bayes ' 'factor are reported. If not provided, only the posterior ' 'probability is reported.')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=100000, help=('The number of prior samples to simulate for estimating ' 'prior probabilities. Only used if a config file is ' 'provided with the `-c` argument.')) parser.add_argument( '--np', action='store', type=argparse_utils.arg_is_positive_int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel for ' 'prior simulations. The default is the number of CPUs ' 'available on the machine. This option is only relevant ' 'if a config file is provided using the `-c` argument.')) parser.add_argument( '--seed', action='store', type=argparse_utils.arg_is_positive_int, help=('Random number seed to use for simulations. 
This option ' 'is only relevant if a config file is provided using the ' '`-c` argument.')) parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') args = parser.parse_args() ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes import config from pymsbayes.teams import ModelProbabilityEstimatorTeam from pymsbayes.utils import sumresults, GLOBAL_RNG if len(args.taxon_indices) < 2: log.error('At least two taxon indices are required') sys.exit(1) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) div_models = sumresults.OrderedDivergenceModelCollection( div_model_results_path=args.div_model_path) for i in args.taxon_indices: if ((i < 1) or (i > div_models.npairs)): log.error('taxon index {0} is out of bounds'.format(i)) sys.exit(1) args.taxon_indices = [i - 1 for i in args.taxon_indices] prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices) if args.config: prob_estimator_team = ModelProbabilityEstimatorTeam( config_paths=[args.config], num_samples=args.num_prior_samples, num_processors=args.np) prob_estimator_team.start() prior_prob = prob_estimator_team.shared_div_probs[args.config][len( args.taxon_indices)] bf = ((prob_shared_div / (1 - prob_shared_div)) / (prior_prob / (1 - prior_prob))) sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div)) if args.config: sys.stdout.write('prior probability = {0}\n'.format(prior_prob)) sys.stdout.write('Bayes factor = {0}\n'.format(bf)) sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 * math.log(bf)))
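# A minimal sketch of the odds-ratio arithmetic used above, with
# hypothetical probabilities standing in for real estimates; the helper
# is illustrative only and not part of the pymsbayes API.
import math  # already imported by this script; repeated so the sketch is self-contained
def _example_bayes_factor(posterior_prob=0.8, prior_prob=0.1):
    # posterior odds divided by prior odds for the shared-divergence
    # hypothesis; mirrors the `bf` calculation in main_cli above
    bf = ((posterior_prob / (1.0 - posterior_prob)) /
            (prior_prob / (1.0 - prior_prob)))
    return bf, 2 * math.log(bf)
# _example_bayes_factor() returns (36.0, ~7.17); on the commonly cited
# Kass & Raftery scale, 2ln(bf) > 6 is interpreted as strong support.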
def main_cli(): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser(description = description) parser.add_argument('info_path', metavar='PYMSBAYES-INFO-FILE', type=argparse_utils.arg_is_file, help=('Path to `pymsbayes-info.txt` file.')) parser.add_argument('-n', '--num-prior-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 100000, help = ('The number of prior samples to simulate for estimating ' 'prior probabilities.')) parser.add_argument('-i', '--sample-index', action = 'store', type = argparse_utils.arg_is_positive_int, help = ('The prior-sample index of results to be summarized. ' 'Output files should have a consistent schema. For ' 'example, a results file for divergence models might look ' 'something like ' '`d1-m1-s1-1000000-div-model-results.txt`. In this example, ' 'the prior-sample index is "1000000". The default is to ' 'use the largest prior-sample index, which is probably ' 'what you want.')) parser.add_argument('-o', '--output-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('The directory in which all output plots will be written. ' 'The default is to use the directory of the pymsbayes info ' 'file.')) parser.add_argument('--np', action = 'store', type = argparse_utils.arg_is_positive_int, default = multiprocessing.cpu_count(), help = ('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument('-m', '--mu', action = 'store', type = argparse_utils.arg_is_positive_float, default = None, help = ('The mutation rate with which to scale time to units of ' 'generations. By default, time is not scaled to ' 'generations.')) parser.add_argument('--extension', action = 'store', type = str, default = 'pdf', help = ('The file format extension of the plots (e.g., "pdf", ' '"png"). The default is pdf.')) parser.add_argument('--seed', action = 'store', type = argparse_utils.arg_is_positive_int, help = 'Random number seed to use for the analysis.') parser.add_argument('--version', action = 'version', version = '%(prog)s ' + _program_info['version'], help = 'Report version and exit.') parser.add_argument('--quiet', action = 'store_true', help = 'Run without verbose messaging.') parser.add_argument('--debug', action = 'store_true', help = 'Run in debugging mode.') args = parser.parse_args() ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes import plotting from pymsbayes.utils import sumresults from pymsbayes.utils import GLOBAL_RNG if not plotting.MATPLOTLIB_AVAILABLE: log.error( '`matplotlib` could not be imported, so plots can not be\n' 'produced. 
Please install `matplotlib` and try again.') sys.exit(1) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if not args.output_dir: args.output_dir = os.path.dirname(args.info_path) args.output_dir = os.path.join(args.output_dir, 'plots') if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) results = sumresults.DMCSimulationResults(args.info_path) if results.num_sim_reps > 1: log.error('Results appear to be from simulation-based analysis, ' 'for which this plotting script is not appropriate.') sys.exit(1) observed_indices = sorted(results.observed_index_to_config.keys()) prior_indices = sorted(results.prior_index_to_config.keys()) for obs_idx in observed_indices: for prior_idx in prior_indices: result_indices = results.get_result_indices(obs_idx, prior_idx, 1) result_idx = max(result_indices) result_path_prefix = '{0}{1}-'.format( results.get_result_path_prefix(obs_idx, prior_idx, 1), result_idx) result_dir = os.path.dirname(result_path_prefix) out_prefix = os.path.join(args.output_dir, os.path.basename( result_path_prefix)) prior_cfg = results.prior_configs[prior_idx] posterior_summary_path = get_result_path(result_path_prefix, 'posterior-summary') div_model_path = get_result_path(result_path_prefix, 'div-model-results') config_path = results.prior_index_to_config[prior_idx] time_multiplier = 1.0 if args.mu is not None: if prior_cfg.time_in_subs_per_site: time_multiplier = 1.0 / args.mu else: try: mean_theta = prior_cfg.theta.mean except AttributeError: mean_theta = prior_cfg.d_theta.mean time_multiplier = mean_theta / args.mu if results.sort_index == 0: #plot marginal times if not posterior_summary_path: log.warning('Could not find {0}{1}.txt(.gz); ' 'Skipping marginal times plot...'.format( result_path_prefix, 'posterior-summary')) else: label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56 marginal_times_plot = plotting.get_marginal_divergence_time_plot( config_path = config_path, posterior_summary_path = posterior_summary_path, labels = None, estimate = 'median', interval = 'HPD_95_interval', time_multiplier = time_multiplier, horizontal = True, label_dimension = label_dimension, measure_dimension = 8.0, label_size = 12.0, measure_tick_label_size = 12.0, measure_axis_label = 'Divergence time', measure_axis_label_size = 14.0, label_axis_label = 'Taxon pair', label_axis_label_size = 14.0, usetex = False) marginal_times_path = '{0}{1}'.format(out_prefix, 'marginal-divergence-times.' 
+ args.extension) marginal_times_plot.savefig(marginal_times_path) #plot top ordered models if not div_model_path: log.warning('Could not find {0}{1}.txt(.gz); ' 'Skipping ordered div model plot...'.format( result_path_prefix, 'div-model-results')) else: height = 12.0 margin_top = 0.99 margin_left = 0.03 padding_between_vertical = 0.8 if prior_cfg.npairs < 4: height *= 0.8 margin_top -= 0.01 margin_left += 0.05 padding_between_vertical += 0.3 width = (0.38 * prior_cfg.npairs) + 1.5 div_model_plot = plotting.OrderedDivergenceModelPlotGrid( div_model_results_path = div_model_path, config_path = config_path, num_top_models = 10, time_multiplier = time_multiplier, height = height, width = width, plot_label_schema = 'uppercase', plot_label_offset = 0, plot_label_size = 12.0, y_title = 'Divergence time', y_title_size = 14.0, y_tick_label_size = 10.0, right_text_size = 10.0, margin_left = margin_left, margin_bottom = 0.0, margin_right = 1, margin_top = margin_top, padding_between_vertical = padding_between_vertical, tab = 0.08) plot = div_model_plot.create_grid() div_model_plot_path = '{0}{1}'.format(out_prefix, 'ordered-div-models.' + args.extension) plot.savefig(div_model_plot_path) else: #plot top unordered models if not div_model_path: log.warning('Could not find {0}{1}.txt(.gz); ' 'Skipping unordered div model plot...'.format( result_path_prefix, 'div-model-results')) else: width = (0.38 * prior_cfg.npairs) + 1.5 div_model_plot = plotting.UnorderedDivergenceModelPlotGrid( div_model_results_path = div_model_path, num_top_models = 10, time_multiplier = time_multiplier, height = 10.0, width = width, data_label_size = 10.0, plot_label_schema = 'uppercase', plot_label_offset = 0, plot_label_size = 12.0, y_title = 'Divergence time', y_title_size = 14.0, y_tick_label_size = 10.0, right_text_size = 10.0, margin_left = 0.03, margin_bottom = 0.0, margin_right = 1, margin_top = 0.99, padding_between_vertical = 0.8, tab = 0.08) plot = div_model_plot.create_grid() div_model_plot_path = '{0}{1}'.format(out_prefix, 'unordered-div-models.' + args.extension) plot.savefig(div_model_plot_path) #plot number-of-divergences summary psi_path = get_result_path(result_path_prefix, 'psi-results') if not psi_path: log.warning('Could not find {0}{1}.txt(.gz); ' 'Skipping number of divergences plot...'.format( result_path_prefix, 'psi-results')) else: width = (0.25 * prior_cfg.npairs) + 0.55 if width < 2.8: width = 2.8 num_div_summary = plotting.NumberOfDivergencesSummary( config_path = results.prior_index_to_config[prior_idx], psi_results_path = psi_path, posterior_summary_path = posterior_summary_path, num_prior_samples = args.num_prior_samples, num_processors = args.np) num_div_summary.create_plot( plot_label_size = 10.0, right_text_size = 10.0, x_label_size = 10.0, y_label_size = 10.0, xtick_label_size = 10.0, ytick_label_size = 8.0, height = 6.0, width = width, margin_bottom = 0.0, margin_left = 0.0, margin_top = 0.97, margin_right = 1.0, padding_between_vertical = 1.0) num_div_plot_path = '{0}{1}'.format(out_prefix, 'number-of-divergences.' + args.extension) num_div_summary.save_plot(num_div_plot_path) bf_plot_path = '{0}{1}'.format(out_prefix, ('number-of-divergences-bayes-factors-only.' 
+ args.extension)) num_div_summary.save_bf_plot(bf_plot_path) num_div_bf_path = '{0}{1}'.format(out_prefix, 'number-of-divergences-bayes-factors.txt') with open(num_div_bf_path, 'w') as out: out.write('num_of_divs\t2ln(bf)\n') for n in sorted(num_div_summary.psi_bayes_factors.keys()): out.write('{0}\t{1}\n'.format(n, num_div_summary.psi_bayes_factors[n])) log.info('The plots are in: {0}'.format(args.output_dir))
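# A minimal sketch of the time-rescaling rule applied in the loop above,
# assuming (as the arithmetic implies) that theta = 4*N*mu, so that
# mean_theta / mu recovers 4N; the helper and values are hypothetical
# and not part of the pymsbayes API.
def _example_time_multiplier(mu, mean_theta=None,
        time_in_subs_per_site=False):
    if time_in_subs_per_site:
        # time is in expected substitutions per site; dividing by the
        # per-generation rate converts it to generations
        return 1.0 / mu
    # otherwise time is in units of 4N generations, and
    # mean_theta / mu = 4N rescales it to generations
    return mean_theta / mu
# e.g., _example_time_multiplier(1e-8, mean_theta=0.01) returns 1e6, so
# a posterior divergence time of 0.5 would plot as 500,000 generations.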
def main_cli(): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser(description=description) parser.add_argument( 'config', metavar='CONFIG-FILE', type=argparse_utils.arg_is_config, help=('msBayes config file used to estimate the posterior ' 'sample.')) parser.add_argument('posterior_sample_path', metavar='POSTERIOR-SAMPLE-FILE', type=argparse_utils.arg_is_file, help=('Path to posterior sample file (i.e., ' '`*-posterior-sample.txt`).')) parser.add_argument( '-e', '--expression', dest='expressions', action='append', metavar='TAXON-INDEX-EXPRESSION', type=str, required=True, help=('A conditional expression of divergence times based on ' 'the taxon-pair indices for which to calculate the ' 'posterior probability of being true. Indices correspond ' 'to the order that pairs of taxa appear in the sample ' 'table of the config, starting at 0 for the first ' 'taxon-pair to appear in the table (starting from the ' 'top). E.g., `-e "0 == 3 == 4"` would request the ' 'proportion of times the 1st, 4th, and 5th taxon-pairs ' '(in order of appearance in the sample table of the ' 'config) share the same divergence time in the ' 'posterior sample, whereas `-e "0 > 1"` would request the ' 'proportion of times the 1st taxon-pair diverged ' 'further back in time than the 2nd taxon-pair in the ' 'posterior sample.')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=argparse_utils.arg_is_positive_int, help=('The number of prior samples to simulate for estimating ' 'prior probabilities; prior probabilities and Bayes ' 'factors will be reported. The default is to only report ' 'posterior probabilities.')) parser.add_argument( '--np', action='store', type=argparse_utils.arg_is_positive_int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel for ' 'prior simulations. The default is the number of CPUs ' 'available on the machine. This option is only relevant ' 'if the number of prior samples is specified using the ' '`-n` argument.')) parser.add_argument( '--seed', action='store', type=argparse_utils.arg_is_positive_int, help=('Random number seed to use for simulations. 
This option ' 'is only relevant if the number of prior samples is ' 'specified using the `-n` argument.')) parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') args = parser.parse_args() ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes import config from pymsbayes.teams import DivModelSimulatorTeam from pymsbayes.utils import stats, sumresults, GLOBAL_RNG if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) cfg = config.MsBayesConfig(args.config) evaluators = [] for exp in args.expressions: evaluators.append( stats.ListConditionEvaluator(exp, index_labels=cfg.taxa)) div_models = sumresults.get_partitions_from_posterior_sample_file( args.posterior_sample_path) sim_team = None if args.num_prior_samples: sim_team = DivModelSimulatorTeam(config_paths=[args.config], num_samples=args.num_prior_samples, num_processors=args.np) sim_team.start() for e in evaluators: title = '{0} --- {1}:'.format(e.expression, e.pretty_expression) section_title = '\n{0}\n{1}\n'.format(title, '-' * len(title)) sys.stdout.write('{0}'.format(section_title)) prob_shared_div = div_models.get_condition_frequency(e) sys.stdout.write( 'posterior probability = {0}\n'.format(prob_shared_div)) if sim_team: prior_prob = sim_team.div_models[ args.config].get_condition_frequency(e) bf = ((prob_shared_div / (1 - prob_shared_div)) / (prior_prob / (1 - prior_prob))) sys.stdout.write('prior probability = {0}\n'.format(prior_prob)) sys.stdout.write('Bayes factor = {0}\n'.format(bf)) sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(2 * math.log(bf))) sys.stdout.write('\n')
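# Hypothetical example invocation of the CLI defined above (the script
# and file names below are illustrative assumptions):
#
#   dmc_posterior_probs.py doves.cfg run1-posterior-sample.txt \
#       -e "0 == 1 == 2" -e "0 > 3" -n 100000
#
# The first expression reports how often the first three taxon pairs
# share a single divergence time in the posterior sample; the second,
# how often the first taxon pair diverged further back in time than the
# fourth. Because `-n` is given, each posterior probability is paired
# with a simulation-based prior probability and the resulting Bayes
# factor.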
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('info_path',
            metavar='PYMSBAYES-INFO-FILE',
            type=argparse_utils.arg_is_file,
            help=('Path to `pymsbayes-info.txt` file.'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=100000,
            help=('The number of prior samples to simulate for estimating '
                  'prior probabilities.'))
    parser.add_argument('-i', '--sample-index',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            help=('The prior-sample index of results to be summarized. '
                  'Output files should have a consistent schema. For '
                  'example, a results file for divergence models might look '
                  'something like '
                  '`d1-m1-s1-1000000-div-model-results.txt`. In this example, '
                  'the prior-sample index is "1000000". The default is to '
                  'use the largest prior-sample index, which is probably '
                  'what you want.'))
    parser.add_argument('-o', '--output-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('The directory in which all output plots will be written. '
                  'The default is to use the directory of the pymsbayes info '
                  'file.'))
    parser.add_argument('--np',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument('-m', '--mu',
            action='store',
            type=argparse_utils.arg_is_positive_float,
            default=None,
            help=('The mutation rate with which to scale time to units of '
                  'generations. By default, time is not scaled to '
                  'generations.'))
    parser.add_argument('--extension',
            action='store',
            type=str,
            default='pdf',
            help=('The file format extension of the plots (e.g., "pdf", '
                  '"png"). The default is pdf.'))
    parser.add_argument('--seed',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import plotting
    from pymsbayes.utils import sumresults
    from pymsbayes.utils import GLOBAL_RNG

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.error('`matplotlib` could not be imported, so plots cannot be '
                'produced. Please install `matplotlib` and try again.')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.info_path)
    args.output_dir = os.path.join(args.output_dir, 'plots')
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    results = sumresults.DMCSimulationResults(args.info_path)
    if results.num_sim_reps > 1:
        log.error('Results appear to be from a simulation-based analysis, '
                'for which this plotting script is not appropriate.')
        sys.exit(1)

    observed_indices = sorted(results.observed_index_to_config.keys())
    prior_indices = sorted(results.prior_index_to_config.keys())
    for obs_idx in observed_indices:
        for prior_idx in prior_indices:
            result_indices = results.get_result_indices(obs_idx, prior_idx, 1)
            result_idx = max(result_indices)
            result_path_prefix = '{0}{1}-'.format(
                    results.get_result_path_prefix(obs_idx, prior_idx, 1),
                    result_idx)
            result_dir = os.path.dirname(result_path_prefix)
            out_prefix = os.path.join(args.output_dir,
                    os.path.basename(result_path_prefix))
            prior_cfg = results.prior_configs[prior_idx]
            posterior_summary_path = get_result_path(result_path_prefix,
                    'posterior-summary')
            div_model_path = get_result_path(result_path_prefix,
                    'div-model-results')
            config_path = results.prior_index_to_config[prior_idx]
            time_multiplier = 1.0
            if args.mu is not None:
                if prior_cfg.time_in_subs_per_site:
                    time_multiplier = 1.0 / args.mu
                else:
                    try:
                        mean_theta = prior_cfg.theta.mean
                    except AttributeError:
                        mean_theta = prior_cfg.d_theta.mean
                    time_multiplier = mean_theta / args.mu

            if results.sort_index == 0:
                # plot marginal times
                if not posterior_summary_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping marginal times plot...'.format(
                                result_path_prefix, 'posterior-summary'))
                else:
                    label_dimension = (0.34 * (prior_cfg.npairs + 1)) + 0.56
                    marginal_times_plot = plotting.get_marginal_divergence_time_plot(
                            config_path=config_path,
                            posterior_summary_path=posterior_summary_path,
                            labels=None,
                            estimate='median',
                            interval='HPD_95_interval',
                            time_multiplier=time_multiplier,
                            horizontal=True,
                            label_dimension=label_dimension,
                            measure_dimension=8.0,
                            label_size=12.0,
                            measure_tick_label_size=12.0,
                            measure_axis_label='Divergence time',
                            measure_axis_label_size=14.0,
                            label_axis_label='Taxon pair',
                            label_axis_label_size=14.0,
                            usetex=False)
                    marginal_times_path = '{0}{1}'.format(out_prefix,
                            'marginal-divergence-times.' + args.extension)
                    marginal_times_plot.savefig(marginal_times_path)

                # plot top ordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping ordered div model plot...'.format(
                                result_path_prefix, 'div-model-results'))
                else:
                    height = 12.0
                    margin_top = 0.99
                    margin_left = 0.03
                    padding_between_vertical = 0.8
                    if prior_cfg.npairs < 4:
                        height *= 0.8
                        margin_top -= 0.01
                        margin_left += 0.05
                        padding_between_vertical += 0.3
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.OrderedDivergenceModelPlotGrid(
                            div_model_results_path=div_model_path,
                            config_path=config_path,
                            num_top_models=10,
                            time_multiplier=time_multiplier,
                            height=height,
                            width=width,
                            plot_label_schema='uppercase',
                            plot_label_offset=0,
                            plot_label_size=12.0,
                            y_title='Divergence time',
                            y_title_size=14.0,
                            y_tick_label_size=10.0,
                            right_text_size=10.0,
                            margin_left=margin_left,
                            margin_bottom=0.0,
                            margin_right=1,
                            margin_top=margin_top,
                            padding_between_vertical=padding_between_vertical,
                            tab=0.08)
                    plot = div_model_plot.create_grid()
                    div_model_plot_path = '{0}{1}'.format(out_prefix,
                            'ordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)
            else:
                # plot top unordered models
                if not div_model_path:
                    log.warning('Could not find {0}{1}.txt(.gz); '
                            'Skipping unordered div model plot...'.format(
                                result_path_prefix, 'div-model-results'))
                else:
                    width = (0.38 * prior_cfg.npairs) + 1.5
                    div_model_plot = plotting.UnorderedDivergenceModelPlotGrid(
                            div_model_results_path=div_model_path,
                            num_top_models=10,
                            time_multiplier=time_multiplier,
                            height=10.0,
                            width=width,
                            data_label_size=10.0,
                            plot_label_schema='uppercase',
                            plot_label_offset=0,
                            plot_label_size=12.0,
                            y_title='Divergence time',
                            y_title_size=14.0,
                            y_tick_label_size=10.0,
                            right_text_size=10.0,
                            margin_left=0.03,
                            margin_bottom=0.0,
                            margin_right=1,
                            margin_top=0.99,
                            padding_between_vertical=0.8,
                            tab=0.08)
                    plot = div_model_plot.create_grid()
                    # name the output after the unordered models plotted here
                    div_model_plot_path = '{0}{1}'.format(out_prefix,
                            'unordered-div-models.' + args.extension)
                    plot.savefig(div_model_plot_path)

            # plot ndiv plot
            psi_path = get_result_path(result_path_prefix, 'psi-results')
            if not psi_path:
                log.warning('Could not find {0}{1}.txt(.gz); '
                        'Skipping number of divergences plot...'.format(
                            result_path_prefix, 'psi-results'))
            else:
                width = (0.25 * prior_cfg.npairs) + 0.55
                if width < 2.8:
                    width = 2.8
                num_div_summary = plotting.NumberOfDivergencesSummary(
                        config_path=results.prior_index_to_config[prior_idx],
                        psi_results_path=psi_path,
                        posterior_summary_path=posterior_summary_path,
                        num_prior_samples=args.num_prior_samples,
                        num_processors=args.np)
                num_div_summary.create_plot(
                        plot_label_size=10.0,
                        right_text_size=10.0,
                        x_label_size=10.0,
                        y_label_size=10.0,
                        xtick_label_size=10.0,
                        ytick_label_size=8.0,
                        height=6.0,
                        width=width,
                        margin_bottom=0.0,
                        margin_left=0.0,
                        margin_top=0.97,
                        margin_right=1.0,
                        padding_between_vertical=1.0)
                num_div_plot_path = '{0}{1}'.format(out_prefix,
                        'number-of-divergences.' + args.extension)
                num_div_summary.save_plot(num_div_plot_path)

                bf_plot_path = '{0}{1}'.format(out_prefix,
                        ('number-of-divergences-bayes-factors-only.'
                                + args.extension))
                num_div_summary.save_bf_plot(bf_plot_path)

                num_div_bf_path = '{0}{1}'.format(out_prefix,
                        'number-of-divergences-bayes-factors.txt')
                with open(num_div_bf_path, 'w') as out:
                    out.write('num_of_divs\t2ln(bf)\n')
                    for n in sorted(num_div_summary.psi_bayes_factors.keys()):
                        out.write('{0}\t{1}\n'.format(n,
                                num_div_summary.psi_bayes_factors[n]))

    log.info('The plots are in: {0}'.format(args.output_dir))
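# A minimal sketch of the time-scaling rule applied above, assuming (as the
# code suggests) that times reported in expected substitutions per site are
# converted to generations by dividing by the per-site, per-generation
# mutation rate `mu`, and that otherwise times are rescaled by
# mean_theta / mu. The helper name and numbers below are purely
# illustrative, not part of pymsbayes.
def _example_time_multiplier(mu, time_in_subs_per_site, mean_theta=None):
    if time_in_subs_per_site:
        # substitutions/site divided by substitutions/site/generation
        # yields generations
        return 1.0 / mu
    # otherwise, scale by the mean of the theta prior
    return mean_theta / mu

# For example, with mu = 1e-8 and times in substitutions per site, a
# divergence time of 0.002 corresponds to
# 0.002 * _example_time_multiplier(1e-8, True) = 200,000 generations.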
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('div_model_path',
            metavar='DIV-MODEL-RESULTS-FILE',
            type=argparse_utils.arg_is_file,
            help=('Path to divergence model results file (i.e., '
                  '`*-div-model-results.txt`).'))
    parser.add_argument('-i', '--taxon-indices',
            nargs='+',
            type=argparse_utils.arg_is_positive_int,
            required=True,
            help=('Two or more space-separated indices of taxa for which to '
                  'calculate the probability of them co-diverging. Indices '
                  'correspond to the line in the sample table of the config, '
                  'starting at 1 for the first line of the table. At least '
                  'two indices are required.'))
    parser.add_argument('-c', '--config',
            type=argparse_utils.arg_is_config,
            help=('msBayes config file to be used to estimate the prior '
                  'probability via simulations. If provided, the posterior '
                  'and prior probability and the Bayes factor are '
                  'reported. If not provided, only the posterior '
                  'probability is reported.'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=100000,
            help=('The number of prior samples to simulate for estimating '
                  'prior probabilities. Only used if a config file is '
                  'provided with the `-c` argument.'))
    parser.add_argument('--np',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel for '
                  'prior simulations. The default is the number of CPUs '
                  'available on the machine. This option is only relevant '
                  'if a config file is provided using the `-c` argument.'))
    parser.add_argument('--seed',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            help=('Random number seed to use for simulations. This option '
                  'is only relevant if a config file is provided using the '
                  '`-c` argument.'))
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes import config
    from pymsbayes.teams import ModelProbabilityEstimatorTeam
    from pymsbayes.utils import sumresults, GLOBAL_RNG

    if len(args.taxon_indices) < 2:
        log.error('At least two taxon indices are required')
        sys.exit(1)

    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    div_models = sumresults.OrderedDivergenceModelCollection(
            div_model_results_path=args.div_model_path)
    for i in args.taxon_indices:
        if (i < 1) or (i > div_models.npairs):
            log.error('taxon index {0} is out of bounds'.format(i))
            sys.exit(1)
    # convert the one-based indices from the command line to zero-based
    args.taxon_indices = [i - 1 for i in args.taxon_indices]
    prob_shared_div = div_models.prob_of_shared_divergence(args.taxon_indices)

    if args.config:
        prob_estimator_team = ModelProbabilityEstimatorTeam(
                config_paths=[args.config],
                num_samples=args.num_prior_samples,
                num_processors=args.np)
        prob_estimator_team.start()
        prior_prob = prob_estimator_team.shared_div_probs[args.config][
                len(args.taxon_indices)]
        bf = ((prob_shared_div / (1 - prob_shared_div)) /
                (prior_prob / (1 - prior_prob)))

    sys.stdout.write('posterior probability = {0}\n'.format(prob_shared_div))
    if args.config:
        sys.stdout.write('prior probability = {0}\n'.format(prior_prob))
        sys.stdout.write('Bayes factor = {0}\n'.format(bf))
        sys.stdout.write('2ln(Bayes factor) = {0}\n'.format(
                2 * math.log(bf)))
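# A minimal sketch of the Bayes-factor arithmetic used above: the Bayes
# factor is the ratio of posterior odds to prior odds that the focal taxa
# co-diverged. The helper name and values below are purely illustrative,
# not part of pymsbayes.
def _example_shared_div_bayes_factor(posterior_prob, prior_prob):
    posterior_odds = posterior_prob / (1.0 - posterior_prob)
    prior_odds = prior_prob / (1.0 - prior_prob)
    return posterior_odds / prior_odds

# For example, posterior_prob = 0.9 and prior_prob = 0.5 give
# BF = (0.9 / 0.1) / (0.5 / 0.5) = 9.0, and 2ln(BF) = 2 * ln(9) ~ 4.39.
assert abs(_example_shared_div_bayes_factor(0.9, 0.5) - 9.0) < 1e-9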