def test_long_division(self):
    """Check the (quotient, remainder) pairs from ``functions.long_division``."""
    # Each entry is ((dividend, divisor), expected_result).
    expectations = (
        ((5, 2), (2, 1)),
        ((6, 2), (3, 0)),
        ((-11, 3), (-4, 1)),
    )
    for (dividend, divisor), expected in expectations:
        self.assertEqual(
                functions.long_division(dividend, divisor),
                expected)
def main_cli():
    """Simulate prior samples from an msBayes config and plot each statistic.

    The function:

    1. Parses the command line (``sys.argv``).
    2. Configures logging, then imports the pymsbayes machinery (the imports
       are deliberately placed after the logging level has been set).
    3. Splits ``--num-prior-samples`` across at most ``--np`` workers; the
       last worker also takes the remainder of the division.
    4. Writes the parsed samples to ``prior-sample.txt`` (``.gz`` appended
       when ``--compress`` is given) in the output directory.
    5. For every stat pattern, writes one ``plot-<stat>.pdf`` histogram per
       matching simulated statistic.
    6. Records run settings and timing in ``pymsbayes-info.txt`` and, unless
       ``--keep-temps`` is given, purges the temporary file system.

    Raises:
        SystemExit: raised by argparse on invalid arguments (including
            non-positive ``--np`` / ``--num-prior-samples``), and with
            status 1 when matplotlib is unavailable (the sample file has
            already been written at that point).
        Exception: if a requested stat pattern matches none of the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    # Reject counts that would otherwise surface later as a division by zero
    # (or an empty run) when the samples are split across workers.
    if args.num_prior_samples < 1:
        parser.error('-n/--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize every prefix so that it ends with exactly one '.'.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    # Compare against None so that an explicit `--seed 0` is honoured rather
    # than being silently replaced by a random seed.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker picks up whatever did not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    try:
        for row in dict_line_iter(sample, sep = '\t'):
            out.write(row)
    finally:
        # Release the handle even if a write fails part-way through.
        if close:
            out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        # Do not leave temporary files behind on this early exit.
        if not args.keep_temps:
            log.debug('purging temps...')
            temp_fs.purge()
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        # `.items()` works on both Python 2 and Python 3.
        for stat, values in sample.items():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x = values,
                        normed = True,
                        bins = 20,
                        histtype = 'bar',
                        align = 'mid',
                        orientation = 'vertical',
                        zorder = 0)
                hist = plotting.ScatterPlot(hist_data_list = [hd],
                        right_text = s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = list(hist.ax.get_xticks())
                xtick_labels = list(xticks)
                yticks = list(hist.ax.get_yticks())
                ytick_labels = list(yticks)
                # With many ticks, blank every other label to avoid crowding.
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks = xticks,
                        labels = xtick_labels,
                        horizontalalignment = 'center')
                yticks_obj = plotting.Ticks(ticks = yticks,
                        labels = ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots = [hist],
                        num_columns = 1,
                        label_schema = None,
                        title = stat,
                        title_size = 14.0,
                        title_top = False,
                        y_title = 'Density',
                        y_title_position = 0.001,
                        y_title_size = 14.0,
                        height = 4.0,
                        width = 6.0,
                        auto_height = False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            # Report the pattern text rather than the compiled regex object.
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern.pattern,
                            ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Simulate prior samples and draw a saturation plot of summary statistics.

    The function:

    1. Parses the command line (``sys.argv``).
    2. Configures logging, then imports the pymsbayes machinery (the imports
       are deliberately placed after the logging level has been set).
    3. Loads the config, sets ``div_model_prior`` to ``'constrained'`` and
       replaces ``psi``, and writes the modified config to a temporary file
       that the workers simulate from.
    4. Splits ``--num-prior-samples`` across at most ``--np`` workers; the
       last worker also takes the remainder of the division.
    5. Writes the stats-by-time matrix to ``stats-by-time.txt`` (``.gz``
       appended when ``--compress`` is given) in the output directory.
    6. If matplotlib is available, saves ``saturation-plot.pdf``; otherwise
       logs a warning and carries on.
    7. Records run settings and timing in ``pymsbayes-info.txt`` and, unless
       ``--keep-temps`` is given, purges the temporary file system.

    Raises:
        SystemExit: raised by argparse on invalid arguments (including
            non-positive ``--np`` / ``--num-prior-samples``).
        Exception: if a requested stat prefix is absent from the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    # Reject counts that would otherwise surface later as a division by zero
    # (or an empty run) when the samples are split across workers.
    if args.num_prior_samples < 1:
        parser.error('-n/--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize every prefix so that it ends with exactly one '.'.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    # Compare against None so that an explicit `--seed 0` is honoured rather
    # than being silently replaced by a random seed.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    # Simulate from a modified copy of the config rather than the original.
    # NOTE(review): both distribution arguments are num_taxon_pairs, which
    # presumably pins psi to that single value -- confirm the (min, max)
    # argument order of DiscreteUniformDistribution.
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker picks up whatever did not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list: on Python 3 `keys()` is a view, which has
    # no `remove` method.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    try:
        for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
            out.write(row)
    finally:
        # Release the handle even if a write fails part-way through.
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                    'pi.net': r'$\pi_{net}$',
                    'wattTheta': r'$\theta_W$',
                    'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Simulate prior samples and draw a saturation plot of summary statistics.

    The function:

    1. Parses the command line (``sys.argv``).
    2. Configures logging, then imports the pymsbayes machinery (the imports
       are deliberately placed after the logging level has been set).
    3. Loads the config, sets ``div_model_prior`` to ``'constrained'`` and
       replaces ``psi``, and writes the modified config to a temporary file
       that the workers simulate from.
    4. Splits ``--num-prior-samples`` across at most ``--np`` workers; the
       last worker also takes the remainder of the division.
    5. Writes the stats-by-time matrix to ``stats-by-time.txt`` (``.gz``
       appended when ``--compress`` is given) in the output directory.
    6. If matplotlib is available, saves ``saturation-plot.pdf``; otherwise
       logs a warning and carries on.
    7. Records run settings and timing in ``pymsbayes-info.txt`` and, unless
       ``--keep-temps`` is given, purges the temporary file system.

    Raises:
        SystemExit: raised by argparse on invalid arguments (including
            non-positive ``--np`` / ``--num-prior-samples``).
        Exception: if a requested stat prefix is absent from the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    # Reject counts that would otherwise surface later as a division by zero
    # (or an empty run) when the samples are split across workers.
    if args.num_prior_samples < 1:
        parser.error('-n/--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize every prefix so that it ends with exactly one '.'.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    # Compare against None so that an explicit `--seed 0` is honoured rather
    # than being silently replaced by a random seed.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    # Simulate from a modified copy of the config rather than the original.
    # NOTE(review): both distribution arguments are num_taxon_pairs, which
    # presumably pins psi to that single value -- confirm the (min, max)
    # argument order of DiscreteUniformDistribution.
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write(
        '\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
        log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker picks up whatever did not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list: on Python 3 `keys()` is a view, which has
    # no `remove` method.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    try:
        for row in dict_line_iter(stats_by_time, sep='\t', header=header):
            out.write(row)
    finally:
        # Release the handle even if a write fails part-way through.
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli(argv = sys.argv): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser(description = description, formatter_class = argparse_utils.SmartHelpFormatter) parser.add_argument('-o', '--observed-configs', nargs = '+', type = argparse_utils.arg_is_config, required = True, help = ('One or more msBayes config files to be used to either ' 'calculate or simulate observed summary statistics. If ' 'used in combination with `-r` each config will be used to ' 'simulate pseudo-observed data. If analyzing real data, do ' 'not use the `-r` option, and the fasta files specified ' 'within the config must exist and contain the sequence ' 'data.')) parser.add_argument('-p', '--prior-configs', nargs = '+', type = argparse_utils.arg_is_path, required = True, help = ('One or more config files to be used to generate prior ' 'samples. If more than one config is specified, they ' 'should be separated by spaces. ' 'This option can also be used to specify the path to a ' 'directory containing the prior samples and summary ' 'statistic means and standard deviations generated by a ' 'previous run using the `generate-samples-only` option. ' 'These files should be found in the directory ' '`pymsbayes-output/prior-stats-summaries`. The' '`pymsbayes-output/model-key.txt` also needs to be present.' ' If specifying this directory, it should be the only ' 'argument (i.e., no other directories or config files can ' 'be provided).')) parser.add_argument('-r', '--reps', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('This option has two effects. First, it signifies that ' 'the analysis will be simulation based (i.e., no real ' 'data will be used). 
Second, it specifies how many ' 'simulation replicates to perform (i.e., how many data ' 'sets to simulate and analyze).')) parser.add_argument('-n', '--num-prior-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 1000000, help = ('The number of prior samples to simulate for each prior ' 'config specified with `-p`.')) parser.add_argument('--prior-batch-size', action = 'store', type = argparse_utils.arg_is_positive_int, default = 10000, help = ('The number of prior samples to simulate for each batch.')) parser.add_argument('--generate-samples-only', action = 'store_true', help = ('Only generate samples from models as requested. I.e., ' 'No analyses are performed to approximate posteriors. ' 'This option can be useful if you want the prior samples ' 'for other purposes.')) parser.add_argument('--num-posterior-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 1000, help = ('The number of posterior samples desired for each ' 'analysis. Default: 1000.')) parser.add_argument('--num-standardizing-samples', action = 'store', type = argparse_utils.arg_is_positive_int, default = 10000, help = ('The number of prior samples desired to use for ' 'standardizing statistics. Default: 10000.')) parser.add_argument('--np', action = 'store', type = argparse_utils.arg_is_positive_int, default = multiprocessing.cpu_count(), help = ('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument('--output-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument('--temp-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('A directory to temporarily stage files. 
The default is to ' 'use the output directory.')) parser.add_argument('--staging-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('A directory to temporarily stage prior files. This option ' 'can be useful on clusters to speed up I/O while ' 'generating prior samples. You can designate a local temp ' 'directory on a compute node to avoid constant writing to ' 'a shared drive. The default is to use the `temp-dir`.')) parser.add_argument('-s', '--stat-prefixes', nargs = '*', type = str, help = ('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi wattTheta pi.net tajD.denom`.')) parser.add_argument('-b', '--bandwidth', action = 'store', type = float, help = ('Smoothing parameter for the posterior kernal density ' 'estimation. This option is used for the `glm` ' 'regression method. The default is 2 / ' '`num-posterior-samples`.')) parser.add_argument('-q', '--num-posterior-quantiles', action = 'store', type = argparse_utils.arg_is_positive_int, default = 1000, help = ('The number of equally spaced quantiles at which to ' 'evaluate the GLM-estimated posterior density. ' 'Default: 1000.')) parser.add_argument('--reporting-frequency', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('Suggested frequency (in number of prior samples) for ' 'running regression and reporting current results. ' 'Default: 0 (only report final results). ' 'If a value is given, it may be adjusted so that the ' 'reporting frequency is a multiple of the multi-processed ' 'batch size.')) parser.add_argument('--sort-index', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, choices = range(12), help = argparse_utils.get_sort_index_help_message()) parser.add_argument('--no-global-estimate', action = 'store_true', help = ('If multiple prior models are specified, by default a ' 'global estimate is performed averaging over all models. 
' 'This option prevents the global estimation (i.e., only ' 'inferences for each model are made).')) parser.add_argument('--compress', action = 'store_true', help = 'Compress large results files.') parser.add_argument('--keep-temps', action = 'store_true', help = 'Keep all temporary files.') parser.add_argument('--seed', action = 'store', type = int, help = 'Random number seed to use for the analysis.') parser.add_argument('--output-prefix', action = 'store', type = str, default = '', help = ('Prefix to use at beginning of output files. The default ' 'is no prefix.')) parser.add_argument('--data-key-path', action = 'store', type = argparse_utils.arg_is_file, help = ('The path to a `data-key.txt` file generated by a previous ' 'run. This file should be found in the directory ' '`pymsbayes-output/data-key.txt`. This option ' 'will override the `-o`/`--observed-configs` option, and ' 'is intended to be used in combination with the ' '`--start-from` option to restart an analysis.')) parser.add_argument('--start-from-simulation-index', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('The simulation index at which to begin analyses. Must be ' 'used in combination with either the number of simulation ' 'replicates (`-r`/`--reps`) or the `--data-key-path` ' 'option, and must be a positive ' 'integer that is less than the number of simulation ' 'replicates. This option can be useful if an analysis ' 'needs to be restarted.')) parser.add_argument('--start-from-observed-index', action = 'store', type = argparse_utils.arg_is_nonnegative_int, default = 0, help = ('The observed config index at which to begin analyses. 
' 'Can be used in combination with the `--data-key-path` ' 'option to restart long-running, multi-observed-config ' 'analyses')) parser.add_argument('--dry-run', action = 'store_true', help = 'Do not run analyses; only process settings') parser.add_argument('--version', action = 'version', version = '%(prog)s ' + _program_info['version'], help = 'Report version and exit.') parser.add_argument('--quiet', action = 'store_true', help = 'Run without verbose messaging.') parser.add_argument('--debug', action = 'store_true', help = 'Run in debugging mode.') if argv == sys.argv: args = parser.parse_args() else: args = parser.parse_args(argv) ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import (MsBayesWorker, merge_prior_files, ObsSumStatsWorker) from pymsbayes.teams import ABCTeam from pymsbayes.utils.functions import (is_file, is_dir, long_division, mk_new_dir) from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS, PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS, line_count) from pymsbayes.utils import sumresults, errors from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.config import MsBayesConfig from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace, MSBAYES_SORT_INDEX, ToolPathManager) MSBAYES_SORT_INDEX.set_index(args.sort_index) if len(args.observed_configs) != len(set(args.observed_configs)): raise ValueError('All paths to observed config files must be unique') if args.num_standardizing_samples > args.num_prior_samples: args.num_standardizing_samples = args.num_prior_samples # get full paths to tools 
msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') eureject_path = ToolPathManager.get_tool_full_path('eureject') abctb_path = ToolPathManager.get_tool_full_path('ABCestimator') # vet prior-configs option using_previous_priors = False previous_prior_dir = None if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])): previous_prior_dir = args.prior_configs.pop(0) previous_priors = glob.glob(os.path.join(previous_prior_dir, '*-prior-sample.txt')) previous_sums = glob.glob(os.path.join(previous_prior_dir, '*-means-and-std-devs.txt')) if (not previous_priors) or (not previous_sums): raise ValueError('directory {0!r} specified with `prior-configs` ' 'option does not contain necessary prior and summary ' 'files'.format(args.prior_configs[0])) using_previous_priors = True else: for path in args.prior_configs: if not is_file(path): raise ValueError('prior config {0!r} is not a file'.format( path)) if len(args.prior_configs) != len(set(args.prior_configs)): raise ValueError('All paths to prior config files must be unique') if not args.output_dir: args.output_dir = os.path.dirname(args.observed_configs[0]) base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results')) if not args.temp_dir: args.temp_dir = base_dir info_path = os.path.join(base_dir, args.output_prefix + \ 'pymsbayes-info.txt') info = InfoLogger(info_path) info.write('[pymsbayes]'.format(base_dir)) info.write('\tversion = {version}'.format(**_program_info)) info.write('\toutput_directory = {0}'.format(base_dir)) temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') base_temp_dir = temp_fs.base_dir info.write('\ttemp_directory = {0}'.format(base_temp_dir)) info.write('\tsort_index = {0}'.format( MSBAYES_SORT_INDEX.current_value())) info.write('\tsimulation_reps = {0}'.format(args.reps)) stat_patterns = DEFAULT_STAT_PATTERNS if args.stat_prefixes: for i in range(len(args.stat_prefixes)): 
if not args.stat_prefixes[i].endswith('.'): args.stat_prefixes[i] += '.' stat_patterns = get_patterns_from_prefixes( args.stat_prefixes, ignore_case=True) if not args.bandwidth: args.bandwidth = 2 / float(args.num_posterior_samples) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if args.data_key_path: observed_map = sumresults.parse_data_key_file(args.data_key_path) observed_paths = [observed_map[k] for k in sorted(observed_map.keys())] else: observed_dir = mk_new_dir(os.path.join(base_dir, 'observed-summary-stats')) observed_paths = [os.path.join(observed_dir, args.output_prefix + \ 'observed-{0}.txt'.format(i+1)) for i in range(len( args.observed_configs))] info.write('\tseed = {0}'.format(args.seed)) info.write('\tnum_processors = {0}'.format(args.np)) info.write('\tnum_prior_samples = {0}'.format( args.num_prior_samples)) info.write('\tnum_standardizing_samples = {0}'.format( args.num_standardizing_samples)) info.write('\tbandwidth = {0}'.format(args.bandwidth)) info.write('\tposterior_quantiles = {0}'.format( args.num_posterior_quantiles)) info.write('\tposterior_sample_size = {0}'.format( args.num_posterior_samples)) info.write('\tstat_patterns = {0}'.format( ', '.join([p.pattern for p in stat_patterns]))) # vet observed configs ref_config_path = args.observed_configs[0] ref_config = MsBayesConfig(ref_config_path) all_config_paths = [] num_taxon_pairs = ref_config.npairs assert num_taxon_pairs > 0 for config in args.observed_configs: all_config_paths.append(config) if not ref_config.equal_sample_table(config): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, config)) info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs)) info.write('\tdry_run = {0}'.format(args.dry_run)) info.write('\t[[tool_paths]]') info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path)) 
info.write('\t\tmsbayes = {0}'.format(msbayes_path)) info.write('\t\teureject = {0}'.format(eureject_path)) info.write('\t\tabcestimator = {0}'.format(abctb_path)) info.write('\t[[observed_configs]]') for i, cfg in enumerate(args.observed_configs): info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg, os.path.dirname(info_path)))) abc_team = ABCTeam( temp_fs = temp_fs, observed_stats_files = observed_paths, num_taxon_pairs = num_taxon_pairs, config_paths = args.prior_configs, previous_prior_dir = previous_prior_dir, num_prior_samples = args.num_prior_samples, num_processors = args.np, num_standardizing_samples = args.num_standardizing_samples, num_posterior_samples = args.num_posterior_samples, num_posterior_density_quantiles = args.num_posterior_quantiles, batch_size = args.prior_batch_size, output_dir = base_dir, output_prefix = args.output_prefix, prior_temp_dir = args.staging_dir, rng = GLOBAL_RNG, report_parameters = True, stat_patterns = stat_patterns, eureject_exe_path = eureject_path, abctoolbox_exe_path = abctb_path, msbayes_exe_path = None, abctoolbox_bandwidth = args.bandwidth, omega_threshold = 0.01, cv_threshold = 0.01, compress = args.compress, reporting_frequency = args.reporting_frequency, keep_temps = args.keep_temps, global_estimate_only = False, global_estimate = not args.no_global_estimate, generate_prior_samples_only = args.generate_samples_only, start_from_simulation_index = args.start_from_simulation_index, start_from_observed_index = args.start_from_observed_index) models_to_configs = {} configs_to_models = {} for k, v in abc_team.models.iteritems(): models_to_configs[k] = v configs_to_models[v] = k cfg = MsBayesConfig(v) all_config_paths.append(v) # vet prior configs if not ref_config.equal_sample_table(cfg): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, v)) info.write('\t[[observed_paths]]') for 
i in sorted(abc_team.observed_stats_paths.iterkeys()): info.write('\t\t{0} = {1}'.format(i, os.path.relpath( abc_team.observed_stats_paths[i], os.path.dirname(info_path)))) info.write('\t[[prior_configs]]') for i in sorted(abc_team.models.iterkeys()): info.write('\t\t{0} = {1}'.format(i, os.path.relpath( abc_team.models[i], os.path.dirname(info_path)))) ########################################################################## ## begin analysis --- get observed summary stats set_memory_trace() # start logging memory profile start_time = datetime.datetime.now() if args.data_key_path: log.info('Using provided summary statitics...') elif not args.dry_run: obs_temp_dir = base_temp_dir if args.staging_dir: obs_temp_dir = args.staging_dir observed_temp_fs = TempFileSystem(parent = obs_temp_dir, prefix = 'observed-temps-') if args.reps < 1: log.info('Calculating summary statistics from sequence data...') obs_workers = [] for i, cfg in enumerate(args.observed_configs): ss_worker = ObsSumStatsWorker( temp_fs = observed_temp_fs, config_path = cfg, output_path = observed_paths[i], schema = 'abctoolbox', stat_patterns = stat_patterns) obs_workers.append(ss_worker) obs_workers = Manager.run_workers( workers = obs_workers, num_processors = args.np) # re-vet all configs to see if some were changed by obsSumStats.pl new_ref_config = ref_config ref_modified = False # new ref because if all configs were updated all is good if not ref_config.equal_sample_table(ref_config_path): ref_modified = True new_ref_config = MsBayesConfig(ref_config_path) log.warning(""" The alignment lengths in config {0!r} have been corrected for sites with *any* ambiguous bases and/or gaps by obsSumStats.pl. 
""".format(ref_config_path)) for config in all_config_paths: if not new_ref_config.equal_sample_table(config): corrected_config = config if ref_modified: corrected_config = ref_config_path if not args.keep_temps: observed_temp_fs.purge() temp_fs.purge() raise errors.SampleTableError(""" The sample tables in configs {0!r} and {1!r} differ because obsSumStats.pl modified alignment lengths in config {2!r} to correct for sites in the alignments with *any* ambiguous bases and/or gaps. Please make sure the sample tables in all configs will be the same after correcting alignment lengths for sites that contain *any* ambiguous bases and/or gaps. You can do this by copying and pasting the sample table in {2!r} that has been corrected by obsSumStats.pl into the other configs that were not corrected. """.format(ref_config_path, config, corrected_config)) else: log.info('Simulating summary statistics from observed configs...') num_observed_workers = min([args.reps, args.np]) if args.reps <= args.np: observed_batch_size = 1 remainder = 0 else: observed_batch_size, remainder = long_division(args.reps, args.np) msbayes_workers = [] for idx, cfg in enumerate(args.observed_configs): observed_model_idx = configs_to_models.get(cfg, None) schema = 'abctoolbox' for i in range(num_observed_workers): worker = MsBayesWorker( temp_fs = observed_temp_fs, sample_size = observed_batch_size, config_path = cfg, model_index = observed_model_idx, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False, staging_dir = None, tag = idx) msbayes_workers.append(worker) if remainder > 0: worker = MsBayesWorker( temp_fs = observed_temp_fs, sample_size = remainder, config_path = cfg, model_index = observed_model_idx, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False, staging_dir = None, tag = idx) msbayes_workers.append(worker) # run parallel msbayes processes 
msbayes_workers = Manager.run_workers( workers = msbayes_workers, num_processors = args.np) workers = dict(zip(range(len(args.observed_configs)), [[] for i in range(len(args.observed_configs))])) for w in msbayes_workers: workers[w.tag].append(w) # merge simulated observed data into one file for i in range(len(args.observed_configs)): merge_prior_files([w.prior_path for w in workers[i]], observed_paths[i]) lc = line_count(observed_paths[i], ignore_headers=True) if lc != args.reps: if not args.keep_temps: temp_fs.purge() raise Exception('The number of observed simulations ({0}) ' 'generated for observed config {1!r} and output to ' 'file {2!r} does not match the number of reps ' '({3})'.format(lc, args.observed_configs[i], observed_paths[i], args.reps)) if not args.keep_temps: log.debug('purging observed temps...') observed_temp_fs.purge() ########################################################################## ## Begin ABC analyses if not args.dry_run: abc_team.run() stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()
def main_cli(argv=sys.argv):
    """Command-line entry point: parse options, then run the ABC analyses.

    The function (1) builds the argument parser, (2) vets the observed and
    prior config options, (3) writes the run settings to a
    `pymsbayes-info.txt` file inside a new `pymsbayes-results` directory,
    (4) obtains observed summary statistics -- either calculated from
    sequence data (`--reps` < 1), simulated from the observed configs
    (`--reps` >= 1), or read from a previous run's data key
    (`--data-key-path`) -- and (5) runs the `ABCTeam` unless `--dry-run`
    was given.

    Parameters:
        argv: Argument list to parse. When it compares equal to
            `sys.argv` (the default), `parser.parse_args()` is called
            with no arguments; otherwise `argv` is handed to
            `parse_args` as-is.

    Raises:
        ValueError: If observed or prior config paths are not unique, a
            prior config is not a file, or a directory given to
            `--prior-configs` lacks the prior-sample / summary files.
        errors.SampleTableError: If the sample tables of the configs
            differ (before or after the observed summary statistics are
            calculated).
        Exception: If the number of simulated observed data sets written
            for a config does not match `--reps`.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o', '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p', '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The'
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r', '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested. I.e., '
              'No analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument(
        '--num-standardizing-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples desired to use for '
              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b', '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernal density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
    parser.add_argument(
        '-q', '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument(
        '--sort-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        choices=range(12),
        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress large results files.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file should be found in the directory '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from` option to restart an analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a positive '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses'))
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Do not run analyses; only process settings')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    # With the default argv, let argparse read sys.argv itself; an explicit
    # list is passed through unchanged.
    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    # The logging level is set before the remaining pymsbayes imports below.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    # Cannot standardize with more samples than will be simulated.
    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        # A single directory means "reuse the priors of a previous run";
        # it is removed from the list of prior configs.
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(
            os.path.join(previous_prior_dir, '*-prior-sample.txt'))
        previous_sums = glob.glob(
            os.path.join(previous_prior_dir, '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            # Report the popped directory; `args.prior_configs` is empty
            # at this point, so indexing it would raise IndexError.
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(previous_prior_dir))
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError(
                    'prior config {0!r} is not a file'.format(path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir,
                             args.output_prefix + 'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))

    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        # Every prefix is made to end with '.' before patterns are built.
        args.stat_prefixes = [
            prefix if prefix.endswith('.') else prefix + '.'
            for prefix in args.stat_prefixes]
        stat_patterns = get_patterns_from_prefixes(args.stat_prefixes,
                                                   ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    # `is None` (not truthiness) so that an explicit `--seed 0` is honored.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)

    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(
            os.path.join(base_dir, 'observed-summary-stats'))
        observed_paths = [
            os.path.join(observed_dir,
                         args.output_prefix +
                         'observed-{0}.txt'.format(i + 1))
            for i in range(len(args.observed_configs))]

    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(', '.join(
        [p.pattern for p in stat_patterns])))

    # vet observed configs
    # The first observed config is the reference every other config's
    # sample table is compared against.
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, os.path.dirname(info_path))))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    # Map each prior config path to its model index, and vet the prior
    # configs against the reference sample table.
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.observed_stats_paths[i],
                            os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.models[i],
                            os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                    temp_fs=observed_temp_fs,
                    config_path=cfg,
                    output_path=observed_paths[i],
                    schema='abctoolbox',
                    stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(workers=obs_workers,
                                              num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))
        else:
            log.info('Simulating summary statistics from observed configs...')
            # Split the reps over at most `np` workers per config; any
            # remainder goes to one extra worker.
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(
                    args.reps, args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(
                        temp_fs=observed_temp_fs,
                        sample_size=observed_batch_size,
                        config_path=cfg,
                        model_index=observed_model_idx,
                        report_parameters=True,
                        schema=schema,
                        include_header=True,
                        stat_patterns=stat_patterns,
                        write_stats_file=False,
                        staging_dir=None,
                        tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(
                        temp_fs=observed_temp_fs,
                        sample_size=remainder,
                        config_path=cfg,
                        model_index=observed_model_idx,
                        report_parameters=True,
                        schema=schema,
                        include_header=True,
                        stat_patterns=stat_patterns,
                        write_stats_file=False,
                        staging_dir=None,
                        tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(workers=msbayes_workers,
                                                  num_processors=args.np)

            # Group finished workers by the observed-config index stored in
            # their tag.
            workers = dict(
                zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        # Purge the observed temps too; with
                        # `--staging-dir` they are created outside of
                        # `temp_fs`'s parent directory.
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, args.observed_configs[i],
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """Command-line entry point: simulate prior samples and plot them.

    Parses the command line, simulates ``--num-prior-samples`` samples from
    the given config using up to ``--np`` ``MsBayesWorker`` instances, writes
    the combined samples to ``prior-sample.txt`` (``.gz`` is appended with
    ``--compress``) in the output directory, and saves one
    ``plot-<stat>.pdf`` histogram for every simulated statistic whose name
    matches one of the requested stat patterns.  Run settings and timings are
    passed to an ``InfoLogger`` created for ``pymsbayes-info.txt``.

    Exits with status 1 (after the sample file has been written) when
    matplotlib is not available.  Exits via ``parser.error`` when ``--np`` or
    ``--num-prior-samples`` is less than 1.

    Raises:
        Exception: if a requested stat pattern matches none of the simulated
            statistics.

    Temporary files are purged on every exit path unless ``--keep-temps`` is
    given.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c', '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n', '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    # The default below is os.path.dirname(args.config); the help text used
    # to refer to a "first observed config file" that this tool does not have.
    parser.add_argument(
        '-o', '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s', '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--sort-index',
        action='store',
        type=int,
        default=0,
        choices=range(12),
        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--compress',
        action='store_true',
        help='Compress plot data file.')
    parser.add_argument(
        '--keep-temps',
        action='store_true',
        help='Keep all temporary files.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--version',
        action='version',
        version='%(prog)s ' + _program_info['version'],
        help='Report version and exit.')
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    # Reject values that would otherwise surface later as a division by zero
    # (or as zero workers) when the samples are split across processes.
    if args.num_prior_samples < 1:
        parser.error('--num-prior-samples must be a positive integer')
    if args.np < 1:
        parser.error('--np must be a positive integer')

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    # NOTE(review): the remaining project imports are kept after the logging
    # level is set, as in the original; presumably so module-level loggers
    # pick up the requested level -- confirm before moving them.
    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, MSBAYES_SORT_INDEX,
                                 ToolPathManager)
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    def _thin_labels(ticks):
        # Copy the tick values to use as labels; when there are 8 or more,
        # blank every second label (indices 1, 3, 5, ...).
        labels = list(ticks)
        if len(labels) >= 8:
            for idx in range(1, len(labels), 2):
                labels[idx] = ''
        return labels

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')

    # Everything after the temp file system exists runs under try/finally so
    # the temporary files are purged on every exit path (normal completion,
    # sys.exit, or an exception) unless the user asked to keep them.
    try:
        # Normalise the prefixes so each pattern is built from exactly one
        # trailing '.', whether or not the user typed it.
        args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
        stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)

        # Only draw a random seed when none was supplied; an explicit
        # `--seed 0` is a valid request and must be honoured.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)

        compress_level = None
        if args.compress:
            compress_level = 9

        cfg = MsBayesConfig(args.config)
        num_taxon_pairs = cfg.npairs

        info.write('[pymsbayes]', log.info)
        info.write('\tprogram_name = {name}'.format(**_program_info),
                   log.info)
        info.write('\tversion = {version}'.format(**_program_info), log.info)
        info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)),
                   log.info)
        info.write('\toutput_directory = {0!r}'.format(args.output_dir),
                   log.info)
        info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir),
                   log.info)
        info.write(
            '\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
            log.info)
        info.write(
            '\tstat_patterns = {0!r}'.format(
                ', '.join([p.pattern for p in stat_patterns])),
            log.info)
        info.write('\tseed = {0}'.format(args.seed), log.info)
        info.write(
            '\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
        info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
        info.write('\t[[tool_paths]]', log.info)
        info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path),
                   log.info)
        info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

        info.write('\t[[config]]', log.debug)
        info.write('{0}'.format(str(cfg)), log.debug)

        ######################################################################
        ## begin analysis --- generate samples

        start_time = datetime.datetime.now()

        # Never start more workers than there are samples to simulate.
        if args.np > args.num_prior_samples:
            args.np = args.num_prior_samples
        batch_size, remainder = long_division(args.num_prior_samples,
                                              args.np)
        schema = 'abctoolbox'
        workers = []
        for i in range(args.np):
            sample_size = batch_size
            # The last worker also simulates the leftover samples.
            if i == (args.np - 1):
                sample_size += remainder
            w = MsBayesWorker(
                temp_fs=temp_fs,
                sample_size=sample_size,
                config_path=args.config,
                report_parameters=True,
                schema=schema,
                include_header=True,
                stat_patterns=stat_patterns,
                write_stats_file=False)
            workers.append(w)

        log.info('Generating samples...')
        workers = Manager.run_workers(
            workers=workers,
            num_processors=args.np)
        log.info('Parsing samples...')
        sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

        log.info('Writing prior samples...')
        out, close = process_file_arg(sample_path, 'w',
                                      compresslevel=compress_level)
        try:
            for row in dict_line_iter(sample, sep='\t'):
                out.write(row)
        finally:
            # Close the handle even if a write fails part-way through.
            if close:
                out.close()

        log.info('Creating plots...')

        if not plotting.MATPLOTLIB_AVAILABLE:
            log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
            sys.exit(1)

        for stat_pattern in stat_patterns:
            found = False
            # `.items()` works on both Python 2 and 3 (`iteritems` is
            # Python 2 only).
            for stat, values in sample.items():
                if not stat_pattern.match(stat):
                    continue
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                                         'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'],
                    summary['qi_95'][0],
                    summary['qi_95'][1])
                hd = plotting.HistData(
                    x=values,
                    normed=True,
                    bins=20,
                    histtype='bar',
                    align='mid',
                    orientation='vertical',
                    zorder=0)
                hist = plotting.ScatterPlot(
                    hist_data_list=[hd],
                    right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = list(hist.ax.get_xticks())
                xtick_labels = _thin_labels(xticks)
                yticks = list(hist.ax.get_yticks())
                ytick_labels = _thin_labels(yticks)
                xticks_obj = plotting.Ticks(
                    ticks=xticks,
                    labels=xtick_labels,
                    horizontalalignment='center')
                yticks_obj = plotting.Ticks(
                    ticks=yticks,
                    labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(
                    subplots=[hist],
                    num_columns=1,
                    label_schema=None,
                    title=stat,
                    title_size=14.0,
                    title_top=False,
                    y_title='Density',
                    y_title_position=0.001,
                    y_title_size=14.0,
                    height=4.0,
                    width=6.0,
                    auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

            if not found:
                raise Exception(
                    'stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

        stop_time = datetime.datetime.now()
        log.info('Done!')
        info.write('\t[[run_stats]]', log.info)
        info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
        info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
        info.write(
            '\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)
    finally:
        if not args.keep_temps:
            log.debug('purging temps...')
            temp_fs.purge()