def set_up(self):
     self.temp_fs = TempFileSystem(parent=package_paths.test_path(),
                                   prefix='PyMsBayesTestTemp-')
     self.test_id = 'pymsbayes-' + random_str()
Exemplo n.º 2
0
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                   'pi.net': r'$\pi_{net}$',
                   'wattTheta': r'$\theta_W$',
                   'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
class PyMsBayesTestCase(unittest.TestCase):
    def set_up(self):
        self.temp_fs = TempFileSystem(parent=package_paths.test_path(),
                                      prefix='PyMsBayesTestTemp-')
        self.test_id = 'pymsbayes-' + random_str()

    def tear_down(self):
        self.register_file_system()
        self.temp_fs.purge()
        self.assertEqual(FileStream.open_files, set())

    def get_test_path(self, parent=None, prefix='temp'):
        return self.temp_fs.get_file_path(parent=parent, prefix=prefix)

    def get_test_subdir(self, parent=None, prefix='temp'):
        return self.temp_fs.create_subdir(parent=parent, prefix=prefix)

    def register_file(self, path):
        self.temp_fs._register_file(path)

    def register_dir(self, path):
        self.temp_fs._register_dir(path)

    def register_file_system(self):
        _LOG.debug('registering test file system...')
        for path, dirs, files, in os.walk(self.temp_fs.base_dir):
            for f in files:
                if f.startswith(self.test_id):
                    self.register_file(os.path.join(path, f))
            for d in dirs:
                if d.startswith(self.test_id):
                    self.register_dir(os.path.join(path, d))

    def _exe_script(self,
                    script_name,
                    args,
                    stdout=None,
                    stderr=None,
                    return_code=0):
        script_path = package_paths.script_path(script_name)
        if isinstance(args, str):
            arg_list = args.split()
        else:
            arg_list = args
        arg_list = [str(x) for x in arg_list]
        cmd = [sys.executable, script_path] + arg_list
        _LOG.debug('Invocation:\n\t{0}'.format(' '.join(cmd)))
        p = subprocess.Popen(cmd,
                             shell=False,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        o, e = p.communicate()
        exit_code = p.wait()
        if exit_code != return_code:
            _LOG.error("exit code {0} did not match {1}".format(
                exit_code, return_code))
            _LOG.error("here is the stdout:\n{0}".format(o))
            _LOG.error("here is the stderr:\n{0}".format(e))
        self.assertEqual(exit_code, return_code)
        if stdout != None:
            if o != stdout:
                _LOG.error("std out did not match expected:\n{0}".format(o))
            self.assertEqual(o, stdout)
        if stderr != None:
            if e != stderr:
                _LOG.error("std error did not match expected:\n{0}".format(e))
            self.assertEqual(e, stderr)

    def get_expected_indices(self,
                             num_pairs,
                             dummy_column=True,
                             parameters_reported=True):
        num_summary_params = 4
        if _CV_INCLUDED:
            num_summary_params += 1
        num_params = 4 * num_pairs
        num_default_stats = 4 * num_pairs
        start = 0
        if dummy_column:
            start = 1
        param_indices = range(start, start + num_summary_params)
        start += num_summary_params
        if parameters_reported:
            param_indices += range(start, start + num_params)
            start += num_params
        stat_indices = range(start, start + num_default_stats)
        return param_indices, stat_indices

    def prior_file_is_valid(self,
                            prior_path,
                            num_of_samples,
                            num_of_columns=None):
        try:
            prior_file = open(prior_path, 'rU')
        except:
            _LOG.error('prior invalid: could not open prior path {0}'.format(
                prior_path))
            return False
        nrows = 0
        for i, line in enumerate(prior_file):
            if nrows == 0 and HEADER_PATTERN.match(line):
                pass
            else:
                nrows += 1
            if not num_of_columns:
                num_of_columns = len(line.strip().split())
            ncols = len(line.strip().split())
            if num_of_columns != ncols:
                _LOG.error('prior invalid: num of columns at line {0} is {1} '
                           'NOT {2}'.format(i + 1, ncols, num_of_columns))
                return False
        prior_file.close()
        if num_of_samples != nrows:
            _LOG.error('prior invalid: num of rows is {0} NOT {1}'.format(
                nrows, num_of_samples))
            return False
        return True

    def get_number_of_lines(self, path):
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            count += 1
        if close:
            f.close()
        return count

    def get_number_of_header_lines(self, path):
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            if HEADER_PATTERN.match(l.strip()):
                count += 1
        if close:
            f.close()
        return count

    def parse_python_config(self, path):
        return ConfigObj(path)

    def get_config_from_msbayes_workers(self, msbayes_workers):
        cfgs = [MsBayesConfig(w.config_path) for w in msbayes_workers]
        self.assertSameConfigs(cfgs)
        return cfgs[0]

    def assertSameConfigs(self, cfgs):
        configs = list(cfgs)
        c1 = configs.pop(0)
        for c2 in cfgs:
            self.assertEqual(c1.time_in_subs_per_site,
                             c2.time_in_subs_per_site)
            self.assertEqual(c1.npairs, c2.npairs)
            self.assertEqual(c1.implementation, c2.implementation)
            self.assertEqual(c1.div_model_prior, c2.div_model_prior)
            self.assertEqual(c1.bottle_proportion_shared,
                             c2.bottle_proportion_shared)
            self.assertEqual(c1.theta_parameters, c2.theta_parameters)
            self.assertEqual(c1.taxa, c2.taxa)
            self.assertTrue(c1.sample_table.equals(c2.sample_table))
            if c1.psi:
                self.assertSameDistributions(c1.psi, c2.psi)
            else:
                self.assertEqual(c1.psi, c2.psi)
            if c1.tau:
                self.assertSameDistributions(c1.tau, c2.tau)
            else:
                self.assertEqual(c1.tau, c2.tau)
            if c1.theta:
                self.assertSameDistributions(c1.theta, c2.theta)
            else:
                self.assertEqual(c1.theta, c2.theta)
            if c1.a_theta:
                self.assertSameDistributions(c1.a_theta, c2.a_theta)
            else:
                self.assertEqual(c1.a_theta, c2.a_theta)
            if c1.d_theta:
                self.assertSameDistributions(c1.d_theta, c2.d_theta)
            else:
                self.assertEqual(c1.d_theta, c2.d_theta)
            if c1.recombination:
                self.assertSameDistributions(c1.recombination,
                                             c2.recombination)
            else:
                self.assertEqual(c1.recombination, c2.recombination)
            if c1.migration:
                self.assertSameDistributions(c1.migration, c2.migration)
            else:
                self.assertEqual(c1.migration, c2.migration)
            if c1.dpp_concentration:
                self.assertSameDistributions(c1.dpp_concentration,
                                             c2.dpp_concentration)
            else:
                self.assertEqual(c1.dpp_concentration, c2.dpp_concentration)
            if c1.bottle_proportion:
                self.assertSameDistributions(c1.bottle_proportion,
                                             c2.bottle_proportion)
            else:
                self.assertEqual(c1.bottle_proportion, c2.bottle_proportion)

    def get_parameter_summaries_from_msbayes_workers(self,
                                                     msbayes_workers,
                                                     shuffle_taus=True):
        msbayes_workers = list(msbayes_workers)
        s = dict(
            zip([i for i in msbayes_workers[0].parameter_indices], [
                SampleSummarizer(tag=msbayes_workers[0].header[i])
                for i in msbayes_workers[0].parameter_indices
            ]))
        ncols = None
        header = msbayes_workers[0].header
        pi = msbayes_workers[0].parameter_indices
        for w in msbayes_workers:
            self.assertEqual(w.header, header)
            self.assertEqual(w.parameter_indices, pi)
            f = open(w.prior_path, 'rU')
            for line_idx, row in enumerate(f):
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus:  # because taus are sorted in prior files
                    psi_index = get_indices_of_patterns(
                        w.header, PSI_PATTERNS)[0]
                    tau_indices = get_indices_of_patterns(
                        w.header, TAU_PATTERNS)
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    p_set = set(w.parameter_indices) - set(tau_indices)
                    p = sorted(list(p_set))
                    for i in p:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
            f.close()
        return s

    def assertPriorIsPrecise(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        param_sums = self.get_parameter_summaries_from_msbayes_workers(
            msbayes_workers)
        sample_size = 0
        for w in msbayes_workers:
            sample_size += w.sample_size
        for s in param_sums.itervalues():
            self.assertEqual(s.n, sample_size)
        cfg = self.get_config_from_msbayes_workers(msbayes_workers)
        psi_indices = get_indices_of_patterns(msbayes_workers[0].header,
                                              PSI_PATTERNS)
        self.assertEqual(len(psi_indices), 1)
        model_indices = get_indices_of_patterns(msbayes_workers[0].header,
                                                MODEL_PATTERNS)
        if not msbayes_workers[0].model_index is None:
            self.assertEqual(len(model_indices), 1)
        else:
            self.assertEqual(len(model_indices), 0)
        tau_indices = get_indices_of_patterns(msbayes_workers[0].header,
                                              TAU_PATTERNS)
        a_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                                                  A_THETA_PATTERNS)
        d_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                                                  D_THETA_PATTERNS)
        if msbayes_workers[0].report_parameters:
            self.assertEqual(len(tau_indices), cfg.npairs)
            self.assertEqual(len(a_theta_indices), cfg.npairs)
            self.assertEqual(len(d_theta_indices), 2 * cfg.npairs)
        else:
            self.assertEqual(len(tau_indices), 0)
            self.assertEqual(len(a_theta_indices), 0)
            self.assertEqual(len(d_theta_indices), 0)
        _LOG.debug('\n{0}\n'.format('\n'.join(
            [str(param_sums[i]) for i in sorted(param_sums.iterkeys())])))
        for i in psi_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.psi,
                                                places=places)
        for i in tau_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.tau,
                                                places=places)
        for i in a_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.a_theta,
                                                places=places)
        for i in d_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.d_theta,
                                                mean_adj=cfg.theta.mean,
                                                max_adj=cfg.theta.maximum,
                                                compare_variance=False,
                                                places=places)

    def assertPriorIsAccurate(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        pass

    def assertPriorIsValid(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        self.assertPriorIsPrecise(msbayes_workers, places=places)
        self.assertPriorIsAccurate(msbayes_workers, places=places)

    def assertWorkersFinished(self, msbayes_workers):
        for w in msbayes_workers:
            self.assertTrue(w.finished)

    def assertSampleIsFromDistribution(self,
                                       sample_sum,
                                       dist,
                                       places=2,
                                       mean_adj=1,
                                       max_adj=1,
                                       compare_variance=True):
        if isinstance(dist, probability.DiscreteUniformDistribution):
            self.assertEqual(sample_sum.minimum, dist.minimum)
            self.assertEqual(sample_sum.maximum, dist.maximum)
        else:
            if dist.minimum != float('-inf') or dist.minimum != float('inf'):
                self.assertAlmostEqual(sample_sum.minimum, dist.minimum,
                                       places)
            if dist.maximum != float('-inf') or dist.maximum != float('inf'):
                self.assertAlmostEqual(sample_sum.maximum,
                                       dist.maximum * max_adj, places)
        self.assertAlmostEqual(sample_sum.mean, dist.mean * mean_adj, places)
        if compare_variance:
            self.assertAlmostEqual(sample_sum.variance, dist.variance, places)

    def assertApproxEqual(self, x, y, percent_tol=1e-6):
        eq = (((abs(x - y) / ((abs(x) + abs(y)) / 2)) * 100) < percent_tol)
        if not eq:
            _LOG.error('x ({0}) and y ({1}) are not equal'.format(x, y))
        self.assertTrue(eq)

    def files_equal(self, f1, f2, exclude_line_endings=False):
        equal = True
        diffs = []
        f1, c1 = process_file_arg(f1)
        f2, c2 = process_file_arg(f2)
        line = 0
        f1_end = False
        f2_end = False
        lines_left = True
        while True:
            line += 1
            if f1_end == False:
                try:
                    l1 = f1.next()
                except (StopIteration, EOFError):
                    f1_end = line
                    pass
            if f2_end == False:
                try:
                    l2 = f2.next()
                except (StopIteration, EOFError):
                    f2_end = line
                    pass
            if f1_end != False and f2_end != False:
                break
            if exclude_line_endings:
                l1 = l1.strip()
                l2 = l2.strip()
            if f1_end == False and f2_end == False and l1 != l2:
                diffs.append(line)
                equal = False
        if f1_end != f2_end:
            mn = min([f1_end, f2_end])
            mx = max([f1_end, f2_end])
            diffs.extend(range(mn, mx + 1))
            equal = False
        assert len(diffs) == len(set(diffs))
        if c1:
            f1.close()
        if c2:
            f2.close()
        return equal, diffs

    def assertSameFiles(self, files, exclude_line_endings=False):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1 = files.pop(0)
        for f2 in files:
            equal, diff_list = self.files_equal(f1, f2, exclude_line_endings)
            if not equal:
                all_equal = False
                n1 = f1
                if not isinstance(n1, str):
                    n1 = f1.name
                n2 = f2
                if not isinstance(n2, str):
                    n2 = f2.name
                diffs.write('{0} and {1} differ at lines:\n\t{2}\n'.format(
                    n1, n2, ','.join([str(i) for i in diff_list])))
        if not all_equal:
            _LOG.error('files are not equal:\n{0}\n'.format(diffs.getvalue()))
        self.assertTrue(all_equal)

    def assertSameUnsortedFiles(self, files):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        lines1 = sorted(f1.readlines())
        for f in files:
            f2, close2 = process_file_arg(f)
            lines2 = sorted(f2.readlines())
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                            'number of lines\n'.format(f1.name, len(lines1),
                                                       f2.name, len(lines2)))
            for i in range(min(len(lines1), len(lines2))):
                if lines1[i].strip().split() != lines2[i].strip().split():
                    all_equal = False
                    diffs.write('{0} and {1} differ at sorted index '
                                '{2}\n'.format(f1.name, f2.name, i))

            if close2:
                f2.close()
        if not all_equal:
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()

    def same_samples(self, sample1, sample2, places=4, num_mismatches=0):
        if len(sample1) != len(sample2):
            return False
        for i in range(len(sample1)):
            if round(float(sample1[i]) - float(sample2[i]), places) != 0:
                if num_mismatches < 1:
                    return False
                num_mismatches -= 1
        return True

    def assertSameSamples(self,
                          files,
                          columns_to_ignore=[],
                          header=True,
                          places=5,
                          num_mismatches_per_sample=0,
                          num_sample_mismatches=0):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        f1_lines = f1.readlines()
        indices = [
            i for i in range(len(f1_lines[0].strip().split()))
            if i not in columns_to_ignore
        ]
        h1 = []
        if header:
            head = f1_lines.pop(0).strip().split()
            h1 = [head[i] for i in indices]
        lines1 = sorted(f1_lines)
        for f in files:
            f2, close2 = process_file_arg(f)
            f2_lines = f2.readlines()
            h2 = []
            if header:
                head = f2_lines.pop(0).strip().split()
                h2 = [head[i] for i in indices]
                if h1 != h2:
                    all_equal = False
                    diffs.write('{0} and {1} have different headers; not '
                                'comparing further\n'.format(f1.name, f2.name))
                    continue
            lines2 = sorted(f2_lines)
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                            'number of lines\n'.format(f1.name, len(lines1),
                                                       f2.name, len(lines2)))
            n_matches = 0
            n_mismatches = 0
            for l1 in lines1:
                found = False
                for l2 in lines2:
                    values1 = l1.strip().split()
                    values2 = l2.strip().split()
                    v1 = [float(values1[x]) for x in indices]
                    v2 = [float(values2[x]) for x in indices]
                    if self.same_samples(
                            v1,
                            v2,
                            places=places,
                            num_mismatches=num_mismatches_per_sample):
                        found = True
                if found:
                    n_matches += 1
                else:
                    n_mismatches += 1
            if n_mismatches > 0:
                if n_mismatches > num_sample_mismatches:
                    all_equal = False
                diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                            'share {3} samples\n'.format(
                                f1.name, f2.name, n_mismatches, n_matches))
            if close2:
                f2.close()
        if diffs.getvalue() != '':
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()

    def assertSameDistributions(self, d1, d2):
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(str(d1), str(d2))
        self.assertEqual(d1.minimum, d2.minimum)
        self.assertEqual(d1.maximum, d2.maximum)
        self.assertEqual(d1.mean, d2.mean)
        self.assertEqual(d1.variance, d2.variance)

    def assertSameIntegerPartitions(self, integer_partitions):
        ips = list(integer_partitions)
        ip1 = ips.pop(0)
        for ip2 in ips:
            self.assertEqual(ip1._initialized, ip2._initialized)
            self.assertEqual(ip1.n, ip2.n)
            self.assertEqual(ip1.key, ip2.key)
            self.assertEqual(ip1.integer_partition, ip2.integer_partition)
            self.assertEqual(ip1._items, ip2._items)

    def assertSamePartitions(self, partitions):
        ps = list(partitions)
        p1 = ps.pop(0)
        for p2 in ps:
            self.assertEqual(p1._initialized, p2._initialized)
            self.assertEqual(p1.n, p2.n)
            self.assertEqual(p1.key, p2.key)
            self.assertEqual(p1.partition, p2.partition)
            self.assertEqual(p1.values, p2.values)
Exemplo n.º 4
0
 def set_up(self):
     self.temp_fs = TempFileSystem(
             parent = package_paths.test_path(),
             prefix = 'PyMsBayesTestTemp-')
     self.test_id = 'pymsbayes-' + random_str()
Exemplo n.º 5
0
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(sample, sep = '\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x = values,
                        normed = True,
                        bins = 20,
                        histtype = 'bar',
                        align = 'mid',
                        orientation = 'vertical',
                        zorder = 0)
                hist = plotting.ScatterPlot(hist_data_list = [hd],
                        right_text = s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks = xticks,
                        labels = xtick_labels,
                        horizontalalignment = 'center')
                yticks_obj = plotting.Ticks(ticks = yticks,
                        labels = ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots = [hist],
                        num_columns = 1,
                        label_schema = None,
                        title = stat,
                        title_size = 14.0,
                        title_top = False,
                        y_title = 'Density',
                        y_title_position = 0.001,
                        y_title_size = 14.0,
                        height = 4.0,
                        width = 6.0,
                        auto_height = False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04 
                plot_grid.margin_right = 1.0 
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Exemplo n.º 6
0
class PyMsBayesTestCase(unittest.TestCase):
    
    def set_up(self):
        self.temp_fs = TempFileSystem(
                parent = package_paths.test_path(),
                prefix = 'PyMsBayesTestTemp-')
        self.test_id = 'pymsbayes-' + random_str()

    def tear_down(self):
        self.register_file_system()
        self.temp_fs.purge()
        self.assertEqual(FileStream.open_files, set())

    def get_test_path(self, parent=None, prefix='temp'):
        return self.temp_fs.get_file_path(parent=parent, prefix=prefix)

    def get_test_subdir(self, parent=None, prefix='temp'):
        return self.temp_fs.create_subdir(parent=parent, prefix=prefix)

    def register_file(self, path):
        self.temp_fs._register_file(path)

    def register_dir(self, path):
        self.temp_fs._register_dir(path)

    def register_file_system(self):
        _LOG.debug('registering test file system...')
        for path, dirs, files, in os.walk(self.temp_fs.base_dir):
            for f in files:
                if f.startswith(self.test_id):
                    self.register_file(os.path.join(path, f))
            for d in dirs:
                if d.startswith(self.test_id):
                    self.register_dir(os.path.join(path, d))

    def _exe_script(self, script_name, args, stdout = None, stderr = None,
            return_code = 0):
        script_path = package_paths.script_path(script_name)
        if isinstance(args, str):
            arg_list = args.split()
        else:
            arg_list = args
        arg_list = [str(x) for x in arg_list]
        cmd = [sys.executable, script_path] + arg_list
        _LOG.debug('Invocation:\n\t{0}'.format(' '.join(cmd)))
        p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        o, e  = p.communicate()
        exit_code = p.wait()
        if exit_code != return_code:
            _LOG.error("exit code {0} did not match {1}".format(exit_code,
                    return_code))
            _LOG.error("here is the stdout:\n{0}".format(o))
            _LOG.error("here is the stderr:\n{0}".format(e))
        self.assertEqual(exit_code, return_code)
        if stdout != None:
            if o != stdout:
                _LOG.error("std out did not match expected:\n{0}".format(o))
            self.assertEqual(o, stdout)
        if stderr != None:
            if e != stderr:
                _LOG.error("std error did not match expected:\n{0}".format(e))
            self.assertEqual(e, stderr)

    def get_expected_indices(self, num_pairs, dummy_column=True,
            parameters_reported=True):
        num_summary_params = 4
        if _CV_INCLUDED:
            num_summary_params += 1
        num_params = 4*num_pairs
        num_default_stats = 4*num_pairs
        start = 0
        if dummy_column:
            start = 1
        param_indices = range(start, start+num_summary_params)
        start += num_summary_params
        if parameters_reported:
            param_indices += range(start, start+num_params)
            start += num_params
        stat_indices = range(start, start+num_default_stats)
        return param_indices, stat_indices
    
    def prior_file_is_valid(self, prior_path, num_of_samples,
            num_of_columns=None):
        try:
            prior_file = open(prior_path, 'rU')
        except:
            _LOG.error('prior invalid: could not open prior path {0}'.format(
                    prior_path))
            return False
        nrows = 0
        for i, line in enumerate(prior_file):
            if nrows == 0 and HEADER_PATTERN.match(line):
                pass
            else:
                nrows += 1
            if not num_of_columns:
                num_of_columns = len(line.strip().split())
            ncols = len(line.strip().split())
            if num_of_columns != ncols:
                _LOG.error('prior invalid: num of columns at line {0} is {1} '
                        'NOT {2}'.format(i+1, ncols, num_of_columns))
                return False
        prior_file.close()
        if num_of_samples != nrows:
            _LOG.error('prior invalid: num of rows is {0} NOT {1}'.format(
                    nrows, num_of_samples))
            return False
        return True
    
    def get_number_of_lines(self, path):
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            count += 1
        if close:
            f.close()
        return count

    def get_number_of_header_lines(self, path):
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            if HEADER_PATTERN.match(l.strip()):
                count += 1
        if close:
            f.close()
        return count

    def parse_python_config(self, path):
        return ConfigObj(path)

    def get_config_from_msbayes_workers(self, msbayes_workers):
        cfgs = [MsBayesConfig(w.config_path) for w in msbayes_workers]
        self.assertSameConfigs(cfgs)
        return cfgs[0]

    def assertSameConfigs(self, cfgs):
        configs = list(cfgs)
        c1 = configs.pop(0)
        for c2 in cfgs:
            self.assertEqual(c1.time_in_subs_per_site,
                    c2.time_in_subs_per_site)
            self.assertEqual(c1.npairs, c2.npairs)
            self.assertEqual(c1.implementation, c2.implementation)
            self.assertEqual(c1.div_model_prior, c2.div_model_prior)
            self.assertEqual(c1.bottle_proportion_shared,
                    c2.bottle_proportion_shared)
            self.assertEqual(c1.theta_parameters, c2.theta_parameters)
            self.assertEqual(c1.taxa, c2.taxa)
            self.assertTrue(c1.sample_table.equals(c2.sample_table))
            if c1.psi:
                self.assertSameDistributions(c1.psi, c2.psi)
            else:
                self.assertEqual(c1.psi, c2.psi)
            if c1.tau:
                self.assertSameDistributions(c1.tau, c2.tau)
            else:
                self.assertEqual(c1.tau, c2.tau)
            if c1.theta:
                self.assertSameDistributions(c1.theta, c2.theta)
            else:
                self.assertEqual(c1.theta, c2.theta)
            if c1.a_theta:
                self.assertSameDistributions(c1.a_theta, c2.a_theta)
            else:
                self.assertEqual(c1.a_theta, c2.a_theta)
            if c1.d_theta:
                self.assertSameDistributions(c1.d_theta, c2.d_theta)
            else:
                self.assertEqual(c1.d_theta, c2.d_theta)
            if c1.recombination:
                self.assertSameDistributions(c1.recombination, c2.recombination)
            else:
                self.assertEqual(c1.recombination, c2.recombination)
            if c1.migration:
                self.assertSameDistributions(c1.migration, c2.migration)
            else:
                self.assertEqual(c1.migration, c2.migration)
            if c1.dpp_concentration:
                self.assertSameDistributions(c1.dpp_concentration,
                        c2.dpp_concentration)
            else:
                self.assertEqual(c1.dpp_concentration, c2.dpp_concentration)
            if c1.bottle_proportion:
                self.assertSameDistributions(c1.bottle_proportion,
                        c2.bottle_proportion)
            else:
                self.assertEqual(c1.bottle_proportion, c2.bottle_proportion)

    def get_parameter_summaries_from_msbayes_workers(self, msbayes_workers,
            shuffle_taus=True):
        msbayes_workers = list(msbayes_workers)
        s = dict(zip(
            [i for i in msbayes_workers[0].parameter_indices],
            [SampleSummarizer(
                tag=msbayes_workers[0].header[i]) for i in msbayes_workers[
                    0].parameter_indices]))
        ncols = None
        header = msbayes_workers[0].header
        pi = msbayes_workers[0].parameter_indices
        for w in msbayes_workers:
            self.assertEqual(w.header, header)
            self.assertEqual(w.parameter_indices, pi)
            f = open(w.prior_path, 'rU')
            for line_idx, row in enumerate(f):
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus: # because taus are sorted in prior files
                    psi_index = get_indices_of_patterns(w.header,
                            PSI_PATTERNS)[0]
                    tau_indices = get_indices_of_patterns(w.header,
                            TAU_PATTERNS)
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    p_set = set(w.parameter_indices) - set(tau_indices)
                    p = sorted(list(p_set))
                    for i in p:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
            f.close()
        return s

    def assertPriorIsPrecise(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        param_sums = self.get_parameter_summaries_from_msbayes_workers(
                msbayes_workers)
        sample_size = 0
        for w in msbayes_workers:
            sample_size += w.sample_size
        for s in param_sums.itervalues():
            self.assertEqual(s.n, sample_size)
        cfg = self.get_config_from_msbayes_workers(msbayes_workers)
        psi_indices = get_indices_of_patterns(msbayes_workers[0].header,
                PSI_PATTERNS)
        self.assertEqual(len(psi_indices), 1)
        model_indices = get_indices_of_patterns(msbayes_workers[0].header,
                MODEL_PATTERNS)
        if not msbayes_workers[0].model_index is None:
            self.assertEqual(len(model_indices), 1)
        else:
            self.assertEqual(len(model_indices), 0)
        tau_indices = get_indices_of_patterns(msbayes_workers[0].header,
                TAU_PATTERNS)
        a_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                A_THETA_PATTERNS)
        d_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                D_THETA_PATTERNS)
        if msbayes_workers[0].report_parameters:
            self.assertEqual(len(tau_indices), cfg.npairs)
            self.assertEqual(len(a_theta_indices), cfg.npairs)
            self.assertEqual(len(d_theta_indices), 2*cfg.npairs)
        else:
            self.assertEqual(len(tau_indices), 0)
            self.assertEqual(len(a_theta_indices), 0)
            self.assertEqual(len(d_theta_indices), 0)
        _LOG.debug('\n{0}\n'.format('\n'.join(
                [str(param_sums[i]) for i in sorted(param_sums.iterkeys())])))
        for i in psi_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.psi,
                    places=places)
        for i in tau_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.tau,
                    places=places)
        for i in a_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.a_theta,
                    places=places)
        for i in d_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.d_theta,
                    mean_adj=cfg.theta.mean,
                    max_adj=cfg.theta.maximum,
                    compare_variance=False,
                    places=places)

    def assertPriorIsAccurate(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        pass

    def assertPriorIsValid(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        self.assertPriorIsPrecise(msbayes_workers, places=places)
        self.assertPriorIsAccurate(msbayes_workers, places=places)

    def assertWorkersFinished(self, msbayes_workers):
        for w in msbayes_workers:
            self.assertTrue(w.finished)
                    
    def assertSampleIsFromDistribution(self, sample_sum, dist, places=2,
            mean_adj=1,
            max_adj=1,
            compare_variance=True):
        if isinstance(dist, probability.DiscreteUniformDistribution):
            self.assertEqual(sample_sum.minimum, dist.minimum)
            self.assertEqual(sample_sum.maximum, dist.maximum)
        else:
            if dist.minimum != float('-inf') or dist.minimum != float('inf'):
                self.assertAlmostEqual(sample_sum.minimum, dist.minimum, places)
            if dist.maximum != float('-inf') or dist.maximum != float('inf'):
                self.assertAlmostEqual(sample_sum.maximum, dist.maximum*max_adj, places)
        self.assertAlmostEqual(sample_sum.mean, dist.mean*mean_adj, places)
        if compare_variance:
            self.assertAlmostEqual(sample_sum.variance, dist.variance, places)

    def assertApproxEqual(self, x, y, percent_tol=1e-6):
        eq = (((abs(x-y) / ((abs(x)+abs(y))/2))*100) < percent_tol)
        if not eq:
            _LOG.error('x ({0}) and y ({1}) are not equal'.format(x, y))
        self.assertTrue(eq)

    def files_equal(self, f1, f2, exclude_line_endings=False):
        equal = True
        diffs = []
        f1, c1 = process_file_arg(f1)
        f2, c2 = process_file_arg(f2)
        line = 0
        f1_end = False
        f2_end = False
        lines_left = True
        while True:
            line += 1
            if f1_end == False:
                try:
                    l1 = f1.next()
                except (StopIteration, EOFError):
                    f1_end = line
                    pass
            if f2_end == False:
                try:
                    l2 = f2.next()
                except (StopIteration, EOFError):
                    f2_end = line
                    pass
            if f1_end != False and f2_end != False:
                break
            if exclude_line_endings:
                l1 = l1.strip()
                l2 = l2.strip()
            if f1_end == False and f2_end == False and l1 != l2:
                diffs.append(line)
                equal = False
        if f1_end != f2_end:
            mn = min([f1_end, f2_end])
            mx = max([f1_end, f2_end])
            diffs.extend(range(mn, mx+1))
            equal = False
        assert len(diffs) == len(set(diffs))
        if c1:
            f1.close()
        if c2:
            f2.close()
        return equal, diffs

    def assertSameFiles(self, files, exclude_line_endings=False):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1 = files.pop(0)
        for f2 in files:
            equal, diff_list = self.files_equal(f1, f2, exclude_line_endings)
            if not equal:
                all_equal = False
                n1 = f1
                if not isinstance(n1, str):
                    n1 = f1.name
                n2 = f2
                if not isinstance(n2, str):
                    n2 = f2.name
                diffs.write('{0} and {1} differ at lines:\n\t{2}\n'.format(
                        n1, n2, ','.join([str(i) for i in diff_list])))
        if not all_equal:
            _LOG.error('files are not equal:\n{0}\n'.format(diffs.getvalue()))
        self.assertTrue(all_equal)

    def assertSameUnsortedFiles(self, files):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        lines1 = sorted(f1.readlines())
        for f in files:
            f2, close2 = process_file_arg(f)
            lines2 = sorted(f2.readlines())
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            for i in range(min(len(lines1), len(lines2))):
                if lines1[i].strip().split() != lines2[i].strip().split():
                    all_equal = False
                    diffs.write('{0} and {1} differ at sorted index '
                            '{2}\n'.format(f1.name, f2.name, i))

            if close2:
                f2.close()
        if not all_equal:
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()

    def same_samples(self, sample1, sample2, places = 4, num_mismatches = 0):
        if len(sample1) != len(sample2):
            return False
        for i in range(len(sample1)):
            if round(float(sample1[i]) - float(sample2[i]), places) != 0:
                if num_mismatches < 1:
                    return False
                num_mismatches -= 1
        return True
            
    def assertSameSamples(self, files, columns_to_ignore = [], header = True,
            places = 5, num_mismatches_per_sample = 0,
            num_sample_mismatches = 0):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        f1_lines = f1.readlines()
        indices = [i for i in range(len(
                f1_lines[0].strip().split())) if i not in columns_to_ignore]
        h1 = []
        if header:
            head = f1_lines.pop(0).strip().split()
            h1 = [head[i] for i in indices]
        lines1 = sorted(f1_lines)
        for f in files:
            f2, close2 = process_file_arg(f)
            f2_lines = f2.readlines()
            h2 = []
            if header:
                head = f2_lines.pop(0).strip().split()
                h2 = [head[i] for i in indices]
                if h1 != h2:
                    all_equal = False
                    diffs.write('{0} and {1} have different headers; not '
                            'comparing further\n'.format(
                                    f1.name, f2.name))
                    continue
            lines2 = sorted(f2_lines)
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            n_matches = 0
            n_mismatches = 0
            for l1 in lines1:
                found = False
                for l2 in lines2:
                    values1 = l1.strip().split()
                    values2 = l2.strip().split()
                    v1 = [float(values1[x]) for x in indices]
                    v2 = [float(values2[x]) for x in indices]
                    if self.same_samples(v1, v2, places = places,
                            num_mismatches = num_mismatches_per_sample):
                        found = True
                if found:
                    n_matches += 1
                else:
                    n_mismatches += 1
            if n_mismatches > 0:
                if n_mismatches > num_sample_mismatches:
                    all_equal = False
                diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                        'share {3} samples\n'.format(
                                f1.name, f2.name, n_mismatches, n_matches))
            if close2:
                f2.close()
        if diffs.getvalue() != '':
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()
    
    def assertSameDistributions(self, d1, d2):
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(str(d1), str(d2))
        self.assertEqual(d1.minimum, d2.minimum)
        self.assertEqual(d1.maximum, d2.maximum)
        self.assertEqual(d1.mean, d2.mean)
        self.assertEqual(d1.variance, d2.variance)

    def assertSameIntegerPartitions(self, integer_partitions):
        ips = list(integer_partitions)
        ip1 = ips.pop(0)
        for ip2 in ips:
            self.assertEqual(ip1._initialized, ip2._initialized)
            self.assertEqual(ip1.n, ip2.n)
            self.assertEqual(ip1.key, ip2.key)
            self.assertEqual(ip1.integer_partition, ip2.integer_partition)
            self.assertEqual(ip1._items, ip2._items)

    def assertSamePartitions(self, partitions):
        ps = list(partitions)
        p1 = ps.pop(0)
        for p2 in ps:
            self.assertEqual(p1._initialized, p2._initialized)
            self.assertEqual(p1.n, p2.n)
            self.assertEqual(p1.key, p2.key)
            self.assertEqual(p1.partition, p2.partition)
            self.assertEqual(p1.values, p2.values)
Exemplo n.º 7
0
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(stats_by_time, sep='\t', header=header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Exemplo n.º 8
0
def main_cli(argv = sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs = '+',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('One or more msBayes config files to be used to either '
                    'calculate or simulate observed summary statistics. If '
                    'used in combination with `-r` each config will be used to '
                    'simulate pseudo-observed data. If analyzing real data, do '
                    'not use the `-r` option, and the fasta files specified '
                    'within the config must exist and contain the sequence '
                    'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs = '+',
            type = argparse_utils.arg_is_path,
            required = True,
            help = ('One or more config files to be used to generate prior '
                    'samples. If more than one config is specified, they '
                    'should be separated by spaces. '
                    'This option can also be used to specify the path to a '
                    'directory containing the prior samples and summary '
                    'statistic means and standard deviations generated by a '
                    'previous run using the `generate-samples-only` option. '
                    'These files should be found in the directory '
                    '`pymsbayes-output/prior-stats-summaries`. The'
                    '`pymsbayes-output/model-key.txt` also needs to be present.'
                    ' If specifying this directory, it should be the only '
                    'argument (i.e., no other directories or config files can '
                    'be provided).'))
    parser.add_argument('-r', '--reps',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('This option has two effects. First, it signifies that '
                    'the analysis will be simulation based (i.e., no real '
                    'data will be used). Second, it specifies how many '
                    'simulation replicates to perform (i.e., how many data '
                    'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000000,
            help = ('The number of prior samples to simulate for each prior '
                    'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action = 'store_true',
            help = ('Only generate samples from models as requested. I.e., '
                    'No analyses are performed to approximate posteriors. '
                    'This option can be useful if you want the prior samples '
                    'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of posterior samples desired for each '
                    'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples desired to use for '
                    'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('--staging-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage prior files. This option '
                    'can be useful on clusters to speed up I/O while '
                    'generating prior samples. You can designate a local temp '
                    'directory on a compute node to avoid constant writing to '
                    'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action = 'store',
            type = float,
            help = ('Smoothing parameter for the posterior kernal density '
                    'estimation. This option is used for the `glm` '
                    'regression method. The default is 2 / '
                    '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of equally spaced quantiles at which to '
                    'evaluate the GLM-estimated posterior density. '
                    'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('Suggested frequency (in number of prior samples) for '
                    'running regression and reporting current results. '
                    'Default: 0 (only report final results). '
                    'If a value is given, it may be adjusted so that the '
                    'reporting frequency is a multiple of the multi-processed '
                    'batch size.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action = 'store_true',
            help = ('If multiple prior models are specified, by default a '
                    'global estimate is performed averaging over all models. '
                    'This option prevents the global estimation (i.e., only '
                    'inferences for each model are made).'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress large results files.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action = 'store',
            type = str,
            default = '',
            help = ('Prefix to use at beginning of output files. The default '
                    'is no prefix.'))
    parser.add_argument('--data-key-path',
            action = 'store',
            type = argparse_utils.arg_is_file,
            help = ('The path to a `data-key.txt` file generated by a previous '
                    'run. This file should be found in the directory '
                    '`pymsbayes-output/data-key.txt`. This option '
                    'will override the `-o`/`--observed-configs` option, and '
                    'is intended to be used in combination with the '
                    '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The simulation index at which to begin analyses. Must be '
                    'used in combination with either the number of simulation '
                    'replicates (`-r`/`--reps`) or the `--data-key-path` '
                    'option, and must be a positive '
                    'integer that is less than the number of simulation '
                    'replicates. This option can be useful if an analysis '
                    'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The observed config index at which to begin analyses. '
                    'Can be used in combination with the `--data-key-path` '
                    'option to restart long-running, multi-observed-config '
                    'analyses'))
    parser.add_argument('--dry-run',
            action = 'store_true',
            help = 'Do not run analyses; only process settings')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples
    
    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain necessary prior and summary '
                    'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique') 
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]'.format(base_dir))
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(
                args.stat_prefixes,
                ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
            args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
            args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
            args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
            args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
            ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path) 
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg,
                os.path.dirname(info_path))))

    abc_team = ABCTeam(
            temp_fs = temp_fs,
            observed_stats_files = observed_paths,
            num_taxon_pairs = num_taxon_pairs,
            config_paths = args.prior_configs,
            previous_prior_dir = previous_prior_dir,
            num_prior_samples = args.num_prior_samples,
            num_processors = args.np,
            num_standardizing_samples = args.num_standardizing_samples,
            num_posterior_samples = args.num_posterior_samples,
            num_posterior_density_quantiles = args.num_posterior_quantiles,
            batch_size = args.prior_batch_size,
            output_dir = base_dir,
            output_prefix = args.output_prefix,
            prior_temp_dir = args.staging_dir,
            rng = GLOBAL_RNG,
            report_parameters = True,
            stat_patterns = stat_patterns,
            eureject_exe_path = eureject_path,
            abctoolbox_exe_path = abctb_path,
            msbayes_exe_path = None,
            abctoolbox_bandwidth = args.bandwidth,
            omega_threshold = 0.01,
            cv_threshold = 0.01,
            compress = args.compress,
            reporting_frequency = args.reporting_frequency,
            keep_temps = args.keep_temps,
            global_estimate_only = False,
            global_estimate = not args.no_global_estimate,
            generate_prior_samples_only = args.generate_samples_only,
            start_from_simulation_index = args.start_from_simulation_index,
            start_from_observed_index = args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.observed_stats_paths[i],
                os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.models[i],
                os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace() # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent = obs_temp_dir,
                prefix = 'observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                        temp_fs = observed_temp_fs,
                        config_path = cfg,
                        output_path = observed_paths[i],
                        schema = 'abctoolbox',
                        stat_patterns = stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(
                workers = obs_workers,
                num_processors = args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                        args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg,
                        None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = observed_batch_size,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = remainder,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                workers = msbayes_workers,
                num_processors = args.np)

            workers = dict(zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                        observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception('The number of observed simulations ({0}) '
                            'generated for observed config {1!r} and output to '
                            'file {2!r} does not match the number of reps '
                            '({3})'.format(lc, args.observed_configs[i],
                                observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Exemplo n.º 9
0
def main_cli(argv=sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o',
        '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p',
        '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The'
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r',
        '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested. I.e., '
              'No analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        default=10000,
                        help=('The number of prior samples desired to use for '
                              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b',
        '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernal density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
    parser.add_argument(
        '-q',
        '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=argparse_utils.arg_is_nonnegative_int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress large results files.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file should be found in the directory '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from` option to restart an analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a positive '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses'))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help='Do not run analyses; only process settings')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(
            os.path.join(previous_prior_dir, '*-prior-sample.txt'))
        previous_sums = glob.glob(
            os.path.join(previous_prior_dir, '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError(
                    'prior config {0!r} is not a file'.format(path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]'.format(base_dir))
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(args.stat_prefixes,
                                                   ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(
            os.path.join(base_dir, 'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(', '.join(
        [p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, os.path.dirname(info_path))))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.observed_stats_paths[i],
                            os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.models[i],
                               os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs,
                                              config_path=cfg,
                                              output_path=observed_paths[i],
                                              schema='abctoolbox',
                                              stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(workers=obs_workers,
                                              num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(
                    args.reps, args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=observed_batch_size,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=remainder,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(workers=msbayes_workers,
                                                  num_processors=args.np)

            workers = dict(
                zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, args.observed_configs[i],
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Exemplo n.º 10
0
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=args.config,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(sample, sep='\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                                         'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'], summary['qi_95'][0], summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                                       normed=True,
                                       bins=20,
                                       histtype='bar',
                                       align='mid',
                                       orientation='vertical',
                                       zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                                            labels=xtick_labels,
                                            horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots=[hist],
                                              num_columns=1,
                                              label_schema=None,
                                              title=stat,
                                              title_size=14.0,
                                              title_top=False,
                                              y_title='Density',
                                              y_title_position=0.001,
                                              y_title_size=14.0,
                                              height=4.0,
                                              width=6.0,
                                              auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(stat_pattern,
                                             ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()