class PyMsBayesTestCase(unittest.TestCase):
    """Base test case for the pymsbayes test suite.

    Provides temp-file-system management plus msBayes-specific helpers
    and assertions (prior-file validation, config/distribution equality,
    file comparison, parameter summarizing).

    NOTE(review): the source file previously defined this class twice,
    back to back, with identical behavior; the duplicate (which silently
    shadowed the first definition) has been removed.
    """

    def set_up(self):
        # Create an isolated temp file system under the package test path;
        # the random test id tags files/dirs created by this test so they
        # can be found and purged in tear_down().
        self.temp_fs = TempFileSystem(
                parent = package_paths.test_path(),
                prefix = 'PyMsBayesTestTemp-')
        self.test_id = 'pymsbayes-' + random_str()

    def tear_down(self):
        self.register_file_system()
        self.temp_fs.purge()
        # A test must not leave any file streams open.
        self.assertEqual(FileStream.open_files, set())

    def get_test_path(self, parent=None, prefix='temp'):
        """Return a path for a new temp file managed by the temp file system."""
        return self.temp_fs.get_file_path(parent=parent, prefix=prefix)

    def get_test_subdir(self, parent=None, prefix='temp'):
        """Create and return a new temp subdirectory."""
        return self.temp_fs.create_subdir(parent=parent, prefix=prefix)

    def register_file(self, path):
        # Register an externally created file for cleanup.
        self.temp_fs._register_file(path)

    def register_dir(self, path):
        # Register an externally created directory for cleanup.
        self.temp_fs._register_dir(path)

    def register_file_system(self):
        """Walk the temp base dir and register every file/dir created by
        this test (identified by the `self.test_id` prefix) for cleanup.
        """
        _LOG.debug('registering test file system...')
        for path, dirs, files in os.walk(self.temp_fs.base_dir):
            for f in files:
                if f.startswith(self.test_id):
                    self.register_file(os.path.join(path, f))
            for d in dirs:
                if d.startswith(self.test_id):
                    self.register_dir(os.path.join(path, d))

    def _exe_script(self, script_name, args, stdout = None, stderr = None,
            return_code = 0):
        """Run a project script and assert on its exit code and output.

        `args` may be a string (split on whitespace) or a sequence; each
        element is stringified. If `stdout`/`stderr` are given, the
        captured streams must match them exactly.
        """
        script_path = package_paths.script_path(script_name)
        if isinstance(args, str):
            arg_list = args.split()
        else:
            arg_list = args
        arg_list = [str(x) for x in arg_list]
        cmd = [sys.executable, script_path] + arg_list
        _LOG.debug('Invocation:\n\t{0}'.format(' '.join(cmd)))
        p = subprocess.Popen(cmd, shell = False, stdout = subprocess.PIPE,
                stderr = subprocess.PIPE)
        o, e = p.communicate()
        exit_code = p.wait()
        if exit_code != return_code:
            _LOG.error("exit code {0} did not match {1}".format(exit_code,
                    return_code))
            _LOG.error("here is the stdout:\n{0}".format(o))
            _LOG.error("here is the stderr:\n{0}".format(e))
        self.assertEqual(exit_code, return_code)
        if stdout is not None:
            if o != stdout:
                _LOG.error("std out did not match expected:\n{0}".format(o))
            self.assertEqual(o, stdout)
        if stderr is not None:
            if e != stderr:
                _LOG.error("std error did not match expected:\n{0}".format(e))
            self.assertEqual(e, stderr)

    def get_expected_indices(self, num_pairs, dummy_column = True,
            parameters_reported = True):
        """Return `(param_indices, stat_indices)` lists of expected column
        indices for a prior file with `num_pairs` taxon pairs.

        Layout assumed: optional dummy column, then 4 summary parameters
        (5 when the coefficient of variation is compiled in), then
        4 parameters per pair (when reported), then 4 default stats per
        pair.
        """
        num_summary_params = 4
        if _CV_INCLUDED:
            num_summary_params += 1
        num_params = 4 * num_pairs
        num_default_stats = 4 * num_pairs
        start = 0
        if dummy_column:
            start = 1
        # list(...) keeps the original (list) return type while remaining
        # valid if this module is ever run under Python 3.
        param_indices = list(range(start, start + num_summary_params))
        start += num_summary_params
        if parameters_reported:
            param_indices += range(start, start + num_params)
            start += num_params
        stat_indices = list(range(start, start + num_default_stats))
        return param_indices, stat_indices

    def prior_file_is_valid(self, prior_path, num_of_samples,
            num_of_columns = None):
        """Return True if the prior file has `num_of_samples` data rows,
        each with a consistent column count.

        If `num_of_columns` is None, the width of the first data row is
        used. A single header line (matching HEADER_PATTERN) at the top
        of the file is tolerated and skipped.
        """
        try:
            prior_file = open(prior_path, 'rU')
        except IOError:  # was a bare `except:`; only open() can fail here
            _LOG.error('prior invalid: could not open prior path {0}'.format(
                    prior_path))
            return False
        nrows = 0
        for i, line in enumerate(prior_file):
            if nrows == 0 and HEADER_PATTERN.match(line):
                pass
            else:
                nrows += 1
                if not num_of_columns:
                    num_of_columns = len(line.strip().split())
                ncols = len(line.strip().split())
                if num_of_columns != ncols:
                    _LOG.error('prior invalid: num of columns at line {0} '
                            'is {1} NOT {2}'.format(i + 1, ncols,
                                    num_of_columns))
                    prior_file.close()  # do not leak on the early return
                    return False
        prior_file.close()
        if num_of_samples != nrows:
            _LOG.error('prior invalid: num of rows is {0} NOT {1}'.format(
                    nrows, num_of_samples))
            return False
        return True

    def get_number_of_lines(self, path):
        """Return the total number of lines in `path` (path or file obj)."""
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            count += 1
        if close:
            f.close()
        return count

    def get_number_of_header_lines(self, path):
        """Return the number of lines in `path` matching HEADER_PATTERN."""
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            if HEADER_PATTERN.match(l.strip()):
                count += 1
        if close:
            f.close()
        return count

    def parse_python_config(self, path):
        """Parse a python-style config file into a ConfigObj."""
        return ConfigObj(path)

    def get_config_from_msbayes_workers(self, msbayes_workers):
        """Parse each worker's config, assert they are all equivalent, and
        return the first one.
        """
        cfgs = [MsBayesConfig(w.config_path) for w in msbayes_workers]
        self.assertSameConfigs(cfgs)
        return cfgs[0]

    def _assert_same_optional_dist(self, d1, d2):
        # Prior distributions may be None/falsy (parameter not in model);
        # only compare as distributions when the first one is set.
        if d1:
            self.assertSameDistributions(d1, d2)
        else:
            self.assertEqual(d1, d2)

    def assertSameConfigs(self, cfgs):
        """Assert all MsBayesConfig instances in `cfgs` are equivalent."""
        configs = list(cfgs)
        c1 = configs.pop(0)
        # Iterate the popped copy (the original iterated `cfgs`, which
        # pointlessly compared c1 against itself).
        for c2 in configs:
            self.assertEqual(c1.time_in_subs_per_site,
                    c2.time_in_subs_per_site)
            self.assertEqual(c1.npairs, c2.npairs)
            self.assertEqual(c1.implementation, c2.implementation)
            self.assertEqual(c1.div_model_prior, c2.div_model_prior)
            self.assertEqual(c1.bottle_proportion_shared,
                    c2.bottle_proportion_shared)
            self.assertEqual(c1.theta_parameters, c2.theta_parameters)
            self.assertEqual(c1.taxa, c2.taxa)
            self.assertTrue(c1.sample_table.equals(c2.sample_table))
            for attr in ('psi', 'tau', 'theta', 'a_theta', 'd_theta',
                    'recombination', 'migration', 'dpp_concentration',
                    'bottle_proportion'):
                self._assert_same_optional_dist(getattr(c1, attr),
                        getattr(c2, attr))

    def get_parameter_summaries_from_msbayes_workers(self, msbayes_workers,
            shuffle_taus = True):
        """Summarize parameter columns across the workers' prior files.

        Returns a dict mapping parameter column index to a
        SampleSummarizer over every sample row of every worker. When
        `shuffle_taus` is True, the tau columns of each row are shuffled
        before being added (taus are sorted in prior files, which would
        otherwise bias the per-column summaries).
        """
        msbayes_workers = list(msbayes_workers)
        s = dict((i, SampleSummarizer(tag = msbayes_workers[0].header[i]))
                for i in msbayes_workers[0].parameter_indices)
        ncols = None
        header = msbayes_workers[0].header
        pi = msbayes_workers[0].parameter_indices
        for w in msbayes_workers:
            # All workers must share the same header/parameter layout.
            self.assertEqual(w.header, header)
            self.assertEqual(w.parameter_indices, pi)
            f = open(w.prior_path, 'rU')
            for row in f:
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus: # because taus are sorted in prior files
                    psi_index = get_indices_of_patterns(w.header,
                            PSI_PATTERNS)[0]
                    tau_indices = get_indices_of_patterns(w.header,
                            TAU_PATTERNS)
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    p_set = set(w.parameter_indices) - set(tau_indices)
                    p = sorted(list(p_set))
                    for i in p:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
            f.close()
        return s

    def assertPriorIsPrecise(self, msbayes_workers, places = 2):
        """Assert the pooled prior samples of the workers match the
        configured prior distributions (to `places` decimal places).
        """
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        param_sums = self.get_parameter_summaries_from_msbayes_workers(
                msbayes_workers)
        sample_size = 0
        for w in msbayes_workers:
            sample_size += w.sample_size
        # Every parameter column must have one sample per prior row.
        for s in param_sums.values():
            self.assertEqual(s.n, sample_size)
        cfg = self.get_config_from_msbayes_workers(msbayes_workers)
        psi_indices = get_indices_of_patterns(msbayes_workers[0].header,
                PSI_PATTERNS)
        self.assertEqual(len(psi_indices), 1)
        model_indices = get_indices_of_patterns(msbayes_workers[0].header,
                MODEL_PATTERNS)
        if msbayes_workers[0].model_index is not None:
            self.assertEqual(len(model_indices), 1)
        else:
            self.assertEqual(len(model_indices), 0)
        tau_indices = get_indices_of_patterns(msbayes_workers[0].header,
                TAU_PATTERNS)
        a_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                A_THETA_PATTERNS)
        d_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                D_THETA_PATTERNS)
        if msbayes_workers[0].report_parameters:
            self.assertEqual(len(tau_indices), cfg.npairs)
            self.assertEqual(len(a_theta_indices), cfg.npairs)
            self.assertEqual(len(d_theta_indices), 2 * cfg.npairs)
        else:
            self.assertEqual(len(tau_indices), 0)
            self.assertEqual(len(a_theta_indices), 0)
            self.assertEqual(len(d_theta_indices), 0)
        _LOG.debug('\n{0}\n'.format('\n'.join(
                [str(param_sums[i]) for i in sorted(param_sums)])))
        for i in psi_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.psi,
                    places = places)
        for i in tau_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.tau,
                    places = places)
        for i in a_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.a_theta,
                    places = places)
        for i in d_theta_indices:
            # Descendant thetas are adjusted by the theta prior's mean and
            # maximum, and variance is skipped — presumably because d_theta
            # draws are scaled by theta; TODO confirm against msBayes docs.
            self.assertSampleIsFromDistribution(param_sums[i], cfg.d_theta,
                    mean_adj = cfg.theta.mean,
                    max_adj = cfg.theta.maximum,
                    compare_variance = False,
                    places = places)

    def assertPriorIsAccurate(self, msbayes_workers, places = 2):
        # TODO: stub — only checks that the workers finished.
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        pass

    def assertPriorIsValid(self, msbayes_workers, places = 2):
        """Assert workers finished and their priors are precise/accurate."""
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        self.assertPriorIsPrecise(msbayes_workers, places = places)
        self.assertPriorIsAccurate(msbayes_workers, places = places)

    def assertWorkersFinished(self, msbayes_workers):
        for w in msbayes_workers:
            self.assertTrue(w.finished)

    def assertSampleIsFromDistribution(self, sample_sum, dist, places = 2,
            mean_adj = 1, max_adj = 1, compare_variance = True):
        """Assert the summarized sample stats match distribution `dist`.

        For discrete-uniform distributions the min/max must match
        exactly; otherwise min/max (when finite), mean, and optionally
        variance must match to `places` decimal places.
        """
        if isinstance(dist, probability.DiscreteUniformDistribution):
            self.assertEqual(sample_sum.minimum, dist.minimum)
            self.assertEqual(sample_sum.maximum, dist.maximum)
        else:
            # BUGFIX: the original used `or` in these finiteness checks,
            # which is always true; `and` ("the bound is finite") is the
            # intended condition.
            if dist.minimum != float('-inf') and dist.minimum != float('inf'):
                self.assertAlmostEqual(sample_sum.minimum, dist.minimum,
                        places)
            if dist.maximum != float('-inf') and dist.maximum != float('inf'):
                self.assertAlmostEqual(sample_sum.maximum,
                        dist.maximum * max_adj, places)
            self.assertAlmostEqual(sample_sum.mean, dist.mean * mean_adj,
                    places)
            if compare_variance:
                self.assertAlmostEqual(sample_sum.variance, dist.variance,
                        places)

    def assertApproxEqual(self, x, y, percent_tol = 1e-6):
        """Assert x and y differ by less than `percent_tol` percent
        (symmetric percent difference relative to the mean of |x|, |y|).
        """
        eq = (((abs(x - y) / ((abs(x) + abs(y)) / 2)) * 100) < percent_tol)
        if not eq:
            _LOG.error('x ({0}) and y ({1}) are not equal'.format(x, y))
        self.assertTrue(eq)

    def files_equal(self, f1, f2, exclude_line_endings = False):
        """Compare two files (paths or file objects) line by line.

        Returns `(equal, diffs)` where `diffs` is a list of 1-based line
        numbers that differ, including trailing lines present in only
        one of the files.
        """
        equal = True
        diffs = []
        f1, c1 = process_file_arg(f1)
        f2, c2 = process_file_arg(f2)
        line = 0
        # f1_end/f2_end hold the 1-based line at which each file was
        # exhausted, or False while lines remain.
        f1_end = False
        f2_end = False
        while True:
            line += 1
            if f1_end == False:
                try:
                    l1 = next(f1)
                except (StopIteration, EOFError):
                    f1_end = line
            if f2_end == False:
                try:
                    l2 = next(f2)
                except (StopIteration, EOFError):
                    f2_end = line
            if f1_end != False and f2_end != False:
                break
            if exclude_line_endings:
                l1 = l1.strip()
                l2 = l2.strip()
            if f1_end == False and f2_end == False and l1 != l2:
                diffs.append(line)
                equal = False
        if f1_end != f2_end:
            # One file is longer; mark the extra line range as differing.
            mn = min([f1_end, f2_end])
            mx = max([f1_end, f2_end])
            diffs.extend(range(mn, mx + 1))
            equal = False
        assert len(diffs) == len(set(diffs))
        if c1:
            f1.close()
        if c2:
            f2.close()
        return equal, diffs

    def assertSameFiles(self, files, exclude_line_endings = False):
        """Assert every file in `files` is line-for-line identical to the
        first one, logging the differing line numbers on failure.
        """
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1 = files.pop(0)
        for f2 in files:
            equal, diff_list = self.files_equal(f1, f2, exclude_line_endings)
            if not equal:
                all_equal = False
                n1 = f1
                if not isinstance(n1, str):
                    n1 = f1.name
                n2 = f2
                if not isinstance(n2, str):
                    n2 = f2.name
                diffs.write('{0} and {1} differ at lines:\n\t{2}\n'.format(
                        n1, n2, ','.join([str(i) for i in diff_list])))
        if not all_equal:
            _LOG.error('files are not equal:\n{0}\n'.format(diffs.getvalue()))
        self.assertTrue(all_equal)

    def assertSameUnsortedFiles(self, files):
        """Assert the files contain the same lines, ignoring line order
        (lines are compared whitespace-split after sorting).
        """
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        lines1 = sorted(f1.readlines())
        for f in files:
            f2, close2 = process_file_arg(f)
            lines2 = sorted(f2.readlines())
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            for i in range(min(len(lines1), len(lines2))):
                if lines1[i].strip().split() != lines2[i].strip().split():
                    all_equal = False
                    diffs.write('{0} and {1} differ at sorted index '
                            '{2}\n'.format(f1.name, f2.name, i))
            if close2:
                f2.close()
        if not all_equal:
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        # Close before asserting so a failure does not leak the handle.
        if close:
            f1.close()
        self.assertTrue(all_equal)

    def same_samples(self, sample1, sample2, places = 4, num_mismatches = 0):
        """Return True if the two samples are element-wise equal to
        `places` decimal places, allowing up to `num_mismatches`
        mismatching elements.
        """
        if len(sample1) != len(sample2):
            return False
        for v1, v2 in zip(sample1, sample2):
            if round(float(v1) - float(v2), places) != 0:
                if num_mismatches < 1:
                    return False
                num_mismatches -= 1
        return True

    def assertSameSamples(self, files, columns_to_ignore = None, header = True,
            places = 5, num_mismatches_per_sample = 0,
            num_sample_mismatches = 0):
        """Assert the sample files contain the same rows (order-independent).

        Each row of each file is matched against the rows of the first
        file, comparing only the columns not in `columns_to_ignore`, to
        `places` decimal places, with per-sample and per-file mismatch
        tolerances.
        """
        # BUGFIX: was a mutable default argument ([]), shared across calls.
        if columns_to_ignore is None:
            columns_to_ignore = []
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        f1_lines = f1.readlines()
        indices = [i for i in range(len(f1_lines[0].strip().split()))
                if i not in columns_to_ignore]
        h1 = []
        if header:
            head = f1_lines.pop(0).strip().split()
            h1 = [head[i] for i in indices]
        lines1 = sorted(f1_lines)
        for f in files:
            f2, close2 = process_file_arg(f)
            f2_lines = f2.readlines()
            h2 = []
            if header:
                head = f2_lines.pop(0).strip().split()
                h2 = [head[i] for i in indices]
            if h1 != h2:
                all_equal = False
                diffs.write('{0} and {1} have different headers; not '
                        'comparing further\n'.format(f1.name, f2.name))
                continue
            lines2 = sorted(f2_lines)
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            n_matches = 0
            n_mismatches = 0
            for l1 in lines1:
                found = False
                for l2 in lines2:
                    values1 = l1.strip().split()
                    values2 = l2.strip().split()
                    v1 = [float(values1[x]) for x in indices]
                    v2 = [float(values2[x]) for x in indices]
                    if self.same_samples(v1, v2, places = places,
                            num_mismatches = num_mismatches_per_sample):
                        found = True
                if found:
                    n_matches += 1
                else:
                    n_mismatches += 1
            if n_mismatches > 0:
                if n_mismatches > num_sample_mismatches:
                    all_equal = False
                    diffs.write('{0} and {1}\nhave {2} mismatching samples '
                            'and share {3} samples\n'.format(f1.name,
                                    f2.name, n_mismatches, n_matches))
            if close2:
                f2.close()
        if diffs.getvalue() != '':
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        # Close before asserting so a failure does not leak the handle.
        if close:
            f1.close()
        self.assertTrue(all_equal)

    def assertSameDistributions(self, d1, d2):
        """Assert two distribution objects describe the same distribution."""
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(str(d1), str(d2))
        self.assertEqual(d1.minimum, d2.minimum)
        self.assertEqual(d1.maximum, d2.maximum)
        self.assertEqual(d1.mean, d2.mean)
        self.assertEqual(d1.variance, d2.variance)

    def assertSameIntegerPartitions(self, integer_partitions):
        """Assert all integer partitions are equal to the first one."""
        ips = list(integer_partitions)
        ip1 = ips.pop(0)
        for ip2 in ips:
            self.assertEqual(ip1._initialized, ip2._initialized)
            self.assertEqual(ip1.n, ip2.n)
            self.assertEqual(ip1.key, ip2.key)
            self.assertEqual(ip1.integer_partition, ip2.integer_partition)
            self.assertEqual(ip1._items, ip2._items)

    def assertSamePartitions(self, partitions):
        """Assert all partitions are equal to the first one."""
        ps = list(partitions)
        p1 = ps.pop(0)
        for p2 in ps:
            self.assertEqual(p1._initialized, p2._initialized)
            self.assertEqual(p1.n, p2.n)
            self.assertEqual(p1.key, p2.key)
            self.assertEqual(p1.partition, p2.partition)
            self.assertEqual(p1.values, p2.values)
def main_cli():
    """CLI entry point: simulate prior samples under an msBayes config and
    plot the saturation of summary statistics against divergence time.

    Parses command-line options, generates ``--num-prior-samples`` prior
    samples in parallel with MsBayesWorker processes, writes a
    stats-by-time matrix, and (when matplotlib is available) saves a
    saturation plot grid as a PDF.

    NOTE(review): this module binds the name ``main_cli`` twice; the later
    definition in this file shadows this one at import time -- confirm
    which version is meant to survive.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Logging level must be configured before the pymsbayes modules below
    # are imported, so the import is done here rather than at module level.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize prefixes: strip any trailing dots, then re-append a single
    # dot when building the match patterns.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Constrain the config so every taxon pair shares one divergence model.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Split the requested number of samples evenly across processes; the
    # last worker picks up the remainder.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # NOTE(review): keys().remove() relies on Python-2 list semantics of
    # dict.keys(); under Python 3 this would raise AttributeError.
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                'pi.net': r'$\pi_{net}$',
                'wattTheta': r'$\theta_W$',
                'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    """CLI entry point: simulate prior samples under an msBayes config and
    plot the saturation of summary statistics against divergence time.

    Parses command-line options, generates ``--num-prior-samples`` prior
    samples in parallel with MsBayesWorker processes, writes a
    stats-by-time matrix, and (when matplotlib is available) saves a
    saturation plot grid as a PDF.

    NOTE(review): this is the second definition of ``main_cli`` in the
    module; it shadows the earlier one at import time -- confirm the
    earlier duplicate is meant to be removed.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
            '-c', '--config',
            type=arg_is_config,
            required=True,
            help=('msBayes config file to be used to generate saturation '
                  'plot.'))
    parser.add_argument(
            '-n', '--num-prior-samples',
            action='store',
            type=int,
            default=1000,
            help=('The number of prior samples to simulate for the '
                  'saturation plot.'))
    parser.add_argument(
            '--np',
            action='store',
            type=int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument(
            '-o', '--output-dir',
            action='store',
            type=arg_is_dir,
            help=('The directory in which all output files will be written. '
                  'The default is to use the directory of the first observed '
                  'config file.'))
    parser.add_argument(
            '--temp-dir',
            action='store',
            type=arg_is_dir,
            help=('A directory to temporarily stage files. The default is to '
                  'use the output directory.'))
    parser.add_argument(
            '-s', '--stat-prefixes',
            nargs='*',
            type=str,
            default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help=('Prefixes of summary statistics to use in the analyses. '
                  'The prefixes should be separated by spaces. '
                  'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
            '--vertical-lines',
            nargs='*',
            type=float,
            default=[],
            help=('Positions along x-axis where vertical lines are to be '
                  'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action='store_true',
            help='Compress plot data file.')
    parser.add_argument('--keep-temps',
            action='store_true',
            help='Keep all temporary files.')
    parser.add_argument('--seed',
            action='store',
            type=int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Logging level must be configured before the pymsbayes modules below
    # are imported, so the import is done here rather than at module level.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize prefixes: strip any trailing dots, then re-append a single
    # dot when building the match patterns.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Constrain the config so every taxon pair shares one divergence model.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
            log.info)
    info.write(
            '\tstat_patterns = {0!r}'.format(', '.join(
                    [p.pattern for p in stat_patterns])),
            log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Split the requested number of samples evenly across processes; the
    # last worker picks up the remainder.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                sample_size=sample_size,
                config_path=config_path,
                report_parameters=True,
                schema=schema,
                include_header=True,
                stat_patterns=stat_patterns,
                write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers,
            num_processors=args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # NOTE(review): keys().remove() relies on Python-2 list semantics of
    # dict.keys(); under Python 3 this would raise AttributeError.
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel=compress_level)
    for row in dict_line_iter(stats_by_time, sep='\t', header=header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
                'pi': r'$\pi$',
                'pi.net': r'$\pi_{net}$',
                'wattTheta': r'$\theta_W$',
                'tajD.denom': r'$SD(\pi - \theta_W)$'
                }
        spg = SaturationPlotGrid(stats_by_time,
                x_key='PRI.t',
                y_keys=args.stat_prefixes,
                y_labels=y_labels,
                num_columns=2,
                vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()