def get_response_content(fs):
    """
    Simulate distance matrices from a tree and report reconstruction errors.

    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    @raise HandlingError: if no bipartition function is selected,
    or if the requested work is predicted to exceed the step limit
    """
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    else:
        # Fail with a readable message instead of leaving splitter unbound.
        raise HandlingError('no bipartition function was selected')
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # define the maximum number of steps we want
    max_steps = 1000000
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    # Make sure that the splitter object is appropriate
    # for the number of taxa and the number of tree reconstructions.
    ntaxa = len(ordered_names)
    if splitter.get_complexity(ntaxa) * fs.iterations > max_steps:
        msg_a = 'use a faster bipartition function, '
        msg_b = 'fewer taxa, or fewer tree reconstructions'
        raise HandlingError(msg_a + msg_b)
    # define the simulation parameters
    sim = Simulation(splitter, 'nj', 'cgi tree building simulation')
    sim.set_original_tree(tree)
    sim.set_step_limit(max_steps)
    # attempt to simulate a bunch of distance matrices
    sampler = DMSampler.DMSampler(tree, ordered_names, fs.length)
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if a proposal was accepted then add it to the list
        if result:
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
        # if enough accepted samples have been generated then stop sampling
        remaining_acceptances = fs.iterations - len(distance_matrices)
        if not remaining_acceptances:
            break
        # If the remaining number of computrons is predicted
        # to be too much then stop.
        if sampler.get_remaining_computrons(remaining_acceptances) > max_steps:
            msg_a = 'this combination of parameters '
            msg_b = 'is predicted to take too long'
            # The message is built from its two halves;
            # there is no variable called msg in this function.
            raise HandlingError(msg_a + msg_b)
    sim.run(distance_matrices, ordered_names)
    # define the response
    out = StringIO()
    print >> out, 'partition error count frequencies:'
    print >> out, sim.get_histogram_string()
    print >> out, ''
    print >> out, 'weighted partition errors:', sim.get_deep_loss()
    # return the response
    return out.getvalue()
def do_hard_coded_analysis_a(tree, tree_remark): """ Do a hardcoded analysis of tree reconstruction methods. Make a bunch of R files. @param tree: a tree object @param tree_remark: a string that is a comment about the tree """ # define an arbitrary order for the names of the leaves of the tree ordered_names = list(node.name for node in tree.gen_tips()) # use 1000 replicates reconstruction_count = 1000 # Make R files for reconstruction results # from sequences 100 and 500 nucleotides long. for sequence_length in (100, 500): # sample distance matrices print 'sampling', reconstruction_count, 'distance matrices' print 'from alignments of length', sequence_length sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length) distance_matrices = [] for result in sampler.gen_samples_or_none(): # if the proposal was rejected then try again if not result: continue # add the accepted distance matrix sample to the list sequence_list, distance_matrix = result distance_matrices.append(distance_matrix) # stop when we have generated enough distance matrices if len(distance_matrices) == reconstruction_count: break # run both neighbor joining and spectral sign clustering sims = [ Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'), Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign') ] for sim in sims: print 'reconstructing', len(distance_matrices), 'trees' print 'using', sim.description sim.set_original_tree(tree) sim.run(distance_matrices, ordered_names) # consider the neighbor joining and the spectral sign results nj_sim, ss_sim = sims # write the uniform loss function comparison R script script_contents = R_helper(nj_sim.get_normalized_error_counts(), ss_sim.get_normalized_error_counts()) filename = 'uniform_%d.R' % sequence_length with open(filename, 'w') as fout: print >> fout, script_contents # write the weighted loss function comparison R script script_contents = R_helper(nj_sim.get_normalized_loss_values(), 
ss_sim.get_normalized_loss_values()) filename = 'weighted_%d.R' % sequence_length with open(filename, 'w') as fout: print >> fout, script_contents
def get_response_content(fs):
    """
    Rebuild a tree from a distance matrix while validating against a tree.
    @param fs: an object whose attributes hold the request arguments
    @return: whatever the tree builder wrote to its output stream
    @raise HandlingError: if the matrix, the labels, or the tree are
    inconsistent with each other, or if the splitter is too slow
    """
    # the distance matrix needs at least three rows
    matrix = fs.matrix
    nrows = len(matrix)
    if nrows < 3:
        raise HandlingError('the matrix should have at least three rows')
    # one unique label is expected per matrix row
    labels = Util.get_stripped_lines(StringIO(fs.labels))
    nlabels = len(labels)
    if nlabels != nrows:
        raise HandlingError(
                'the number of ordered labels should be the same '
                'as the number of rows in the matrix')
    if len(set(labels)) != nlabels:
        raise HandlingError('the ordered labels must be unique')
    # pick the bipartition function named by the request
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    # refuse combinations of splitter and matrix size that cost too much
    if splitter.get_complexity(nrows) > 1000000:
        raise HandlingError(
                'use a smaller distance matrix or a faster bipartition function')
    # the tips of the original tree must match the labels
    original_tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    tip_count = len(list(original_tree.gen_tips()))
    if nlabels != tip_count:
        raise HandlingError(
                'the number of ordered labels should be the same '
                'as the number of tips in the tree')
    tip_name_set = set(tip.name for tip in original_tree.gen_tips())
    if tip_name_set != set(labels):
        raise HandlingError(
                'the leaf labels of the tree do not match '
                'the ordered labels of the distance matrix rows')
    # set up the validating tree builder
    builder = NeighborhoodJoining.ValidatingTreeBuilder(
            matrix.tolist(), labels, splitter)
    # the recourse selection decides the fallback method, if any
    fallback_name = None
    if fs.njrecourse:
        fallback_name = 'nj'
    elif fs.halvingrecourse:
        fallback_name = 'halving'
    if fallback_name is not None:
        builder.set_fallback_name(fallback_name)
    # the builder writes into the response buffer while it builds
    out = StringIO()
    builder.set_original_tree(original_tree)
    builder.set_output_stream(out)
    builder.build()
    return out.getvalue()
def main(): """ Run some tree reconstructions from the command line. """ # initialize the simulation objects sims = [ Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'), Simulation(Clustering.RandomDMS(), 'nj', 'random partitioning'), Simulation(Clustering.StoneExactDMS(), 'nj', 'exact criterion with neighbor joining fallback'), #Simulation(Clustering.StoneExactDMS(), #'halving', 'exact criterion with stem halving fallback'), Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign cut with neighbor joining fallback') #Simulation(Clustering.StoneSpectralSignDMS(), #'halving', 'spectral sign cut with stem halving fallback') ] # define the simulation parameters tree = get_default_original_tree() reconstruction_count = 1000 sequence_length = 100 step_limit_per_method = 10000000 # set the simulation parameters for sim in sims: sim.set_original_tree(get_default_original_tree()) sim.set_reconstruction_count(reconstruction_count) sim.set_step_limit(step_limit_per_method) sim.set_sequence_length(sequence_length) # show the simulation parameters print 'simulation parameters:' print 'original tree:', NewickIO.get_newick_string(tree) print 'reconstruction count:', reconstruction_count print 'sequence length:', sequence_length # run the simulations print 'running the simulations...' for sim in sims: print 'running "%s"...' % sim.description try: sim.run() except HandlingError as e: print 'Error:', e # print the simulation data print 'simulation results:' for sim in sims: print sim.description + ':' print sim.get_histogram_string()
def get_response_content(fs):
    """
    Build a tree from a distance matrix using the selected criterion.

    @param fs: an object whose attributes hold the request arguments
    @return: the contents of the response buffer as a string
    @raise HandlingError: if the matrix has fewer than three rows,
    if the labels do not match the matrix rows or are not unique,
    if no criterion is selected,
    or if the selected splitter is too slow for a matrix of this size
    """
    # read the matrix
    D = fs.matrix
    nrows = len(D)
    if nrows < 3:
        raise HandlingError('the matrix should have at least three rows')
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if len(ordered_labels) != nrows:
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of rows in the matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_labels)) != len(ordered_labels):
        raise HandlingError('the ordered labels must be unique')
    # read the criterion string, creating the splitter object
    if fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj_general:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.nj_specific:
        # NOTE(review): presumably a splitter of None makes the tree builder
        # rely on its 'nj' fallback alone -- confirm against TreeBuilder.
        splitter = None
    else:
        # Fail with a readable message instead of leaving splitter unbound.
        raise HandlingError('no bipartition function was selected')
    # Make sure that the splitter object is appropriate
    # for the size of the distance matrix.
    # There is nothing to check when no splitter object is used;
    # calling get_complexity on None would raise an AttributeError.
    if splitter is not None and splitter.get_complexity(nrows) > 1000000:
        msg_a = 'use a smaller distance matrix '
        msg_b = 'or a faster bipartition function'
        raise HandlingError(msg_a + msg_b)
    # create the tree builder
    tree_builder = NeighborhoodJoining.TreeBuilder(
            D.tolist(), ordered_labels, splitter)
    tree_builder.set_fallback_name('nj')
    # define the response
    out = StringIO()
    # build the tree
    # NOTE(review): the built tree is discarded and nothing is written
    # to the response buffer, so the returned text is always empty --
    # confirm whether the tree was meant to be written to the response.
    tree_builder.build()
    # write the response
    return out.getvalue()
def get_response_content(fs):
    """
    Count informative, degenerate, and invalid splits over some trees.
    Each tree contributes one bipartition, computed by the selected
    splitter from the distance matrix of the tree; the distances are
    randomly perturbed first when fs.strength is nonzero.
    @param fs: an object whose attributes hold the request arguments
    @return: the three split counts as response text
    @raise HandlingError: if a tree fails a requirement,
    or if the computation is predicted to take too long
    """
    # parse and check one newick tree per nonempty line
    trees = []
    for line in iterutils.stripped_lines(StringIO(fs.trees)):
        parsed = NewickIO.parse(line, FelTree.NewickTree)
        names = [tip.get_name() for tip in parsed.gen_tips()]
        name_count = len(names)
        if name_count < 4:
            raise HandlingError(
                    'expected at least four tips but found ' + str(name_count))
        if any(name is None for name in names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(names)) != name_count:
            raise HandlingError('each terminal node label must be unique')
        trees.append(parsed)
    # pick the bipartition function named by the request
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # add up the predicted cost over all of the trees
    total_complexity = 0
    for tree in trees:
        ntips = len(list(tree.gen_tips()))
        total_complexity += ntips * splitter.get_complexity(ntips)
    if total_complexity > 1000000:
        raise HandlingError('this computation would take too long')
    # classify the split found for each tree
    n_informative = 0
    n_degenerate = 0
    n_invalid = 0
    for tree in trees:
        tips = list(tree.gen_tips())
        ntips = len(tips)
        D = tree.get_distance_matrix()
        if fs.strength:
            # Scale each off-diagonal distance by a lognormal factor,
            # keeping the perturbed matrix symmetric.
            P = [row[:] for row in D]
            for row_index in range(ntips):
                for col_index in range(row_index):
                    log_factor = random.normalvariate(0, fs.strength)
                    perturbed = D[row_index][col_index] * math.exp(log_factor)
                    P[row_index][col_index] = perturbed
                    P[col_index][row_index] = perturbed
        else:
            P = D
        selected_tips = [tips[i] for i in splitter.get_selection(P)]
        selected_count = len(selected_tips)
        smaller_side = min(selected_count, ntips - selected_count)
        if smaller_side < 2:
            # a side with fewer than two tips is counted as degenerate
            n_degenerate += 1
        elif tree.get_split_branch(selected_tips):
            n_informative += 1
        else:
            n_invalid += 1
    # write the counts
    out = StringIO()
    print >> out, n_informative, 'informative splits'
    print >> out, n_degenerate, 'degenerate splits'
    print >> out, n_invalid, 'invalid splits'
    return out.getvalue()
def do_command_line_analysis(options): """ Print some stuff to stdout, and show a progress bar on stderr. @param options: an object from optparse """ # load the tree, using the default tree if no filename was provided tree, tree_remark = get_tree_and_remark(options) # initialize the simulation objects sims = [ Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'), Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign cut with neighbor joining fallback'), Simulation(Clustering.RandomDMS(), 'nj', 'random partitioning') ] # possibly add the slow simulation if options.use_exact: sims.append( Simulation(Clustering.StoneExactDMS(), 'nj', 'exact criterion with neighbor joining fallback')) # define the simulation parameters reconstruction_count = options.nsamples sequence_length_string = options.sequence_length if sequence_length_string == 'inf': sequence_length = float('inf') else: sequence_length = int(sequence_length_string) inf_replacement = 20.0 if options.reject_inf: inf_replacement = None elif options.replace_inf: try: inf_replacement = float(options.replace_inf) except ValueError: msg = 'invalid replace_inf value: ' raise OptionError(msg + str(options.replace_inf)) zero_replacement = 0 if options.reject_zero: zero_replacement = None elif options.replace_zero: try: zero_replacement = float(options.replace_zero) except ValueError: msg = 'invalid replace_zero value: ' raise OptionError(msg + str(options.replace_zero)) # start the html file print '<html><body>' # show the simulation parameters print 'original tree source:', tree_remark, '<br/>' print 'reconstruction count:', reconstruction_count, '<br/>' print 'sequence length:', sequence_length, '<br/>' # set the simulation parameters for each simulation for sim in sims: sim.set_original_tree(tree) # If there is only one reconstruction per method # then show the progress of the tree builder. 
if reconstruction_count == 1: sim.set_verbose() # define an arbitrary but consistent ordering of the taxa ordered_names = [node.name for node in tree.gen_tips()] try: # attempt to simulate a bunch of distance matrices if options.verbose: print 'sampling', reconstruction_count, 'distance matrices...' # initialize the distance matrix sampler sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length) sampler.set_inf_replacement(inf_replacement) sampler.set_zero_replacement(zero_replacement) # start the progress bar pbar = Progress.Bar(1.0) # sample some distance matrices distance_matrices = [] for result in sampler.gen_samples_or_none(): # if we got a result then update the distance matrix list if result: sequence_list, D = result distance_matrices.append(D) # Update the progressbar regardless of whether or not # the proposal was accepted. remaining_acceptances = reconstruction_count - len( distance_matrices) numerator = sampler.get_completed_proposals() denominator = numerator + sampler.get_remaining_proposals( remaining_acceptances) dms_fraction = float(numerator) / float(denominator) dms_total = 1.0 / (1 + len(sims)) pbar.update(dms_fraction * dms_total) # if we have enough samples then break the loop if not remaining_acceptances: break # reconstruct trees using various methods for i, sim in enumerate(sims): if options.verbose: print 'running "%s"...' % sim.description sim.run(distance_matrices, ordered_names) pbar.update(float(i + 2) / float(1 + len(sims))) # stop the progress bar pbar.finish() # get the simulation data table = [('method', 'seconds', 'uniform loss', 'weighted loss')] for sim in sims: table.append((sim.description, sim.get_running_time(), sim.get_uniform_loss(), sim.get_deep_loss())) # convert the row major matrix into an html table print HtmlTable.get_table_string(table) # end the html file print '</html></body>' except KeyboardInterrupt: print 'interrupted stage', pbar.progress, 'of', pbar.high
def do_hard_coded_analysis_b(tree, tree_remark): """ Do a hardcoded analysis of tree reconstruction methods. Make R files of ordered reconstruction losses. @param tree: a tree object @param tree_remark: a string that is a comment about the tree """ # define an arbitrary order for the names of the leaves of the tree ordered_names = list(node.name for node in tree.gen_tips()) # use some replicates reconstruction_count = 100 # Make R files for reconstruction results from sequences # of some number of nucleotides in length. sequence_length = 2000 # define the tree reconstruction methods to be used sims = [ Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'), Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign') ] # set tree reconstruction parameters for sim in sims: sim.set_original_tree(tree) # initialize the distance matrix sampler sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names, sequence_length) sampler.set_inf_replacement(20.0) sampler.set_zero_replacement(0.0) # start the progress bar pbar = Progress.Bar(1.0) # sample some distance matrices distance_matrix_start_time = time.time() distance_matrices = [] for result in sampler.gen_samples_or_none(): # if we got a result then update the distance matrix list if result: sequence_list, D = result distance_matrices.append(D) # Update the progressbar regardless of whether or not # the proposal was accepted. 
remaining_acceptances = reconstruction_count - len(distance_matrices) numerator = sampler.get_completed_proposals() denominator = numerator + sampler.get_remaining_proposals( remaining_acceptances) dms_fraction = float(numerator) / float(denominator) dms_total = 1.0 / (1 + len(sims)) pbar.update(dms_fraction * dms_total) # if we have enough samples then break the loop if not remaining_acceptances: break distance_matrix_seconds = time.time() - distance_matrix_start_time # reconstruct trees using various methods reconstruction_seconds = [] for i, sim in enumerate(sims): reconstruction_start_time = time.time() print 'reconstructing', len(distance_matrices), 'trees' print 'using', sim.description sim.run(distance_matrices, ordered_names) pbar.update(float(i + 2) / float(1 + len(sims))) reconstruction_seconds.append(time.time() - reconstruction_start_time) # stop the progress bar pbar.finish() # consider the neighbor joining and the spectral sign results nj_sim, ss_sim = sims # extract the simulation data label_list_pairs = [ ('nj.unweighted', nj_sim.get_normalized_error_counts()), ('ss.unweighted', ss_sim.get_normalized_error_counts()), ('nj.weighted', nj_sim.get_normalized_loss_values()), ('ss.weighted', ss_sim.get_normalized_loss_values()) ] labels, transposed_table = zip(*label_list_pairs) table = zip(*transposed_table) table_string = RUtil.get_table_string(table, labels) # write the table filename = 'out3.table' with open(filename, 'w') as fout: print >> fout, '# tree source:', tree_remark print >> fout, '# number of taxa:', len(ordered_names) print >> fout, '# sampled distance matrices:', len(distance_matrices) print >> fout, '# sampling seconds elapsed:', distance_matrix_seconds print >> fout, '# sites per sequence:', sequence_length for sim, seconds in zip(sims, reconstruction_seconds): msg_a = '# seconds elapsed for tree reconstruction using ' msg_b = sim.description + ': ' + str(seconds) print >> fout, msg_a + msg_b print >> fout, table_string print 'wrote', 
filename
def get_response_content(fs):
    """
    Report how much sampling and reconstruction fits in a fixed time.
    Distance matrices are sampled until the time allotment is used up,
    and then trees are reconstructed from them under the same allotment.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    @raise HandlingError: if a tree reconstruction fails
    """
    # the seconds granted to each of the two stages
    allocated_seconds = 1
    # read the tree, the sequence length, and the leaf names
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    nsites = fs.sequence_length
    leaf_names = [tip.name for tip in tree.gen_tips()]
    # pick the bipartition function named by the request
    if fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # pick the sampler named by the request
    if fs.infinite_alleles:
        sampler = DMSampler.InfiniteAllelesSampler(tree, leaf_names, nsites)
    elif fs.jukes_cantor:
        sampler = DMSampler.DMSampler(tree, leaf_names, nsites)
    # configure the handling of infinite distance estimates
    if fs.reject_infinity:
        sampler.set_inf_replacement(None)
    elif fs.replace_infinity:
        sampler.set_inf_replacement(20)
    # configure the handling of zero distance estimates
    if fs.reject_zero:
        sampler.set_zero_replacement(None)
    elif fs.replace_zero:
        sampler.set_zero_replacement(0.00001)
    elif fs.remain_zero:
        sampler.set_zero_replacement(0.0)
    # sampling stage
    matrices = []
    stage_began = time.clock()
    sampling_seconds = 0
    for proposal in sampler.gen_samples_or_none():
        # a proposal of None was rejected
        if proposal is not None:
            _sequences, matrix = proposal
            matrices.append(matrix)
        # the clock is checked after every proposal
        sampling_seconds = time.clock() - stage_began
        if sampling_seconds >= allocated_seconds:
            break
    # reconstruction stage
    stage_began = time.clock()
    reconstructing_seconds = 0
    tree_count = 0
    for matrix in matrices:
        builder = NeighborhoodJoining.TreeBuilder(matrix, leaf_names, splitter)
        builder.set_fallback_name('nj')
        try:
            builder.build()
        except NeighborhoodJoining.NeighborhoodJoiningError as e:
            raise HandlingError(e)
        tree_count += 1
        # the clock is checked after every reconstructed tree
        reconstructing_seconds = time.clock() - stage_began
        if reconstructing_seconds >= allocated_seconds:
            break
    # write the report
    out = StringIO()
    if not matrices:
        print >> out, 'no distance matrices could be sampled'
        print >> out, 'in a reasonable amount of time'
        print >> out, sampler.proposed, \
                'distance matrices were proposed but were rejected'
        print >> out, sampler.proposals_with_zero, \
                'proposed distance matrices had estimates of zero'
        print >> out, sampler.proposals_with_inf, \
                'proposed distance matrices had estimates of infinity'
        return out.getvalue()
    print >> out, 'seconds to sample', len(matrices), \
            'distance matrices:', sampling_seconds
    if tree_count:
        print >> out, 'seconds to reconstruct', tree_count, \
                'trees:', reconstructing_seconds
    else:
        print >> out, 'no trees could be reconstructed', \
                'in a reasonable amount of time'
    return out.getvalue()
def get_response_content(fs):
    """
    Reconstruct trees from sampled distance matrices and tally the errors.
    @param fs: an object whose attributes hold the request arguments
    @return: a histogram of partition error counts as response text,
    followed by the reconstructed topologies when fs.showtrees is set
    @raise HandlingError: if the request is predicted to be too slow,
    or if fewer samples were accepted than were requested
    """
    step_limit = 1000000
    # pick the bipartition function named by the request
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # read the tree that the reconstructions are compared against
    original_tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # refuse requests whose predicted cost is too large
    tip_count = len(list(original_tree.gen_tips()))
    if splitter.get_complexity(tip_count) * fs.iterations > 1000000:
        raise HandlingError(
                'use a faster bipartition function, fewer taxa, '
                'or fewer tree reconstructions')
    # set up the sampler over the tips in tree order
    leaf_names = [tip.name for tip in original_tree.gen_tips()]
    sampler = DMSampler(original_tree, leaf_names, fs.length)
    # reconstruct one tree per sampled distance matrix
    scored_trees = []
    histogram = {}
    samples = sampler.gen_distance_matrices(fs.iterations, step_limit)
    for _sequences, matrix in samples:
        builder = NeighborhoodJoining.ValidatingTreeBuilder(
                matrix, leaf_names, splitter)
        # the recourse selection decides the fallback method, if any
        if fs.njrecourse:
            builder.set_fallback_name('nj')
        elif fs.halvingrecourse:
            builder.set_fallback_name('halving')
        builder.set_original_tree(original_tree)
        rebuilt = builder.build()
        # tally the partition errors made during this reconstruction
        errors = builder.get_mismatch_count()
        histogram[errors] = histogram.get(errors, 0) + 1
        # keep the topology, without branch lengths, only when asked to
        if fs.showtrees:
            for node in rebuilt.preorder():
                node.set_branch_length(None)
            scored_trees.append((errors, rebuilt))
    # too few accepted samples means that the sampler gave up early
    if sampler.accepted_sample_count < fs.iterations:
        raise HandlingError(sampler.get_sampling_error_message())
    # write the histogram, including the counts that never occurred
    out = StringIO()
    print >> out, 'partition error count frequencies:'
    for error_count in range(max(histogram) + 1):
        print >> out, error_count, ':', histogram.get(error_count, 0)
    if fs.showtrees:
        print >> out, ''
        print >> out, 'reconstructed tree topologies with mismatch counts:'
        for errors, rebuilt in sorted(scored_trees):
            print >> out, NewickIO.get_newick_string(rebuilt), errors
    return out.getvalue()