Python Clustering.NeighborJoiningDMS示例，Clustering.NeighborJoiningDMS, collective.machinelearning Python示例

示例#1

0

显示文件

文件： 20080903a.py 项目： BIGtigr/xgcode

def get_response_content(fs):
    """
    Simulate distance matrices from a tree and summarize reconstruction errors.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    @raise HandlingError: if no criterion is selected or if the request
    is predicted to take too many steps
    """
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    else:
        # Without this guard an unset criterion would surface below
        # as an UnboundLocalError instead of a reportable error.
        raise HandlingError('no bipartition function was selected')
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # define the maximum number of steps we want
    max_steps = 1000000
    # Make sure that the splitter object is appropriate
    # for the number of taxa and the number of tree reconstructions.
    ntaxa = len(list(tree.gen_tips()))
    if splitter.get_complexity(ntaxa) * fs.iterations > max_steps:
        msg_a = 'use a faster bipartition function, '
        msg_b = 'fewer taxa, or fewer tree reconstructions'
        raise HandlingError(msg_a + msg_b)
    # define the simulation parameters
    sim = Simulation(splitter, 'nj', 'cgi tree building simulation')
    sim.set_original_tree(tree)
    sim.set_step_limit(max_steps)
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    # attempt to simulate a bunch of distance matrices
    sampler = DMSampler.DMSampler(tree, ordered_names, fs.length)
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if a proposal was accepted then add it to the list
        if result:
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
        # if enough accepted samples have been generated then stop sampling
        remaining_acceptances = fs.iterations - len(distance_matrices)
        if not remaining_acceptances:
            break
        # If the remaining number of computrons is predicted
        # to be too much then stop.
        if sampler.get_remaining_computrons(remaining_acceptances) > max_steps:
            msg_a = 'this combination of parameters '
            msg_b = 'is predicted to take too long'
            # The message is the concatenation of both halves;
            # the name `msg` was never defined in this function.
            raise HandlingError(msg_a + msg_b)
    sim.run(distance_matrices, ordered_names)
    # define the response
    out = StringIO()
    print >> out, 'partition error count frequencies:'
    print >> out, sim.get_histogram_string()
    print >> out, ''
    print >> out, 'weighted partition errors:', sim.get_deep_loss()
    # return the response
    return out.getvalue()

示例#2

0

显示文件

文件： 20080903a.py 项目： BIGtigr/xgcode

def do_hard_coded_analysis_a(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make a bunch of R files.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use 1000 replicates
    reconstruction_count = 1000
    # Make R files for reconstruction results
    # from sequences 100 and 500 nucleotides long.
    for sequence_length in (100, 500):
        # sample distance matrices
        print 'sampling', reconstruction_count, 'distance matrices'
        print 'from alignments of length', sequence_length
        sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length)
        distance_matrices = []
        for result in sampler.gen_samples_or_none():
            # if the proposal was rejected then try again
            if not result:
                continue
            # add the accepted distance matrix sample to the list
            sequence_list, distance_matrix = result
            distance_matrices.append(distance_matrix)
            # stop when we have generated enough distance matrices
            if len(distance_matrices) == reconstruction_count:
                break
        # run both neighbor joining and spectral sign clustering
        sims = [
            Simulation(Clustering.NeighborJoiningDMS(), 'nj',
                       'neighbor joining'),
            Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                       'spectral sign')
        ]
        for sim in sims:
            print 'reconstructing', len(distance_matrices), 'trees'
            print 'using', sim.description
            sim.set_original_tree(tree)
            sim.run(distance_matrices, ordered_names)
        # consider the neighbor joining and the spectral sign results
        nj_sim, ss_sim = sims
        # write the uniform loss function comparison R script
        script_contents = R_helper(nj_sim.get_normalized_error_counts(),
                                   ss_sim.get_normalized_error_counts())
        filename = 'uniform_%d.R' % sequence_length
        with open(filename, 'w') as fout:
            print >> fout, script_contents
        # write the weighted loss function comparison R script
        script_contents = R_helper(nj_sim.get_normalized_loss_values(),
                                   ss_sim.get_normalized_loss_values())
        filename = 'weighted_%d.R' % sequence_length
        with open(filename, 'w') as fout:
            print >> fout, script_contents

示例#3

0

显示文件

文件： 20080703c.py 项目： BIGtigr/xgcode

def get_response_content(fs):
    """
    Build a tree from a distance matrix while validating against a given tree.
    Whatever the validating tree builder writes to its output stream
    is returned as the response.
    @param fs: an object whose attributes hold the request arguments
    @return: the response text
    @raise HandlingError: if the matrix, labels, and tree are inconsistent
    or if the chosen splitter is too slow for the matrix size
    """
    D = fs.matrix
    nrows = len(D)
    if nrows < 3:
        raise HandlingError('the matrix should have at least three rows')
    # the labels name the matrix rows, so they must match in number
    # and must not repeat
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    nlabels = len(ordered_labels)
    if nlabels != nrows:
        raise HandlingError(
            'the number of ordered labels should be the same '
            'as the number of rows in the matrix')
    label_set = set(ordered_labels)
    if len(label_set) != nlabels:
        raise HandlingError('the ordered labels must be unique')
    # pick the splitter that corresponds to the selected criterion
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    # refuse requests whose predicted cost is too high
    if splitter.get_complexity(nrows) > 1000000:
        raise HandlingError(
            'use a smaller distance matrix or a faster bipartition function')
    # the reference tree must have exactly the labeled taxa as its tips
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    tips = list(tree.gen_tips())
    if nlabels != len(tips):
        raise HandlingError(
            'the number of ordered labels should be the same '
            'as the number of tips in the tree')
    if set(tip.name for tip in tips) != label_set:
        raise HandlingError(
            'the leaf labels of the tree do not match '
            'the ordered labels of the distance matrix rows')
    # configure the validating tree builder
    builder = NeighborhoodJoining.ValidatingTreeBuilder(
        D.tolist(), ordered_labels, splitter)
    if fs.njrecourse:
        builder.set_fallback_name('nj')
    elif fs.halvingrecourse:
        builder.set_fallback_name('halving')
    # the builder reports into this stream, which becomes the response
    out = StringIO()
    builder.set_original_tree(tree)
    builder.set_output_stream(out)
    builder.build()
    return out.getvalue()

示例#4

0

显示文件

文件： 20080828a.py 项目： BIGtigr/xgcode

def main():
    """
    Run some tree reconstructions from the command line.
    """
    # initialize the simulation objects
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.RandomDMS(), 'nj', 'random partitioning'),
        Simulation(Clustering.StoneExactDMS(), 'nj',
                   'exact criterion with neighbor joining fallback'),
        #Simulation(Clustering.StoneExactDMS(),
        #'halving', 'exact criterion with stem halving fallback'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                   'spectral sign cut with neighbor joining fallback')
        #Simulation(Clustering.StoneSpectralSignDMS(),
        #'halving', 'spectral sign cut with stem halving fallback')
    ]
    # define the simulation parameters
    tree = get_default_original_tree()
    reconstruction_count = 1000
    sequence_length = 100
    step_limit_per_method = 10000000
    # set the simulation parameters
    for sim in sims:
        sim.set_original_tree(get_default_original_tree())
        sim.set_reconstruction_count(reconstruction_count)
        sim.set_step_limit(step_limit_per_method)
        sim.set_sequence_length(sequence_length)
    # show the simulation parameters
    print 'simulation parameters:'
    print 'original tree:', NewickIO.get_newick_string(tree)
    print 'reconstruction count:', reconstruction_count
    print 'sequence length:', sequence_length
    # run the simulations
    print 'running the simulations...'
    for sim in sims:
        print 'running "%s"...' % sim.description
        try:
            sim.run()
        except HandlingError as e:
            print 'Error:', e
    # print the simulation data
    print 'simulation results:'
    for sim in sims:
        print sim.description + ':'
        print sim.get_histogram_string()

示例#5

0

显示文件

def get_response_content(fs):
    """
    Build a tree from a distance matrix using the selected splitting method.
    @param fs: an object whose attributes hold the request arguments
    @return: the response text
    @raise HandlingError: if the matrix and labels are inconsistent,
    if no criterion is selected, or if the splitter is too slow
    """
    # read the matrix
    D = fs.matrix
    if len(D) < 3:
        raise HandlingError('the matrix should have at least three rows')
    # read the ordered labels
    ordered_labels = Util.get_stripped_lines(StringIO(fs.labels))
    if len(ordered_labels) != len(D):
        msg_a = 'the number of ordered labels should be the same '
        msg_b = 'as the number of rows in the matrix'
        raise HandlingError(msg_a + msg_b)
    if len(set(ordered_labels)) != len(ordered_labels):
        raise HandlingError('the ordered labels must be unique')
    # read the criterion string, creating the splitter object
    if fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj_general:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.nj_specific:
        # No splitter object is used for this choice.
        # NOTE(review): presumably TreeBuilder treats a None splitter
        # as its built-in neighbor joining -- confirm against TreeBuilder.
        splitter = None
    else:
        # Without this guard an unset criterion would surface below
        # as an UnboundLocalError instead of a reportable error.
        raise HandlingError('no bipartition function was selected')
    # Make sure that the splitter object is appropriate
    # for the size of the distance matrix.
    # There is nothing to check when no splitter object is used;
    # calling get_complexity on None would raise an AttributeError.
    if splitter is not None and splitter.get_complexity(len(D)) > 1000000:
        msg_a = 'use a smaller distance matrix '
        msg_b = 'or a faster bipartition function'
        raise HandlingError(msg_a + msg_b)
    # create the tree builder
    tree_builder = NeighborhoodJoining.TreeBuilder(D.tolist(), ordered_labels,
                                                   splitter)
    tree_builder.set_fallback_name('nj')
    # define the response
    out = StringIO()
    # build the tree
    tree = tree_builder.build()
    # NOTE(review): nothing is written to `out` and the built tree is not
    # used, so the response is always empty -- confirm whether the tree
    # was meant to be written to the response here.
    return out.getvalue()

示例#6

0

显示文件

def get_response_content(fs):
    """
    Count how the first split of each tree compares with the tree itself.
    The distance matrix of each input tree, optionally perturbed,
    is split once and the split is classified as informative,
    degenerate, or invalid.
    @param fs: an object whose attributes hold the request arguments
    @return: the response text
    @raise HandlingError: if a tree is unsuitable or the work is too large
    """
    # parse and validate one newick tree per stripped input line
    trees = []
    for line in iterutils.stripped_lines(StringIO(fs.trees)):
        tree = NewickIO.parse(line, FelTree.NewickTree)
        names = [tip.get_name() for tip in tree.gen_tips()]
        name_count = len(names)
        if name_count < 4:
            raise HandlingError(
                'expected at least four tips but found ' + str(name_count))
        if any(name is None for name in names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(names)) != name_count:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # the first selected criterion, in this order, decides the splitter
    criteria = (
        ('exact', 'StoneExactDMS'),
        ('sign', 'StoneSpectralSignDMS'),
        ('threshold', 'StoneSpectralThresholdDMS'),
        ('nj', 'NeighborJoiningDMS'),
        ('random', 'RandomDMS'),
    )
    for flag_name, class_name in criteria:
        if getattr(fs, flag_name):
            splitter = getattr(Clustering, class_name)()
            break
    # refuse the request if the summed predicted cost is too high
    tip_counts = [len(list(tree.gen_tips())) for tree in trees]
    complexity = sum(n * splitter.get_complexity(n) for n in tip_counts)
    if complexity > 1000000:
        raise HandlingError('this computation would take too long')
    # classify the split of each tree
    informative = 0
    degenerate = 0
    invalid = 0
    for tree in trees:
        tips = list(tree.gen_tips())
        n = len(tips)
        D = tree.get_distance_matrix()
        if fs.strength:
            # perturb each off-diagonal distance by a multiplicative factor
            # exp(x) with x drawn from a normal distribution,
            # keeping the matrix symmetric
            P = [row[:] for row in D]
            for i in range(n):
                for j in range(i):
                    factor = math.exp(random.normalvariate(0, fs.strength))
                    P[i][j] = P[j][i] = D[i][j] * factor
        else:
            P = D
        chosen_tips = [tips[index] for index in splitter.get_selection(P)]
        chosen_count = len(chosen_tips)
        # a side of the split with fewer than two tips is degenerate
        if min(chosen_count, n - chosen_count) < 2:
            degenerate += 1
        elif tree.get_split_branch(chosen_tips):
            informative += 1
        else:
            invalid += 1
    # write the three counts as the response
    out = StringIO()
    print >> out, informative, 'informative splits'
    print >> out, degenerate, 'degenerate splits'
    print >> out, invalid, 'invalid splits'
    return out.getvalue()

示例#7

0

显示文件

文件： 20080903a.py 项目： BIGtigr/xgcode

def do_command_line_analysis(options):
    """
    Print some stuff to stdout, and show a progress bar on stderr.
    @param options: an object from optparse
    """
    # load the tree, using the default tree if no filename was provided
    tree, tree_remark = get_tree_and_remark(options)
    # initialize the simulation objects
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                   'spectral sign cut with neighbor joining fallback'),
        Simulation(Clustering.RandomDMS(), 'nj', 'random partitioning')
    ]
    # possibly add the slow simulation
    if options.use_exact:
        sims.append(
            Simulation(Clustering.StoneExactDMS(), 'nj',
                       'exact criterion with neighbor joining fallback'))
    # define the simulation parameters
    reconstruction_count = options.nsamples
    sequence_length_string = options.sequence_length
    if sequence_length_string == 'inf':
        sequence_length = float('inf')
    else:
        sequence_length = int(sequence_length_string)
    inf_replacement = 20.0
    if options.reject_inf:
        inf_replacement = None
    elif options.replace_inf:
        try:
            inf_replacement = float(options.replace_inf)
        except ValueError:
            msg = 'invalid replace_inf value: '
            raise OptionError(msg + str(options.replace_inf))
    zero_replacement = 0
    if options.reject_zero:
        zero_replacement = None
    elif options.replace_zero:
        try:
            zero_replacement = float(options.replace_zero)
        except ValueError:
            msg = 'invalid replace_zero value: '
            raise OptionError(msg + str(options.replace_zero))
    # start the html file
    print '<html><body>'
    # show the simulation parameters
    print 'original tree source:', tree_remark, '<br/>'
    print 'reconstruction count:', reconstruction_count, '<br/>'
    print 'sequence length:', sequence_length, '<br/>'
    # set the simulation parameters for each simulation
    for sim in sims:
        sim.set_original_tree(tree)
        # If there is only one reconstruction per method
        # then show the progress of the tree builder.
        if reconstruction_count == 1:
            sim.set_verbose()
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    try:
        # attempt to simulate a bunch of distance matrices
        if options.verbose:
            print 'sampling', reconstruction_count, 'distance matrices...'
        # initialize the distance matrix sampler
        sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length)
        sampler.set_inf_replacement(inf_replacement)
        sampler.set_zero_replacement(zero_replacement)
        # start the progress bar
        pbar = Progress.Bar(1.0)
        # sample some distance matrices
        distance_matrices = []
        for result in sampler.gen_samples_or_none():
            # if we got a result then update the distance matrix list
            if result:
                sequence_list, D = result
                distance_matrices.append(D)
            # Update the progressbar regardless of whether or not
            # the proposal was accepted.
            remaining_acceptances = reconstruction_count - len(
                distance_matrices)
            numerator = sampler.get_completed_proposals()
            denominator = numerator + sampler.get_remaining_proposals(
                remaining_acceptances)
            dms_fraction = float(numerator) / float(denominator)
            dms_total = 1.0 / (1 + len(sims))
            pbar.update(dms_fraction * dms_total)
            # if we have enough samples then break the loop
            if not remaining_acceptances:
                break
        # reconstruct trees using various methods
        for i, sim in enumerate(sims):
            if options.verbose:
                print 'running "%s"...' % sim.description
            sim.run(distance_matrices, ordered_names)
            pbar.update(float(i + 2) / float(1 + len(sims)))
        # stop the progress bar
        pbar.finish()
        # get the simulation data
        table = [('method', 'seconds', 'uniform loss', 'weighted loss')]
        for sim in sims:
            table.append((sim.description, sim.get_running_time(),
                          sim.get_uniform_loss(), sim.get_deep_loss()))
        # convert the row major matrix into an html table
        print HtmlTable.get_table_string(table)
        # end the html file
        print '</html></body>'
    except KeyboardInterrupt:
        print 'interrupted stage', pbar.progress, 'of', pbar.high

示例#8

0

显示文件

文件： 20080903a.py 项目： BIGtigr/xgcode

def do_hard_coded_analysis_b(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make R files of ordered reconstruction losses.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use some replicates
    reconstruction_count = 100
    # Make R files for reconstruction results from sequences
    # of some number of nucleotides in length.
    sequence_length = 2000
    # define the tree reconstruction methods to be used
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign')
    ]
    # set tree reconstruction parameters
    for sim in sims:
        sim.set_original_tree(tree)
    # initialize the distance matrix sampler
    sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names,
                                               sequence_length)
    sampler.set_inf_replacement(20.0)
    sampler.set_zero_replacement(0.0)
    # start the progress bar
    pbar = Progress.Bar(1.0)
    # sample some distance matrices
    distance_matrix_start_time = time.time()
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if we got a result then update the distance matrix list
        if result:
            sequence_list, D = result
            distance_matrices.append(D)
        # Update the progressbar regardless of whether or not
        # the proposal was accepted.
        remaining_acceptances = reconstruction_count - len(distance_matrices)
        numerator = sampler.get_completed_proposals()
        denominator = numerator + sampler.get_remaining_proposals(
            remaining_acceptances)
        dms_fraction = float(numerator) / float(denominator)
        dms_total = 1.0 / (1 + len(sims))
        pbar.update(dms_fraction * dms_total)
        # if we have enough samples then break the loop
        if not remaining_acceptances:
            break
    distance_matrix_seconds = time.time() - distance_matrix_start_time
    # reconstruct trees using various methods
    reconstruction_seconds = []
    for i, sim in enumerate(sims):
        reconstruction_start_time = time.time()
        print 'reconstructing', len(distance_matrices), 'trees'
        print 'using', sim.description
        sim.run(distance_matrices, ordered_names)
        pbar.update(float(i + 2) / float(1 + len(sims)))
        reconstruction_seconds.append(time.time() - reconstruction_start_time)
    # stop the progress bar
    pbar.finish()
    # consider the neighbor joining and the spectral sign results
    nj_sim, ss_sim = sims
    # extract the simulation data
    label_list_pairs = [
        ('nj.unweighted', nj_sim.get_normalized_error_counts()),
        ('ss.unweighted', ss_sim.get_normalized_error_counts()),
        ('nj.weighted', nj_sim.get_normalized_loss_values()),
        ('ss.weighted', ss_sim.get_normalized_loss_values())
    ]
    labels, transposed_table = zip(*label_list_pairs)
    table = zip(*transposed_table)
    table_string = RUtil.get_table_string(table, labels)
    # write the table
    filename = 'out3.table'
    with open(filename, 'w') as fout:
        print >> fout, '# tree source:', tree_remark
        print >> fout, '# number of taxa:', len(ordered_names)
        print >> fout, '# sampled distance matrices:', len(distance_matrices)
        print >> fout, '# sampling seconds elapsed:', distance_matrix_seconds
        print >> fout, '# sites per sequence:', sequence_length
        for sim, seconds in zip(sims, reconstruction_seconds):
            msg_a = '# seconds elapsed for tree reconstruction using '
            msg_b = sim.description + ': ' + str(seconds)
            print >> fout, msg_a + msg_b
        print >> fout, table_string
    print 'wrote', filename

示例#9

0

显示文件

文件： 20080918a.py 项目： BIGtigr/xgcode

def get_response_content(fs):
    """
    Report how much sampling and reconstruction fits into a time budget.
    Distance matrices are sampled until the budget is used up,
    then trees are reconstructed from them under the same budget.
    @param fs: a FieldStorage object containing the cgi arguments
    @return: the response text
    @raise HandlingError: if a tree reconstruction fails
    """
    # read the tree, the sequence length, and an arbitrary leaf order
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    nsites = fs.sequence_length
    leaf_names = [tip.name for tip in tree.gen_tips()]
    # pick the splitter that corresponds to the selected criterion
    if fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # pick the sampler that corresponds to the selected model
    if fs.infinite_alleles:
        sampler = DMSampler.InfiniteAllelesSampler(tree, leaf_names, nsites)
    elif fs.jukes_cantor:
        sampler = DMSampler.DMSampler(tree, leaf_names, nsites)
    # a replacement of None is set when the reject option is chosen
    if fs.reject_infinity:
        sampler.set_inf_replacement(None)
    elif fs.replace_infinity:
        sampler.set_inf_replacement(20)
    if fs.reject_zero:
        sampler.set_zero_replacement(None)
    elif fs.replace_zero:
        sampler.set_zero_replacement(0.00001)
    elif fs.remain_zero:
        sampler.set_zero_replacement(0.0)
    # the same budget applies to sampling and to reconstruction
    budget_seconds = 1
    # sample distance matrices until the budget is spent
    matrices = []
    sampling_seconds = 0
    sampling_start = time.clock()
    for sample in sampler.gen_samples_or_none():
        # rejected proposals arrive as None
        if sample is not None:
            _sequences, matrix = sample
            matrices.append(matrix)
        sampling_seconds = time.clock() - sampling_start
        if sampling_seconds >= budget_seconds:
            break
    # reconstruct trees until the budget is spent or the matrices run out
    rebuilt_count = 0
    reconstructing_seconds = 0
    rebuilding_start = time.clock()
    for matrix in matrices:
        builder = NeighborhoodJoining.TreeBuilder(matrix, leaf_names,
                                                  splitter)
        builder.set_fallback_name('nj')
        try:
            builder.build()
        except NeighborhoodJoining.NeighborhoodJoiningError as e:
            raise HandlingError(e)
        rebuilt_count += 1
        reconstructing_seconds = time.clock() - rebuilding_start
        if reconstructing_seconds >= budget_seconds:
            break
    # A print that ends with a comma continues on the same output line.
    out = StringIO()
    if not matrices:
        print >> out, 'no distance matrices could be sampled'
        print >> out, 'in a reasonable amount of time'
        print >> out, sampler.proposed,
        print >> out, 'distance matrices were proposed but were rejected'
        print >> out, sampler.proposals_with_zero,
        print >> out, 'proposed distance matrices had estimates of zero'
        print >> out, sampler.proposals_with_inf,
        print >> out, 'proposed distance matrices had estimates of infinity'
    else:
        print >> out, 'seconds to sample', len(matrices),
        print >> out, 'distance matrices:', sampling_seconds
        if rebuilt_count:
            print >> out, 'seconds to reconstruct', rebuilt_count,
            print >> out, 'trees:', reconstructing_seconds
        else:
            print >> out, 'no trees could be reconstructed',
            print >> out, 'in a reasonable amount of time'
    return out.getvalue()

示例#10

0

显示文件

文件： 20080828a.py 项目： BIGtigr/xgcode

def get_response_content(fs):
    """
    Reconstruct trees from sampled distance matrices and tally the errors.
    @param fs: an object whose attributes hold the request arguments
    @return: the response text
    @raise HandlingError: if the request is predicted to take too long
    or if too few distance matrices could be sampled
    """
    # read the criterion string, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    # read the original tree
    tree = NewickIO.parse(fs.tree, FelTree.NewickTree)
    # the same limit bounds the predicted splitter cost and the sampling
    max_steps = 1000000
    # Make sure that the splitter object is appropriate for the number
    # of taxa and the number of tree reconstructions.
    ntaxa = len(list(tree.gen_tips()))
    if splitter.get_complexity(ntaxa) * fs.iterations > max_steps:
        msg_a = 'use a faster bipartition function, fewer taxa, '
        msg_b = 'or fewer tree reconstructions'
        raise HandlingError(msg_a + msg_b)
    # sample a bunch of sequences
    ordered_names = [node.name for node in tree.gen_tips()]
    sampler = DMSampler(tree, ordered_names, fs.length)
    # simulate a bunch of distance matrices and reconstruct the trees
    mismatch_count_tree_pairs = []
    # maps a mismatch count to the number of reconstructions that had it
    error_count_histogram = {}
    for sequence_list, distance_matrix in sampler.gen_distance_matrices(
            fs.iterations, max_steps):
        # create the tree builder
        tree_builder = NeighborhoodJoining.ValidatingTreeBuilder(
            distance_matrix, ordered_names, splitter)
        # Read the recourse string and set the corresponding method
        # in the tree builder.
        if fs.njrecourse:
            tree_builder.set_fallback_name('nj')
        elif fs.halvingrecourse:
            tree_builder.set_fallback_name('halving')
        # set parameters of the tree validating tree builder
        tree_builder.set_original_tree(tree)
        # build the tree
        reconstructed_tree = tree_builder.build()
        # note the number of partition errors during the reconstruction
        mismatch_count = tree_builder.get_mismatch_count()
        error_count_histogram[mismatch_count] = (
            error_count_histogram.get(mismatch_count, 0) + 1)
        # If we are saving the reconstructed trees
        # then remove branch lengths and add to the tree list.
        if fs.showtrees:
            for node in reconstructed_tree.preorder():
                node.set_branch_length(None)
            mismatch_count_tree_pairs.append(
                (mismatch_count, reconstructed_tree))
    # See if we bailed early because
    # the sampling was predicted to take too long.
    if sampler.accepted_sample_count < fs.iterations:
        raise HandlingError(sampler.get_sampling_error_message())
    # define the response
    out = StringIO()
    print >> out, 'partition error count frequencies:'
    # The histogram is empty when no trees were reconstructed,
    # and max() of an empty dict would raise a ValueError.
    if error_count_histogram:
        max_mismatch_count = max(error_count_histogram)
        for i in range(max_mismatch_count + 1):
            frequency = error_count_histogram.get(i, 0)
            print >> out, i, ':', frequency
    if fs.showtrees:
        print >> out, ''
        print >> out, 'reconstructed tree topologies with mismatch counts:'
        # The pairs are sorted as tuples, so pairs with equal mismatch
        # counts are ordered by comparing the tree objects themselves.
        for mismatch_count, tree in sorted(mismatch_count_tree_pairs):
            print >> out, NewickIO.get_newick_string(tree), mismatch_count
    # return the response
    return out.getvalue()