def main2():
    """Visualize a recon graph and each of its depth-based splits.

    Reads command-line args, builds the reconciliation graph, saves the
    whole graph as "original.png", then saves each split as "<index>.png".
    """
    args = process_args()
    # NOTE(review): unpacks 5 values here, while other call sites of
    # ClusterUtil.get_tree_info in this file unpack 6 (e.g. main) —
    # confirm which get_tree_info signature this is meant to call.
    gene_tree, species_tree, gene_root, recon_g, mpr_count = \
        ClusterUtil.get_tree_info(args.input, args.d,args.t,args.l)
    RV.visualizeAndSave(recon_g, "original.png")
    gs = ClusterUtil.full_split(recon_g, gene_root, args.depth)
    for i, g in enumerate(gs):
        RV.visualizeAndSave(g, "{}.png".format(i))
Exemplo n.º 2
0
def clusterize(month, year):
    """Run every configured clustering algorithm for the given month/year.

    Stores month/year into Constants, prepares the normalized data once,
    then for each of the 7 algorithms in Constants.CLUSTERS: clusters the
    data, prints per-cluster statistics, and prints quality evaluations.
    """
    Constants.MONTH = month
    Constants.YEAR = year
    print(
        "\n\n================================================================Clusterização para Mes: {} Ano: {}\n"
        .format(Constants.MONTH, Constants.YEAR))

    arrayNormalizedCharacters, scaler = prepareClusterization(
        month, year, False, False)

    def report_time(label):
        # Print the current wall-clock time after the given label.
        print(label, datetime.now().strftime("%H:%M:%S"))

    for algo_idx in range(7):
        algorithm = Constants.CLUSTERS[algo_idx]
        print(
            "================================================================Aplicando Algoritmo de Clusterização: ",
            algorithm, "\n")

        clustersResult, labels = Cluster.getCluster(
            arrayNormalizedCharacters, scaler, algorithm)

        report_time("Hora de Finalização da Clusterização = ")

        print(
            "\n================================================================Clusterização Aplicada\n"
        )

        # Analyze the characteristics of each cluster: mean, variance,
        # extremes, etc.
        ClusterUtil.getDescriptions(clustersResult, algorithm,
                                    Constants.MONTH, Constants.YEAR)

        print(
            "\n================================================================Análise Estatística Aplicada\n"
        )

        # Assess the quality of the clustering.
        ClusterUtil.printEvaluations(arrayNormalizedCharacters, labels,
                                     algorithm)

        print(
            "\n================================================================Avaliação Completa\n"
        )
    report_time("Hora de Fim = ")
    print("\n")
def cluster(recon_g, gene_root, score, mpr_count, args, max_splits):
    """
    Choose (and perform) a clustering based on whether depth or nmprs is specified.
    :param recon_g <recon_graph>
    :param gene_root <node>
    :param score <function <recon_g>-><float>>
    :param mpr_count <int> - the number of MPRs in recon_g
    :param args <namespace> - exactly one of args.depth / args.nmprs must be set
    :param max_splits <int> - the maximum number of splits to consider before
        assuming that the operation will time out.
    :return ([<recon_graph>], [<float>]) - refer to ClusterUtil.cluster_graph_n
    :raises ValueError: if neither args.depth nor args.nmprs is set
    """
    if args.depth is not None:
        return ClusterUtil.cluster_graph(recon_g, gene_root, score, args.depth,
                                         args.k, max_splits)
    if args.nmprs is not None:
        return ClusterUtil.cluster_graph_n(recon_g, gene_root, score,
                                           args.nmprs, mpr_count, args.k,
                                           max_splits)
    # `assert False` would be silently stripped under `python -O`; raise an
    # explicit error so misconfigured args always fail loudly.
    raise ValueError("Either args.depth or args.nmprs must be specified")
def get_times(tree_files,
              mk_score,
              args,
              timeout=1200,
              min_mprs=1000,
              max_splits=200):
    """
    Collect timing data from the given trees
    :param tree_files [<Path>] - the tree files to analyze
    :param mk_score <function> - refer e.g. ClusterUtil.mk_support_score
    :param args <namespace>
    :param timeout <int> - the number of seconds to use before timing out
    :param min_mprs <int> - the minimum number of MPRs a reconciliation should induce
        before it is used in the data
    :param max_splits <int> - if more than max_splits are induced, we assume that the
        computation of clusters would have timed out.
    :return (nmprs, times, n_timed_out) ([<int>], [<float>], <int>) - nmprs is a list
        of the number of MPRs for each recon. times is a list of the computation times
        for each recon. n_timed_out is the number that timed out (or induced more splits
        than max_splits).
    :return ftrees [<Path>] - the list of trees. This is ordered so that the data for
        the tree at index i in ftrees is at index i in nmprs and times
    """
    nmprs = []
    times = []
    ftrees = []
    n_timed_out = 0
    n = len(tree_files)
    for (i, f) in enumerate(tree_files):
        # Progress line: which file, and how far through the list we are.
        print("{}: {}/{}".format(f, i + 1, n))
        # Get the recon graph + other info
        gene_tree, species_tree, gene_root, recon_g, mpr_count, _ = \
            ClusterUtil.get_tree_info(str(f), args.d,args.t,args.l)
        # Only care about trees with a certain number of MPRs
        if mpr_count < min_mprs:
            continue
        score = mk_score(species_tree, gene_tree, gene_root)
        # Time only the clustering itself (not graph construction above).
        start = time.time()
        t = timeout_cluster(recon_g, gene_root, score, mpr_count, args,
                            timeout, max_splits)
        end = time.time()
        elapsed = end - start
        # timeout_cluster returns None on timeout; such trees are counted
        # in n_timed_out but excluded from nmprs/times/ftrees.
        if t is None:
            print("{} timed out".format(f))
            n_timed_out += 1
            continue
        nmprs.append(mpr_count)
        times.append(elapsed)
        ftrees.append(f)
    return (nmprs, times, n_timed_out), ftrees
def main():
    """Cluster the recon graph for the input file and report the improvement.

    Builds the reconciliation graph from the command-line arguments,
    clusters it via the `cluster` dispatcher (depth- or nmprs-based),
    optionally visualizes the clusters and prints their medians, then
    prints the score of the original graph, the score of the clustering,
    and the improvement between them.

    :raises ValueError: if neither --support nor --pdv is specified
    """
    args = process_args()
    # Choose the distance-metric factory used both to cluster and to score.
    if args.support:
        mk_score = ClusterUtil.mk_support_score
    elif args.pdv:
        mk_score = ClusterUtil.mk_pdv_score
    else:
        # An explicit error instead of `assert False`, which disappears
        # under `python -O`.
        raise ValueError("Either support or pdv must be specified")
    # Get the recon graph + other info
    gene_tree, species_tree, gene_root, recon_g, mpr_count, best_roots = \
        ClusterUtil.get_tree_info(args.input, args.d, args.t, args.l)

    mpr_counter = ClusterUtil.mk_count_mprs(gene_root)
    # Make the distance metric for these specific trees
    score = mk_score(species_tree, gene_tree, gene_root)
    # Perform the clustering via the shared `cluster` dispatcher instead of
    # duplicating its depth/nmprs branching here (200 = max splits to
    # consider before assuming a timeout).
    graphs, scores, _ = cluster(recon_g, gene_root, score, mpr_count, args, 200)
    # Visualization
    if args.pdv_vis:
        pdv_vis(species_tree, gene_tree, gene_root, recon_g, graphs, args)
    if args.support_vis:
        support_vis(species_tree, gene_tree, gene_root, recon_g, graphs, args)
    if args.medians:
        get_median = mk_get_median(gene_tree, species_tree, gene_root, best_roots)
        for i, g in enumerate(graphs):
            m = get_median(g)
            print("Median for Cluster {}:".format(i))
            # TODO: print to a better file format?
            print(m)
    # Statistics: compare the score of the single un-clustered graph with
    # the score of the k clusters.
    one_score = ClusterUtil.get_score_nodp([recon_g], score, mpr_counter)
    k_score = ClusterUtil.get_score_nodp(graphs, score, mpr_counter)
    improvement = ClusterUtil.calc_improvement(k_score, one_score)
    print("Old score: {}".format(one_score))
    print("New score: {}".format(k_score))
    print("Improvement:  {}".format(improvement))
def get_scores(tree_files,
               mk_score,
               args,
               timeout=1200,
               min_mprs=1000,
               max_splits=200):
    """
    Get all of the scores for given trees.
    :param - refer get_times
    :return scores [[<float>]] - The scores for each family for each value of k
    :return local_scores [[(<float>, <float>)]] - The local scores for each family for each k
    :return ftrees - refer get_times. Trees that time out or have fewer than
        min_mprs MPRs are omitted from all three lists.
    """
    scores = []
    local_scores = []
    ftrees = []
    n = len(tree_files)
    for (i, f) in enumerate(tree_files):
        # Progress line: which file, and how far through the list we are.
        print("{}: {}/{}".format(f, i + 1, n))
        # Get the recon graph + other info
        gene_tree, species_tree, gene_root, recon_g, mpr_count, _ = \
            ClusterUtil.get_tree_info(str(f), args.d,args.t,args.l)
        # Only care about trees with a certain number of MPRs
        if mpr_count < min_mprs:
            continue
        score = mk_score(species_tree, gene_tree, gene_root)
        # timeout_cluster returns None on timeout; skip such trees.
        t = timeout_cluster(recon_g, gene_root, score, mpr_count, args,
                            timeout, max_splits)
        if t is None:
            print("{} timed out".format(f))
            continue
        # The first element (the cluster graphs) is not needed here.
        _, tree_scores, tree_local_scores = t
        scores.append(tree_scores)
        local_scores.append(tree_local_scores)
        ftrees.append(f)
    return (scores, local_scores), ftrees
def get_scores_mprs(tree_files,
                    mk_score,
                    args,
                    timeout=1200,
                    min_mprs=1000,
                    max_splits=200):
    """
    Get the data for correlating improvement with number of MPRs.
    :param - refer get_times for info on the parameters
    :return (nmprs, scores) ([<int>], [(<float>, <float>)]) - nmprs is the
        number of MPRs for each tree, scores holds the first two entries of
        the score list for each tree
    :return ftrees [<Path>] - refer get_times
    """
    nmprs, scores, ftrees = [], [], []
    total = len(tree_files)
    for idx, tree_path in enumerate(tree_files):
        print("{}: {}/{}".format(tree_path, idx + 1, total))
        # Build the recon graph and associated info for this tree.
        gene_tree, species_tree, gene_root, recon_g, mpr_count, _ = \
            ClusterUtil.get_tree_info(str(tree_path), args.d, args.t, args.l)
        # Skip trees that induce too few MPRs to be interesting.
        if mpr_count < min_mprs:
            continue
        score = mk_score(species_tree, gene_tree, gene_root)
        result = timeout_cluster(recon_g, gene_root, score, mpr_count, args,
                                 timeout, max_splits)
        # None signals a timeout; exclude the tree from the output.
        if result is None:
            print("{} timed out".format(tree_path))
            continue
        _, tree_scores, _ = result
        nmprs.append(mpr_count)
        scores.append((tree_scores[0], tree_scores[1]))
        ftrees.append(tree_path)
    return (nmprs, scores), ftrees
Exemplo n.º 8
0
def prepareClusterization(month, year, nullCheck, preProcessing):
    """Load, reduce, and normalize character data for clustering.

    :param month: month to analyze (stored into Constants.MONTH)
    :param year: year to analyze (stored into Constants.YEAR)
    :param nullCheck: forwarded to DBCon.getCharacters
    :param preProcessing: when True, run exploratory analysis (Hopkins
        statistic, heat map, elbow plot); when False, print a description
        of the full data set and return (arrayNormalizedCharacters, scaler).
    NOTE(review): the preProcessing=True path has no return statement and
    so implicitly returns None — confirm callers expect this.
    """
    Constants.MONTH = month
    Constants.YEAR = year
    if (preProcessing):
        print(
            "\n\n================================================================Análise para Mes: {} Ano: {}\n"
            .format(Constants.MONTH, Constants.YEAR))

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Hora de Inicio = ", current_time)

    # Fetch the raw characters and reduce them to the clustering features.
    characters = DBCon.getCharacters(nullCheck)
    clusterCharacters = DataReduction.getClusterCharacters(characters)
    #print('Colunas: physicalMeeleExp,physicalRangeExp,tankExp,magicRangeExp,healerExp,arcanistExp,craftExp,gatherExp,minimalCPTime,hildibrand, PreHw,  Marriage\n')
    tupleCharactersAtributes = ClusterUtil.getTupleCharactersAtributes(
        clusterCharacters, not preProcessing)

    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Hora da Finalização da Redução de Dimensão = ", current_time)

    print(
        "\n================================================================Redução Completa\n"
    )

    if (preProcessing):
        # Evaluate cluster tendency via the Hopkins statistic.
        print("Estatística de Hopkins: ",
              ClusterUtil.hopkins(tupleCharactersAtributes), "\n")

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Hora da Finalização da Análise da Estatistica de Hopkins = ",
              current_time)
        print(
            "\n================================================================Avaliação de Tendência para Clusterização Completa\n"
        )

    # Normalize the data and keep the scaler for later inverse transforms.
    (normalizedCharacters,
     scaler) = ClusterUtil.getNormalizedAndScaler(tupleCharactersAtributes)
    arrayNormalizedCharacters = np.array(normalizedCharacters)
    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print("Hora da Finalização da Normalização = ", current_time)
    print(
        "\n================================================================Normalização Completa\n"
    )
    if (preProcessing):
        # Correlation heat map (Pearson) of the normalized features.
        ClusterUtil.plotHeatMap(arrayNormalizedCharacters,
                                str(Constants.MONTH) + str(Constants.YEAR))

        print(
            "\n================================================================Mapa de Calor Criado\n"
        )

        # Elbow-method plot to help choose the number of clusters.
        ClusterUtil.plotElbow(arrayNormalizedCharacters,
                              str(Constants.MONTH) + str(Constants.YEAR))

        print(
            "\n================================================================Gráfico do Método do Cotovelo Criado\n"
        )

        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print("Hora de Fim = ", current_time)
        print("\n")
    else:
        ClusterUtil.getDescription(tupleCharactersAtributes, 'Total', month,
                                   year)
        return (arrayNormalizedCharacters, scaler)
Exemplo n.º 9
0
from sklearn.decomposition import FactorAnalysis
from DataReduction import ClusterCharacter
from pprint import pprint
from datetime import datetime
'''
ChosenAlgorithm = 3

0 = 'K-means'
1 = 'WARD'
2 = 'Spectral'
3 = 'DBSCAN'
4 = 'BANG'
5 = 'SOM'
6 = 'Fuzzy C-Means']
'''
'''
ClusterUtil.getDescription(tupleCharactersAtributes,'Total')
for i in range(7):
	print("================================================================Applying Clustering: ",Constants.CLUSTERS[ChosenAlgorithm],"\n")
	beginTime = time.time()
	#ClusterUtil.plotKnnDistance(arrayNormalizedCharacters)

	(clustersResult,labels) = Cluster.getCluster(arrayNormalizedCharacters,scaler,Constants.CLUSTERS[i])

	timeElapsed = time.time() - beginTime
	print("\nTempo de Pesquisa: ", "{:.5f}".format(timeElapsed)," Seconds")
		
	print("\n================================================================Clustering Applied\n")

	#Analisar caracteristicas de cada Cluster, media, variancia, extremos, etc
	ClusterUtil.getDescriptions(clustersResult,Constants.CLUSTERS[i])
Exemplo n.º 10
0
def get_n_improvements(tree_files,
                       mk_score,
                       args,
                       timeout=1200,
                       min_mprs=1000,
                       max_splits=200):
    """
    Get the data for correlating the initial number of clusters with
    the improvement
    :param - refer get_times
    :return series [([int], [float])] - for each family, xs are list of the initial number of
        clusters and ys are list of the improvements (cross-indexed)
    :return ftrees - refer get_times

    Side effect: mutates args.nmprs while sweeping the thresholds.
    """
    imp = choose_imp(args)
    series = []
    ftrees = []
    n = len(tree_files)
    for (i, f) in enumerate(tree_files):
        print("{}: {}/{}".format(f, i + 1, n))
        # Get the recon graph + other info
        gene_tree, species_tree, gene_root, recon_g, mpr_count, _ = \
            ClusterUtil.get_tree_info(str(f), args.d,args.t,args.l)
        # Only care about trees with a certain number of MPRs
        if mpr_count < min_mprs:
            continue
        mpr_counter = ClusterUtil.mk_count_mprs(gene_root)
        score = mk_score(species_tree, gene_tree, gene_root)
        xs = []
        ys = []
        old_ngs = 0
        # Try multiple threshold values, from 2 to 128.
        for n_thres in [2**j for j in range(1, 8)]:
            args.nmprs = n_thres

            # Arm the timeout. The alarm is cancelled in `finally` so that
            # a `continue` inside the try block cannot leak a pending alarm
            # that would fire at an arbitrary later point (this was a bug:
            # the same-split `continue` skipped signal.alarm(0)).
            signal.signal(signal.SIGALRM, timeout_handler)
            signal.alarm(timeout)
            try:
                gs = ClusterUtil.full_split_n(recon_g, gene_root, args.nmprs,
                                              mpr_count)
                print("Number of splits: {}".format(len(gs)))
                # Don't bother re-clustering if we split to get the same
                # number of gs (induces the same split), or too many splits.
                if len(gs) == old_ngs or len(gs) > max_splits:
                    continue
                old_ngs = len(gs)
                _, scores, _ = ClusterUtil.combine(gs, score, args.k,
                                                   mpr_counter)
            except TimeoutError:
                print("{} timed out".format(f))
                continue
            finally:
                # Always disarm the alarm, whichever path left the try block.
                signal.alarm(0)

            true_n = len(scores)
            # Compare two clusters to one cluster
            two_s = scores[1]
            one_s = scores[0]
            improvement = imp(two_s, one_s)
            xs.append(true_n)
            ys.append(improvement)
        ftrees.append(f)
        series.append((xs[:], ys[:]))
    return series, ftrees
Exemplo n.º 11
0
def get_improvements(tree_files,
                     cluster_mk_scores,
                     eval_mk_scores,
                     args,
                     timeout=1200,
                     min_mprs=1000,
                     max_splits=200):
    """
    Get data for relating each score.
    :param - refer get_times.
    :param cluster_mk_scores [<function>] - score factories to use to make the clusters.
        Refer ClusterUtil.mk_support_score.
    :param eval_mk_scores [<function>] - score factories to use to evaluate the clusters.
        Refer ClusterUtil.mk_support_score.
    :return improvements <dict <int> -> <dict> <int> -> [<float>]> - First key is the
        index of the objective function that was used to create the clusters. Second key
        is the index of the objective function that was used to evaluate the clusters.
        Value is list of improvements corresponding to each tree.
    :return ftrees - refer get_times
    """
    # Key: clustering method index. Value: list of trees that finished
    ftrees = collections.defaultdict(list)
    # Keys: clustering method index, evaluation method index. Value: list of improvements
    improvements = collections.defaultdict(mk_default_list)
    n = len(tree_files)
    for (i, f) in enumerate(tree_files):
        # Progress line: which file, and how far through the list we are.
        print("{}: {}/{}".format(f, i + 1, n))
        # Get the recon graph + other info
        gene_tree, species_tree, gene_root, recon_g, mpr_count, _ = \
            ClusterUtil.get_tree_info(str(f), args.d,args.t,args.l)
        # Only care about trees with a certain number of MPRs
        if mpr_count < min_mprs:
            continue
        # Get all the scoring functions ready for this tree
        cluster_scores = [
            mk_cs(species_tree, gene_tree, gene_root)
            for mk_cs in cluster_mk_scores
        ]
        eval_scores = [
            mk_es(species_tree, gene_tree, gene_root)
            for mk_es in eval_mk_scores
        ]
        # Counts MPRs to weight scores
        mpr_counter = ClusterUtil.mk_count_mprs(gene_root)
        # Evaluate the original graph on each eval metric to record improvement
        one_scores = [eval_s(recon_g) for eval_s in eval_scores]
        # Perform the clustering for each cluster score
        for i1, cluster_score in enumerate(cluster_scores):
            # timeout_cluster returns None on timeout; skip this method/tree.
            t = timeout_cluster(recon_g, gene_root, cluster_score, mpr_count,
                                args, timeout, max_splits)
            if t is None:
                print("{} timed out".format(f))
                continue
            ftrees[i1].append(f)
            graphs, _, _ = t
            # Evaluate the clustering for each evaluation score
            for i2, eval_score in enumerate(eval_scores):
                one_score = one_scores[i2]
                k_score = ClusterUtil.get_score_nodp(graphs, eval_score,
                                                     mpr_counter)
                improvement = ClusterUtil.calc_improvement(k_score, one_score)
                improvements[i1][i2].append(improvement)
    return improvements, ftrees
Exemplo n.º 12
0
def has_n_mprs(tree_file, d, t, l, nmprs):
    """Return True iff tree_file's reconciliation induces at least nmprs MPRs.

    :param tree_file: path to the tree file
    :param d, t, l: presumably duplication/transfer/loss costs forwarded to
        get_tree_info — confirm against ClusterUtil
    :param nmprs: minimum MPR count threshold
    """
    # NOTE(review): unpacks 5 values, while other call sites in this file
    # unpack 6 from ClusterUtil.get_tree_info — confirm which signature
    # this ClusterUtil provides.
    _, _, _, _, mpr_count = ClusterUtil.get_tree_info(str(tree_file), d, t, l)
    return mpr_count >= nmprs