import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import AffinityPropagation

# get_headers_pairs_list and get_distance_matrix live in cluster.prepare_data
# and cluster.token_edit_distance (see Example #6); the modules providing
# get_labels, read_dist_matrix, get_affinity_matrix, print_metrics,
# write_clusterized_data and visualize are assumed to sit in the same
# project-local cluster package.
from cluster.prepare_data import get_headers_pairs_list
from cluster.token_edit_distance import get_distance_matrix


def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, display=False):
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, max_dist = get_distance_matrix(
            [pair[1] for pair in headers_pairs], verbose=True)
    else:
        dist_matrix, max_dist = read_dist_matrix(
            distance_matrix_filename, verbose=True)


    affinity_matr = get_affinity_matrix(dist_matrix, verbose=True,
                                        max_affinity=max_dist)
    print("Clustering...")
    af = AffinityPropagation(affinity="precomputed", verbose=True,
                             copy=True).fit(affinity_matr)
    print("Done.")

    cluster_centers_indices = af.cluster_centers_indices_
    n_clusters_ = len(cluster_centers_indices)
    labels = af.labels_

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize(dist_matrix, labels, cluster_centers_indices,
                  show_cluster_sizes=True)
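
A minimal invocation sketch for the main above (file paths are hypothetical): on a first run the distance matrix is computed from the dataset itself, while a matrix previously saved with write_dist_matrix can be reused through distance_matrix_filename.

if __name__ == "__main__":
    # First run: compute the distance matrix from scratch.
    main("data/training_set.txt", "data/clustered_ap.txt", display=True)
    # Later runs: reuse a saved matrix to skip the expensive computation.
    # main("data/training_set.txt", "data/clustered_ap.txt",
    #      distance_matrix_filename="data/training_set_dist_matr.txt")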
Example #2
import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import DBSCAN

# Project-local helpers as in the first example's import note.
from cluster.prepare_data import get_headers_pairs_list
from cluster.token_edit_distance import get_distance_matrix


def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, eps=10, display=False):
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(
            [pair[1] for pair in headers_pairs], verbose=True)
    else:
        dist_matrix, _ = read_dist_matrix(
            distance_matrix_filename, verbose=True)

    print("Clustering...")
    dbscan = DBSCAN(eps=eps, min_samples=2, metric="precomputed").fit(
        dist_matrix)
    print("Done.")

    labels = np.copy(dbscan.labels_)

    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    clusters_without_noise = n_clusters_

    # Give each noise point (label -1) its own singleton cluster id so the
    # metrics below treat every noise point as a cluster of one.
    for i, l in enumerate(labels):
        if l == -1:
            labels[i] = n_clusters_
            n_clusters_ += 1

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    if display:
        visualize_dbscan(dist_matrix, dbscan.labels_,
                         clusters_without_noise + 1, show_cluster_sizes=True)
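
The relabeling step in this example turns every DBSCAN noise point (label -1) into its own singleton cluster before the metrics are computed. A standalone check on toy labels (hypothetical data) shows the effect:

import numpy as np

labels = np.array([0, 0, -1, 1, -1])
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)  # 2 real clusters
for i, l in enumerate(labels):
    if l == -1:
        labels[i] = n_clusters_
        n_clusters_ += 1
print(labels)       # [0 0 2 1 3]
print(n_clusters_)  # 4: two real clusters plus two singletons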
Example #3
import time
from random import Random

import inspyred

# generate_costs, evaluate_costs, bound_costs, costs_uniform_crossover and
# mutate_costs are project-local helpers; only get_headers_pairs_list has a
# known module path (cluster.prepare_data, see Example #6).
from cluster.prepare_data import get_headers_pairs_list


def main(dataset_filename):
    start = time.perf_counter()

    rand = Random()
    rand.seed(int(time.time()))
    headers = [pair[1] for pair in get_headers_pairs_list(dataset_filename)]

    my_ec = inspyred.ec.EvolutionaryComputation(rand)
    my_ec.selector = inspyred.ec.selectors.tournament_selection
    my_ec.variator = [
        inspyred.ec.variators.crossover(costs_uniform_crossover), mutate_costs
    ]
    my_ec.replacer = inspyred.ec.replacers.truncation_replacement
    my_ec.observer = [
        inspyred.ec.observers.file_observer,
        inspyred.ec.observers.stats_observer
    ]
    my_ec.terminator = inspyred.ec.terminators.evaluation_termination

    final_pop = my_ec.evolve(
        generator=generate_costs,
        evaluator=evaluate_costs,
        bounder=bound_costs,
        tournament_size=5,
        headers=headers,
        statistics_file=open("stats_mst_trunc_half_1.csv", "w"),
        individuals_file=open("inds_mst_trunc_half_1.csv", "w"),
        # --- customizable arguments ---
        pop_size=60,
        num_selected=30,
        mutation_rate=0.75,
        mutation_distance=0.15,
        max_evaluations=1860
        # ------------------------------
    )

    # Sort and print the best individual, which will be at index 0.
    final_pop.sort(reverse=True)
    print('Terminated due to {0}.'.format(my_ec.termination_cause))
    print(final_pop[0])

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    inspyred.ec.analysis.generation_plot(open("stats_mst_trunc_half_1.csv"),
                                         errorbars=False)
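
For reference, inspyred expects the generator and evaluator passed to evolve to follow its standard signatures: generator(random, args) returns one candidate, and evaluator(candidates, args) returns one fitness per candidate. The bodies below are hypothetical placeholders, since the real generate_costs and evaluate_costs are project-specific; keyword arguments given to evolve (such as headers) arrive in args.

def generate_costs(random, args):
    # One candidate: a random cost vector, one entry per header (placeholder).
    return [random.uniform(0.0, 1.0) for _ in range(len(args["headers"]))]

def evaluate_costs(candidates, args):
    # One fitness value per candidate (placeholder logic).
    return [sum(c) for c in candidates]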
Example #4
import time
from random import Random

import inspyred

# Same project-local helper assumptions as in the previous example.
from cluster.prepare_data import get_headers_pairs_list


def main(dataset_filename):
    start = time.perf_counter()

    rand = Random()
    rand.seed(int(time.time()))
    headers = [pair[1] for pair in get_headers_pairs_list(dataset_filename)]

    my_ec = inspyred.ec.EvolutionaryComputation(rand)
    my_ec.selector = inspyred.ec.selectors.tournament_selection
    my_ec.variator = [inspyred.ec.variators.crossover(costs_uniform_crossover),
                      mutate_costs]
    my_ec.replacer = inspyred.ec.replacers.truncation_replacement
    my_ec.observer = [inspyred.ec.observers.file_observer,
                      inspyred.ec.observers.stats_observer]
    my_ec.terminator = inspyred.ec.terminators.evaluation_termination

    final_pop = my_ec.evolve(generator=generate_costs,
                             evaluator=evaluate_costs,
                             bounder=bound_costs,
                             tournament_size=5,
                             headers=headers,
                             statistics_file=open(
                                 "stats_ap_trunc_nonzero_half_2.csv", "w"),
                             individuals_file=open(
                                 "inds_ap_trunc_nonzero_half_2.csv", "w"),
                             # --- customizable arguments ---
                             pop_size=60,
                             num_selected=30,
                             mutation_rate=0.75,
                             mutation_distance=0.15,
                             max_evaluations=1860
                             # ------------------------------
                             )

    # Sort and print the best individual, which will be at index 0.
    final_pop.sort(reverse=True)
    print('Terminated due to {0}.'.format(my_ec.termination_cause))
    print(final_pop[0])

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))

    inspyred.ec.analysis.generation_plot(
        open("stats_ap_trunc_nonzero_half_2.csv"), errorbars=False)
Example #5
import time

import numpy as np
from sklearn import metrics

# fit is a project-local clustering helper (see the sketch after this
# example); the remaining helpers follow the first example's import note.
from cluster.prepare_data import get_headers_pairs_list
from cluster.token_edit_distance import get_distance_matrix


def main(dataset_filename, output_data_filename,
         distance_matrix_filename=None, display=False, min_dist=10):
    start = time.perf_counter()

    headers_pairs = get_headers_pairs_list(dataset_filename, verbose=True)
    labels_true = get_labels(dataset_filename, verbose=True)

    if distance_matrix_filename is None:
        dist_matrix, _ = get_distance_matrix(
            [pair[1] for pair in headers_pairs], verbose=True)
    else:
        dist_matrix, _ = read_dist_matrix(
            distance_matrix_filename, verbose=True)

    print("Clustering...")
    n_clusters_, labels = fit(dist_matrix, min_dist)
    print("Done.")

    print("clusters {0}".format(n_clusters_))
    print(labels)

    metrics_list = [
        n_clusters_,
        metrics.homogeneity_score(labels_true, labels),
        metrics.completeness_score(labels_true, labels),
        metrics.v_measure_score(labels_true, labels),
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.adjusted_mutual_info_score(labels_true, labels),
        metrics.silhouette_score(np.asarray(dist_matrix), labels,
                                 metric='precomputed')
    ]

    print_metrics(metrics_list)

    write_clusterized_data(output_data_filename, headers_pairs, labels,
                           metrics=metrics_list, verbose=True)

    end = time.perf_counter()
    print("\nWorking time: %f sec." % (end - start))
Example #6
import sys
import os
import time

sys.path.append(os.getcwd())

from cluster.prepare_data import get_headers_pairs_list, write_dist_matrix
from cluster.token_edit_distance import get_distance_matrix

if len(sys.argv) < 3:
    print(
        "Too few arguments. You should provide: \n1. dataset_filename" +
        "\n2. output_data_filename"
    )
    sys.exit(1)

start = time.perf_counter()
dataset_filename_ = sys.argv[1]
output_data_filename_ = sys.argv[2]

headers_pairs = get_headers_pairs_list(dataset_filename_, verbose=True)

dist_matrix, max_dist = get_distance_matrix(
    [pair[1] for pair in headers_pairs], verbose=True)

write_dist_matrix(dist_matrix, max_dist, output_data_filename_, verbose=True)

end = time.perf_counter()
print("\nWorking time: %f sec." % (end - start))