Example #1
def domanda_4():
    print("Computing: hierchical clustering on dataset_212...")
    C, t, d = hierarchical_clustering(dataset_212, 9, weighted)
    print("Drawing...")
    draw_clustering(
        C, "Clustering gerarchico su 212 contee" +
        (" (v. pesata)" if weighted else ""))
Example #2
def visualize_data(cluster_input, data, method=None, display_centers=False):
    """
    Load a data table, compute a list of clusters,
    and plot the clusters.

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(data)
    
    singleton_list = []
    for line in data_table:
        singleton_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))
        
    if method is None:
        cluster_list = sequential_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "sequential clusters")
    elif method == 'hierarchical_clustering':
        cluster_list = clustering.hierarchical_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "hierarchical clusters")
    elif method == 'kmeans_clustering':
        cluster_list = clustering.kmeans_clustering(singleton_list,
                                                    cluster_input[0],
                                                    cluster_input[1])
        print("Displaying", len(cluster_list), "k-means clusters")
    else:
        print("ERROR: method entered into visualize_data not recognized")
        return

    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, display_centers)
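A minimal call sketch for visualize_data, assuming DATA_111_URL and the load_data_table/clustering modules used above are importable; the argument shapes follow the function's branches (a plain count for hierarchical clustering, a pair for k-means).

# Hypothetical usage of visualize_data defined above
visualize_data(9, DATA_111_URL, method='hierarchical_clustering')
# The k-means branch indexes cluster_input, so it expects
# (num_clusters, num_iterations)
visualize_data((9, 5), DATA_111_URL, method='kmeans_clustering',
               display_centers=True)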
Example #3
def run_example():
    """
    Load a data table, compute a list of clusters,
    and plot the clusters.

    Set DESKTOP = True/False to use either matplotlib or simplegui
    """
    data_table = load_data_table(DATA_111_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    # cluster_list = sequential_clustering(singleton_list, 15)
    # print("Displaying", len(cluster_list), "sequential clusters")

    cluster_list = \
        alg_project3_solution.hierarchical_clustering(singleton_list, 9)
    print "Displaying", len(cluster_list), "hierarchical clusters"

    # cluster_list = alg_project3_solution.kmeans_clustering(
    #     singleton_list, 9, 5)
    # print("Displaying", len(cluster_list), "k-means clusters")

    print(ca.compute_distortion(cluster_list, data_table))
Example #4
def domanda_1():
    print("Computing: hierchical clustering on dataset_full...")
    C, t, d = hierarchical_clustering(dataset_full, 15, weighted)
    print("Drawing...")
    draw_clustering(
        C, "Clustering gerarchico sull'intero dataset" +
        (" (v. pesata)" if weighted else ""))
Example #5
def domanda_6():
    print("Computing: hierchical clustering on dataset_212...")
    C1, t1, d1 = hierarchical_clustering(dataset_212, 9, weighted)
    print("Distortion for hierchical clustering:", d1)
    print("Computing: kmeans clustering on dataset_212...")
    C2, t2, d2 = kmeans_clustering(dataset_212, 9, 5, weighted)
    print("Distortion for kmeans clustering:", d2)
Example #6
def test_hierarchical24():
    """
    Test for hierarchical clustering
    Note that hierarchical_clustering mutates cluster_list
    """
    
    # load small data table
    print()
    print("Testing hierarchical_clustering on 24 county set")
    data_24_table = load_data_table(DATA_24_URL)
    
    
    # test data of the form [size of output cluster, sets of county tuples]
    hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [22, set([('11001', '51013'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [21, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [20, set([('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34039',), ('34013', '34017'), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [19, set([('34013', '34017', '34039'), ('11001', '51013'), ('36005', '36061'), ('36047', '36081'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [18, set([('34013', '34017', '34039'), ('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [17, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('36059',), ('34013', '34017', '34039', '36005', '36047', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [16, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [15, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',)])],
                   [14, set([('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [13, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51840',), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013')])],
                   [12, set([('06037', '06059'), ('01073',), ('06029',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [11, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('51760',), ('55079',), ('54009',), ('11001', '24510', '51013', '51840')])],
                   [10, set([('06029', '06037', '06059'), ('01073',), ('06071',), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [9, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081'), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '51013', '51760', '51840')])],
                   [8, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('54009',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840')])],
                   [7, set([('01073',), ('06029', '06037', '06059', '06071'), ('06075',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [6, set([('06029', '06037', '06059', '06071', '06075'), ('01073',), ('08031',), ('41051', '41067'), ('55079',), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [5, set([('06029', '06037', '06059', '06071', '06075'), ('08031',), ('41051', '41067'), ('01073', '55079'), ('11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009')])],
                   [4, set([('06029', '06037', '06059', '06071', '06075'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',), ('41051', '41067')])],
                   [3, set([('06029', '06037', '06059', '06071', '06075', '41051', '41067'), ('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('08031',)])],
                   [2, set([('01073', '11001', '24510', '34013', '34017', '34039', '36005', '36047', '36059', '36061', '36081', '51013', '51760', '51840', '54009', '55079'), ('06029', '06037', '06059', '06071', '06075', '08031', '41051', '41067')])],
                   ]

    #hierdata_24 = [[23, set([('11001', '51013'), ('01073',), ('06059',), ('06037',), ('06029',), ('06071',), ('06075',), ('08031',), ('24510',), ('34013',), ('34039',), ('34017',), ('36061',), ('36005',), ('36047',), ('36059',), ('36081',), ('41051',), ('41067',), ('51840',), ('51760',), ('55079',), ('54009',)])]]
        
    suite = poc_simpletest.TestSuite()
    
    for num_clusters, expected_county_tuple in hierdata_24:
        
        # build initial list of clusters for each test since mutation is allowed
        cluster_list = []
        for idx in range(len(data_24_table)):
            line = data_24_table[idx]
            cluster_list.append(alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

        # compute student answer
        student_clustering = student.hierarchical_clustering(cluster_list, num_clusters)
        student_county_tuple = set_of_county_tuples(student_clustering)
        
        # Prepare test
        error_message = "Testing hierarchical_clustering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True, error_message)
    suite.report_results()
Example #7
def cluster_features(contours, cnt_dicts, drawer, edge_type, do_draw=False):

    # Do hierarchical clustering by size, shape, and color
    label_dict = {}
    for feature_type in ['size', 'shape', 'color']:
        feature_list = [cnt_dic[feature_type] for cnt_dic in cnt_dicts]

        # labels: ndarray of cluster labels, e.g. [1, 1, 1, 1, 1, 3, 3, 2, 2, 2],
        # one label per entry of feature_list
        labels = hierarchical_clustering(feature_list, feature_type, edge_type,
                                         drawer, do_draw)
        label_dict[feature_type] = labels

        if do_draw:
            img = drawer.blank_img()
            for label in set(labels):
                cnt_dic_list_by_groups = [
                    c for i, c in enumerate(contours) if labels[i] == label
                ]
                img = drawer.draw_same_color(cnt_dic_list_by_groups, img)
            desc = 'f_Feature{}_{}'.format(feature_type.capitalize(),
                                           edge_type)
            drawer.save(img, desc)

    # Combine the labels clustered by size, shape, and color, e.g. (0, 1, 1), (2, 0, 1)
    combine_labels = []
    for size, shape, color in zip(label_dict['size'], label_dict['shape'],
                                  label_dict['color']):
        combine_labels.append((size, shape, color))

    # Find the final groups by intersecting the labels, and draw them
    img = drawer.blank_img()
    groups_cnt_dicts = []
    for combine_label in set(combine_labels):
        # Skip combined labels with fewer than two members
        if combine_labels.count(combine_label) < 2:
            continue

        # groups_cnt_dicts.append(
        #     [cnt_dicts[i] for i, label in enumerate(combine_labels) if label == combine_label]
        # )

        group_idx = [
            idx for idx, label in enumerate(combine_labels)
            if label == combine_label
        ]
        group_cnt_dicts = [cnt_dicts[i] for i in group_idx]
        groups_cnt_dicts.append(group_cnt_dicts)

        # for do_draw
        cnts = [contours[i] for i in group_idx]
        img = drawer.draw_same_color(cnts, img)

    if do_draw:
        desc = 'g_OriginalResult_{}'.format(edge_type)
        drawer.save(img, desc)

    return groups_cnt_dicts
Example #8
def hier_dist(data_url):
    """
    Calculates the distortion of the hierarchical algorithm for 6-20 clusters
    """
    res = {}
    data_table = load_data_table(data_url)
    singleton_list = []
    for line in data_table:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))
    cluster_list = \
        alg_project3_solution.hierarchical_clustering(singleton_list, 20)
    res[20] = ca.compute_distortion(cluster_list, data_table)

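    # Hierarchical clustering is agglomerative, so each smaller clustering
    # below continues merging from the previous cluster_list instead of
    # restarting from the singletons.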
    for num_clust in range(19, 5, -1):
        cluster_list = \
            alg_project3_solution.hierarchical_clustering(
                cluster_list, num_clust)
        res[num_clust] = ca.compute_distortion(cluster_list, data_table)
    return res
Example #9
def compute_q5_q6():
    # Load data
    table111 = viz_tools.load_data_table(DATA_111_URL)

    # Format data as Clusters
    singleton_list = []
    for line in table111:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    # Note: K-means tested first b/c clustering.hierarchical_clustering
    # mutates list of clusters

    # K-means
    kmeans_clusters = clustering.kmeans_clustering(singleton_list, 9, 5)
    k_distortion = compute_distortion(kmeans_clusters, table111)
    print("K-means Distortion: {}".format(k_distortion))

    # Hierarchical
    hierarchical_clusters = clustering.hierarchical_clustering(
        singleton_list, 9)
    h_distortion = compute_distortion(hierarchical_clusters, table111)
    print("Hierarchical Distortion: {}".format(h_distortion))
Example #10
def test_compute_distortion():
    # Load data
    table290 = viz_tools.load_data_table(DATA_290_URL)

    # Format data as Clusters
    singleton_list = []
    for line in table290:
        singleton_list.append(
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4]))

    # Note: K-means tested first b/c clustering.hierarchical_clustering
    # mutates list of clusters

    # Test 2: Expect 2.323×10^11
    kmeans_clusters = clustering.kmeans_clustering(singleton_list, 16, 5)
    k_distortion = compute_distortion(kmeans_clusters, table290)
    print("K-means Distortion: {}".format(k_distortion))

    # Test 1: Expect 2.575×10^11
    hierarchical_clusters = clustering.hierarchical_clustering(
        singleton_list, 16)
    h_distortion = compute_distortion(hierarchical_clusters, table290)
    print("Hierarchical Distortion: {}".format(h_distortion))
Example #11
def main():

    years = None
    features_excluded = ['week_start_date']

    _outliers = None

    cities = get_values_of("../data/dengue_features_train.csv", 'city')
    target = ['total_cases']

    all_revelant_features = {}

    for city in cities:
        # Filtering by values of the keys
        _filter = {'city': [city], 'year': years}

        #Load city data
        data = load_data("../data/dengue_features_train.csv",
                         filter_parameters=_filter,
                         excludes_features=features_excluded,
                         outliers=_outliers)

        # Load total cases by city, year and week of year
        data_labels = load_data("../data/dengue_labels_train.csv",
                                filter_parameters=_filter)

        # Adapt data for clustering
        data_test_hiech = data.drop(labels=['city', 'year'],
                                    axis=1,
                                    inplace=False)

        # Outliers will be deleted
        elements, outliers, cut = clustering.hierarchical_clustering(
            data=data_test_hiech)

        n_element = count_elements(elements)
        n_outliers = count_elements(outliers)
        total = n_element + n_outliers

        print('Analysis in: %s' % city)

        total_outliers = []
        while outliers is not None:
            total_outliers += outliers
            data_test_hiech.drop(outliers, axis=0, inplace=True)
            elements, outliers, cut = clustering.hierarchical_clustering(
                data_test_hiech, cut=cut, first_total=total)

        if total_outliers:
            print('Auto-detected Outliers:')
            print(total_outliers)

        # Join data
        data_without_outliers = data
        data_without_outliers.drop(total_outliers, axis=0, inplace=True)

        merge_data = pd.merge(data_without_outliers,
                              data_labels,
                              on=['city', 'year', 'weekofyear'],
                              how='outer')
        merge_data.drop(labels=['city', 'year'], axis=1, inplace=True)
        merge_data.dropna(inplace=True)

        # Features clustering
        data_for_features = merge_data.drop(labels=target, axis=1)
        clustering.hierarchical_clustering_features(data_for_features)

        # Cross-validation to select features
        feature_selected, max_deph = cros.cross_validation(
            merge_data, algorithm='DecisionTreeRegressor')

        # Regressor to select relevant features
        relevant_features = reg.tree_regressor(merge_data, max_deph,
                                               feature_selected, target, city)

        all_revelant_features[city] = relevant_features

        # For each city, one KNN model
        # Cross-validation to select features
        n_neighbors, X, y = cros.cross_validation(merge_data,
                                                  algorithm='KNN',
                                                  features=relevant_features,
                                                  target=target,
                                                  verbose=True)

        #---------------------------------------------

        # prediction
        data_Test = load_data("../data/dengue_features_test.csv",
                              filter_parameters=_filter,
                              excludes_features=features_excluded,
                              outliers=_outliers)

        #data_Test.dropna(inplace = True)
        test = data_Test[relevant_features]
        test.interpolate(method='linear', inplace=True)

        knn = neighbors.KNeighborsRegressor(n_neighbors, weights='distance')
        prediction = knn.fit(X, y).predict(test)

        # show prediction
        print "\nPREDICTION:"
        xx = np.stack(i for i in range(len(prediction)))
        plt.plot(xx, prediction, c='g', label='prediction')
        plt.axis('tight')
        plt.legend()
        plt.title("KNeighborsRegressor (k = %i, weights = '%s')" %
                  (n_neighbors, 'distance'))

        plt.show()

        # write the results in a csv file
        submission_data = load_data("../data/submission_format.csv",
                                    filter_parameters=_filter)
        final_data = []

        for i in range(len(prediction)):
            row = []

            row.append(submission_data.iloc[i]['city'])
            row.append(submission_data.iloc[i]['year'])
            row.append(submission_data.iloc[i]['weekofyear'])
            row.append(int(prediction[i]))

            final_data.append(row)

        col = ["city", "year", "weekofyear", "total_cases"]
        df = pd.DataFrame(final_data, columns=col)
        df.to_csv('../data/predictions_for_' + city + '.csv',
                  index=False,
                  sep=',',
                  encoding='utf-8')

        #---------------------------------------------

    print('\n\t [ SELECTED FEATURES ]')
    for key, value in all_revelant_features.items():
        print('City: %s, %2d features: \n\t %s' % (key, len(value), str(value)))
Example #12
    # NOTE: fragment from inside a loop over gap values (see the top-level
    # `continue` below)

    # pairwise sequence alignment results
    results = main_algorithm(df_encoded, gap, T, s, 0)

    # reset indexes
    df_encoded = df_encoded.reset_index()

    # convert similarity matrix into distance matrix
    results['score'] = convert_to_distance_matrix(results['score'])

    # exception when all the scores are the same; in this case, continue
    # with the next value of gap
    if (results['score'] == 0).all():
        continue
    else:
        # hierarchical clustering
        Z = hierarchical_clustering(results['score'], method, gap, T,
                                    args.automatic, pp)

        # validation
        chosen = validation(M, df_encoded, results, Z, method, min_K,
                            max_K + 1, args.automatic, pp, gap, T)
        chosen_k = chosen[2]
        df_avgs = chosen[0]
        df_stds = chosen[1]

        chosen_results = df_avgs.loc[chosen_k]
        chosen_results['gap'] = gap
        concat_for_final_decision.append(chosen_results)

############################################################################
#       RESULTS
############################################################################
# close pdf
Example #13
import sys
sys.path.append('../../3_closest_pairs_&_clustering_algorithms')
import data.load_clusters as lc
import data.cluster as cl
import clustering as clr
import alg_clusters_matplotlib as cplot

data_table = lc.load_data_table(lc.DATA_896_URL)  # alternatives: DATA_3108_URL, DATA_290_URL

singleton_list = []
for line in data_table:
    singleton_list.append(cl.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

c = 7 # cluster count

cluster_list = clr.hierarchical_clustering(singleton_list, c)

cplot.plot_clusters(data_table, cluster_list, True)

Example #14
def main():

    first = True
    name_file = assign_name()
    prediction_path = '../predictions/' + name_file

    if not os.path.exists(prediction_path):
        os.makedirs(prediction_path)

    years = None
    features_excluded = ['week_start_date']

    _outliers = None

    cities = get_values_of("../data/dengue_features_train.csv", 'city')

    target = 'total_cases'

    all_revelant_features = {}
    all_scores = []

    modes = [  #'dropna', 'interpolate', 'mean',
        ['interpolate', 'mean']
    ]  #, ['interpolate', 'dropna']]

    for mode in modes:

        first = True
        scores_city = {}
        for city in cities:

            # Filtering by values of the keys
            _filter = {'city': [city], 'year': years}

            #Load city data
            data = load_data("../data/dengue_features_train.csv",
                             filter_parameters=_filter,
                             excludes_features=features_excluded,
                             outliers=_outliers)

            # Load total cases by city, year and week of year
            data_labels = load_data("../data/dengue_labels_train.csv",
                                    filter_parameters=_filter)

            data_fill = data_fill_mode(data, mode)
            data_labels_fill = data_fill_mode(data_labels, mode)

            # Adapt data for clustering
            data_test_hiech = data_fill.drop(labels=['city', 'year'],
                                             axis=1,
                                             inplace=False)

            # Outliers will be deleted
            elements, outliers, cut = clustering.hierarchical_clustering(
                data=data_test_hiech, verbose=False)

            n_element = count_elements(elements)
            n_outliers = count_elements(outliers)
            total = n_element + n_outliers

            print('Analysis in: %s on mode %s' % (city, str(mode)))

            total_outliers = []
            while outliers is not None:
                total_outliers += outliers
                data_test_hiech.drop(outliers, axis=0, inplace=True)
                elements, outliers, cut = clustering.hierarchical_clustering(
                    data_test_hiech, cut=cut, first_total=total, verbose=False)

            if total_outliers:
                print('Auto-detected Outliers:')
                print(total_outliers)

            # Join data
            data_without_outliers = data_fill
            data_without_outliers.drop(total_outliers, axis=0, inplace=True)

            merge_data = pd.merge(data_without_outliers,
                                  data_labels_fill,
                                  on=['city', 'year', 'weekofyear'],
                                  how='inner')
            first_year = merge_data['year'].min()
            last_year = merge_data['year'].max()
            split_year = int(last_year - round((last_year - first_year) * 0.2))

            # Features clustering
            data_for_features = merge_data.drop(labels=['city', 'total_cases'],
                                                axis=1)

            feature_groups = clustering.hierarchical_clustering_features(
                data_for_features, verbose=False)

            # Cross-validation to select features
            features_selected, max_deph = cros.cross_validation(merge_data,
                                                                feature_groups,
                                                                split_year,
                                                                target=target)

            # Regressor to select relevant features
            relevant_features = reg.tree_regressor(merge_data,
                                                   split_year,
                                                   max_deph,
                                                   features_selected,
                                                   target,
                                                   city,
                                                   verbose=False)

            all_revelant_features[city] = relevant_features

            all_features = merge_data.columns.tolist()[1:-1]

            data_Test = load_data("../data/dengue_features_test.csv",
                                  filter_parameters=_filter,
                                  excludes_features=features_excluded,
                                  outliers=_outliers)

            # prediction

            prediction_knn, score_knn = predict.knn_prediction(
                merge_data,
                split_year,
                features_selected,
                target,
                data_Test,
                verbose=True)
            print('Score KNN on %s mode is : %.4f' % (mode, score_knn))

            prediction_rf, score_rf = predict.rf_prediction(merge_data,
                                                            split_year,
                                                            all_features,
                                                            target,
                                                            data_Test,
                                                            verbose=True)
            print('Score RandomForest on %s mode is : %.4f' % (mode, score_rf))

            scores_city[city] = [(mode, 'Knn', score_knn),
                                 (mode, 'RF', score_rf)]

            # Load submission data file.
            submission_data = load_data("../data/submission_format.csv",
                                        filter_parameters=_filter)

            # Write the results to a csv file
            col = ["city", "year", "weekofyear", "total_cases"]
            write_result(col, submission_data, prediction_knn, prediction_rf,
                         prediction_path, (name_file + str(mode)), first)
            first = False

        all_scores.append(scores_city)

    print(all_scores)
    """ 
Example #15
def q10(plot_key):
    # Load data
    table111 = viz_tools.load_data_table(DATA_111_URL)
    table290 = viz_tools.load_data_table(DATA_290_URL)
    table896 = viz_tools.load_data_table(DATA_896_URL)

    # Create cluster function
    create_cluster = lambda line: alg_cluster.Cluster(
        set([line[0]]), line[1], line[2], line[3], line[4])

    # Format data as Clusters
    klist111 = [create_cluster(line) for line in table111]
    klist290 = [create_cluster(line) for line in table290]
    klist896 = [create_cluster(line) for line in table896]
    hlist111 = [create_cluster(line) for line in table111]
    hlist290 = [create_cluster(line) for line in table290]
    hlist896 = [create_cluster(line) for line in table896]
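    # Separate k-means and hierarchical copies are needed because
    # clustering.hierarchical_clustering mutates its input list (see the
    # notes in Examples #9 and #10); reusing the hierarchical lists across
    # iterations continues the agglomeration from the previous result.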

    # Initialize distortion lists
    distortion111k, distortion290k, distortion896k = [], [], []
    distortion111h, distortion290h, distortion896h = [], [], []

    # Calculate distortion lists
    for num in range(20, 5, -1):
        if plot_key == 111:
            kmeans_cluster111 = clustering.kmeans_clustering(klist111, num, 5)
            h_cluster111 = clustering.hierarchical_clustering(hlist111, num)
            distortion111k.append(
                compute_distortion(kmeans_cluster111, table111))
            distortion111h.append(compute_distortion(h_cluster111, table111))
        elif plot_key == 290:
            kmeans_cluster290 = clustering.kmeans_clustering(klist290, num, 5)
            h_cluster290 = clustering.hierarchical_clustering(hlist290, num)
            distortion290k.append(
                compute_distortion(kmeans_cluster290, table290))
            distortion290h.append(compute_distortion(h_cluster290, table290))
        elif plot_key == 896:
            kmeans_cluster896 = clustering.kmeans_clustering(klist896, num, 5)
            h_cluster896 = clustering.hierarchical_clustering(hlist896, num)
            distortion896k.append(
                compute_distortion(kmeans_cluster896, table896))
            distortion896h.append(compute_distortion(h_cluster896, table896))

    # Plot results
    fig = plt.figure('Distortion for Different Clustering Methods')
    plt.title('Distortion for Different Clustering Methods: {} Points'.format(
        plot_key))
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')

    x = list(range(20, 5, -1))

    if plot_key == 111:
        y1, y4 = distortion111k, distortion111h
        plt.plot(x, y1, '-bo', markersize=1, label='K-means (111)')
        plt.plot(x, y4, '-co', markersize=1, label='Hierarchical (111)')
    elif plot_key == 290:
        y2, y5 = distortion290k, distortion290h
        plt.plot(x, y2, '-go', markersize=1, label='K-means (290)')
        plt.plot(x, y5, '-mo', markersize=1, label='Hierarchical (290)')
    elif plot_key == 896:
        y3, y6 = distortion896k, distortion896h
        plt.plot(x, y3, '-ro', markersize=1, label='K-means (896)')
        plt.plot(x, y6, '-yo', markersize=1, label='Hierarchical (896)')

    plt.legend(loc='best')

    plt.show()
Example #16
import clustering
import alg_cluster
import imp

foo = imp.load_source('poc_simpletest', '../PoC/poc_simpletest.py')
foo.TestSuite()

#                                   horiz_pos, vert_pos, population, risk
print(clustering.closest_pair_strip([
    alg_cluster.Cluster(set([]), 1.0, 1.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 5.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 4.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 7.0, 1, 0)
], 1.0, 3.0))

# closest_pair_strip([alg_cluster.Cluster(set([]), 0, 0, 1, 0), alg_cluster.Cluster(set([]), 0, 1, 1, 0), alg_cluster.Cluster(set([]), 0, 2, 1, 0), alg_cluster.Cluster(set([]), 0, 3, 1, 0)], 0.0, 1.0) expected one of the tuples in set([(1.0, 2, 3), (1.0, 0, 1), (1.0, 1, 2)]) but received (1.0, 0, 0)
# fast_closest_pair([alg_cluster.Cluster(set([]), 0, 0, 1, 0), alg_cluster.Cluster(set([]), 1, 0, 1, 0), alg_cluster.Cluster(set([]), 2, 0, 1, 0), alg_cluster.Cluster(set([]), 3, 0, 1, 0)]) expected one of the tuples in set([(1.0, 1, 2), (1.0, 0, 1), (1.0, 2, 3)]) but received (Exception: TypeError) "'tuple' object does not support item assignment" at line 71, in fast_closest_pair
# print fast_closest_pair([alg_cluster.Cluster(set([]), 0, 0, 1, 0), alg_cluster.Cluster(set([]), 1, 0, 1, 0), alg_cluster.Cluster(set([]), 2, 0, 1, 0), alg_cluster.Cluster(set([]), 3, 0, 1, 0)])

# print slow_closest_pair([alg_cluster.Cluster(set([]), 0, 0, 1, 0), alg_cluster.Cluster(set([]), 1, 0, 1, 0), alg_cluster.Cluster(set([]), 2, 0, 1, 0), alg_cluster.Cluster(set([]), 3, 0, 1, 0)])

print(clustering.hierarchical_clustering([
    alg_cluster.Cluster(set([]), 1.0, 1.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 5.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 4.0, 1, 0),
    alg_cluster.Cluster(set([]), 1.0, 7.0, 1, 0)
], 2))
Example #17
    # NOTE: fragment from inside a loop over gap values (see the `continue` below)

    # pairwise sequence alignment results
    results = main_algorithm(df_encoded, gap, T, s, 0)

    #reset indexes
    df_encoded = df_encoded.reset_index()

    #convert similarity matrix into distance matrix
    results['score'] = convert_to_distance_matrix(results['score'])

    # exception when all the scores are the same; in this case, continue
    # with the next value of gap
    if (results['score'] == 0).all():
        print('entrei')
        continue
    else:
        #hierarchical clustering
        Z = hierarchical_clustering(results['score'], method, gap)

        #validation
        chosen = validation(M, df_encoded, results, Z, method, min_K,
                            max_K + 1)
        chosen_k = chosen[2]
        df_avgs = chosen[0]
        df_stds = chosen[1]

        chosen_results = df_avgs.loc[chosen_k]
        chosen_results['gap'] = gap
        concat_for_final_decision.append(chosen_results)

df_final_decision = pd.concat(concat_for_final_decision, axis=1).T
final_k_results = final_decision(df_final_decision)
Example #18
def main():
    
    years = None
    features_excluded = ['week_start_date']

    _outliers = None

    cities = get_values_of("../data/dengue_features_train.csv", 'city')

    all_revelant_features = {}
    for city in cities:
        # Filtering by values of the keys
        _filter = {'city': [city], 'year': years}
        
        #Load city data
        data = load_data("../data/dengue_features_train.csv",
                         filter_parameters=_filter,
                         excludes_features=features_excluded,
                         outliers=_outliers)

        # Load total cases by city, year and week of year
        data_labels = load_data("../data/dengue_labels_train.csv",
                                filter_parameters=_filter)

        # Adapt data for clustering
        data_test_hiech = data.drop(labels=['city', 'year'], axis=1,
                                    inplace=False)

        # Outliers will be deleted
        elements, outliers, cut = clustering.hierarchical_clustering(
            data=data_test_hiech)
    
        n_element = count_elements(elements)
        n_outliers = count_elements(outliers)
        total = n_element + n_outliers

        print('\nOutliers in: %s \n\t' % city)

        total_outliers = []
        while outliers is not None:
            total_outliers += outliers
            data_test_hiech.drop(outliers, axis=0, inplace=True)
            elements, outliers, cut = clustering.hierarchical_clustering(
                data_test_hiech, cut=cut, first_total=total)

        if total_outliers:
            print('Auto-detected Outliers:')
            print(total_outliers)
        
        # Join data
        data_without_outliers = data
        data_without_outliers.drop(total_outliers, axis=0, inplace=True)

        merge_data = pd.merge(data_without_outliers, data_labels,
                              on=['city', 'year', 'weekofyear'], how='outer')
        merge_data.drop(labels=['city', 'year'], axis=1, inplace=True)
        merge_data.dropna(inplace=True)

        # Features clustering
        data_for_features = merge_data.drop(labels=['total_cases'], axis=1)
        clustering.hierarchical_clustering_features(data_for_features)

        # Cross-validation to select features
        feature_selected, max_deph = cros.cross_validation(merge_data)

        # Regressor to select relevant features
        relevant_features = reg.tree_regressor(merge_data, max_deph,
                                               feature_selected, 'total_cases',
                                               city)
        
        all_revelant_features[city] = relevant_features
    
    print('\n\t [ SELECTED FEATURES ]')
    for key, value in all_revelant_features.items():
        print('City: %s, %2d features: \n\t %s' % (key, len(value), str(value)))