Example #1
def main():
    f = "colours"
    length = 151
    r = 90
    c = 120
    nbins = 10
    colors = np.zeros((length, r, c, 3), dtype=np.uint8)
    grays = np.zeros((length, r, c), dtype=np.uint8)

    # Read the images and store them in color and grayscale formats
    for i in range(length):
        im = Image.open(f+"/dwc"+str(i+1).zfill(3)+".png").convert('RGB')
        colors[i] = np.asarray(im, dtype=np.uint8)
        grays[i] = np.asarray(im.convert('L'))

    # Save the color and grayscale images
    for i in range(length):
        out_col = Image.fromarray(colors[i])
        out_gray = Image.fromarray(grays[i])
        out_col.save("out/col_im/dwc_col"+str(i+1).zfill(3)+".png")
        out_gray.save("out/gray_im/dwc_gray"+str(i+1).zfill(3)+".png")

    # Draw histograms
    colorhs, grayhs = histograms(length, nbins, colors, grays)

    # Kmeans
    clustering(length, nbins, colorhs, grayhs, colors, grays)

    # Other Methods
    other(length, nbins, grayhs, grays)
Example #2
def main_test():

    current_dir = os.path.dirname(os.path.abspath(__file__))

    GO_Term_path = current_dir + '/example/GO_Term.xlsx'
    outpath = current_dir + '/example/test_dealing.csv'
    batch_read_GO_Term(GO_Term_path=GO_Term_path, outputpath=outpath)

    Save_path = current_dir + '/example/test_duplicate_removal_1.csv'
    screen_Term_gene_1(GO_Term=outpath, Save_path=Save_path)

    Save_path_1 = current_dir + '/example/test_duplicate_removal.csv'
    screen_Term_gene(GO_Term=Save_path, Save_path=Save_path_1)

    GO_Term_gene_expression_path = current_dir + '/example/test_count_matrix.csv'
    outputpath1 = current_dir + '/example/Term_matrix'
    batch_read_GO_Term_matrix(
        GO_Term_gene_path=Save_path_1,
        GO_Term_gene_expression_path=GO_Term_gene_expression_path,
        outputpath=outputpath1)

    outputpath2 = current_dir + '/example/Term_matrix'
    Save_path_feature_matrix = current_dir + '/example/test_feature_matrix.csv'
    batch_exacting_feature_KPCA(read_path=outputpath2,
                                outputpath=Save_path_feature_matrix,
                                n_components=0.4)

    label_path = current_dir + '/example/test_label.csv'
    clustering(read_path=Save_path_feature_matrix,
               n_clusters=5,
               label_path=label_path)
Example #3
def main():
    n_regions = 4

    # Loading region data and calculate them if not saved
    try:
        with np.load('region_labels.npz') as r_labels:
            region_labels = r_labels['matrix']
    except (FileNotFoundError, KeyError):
        region_labels = region_calculation(
            n_regions=n_regions, show_silhouette=True)
        np.savez_compressed('region_labels.npz', matrix=region_labels)

    print('Fetching Data...')
    with np.load('av_model_data30.npz') as m:
        av_matrix = m['matrix']
    with open("datetimes.txt", "rb") as fp:
        dates = pickle.load(fp)
    print('Finished Fetching Data')

    # Clustering parameters
    mode = 'kmeans'
    n_clusters = [2, 3, 4] # Comparing results with 2, 3, and 4 temporal clusters

    data = []
    s_avg = []

    # Keeping relevant dates
    new_d = []
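    # The step of ~30.4325 appears to be days per month, i.e. keep roughly one date per month from (presumably daily) dates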
    for i in range(int(len(dates)/30.4325)):
        new_d.append(dates[int(i * 30.4325)])

    # Clustering with regional average by chemical
    for i in range(4):
        # Regional averages for this chemical, computed once so data[i] stays aligned with chemical i
        data.append(average_by_region(matrix=av_matrix, chemical=i,
                                      r_labels=region_labels, n_regions=n_regions))
        for n in n_clusters:

            # Clustering
            if mode == 'kmeans':
                clustered_data = clustering(
                    data=data[i], n_clusters=n, mode='kmeans', verbose=False)
            elif mode == 'hierarchical':
                clustered_data = clustering(
                    data=data[i], n_clusters=n, mode='hierarchical', verbose=False)

            print("The " + str(n) + " cluster sizes are:")
            cluster_sizes = [len(list(compress(data[i], clustered_data.labels_ == cluster)))
                             for cluster in range(n)]
            print(cluster_sizes)

            s_avg.append(silhouette_plot(labels=clustered_data.labels_,
                                         data=data[i], plotGraph=False, n_clusters=n))

            # Two different ways of visualizing the results
            timeseries_plot(data=clustered_data.labels_, t=new_d)
            timeClustersVisualization(
                labels=clustered_data.labels_, data_points_per_year=12, n_clusters=n)
    print(s_avg)
Example #4
def main():
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Wrong number of arguments, usage: {0} input_file [output_file] \
                (\"output.txt\" by default)".format(sys.argv[0]))
        sys.exit()

    input_filename = sys.argv[1]
    if len(sys.argv) == 3:
        clustering.clustering(input_filename, sys.argv[2], console_out=True)
    else:
        clustering.clustering(input_filename, console_out=True)
Example #5
def main(args):
    node_embeddings = load_embeddings(args.embedding_file)
    if args.label_file:
        labels = read_node_label(args.label_file)

    if args.modularity:
        print("Modularity")
        modularity(args, node_embeddings, args.min_k, args.max_k)

    if args.reconstruction:
        print("Graph reconstruction")
        reconstr(args, node_embeddings, args.k_nbrs)

    if args.clustering:
        print("Clustering")
        clustering(node_embeddings, labels, args.exp_times)

    if args.link_prediction:
        print("Link prediction")
        link_prediction(args.input, node_embeddings)

    if args.classification:
        X = list(labels.keys())
        Y = list(labels.values())
        print("Node classification")
        clf_ratio_list = args.clf_ratio.strip().split(',')
        result_list = {}
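        # Sweep train ratios 0.1 through 0.9; the clf_ratio_list parsed above is left unused here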
        train_ratio = np.asarray(range(1, 10)) * .1
        for clf_ratio in train_ratio:  # clf_ratio_list:
            result_per_test = []
            for ti in range(args.exp_times):
                clf = Classifier(vectors=node_embeddings, clf=LogisticRegression())
                myresult = clf.split_train_evaluate(X, Y, float(clf_ratio))
                result_per_test.append(myresult)
            result_list[clf_ratio] = result_per_test

        print('-------------------')
        for clf_ratio in train_ratio:
            print('Train percent:', clf_ratio)
            results = result_list[clf_ratio]
            for index, result in enumerate(results):
                print('Shuffle #%d:   ' % (index + 1), result)

            avg_score = defaultdict(float)
            for score_dict in results:
                for metric, score in score_dict.items():
                    avg_score[metric] += score
            for metric in avg_score:
                avg_score[metric] /= len(results)
            print('Average score:', dict(avg_score))
            print('-------------------')
Example #6
def figure_5_parameter_values(subjects=None, shape=None, fignum=105):
    """Paper figure.
    Plots, on the left, the histogram of alphas. On the right, the inferred
    shapes for all selected subjects.
    """
    if subjects is None:
        subjects = range(35)

    if shape is None:
        # shapes = [shape_pars[0] for shape_pars in invp.__rds__()]
        shape = 'exponential'

    fig = plt.figure(fignum, figsize=(9, 5))
    fig.clear()

    num_colors = 3
    cmap = figure_colors('lines_cmap')
    colors = [cmap(x) for x in np.linspace(0, 1, num_colors)]

    maax_alpha = plt.subplot2grid((2, 2), (0, 0), rowspan=1)
    maax_bias = plt.subplot2grid((2, 2), (0, 1), rowspan=1)
    maax_lnc = plt.subplot2grid((2, 2), (1, 0), rowspan=1)
    maax_clu = plt.subplot2grid((2, 2), (1, 1), rowspan=1)
    alpha_hist(subjects, maax=maax_alpha, shapes=[
        shape], color=figure_colors('histograms'), divisors='auto')
    bias_hist(subjects, maax=maax_bias, shapes=[shape],
              color=figure_colors('histograms'),
              divisors=np.linspace(0.6, 1.2, 7))
    if shape == 'unimodal_s':
        centroids, membership, _ = cl.clustering(
            subjects, k=3, clustering_type='kmeans', shape=shape)
        cluster_names = [[]] * 3
        cluster_names[centroids[:, 1].argmax()] = r'$\uparrow \sigma$'
        cluster_names[centroids[:, 0].argmax()] = r'$\uparrow \mu$'
        cluster_names[centroids.sum(axis=1).argmin()] = r'$\downarrow \mu$'
    elif shape == 'exponential':
        centroids, membership, _ = cl.clustering(
            subjects, k=2, clustering_type='kmeans', shape=shape)
        cluster_names = [[]] * 2
        cluster_names[centroids.argmax()] = '$highSTP$'
        cluster_names[centroids.argmin()] = '$lowSTP$'

    plot_cluster_shapes(subjects=subjects, shape=shape, maax=maax_lnc,
                        colors=colors, centroids=centroids,
                        legends=cluster_names)
    scatter_kappas(subjects=subjects, shape=shape, maax=maax_clu,
                   membership=membership, centroids=centroids, colors=colors)
    maax_lnc.set_label('Label via method')
    maax_lnc.legend(loc='upper left')
    plt.tight_layout()
    plt.show(block=False)
Example #7
    def build_model(self, path):
        list_crawling = []
        if self.crawling:
            with open(os.path.join(os.getcwd(), "ind_url/urls.txt"),
                      "r",
                      encoding="utf8",
                      errors="ignore") as f:
                document = f.read()
            list_crawling = document.split('\n')
        self.build = True
        docs = glob.glob(os.path.join(path + "/**", "*.txt"), recursive=True)
        print(docs)
        if not self.lineEdit_select_directory.text() == '':
            clustering.clustering(path + "/", 4)

            for i in reversed(range(self.tableWidget_relevant.rowCount())):
                self.tableWidget_relevant.removeRow(i)
            for item in range(len(docs)):
                row_position = self.tableWidget_relevant.rowCount()

                qwidget = QWidget()
                checkbox = QCheckBox()
                checkbox.setChecked(False)

                qhboxlayout = QHBoxLayout(qwidget)
                qhboxlayout.addWidget(checkbox)
                # qhboxlayout.setAlignment(Qt.AlignCenter)
                qhboxlayout.setContentsMargins(0, 0, 0, 0)
                name = os.path.basename(docs[item])
                if self.crawling:
                    name = list_crawling[item]
                self.tableWidget_relevant.insertRow(row_position)
                self.tableWidget_relevant.setItem(
                    row_position, 0,
                    QTableWidgetItem(clustering.all_label_from_cluster()))
                self.tableWidget_relevant.setItem(row_position, 1,
                                                  QTableWidgetItem(name))
                self.tableWidget_relevant.setCellWidget(
                    row_position, 2, qwidget)

            self.disable_buttons()

            json_value = json.dumps({'action': 'build', 'path': path})

            modelo.model(json_value)

            self.enable_buttons()
            self.indexing_label.setText("")
Example #8
    def perform_clustering(self):
        self.cluster_dialog.textEdit.setText('Clustering...')

        if self.cluster_dialog.checkbox_pca.isChecked():
            dim = int(self.cluster_dialog.line_pca.text())
            matrix = reduce_dimensionality(self.dicom_data.data_array, dim)
        else:
            matrix = self.dicom_data.data_array

        fr = int(self.cluster_dialog.fr.text())
        to = int(self.cluster_dialog.to.text())
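        # Zero out everything outside the selected [fr, to) range along the slice axis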
        matrix[:,:,:fr,:] = 0
        matrix[:,:,to:,:] = 0

        coordinates = self.cluster_dialog.checkbox_coords.isChecked()    
        clusters = int(self.cluster_dialog.line_clusters.text())

        if self.cluster_dialog.kmean.isChecked():
            function = MiniBatchKMeans
        elif self.cluster_dialog.agglomerative.isChecked():
            function = AgglomerativeClustering
        elif self.cluster_dialog.dbscan.isChecked():
            function = DBSCAN
        elif self.cluster_dialog.optics.isChecked():
            function = OPTICS
        
        if self.cluster_dialog.slicewise.isChecked():
            w = matrix.shape[0]
            h = matrix.shape[1]
            slices = matrix.shape[2]
            stacks = matrix.shape[3]
            z = self.slider_slice.value()
            

            if self.plane == 'tra':
                matrix = matrix[:,:,z,:].reshape([w, h, 1, stacks])
                self.dicom_data.cluster_array[:,:,z,0] = clustering(matrix, function, clusters, coordinates)[:,:,0,0]
            if self.plane == 'cor':
                matrix = matrix[z,:,:,:].reshape([1, h, slices, stacks])
                self.dicom_data.cluster_array[z,:,:,0] = clustering(matrix, function, clusters, coordinates)[0,:,:,0]
            if self.plane == 'sag':
                matrix = matrix[:,z,:,:].reshape([w, 1, slices, stacks])
                self.dicom_data.cluster_array[:,z,:,0] = clustering(matrix, function, clusters, coordinates)[:,0,:,0]
            
        else:
            self.dicom_data.cluster_array = clustering(matrix, function, clusters, coordinates)
        self.cluster_dialog.textEdit.setText('Clustering accomplished!')
        self.labeling_window.lineEdit.setText(str(clusters))
        self.set_view('segments')
Example #9
def solution_test_main():
    filenames = ("solution_test1.txt", "solution_test2.txt",
                 "solution_test3.txt")

    expected_partition1 = (('A', 'B', 'C'), ('D', 'E', 'F'))
    expected_partition2 = (('0', '1', '2', '3', '4', '5'), ('6', '7', '8', '9',
                                                            '10', '11', '12'),
                           ('13', '14', '15', '16'))
    expected_partition3 = (('0', '1', '2', '3'), ('4', '5', '6', '7'),
                           ('8', '9', '10', '11'), ('12', '13', '14', '15'))
    expected_partitions = (expected_partition1, expected_partition2,
                           expected_partition3)

    for filename, expected_partition in zip(filenames, expected_partitions):
        clusters_partition = clustering.clustering(filename,
                                                   "solution_test_output.txt")
        for cluster in expected_partition:
            number = clusters_partition[cluster[0]]
            for node in cluster:
                if clusters_partition[node] != number:
                    os.remove("solution_test_output.txt")
                    return False
        labels_clusters = []
        for cluster in expected_partition:
            if clusters_partition[cluster[0]] in labels_clusters:
                os.remove("solution_test_output.txt")
                return False
            else:
                labels_clusters.append(clusters_partition[cluster[0]])

    os.remove("solution_test_output.txt")
    return True
Example #10
def use_case_kmeans(users_skills, clusters_ground_truth):
    print("Clustering")
    clustering_model, times = clustering(users_skills,
                                         range(*clustering_range), True)
    print("- Number of clusters found", len(clustering_model.cluster_centers_))
    print("- Real number of clusters", len(skills_sets))

    evaluate_clustering(clusters_ground_truth, clustering_model.labels_)

    if False:
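        # Disabled visualization: project the user skills and the cluster centers to 2-D with PCA and scatter-plot them (assumes an axs figure grid defined elsewhere)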
        pca = PCA(n_components=2)
        #
        pca.fit(users_skills)
        new_data = pca.transform(users_skills)
        #
        pca.fit(clustering_model.cluster_centers_)
        new_data2 = pca.transform(clustering_model.cluster_centers_)
        c = np.concatenate(
            (clustering_model.labels_,
             np.array([6] * len(clustering_model.cluster_centers_))))
        new_data = np.concatenate((new_data, new_data2), axis=0)
        #
        axs[1, 0].scatter(new_data.T[0], new_data.T[1], c=c, alpha=0.5)

    print("Plotting graph")
    plot_graph(G, None, colors=clustering_model.labels_)
    return times
Example #11
def klasteryzacja():
    x_points, y_points, details = clustering()
    all_points = []
    for idx, xy in enumerate(zip(x_points, y_points)):
        point = Point(idx + 1, tuple(xy))
        all_points.append(point)
    return all_points, details
Example #12
def use_case_kmeans(G, users_skills, clusters_ground_truth):
    clustering_range = (2, 10)
    distance_function = "euclidean"

    print("Clustering")
    print("Using KMeans")
    clustering_model = clustering(users_skills, range(*clustering_range), True)
    print("- Number of clusters found", len(clustering_model.cluster_centers_))
    print("- Real number of clusters", len(np.unique(clusters_ground_truth)))

    users_distances_to_centers = cdist(
        users_skills, clustering_model.cluster_centers_, metric=distance_function)

    print("Link prediction")
    model, y_train, predicted_train, y_test, predicted_test = link_prediction(
        G, users_distances_to_centers)

    print("Evaluation")
    print("- Train")
    print_evaluate(y_train, predicted_train)
    print("- Test")
    print_evaluate(y_test, predicted_test)

    print("Visualization")
    visualization(model, G, users_distances_to_centers,
                  clustering_model.labels_)
Example #13
def main() -> None:
    small_data_answer = clustering(get_small_data(), k=4)
    print("Question 1 answer:", small_data_answer)

    big_graph = get_big_data()
    kosaraju = Kosaraju(big_graph)
    components = kosaraju.run()
    largest_k = len(components)
    print("Question 2 answer:", largest_k)
Example #14
    def testBasic(self):
        point_objs = []
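        # Five groups of ten points each, offset by 10000 so k-means can separate them cleanly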
        for i in range(5):
            for j in range(10):
                point_objs.append(Point(i * 10000 + j, i * 10000 + j))

        km = clustering(point_objs, 5)
        clusters = km.k_means(False)

        self.assertEqual(len(clusters), 5)
Example #15
def test_solution_correctness(filename, expected_partition):
    clusters_partition = clustering.clustering(filename, file_output=False)
    for cluster in expected_partition:
        number = clusters_partition[cluster[0]]
        for node in cluster:
            assert clusters_partition[node] == number

    labels_clusters = []
    for cluster in expected_partition:
        assert clusters_partition[cluster[0]] not in labels_clusters
        labels_clusters.append(clusters_partition[cluster[0]])
Example #16
def main():
    # start = time.time()

    num = 5
    data = [
        'pollen_human', 'goolam', 'petal_human', 'biase', 'kelin', 'zeisel'
    ]
    cluster_num = [11, 5, 5, 4, 4, 7]
    n_component = [0.4, 0.4, 0.4, 0.4, 0.6, 0.8]
    data_name = data[num]
    n_components = n_component[num]
    n_clusters = cluster_num[num]

    GO_Term_path = 'GO_BP_MF_CC.xlsx'
    outputpath = data_name + '_dealing.csv'
    batch_read_GO_Term(GO_Term_path, outputpath)

    GO_Term = data_name + '_dealing.csv'
    Save_path = data_name + '_duplicate_removal_1.csv'
    screen_Term_gene_1(GO_Term, Save_path)

    GO_Term_1 = data_name + '_duplicate_removal_1.csv'
    Save_path_1 = data_name + '_duplicate_removal.csv'
    screen_Term_gene(GO_Term_1, Save_path_1)

    Save_path_2 = data_name + '_duplicate_removal.csv'
    GO_Term_gene_expression_path = data_name + '_count_matrix.csv'
    outputpath1 = 'Term_matrix'
    batch_read_GO_Term_matrix(
        GO_Term_gene_path=Save_path_2,
        GO_Term_gene_expression_path=GO_Term_gene_expression_path,
        outputpath=outputpath1)

    read_path = 'Term_matrix'
    outputpath2 = data_name + '_KPCA_' + str(n_components) + '_cosine.csv'

    batch_exacting_feature_KPCA(read_path, outputpath2, n_components)

    read_path = data_name + '_KPCA_' + str(n_components) + '_cosine.csv'
    label_path = data_name + '_label.csv'
    clustering(n_clusters, read_path, label_path)
Example #17
def evaluate_single_extraction(prediction, truth, index_test,
                               labelled_entities):

    correct = True
    atleast_one = False

    entities = labelled_entities[index_test]
    cluster_map = clustering(entities)
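    # cluster_map maps each entity to a cluster id; -1 appears to mark entities not assigned to any cluster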

    assert len(entities) == len(prediction)
    assert len(entities) == len(truth)

    true_clusters = set()

    for test, entity in zip(truth, entities):

        if test != 1.0:
            continue

        cluster = cluster_map[entity]
        if cluster > -1:
            true_clusters.add(cluster)

    for pred, test, entity in zip(prediction, truth, entities):

        pred_is_one = pred == 1.0

        test_is_one = test == 1.0
        test_is_zero = test == 0.0

        cluster = cluster_map[entity]

        if pred_is_one and cluster > -1:

            if cluster in true_clusters:
                atleast_one = True
            else:
                correct = False

        else:

            if pred_is_one and test_is_zero:
                correct = False

            if pred_is_one and test_is_one:
                atleast_one = True

    # if (correct and atleast_one) or (sum(prediction) == 0 and sum(truth) == 0):
    if correct or (sum(prediction) == 0 and sum(truth) == 0):
        return 1
    else:
        return 0
Example #18
def factfile(DAG, TR, out, DAG_name='temp'):
    N = DAG.number_of_nodes()
    E = DAG.number_of_edges()
    [c_plus,c_zero,c_minus] = clus.clustering(DAG)
    N_TR = TR.number_of_nodes()
    E_TR = TR.number_of_edges()
    [c_plus_TR,c_zero_TR,c_minus_TR] = clus.clustering(TR)
    degree_test(DAG, TR, DAG_name)
    
    out.write('DAG:' + '\n')
    out.write('Number of nodes: ' + str(N) + '\n')
    out.write('Number of edges: ' + str(E) + '\n')
    out.write('clustering_plus: ' + str(c_plus) + '\n')
    out.write('clustering_zero: ' + str(c_zero) + '\n')
    out.write('clustering_minus: ' + str(c_minus) + '\n')
    
    out.write('Transitive Reduction of DAG:' + '\n')
    out.write('Number of nodes: ' + str(N_TR) + '\n')
    out.write('Number of edges: ' + str(E_TR) + '\n')
    out.write('clustering_plus: ' + str(c_plus_TR) + '\n')
    out.write('clustering_zero: ' + str(c_zero_TR) + '\n')
    out.write('clustering_minus: ' + str(c_minus_TR) + '\n')
Example #19
def scatter_kappas(subjects=None, shape=None, membership=None,
                   colors=None, centroids=None, labels=None, maax=None,
                   fignum=6):
    """Makes a scatter plot of kappa values, coloring them according to
    their membership.
    """
    if subjects is None:
        subjects = range(35)
    if colors is None:
        colors = ['blue', 'red', 'green']
    if labels is None:
        labels = colors
    if shape is None:
        shape = 'exponential'

    if membership is None:
        _, membership, _ = cl.clustering(subjects, k=2,
                                         clustering_type='kmeans',
                                         shape=shape)
    membership = np.array([membership[key] for key in membership.keys()])
    num_clusters = np.unique(membership).size

    if maax is None:
        fig = plt.figure(fignum)
        fig.clear()
        maax = fig.add_subplot(111)

    best_pars = ba.best_model(subjects, shapes=[shape])
    kappa = np.zeros(len(subjects))
    for subject in best_pars.keys():
        kappa[subject] = best_pars[subject][1][0][-1][1]

    counter = 1
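    # Plot each cluster's subjects side by side, with a horizontal line at the cluster centroid when centroids are given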
    for c_cluster in range(num_clusters):
        c_subs = np.where(membership == c_cluster)[0]
        maax.scatter(range(counter, counter + len(c_subs)), kappa[c_subs],
                     color=colors[c_cluster])
        if centroids is not None:
            maax.plot(range(counter, counter + len(c_subs)),
                      np.tile(centroids[c_cluster], len(c_subs)),
                      color=colors[c_cluster])
        counter += len(c_subs)

    maax.set_xlim(0, len(subjects) + 1)
    maax.set_ylim(0, 110)
    maax.set_yticklabels(np.array(maax.get_yticks() / 10, dtype=int))
    maax.set_xlabel('Subjects')
    maax.set_ylabel('Sensitivity to points (STP)')
    maax.set_title('D', loc='left')
    plt.show(block=False)
Example #20
def evaluate_clustering():
    warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

    X = []
    Y = []
    nb_cluster_found = []

    max_skills_sets_sizes = 30
    for i in range(3, max_skills_sets_sizes):
        print(i)
        X.append(i)
        skills_sets = generate_skills_sets(i, 5, 7)
        users_skills, clusters_ground_truth = generate_user_skills(
            skills_sets, 500, 1, 2)

        clustering_range = (3, max_skills_sets_sizes)
        clustering_model = clustering(users_skills, range(*clustering_range),
                                      False)

        nb_cluster_found.append(len(clustering_model.cluster_centers_))

        info_score = normalized_mutual_info_score(clusters_ground_truth,
                                                  clustering_model.labels_)
        Y.append(info_score)

    plt.figure(figsize=(10, 5))
    plt.tight_layout()
    plt.title("Normalized mutual info score over number of skills sets/jobs")
    plt.xlabel("Number of skill sets/jobs")
    plt.ylabel("Normalized mutual info score")
    X = np.array(X)
    Y = np.array(Y)
    plt.plot(X, Y)

    is_correct = np.array(X) == np.array(nb_cluster_found)
    correct_indices = np.where(is_correct)[0]
    incorrect_indices = np.where(np.logical_not(is_correct))[0]
    plt.scatter(X[correct_indices],
                Y[correct_indices],
                color="blue",
                label="Correct number of cluster found")
    plt.scatter(X[incorrect_indices],
                Y[incorrect_indices],
                color="red",
                label="Incorrect number of cluster found")
    plt.ylim(0.95, 1.05)
    plt.legend()
    plt.savefig("evaluation_clustering.png")
    plt.show()
Example #21
File: ck.py Project: konsbn/graph
def ck(adjm,nodes):
	import clustering as cl 
	import degree as dg 
	dist = {}
	for j in nodes:
		deg = dg.degree(adjm,j)
		cls = cl.clustering(adjm, j)
		if not if_present(deg, dist):
			dist[deg] = [cls]
		else:
			dist[deg].append(cls)
	dist = popit(dist)
	# dist = ckdist(dist)
		#dist[deg] = cls
	return dist
Example #22
File: ck.py Project: konsbn/graph
def ck(adjm, nodes):
    import clustering as cl
    import degree as dg
    dist = {}
    for j in nodes:
        deg = dg.degree(adjm, j)
        cls = cl.clustering(adjm, j)
        if not if_present(deg, dist):
            dist[deg] = [cls]
        else:
            dist[deg].append(cls)
    dist = popit(dist)
    # dist = ckdist(dist)
    #dist[deg] = cls
    return dist
Example #23
def run():
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print ("jieba " + str(time.clock() - start_time))
    
    start_time = time.clock()

    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)

    
    start_time = time.clock()

    for entry in info.entries:
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        
        # Combine word count of both summary and title and title weights more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word

    print ("preprocess " + str(time.clock() - start_time))
        

#     result = Counter()
#     for entry in info.entries:
#         result.update(entry["bag_of_words"])
#     printList(result) 
        
    # Clustering them
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print ("clustering " + str(time.clock() - start_time))

    # Print the result        
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector), vector.data["title"], vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
Example #24
def plot_cluster_shapes(subjects=None, maax=None, shape=None, colors=None,
                        centroids=None, labels=None, legends=None, fignum=4):
    """Finds the centers of the clusters in the data and plots the shapes that
    these centers create.
    """
    if subjects is None:
        subjects = range(35)
    if shape is None:
        shape = 'unimodal_s'
    flag_noshow = True
    if maax is None:
        fig = plt.figure(fignum)
        maax = fig.add_subplot(111)
        flag_noshow = False
    if colors is None:
        colors = ['blue', 'red', 'green']
    if labels is None:
        labels = colors
    if legends is None:
        legends = colors

    if centroids is None:
        centroids, _, _ = cl.clustering(subjects, k=3,
                                        clustering_type='kmeans',
                                        shape=shape)

    mabes = bc.betMDP(nS=72, thres=60)
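    # For each centroid, rebuild the goal shape it implies, normalize it to its maximum and plot it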
    for ix_cen, centroid in enumerate(centroids):
        shape_pars = [shape] + [par for par in centroid]
        lnc = mabes.set_prior_goals(shape_pars=shape_pars, convolute=False,
                                    cutoff=False, just_return=True)
        lnc /= lnc.max()
        maax.plot(lnc, color=colors[ix_cen],
                  linewidth=3, label=legends[ix_cen])
        maax.set_xlim([0, 71])
        maax.set_xticklabels(np.array(maax.get_xticks(), dtype=int) * 10)

        shaded_threshold = np.arange(mabes.thres, mabes.nS)
        bg_color = figure_colors('past_threshold_background')
        maax.fill_between(shaded_threshold, 0, maax.get_ylim()[-1],
                          alpha=0.1, color=bg_color)
    # maax.set_title('Cluster centers'' shapes')
    maax.set_title('C', loc='left')
    maax.set_xlabel('Points')
    maax.set_ylabel('Valuation (a.u.)')
    if not flag_noshow:
        plt.show(block=False)
Example #25
def main():
	n = 5	#Number of nodes
	e = 7	#Number of edges
	A = [[0 for x in xrange(n)] for x in xrange(n)]  #Adjacency Matrix 

	i = 1	#iterator

	while i <= e:
		a = random.randint(0,n-1)
		b = random.randint(0,n-1)

		if a!=b and A[a][b]!=1:
			A[a][b] = 1
			A[b][a] = 1
		else:
			i=i-1
		i+=1

	print A, "\n"

	total_cost = []
	for i in xrange(len(A)):
		start = i
		costs = dijkstra(A, start)
		print i, ":", costs, 
		total_cost.append(sum(costs))

	print "\nTotal cost:", total_cost, "\n"

	characteristic_pathlength = sum(total_cost)/float(n)

	print "Characteristic pathlength:", characteristic_pathlength, "\n"


	#Clustering Coefficient of the graph:
	clustering_coefficient=clustering(A)	
	print "Clustering coefficients are:", clustering_coefficient, "\n"


	#Average clustering coefficient of the graph
	aver_clustering=sum(clustering_coefficient)/n
	print "Average clustering coefficient of the network:", aver_clustering, "\n"


	return 2
Example #26
def interval_test(DAG,start,end):
    
    lp = dl.lpd(DAG,start,end)
    length = lp[2]
    print 'The longest path between %s and %s is %d edges long' %(start,end,length)
    
    interval = lc.interval(DAG,start,end)
    N = interval.number_of_nodes()
    E = interval.number_of_edges()
    print 'The interval contains %d nodes and %d edges' %(N,E)
    
    c = clus.clustering(interval)
    print 'For the interval, c+ is %f, c0 is %f, c- is %f' %(c[0],c[1],c[2])
    
    #MMd = MM.MM_dimension(interval)
    MPSD = mp.mpsd(interval,lp[1])
    #print 'The MM dimension of the interval is %f and the MPSD is %f' %(MMd,MPSD)
    print 'The MPSD is %f' %MPSD[0]
Example #27
def main():
    n = 5  #Number of nodes
    e = 7  #Number of edges
    A = [[0 for x in xrange(n)] for x in xrange(n)]  #Adjacency Matrix

    i = 1  #iterator

    while i <= e:
        a = random.randint(0, n - 1)
        b = random.randint(0, n - 1)

        if a != b and A[a][b] != 1:
            A[a][b] = 1
            A[b][a] = 1
        else:
            i = i - 1
        i += 1

    print A, "\n"

    total_cost = []
    for i in xrange(len(A)):
        start = i
        costs = dijkstra(A, start)
        print i, ":", costs,
        total_cost.append(sum(costs))

    print "\nTotal cost:", total_cost, "\n"

    characteristic_pathlength = sum(total_cost) / float(n)

    print "Characteristic pathlength:", characteristic_pathlength, "\n"

    #Clustering Coefficient of the graph:
    clustering_coefficient = clustering(A)
    print "Clustering coefficients are:", clustering_coefficient, "\n"

    #Average clustering coefficient of the graph
    aver_clustering = sum(clustering_coefficient) / n
    print "Average clustering coefficient of the network:", aver_clustering, "\n"

    return 2
Example #28
    def clustering(self, new_feat_names, nr_clusters):
        self.df_all = self.df_all.dropna(subset=new_feat_names)
        new_feat_df = self.df_all.loc[:, new_feat_names]
        x_matrix = new_feat_df.as_matrix()
        # kmeans = cl.KMeans(n_clusters=nr_clusters).fit(x_matrix)
        # cluster_centers = kmeans.cluster_centers_
        clusterer = clust.clustering(self.df_all, new_feat_names)
        cluster_centers, self.df_all = clusterer.dtw_clustering()
        x = np.arange(4)
        for cluster_center in cluster_centers:
            plt.plot(x, cluster_center)
        plt.show()
        # self.df_all['labels'] = kmeans.labels_
        cluster_groups = self.df_all.groupby(['label'])
        count = 0
        highest_group_len = 0
        highest_group = None
        highest_group_name = 0
        for name, group in cluster_groups:
            cluster_center_list = [cluster_centers[int(name)]] * group.shape[0]
            # group.loc[:, new_feat_names] = cluster_center_list
            # self.create_dataframe_from_groups(group, count)
            if group.shape[0] > highest_group_len:
                if count > 0:
                    second_highest_group_len = highest_group_len
                    second_highest_group = highest_group
                    second_highest_group_name = highest_group_name
                    second_highest_group_index = highest_group_index
                highest_group_len = group.shape[0]
                highest_group = group
                highest_group_name = cluster_centers[int(name)]
                highest_group_index = name
            count += 1

        group_strats = highest_group.loc[:, new_feat_names].as_matrix()
        print('highest group name: %s' % str(highest_group_name))
        print('highest group len: %s' % str(highest_group_len))
        print('highest group index: %s' % str(highest_group_index))
        for strat in group_strats:
            plt.plot(x, strat)
        # plt.plot(x, highest_group_name)
        plt.show()
Example #29
def resultpage(f):
    Query = unicode(f.getvalue('query',''),'utf-8')
    (RequestURL, numbers, items) = googleajaxsearch(Query)
    # items: a list with one dict {title, url, content} per site
    html_ans = u'<p>検索ワード:<big><b>%s</b></big> - %s<br>%s</p>' % (
        Query,numbers,unicode(RequestURL,'utf-8'))

    # items: update each dict to {title, url, content, (extract,) unigram}
    """
    for item in items:
        item["extract"] = extracting(Query,item["title"],item["content"])
    """
    morphing(items,target='title|content')
    result = clustering(items)
    # result = [[group1][group2]...[groupk]]

    html_tree_cates = u''
    groupid=1
    for group in result:
        site_num=0
        for itemid in group:
            html_site = u'<font color="#ff0000">タグ: '
            for unigram in items[itemid]["unigram"]:
                html_site = html_site + u'%s ' % unigram
            html_site = html_site + u'</font><br><b>%s</b><br>%s<br>%s<br><br>' % (
                items[itemid]["title"],items[itemid]["url"],items[itemid]["content"])
            if site_num == 0:   # top site of each category
                html_this_cate = html_tree_site1 % html_site
                site_num = len(group)
            elif site_num == 1: # last site of each category
                html_this_cate = html_this_cate + html_tree_site3 % html_site
            else:
                html_this_cate = html_this_cate + html_tree_site2 % html_site
            site_num-=1
        html_tree_cates += html_tree_category % (unicode(str(groupid),'utf-8'),unicode(str(groupid),'utf-8'),unicode(str(groupid),'utf-8'),unicode(str(groupid),'utf-8'),unicode(str(groupid),'utf-8'),html_this_cate)
        groupid = groupid + 1
    html_ans = html_ans + html_tree_script + html_tree_head + html_tree_cates + html_tree_foot

    html = html_head + html_form % Query + html_ans + html_foot
    return html.encode('utf-8')
Example #30
def conjunto_segmentados(segmentados, n=3):
    import pandas as pd
    segments_df = pd.DataFrame()
    lens = []
    inicio = 0

    for conj in segmentados:
        segments_df = segments_df.append(conj)
        lens.append([inicio, conj.shape[0]])
        inicio = conj.shape[0]

    fit = clustering(segments_df, n)
    segments_df['cluster'] = fit.labels_
    segments_df['cluster'] = segments_df['cluster'].astype(str)

    for x in range(len(lens)):
        segmentados[x] = segments_df.iloc[lens[x]]
    confidences = []
    for i in range(n):
        confidences.append(
            extremos_incertidumbre(fit, filter_numerical(segments_df), i))

    return segmentados, fit, confidences, segments_df
Example #31
def histogram_clusters(subjects=None, shape=None, membership=None,
                       colors=None, labels=None, maax=None, fignum=5):
    """Makes a bar plot for membership of subjects in clusters."""
    if subjects is None:
        subjects = range(35)
    if colors is None:
        colors = ['blue', 'red', 'green']
    if labels is None:
        labels = colors
    if shape is None:
        shape = 'unimodal_s'

    if membership is None:
        _, membership, _ = cl.clustering(subjects, k=3,
                                         clustering_type='kmeans',
                                         shape=shape)
    membership = np.array([membership[key] for key in membership.keys()])
    ix_centroid = np.unique(membership)
    ix_centroid.sort()

    count_members = np.array([len(np.where(membership == x)[0])
                              for x in ix_centroid])

    if maax is None:
        fig = plt.figure(fignum)
        fig.clear()
        maax = fig.add_subplot(111)

    maax.bar(ix_centroid, count_members, width=1,
             color=colors)
    maax.set_xticks(ix_centroid + 0.5)
    maax.set_xticklabels(labels)
    maax.set_ylim(1.2 * np.array(maax.get_ylim()))
    # maax.set_title('Cluster members')
    maax.set_title('C', loc='left')
    maax.set_ylabel('Number of subjects')
    plt.show(block=False)
Example #32
#!coding:utf-8

# Scripted so the whole pipeline can be computed in one go

import sys
import make_data as md
import clustering as cls

#------------[start] generate data and compute the similarity matrix----------

if(len(sys.argv)>1): # command-line options given
    N = int(sys.argv[1]) # N: number of individuals
    M = int(sys.argv[2]) # M: dimension of each individual's feature vector

    K = md.make_K(N=N,M=M) # generate feature vectors -> build the similarity matrix

else:
    K = md.make_K() # without options, build the similarity matrix with the default values (N=5, M=?)

print K
#------------[end] generate data and compute the similarity matrix----------

#------------clustering----------

#clustering(N,M,W,alpha,beta) # collapsed Gibbs sampling given the similarity matrix and the other parameters


M=5
cls.clustering(K=K,M=M,W=K,alpha=1,beta=2)
Example #33
        if i ==0:
            cluster_pred = cluster_pred_tem.copy()
        else:
            cluster_pred = pd.concat([cluster_pred,cluster_pred_tem])
    
    return total_pred.sort_values(by='id').reset_index(drop=True), cluster_pred.sort_values(by='id').reset_index(drop=True)


# execution
if __name__=='__main__': 
    data_path = 'data/'
    perform_raw, rating, test_raw = load_data(data_path,trend=False,weather=False,query=False)
    train_var, test_var = make_variable(perform_raw,test_raw,rating)
    raw_data, y_km, train_len= preprocess(train_var,test_var,0.03,3,inner=False) 
    data = mk_trainset(raw_data,categorical=True) # categorical=True only for lgbm; for the other models use False -> one-hot encoding
    train, val, robustScaler = clustering(data,y_km,train_len,test=True) # test=True only when running on the test set

    # variables to drop based on permutation importance
    # the list assumes lgbm, so it contains categorical variables that were not one-hot encoded; when running random search
    # for the other models those names will not exist (e.g. min -> min_0, min_1, min_2 ...),
    # so either drop them as errors come up or remove the categorical variables from the drop list beforehand

    # also, these drops were chosen for the 0/1 models, so they may not fit a cluster-based random search;
    # Hyerin was told to use the intersection of the two lists for now, but try a better approach if you can think of one

    drop1_  = ['min_sales_med',  'min_sales_std',  'day_sales_rank', 'min_sales_rank',
    'min_order_rank', 'cate_sales_rank', 'cate_order_rank', 'cate_order_med',
    'cate_sales_med', 'prime', 'min_order_std', 'min_sales_mean',
    'day_order_rank', 'cate_order_std', 'min_order_med', 'min_order_mean',
    'cate_sales_mean', 'cate_sales_std', 'day_order_std', 'rating',
    'day_sales_med', 'min', 'day_order_med']
Example #34
    predictions = clf.predict(feature_vector)

    predictions_prob = clf.predict_proba(feature_vector)

    correctness = evaluate_single_extraction(predictions,
                                             target_vector_reshaped, idx,
                                             talker_entities)

    all_entities = [entity["entity"] for entity in entry["talker"]]
    # print("idx", idx)
    print("quote", entry["quote"])
    print("url", entry["source"])

    # print("all_entities", all_entities)
    cluster_map, inverse_cluster_map = clustering(all_entities,
                                                  return_inverse=True)
    print("cluster_map", cluster_map)
    # if correctness == 0:
    #     pass
    # print("wrong:")
    #
    # url_counts["wrong"].append(url_map_count[entry["source"]])
    # entities_counts["wrong"].append(len(all_entities))
    #
    # print("prediction")
    # talker_candidates = get_talker_candidates(predictions_prob, all_entities, cluster_map, inverse_cluster_map)
    # pprint(talker_candidates)
    #
    # print("truth")
    # pprint([entry["talker"][i]["entity"] for i, p in enumerate(target_vector_reshaped) if p==1])
    #
Example #35
    #print(score_idx)
    # print(sorted_score_vector)
    #print(score_matrix)

    # extract top N candidate vanishing points
    if len(VP) < numPointRank:
        numPointRank = len(VP)

    topN_VP = []

    for i in range(numPointRank):
        topN_VP.append(VP[score_idx[i]][0:2])
        print(topN_VP[i])
    # print(topN_VP)

    a = clustering(topN_VP)
    #

    #print(tmp_product_norm[0])

    #print(tmp_product_norm)
    #VD3D.append()

    #cv2.line(rgb_img, (x1,y1), (x2,y2), (0,0,255),2)

    #cv2.imshow("RGB",rgb_img)
    #cv2.imshow("CANNY",edges)

    #cv2.waitKey(300000)

    cv2.imshow("CANNY", rgb_img)
Example #36
    info = feedparser.parse(news_rss_url)
    printList(info.entries)
    for entry in info.entries:
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(pseg.cut(stripTag(entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(preprocess(pseg.cut(stripTag(entry.title))))
        
        # Combine word count of both summary and title and title weights more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word
        
#     result = Counter()
#     for entry in info.entries:
#         result.update(entry["bag_of_words"])
#     printList(result) 
        
    # Clustering them
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])

    # Print the result        
    for cluster in clusters:
        print "____FINAL___CLUSTER___"
        printList("CENTROID: " + cluster.centroidVector.data["title"])
        for vector in cluster.listOfVectors:
            printList(vector.data["title"])
        print "____END_OF_CLUSTER___"
    
Example #37
start = time.time()

geo_locs = []
#loc_ = Point(0.0, 0.0)  #tuples for location
#geo_locs.append(loc_)
#read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object
f = open('data.csv', 'r')
reader = csv.reader(f, delimiter=",")

for line in reader:
    loc_ = Point(line[0], float(line[1]), float(line[2]))  #tuples for name and location
    geo_locs.append(loc_)

#print len(geo_locs)
#for p in geo_locs:
#    print "%f %f" % (p.latit, p.longit)
#let's run k_means clustering. the second parameter is the no of clusters

cluster = clustering(geo_locs, 4)
flag = cluster.k_means()

end = time.time()

if flag == -1:
    print "Error in arguments!"
else:	
    print "%f" % (end-start)
    #the clustering results is a list of lists where each list represents one cluster
    print "Clustering results:"
    cluster.print_clusters(cluster.clusters)
Example #38
n_components = args.n_components
n_clusters = args.n_clusters

GO_Term_path = args.GO_Term_path
outputpath = data_name + '_dealing.csv'
batch_read_GO_Term(GO_Term_path, outputpath)

GO_Term = outputpath
Save_path = data_name + '_duplicate_removal_1.csv'
screen_Term_gene_1(GO_Term, Save_path)

GO_Term_1 = Save_path
Save_path_1 = data_name + '_duplicate_removal.csv'
screen_Term_gene(GO_Term_1, Save_path_1)

Save_path_2 = Save_path_1
GO_Term_gene_expression_path = args.expression_path
outputpath1 = args.outputpath1
batch_read_GO_Term_matrix(GO_Term_gene_path=Save_path_2, GO_Term_gene_expression_path=GO_Term_gene_expression_path,
                          outputpath=outputpath1)

read_path = outputpath1
outputpath2 = data_name + '_KPCA_' + str(n_components) + '_cosine.csv'

batch_exacting_feature_KPCA(read_path, outputpath2, n_components)

read_path = outputpath2
label_path = args.label_path
clustering(n_clusters, read_path, label_path)

Example #39
 def interpret_clustering_ten_minutes(self,times,entropy=False,minimum_traversals=2,minute_interval=10,days=False):
     if days==True:
             max_val = 1440*7
     else: 
             max_val=1440
     if len(times)>1:
         
         c = clustering()
         clusters=c.cluster_path_times(times,False,days)
         #print('clusters are:')
         #for key in clusters.keys():
          #   print clusters[key]
         averages=[]
         main_average=0
         total=0
         for key in clusters.keys():
             av_duration =0
             for (duration,date)in clusters[key]:
                 av_duration+=duration
                 main_average+=duration
                 total+=1
             av_duration/=len(clusters[key])
             averages.append(av_duration)
         main_average/=total
         #print averages
         #print main_average
         bins_amts = []
         bins_ests =[]
         zeroCount=[]
         partitions=[]
         #print 'days', days
        
         #print 'max_val',max_val
         for i in range(0,max_val/minute_interval):#create a bin for each ten minutes of the day
             bins_ests.append(0)
             bins_amts.append(0)
             zeroCount.append(0)
             partitions.append(i*minute_interval)
         partitions.append(max_val)
         #print clusters.keys()
         for i in range(len(clusters.keys())):
             #zeroCount=0
             for(duration,date) in clusters[clusters.keys()[i]]:
                 if duration ==0:
                     zeroCount[int(date)/minute_interval]+=1
                 else:
                     bins_amts[int(date)/minute_interval]+=1
                     bins_ests[int(date)/minute_interval]+=averages[i]
              
                     
         for i in range(0,max_val/minute_interval):
             
             if(bins_amts[i]>0):
                
                 bins_ests[i]/=bins_amts[i]
             else:
                 bins_ests[i]=main_average
             if zeroCount[i] > bins_amts[i]:
                 bins_ests[i]= -1 #special value for if this edge is blocked
         entropies=[]
         if entropy is True:
             entropies=[]
             for i in range(0,max_val/minute_interval):
                 bin=[]
                 for j in range(len(clusters.keys())):
                     for(duration,date) in clusters[clusters.keys()[j]]:
                         if (date>=i*minute_interval) & (date<(i+1)*(minute_interval)):
                            # print(i*minute_interval, (i+1)*minute_interval)
                             bin.append(j)
                 counts=dict()
                 total=len(bin)*1.0
                 for value in bin:
                     
                     try:
                         counts[value]+=1
                     except KeyError:
                         counts[value]=1.
                 #print counts
                 entropy=0
                 if total >= minimum_traversals:
                     for x in counts.keys():
                        # print('counts/total',counts[x]/total)
                         #print 'counts[x]',counts[x]
                         #print 'total',total
                         if counts[x] > 0:
                             entropy-=counts[x]/total*math.log(counts[x]/total)
                 else:
                     entropy=sys.maxint
                 entropies.append(entropy)
             return partitions,bins_ests,entropies
         else:   
             return partitions,bins_ests
     else:
         partitions=[]
         bin_ests=[]
         entropies=[]
         for i in range(0,max_val/minute_interval):
             partitions.append(i*minute_interval)
             bin_ests.append(sys.maxint)
             entropies.append(sys.maxint)
         if entropies:
             return partitions,bin_ests,entropies
         else:
             return partitions, bin_ests
Example #40
video = pd.merge(video1, video2, right_index=True, left_index=True)
video = pd.merge(video, video3, right_index=True, left_index=True)
video.index = [idx[:7] for idx in video.index]
video_norm = pd.DataFrame(pca.fit_transform(normalize(video)),
                          index=video.index)
print(video_norm.head(1))
print(sum(pca.explained_variance_ratio_))

path = 'result/merge/'

#Merging the modalities
audio_video = pd.merge(video_norm,
                       audio_norm,
                       right_index=True,
                       left_index=True)
clustering(audio_video, path + 'audio_video.html', nb_cluster=1)

audio_text = pd.merge(audio_norm, text_norm, right_index=True, left_index=True)
clustering(audio_text, path + 'audio_text.html', nb_cluster=1)

video_text = pd.merge(video_norm, text_norm, right_index=True, left_index=True)
clustering(video_text, path + 'video_text.html', nb_cluster=1)

audio_video_text = pd.merge(video_text,
                            audio_norm,
                            right_index=True,
                            left_index=True)
audio_video_text.to_csv('features/merge/all.csv',
                        sep='§',
                        index_label='Sequence')
# audio_video_text_norm = pd.DataFrame(normalize(audio_video_text), index=audio_video_text.index, columns=audio_video_text.columns)
Example #41
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(pseg.cut(stripTag(
            entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(
            preprocess(pseg.cut(stripTag(entry.title))))

        # Combine word count of both summary and title and title weights more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word

#     result = Counter()
#     for entry in info.entries:
#         result.update(entry["bag_of_words"])
#     printList(result)

# Clustering them
    clusters = clustering.clustering(
        [Cluster([Vector(entry)]) for entry in info.entries])

    # Print the result
    for cluster in clusters:
        print "____FINAL___CLUSTER___"
        printList("CENTROID: " + cluster.centroidVector.data["title"])
        for vector in cluster.listOfVectors:
            printList(vector.data["title"])
        print "____END_OF_CLUSTER___"
Example #42
import csv
import sys

geo_locs = []
#loc_ = Point(0.0, 0.0)  #tuples for location
#geo_locs.append(loc_)
#read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object
if len(sys.argv)>2:
    print "Input CSV file:" + sys.argv[2]
    f = open(sys.argv[2], 'r')
else:
    print "Input CSV file: it-2004.sites.gpscoords.csv"   
    f = open('it-2004.sites.gpscoords.csv', 'r')
reader = csv.reader(f, delimiter=",")
for line in reader:
    loc_ = Point(float(line[0]), float(line[1]))  #tuples for location
    geo_locs.append(loc_)
#print len(geo_locs)
#for p in geo_locs:
#    print "%f %f" % (p.latit, p.longit)
#let's run k_means clustering. the second parameter is the no of clusters
k_value = sys.argv[1]
print "K_Value: " + k_value
cluster = clustering(geo_locs, int(k_value))
flag = cluster.k_means(False)
if flag == -1:
    print "Error in arguments!"
else:
    #the clustering results is a list of lists where each list represents one cluster
    print "clustering results:"
    cluster.print_clusters(cluster.clusters)
Example #43
import random as rand
from clustering import clustering
from point import Point
import csv

geo_locs = []
#loc_ = Point(0.0, 0.0)  #tuples for location
#geo_locs.append(loc_)
#read the fountains location from the csv input file and store each fountain location as a Point(latit,longit) object
f = open('./drinking_fountains.csv', 'r')
reader = csv.reader(f, delimiter=",")
for line in reader:
    loc_ = Point(float(line[1]), float(line[2]))  #tuples for location
    geo_locs.append(loc_)
print(len(geo_locs))
#for p in geo_locs:
#    print "%f %f" % (p.latit, p.longit)
#let's run k_means clustering. the second parameter is the no of clusters
cluster = clustering(geo_locs, 8)
flag = cluster.k_means(False)
if flag == -1:
    print("Error in arguments!")
else:
    #the clustering results is a list of lists where each list represents one cluster
    print("clustering results:")
    cluster.print_clusters(cluster.clusters)
Example #44
#######################
# Dimension reduction
print("Reducing dimensionality ...")
vectors = dimensionreduction.reduce_dimension(vectors=vectors,
                                              method="pca",
                                              pre_normalization=False,
                                              target_dim=2,
                                              params={})
print("Dimensionality=%d" % vectors.shape[1])

#######################
# Clustering
print("Clustering ...")
model = clustering.clustering(vectors=vectors,
                              method="gmm",
                              params={
                                  "n_clusters": 10,
                                  "covariance_type": "full"
                              })
cluster_ids = model.get_cluster_assignments()
cluster_names = np.asarray(["C%s" % c_id for c_id in cluster_ids])
cluster_centers = model.get_cluster_centers()
cluster_covariances = model.get_cluster_covariances()
print("# of clusters=%d" % model.n_clusters)

#######################
# Visualization
print("Visualizing ...")
cluster_order = ["C%s" % c_id for c_id in range(model.n_clusters)]
visualizers.scatter(vectors=vectors,
                    categories=cluster_names,
                    category_name="Cluster",
Example #45
#coding:utf-8
import clustering as cl

if __name__ == '__main__':
    '''
    Output classes 0,1,2,3,4 as a csv file.
    The encoding is utf-8. To view it in Excel, convert it to Shift_JIS first with Sakura Editor or similar.
    '''
    
    current = cl.clustering('sony.csv')
    current.output_csv()
Example #46
from combineCSV import combCSV
from clustering import clustering

#class_Combine = combCSV('combinedNew.csv')
#class_Combine.readCSV('tokenDatanew.csv','tokenDatanew2.csv')

class_Cluster = clustering('combinedNew.csv')
#class_Cluster.pca()
#print class_Cluster.top_words()
#print class_Cluster.getFrequency()
class_Cluster.write()

print 'End of the code'