Example #1
def birch_algo(X, threshold=1.7, clustering=None):
        birch = Birch(threshold=threshold, n_clusters=clustering)
        t = time()
        birch.fit(X)
        time_ = time() - t
        labels = birch.labels_
        centroids = birch.subcluster_centers_
        n_clusters = np.unique(labels).size
        print(" The number of clusters is : %d" % n_clusters)
Example #2
def birchcluster(X):
  brc = Birch()
  brc.fit(X)
  # Plot result
  labels = brc.labels_
  centroids = brc.subcluster_centers_
  n_clusters = np.unique(labels).size
  print("n_clusters : %d" % n_clusters)
  return labels
Example #3
def birch_algo(X, threshold=1.7, clustering=None):
        birch = Birch(threshold=threshold, n_clusters=clustering)
        birch.fit(X)
        labels = birch.labels_
        centroids = birch.subcluster_centers_
        labels_unique = np.unique(labels)
        n_clusters = labels_unique.size
        print(" The number of clusters is : %d" % n_clusters)
        return labels, centroids, n_clusters     
def birch(x, n_clusters=None, threshold=0.5, branching_factor=5):
  birch_model = Birch(
    threshold=threshold, 
    n_clusters=n_clusters, 
    branching_factor=branching_factor
  )
  birch_model.fit(x)

  centroids = birch_model.subcluster_centers_
  c = birch_model.labels_
  k = len(centroids)

  return birch_model, (centroids, c, k)
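A short usage sketch for the birch wrapper above; the toy array is an assumption for illustration:

import numpy as np

x = np.array([[0.0, 0.0], [0.1, 0.1], [4.0, 4.0], [4.1, 4.1]])
model, (centroids, c, k) = birch(x, threshold=0.5)
print(k, list(c))  # number of subcluster centroids and per-point labels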
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
	#t = time.clock()
	global quota_for_each_cluster
	global brc
	global v
	global quota
	global select
	quota = 10000
	result_arr = QLINK_URLS + UNKNOWN_URLS
	for i, url in enumerate(result_arr):
		result_arr[i] = urlparse.urlparse(unquote(url.strip()))

	#l_dict = 
	v = DictVectorizer(sparse=False)
	data = v.fit_transform(extract_features(result_arr))
	ind_list = []
	ind_list_data = []
	low_bound = 8

	for col in xrange(data.shape[1]):
		if (np.sum(data[:, col]) > low_bound):
			ind_list.append(1)
			ind_list_data.append(col)
		else:
			ind_list.append(0)

	v = v.restrict(ind_list)
	data = data[:, ind_list_data] 
	#if (start_url[0].find("wikipedia") != -1):
	#	out_data("som_data_wiki/qlink.tfxidf", data[:500], start_url[:500])
	#	out_data("som_data_wiki/notqlink.tfxidf", data[500:], start_url[500:])
	#	out_data("som_data_wiki/data.tfxidf", data, start_url)
	#	out_template("som_data_wiki/data_features.tv", v.get_feature_names(), len(data))
	#	out_template("som_data_wiki/qlink_features.tv", v.get_feature_names(), len(data) / 2)
	#	out_template("som_data_wiki/notqlink_features.tv", v.get_feature_names(), len(data) / 2)
	#	return 0
	best_cou_clusters = data.shape[1]
	#k_means = KMeans(n_clusters=best_cou_clusters, init = 'random')
	#clust = k_means.fit_predict(data)
	brc = Birch(branching_factor=50, n_clusters=best_cou_clusters, threshold=0.2, compute_labels=True)
	clust = brc.fit_predict(data)
	select = SelectKBest(k=min(data.shape[1], 30))
	data = select.fit_transform(data, clust)
	clust = brc.fit_predict(data)
	#print data.shape

	quota_for_each_cluster = np.zeros(best_cou_clusters)
	clust_qlink = list(clust[:500])
	for i in xrange(best_cou_clusters):
		quota_for_each_cluster[i] = clust_qlink.count(i) / 500.0 * QUOTA 
	quota_for_each_cluster *= 2.0
def birch_cluster(init_ds,ts_flag = False):
    '''
    Parameters: init_ds - 2D list of data
                ts_flag - boolean specifying if the first column of init_ds is a datetime object or not
    Returns: 2D list with additional column denoting which cluster said row falls into
    '''

    if ts_flag:
        init_ds = [i[1:] for i in init_ds]

    brc = Birch()
    labels = brc.fit_predict(init_ds)
    
    return [init_ds[i]+[labels[i]] for i in range(len(init_ds)) ]
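A small usage sketch for birch_cluster above; the 2D list and its column layout are assumptions for illustration:

# hypothetical data: each row is one observation
init_ds = [[1.0, 2.0], [1.1, 2.1], [8.0, 9.0], [8.2, 9.1], [15.0, 1.0], [15.1, 1.2]]
for row in birch_cluster(init_ds):
    print(row)  # original columns plus the assigned Birch cluster label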
def main():
    #remove sub folders
    removeSubFolders(path+algorithm+'\\')
    
    for file in os.listdir(path):
        if file.endswith("-d.txt"):
            text_file = open(path+file,'r')
            
            ar = (text_file.readline().split(' '))
            ar.remove('\n')
            if(len(ar)>0):
                #print map(int,ar)
                row = map(int,ar);
                data.append(row)
                fileNames.append(file)
            #print(row)

    #create np array

    npData = np.array(data)
    n_samples, n_features = npData.shape
    brc = Birch(branching_factor=50, n_clusters=n_digits, threshold=0.5,compute_labels=True)
    #kmeans = KMeans(init='random', n_clusters=n_digits, n_init=500)
    brc.fit(npData)
    list1 = brc.labels_
    list2 = fileNames
    print brc.labels_
    print fileNames

    list1, list2 = zip(*sorted(zip(list1, list2)))

    print list1
    print list2
    '''
    k=0
    lim = len(list1)-1
    for i in range(0,n_digits):
        
        while(list1[k]==i):
            # want to copy these into folders
            copychar(list1[k],list2[k])
            print list1[k],list2[k]
            k+=1
            if k==lim:
                break
    '''
    for i in range(0,len(list1)):
        print list1[i],list2[i]
        copychar(list1[i],list2[i])
Example #8
   def obtainCodebook(self, sampled_x, x):

      print 'Obtaining codebook using Birch from sklearn...'
   
      scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
      scaled_x = StandardScaler().fit_transform(x)
      
      brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters, threshold=self.threshold, compute_labels=True)
      
      # obtain the codebook and the projections of the images onto the codebook (clusters of words)
      codebook = brc.fit(scaled_x_sampled)
      clusters = brc.predict(scaled_x)
      
      print 'Clusters obtained.'
      
      return codebook, clusters 
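The method above fits Birch on a scaled sample and then projects every point onto the learned codebook. A standalone sketch of that pattern, with assumed data and parameters:

import numpy as np
from sklearn.cluster import Birch
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
x = rng.rand(1000, 16)                                 # full data set (assumed)
sampled_x = x[rng.choice(len(x), 200, replace=False)]  # subsample used to learn the codebook

scaled_x_sampled = StandardScaler().fit_transform(sampled_x)
scaled_x = StandardScaler().fit_transform(x)

brc = Birch(branching_factor=50, n_clusters=10, threshold=0.5, compute_labels=True)
codebook = brc.fit(scaled_x_sampled)   # learn the codebook on the sample
clusters = brc.predict(scaled_x)       # assign every point to a cluster
print(np.bincount(clusters))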
Example #9
def define_segments(QLINK_URLS, UNKNOWN_URLS, QUOTA):
    # url to obj
    qlinks = map(parse_url, QLINK_URLS)
    ulinks = map(parse_url, UNKNOWN_URLS)

    # check netloc
    # print qlinks[0].netloc

    # extract features
    start = time.time()
    qlinks_f = [dict(Counter(zip(*extract_features([link], 0))[0])) for link in qlinks]
    ulinks_f = [dict(Counter(zip(*extract_features([link], 0))[0])) for link in ulinks]
    # print time.time() - start
    # start = time.time()

    v = DictVectorizer(sparse=False)
    x_ = v.fit_transform(qlinks_f + ulinks_f)

    best_features = np.sum(x_, axis=0) > 5

    m_features = np.sum(best_features)

    v = v.restrict(best_features)
    x_ = x_[:, best_features]

    clustering = Birch(branching_factor=BIRCH_BRANCHING_FACTOR, n_clusters=m_features,
                       threshold=BIRCH_THRESHOLD, compute_labels=True)
    y_ = clustering.fit_predict(x_)

    sel = SelectKBest(k=min(m_features, KBEST_K))
    x = sel.fit_transform(x_, y_)

    y = clustering.fit_predict(x)
    q_or_u = np.repeat([1, 0], [len(QLINK_URLS), len(UNKNOWN_URLS)])
    q_ = np.vstack((y, q_or_u)).T

    quota = zip(np.unique(y),
                (np.array([np.sum(q_[q_[:, 0] == c, 1]) for c in np.unique(y)]) / float(len(QLINK_URLS))) * QUOTA * 2)
    quota = {c: int(q) for c, q in quota}

    algos[qlinks[0].netloc] = {
        "clustering": clustering,
        "quota": quota,
        "sel": sel,
        "vect": v,
        "total_quota": QUOTA,
    }
Example #10
   def obtainClusters(self, hist):

      print 'Obtaining clusters using Birch from sklearn...'
   
      hist = np.array(hist)
      hist = hist.astype(float)      
      scaled_vec = StandardScaler().fit_transform(hist)
      
      brc = BIRCH(branching_factor=self.branching_factor, n_clusters=self.nclusters, threshold=self.threshold, compute_labels=True)
      
      # obtain the codebook and the projections of the images onto the codebook (clusters of words)
      codebook = brc.fit(scaled_vec)
      clusters = brc.predict(scaled_vec)
      
      print 'Clusters obtained.'
      
      return clusters
def runBrich(K_cluster, cluster_input):
    # clustering by topic-probability vector of each category
    t0 = time()
    bri = Birch(n_clusters=K_cluster)
    bri.fit(cluster_input)
    print("done in %0.3fs" % (time() - t0))

    with open('result/brich_cluster_' + str(K_cluster) + '.txt', 'w') as f:
        f.write("cluster_centers\n")
        f.write(str(bri.subcluster_centers_))
        f.write("\n==========\n")
        f.write("labels (sequence of cluster # which input belongs to )\n")
        f.write(str(bri.labels_))
        f.write("\n==========\n")
        f.write("inertia\n")
        f.write(str(bri.subcluster_labels_))
        f.write("\n==========\n")

    return bri.labels_
Example #12
    def split_birch(self, branching_factor, threshold):

        # Extract dataset from files
        dataset = [f.dataset for f in self.files]

        # Initialize classifier
        classifier = Birch(branching_factor=branching_factor, n_clusters=None, threshold=threshold)

        classifier.fit(dataset)

        # Get index
        index = classifier.predict(dataset)

        count = max(index) + 1

        # Create new clusters
        clusters = [Cluster(self.directory, self.name + '-' + str(i)) for i in range(count)]
        for i in range(0, len(self.files), 1):
            clusters[index[i]].add_file(self.files[i])

        return clusters
Example #13
def test_birch_with_depot_calculation():
    points = points_from_file('tsps/berlin52.txt')
    matrix = load_matrix(points)
    X = [[p[1],p[2]] for p in points]
    est = Birch(n_clusters=3)
    est.fit(X)
    labels = est.labels_
    hl_matrix, clusters, G = load_matrices_from_labels(points,labels)
    depots, C = compute_depots(clusters, matrix, G, per_cluster=True)
    depots_actual, _ = compute_depots(clusters, matrix, G)
    cluster_optimal_cost, R, hl_route = clustered_tsp_solve(points, 3, labels=labels, depots=depots)
    cluster_optimal_cost += C

    print(depots_actual)
    print(R,C)

    for depot in depots_actual:
        for r in R:
            if r[1][0] == depot:
                for point in r[1]:
                    print(matrix.points[point])
        print('')
Example #14
def build_model(df, cluster_type="kmeans", seed=1):
    if cluster_type == "birch":
        model = Birch(n_clusters=N_CLUSTERS)
        res = model.fit_predict(df)
    elif cluster_type == "minibatch":
        model = MiniBatchKMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
    elif cluster_type == "em":
        model = mixture.GMM(n_components=N_CLUSTERS)
        model.fit(df)
        res = model.predict(df)
    elif cluster_type == 'lda':
        model = lda.LDA(n_topics=N_CLUSTERS, n_iter=1500, random_state=seed)
        data_to_cluster = np.array(df).astype(int)
        lda_res = model.fit_transform(data_to_cluster)
        res = []
        for i in lda_res:  # for now - do hard clustering, take the highest probability
            res.append(i.argmax())
    else:
        model = KMeans(n_clusters=N_CLUSTERS, random_state=seed)
        res = model.fit_predict(df)
        df_array = np.array(df)

        dis_dict = {}
        for i in range(N_CLUSTERS):
            dis_dict[i] = model.cluster_centers_[i]
        all_dist = []
        for line_idx in range(len(df_array)):
            label =  model.labels_[line_idx]
            dist = calc_distance(df_array[line_idx],dis_dict[label])
            all_dist.append(dist)
        df["distance_from_cluster"] = all_dist

    #clusters = model.labels_.tolist()
    #print ("clusters are:",clusters)
    print(""">>>> model is: %s, # of clusters:%s, and %s""" %(cluster_type,N_CLUSTERS,Counter(res)))
    res = [str(i) for i in res]
    docs_clusteres = zip(df.index,res)
    return docs_clusteres
Example #15
    def birch(self, n_clusters, threshold=0.5, lsi_components=None):
        """
        Perform Birch clustering

        Parameters
        ----------
        n_clusters : int
            number of clusters
        lsi_components : int
            apply LSA before the clustering algorithm
        threshold : float
            birch threshold
        """
        from sklearn.cluster import Birch
        pars = {'threshold': threshold}
        if lsi_components is None:
            raise ValueError("lsi_components=None detected. You must use LSI with Birch \
                    clustering for scaling reasons.")

        lsi = _generate_lsi(lsi_components)

        km = Birch(n_clusters=n_clusters, threshold=threshold)

        return self._cluster_func(n_clusters, km, pars, lsi=lsi)
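A standalone sketch of the LSI-then-Birch pipeline that the docstring above requires; TruncatedSVD is used here as an assumed stand-in for the internal _generate_lsi helper:

from sklearn.cluster import Birch
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["birch builds a cf tree", "kmeans needs the number of clusters",
        "birch scales to large corpora", "kmeans iterates over centroids"]
X_tfidf = TfidfVectorizer().fit_transform(docs)
X_lsi = TruncatedSVD(n_components=2).fit_transform(X_tfidf)  # the LSA/LSI step
labels = Birch(n_clusters=2, threshold=0.2).fit_predict(X_lsi)
print(labels)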
Example #16
def get_model(data, index):
    index += 1
    if index == 1:
        classifier = KMeans(n_clusters=2)
        classifier.fit(data)
    elif index == 2:
        classifier = svm.OneClassSVM(nu=params['alpha'] + 0.005,
                                     kernel="rbf",
                                     gamma=0.1)
        classifier.fit(data)
    elif index == 3:
        classifier = MeanShift(bin_seeding=True, n_jobs=-1)
        classifier.fit(data)
    elif index == 4:
        classifier = EllipticEnvelope(contamination=params['alpha'])
        classifier.fit(data)
    elif index == 5:
        classifier = IsolationForest(contamination=params['alpha'],
                                     random_state=None)
        classifier.fit(data)
    elif index == 6:
        classifier = Birch(n_clusters=2)
        classifier.fit(data)
    return classifier
Example #17
 def set_Cluster(self, algorithm, param_dict):
     self.algorithm_name = algorithm
     if algorithm == "KMeans":
         self.cluster = KMeans(param_dict[0], max_iter=param_dict[1])
     elif algorithm == "BIRCH":
         self.cluster = Birch(n_clusters=param_dict[0],
                              threshold=param_dict[1])
     elif algorithm == "DBSCAN":
         self.cluster = DBSCAN(eps=param_dict[0], min_samples=param_dict[1])
     elif algorithm == "GMM":
         self.cluster = GMM(n_clusters=param_dict[0],
                            max_iter=param_dict[1])
     elif algorithm == "OPTICS":
         self.cluster = OPTICS(min_samples=param_dict[0],
                               max_eps=param_dict[1])
     elif algorithm == "MeanShift":
         self.cluster = MEANSHIFT(quantile=param_dict[0],
                                  n_samples=param_dict[1])
     elif algorithm == "CLIQUE":
         self.cluster = CLIQUE(intervals=param_dict[0],
                               threshold=param_dict[1])
     else:
         print("没有找到分类器")
     self.cluster.class_ = None
Example #18
 def find_anomalous_edges(self):
     for edge in self.edges:
         elapsed_time = np.array(
             list(self.trace_data[self.trace_data.path == edge]
                  ['elapsedTime']))
         normalized_time = preprocessing.normalize([elapsed_time
                                                    ]).reshape(-1, 1)
         if self.take_minute_averages_of_trace_data:
             birch = Birch(branching_factor=50,
                           n_clusters=None,
                           threshold=0.05,
                           compute_labels=True)
         else:
             birch = Birch(branching_factor=50,
                           n_clusters=None,
                           threshold=0.001,
                           compute_labels=True)
         birch.fit_predict(normalized_time)
         labels = birch.labels_
         if np.unique(labels).size > 1:
             self.anomalous_edges[edge.split('-')[1]] = edge
Example #19
    def birchModel(self):
        birch_model = Birch()
        birch_model.fit(self.X)
        # Plot result
        labels = birch_model.labels_
        centroids = birch_model.subcluster_centers_
        n_clusters = np.unique(labels).size
        print("n_clusters : %d" % n_clusters)
        predictions = birch_model.predict(self.X)
        print(predictions)
        for i in range(1, self.X.shape[0]):
            if predictions[i] == 1:
                print(i)


#KMeansModel()
#linkageModel()
#agglomerativeClusteringModel()
#TSNETest()
#birchModel()
#decisiontree()
#model_predict()
#randomforest()
Example #20
def train(feature, weights, cluster_num, feature_path=None):
    if feature_path is not None:
        feature = pd.read_csv(feature_path)
    X, Y = [], []
    print("Training...\n")
    for i in range(len(feature[feature.columns[0]])):
        f = np.array(feature.iloc[i][1:])
        f_w = combine(feature.iloc[i][1:], weights)
        print(f)
        print(f_w)
        X.append(f_w)
        Y.append(f)
    clf = Birch(n_clusters=cluster_num)
    clf = KMeans(n_clusters=cluster_num)  # note: this reassignment discards the Birch model above
    clf.fit(X)
    pred = clf.predict(X)
    joblib.dump(clf, 'curve_model_KMeans.pkl')
    rdf = RandomForestClassifier()
    rdf.fit(Y, pred)
    joblib.dump(rdf, 'rforest_model.pkl')
    print(pred)
    return pred
Example #21
    def create_graph(self):
        # creates weighted graph from the trace data
        print('Creating graph of %d edges:' % len(self.edges))
        for edge in self.edges:
            source, destination = edge.split('-')
            if source != 'Start':
                vector_of_time = self.dictionary_of_times[edge]
                reshaped_vector_of_time = np.reshape(vector_of_time, (-1, 1))
                if len(reshaped_vector_of_time) > 5000:
                    k = len(reshaped_vector_of_time) // 5000 + 1
                    rnge = np.arange(len(reshaped_vector_of_time))
                    indices = (rnge % k) == 0
                    reshaped_vector_of_time = reshaped_vector_of_time[indices]
                KDE = KernelDensity(kernel='gaussian',
                                    bandwidth=1.0).fit(reshaped_vector_of_time)
                KDE_scores = KDE.score_samples(reshaped_vector_of_time)
                mean_of_KDE_scores = -np.mean(KDE_scores)

                normalized_vector_of_time = preprocessing.normalize(
                    [vector_of_time]).reshape(-1, 1)
                birch = Birch(n_clusters=None,
                              threshold=0.1,
                              compute_labels=True)
                birch.fit(normalized_vector_of_time)
                birch.predict(normalized_vector_of_time)
                labels = birch.labels_
                birch_clustering_score = 100 * len(
                    labels[np.where(labels != 0)]) / len(labels)

                total_weight = mean_of_KDE_scores * birch_clustering_score + mean_of_KDE_scores + birch_clustering_score

                self.base_graph.add_edge(source,
                                         destination,
                                         weight=total_weight)
                print('Added edge: %s with weight %f, ' %
                      (edge, total_weight) + 'KDE performed on %d rows' %
                      len(reshaped_vector_of_time))
        print('Finished creating graph.')
Example #22
    def callback(self, odom_msg, scan_msg):
        print('-----------------------------------------')
        start_time = time.time()
        # process odometry message
        rx = odom_msg.pose.pose.position.x
        ry = odom_msg.pose.pose.position.y
        q = odom_msg.pose.pose.orientation
        rth = arctan2(2 * q.x * q.y - 2 * q.z * q.w,
                      1 - 2 * q.y**2 - 2 * q.z**2)
        rth = 2 * pi - rth % (2 * pi)
        pose = np.array([rx, ry, rth])
        self.pose = pose.copy()

        # process scan message
        bearings = self.bearings.copy()

        ranges = np.array(scan_msg.ranges)
        inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
        ranges = np.nan_to_num(ranges) * inf_flag

        euc_coord_x = pose[0] + np.cos(bearings + pose[2]) * ranges
        euc_coord_y = pose[1] + np.sin(bearings + pose[2]) * ranges
        dist_flag = np.where( (euc_coord_x-pose[0])**2 + \
                        (euc_coord_y-pose[1])**2 != 0.0)[0]
        points = np.array([euc_coord_x, euc_coord_y]).T
        points = points[dist_flag]

        self.obsv = []
        if len(points) > 0:
            brc = Birch(n_clusters=None, threshold=0.05)
            brc.fit(points)
            labels = brc.predict(points)
            u_labels = np.unique(labels)
            for l in u_labels:
                seg_idx = np.where(labels == l)
                seg = points[seg_idx]
                if seg.shape[0] <= 1:
                    fit_cov = 10
                else:
                    fit_cov = np.trace(np.cov(seg.T))
                if fit_cov < 0.001 and seg.shape[0] >= 5:
                    self.obsv.append(seg.mean(axis=0))

        print('odom: {}\nlandmarks:\n{}'.format(pose, self.obsv))

        # publish observed landmarks
        cube_list = Marker()
        cube_list.header.frame_id = 'odom'
        cube_list.header.stamp = rospy.Time.now()
        cube_list.ns = 'landmark_point'
        cube_list.action = Marker.ADD
        cube_list.pose.orientation.w = 1.0
        cube_list.id = 0
        cube_list.type = Marker.CUBE_LIST

        cube_list.scale.x = 0.05
        cube_list.scale.y = 0.05
        cube_list.scale.z = 0.5
        cube_list.color.b = 1.0
        cube_list.color.a = 1.0

        for landmark in self.obsv:
            p = Point()
            p.x = landmark[0]
            p.y = landmark[1]
            p.z = 0.25
            cube_list.points.append(p)

        self.obsv_pub.publish(cube_list)
        '''
        # send control
        ctrl = self.erg_ctrl(pose.copy())
        ctrl_lin = ctrl[0]
        ctrl_ang = ctrl[1]
        vel_msg = Twist()
        vel_msg.linear.x = ctrl_lin
        vel_msg.linear.y = 0.0
        vel_msg.linear.z = 0.0
        vel_msg.angular.x = 0.0
        vel_msg.angular.y = 0.0
        vel_msg.angular.z = ctrl_ang
        self.ctrl_pub.publish(vel_msg)
        '''

        # log
        self.log['count'] += 1
        self.log['traj'].append(pose.copy())
        # self.log['ctrls'].append(ctrl.copy())

        print('elapsed time: {}'.format(time.time() - start_time))
Example #23
def birch_1(a, kwargs):
    return Birch(**kwargs).fit_predict(a)
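A usage sketch for birch_1 above, passing the Birch keyword arguments as a dict; the array is a made-up assumption:

import numpy as np

a = np.array([[0.0, 0.0], [0.1, 0.0], [5.0, 5.0], [5.1, 5.0]])
print(birch_1(a, {"n_clusters": 2, "threshold": 0.3}))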
Example #24
 def __init__(self):
     self.wrapped = Birch(n_clusters = 2)
     self.data = []
     self.indexes =[]
Example #25
ratio = 0.9
n_paa_segments = 18
paa = PiecewiseAggregateApproximation(n_segments=n_paa_segments)
paa_mid = paa.fit_transform(stdData[:, :int(ratio * stdData.shape[1])])
paa_mid = paa_mid.reshape(paa_mid.shape[0], paa_mid.shape[1])

first_clus = paa_mid.copy()
for i in range(len(first_clus)):
    first_clus[i] = rankbased(paa_mid[i])

#################################################################
# First clustering pass: use Birch to get an initial result, then refine it with KMeans. The data is rank-based.
# Improvement: use the raw data directly and tune the Birch threshold.
data = first_clus
s = time.time()
y_pre = Birch(n_clusters=None, threshold=getEpsilon(data,
                                                    0.8)).fit_predict(data)
y_pre = KMeans(n_clusters=max(y_pre) + 1, random_state=0).fit_predict(data)
e = time.time()

#################################################################
# Second clustering pass: use gap statistics up to 10 in steps of 2. The clustering target is the residuals.
# Improvement: the clustering target could be either the residuals or the standardized data directly.
import pandas as pd


def optimalK(data, nrefs=3, maxClusters=15):
    """
    Calculates KMeans optimal K using Gap Statistic from Tibshirani, Walther, Hastie
    Params:
        data: ndarry of shape (n_samples, n_features)
        nrefs: number of sample reference datasets to create
Example #26
from general_functions import *

if __name__ == '__main__':
    # hypothetical_goal = [0 if _ < 80 else 1 for _ in range(120)]

    MODEL_NAME = 'model/tf_idf_1.csv'
    N_ARRANGE = (1, 1)
    MODE = 'word'

    make_tf_idf_model(N_ARRANGE, MODEL_NAME, mode=MODE)

    data = pd.read_csv(MODEL_NAME, index_col=0)

    from sklearn.cluster import KMeans
    from sklearn.cluster import SpectralClustering
    from sklearn.decomposition import PCA
    from sklearn.cluster import Birch

    pca = PCA(n_components=3)
    data = pd.DataFrame(pca.fit_transform(data))
    spectral = SpectralClustering(2, random_state=0)
    k_means = KMeans(n_clusters=2, random_state=0)
    birch = Birch(threshold=0.1, n_clusters=2)

    train_and_show(data, spectral)
    train_and_show(data, k_means)
    train_and_show(data, birch)
Example #27
for k in range_clusters:
  # fit data for k clusters
  spectral = Clustering(SpectralClustering(n_clusters=k))
  spectral.fit(data_df)
  
  # evaluate clustering through silhouette score
  score['Spectral'].append(spectral.evaluate())

# ------------------------------------------------------------------------------
# -- Birch Performance
# ------------------------------------------------------------------------------

for k in range_clusters:
  # fit data for k clusters
  birch = Clustering(Birch(n_clusters=k, threshold=0.36))
  birch.fit(data_df)
  
  # evaluate clustering through silhouette score
  score['Birch'].append(birch.evaluate())

# ------------------------------------------------------------------------------
# -- DBSCAN Performance
# ------------------------------------------------------------------------------

# DBSCAN 
dbscan = Clustering(DBSCAN(eps=.5, min_samples=3))
dbscan.fit(data_df)

score['DBSCAN'].append(dbscan.evaluate())
Example #28
print(data, citypos.shape)

# KMeans
km = KMeans(n_clusters=100, n_init=1)
itime = time.perf_counter()
kmlabels = km.fit_predict(citypos)
etime = time.perf_counter()
print ('K-means Time = ', etime-itime)

# Minibatch Kmeans
itime = time.perf_counter()
mbkm = MiniBatchKMeans(n_clusters=100, batch_size=1000, n_init=1, max_iter=5000)
mbkmlabels = mbkm.fit_predict(citypos)
etime = time.perf_counter()
print ('MB K-means Time = ', etime-itime)

print('Similarity Km vs MBKm', adjusted_mutual_info_score(kmlabels, mbkmlabels))

# Birch
itime = time.perf_counter()
birch = Birch(threshold=0.02, n_clusters=100, branching_factor=100)
birchlabels = birch.fit_predict(citypos)
etime = time.perf_counter()
print ('BIRCH Time = ',etime-itime)

print('Similarity Km vs BIRCH',adjusted_mutual_info_score(kmlabels, birchlabels))



Example #29
columns = ['RadPeer.Score', 'RadPeer.Significance.of.Errors',
 			'Technical.Performance.Score', 'Percent.Error']
features = summary[columns]
#fig = pd.scatter_matrix(features, figsize=(18,18), alpha=0.5, grid=True)
#sns.plt.savefig('features_scatter.png', bbox_inches='tight')

# scaling
mms = MinMaxScaler()
X = mms.fit_transform(features)

# set up clustering algorithms
db = DBSCAN(eps=0.3, min_samples=5)
ac = AgglomerativeClustering(n_clusters=2, affinity='euclidean',
                             linkage='average')
#km = MiniBatchKMeans(n_clusters=2, random_state=1, n_init=15)
bc = Birch(n_clusters=2)
#sp = SpectralClustering(n_clusters=2, eigen_solver='arpack', random_state=1) 
#bandwidth = estimate_bandwidth(X, quantile=0.3)
#ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
#ap= AffinityPropagation(damping=.9, preference=-200)

#y_km = km.fit_predict(X)
y_ac = ac.fit_predict(X)
utils.swap_label(y_ac)
y_bc = bc.fit_predict(X)
utils.swap_label(y_bc)
y_db = db.fit_predict(X)
y_db[y_db==-1] = 1
#print np.unique(y_db)
#y_sp = sp.fit_predict(X)
#y_ms = ms.fit_predict(X)
import numpy as np
from sklearn.cluster import Birch
from sklearn.datasets.samples_generator import make_blobs
import matplotlib.pyplot as plt
from itertools import cycle

# Generates random vectors to cluster
n_samples = 50
centers = [[0, 1], [4, -2], [-2, 2], [0, -1]]
X, _ = make_blobs(n_samples=n_samples, centers=centers, cluster_std=0.2)

# Creates the Birch classificator and gives it the vectors
brc = Birch(branching_factor=50, n_clusters=None, threshold=0.8, compute_labels=True)
brc.fit(X)

labels = brc.labels_
cluster_centers = brc.subcluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

# Prints the points generated
plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.axis([-4,12,-4,12])
plt.title('Estimated number of clusters: %d' % n_clusters_)
Example #31
def birchclustering(datalist):
    brc = Birch(branching_factor=50, n_clusters=None, threshold=0.17,compute_labels=True)
    brc.fit(datalist)
    return brc
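A brief usage sketch for birchclustering above; the sample data is an assumption:

import numpy as np

datalist = np.array([[0.00, 0.10], [0.05, 0.12], [1.00, 1.10], [1.02, 1.08]])
model = birchclustering(datalist)
print(model.labels_)               # one subcluster label per input row
print(model.subcluster_centers_)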
Example #32
    event_array[i, 1] = dsp_dict["EVLO"]
station_array = np.array(station_list)
dsp_array = np.array(dsp_list)

# extract the unique station names
stations = np.unique(station_array)
print stations

for sta in stations:
    events = event_array[station_array == sta, :]
    dsp_shortlist = dsp_array[station_array == sta]
    print sta, events.shape, dsp_shortlist.shape

    # cluster on events so as to compare dispersion curves for nearby
    # events
    brc = Birch(branching_factor=50, n_clusters=None, threshold=dist, compute_labels=True)
    brc.fit(events)
    labels = brc.predict(events)
    print np.max(labels)
    for lab in np.unique(labels):
        dsp_this_label_list = dsp_shortlist[labels == lab]
        cluster_name = os.path.join(dirname, "cluster_%s_%03d" % (sta, lab))
        plot_all_dsp(dsp_this_label_list, legend=False, fname="%s_gvel.png" % cluster_name)
        plot_all_map(dsp_this_label_list, fname="%s_map.png" % cluster_name, legend=False)
        f = open("%s_info.txt" % cluster_name, "w")
        for (dsp, dsp_dict) in dsp_this_label_list:
            f.write(
                "%s %s %d %03d %02d %02d %.3f %.3f\n"
                % (
                    dsp_dict["STA"],
                    dsp_dict["COMP"],
 def __init__(self, num_clusters, feature_names, train_x, train_y, rep):
     ClusterModel.__init__(self, train_x, train_y, feature_names, rep)
     self.birch_model = Birch(n_clusters=num_clusters).fit(train_x)
     self.birch_model.predict(train_x)
     self.labels = self.birch_model.labels_
     self.num_clusters = num_clusters
Example #34
KMS = MiniBatchKMeans(n_clusters=6,
                      init='k-means++',
                      n_init=10,
                      max_iter=300,
                      tol=0.0001).fit(Wine_Softmax)
SSE, SSB, SSE_cluster = calculateMeasures(Wine_Softmax, KMS.labels_,
                                          KMS.cluster_centers_)
print('SSB : %f' % (SSB))
print('SSE : %f' % (SSE))

KMS

# In[69]:

print('\nWine_Base')
KMS = Birch(n_clusters=6).fit(Wine_Base)
SSE, SSB, SSE_cluster = calculateMeasures(
    Wine_Base, KMS.labels_,
    updateCentroids(Wine_Base, KMS.labels_, np.zeros((k, Wine_Base.shape[1]))))
print('SSB : %f' % (SSB))
print('SSE : %f' % (SSE))

print('\nWine_Norm')
KMS = Birch(n_clusters=6).fit(Wine_Norm)
SSE, SSB, SSE_cluster = calculateMeasures(
    Wine_Base, KMS.labels_,
    updateCentroids(Wine_Base, KMS.labels_, np.zeros((k, Wine_Base.shape[1]))))
print('SSB : %f' % (SSB))
print('SSE : %f' % (SSE))

print('\nWine_Softmax')
Example #35
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)
print(X_train.shape, "\n\n", X_test.shape, "\n\n", y_train.shape, "\n\n",
      y_test.shape, "\n\n")

kms = KMeans(n_clusters=7, random_state=0).fit(X_train)
y_pred = kms.predict(X_test)
metrics.accuracy_score(y_test, y_pred)
print("Accuracy using KMeans Clustering: ",
      metrics.accuracy_score(y_test, y_pred))

agg = AgglomerativeClustering(n_clusters=1).fit(X_train)
y_pred = agg.fit_predict(X_test)
metrics.accuracy_score(y_test, y_pred)
print("Accuracy using Agglomerative Clustering: ",
      metrics.accuracy_score(y_test, y_pred))

brc = Birch(n_clusters=2).fit(X_train)
y_pred = brc.predict(X_test)
metrics.accuracy_score(y_test, y_pred)
print("Accuracy using Birch Clustering: ",
      metrics.accuracy_score(y_test, y_pred))
'''
Accuracy using KMeans Clustering:  0.17061611374407584
Accuracy using Agglomerative Clustering:  0.6777251184834123
Accuracy using Birch Clustering:  0.5450236966824644
'''
Example #36
for item in range(len(affinity_propagation_valid_performance_metric_array)):
    affinity_propagation_valid_performance_metrics_for_plotting[item + 1] = affinity_propagation_valid_performance_metric_array[item]
    affinity_propagation_test_performance_metrics_for_plotting[item + 1] = affinity_propagation_test_performance_metric_array[item]
Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(affinity_propagation_parameter_search_space_for_plotting,
                                                                        affinity_propagation_valid_performance_metrics_for_plotting,
                                                                        affinity_propagation_test_performance_metrics_for_plotting,
                                                                        'Adjusted Mutual Information Score',
                                                                        'AffinityPropagation Clustering damping parameter',
                                                                        'Affinity_Propagation_Performance',
                                                                        0,
                                                                        0.5,
                                                                        left_horizontal_limit=0.5)

# Do BIRCH, optimizing number of calls to partial_fit over a validation set
current_optimal_birch_number_of_calls = 1
initial_optimal_birch_clusterer = Birch()
initial_optimal_birch_clusterer.partial_fit(train_data_set)
initial_optimal_birch_clusterer.set_params(n_clusters=number_of_classes)
initial_birch_valid_predictions = initial_optimal_birch_clusterer.predict(valid_data_set)
initial_birch_test_predictions = initial_optimal_birch_clusterer.predict(test_data_set)

# Add one to the predictions to make them match up with range of labels, then apply Hungarian Fix
for element in range(number_of_valid_observations):
    initial_birch_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_birch_test_predictions[element] += 1
initial_birch_valid_predictions = Clustering.Hungarian_Fix(initial_birch_valid_predictions,
                                                           valid_labels).astype('int')
initial_birch_test_predictions = Clustering.Hungarian_Fix(initial_birch_test_predictions,
                                                          test_labels).astype('int')
Example #37
def main():

    # parameters
    write_whole_cluster = False
    perform_pca = False
    birch_thresh = 2.0
    count_thresh = 0.1

    eval_file_names = [
        'filtered_eval_three_event.csv', 'filtered_eval_five_event.csv',
        'filtered_eval_seven_event.csv'
    ]
    annotated_file_names = [
        'annotated_three_event.txt', 'annotated_five_event.txt',
        'annotated_seven_event.txt'
    ]

    for m in range(0, len(eval_file_names)):
        fileName = eval_file_names[m]
        file_prefix = 'output'
        print(fileName)

        for birch_thresh in np.arange(0.0, 4.1, 0.2):
            for count_thresh in np.arange(0.1, 1.1, 0.1):
                '''for i in range(1,179):
                    if(i not in temp):
                        print(i)
                '''

                df = pd.read_csv(fileName, header=None, encoding='latin-1')

                df.columns = [
                    'record_id', 'date', 'url', 'counts', 'themes',
                    'locations', 'persons', 'organizations', 'tone'
                ]

                # Retaining only those news which have non-null themes and locations
                df = df[pd.notnull(df['themes'])]
                df = df[pd.notnull(df['locations'])]

                df_locations = pd.DataFrame(df['locations'])

                # Reading actual class labels assigned by expert human assessor
                class_labels = [None] * len(df)
                temp = {}
                with open(annotated_file_names[m], "r") as ins:
                    label = 1
                    for line in ins:
                        line = line.strip()
                        if line.startswith("#"):
                            continue
                        if line:
                            line = line.split(',')
                            # print(line)
                            for item in line:
                                class_labels[int(item) - 1] = label
                                temp[int(item)] = True
                            label += 1

                row_dict = df.copy(deep=True)
                row_dict.fillna('', inplace=True)
                row_dict.index = range(len(row_dict))
                row_dict = row_dict.to_dict(
                    'index')  # dictionary that maps row number to row

                identifier_dict = {
                }  # dictionary that maps GKG Record Id to Row Number
                i = 0
                for index, row in df.iterrows():
                    identifier_dict[row['record_id']] = i
                    i += 1

                df = df[df.columns[[4]]]
                df.columns = ['themes']

                df = pd.DataFrame(
                    df['themes'].str.split(';'))  # splitting themes

                df_locations = pd.DataFrame(
                    df_locations['locations'].str.split(
                        ';'))  # splitting locations

                for row in df_locations.itertuples():
                    for i in range(0, len(row.locations)):
                        try:
                            row.locations[i] = (row.locations[i].split('#'))[
                                3]  # for retaining only ADM1 Code
                        except:
                            continue
                    # merged = list(itertools.chain(*row.locations))
                    # df_locations.loc[row.Index, 'locations'] = merged

                df = df[pd.notnull(df['themes'])]

                mlb = MultiLabelBinarizer(sparse_output=True)
                sparse_themes = mlb.fit_transform(df['themes'])
                df = sparse_themes

                # df = sparse_locations

                # Reducing dimensions through principal component analysis
                if perform_pca:
                    pca = PCA(n_components=None)
                    df = pd.DataFrame(pca.fit_transform(df))

                #print("Starting clustering")
                brc = Birch(branching_factor=50,
                            n_clusters=None,
                            threshold=birch_thresh,
                            compute_labels=True)
                predicted_labels = brc.fit_predict(df)

                clusters = {}
                n = 0

                for item in predicted_labels:
                    if item in clusters:
                        clusters[item].append(list((row_dict[n]).values(
                        )))  # since row_dict[n] is itself a dictionary
                    else:
                        clusters[item] = [list((row_dict[n]).values())]
                    n += 1

                # clustering within each cluster, on counts
                count_clusters = {
                }  # dictionary which maps original_cluster_key to new clusters within that cluster
                for item in clusters:
                    count_clusters[item] = {}
                    cluster_df = pd.DataFrame(clusters[item])
                    cluster_row_dict = cluster_df.copy(deep=True)
                    cluster_row_dict.fillna('', inplace=True)
                    cluster_row_dict.index = range(len(cluster_row_dict))
                    cluster_row_dict = cluster_row_dict.to_dict('index')

                    df_counts = pd.DataFrame(
                        cluster_df[cluster_df.columns[[3]]])
                    df_counts.columns = ['counts']
                    df_counts = pd.DataFrame(
                        df_counts['counts'].str.split(';'))  # splitting counts

                    df_locations = pd.DataFrame(
                        cluster_df[cluster_df.columns[[5]]])
                    df_locations.columns = ['locations']
                    df_locations = pd.DataFrame(
                        df_locations['locations'].str.split(';'))

                    for row in df_locations.itertuples():
                        for i in range(0, len(row.locations)):
                            try:
                                row.locations[i] = (row.locations[i].split(
                                    '#'))[3]  # for retaining only ADM1 Code
                            except:
                                continue

                    for row in df_counts.itertuples():
                        for i in range(0, len(row.counts)):
                            try:
                                temp_list = row.counts[i].split('#')
                                row.counts[i] = temp_list[0] + '#' + temp_list[
                                    1] + '#' + temp_list[
                                        5]  # for retaining only COUNT_TYPE and QUANTITY and LOCATION ADM1 Code
                            except:
                                continue
                        if len(row.counts) == 1 and row.counts[0] == '':
                            row.counts.append(
                                '#'
                            )  # so that news with no counts are clustered together
                            row.counts.pop(0)

                        if row.counts[len(row.counts) - 1] == '':
                            row.counts.pop()

                        row.counts[:] = [
                            x for x in row.counts
                            if not x.startswith('CRISISLEX')
                        ]  # Removing CRISISLEX Entries due to elevated false positive rate

                    mlb4 = MultiLabelBinarizer(sparse_output=True)
                    sparse_counts = mlb4.fit_transform(df_counts['counts'])

                    mlb5 = MultiLabelBinarizer(sparse_output=True)
                    sparse_locations = mlb5.fit_transform(
                        df_locations['locations'])

                    small_df = hstack([sparse_locations, sparse_counts])
                    #pca = PCA(n_components=2)
                    #df_counts = pd.DataFrame(pca.fit_transform(df_counts))

                    # print(df_counts.to_string())
                    # df_counts.to_csv('one_hot_encoded_counts.csv', sep=',')
                    # return

                    brc2 = Birch(branching_factor=50,
                                 n_clusters=None,
                                 threshold=count_thresh,
                                 compute_labels=True)
                    predicted_labels2 = brc2.fit_predict(small_df)

                    n2 = 0
                    for item2 in predicted_labels2:
                        if item2 in count_clusters[item]:
                            count_clusters[item][item2].append(
                                list((cluster_row_dict[n2]).values())
                            )  # since cluster_row_dict[n2] is itself a dictionary
                        else:
                            count_clusters[item][item2] = [
                                list((cluster_row_dict[n2]).values())
                            ]
                        n2 += 1

                # if write_whole_cluster:
                #     with open('filtered_one/'+file+'.txt', 'w', encoding='utf-8') as file:
                #         for item in count_clusters:
                #             for item2 in count_clusters[item]:
                #                 file.write("\n\nCluster "+str(item)+': ' + str(item2) + "\n")
                #                 for i in range(0, len(count_clusters[item][item2])):
                #                     file.write(count_clusters[item][item2][i][2] + '\n')  # appending url
                # else:
                #     with open('filtered_one/'+file+'.csv', 'w',newline='', encoding='utf-8') as file:
                #         writer = csv.writer(file, delimiter=",")
                #         for item in count_clusters:
                #             for item2 in count_clusters[item]:
                #                 writer.writerow(count_clusters[item][item2][0])

                test_dict = {}

                label = 1
                cluster_labels = [None] * n
                with open(file_prefix + '.txt', 'w', encoding='utf-8') as file:
                    for item in count_clusters:
                        for item2 in count_clusters[item]:
                            file.write("\n\nCluster " + str(item) + ': ' +
                                       str(item2) + "\n")
                            for i in range(0,
                                           len(count_clusters[item][item2])):
                                gkg_record_id = count_clusters[item][item2][i][
                                    0]
                                if (gkg_record_id in test_dict):
                                    print("yes")
                                    print(gkg_record_id)
                                    return
                                test_dict[gkg_record_id] = True
                                #file.write(str(identifier_dict[gkg_record_id]+1)+'\n'+count_clusters[item][item2][i][2]+ '\n' +count_clusters[item][item2][i][3]+ '\n\n')  # appending url
                                file.write(
                                    str(identifier_dict[gkg_record_id] + 1) +
                                    '\n')
                                cluster_labels[
                                    identifier_dict[gkg_record_id]] = label
                            label += 1

                # print(cluster_labels)

                matrix = metrics.cluster.contingency_matrix(
                    class_labels, cluster_labels)
                rand_index, precision, recall, f1 = precision_recall_fmeasure(
                    matrix)

                ari = metrics.cluster.adjusted_rand_score(
                    class_labels, cluster_labels)
                #print("AdjustedRI:", ari)

                nmi = metrics.normalized_mutual_info_score(
                    class_labels, cluster_labels)
                #print("NMI       :", nmi)

                print(birch_thresh, ",", count_thresh, ",", rand_index, ",",
                      precision, ",", recall, ",", f1, ",", ari, ",", nmi)
	def cluster_birch(self):
		print "Starting Birch clustering"
		brc = Birch(branching_factor=10, n_clusters=40, threshold=self.cluster_distance,compute_labels=False)
		brc.fit(self.all_frames_xy)
		clusters = brc.predict(self.all_frames_xy)
		return clusters
Example #39
subsets_original.append(X_subset1)
subsets_original.append(X_subset2)
subsets_original.append(X_subset3)
#subsets_original.append(X_subset4)
# diccionarios para el guardado de las variables
metrics_CH = dict()
metrics_SC = dict()
cluster_predict_all = dict()

# k = len(set(cluster_predict)) to see how many clusters were obtained
# if k>1 and name is not ward; otherwise set the metrics to 0

print("------ Declarando los algoritmos")
k_means = KMeans(n_clusters=3, init='k-means++')
ward = AgglomerativeClustering(n_clusters=3, linkage='ward')
birch = Birch(n_clusters=3)
dbscan = DBSCAN(eps=0.01, min_samples=10)
spectral = SpectralClustering(n_clusters=3, affinity="nearest_neighbors")
#affinity_propagation = AffinityPropagation()
#ms = MeanShift()

clustering_algorithms = [("k-means", k_means), ("ward", ward),
                         ("birch", birch), ("dbscan", dbscan),
                         ('spectral', spectral)]

index = 1
for subset in subsets:
    print("Trabajando con subset {}".format(index))
    for name, algorithm in clustering_algorithms:
        print("{:7s}, ".format(name), end='')
        tiempo = time.time()
Example #40
#create dendrogram
#dendogram = sch.dendrogram(sch.linkage(points,method='ward'))
hc = ac(n_clusters=2, affinity='euclidean', linkage='ward')
y_hc = hc.fit_predict(points)
f2 = plt.figure()

plt.scatter(points[y_hc == 0, 0], points[y_hc == 0, 1], c='red')
plt.scatter(points[y_hc == 1, 0], points[y_hc == 1, 1], c='blue')
plt.scatter(points[y_hc == 2, 0], points[y_hc == 2, 1], c='black')
plt.scatter(points[y_hc == 3, 0], points[y_hc == 3, 1], c='cyan')
plt.title('Hierarchical Clustering')
plt.show()

#Birch clustering
bir = Birch(n_clusters=2, threshold=0.8, branching_factor=200)
bir.fit(points)
y_bir = bir.fit_predict(points)

f3 = plt.figure()

plt.scatter(points[y_bir == 0, 0], points[y_bir == 0, 1], c='red')
plt.scatter(points[y_bir == 1, 0], points[y_bir == 1, 1], c='blue')
plt.scatter(points[y_bir == 2, 0], points[y_bir == 2, 1], c='black')
plt.scatter(points[y_bir == 3, 0], points[y_bir == 3, 1], c='cyan')
plt.title('Birch Clustering')
plt.show()

#DBSCAN
dbs = DBSCAN(eps=0.1, min_samples=5)
dbs.fit(points)
Example #41
def getSecondClus_2(data):
    epsilon = getEpsilonFromtiny(data)
    y_pre = Birch(n_clusters=None, threshold=epsilon).fit_predict(data)
    return y_pre
Example #42
                                include_self=False)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ward = AgglomerativeClustering(n_clusters=params['n_clusters'],
                               linkage='ward',
                               connectivity=connectivity)
spectral = SpectralClustering(n_clusters=params['n_clusters'],
                              eigen_solver='arpack',
                              affinity="nearest_neighbors")
dbscan = DBSCAN(eps=params['eps'])
affinity_propagation = AffinityPropagation(damping=params['damping'],
                                           preference=params['preference'])
average_linkage = AgglomerativeClustering(linkage="average",
                                          affinity="cityblock",
                                          n_clusters=params['n_clusters'],
                                          connectivity=connectivity)
birch = Birch(n_clusters=params['n_clusters'])
gmm = GaussianMixture(n_components=params['n_clusters'],
                      covariance_type='full')
clustering_algorithms = (('AffinityPropagation', affinity_propagation),
                         ('MeanShift', ms), ('SpectralClustering', spectral),
                         ('Ward', ward), ('AgglomerativeClustering',
                                          average_linkage), ('DBSCAN', dbscan),
                         ('Birch', birch), ('GaussianMixture', gmm))
#now plot everything
f, ax = plt.subplots(2, 4, figsize=(20, 15))
for idx, (name, algorithm) in enumerate(clustering_algorithms):
    algorithm.fit(embedding)
    if hasattr(algorithm, 'labels_'):
        y_pred = algorithm.labels_.astype(np.int)
    else:
        y_pred = algorithm.predict(embedding)
def birch(X):
    br = Birch(n_clusters=None, threshold=10).fit(X)
    print('br')
    print(silhouette_score(X, br.labels_))
    print(calinski_harabaz_score(X, br.labels_))
    return br
def clf_init(b_factor = 50, threshold = 0.8):
    return Birch(branching_factor=b_factor, n_clusters=None, threshold=threshold, compute_labels=True)
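A quick usage sketch for clf_init above; the points are assumed for illustration:

import numpy as np

pts = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.2, 5.1]])
clf = clf_init(threshold=0.5)
print(clf.fit_predict(pts))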
Example #45
    w2v = Counter(documents_tokens[doc])
    row = []
    for idx in all_words:
        if idx in w2v:
            row.append(w2v[idx])
        else:
            row.append(0)
    matrix.append(row)

print('Matrix shape')
print(len(matrix), 'x', len(matrix[0]))

# Birch clustering

brc = Birch(branching_factor=20,
            n_clusters=7,
            threshold=0.5,
            compute_labels=True)

# Clustering

brc.fit(matrix)
document_labels = brc.predict(matrix)

print('Document labels: ', document_labels)

# Countplot

sns.countplot(document_labels)

# Jaccard similarity measure
Example #46
from sklearn.cluster import Birch
import csv
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np

X = np.loadtxt(fname='Dataset.txt', skiprows=1)
# print(X)
X = [list(i) for i in X]
for i in range(len(X)):
    for j in range(2):
        X[i][j] = X[i][j] / 1000000

print(X)
X = np.array(X)
plt.scatter(X[:, 0], X[:, 1], s=4, c='black')
plt.show()

brc = Birch(branching_factor=50,
            n_clusters=7,
            threshold=0.05,
            compute_labels=True)
cftree = brc.fit(X)
ans = brc.predict(X)
labs = np.unique(ans)

cmap = plt.get_cmap('jet', len(labs))
plt.scatter(X[:, 0], X[:, 1], c=ans, s=4, cmap=cmap)
plt.show()
Example #47
import time
"""
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.Birch.html
https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html#module-scipy.cluster.hierarchy
https://towardsdatascience.com/machine-learning-algorithms-part-12-hierarchical-agglomerative-clustering-example-in-python-1e18e0075019
https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
"""

print("Compute birch clustering...")
st = time.time()

X = np.stack([x1, x2], axis=1)
X = np.reshape(X, (-1, 2))

n_clusters = 3
birch = Birch(n_clusters=n_clusters, threshold=0.01, branching_factor=10)
birch.fit(X)

# label = birch.labels_
label = birch.predict(X)

print("Elapsed time: ", time.time() - st)
print("Number of clusters: ", np.unique(label).size)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()

ax.scatter(x1, x2, c=label)

ax.set_xlabel(r"$x1$", fontsize=15)
Example #48
def process(X, labels_num):
    print("Clustering using Birch")
    brc = Birch(branching_factor=20, n_clusters=32, threshold=10,compute_labels = True).fit(X)
    pred_label = brc.predict(X)
    return pred_label
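A usage sketch for process above; the feature matrix is a random placeholder, so the exact labels are not meaningful:

import numpy as np

X = np.random.RandomState(0).rand(500, 8) * 100   # assumed feature matrix
pred_label = process(X, 32)
print(np.unique(pred_label).size)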
Example #49
data_thr = mask(data, 'orbit')  # rm too large values except for 'orbit'


np.random.seed(0)

X = np.c_[data_thr.orbit, data_thr.rate, data_thr.rateA, data_thr.rateB,
          data_thr.rateC, data_thr.rateCA]
Html_file = open("clustering_files/birch.html", "w")

scaler = StandardScaler()
X = scaler.fit_transform(X)


for n_clusters in range(2, 10):

    km = Birch(n_clusters=n_clusters)
    preds = km.fit_predict(X)

    print "components:", set(preds)
    print np.bincount(preds)

    data_thr['preds'] = pd.Series(preds).astype("category")

    color_key = ["red", "blue", "yellow", "grey", "black", "purple", "pink",
                 "brown", "green", "orange"] * 2  # Spectral9
    # color_key = color_key[:len(set(preds))+2]


    # single plot rateCA vs rate with predicted classes and ellipses:

    single_plot = bokeh_datashader_plot(data_thr, covs=None, means=None,
Example #50

class chj_data(object):
    def __init__(self, data, target):
        self.data = data
        self.target = target


def chj_load_file(fdata, ftarget):
    res = chj_data(fdata, ftarget)
    return res


print(X_train)
print(X_train["Pclass"])
iris = chj_load_file(X_train, y_pred)
X_tsne = TSNE(n_components=2, learning_rate=100).fit_transform(iris.data)
plt.figure(figsize=(12, 6))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=iris.target)
plt.colorbar()
plt.show()

y_Birch = Birch(n_clusters=None).fit_predict(X_train)
iris_Birch = chj_load_file(X_train, y_Birch)
X_tsne_Birch = TSNE(n_components=2,
                    learning_rate=100).fit_transform(iris_Birch.data)
plt.figure(figsize=(12, 6))
plt.scatter(X_tsne_Birch[:, 0], X_tsne_Birch[:, 1], c=iris_Birch.target)
plt.colorbar()
plt.show()
Example #51
class Mini():


    def __init__(self,minis,mini_names,mini_finds,sample_freq):
        self.mini_names =mini_names
        self.minis = minis
        self.sample_freq = sample_freq
        self.mini_finds=mini_finds
        self.offsets= self.fit_paras= self.event_sizes= self.amplitudes= self.fast_constants= self.slow_constants=self.a_constants=self.cur_labels = None
        self.dict=['mini_names','minis','offsets','fit_paras','event_sizes','amplitudes','fast_constants','slow_constants','a_constants','cur_labels','mini_finds']
        self.delete_index = set()

    def _delete_mini(self,index):
        # truly delete
        for name in self.dict:
            if hasattr(self,name):
                llist=getattr(self,name)
                if isinstance(llist,list):
                    llist.pop(index)
                    #print(llist==getattr(self,name))
                else:
                    print(name)
                    setattr(self,name,list(llist))
                    llist = getattr(self, name)
                    llist.pop(index)
    def mark_delete_mini(self,indexs):
        # delete candidate
        # indexs may be a list, set, or tuple
        self.delete_index=self.delete_index.union(indexs)
    def truly_delete_mini(self):
        print(self.delete_index)
        self.delete_index=list(self.delete_index)
        self.delete_index.sort(reverse=True)
        for number in self.delete_index:
            self._delete_mini(number)
        self.delete_index=set() # clear the delete flush

    def reindex_mini(self):
        self.mini_reindex={'label':{},'sweep':{}}
        #self.mini_reindex['label']=func_base.list_to_dict(self.cur_labels,self.minis)
        self.mini_reindex['label']=func_base.list_to_dict(self.cur_labels,range(len(self.cur_labels)))
        #self.mini_reindex['sweep']=func_base.list_to_dict([x[0] for x in self.mini_finds],self.minis)
        self.mini_reindex['sweep']=func_base.list_to_dict([x[0] for x in self.mini_finds],range(len(self.mini_finds)))
        print(self.mini_reindex['label'])
    # self.minis_number,self.event_sizes,self.offsets,self.fast_constants,self.slow_constants,self.rise_10_90s,self.decay_90_50s=mini_base.statis(self.minis)
    def statis(self):
        if not self.minis:
            print('couldn\'t find any minis' )
            return
        #print(self.minis)
        self.mini_number=len(self.minis)


        def templete_func(x,a0,a1,tau1,tau2,t0):

            try:
                return np.piecewise(x,[x>=t0,x<t0],[lambda x: a0+a1*(1-math.exp((x-t0)/tau1))*(math.exp((x-t0)/tau2)),a0])
            except:
                print('xxx',x)

        self.fit_paras=[]
        self.event_sizes=[]
        self.amplitudes=[]
        self.offsets=[]
        self.fast_constants=[]
        self.slow_constants=[]
        self.a_constants=[]


        # fit using a two-exponential function
        param_bounds=([-np.inf,-np.inf,0,0,-np.inf],[np.inf,0,np.inf,np.inf,np.inf])
        #nn=0
        for mini in self.minis:
            self.amplitudes.append(max(mini)-min(mini))
            minilen= len(mini)
            # if too long, curve_fit cannot handle it
            if minilen>10000:
                minilen=10000
                mini=mini[:minilen]
            x_label=np.arange(0,minilen)/self.sample_freq
            #nn+=1
            #print(len(x_label))
            try:
                paraments,pcov = curve_fit(templete_func,x_label,mini,bounds=param_bounds)
            except:
                #print(nn)
                print("mini",mini,"label",x_label)
                plt.figure()
                plt.plot(x_label,mini)
                plt.show()
                raise

            self.fit_paras.append(paraments)
            self.offsets.append(paraments[4])
            self.fast_constants.append(paraments[2])
            self.slow_constants.append(paraments[3])
            self.a_constants.append(paraments[1])
            fit_mini=templete_func(x_label,*paraments)
            self.event_sizes.append(max(fit_mini)-min(fit_mini))


    def mini_dim_reduce(self,dim=5):

        # PCA analysis
        pca=PCA(n_components=dim)
        # Convert Python sequence to NumPy array, filling missing values
        minis=np.array(list(itertools.zip_longest(*self.minis, fillvalue=0))).T
        # transform return array like
        self.proced_minis=pca.fit_transform(minis)

        print('explained variance ratio (first two components): %s' %str(pca.explained_variance_ratio_))

    def get_mini_info(self,index):
        #print(locals())
        mini=self.minis[index]
        x_label=np.arange(len(mini))/self.sample_freq
        return self.mini_names[index],mini,self.cur_labels[index],x_label


    def classify(self,n_cluster=5):
        # Using BIRCH clustering
        self.birch = Birch(threshold=0.5,n_clusters=n_cluster)
        self.birch.fit(self.proced_minis)
        self.ori_labels = self.birch.labels_
        self.ori_centroids = self.birch.subcluster_centers_
        self.ori_n_clusters = np.unique(self.ori_labels)
        self.ori_n_cluster = np.unique(self.ori_labels).size
        self.cur_labels = self.ori_labels
        self.cur_centroids = self.ori_centroids
        self.cur_n_cluster = self.ori_n_cluster
        self.cur_n_clusters = self.ori_n_clusters

    def set_n_cluster(self,n_cluster):
        self.birch.set_params(n_clusters=n_cluster)
        self.cur_labels = self.ori_labels=self.birch.predict(self.proced_minis)
        self.cur_n_cluster = np.unique(self.cur_labels).size
        self.cur_n_clusters = np.unique(self.cur_labels)
        self.cur_centroids = self.birch.subcluster_centers_
def cluster_junctions(juncs):
    birch_model = Birch(threshold=3, n_clusters=None)
    X = np.array(juncs)
    birch_model.fit(X)

    return birch_model.labels_
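A usage sketch for cluster_junctions above; the junction coordinates are hypothetical:

juncs = [[100, 200], [101, 202], [500, 640], [502, 641]]
print(cluster_junctions(juncs))   # one Birch label per junction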
Example #53
    def scan_callback(self, scan_msg):
        print('-----------------------------------------')
        start_time = time.time()

        # process scan message
        pose = self.pose.copy()
        bearings = self.bearings.copy()

        ranges = np.array(scan_msg.ranges)
        inf_flag = (-1 * np.isinf(ranges).astype(int) + 1)
        ranges = np.nan_to_num(ranges) * inf_flag

        euc_coord_x = pose[0] + np.cos(bearings + pose[2]) * ranges
        euc_coord_y = pose[1] + np.sin(bearings + pose[2]) * ranges
        dist_flag = np.where( (euc_coord_x-pose[0])**2 + \
                        (euc_coord_y-pose[1])**2 != 0.0)[0]
        points = np.array([euc_coord_x, euc_coord_y]).T
        points = points[dist_flag]

        self.obsv = []
        if len(points) > 0:
            brc = Birch(n_clusters=None, threshold=0.05)
            brc.fit(points)
            labels = brc.predict(points)
            u_labels = np.unique(labels)
            for l in u_labels:
                seg_idx = np.where(labels == l)
                seg = points[seg_idx]
                if seg.shape[0] <= 1:
                    fit_cov = 10
                else:
                    fit_cov = np.trace(np.cov(seg.T))
                if fit_cov < 0.001 and seg.shape[0] >= 4:
                    self.obsv.append(seg.mean(axis=0))

        print('odom: {}\nlandmarks:\n{}'.format(pose, self.obsv))

        # publish observed landmarks
        cube_list = Marker()
        cube_list.header.frame_id = 'odom'
        cube_list.header.stamp = rospy.Time.now()
        cube_list.ns = 'landmark_point'
        cube_list.action = Marker.ADD
        cube_list.pose.orientation.w = 1.0
        cube_list.id = 0
        cube_list.type = Marker.CUBE_LIST

        cube_list.scale.x = 0.05
        cube_list.scale.y = 0.05
        cube_list.scale.z = 0.5
        cube_list.color.b = 1.0
        cube_list.color.a = 1.0

        for landmark in self.obsv:
            p = Point()
            p.x = landmark[0]
            p.y = landmark[1]
            p.z = 0.25
            cube_list.points.append(p)

        self.obsv_pub.publish(cube_list)

        print('elapsed time: {}'.format(time.time() - start_time))
Example #54
import numpy as np
from sklearn.cluster import Birch
import cluster
import csv

clusters = 20
submit_file = 'submit_birch.csv'

X, plays = cluster.get_matrix()
brc = Birch()
X = np.array(X, dtype=float)
plays = np.array(plays, dtype=float)
# print X.shape
print "Running Birch on training data...",
brc = Birch(branching_factor=50, n_clusters=clusters, threshold=0.5, compute_labels=True)
labels = brc.fit_predict(X)
print "Done!"

print labels
# plays_sums = [0] * clusters 
# cluster_size = [0] * clusters
plays_sums = {}

# Median
for idx, label in enumerate(labels):
  if label in plays_sums:
    plays_sums[label].append(plays[idx])
  else:
    plays_sums[label] = [plays[idx]]
  # cluster_size[label] += 1