Example No. 1
    def _fit_dbscan(self, x):
        # clustering
        for r in xrange(self.repeats):
            # fit and evaluate model
            model = DBSCAN(eps=1.0, min_samples=100)
            model.fit_predict(x)
            k = len(set(model.labels_)) - (1 if -1 in model.labels_ else 0)
            self._labels[r] = model.labels_
            self._parameters[r] = model.core_sample_indices_

            # info (k is only known after the fit)
            if self.debug is True:
                print '\t[%s][c:%d][r:%d]' % (self.clus_type, k, r + 1),

            # build equivalent gmm
            model_gmm = GMM(n_components=k, covariance_type="full")
            model_gmm.means_ = model.core_sample_indices_
            model_gmm.covars_ = sp.ones(
                (k, self.input_dim)) * self.sigma_factor
            model_gmm.weights_ = sp.array(
                [(self._labels[r] == i).sum() for i in xrange(k)])

            # evaluate goodness of fit
            self._ll[r] = model_gmm.score(x).sum()
            if self.gof_type == 'aic':
                self._gof[r] = model_gmm.aic(x)
            if self.gof_type == 'bic':
                self._gof[r] = model_gmm.bic(x)

            # debug info
            if self.debug is True:
                print self._gof[r]
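The snippet above scores each DBSCAN run by assembling a Gaussian mixture with one component per cluster and reading off AIC/BIC; `GMM` and `covars_` are the old scikit-learn API. A hedged sketch of the same idea with the current `GaussianMixture` API is shown below. It fits the mixture directly with the DBSCAN cluster count instead of hand-assembling its parameters (manually assigned parameters are generally not enough for `score`/`bic` in the new API), so it is a stand-in for, not a drop-in replacement of, the method above.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.mixture import GaussianMixture


def dbscan_bic(x, eps=1.0, min_samples=100):
    """Cluster with DBSCAN, then fit a mixture with the same number of
    components to obtain AIC/BIC-style goodness-of-fit numbers."""
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(x)
    k = len(set(labels)) - (1 if -1 in labels else 0)
    if k == 0:
        return labels, None, None  # nothing but noise points
    gmm = GaussianMixture(n_components=k, covariance_type="full").fit(x)
    return labels, gmm.aic(x), gmm.bic(x)


# illustrative call on synthetic data
rng = np.random.default_rng(0)
x = np.vstack([rng.normal(0, 0.3, (200, 2)), rng.normal(5, 0.3, (200, 2))])
labels, aic, bic = dbscan_bic(x, eps=0.5, min_samples=10)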
Example No. 2
def dbscan_outliers(data, genes, eps, min_samples, max_samples=1, as_json=True):
    db = DBSCAN(eps=eps, min_samples=min_samples)
    # sd_scaler = StandardScaler()
    res = dr.get_dataset_ensembl_info()
    outliers_id = []
    for g in genes:
        # scaled = sd_scaler.fit(data.loc[g, :])
        fit = db.fit(np.reshape(data.loc[g, :], (196, 1)))

        candidates = itemfreq(fit.labels_)

        try:
            class_zero = candidates[0][1]
            class_one = candidates[1][1]

            support = min(class_one, class_zero)

            if min_samples < support <= max_samples:
                info = [gene for gene in res if gene.ensemblgeneid == g][0]
                formatted_info = {"id": g, "name": info.genename, "type": info.genetype, "samples": str(support),
                                  "distance": "NA"}
                jinfo = json.dumps(formatted_info)
                jinfo += ","
                outliers_id.append(g)
                print("outlier found :" + g)
                if as_json:
                    yield (jinfo)
                else:
                    yield (formatted_info)
        except:
            pass
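`scipy.stats.itemfreq`, used above to count how many points fall under each DBSCAN label, was deprecated and later removed from SciPy. A minimal equivalent with `numpy.unique`, offered as a hedged sketch rather than a patch to the function above:

import numpy as np


def label_frequencies(labels):
    """Return (label, count) rows, similar to what scipy.stats.itemfreq produced."""
    values, counts = np.unique(labels, return_counts=True)
    return np.column_stack([values, counts])

# e.g. label_frequencies(fit.labels_) might give [[-1, 12], [0, 150], [1, 34]]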
Example No. 3
def dbscan_outliers(df):
    """
    Find outliers (noise points) using DBSCAN.

    Parameters
    ----------
    df: A pandas.DataFrame

    Returns
    -------
    A tuple of (a sklearn.DBSCAN instance, a pandas.DataFrame)
    """

    scaler = StandardScaler()
    scaler.fit(df)
    scaled = scaler.transform(df)

    dbs = DBSCAN()

    db = dbs.fit(scaled)
    outliers = dbs.fit_predict(scaled)

    # noise points are labelled -1; .ix is gone from modern pandas
    df_o = df.iloc[np.flatnonzero(outliers == -1)]

    return db, df_o
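A short usage sketch for the function above, on a synthetic DataFrame (the column names and values are invented; the numpy, pandas and scikit-learn imports the snippet relies on are assumed to be in scope):

import numpy as np
import pandas as pd

# two hundred points in a tight blob plus a handful of far-away rows
rng = np.random.default_rng(1)
blob = rng.normal(0, 0.2, size=(200, 2))
stragglers = rng.uniform(5, 10, size=(5, 2))
df = pd.DataFrame(np.vstack([blob, stragglers]), columns=["f1", "f2"])

model, df_outliers = dbscan_outliers(df)
print(len(df_outliers), "rows flagged as noise")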
Example No. 4
    def on_squaremsg_received(self, msg):
        detected_squares = []
        for square_msg in msg.squares:
            detected_squares.append(TrackedSquare.from_msg(square_msg))

        self._prev_squares.append(detected_squares)
        
        all_squares = list(itertools.chain.from_iterable(self._prev_squares))
        square_centers = [list(s.center) + [s.hue] for s in all_squares]
        data = np.array(square_centers)

        ms = DBSCAN(eps=64, min_samples=3)
        ms.fit(data)
        labels = ms.labels_

        ts_msg = TrackedSquares()
        for i, s in enumerate(all_squares):
            label = np.int0(labels[i])
            if label < 0: 
                continue

            s.tracking_colour = TrackedSquare.TRACKING_COLOURS[label % len(TrackedSquare.TRACKING_COLOURS)]
            s.tracking_detected = True

            ts_msg.squares.append(s.to_msg())

        self._squares_pub.publish(ts_msg)
Example No. 5
def _cluster(params):
    cls = None
    method = sh.getConst('method')
    if method=='kmedoid':
        assert False
        # from kmedoid import kmedsoid
        # cls = kmedoid
    elif method=='dbscan':
        from sklearn.cluster import DBSCAN
        cls = DBSCAN(eps=params['eps'],min_samples=params['min_samples'],
                     metric='precomputed')
    else:
        assert False, 'FATAL: unknown cluster method'

    ##
    mat = sh.getConst('mat')
    labels = cls.fit_predict(mat)
    nLabels = len(set(labels))

    ##
    sil = None; cal = None
    if (nLabels >= 2) and (nLabels <= len(labels) - 1):
        sil = met.silhouette_score(mat, labels, metric='precomputed')
        cal = met.calinski_harabaz_score(mat, labels)
    perf = dict(silhouette_score=sil,calinski_harabaz_score=cal)

    return (labels,perf)
Example No. 6
def clusterMalwareNames(malwareNames):
    # strictly lexical clustering over malware-names
    wordCount = {}
    # create a distance matrix
    matrix = np.zeros((len(malwareNames), len(malwareNames)))
    for i in range(len(malwareNames)):
        for j in range(len(malwareNames)):
            if matrix[i, j] == 0.0:        
                matrix[i, j] = computeSimilarity(malwareNames[i], malwareNames[j])
                matrix[j, i] = matrix[i, j]
    
    # Scikit-Learn's DBSCAN implementation to cluster the malware-names
    clust = DBSCAN(eps=0.1, min_samples=5, metric="precomputed")
    clust.fit(matrix)    
    
    preds = clust.labels_
    clabels = np.unique(preds)
    
    # create Word-Count Map
    for i in range(clabels.shape[0]):
        if clabels[i] < 0:
            continue
        
        cmem_ids = np.where(preds == clabels[i])[0]
        cmembers = []
        
        for cmem_id in cmem_ids:
            cmembers.append(malwareNames[cmem_id])
        
        wordCount[", ".join(uniqueList(cmembers))] = len(cmem_ids)
    return wordCount
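`computeSimilarity` is defined elsewhere in that project; note that DBSCAN with `metric="precomputed"` expects a distance matrix, so that helper presumably returns 0 for identical names. A self-contained sketch of one way to build such a matrix with only the standard library's `difflib` (an assumption here, not the author's helper):

import difflib

import numpy as np


def name_distance_matrix(names):
    """Pairwise lexical distance: 1 - SequenceMatcher ratio, so 0 means identical."""
    n = len(names)
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            d = 1.0 - difflib.SequenceMatcher(None, names[i], names[j]).ratio()
            dist[i, j] = dist[j, i] = d
    return dist

# e.g. DBSCAN(eps=0.1, min_samples=5, metric="precomputed").fit(name_distance_matrix(malwareNames))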
Example No. 7
def cluster_with_dbscan(vectors, epsilon=0.5, min_samples=5, distances=None, metric="euclidean"):
    # precomputing our distances will be faster as we can use multiple cores
    if distances is None:
        distances = pairwise_distances(vectors, n_jobs=-1, metric=metric)

    dbscan = DBSCAN(eps=epsilon, min_samples=min_samples, metric="precomputed")
    return dbscan.fit_predict(distances)
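A brief usage sketch for the helper above (assumes `cluster_with_dbscan` and its scikit-learn imports, `DBSCAN` and `pairwise_distances`, are in scope):

from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=300, centers=3, random_state=0)
labels = cluster_with_dbscan(X, epsilon=0.8, min_samples=5, metric="euclidean")
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print("clusters:", n_clusters, "noise points:", list(labels).count(-1))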
Example No. 8
def cluster_tweets(tweets):
    #TODO get TFIDF vector
    #do clustering
    ner_tags = [get_ner_tags(tweet).tolist() for tweet in tweets['tweet']]
    vectorizer = TfidfVectorizer(preprocessor=_dummy_preprocess, tokenizer=lambda x:x,
                                 binary=True,
                                 min_df=0, use_idf=True, smooth_idf=True)
    tfidf = vectorizer.fit_transform(ner_tags) 
    
    #ner_tags = [get_ner_tags(tweet) for tweet in tweets['tweet']]
    print "clustering started"
    t0 = time()
    #cluster = AgglomerativeClustering(n_clusters=3, affinity="cosine" )
    #cluster = MiniBatchKMeans(n_clusters=10, max_iter=100, batch_size=100) 
    #metric=sklearn.metrics.pairwise.cosine_distances
    cluster = DBSCAN(min_samples=2, eps=0.5)    
        
    clustered = cluster.fit(tfidf.todense())
       
    #clustered = cluster.fit(ner_tags)
    labels = clustered.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    print "clustering finished in %.3f seconds"%(time()-t0)   
    print "%d clusters detected"%n_clusters_
    
    tweets['cluster'] = labels
    tweets['ner'] = ner_tags
    return tweets
Example No. 9
def DBScan_Flux(phots, ycenters, xcenters, dbsClean=0, useTheForce=False):
    """Class methods are similar to regular functions.

    Note:
        Do not include the `self` parameter in the ``Args`` section.

    Args:
        param1: The first parameter.
        param2: The second parameter.

    Returns:
        True if successful, False otherwise.

    """
    
    dbsPhots    = DBSCAN()#n_jobs=-1)
    stdScaler   = StandardScaler()
    
    phots       = np.copy(phots.ravel())
    phots[~np.isfinite(phots)] = np.median(phots[np.isfinite(phots)])
    
    featuresNow = np.transpose([stdScaler.fit_transform(ycenters[:,None]).ravel(), \
                                stdScaler.fit_transform(xcenters[:,None]).ravel(), \
                                stdScaler.fit_transform(phots[:,None]).ravel()   ] )
    
    # print(featuresNow.shape)
    dbsPhotsPred= dbsPhots.fit_predict(featuresNow)
    
    return dbsPhotsPred == dbsClean
def plot_dbscan():
    X, y = make_blobs(random_state=0, n_samples=12)

    dbscan = DBSCAN()
    clusters = dbscan.fit_predict(X)
    clusters

    fig, axes = plt.subplots(3, 4, figsize=(11, 8), subplot_kw={'xticks': (), 'yticks': ()})
    # Plot clusters as red, green and blue, and outliers (-1) as white
    colors = ['r', 'g', 'b']
    markers = ['o', '^', 'v']

    # iterate over settings of min_samples and eps
    for i, min_samples in enumerate([2, 3, 5]):
        for j, eps in enumerate([1, 1.5, 2, 3]):
            # instantiate DBSCAN with a particular setting
            dbscan = DBSCAN(min_samples=min_samples, eps=eps)
            # get cluster assignments
            clusters = dbscan.fit_predict(X)
            print("min_samples: %d eps: %f  cluster: %s" % (min_samples, eps, clusters))
            if np.any(clusters == -1):
                c = ['w'] + colors
                m = ['o'] + markers
            else:
                c = colors
                m = markers
            discrete_scatter(X[:, 0], X[:, 1], clusters, ax=axes[i, j], c=c, s=8, markers=m)
            inds = dbscan.core_sample_indices_
            # vizualize core samples and clusters.
            if len(inds):
                discrete_scatter(X[inds, 0], X[inds, 1], clusters[inds],
                                 ax=axes[i, j], s=15, c=colors,
                                 markers=markers)
            axes[i, j].set_title("min_samples: %d eps: %.1f" % (min_samples, eps))
    fig.tight_layout()
Example No. 11
    def search_charges(self, data, z=0, threshold = 30):
        A = deriv(data,z)
        
        print 'Searching charges...'
        time0 = time.time()        
        
        det = A[3]*A[5]-A[4]**2

        dx = -(A[1]*A[5]-A[2]*A[4])/det
        dy = -(A[2]*A[3]-A[1]*A[4])/det
        
        datamax = A[0]+A[1]*dx+A[2]*dy+A[3]*dx**2/2+A[4]*dx*dy+A[5]*dy**2/2        
        t = np.where((np.abs(dx) < 1)*(np.abs(dy) < 1)*(np.abs(datamax) > threshold)*(det > 0))        
        
        x = np.array([t[1]+dx[t], t[0]+dy[t]]).T
        
        db = DBSCAN(min_samples = 1, eps = 1)
        db.fit_predict(x)
        
        n_charges = np.max(db.labels_)+1
        qi = np.zeros(n_charges)
        xi = np.zeros((3,n_charges))
        
        for i in range(0, n_charges):
            xi[0:2,i] = np.mean(x[db.labels_ == i,:], axis=0)
            qi[i] = np.mean(datamax[t][db.labels_ == i])
        
        
        self.set_charges(qi,xi)
        print 'Done! Elapsed time: '+str(time.time()-time0)
        return self
Example No. 12
def cluster():
    eps_set = 0.5 * np.arange(1, 7)
    npt_set = np.arange(1, 6)
    scores = []
    global res
    res = []
    for eps in eps_set:
        for npt in npt_set:
            est = DBSCAN(eps=eps, min_samples=npt)
            est.fit(x)
            ari = metrics.adjusted_rand_score(y, est.labels_)
            scores.append(ari)
            n_noise = len([ l for l in est.labels_ if l == -1])
            res.append((ari, np.max(est.labels_) + 1 , n_noise))
            print ari
    max_score = np.max(scores)
    max_idx = scores.index(max_score)
    max_eps = eps_set[max_idx / len(npt_set)]
    max_npt = npt_set[max_idx % len(npt_set)]
    print max_score, max_eps, max_npt
    scores = np.array(scores).reshape(len(eps_set), len(npt_set))
    pl.imshow(scores, interpolation='nearest', cmap=pl.cm.spectral)
    pl.colorbar()
    pl.xticks(np.arange(len(npt_set)), npt_set)
    pl.yticks(np.arange(len(eps_set)), eps_set)
    pl.ylabel('eps')
    pl.xlabel('min_samples')
    pl.show()
Example No. 13
	def current_datapoints_dbscan(self):
		"""
		Method clusters points-outliers (after current_datapoints_threshold_filter and current_datapoints_outliers_filter) into slice-clusters using DBSCAN.
		Returns dict of slice-clusters - base for event-candidates. Uses self.eps attribute to estimate cluster boundaries.
		"""
		nets = self.current_datapoints.keys()
		ids = concatenate([self.current_datapoints[x]['ids'] for x in nets])
		coords = concatenate([self.current_datapoints[x]['array'] for x in nets])
		weights = concatenate([self.current_datapoints[x]['weights'] for x in nets])
		if len(ids) > 0:
			clustering = DBSCAN(eps=self.eps, min_samples=5)
			labels = clustering.fit_predict(coords)
			core_ids = ids[clustering.core_sample_indices_]
			ids = ids[labels > -1]
			coords = coords[labels > -1]
			weights = weights[labels > -1]
			labels = labels[labels > -1]
			ret_tab = {}
			for i in range(len(labels)):
				try:
					ret_tab[labels[i]].append({'id':ids[i], 'lng':coords[i,0], 'lat':coords[i,1], 'weight':weights[i], 'is_core':ids[i] in core_ids})
				except KeyError:
					ret_tab[labels[i]] = [{'id':ids[i], 'lng':coords[i,0], 'lat':coords[i,1], 'weight':weights[i], 'is_core':ids[i] in core_ids}]
			return ret_tab
		else:
			return {}
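The method above depends on instance state, but the core pattern (run DBSCAN, drop noise, and group the surviving points into a dict keyed by cluster label with a core-point flag) can be sketched on plain arrays. The field names mirror the snippet; the data layout is an assumption:

from collections import defaultdict

import numpy as np
from sklearn.cluster import DBSCAN


def group_points_by_cluster(ids, coords, weights, eps=0.5, min_samples=5):
    clustering = DBSCAN(eps=eps, min_samples=min_samples)
    labels = clustering.fit_predict(coords)
    core = set(clustering.core_sample_indices_)
    clusters = defaultdict(list)
    for i, label in enumerate(labels):
        if label == -1:  # discard noise, as the method above does
            continue
        clusters[label].append({'id': ids[i], 'lng': coords[i, 0],
                                'lat': coords[i, 1], 'weight': weights[i],
                                'is_core': i in core})
    return dict(clusters)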
def get_clusters(tracks):
    neighbors = g.m.neighborsSpin.value()
    dist = g.m.neighborDistanceSpin.value()
    data = np.array([[tr['mean_x'], tr['mean_y']] for tr in tracks])
    scanner = DBSCAN(eps=dist, min_samples=neighbors)
    ids = scanner.fit_predict(data)
    return ids
Example No. 15
def cluster_dbscan(matrix, distance_measure="sts", eps=1):
    """Clusters the distance matrix for a given epsilon value, if distance
    measure is sts. Other distance measures are: [‘cityblock’, ‘cosine’, 
    ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’, ‘braycurtis’, ‘canberra’, 
    ‘chebyshev’, ‘correlation’, ‘dice’, ‘hamming’, ‘jaccard’, ‘kulsinski’, 
    ‘mahalanobis’, ‘matching’, ‘minkowski’, ‘rogerstanimoto’, ‘russellrao’, 
    ‘seuclidean’, ‘sokalmichener’, ‘sokalsneath’, ‘sqeuclidean’, ‘yule’]

    Parameters
    ----------
    matrix: np.matrix
        The input matrix. If distance measure is sts, this should be the sts
        distance matrix. If other distance, this should be the time-series
        matrix of size ngenes x nsamples.
    distance_measure: str
        The distance measure, default is sts, short time-series distance.
        Any distance measure available in scikit-learn is available here.
        Note: multiple time-series is NOT supported for distances other than    
        "sts".

    Returns
    -------
    cluster_labels: list of int
        A list of size ngenes that defines cluster membership.
    """
    if (distance_measure == "sts"):
        dbs = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
    else:
        dbs = DBSCAN(eps=eps, metric=distance_measure, min_samples=2)
    cluster_labels = dbs.fit_predict(matrix)
    return cluster_labels
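A usage sketch for the non-"sts" branch, clustering a small time-series matrix (rows are series) with the correlation distance; the data here is synthetic and only meant to show the call:

import numpy as np

rng = np.random.default_rng(0)
base = np.sin(np.linspace(0, 2 * np.pi, 24))
ts_matrix = np.vstack([base + rng.normal(0, 0.1, 24) for _ in range(20)] +
                      [-base + rng.normal(0, 0.1, 24) for _ in range(20)])

labels = cluster_dbscan(ts_matrix, distance_measure="correlation", eps=0.3)
print(sorted(set(labels)))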
Example No. 16
def cluster_mappings(vector_inpath, do_pca=False, target_dim=100, indices_inpath=None, epsilon=2.5, min_s=20):
	# TODO: Clustering parameters
	# TODO: Metric: cosine similarity or euclidean distance
	print alt("Load mappings...")
	indices, model = load_mappings_from_model(vector_inpath)
	X = numpy.array([model[key] for key in indices])
	# del model
	if do_pca:
		print alt("Truncate vectors with PCA to %i dimensions..." %(target_dim))
		pca = PCA(n_components=target_dim)
		pca.fit(X)
		X = pca.transform(X)
	print alt("Cluster points...")
	# k = 2 * X[0].shape[0] - 1
	# min_pts = k + 1
	#dbscan = DBSCAN(eps=0.1, min_samples=20, metric='cosine',algorithm='brute')
	dbscan = DBSCAN(eps=epsilon, min_samples=min_s)
	dbscan.fit(X)
	labels = dbscan.labels_
	print get_cluster_size(labels)
	print alt("Finished clustering!")
	sscore = silhouette_score(X, labels)
	print("Silhouette Coefficient: %0.3f" %(sscore))
	if indices_inpath:
		resolve_indices(indices, labels, indices_inpath, model)
Example No. 17
def test():
    global est
    est = DBSCAN(eps=1, min_samples=1)
    est.fit(x)
    print est.labels_
    ari = metrics.adjusted_rand_score(y, est.labels_)
    print ari
Example No. 18
def train_dbscan():
	print "starting dbscan clustering..."
	model = DBSCAN(eps=dbs_eps, min_samples=dbs_min_samples, metric=dbs_metric, algorithm='auto')
	model.fit(X)
	
	core_points = model.core_sample_indices_
	if output_core_points:
		print "core points data index"
		print core_points
	print "num of core points %d" %(len(core_points))
	
	print "all points clutser index"
	cluster_index = model.labels_
	if output_cluster_members:
		#print cluster_index
		cluster_members = {}
		for i,c in enumerate(cluster_index):
			index_list = cluster_members.get(c, list())
			index_list.append(i)
			cluster_members[c] = index_list
		for cl, indx_list in cluster_members.iteritems():
			if cl >= 0:
				print "cluster index %d  size %d" %(cl, len(indx_list))
			else:
				print "noise points size %d" %(len(indx_list))
			print indx_list
	
	print "num of clusters %d" %(cluster_index.max() + 1)
Example No. 19
def cluster_DBSCAN(args):
	"""
	Cluster vectorized graphs with DBSCAN after projecting them to a lower-dimensional space.
	"""
	#load data
	g_it = node_link_data.node_link_data_to_eden(input = args.input_file, input_type = "file")
	vec = graph.Vectorizer(r = args.radius,d = args.distance, nbits = args.nbits)
	logger.info('Vectorizer: %s' % vec)

	X = vec.transform(g_it, n_jobs = args.n_jobs)
	logger.info('Instances: %d Features: %d with an avg of %d features per instance' % (X.shape[0], X.shape[1], X.getnnz() / X.shape[0]))
	
	#project to lower dimensional space to use clustering algorithms
	transformer = TruncatedSVD(n_components=args.n_components)
	X_dense=transformer.fit_transform(X)

	#log statistics on data
	logger.info('Dimensionality reduction Instances: %d Features: %d with an avg of %d features per instance' % (X_dense.shape[0], X_dense.shape[1], X.getnnz() / X.shape[0]))

	#clustering
	clustering_algo = DBSCAN(eps = args.eps)
	y = clustering_algo.fit_predict(X_dense)
	msg = 'Predictions statistics: '
	msg += util.report_base_statistics(y)
	logger.info(msg)

	#save model for vectorizer
	out_file_name = "vectorizer"
	eden_io.dump(vec, output_dir_path = args.output_dir_path, out_file_name = out_file_name)
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)

	#save result
	out_file_name = "labels"
	eden_io.store_matrix(matrix = y, output_dir_path = args.output_dir_path, out_file_name = out_file_name, output_format = "text")
	logger.info("Written file: %s/%s",args.output_dir_path, out_file_name)
Example No. 20
def find_tracks(data, eps=20, min_samples=20):
    """Applies the DBSCAN algorithm from scikit-learn to find tracks in the data.

    Parameters
    ----------
    data : array-like
        An array of (x, y, z, hits) data points
    eps : number, optional
        The minimum distance between adjacent points in a cluster
    min_samples : number, optional
        The min number of points in a cluster

    Returns
    -------
    tracks : list
        A list of tracks. Each track is an ndarray of points.

    """
    xyz = data[:, 0:3]
    dbs = DBSCAN(eps=eps, min_samples=min_samples)
    dbs.fit(xyz)

    tracks = []
    for track in (np.where(dbs.labels_ == n)[0] for n in np.unique(dbs.labels_) if n != -1):
        tracks.append(data[track])

    return tracks
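A quick usage sketch for `find_tracks` on a synthetic (x, y, z, hits) array; the fourth column is carried along but, as in the function above, only the first three columns drive the clustering (assumes the numpy and DBSCAN imports from the snippet's module):

import numpy as np

rng = np.random.default_rng(0)
track_a = np.column_stack([np.linspace(0, 100, 50),
                           np.linspace(0, 50, 50),
                           np.linspace(0, 200, 50),
                           rng.integers(1, 10, 50)])
track_b = track_a + np.array([500.0, 500.0, 0.0, 0.0])
data = np.vstack([track_a, track_b])

tracks = find_tracks(data, eps=20, min_samples=5)
print(len(tracks), "tracks found")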
    def classify_core(self, N_CLUSTERS, clusterType, data_for_trial_type, begin_time, end_time):

        BEGIN_TIME_FRAME = begin_time*self.griddy.TIME_GRID_SPACING
        END_TIME_FRAME = end_time*self.griddy.TIME_GRID_SPACING

        data = data_for_trial_type[:,BEGIN_TIME_FRAME:END_TIME_FRAME,self.griddy.VEL_X]

        labels = None
        if clusterType == 'kmeans':
            kmeans = KMeans(n_clusters=N_CLUSTERS)
            kmeans.fit(data)
            labels = kmeans.labels_
        elif clusterType == 'affinity_propagation':
            ap = AffinityPropagation(damping=0.75)
            ap.fit(data)
            labels = ap.labels_
            N_CLUSTERS = np.max(labels)+1
        elif clusterType == 'DBSCAN':
            dbscan = DBSCAN()
            dbscan.fit(data)
            labels = dbscan.labels_
            N_CLUSTERS = np.max(labels)+1
            print 'N_CLUSTERS=' + str(N_CLUSTERS)
        elif clusterType == 'AgglomerativeClustering':
            ac = AgglomerativeClustering(n_clusters=N_CLUSTERS)
            ac.fit(data)
            labels = ac.labels_
        else:
            print 'ERROR: clusterType: ' + clusterType + ' is not recognized'

        return (labels, N_CLUSTERS)
Example No. 22
def dbscan(similarity, concepts=2, euclid=False):
    if euclid:
        model = DBSCAN(eps=0.6, min_samples=10, algorithm='auto', leaf_size=30)
        return model.fit_predict(similarity)
    else:
        model = DBSCAN(eps=0.6, min_samples=10, metric='precomputed', algorithm='auto', leaf_size=30)
        return model.fit_predict(1 - similarity)
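A small usage sketch for the precomputed branch above, where `similarity` is a symmetric matrix with values in [0, 1] and ones on the diagonal, so that `1 - similarity` behaves like a distance matrix:

import numpy as np

# block-structured similarity: two groups of ten concepts each
similarity = np.full((20, 20), 0.1)
similarity[:10, :10] = 0.9
similarity[10:, 10:] = 0.9
np.fill_diagonal(similarity, 1.0)

labels = dbscan(similarity, euclid=False)
print(labels)  # expected: two clusters, no noise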
Example No. 23
 def cluster_lvl1(self, data):
     db = DBSCAN(eps=2., min_samples=2, metric='precomputed')
     processed = np.float32(np.vstack([
         np.mgrid[:self.map_height, :self.map_width].reshape(2, -1),
         data.ravel()
     ])).T
     dist = self.distances_for_lvl1(processed)
     return db.fit_predict(dist).reshape(self.map_height, self.map_width)
Example No. 24
    def regroup(self, maxdistance, minsize, algo = 'auto'):

        self.__loginfo('Regrouping')
        dbsfit = DBSCAN(eps=maxdistance, min_samples=minsize, algorithm=algo).fit(self.primarylist)
        dbsresult = dbsfit.fit_predict(self.primarylist)
        grouplist = []
        for grouplabel in dbsresult:
            if not grouplabel in grouplist: grouplist.append(grouplabel)
        self.__loginfo('Group label count: %s' % len(grouplist))
Example No. 25
    def cluster_dbscan(self, calpha=False, cluster_diameter=6, cluster_min_size=10):
        '''
        cluster the residues using the DBSCAN method. 
        The parameters here are neighborhood diameter (eps) and neighborhood 
        connectivity (min_samples).
        
        Returns a list of cluster labels, in which label ``-1`` means an outlier point,
        which doesn't belong to any cluster.
        '''

        if not self.positive_residues:
            return {}
        
        if calpha:
            data_atoms = self.positive_residues.select('ca')
        else:
            data_atoms = self.positive_residues.select('sidechain or ca').copy()
        
        assert (
                data_atoms.getHierView().numResidues() == 
                self.positive_residues.getHierView().numResidues()
                )
        
        OUTLIER_LABEL = -1
        
        db_clust = DBSCAN(eps=cluster_diameter, min_samples=cluster_min_size)
        db_clust.fit(data_atoms.getCoords())

        db_labels = db_clust.labels_.astype(int)
        #print db_labels, len(db_labels)
        if calpha:
            residue_labels = db_labels
        
        else:
            residues = list(data_atoms.getHierView().iterResidues())
            residue_labels = np.zeros(len(residues), dtype=int)
            
            def most_common(lst):
                lst = list(lst)
                return max(set(lst) or [OUTLIER_LABEL], key=lst.count)
            
            data_atoms.setBetas(db_labels)
            for i, res in enumerate(residues):
                atom_labels = res.getBetas()
                residue_labels[i] = most_common(atom_labels[atom_labels!=OUTLIER_LABEL])
                
        assert len(residue_labels) == self.positive_residues.getHierView().numResidues()
        
        residue_numbers = self.positive_residues.ca.getResnums()
        clusters = sorted(
                [residue_numbers[residue_labels==i] for i in
                    set(residue_labels) if i!=-1], 
                key=self.conf_sum, 
                reverse=True,
                )
        return dict(enumerate(clusters))
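The ProDy selections above need an actual structure object, but the atom-to-residue step (give each residue the most common non-outlier label among its atoms) can be sketched on plain arrays; `atom_residue_ids` and `atom_labels` below are invented stand-ins for what the snippet reads from the hierarchy view:

import numpy as np


def residue_labels_from_atoms(atom_residue_ids, atom_labels, outlier_label=-1):
    """Majority vote of per-atom cluster labels within each residue,
    ignoring outlier atoms whenever the residue has any labelled atom."""
    residue_ids = np.unique(atom_residue_ids)
    result = np.full(len(residue_ids), outlier_label, dtype=int)
    for i, res in enumerate(residue_ids):
        labels = atom_labels[atom_residue_ids == res]
        labels = labels[labels != outlier_label]
        if len(labels):
            values, counts = np.unique(labels, return_counts=True)
            result[i] = values[np.argmax(counts)]
    return result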
Example No. 26
def main(datafile, feature1, feature2, normalize, clusteroutput, percentile, copula):
    X, features = read_sah_h5(datafile, just_good=False)
    if 'id' not in features:
        ids = np.arange(len(X))
    else:
        ids = X[:, features.index('id')]
    x = X[:, features.index(feature1)]
    y = X[:, features.index(feature2)]
    D = np.column_stack([x, y])

    idx = np.random.randint(len(X), size=10000)

    D = D[idx]
    ids = ids[idx]

    if normalize:
        mean = np.average(D, axis=0)
        std = np.std(D, axis=0)
        std[np.nonzero(std == 0.0)] = 1.0 # Avoid NaNs
        Dnorm = (D - mean) / std
    elif copula:
        Dnorm = np.column_stack([copula_transform(f) for f in D.T])
    else:
        Dnorm = D

    kmeans = MiniBatchKMeans(n_clusters=50)
    gmm = GMM(n_components=200, covariance_type='full', verbose=True)
    #C = gmm.fit_predict(Dnorm)
    dbscan = DBSCAN(eps=100.0, min_samples=1)
    C = dbscan.fit_predict(Dnorm)
    print C

    with open(clusteroutput, 'w+') as f:
        for c, i in zip(C, ids):
            f.write('%d,%d\n' % (i, c))

    pl.scatter(D[:, 0], D[:, 1], color=pl.cm.spectral(C.astype(float) / np.max(C)))

#    for c in np.unique(C):
#        pl.bar(0, 0, lw=0, ec='none',
#            fc=pl.cm.spectral(float(c) / np.max(C)), label='Cluster %d' % c)
#    pl.legend(loc='upper left')

    if percentile > 0:
        pl.xlim(
            scoreatpercentile(x, percentile),
            scoreatpercentile(x, 100-percentile)
        )
        pl.ylim(
            scoreatpercentile(y, percentile),
            scoreatpercentile(y, 100-percentile)
        )

    pl.xlabel(feature1)
    pl.ylabel(feature2)
    pl.show()
Example No. 27
 def dbscan(self, eps=0.75, min_samples=3):
     """
     :param kwargs: key-value arguments to pass to DBSCAN
                    (eps: max dist between points in same neighbourhood,
                     min_samples: number of points in a neighbourhood)
     :return:
     """
     est = DBSCAN(metric='precomputed', eps=eps, min_samples=min_samples)
     est.fit(self.get_dm(False))
     return Partition(est.labels_)
Example No. 28
def fit(fvecs, params):
	eps_ = int(params[0])
	min_s = int(params[1])
	metric_=params[2]
	# affinity : “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or ‘precomputed’

	model = DBSCAN(eps=eps_, min_samples=min_s, metric=metric_)
	model.fit(fvecs)
	print len(set(model.labels_))
	return model.labels_
def mode_cluster(mode,eps,sam):
    mode_change_pnts=[]
    # print(tran_mat)
    query = {"$and": [{'type': 'move'},\
                      {'confirmed_mode':mode}]}
    # print(Sections.find(query).count())
    logging.debug("Trying to find cluster locations for %s trips" % (Sections.find(query).count()))
    for section in Sections.find(query).sort("section_start_datetime",1):
        try:
            mode_change_pnts.append(section['section_start_point']['coordinates'])
            mode_change_pnts.append(section['section_end_point']['coordinates'])
        except:
            logging.warn("Found trip %s with missing start and/or end points" % (section['_id']))
            pass
    # print(user_change_pnts)
    # print(len(mode_change_pnts))
    if len(mode_change_pnts) == 0:
      logging.debug("No points found in cluster input, nothing to fit..")
      return np.zeros(0)

    if len(mode_change_pnts)>=1:
        # print(mode_change_pnts)
        np_points=np.array(mode_change_pnts)
        # print(np_points[:,0])
        # fig, axes = plt.subplots(1, 1)
        # axes.scatter(np_points[:,0], np_points[:,1])
        # plt.show()
    else:
        pass
    utm_x = []
    utm_y = []
    for row in mode_change_pnts:
        # GEOJSON order is lng, lat
        utm_loc = utm.from_latlon(row[1],row[0])
        utm_x = np.append(utm_x,utm_loc[0])
        utm_y = np.append(utm_y,utm_loc[1])
    utm_location = np.column_stack((utm_x,utm_y))
    db = DBSCAN(eps=eps,min_samples=sam)
    db_fit = db.fit(utm_location)
    db_labels = db_fit.labels_
    #print db_labels
    new_db_labels = db_labels[db_labels!=-1]
    new_location = np_points[db_labels!=-1]
    # print len(new_db_labels)
    # print len(new_location)
    # print new_information

    label_unique = np.unique(new_db_labels)
    cluster_center = np.zeros((len(label_unique),2))
    for label in label_unique:
        sub_location = new_location[new_db_labels==label]
        temp_center = np.mean(sub_location,axis=0)
        cluster_center[int(label)] = temp_center
    # print cluster_center
    return cluster_center
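Stripped of the database query, the geometric core of `mode_cluster` is: project lng/lat points to UTM metres, run DBSCAN with `eps` in metres, and average each cluster back to a centre. A self-contained sketch of that step (assumes the `utm` package the snippet also uses, and that all points fall in one UTM zone):

import numpy as np
import utm
from sklearn.cluster import DBSCAN


def cluster_centers_utm(lnglat_points, eps_m=100, min_samples=3):
    """lnglat_points: iterable of (lng, lat) pairs, GeoJSON order."""
    pts = np.asarray(lnglat_points, dtype=float)
    utm_xy = np.array([utm.from_latlon(lat, lng)[:2] for lng, lat in pts])
    labels = DBSCAN(eps=eps_m, min_samples=min_samples).fit_predict(utm_xy)
    centers = [pts[labels == label].mean(axis=0)  # centre back in lng/lat
               for label in np.unique(labels[labels != -1])]
    return np.array(centers)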
Example No. 30
    def done(self):
        matrix = [[0]*self.count for i in range(self.count)]
        for keys,distance in self.matrixDict.iteritems():
            matrix[keys[0]][keys[1]] = distance
            matrix[keys[1]][keys[0]] = distance

        db = DBSCAN(eps=args.epsilon, metric='precomputed', min_samples=args.min_samples)
        output = db.fit(matrix)

        for label,i in self.labelToPos.iteritems():
            args.outfile.write(self.tup + [label, output.labels_[i]])
Example No. 31
"Import libraries"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import datasets
from sklearn.decomposition import PCA

"Import Datasets"
#data = pd.DataFrame(datasets.load_iris().data)
#y = list(datasets.load_iris().target)
data = pd.read_csv("glass.csv", sep=",")
data.drop(['Type'], axis=1, inplace=True)

"Apply DBSCAN model and its parameters"
model = DBSCAN(eps=0.5, min_samples=5, metric="euclidean", leaf_size=30)

"Fit data in given model"
model.fit(data)

"classified clusters"
model.labels_

"PCA decomposition for plotting"
pca = PCA(n_components=2).fit(data)
pca_2D = pca.transform(data)

"Plot clusters and Noise"
for i in np.arange(pca_2D.shape[0]):
    if model.labels_[i] == 0:
        c1 = plt.scatter(pca_2D[i, 0], pca_2D[i, 1], c="r", marker='+')
Example No. 32
def main_worker(args):
    global start_epoch, best_mAP

    cudnn.benchmark = True

    sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    # Create data loaders
    iters = args.iters if (args.iters > 0) else None
    dataset_target = get_data(args.dataset_target, args.data_dir)
    ori_train = dataset_target.train
    if not args.no_source:
        dataset_source = get_data(args.dataset_source, args.data_dir)
    test_loader_target = get_test_loader(dataset_target, args.height,
                                         args.width, args.batch_size,
                                         args.workers)

    # Create model
    model_1, model_1_ema = create_model(args)

    # Evaluator
    evaluator_1_ema = Evaluator(model_1_ema)

    best_mAP = 0

    for nc in range(args.epochs):

        cluster_loader = get_test_loader(dataset_target,
                                         args.height,
                                         args.width,
                                         args.batch_size,
                                         args.workers,
                                         testset=dataset_target.train)
        dict_f, _ = extract_features(model_1_ema,
                                     cluster_loader,
                                     print_freq=50)
        cf_1 = torch.stack(list(dict_f.values()))

        if not args.no_source:
            cluster_loader_source = get_test_loader(
                dataset_source,
                args.height,
                args.width,
                args.batch_size,
                args.workers,
                testset=dataset_source.train)
            dict_f_source, _ = extract_features(model_1_ema,
                                                cluster_loader_source,
                                                print_freq=50)
            cf_1_source = torch.stack(list(dict_f_source.values()))

        # DBSCAN cluster
        if args.no_source:
            rerank_dist = compute_jaccard_dist(cf_1,
                                               lambda_value=0,
                                               source_features=None,
                                               use_gpu=False).numpy()
        else:
            rerank_dist = compute_jaccard_dist(cf_1,
                                               lambda_value=args.lambda_value,
                                               source_features=cf_1_source,
                                               use_gpu=False).numpy()
        tri_mat = np.triu(rerank_dist, 1)  # tri_mat.dim=2
        tri_mat = tri_mat[np.nonzero(tri_mat)]  # tri_mat.dim=1
        tri_mat = np.sort(tri_mat, axis=None)
        top_num = np.round(args.rho * tri_mat.size).astype(int)
        eps = tri_mat[:top_num].mean()
        print('eps in cluster: {:.3f}'.format(eps))
        print('Clustering and labeling...')
        cluster = DBSCAN(eps=eps,
                         min_samples=4,
                         metric='precomputed',
                         n_jobs=-1)
        labels = cluster.fit_predict(rerank_dist)
        num_ids = len(set(labels)) - 1

        print('Epoch {} have {} training ids'.format(nc, num_ids))
        # generate new dataset
        labeled_ind, unlabeled_ind = [], []
        for ind, label in enumerate(labels):
            if label == -1:
                unlabeled_ind.append(ind)
            else:
                labeled_ind.append(ind)
        # print('Epoch {} have {} labeled samples and {} unlabeled samples'.format(nc + 1, len(labeled_ind), len(unlabeled_ind)))

        cf_1 = cf_1.numpy()
        centers = []
        for id in range(num_ids):
            centers.append(np.mean(cf_1[labels == id], axis=0))
        centers = np.stack(centers, axis=0)
        # print(centers.shape)

        if args.features == 0:
            model_1.module.classifier = nn.Linear(2048, num_ids,
                                                  bias=False).cuda()
            model_1_ema.module.classifier = nn.Linear(2048,
                                                      num_ids,
                                                      bias=False).cuda()
            model_1.module.classifier_max = nn.Linear(2048,
                                                      num_ids,
                                                      bias=False).cuda()
            model_1_ema.module.classifier_max = nn.Linear(2048,
                                                          num_ids,
                                                          bias=False).cuda()

            model_1.module.classifier.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, :2048],
                                           axis=1)).float().cuda())
            model_1_ema.module.classifier.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, :2048],
                                           axis=1)).float().cuda())

            model_1.module.classifier_max.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, 2048:],
                                           axis=1)).float().cuda())
            model_1_ema.module.classifier_max.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, 2048:],
                                           axis=1)).float().cuda())
        else:
            model_1.module.classifier = nn.Linear(1024, num_ids,
                                                  bias=False).cuda()
            model_1_ema.module.classifier = nn.Linear(1024,
                                                      num_ids,
                                                      bias=False).cuda()
            model_1.module.classifier_max = nn.Linear(1024,
                                                      num_ids,
                                                      bias=False).cuda()
            model_1_ema.module.classifier_max = nn.Linear(1024,
                                                          num_ids,
                                                          bias=False).cuda()

            model_1.module.classifier.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, :1024],
                                           axis=1)).float().cuda())
            model_1_ema.module.classifier.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, :1024],
                                           axis=1)).float().cuda())

            model_1.module.classifier_max.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, 1024:],
                                           axis=1)).float().cuda())
            model_1_ema.module.classifier_max.weight.data.copy_(
                torch.from_numpy(normalize(centers[:, 1024:],
                                           axis=1)).float().cuda())

        target_label = labels

        for i in range(len(dataset_target.train)):
            dataset_target.train[i] = list(dataset_target.train[i])
            dataset_target.train[i][1] = int(target_label[i])
            dataset_target.train[i] = tuple(dataset_target.train[i])

        # Optimizer
        params = []
        for key, value in model_1.named_parameters():
            if not value.requires_grad:
                continue
            params += [{
                "params": [value],
                "lr": args.lr,
                "weight_decay": args.weight_decay
            }]

        optimizer = torch.optim.Adam(params)

        # Trainer
        trainer = ABMTTrainer(model_1,
                              model_1_ema,
                              num_cluster=num_ids,
                              alpha=args.alpha)
        epoch = nc
        # # DBSCAN
        dataset_target.train = [ori_train[i] for i in labeled_ind]
        print(len(dataset_target.train), 'are labeled.')
        labeled_loader_target = get_train_loader(dataset_target,
                                                 args.height,
                                                 args.width,
                                                 args.batch_size,
                                                 args.workers,
                                                 args.num_instances,
                                                 iters,
                                                 mutual=True)
        labeled_loader_target.new_epoch()

        trainer.train(epoch,
                      labeled_loader_target,
                      optimizer,
                      print_freq=args.print_freq,
                      train_iters=len(labeled_loader_target))

        def save_model(model_ema, is_best, best_mAP, mid):
            save_checkpoint(
                {
                    'state_dict': model_ema.state_dict(),
                    'epoch': epoch + 1,
                    'best_mAP': best_mAP,
                },
                is_best,
                fpath=osp.join(args.logs_dir,
                               'model' + str(mid) + '_checkpoint.pth.tar'))

        if ((epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1)):
            print('Evaluating teacher net:')
            cmc, mAP_1 = evaluator_1_ema.evaluate(test_loader_target,
                                                  dataset_target.query,
                                                  dataset_target.gallery,
                                                  cmc_flag=True)
            is_best = (mAP_1 > best_mAP)
            best_mAP = max(mAP_1, best_mAP)

            save_model(model_1_ema, is_best, best_mAP, 1)
            dataset_target.train = ori_train
    print('Test on the best model.')
    checkpoint = load_checkpoint(osp.join(args.logs_dir, 'model_best.pth.tar'))
    model_1_ema.load_state_dict(checkpoint['state_dict'])
    evaluator_1_ema.evaluate(test_loader_target,
                             dataset_target.query,
                             dataset_target.gallery,
                             cmc_flag=True)
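One detail worth isolating from the training loop above is how `eps` is chosen: take the upper triangle of the pairwise distance matrix, sort it, and use the mean of the smallest `rho` fraction of entries as the DBSCAN radius. A standalone sketch of that heuristic, with `pairwise_distances` standing in for the re-ranked Jaccard distance and `rho` mirroring `args.rho`:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances


def adaptive_eps_dbscan(features, rho=0.002, min_samples=4):
    dist = pairwise_distances(features)  # stand-in for compute_jaccard_dist output
    tri = np.triu(dist, 1)
    tri = np.sort(tri[np.nonzero(tri)], axis=None)
    top_num = np.round(rho * tri.size).astype(int)
    eps = tri[:top_num].mean()
    labels = DBSCAN(eps=eps, min_samples=min_samples,
                    metric='precomputed', n_jobs=-1).fit_predict(dist)
    return eps, labels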
Example No. 33
def dbscan_cluster(data: np.ndarray,
                   eps: float = 0.05,
                   min_samples: int = 5) -> np.ndarray:
    return DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X=data)
def test(file_name):
    data_frame = pd.read_csv('/home/mytrah-pc/Mytrah_Adithya/data_turbine/' +
                             file_name)
    num_rows = data_frame.shape[0]
    filter_data_frame = data_frame.copy()[['ActivePower', 'WindSpeed']]
    filter_data_frame['set_in'] = -2
    min_active_power = filter_data_frame['ActivePower'].min()
    max_active_power = filter_data_frame['ActivePower'].max()
    min_wind_speed = filter_data_frame['WindSpeed'].min()
    max_wind_speed = filter_data_frame['WindSpeed'].max()
    global_max_p = max_active_power
    global_min_p = min_active_power
    """
    Subtract min_active_power from all active power values
    Subtract min_wind_speed from all wind speed values
    Divide all active power by (max_active_power - min_active_power)
    Divide all wind speed by (max_wind_speed - min_wind_speed)
    """
    filter_data_frame['ActivePowerScaled'] = (
        (filter_data_frame['ActivePower'] - min_active_power) *
        15) / (max_active_power - min_active_power)

    filter_data_frame['WindSpeedScaled'] = (
        (filter_data_frame['WindSpeed'] - min_wind_speed) *
        20) / (max_wind_speed - min_wind_speed)

    scan = DBSCAN(eps=0.28, min_samples=15).fit_predict(
        filter_data_frame[['ActivePowerScaled', 'WindSpeedScaled']])

    filter_data_frame['set_in'] = scan

    import random
    r = lambda: random.randint(0, 255)

    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 10))

    num_of_groups = 0
    static_compare = -1
    static_max = -2
    for group in filter_data_frame.groupby('set_in'):
        num_of_groups = num_of_groups + 1

        if len(group[1]) > static_compare:
            static_compare = len(group[1])
            static_max = group[0]

        plt.scatter(group[1]['WindSpeed'],
                    group[1]['ActivePower'],
                    s=np.pi * 2 * 2,
                    c='#c0c0c0')

    loop_list = list(range(9) - np.ones(9))
    del loop_list[loop_list.index(static_max)]

    temp_frame = pd.concat([
        filter_data_frame[filter_data_frame['set_in'] == i] for i in loop_list
    ])
    data_frame = temp_frame[(temp_frame['ActivePower'] < (global_max_p) * 0.8)
                            & (temp_frame['ActivePower'] >
                               (global_max_p) * 0.2)]

    num_rows = data_frame.shape[0]
    filter_data_frame = data_frame.copy()[['ActivePower', 'WindSpeed']]
    filter_data_frame['set_in'] = -2
    min_active_power = filter_data_frame['ActivePower'].min()
    max_active_power = filter_data_frame['ActivePower'].max()
    min_wind_speed = filter_data_frame['WindSpeed'].min()
    max_wind_speed = filter_data_frame['WindSpeed'].max()
    """
    Subtract min_active_power from all active power values
    Subtract min_wind_speed from all wind speed values
    Divide all active power by (max_active_power - min_active_power)
    Divide all wind speed by (max_wind_speed - min_wind_speed)
    """
    filter_data_frame['ActivePowerScaled'] = (
        (filter_data_frame['ActivePower'] - min_active_power) *
        250) / (max_active_power - min_active_power)

    filter_data_frame['WindSpeedScaled'] = (
        (filter_data_frame['WindSpeed'] - min_wind_speed) *
        20) / (max_wind_speed - min_wind_speed)

    scan = DBSCAN(eps=2.5, min_samples=15).fit_predict(
        filter_data_frame[['ActivePowerScaled', 'WindSpeedScaled']])

    filter_data_frame['set_in'] = scan

    import random
    r = lambda: random.randint(0, 255)

    num_of_groups = 0
    static_compare = -1
    static_max = -2
    for group in filter_data_frame.groupby('set_in'):
        num_of_groups = num_of_groups + 1

        if len(group[1]) > static_compare:
            static_compare = len(group[1])
            static_max = group[0]

        if (group[0] == -1):
            continue

        plt.scatter(
            group[1]['WindSpeed'],
            group[1]['ActivePower'],
            s=np.pi * 2 * 2,
            c='#000000'  #'#%02X%02X%02X' % (r(),r(),r())
        )

    plt.show()
Example No. 35
def plot_cluster_map_multi(
        df_filtered_dict,
        # quantile_group,
        range_axis,
        data_column_i,
        OUTPUT_CHARTS_DIR,
        number_to_name_dict={},
        eps=4,
        min_samples=10,
        save_plot=True):
    # print('start_cluster_map')
    import numpy as np
    import pandas as pd
    import os
    from sklearn.cluster import DBSCAN
    from sklearn import metrics
    from sklearn.preprocessing import StandardScaler
    quantile_group = df_filtered_dict['quantile_group']
    range_low = df_filtered_dict['range_low']
    range_high = df_filtered_dict['range_high']
    df_filtered = df_filtered_dict['df_filtered']
    X = df_filtered[['X', 'Y']].values

    # #############################################################################
    # Compute DBSCAN
    # print(range_low)
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    # print(range_high)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    n_clusters_points_ = len(X) - n_noise_
    if n_clusters_ > 0:
        print('Data Column Name: %s' % data_column_i)
        print('Quantile Group: %s' % quantile_group)
        print('Estimated number of clusters: %d' % n_clusters_)
        print('Estimated number of clusters points: %d' % n_clusters_points_)
        print('Estimated number of noise points: %d' % n_noise_)
        print('Range between : %s and %s' % (range_low, range_high))
        # #############################################################################
        # Plot result
        if save_plot:
            chart_name = data_column_i
            print('chartname {0}'.format(chart_name))
            if bool(number_to_name_dict):
                print('dict exists')
                if str(data_column_i) in number_to_name_dict:
                    print('keyexists')
                    chart_name = number_to_name_dict[str(data_column_i)]
            print('chartname after : {0}'.format(chart_name))
            import matplotlib.pyplot as plt

            # Black removed and is used for noise instead.
            unique_labels = set(labels)
            colors = [
                plt.cm.Spectral(each)
                for each in np.linspace(0, 1, len(unique_labels))
            ]
            for k, col in zip(unique_labels, colors):
                if k == -1:
                    # Black used for noise.
                    col = [0, 0, 0, 1]

                class_member_mask = (labels == k)

                xy = X[class_member_mask & core_samples_mask]
                plt.plot(xy[:, 0],
                         xy[:, 1],
                         '.',
                         color=tuple(col),
                         markersize=3)

            # xy = X[class_member_mask & ~core_samples_mask]
            # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
            #		 markeredgecolor='k', markersize=1)

            plt.title('Between %s and %s : %d clusters and % d points' %
                      (round(range_low, 3), round(
                          range_high, 3), n_clusters_, n_clusters_points_))
            plt.axis(range_axis)

            F = plt.gcf()

            F.savefig(os.path.join(
                OUTPUT_CHARTS_DIR,
                '{0}_quantile_group_{1}.png'.format(chart_name,
                                                    quantile_group)),
                      dpi=(500))
            plt.show()
            plt.clf()
    df_filtered['CLUSTER'] = labels
    df_filtered = df_filtered.loc[df_filtered['CLUSTER'] != -1]
    df_filtered['CLUSTER_COUNT'] = df_filtered.groupby(
        'CLUSTER')['CLUSTER'].transform('count')

    df_filtered['PERCENTILE'] = [range_low] * len(df_filtered.index)
    return df_filtered
Example No. 36
    def getClusters(positions, distanceKM, min_samples=5):
        """
        Returns the clusters from the points based on provided data to no. of
            clusters based on DBScan Algorithm

        Parameters
        ----------
        positions : Geodataframe object
           Geodataframe with positions to be clustered
        distanceKM : Float
            Epsilon parameters fo dbscan algorithm in km. or, distance for
                clustering of points
        min_samples : Integer, optional
            DESCRIPTION. Minimum no. of points required to form cluster.
                If 1 is set,each individual will form their own cluster
                The default is 5.

        Returns
        -------
        Dataframe
            The dataframe with cluster centres co-ordinates and no. of points
                on the cluster.

        """
        def get_centermost_point(cluster):
            centroid = (MultiPoint(cluster).centroid.x,
                        MultiPoint(cluster).centroid.y)
            centermost_point = min(
                cluster, key=lambda point: great_circle(point, centroid).m)
            return tuple(centermost_point)

        df = positions.to_crs({'init': 'epsg:4326'})
        lon = df.geometry.x
        lat = df.geometry.y
        origin_pt = pd.DataFrame()
        # Populate lat lon to dataframe
        origin_pt['lat'] = lat
        origin_pt['lon'] = lon
        # add index to data
        coords = origin_pt.to_numpy()
        origin_pt.index = [i for i in range(len(lat))]
        #
        # Convert Data to projected and perform clustering
        kms_per_radian = 6371.0088
        epsilon = distanceKM / kms_per_radian
        db = DBSCAN(eps=epsilon,
                    min_samples=min_samples,
                    algorithm='ball_tree',
                    metric='haversine').fit(np.radians(coords))
        cluster_labels = db.labels_
        validClusters = []
        for cluster in cluster_labels:
            if cluster != -1:
                validClusters.append(cluster)
        num_clusters = len(set(validClusters))
        clusters = pd.Series(
            [coords[cluster_labels == n] for n in range(num_clusters)])
        # Assigining clusterId to each point
        origin_pt['clusterId'] = cluster_labels
        # Identify cluster Centres
        centermost_points = clusters.map(get_centermost_point)

        # Create Geodataframe with attributes for cluster centroids
        clusterId = [i for i in range(len(centermost_points))]
        centroidLat = [
            centermost_points[i][0] for i in range(len(centermost_points))
        ]
        centroidLon = [
            centermost_points[i][1] for i in range(len(centermost_points))
        ]
        clusterSize = [
            len(origin_pt[origin_pt['clusterId'] == i])
            for i in range(len(centermost_points))
        ]
        # Create dataframe for cluster centers
        clusterCentres_df = pd.DataFrame({
            'clusterId': clusterId,
            'clusterLat': centroidLat,
            'clusterLon': centroidLon,
            'clusterSize': clusterSize
        })
        clusterCentres = gpd.GeoDataFrame(clusterCentres_df,
                                          geometry=gpd.points_from_xy(
                                              clusterCentres_df.clusterLon,
                                              clusterCentres_df.clusterLat))
        return clusterCentres
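A compact usage sketch of the distance handling above: haversine DBSCAN expects coordinates in radians, with `eps` expressed as distance divided by the Earth's radius. The geopandas/shapely plumbing is omitted; plain latitude/longitude arrays (invented here) are enough to show the conversion:

import numpy as np
from sklearn.cluster import DBSCAN


def cluster_latlon(lat, lon, distance_km=1.0, min_samples=5):
    kms_per_radian = 6371.0088
    coords = np.radians(np.column_stack([lat, lon]))
    db = DBSCAN(eps=distance_km / kms_per_radian, min_samples=min_samples,
                algorithm='ball_tree', metric='haversine').fit(coords)
    return db.labels_


# two tight groups of GPS fixes roughly 100 km apart
lat = np.r_[np.full(10, 27.70) + 1e-3 * np.arange(10), np.full(10, 28.60)]
lon = np.r_[np.full(10, 85.32), np.full(10, 85.32) + 1e-3 * np.arange(10)]
print(cluster_latlon(lat, lon))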
Example No. 37
	def dbscan_dados(self, topics, epsilon=0.5, num_topics=5, n_words=15):
		X = self.topics_to_vectorspace(topics, num_topics=num_topics, n_words=n_words)
		clustering = DBSCAN(eps=epsilon).fit(X.toarray())
		return clustering.labels_
"""


from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


df = pd.read_csv('week4event.csv',header=None)


X = df.to_numpy()
X = X[:20000,[2, 6]]

db = DBSCAN(eps=3, min_samples=2).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

##########
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
Example No. 39
X_scaled = scaler.transform(X)

fig, axes = plt.subplots(1,
                         4,
                         figsize=(15, 3),
                         subplot_kw={
                             'xticks': (),
                             'yticks': ()
                         })
plt.subplots_adjust(left=0.05, right=0.95)

# List the algorithms to use
algorithms = [
    KMeans(n_clusters=2),
    AgglomerativeClustering(n_clusters=2),
    DBSCAN()
]

# Create a random cluster assignment as a reference
random_state = np.random.RandomState(seed=0)
random_clusters = random_state.randint(low=0, high=2, size=len(X))

# Plot the random assignment
axes[0].scatter(X_scaled[:, 0],
                X_scaled[:, 1],
                c=random_clusters,
                cmap=mglearn.cm3,
                s=60)
axes[0].set_title("Random assignment - ARI: {:.2f}".format(
    adjusted_rand_score(y, random_clusters)))
def cluster_torsions_DBSCAN(file_list,
                            cgmodel,
                            min_samples=5,
                            eps=0.5,
                            frame_start=0,
                            frame_stride=1,
                            frame_end=-1,
                            output_format="pdb",
                            output_dir="cluster_output",
                            backbone_torsion_type="bb_bb_bb_bb",
                            core_points_only=True,
                            filter=True,
                            filter_ratio=0.25,
                            plot_silhouette=True,
                            plot_distance_hist=True):
    """
    Given PDB or DCD trajectory files and coarse grained model as input, this function performs DBSCAN clustering on the poses in the trajectory, and returns a list of the coordinates for the medoid pose of each cluster.

    :param file_list: A list of PDB or DCD files to read and concatenate
    :type file_list: List( str )

    :param cgmodel: A CGModel() class object
    :type cgmodel: class
    
    :param min_samples: minimum of number of samples in neighborhood of a point to be considered a core point (includes point itself)
    :type min_samples: int
    
    :param eps: DBSCAN parameter neighborhood distance cutoff
    :type eps: float

    :param frame_start: First frame in trajectory file to use for clustering.
    :type frame_start: int

    :param frame_stride: Advance by this many frames when reading trajectories.
    :type frame_stride: int

    :param frame_end: Last frame in trajectory file to use for clustering.
    :type frame_end: int
    
    :param output_format: file format extension to write medoid coordinates to (default="pdb"), dcd also supported
    :type output_format: str
    
    :param output_dir: directory to write clustering medoid and plot files
    :type output_dir: str
    
    :param backbone_torsion_type: particle sequence of the backbone torsions (default="bb_bb_bb_bb") - for now only single sequence permitted
    :type backbone_torsion_type: str

    :param core_points_only: use only core points to calculate medoid structures (default=True)
    :type core_points_only: boolean    
    
    :param filter: option to apply neighborhood radius filtering to remove low-density data (default=True)
    :type filter: boolean
    
    :param filter_ratio: fraction of data points which pass through the neighborhood radius filter (default=0.25)
    :type filter_ratio: float
    
    :param plot_silhouette: option to create silhouette plot of clustering results (default=True)
    :type plot_silhouette: boolean
    
    :param plot_distance_hist: option to plot a histogram of torsion euclidean distances (post-filtering)
    :type plot_torsion_hist: boolean

    :returns:
       - medoid_positions ( np.array( float * unit.angstrom ( n_clusters x num_particles x 3 ) ) ) - A 3D numpy array of poses corresponding to the medoids of all trajectory clusters.
       - medoid torsions ( np.array ( float * unit.degrees ( n_clusters x n_torsion ) - A 2D numpy array of the backbone torsion angles for each cluster medoid
       - cluster_sizes ( List ( int ) ) - A list of number of members in each cluster 
       - cluster_rmsd( np.array ( float ) ) - A 1D numpy array of rmsd (in cluster distance space) of samples to cluster centers
       - n_noise ( int ) - number of points classified as noise
       - silhouette_avg - ( float ) - average silhouette score across all clusters 
    """

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    torsion_val_array, traj_all = get_torsion_matrix(file_list, cgmodel,
                                                     frame_start, frame_stride,
                                                     frame_end,
                                                     backbone_torsion_type)

    # Precompute the Euclidean distance matrix, accounting for periodic boundaries
    # (torsion differences wrap at 360 degrees).

    angle_range = np.full(torsion_val_array.shape[1], 360)
    powers = np.full(torsion_val_array.shape[1], 2)

    torsion_distances = np.zeros(
        (torsion_val_array.shape[0], torsion_val_array.shape[0]))

    for i in range(torsion_val_array.shape[0]):
        for j in range(torsion_val_array.shape[0]):
            delta = np.abs(torsion_val_array[i, :] - torsion_val_array[j, :])
            delta = np.where(delta > 0.5 * angle_range, delta - angle_range,
                             delta)
            torsion_distances[i, j] = np.sqrt(np.power(delta, powers).sum())

    if filter:
        # Filter distances:
        torsion_distances, dense_indices, filter_ratio_actual = \
            filter_distances(torsion_distances, filter_ratio=filter_ratio)

        traj_all = traj_all[dense_indices]

    if plot_distance_hist:
        distances_row = np.reshape(
            torsion_distances,
            (torsion_distances.shape[0] * torsion_distances.shape[1], 1))

        # Remove zero distances (including the diagonal elements):
        distances_row = distances_row[distances_row != 0]

        figure = plt.figure()
        n_out, bin_edges_out, patch = plt.hist(distances_row,
                                               bins=1000,
                                               density=True)
        plt.xlabel('rmsd')
        plt.ylabel('probability density')
        plt.savefig(f'{output_dir}/torsion_distances_hist.pdf')
        plt.close()

    # Cluster with sklearn DBSCAN
    dbscan = DBSCAN(min_samples=min_samples, eps=eps,
                    metric='precomputed').fit(torsion_distances)
    # This produces cluster labels from 0 to n_clusters-1 and assigns -1 to noise points

    # Get labels
    labels = dbscan.labels_

    # Get core sample indices:
    core_sample_indices = dbscan.core_sample_indices_

    # Number of clusters:
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)

    # Number of noise points:
    n_noise = list(labels).count(-1)

    # Get indices of frames in each cluster:
    cluster_indices = {}
    cluster_indices_core = {}
    cluster_sizes = []
    cluster_sizes_core = []

    for k in range(n_clusters):
        cluster_indices[k] = np.argwhere(labels == k)[:, 0]
        cluster_indices_core[k] = []
        for elem in cluster_indices[k]:
            if elem in core_sample_indices:
                cluster_indices_core[k].append(elem)
        cluster_sizes.append(len(cluster_indices[k]))
        cluster_sizes_core.append(len(cluster_indices_core[k]))

    # Get indices of frames classified as noise:
    noise_indices = np.argwhere(labels == -1)[:, 0]

    # Find the structure closest to each center (medoid):
    # OPTICS/DBSCAN does not have a built-in function to transform to cluster-distance space,
    # as the centroids of the clusters are not physically meaningful in general. However, as
    # RMSD between structures is our only clustering feature, the cluster centers (regions of
    # high density) will likely be representative structures of each cluster.

    # Following the protocol outlined in MDTraj example:
    # http://mdtraj.org/1.9.3/examples/centroids.html

    # Create distance matrices within each cluster:
    torsion_distances_k = {}

    if core_points_only:
        for k in range(n_clusters):
            torsion_distances_k[k] = np.zeros(
                (cluster_sizes_core[k], cluster_sizes_core[k]))
            for i in range(cluster_sizes_core[k]):
                for j in range(cluster_sizes_core[k]):
                    torsion_distances_k[k][i, j] = torsion_distances[
                        cluster_indices_core[k][i], cluster_indices_core[k][j]]

        # Compute medoid based on similarity scores:
        medoid_index = []  # Global index
        intra_cluster_medoid_index = []  # Index within cluster
        for k in range(n_clusters):
            intra_cluster_medoid_index.append(
                np.exp(-torsion_distances_k[k] /
                       torsion_distances_k[k].std()).sum(axis=1).argmax())
            # Here we need to use the global sample index to find the medoid structure:
            medoid_index.append(
                cluster_indices_core[k][intra_cluster_medoid_index[k]])

    else:
        for k in range(n_clusters):
            torsion_distances_k[k] = np.zeros(
                (cluster_sizes[k], cluster_sizes[k]))
            for i in range(cluster_sizes[k]):
                for j in range(cluster_sizes[k]):
                    torsion_distances_k[k][i, j] = torsion_distances[
                        cluster_indices[k][i], cluster_indices[k][j]]

        # Compute medoid based on similarity scores:
        medoid_index = []  # Global index
        intra_cluster_medoid_index = []  # Index within cluster
        for k in range(n_clusters):
            intra_cluster_medoid_index.append(
                np.exp(-torsion_distances_k[k] /
                       torsion_distances_k[k].std()).sum(axis=1).argmax())
            # Here we need to use the global sample index to find the medoid structure:
            medoid_index.append(
                cluster_indices[k][intra_cluster_medoid_index[k]])

    medoid_xyz = np.zeros([n_clusters, traj_all.n_atoms, 3])
    for k in range(n_clusters):
        medoid_xyz[k, :, :] = traj_all[medoid_index[k]].xyz[0]

    # Write medoids to file
    write_medoids_to_file(cgmodel, medoid_xyz, output_dir, output_format)
    medoid_positions = medoid_xyz * unit.nanometer

    # Get medoid torsions:
    medoid_torsions = np.zeros([n_clusters, torsion_val_array.shape[1]])
    for k in range(n_clusters):
        medoid_torsions[k, :] = torsion_val_array[medoid_index[k], :]

    # Compute intra-cluster rmsd of samples to medoid based on structure rmsd
    cluster_rmsd = np.zeros(n_clusters)

    for k in range(n_clusters):
        cluster_rmsd[k] = np.sqrt((
            (torsion_distances_k[k][intra_cluster_medoid_index[k]]**2).sum()) /
                                  len(cluster_indices[k]))

    # Get silhouette scores
    try:
        silhouette_sample_values = silhouette_samples(torsion_distances,
                                                      labels)
        silhouette_avg = np.mean(silhouette_sample_values[labels != -1])

        if plot_silhouette:
            # Plot silhouette analysis
            plotfile = f"{output_dir}/silhouette_dbscan_min_sample_{min_samples}_eps_{eps}.pdf"

            make_silhouette_plot(dbscan, silhouette_sample_values,
                                 silhouette_avg, n_clusters, cluster_rmsd,
                                 cluster_sizes, plotfile)
    except ValueError:
        print(
            "Silhouette scores could not be computed: fewer than 2 distinct cluster labels "
            "(e.g. a single cluster, or all points classified as noise). "
            "Try adjusting the DBSCAN min_samples and eps parameters."
        )
        silhouette_avg = None

    return medoid_positions, medoid_torsions, cluster_sizes, cluster_rmsd, n_noise, silhouette_avg
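
# A usage sketch for cluster_torsions_DBSCAN (not part of the original module):
# the trajectory file names and the CGModel() construction below are placeholders,
# and the eps/min_samples values are illustrative rather than tuned.
file_list = ["replica_1.dcd", "replica_2.dcd"]  # placeholder trajectory files
cgmodel = CGModel()                             # placeholder coarse-grained model

(medoid_positions, medoid_torsions, cluster_sizes, cluster_rmsd,
 n_noise, silhouette_avg) = cluster_torsions_DBSCAN(
    file_list,
    cgmodel,
    min_samples=10,
    eps=0.5,
    frame_stride=10,
    filter=True,
    filter_ratio=0.25,
    output_dir="cluster_output",
)
print(f"{len(cluster_sizes)} clusters, {n_noise} noise frames, "
      f"mean silhouette: {silhouette_avg}")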
Exemplo n.º 41
0
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))

# KNN accuracy on scaled data
knn.fit(X_train_scaled, y_train)
y_pred_class = knn.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred_class))

'''
DB Scan Clustering
'''

# DBSCAN with eps=1 and min_samples=3
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=1, min_samples=3)
db.fit(X_scaled)

# review the cluster labels
db.labels_

# save the cluster labels and sort by cluster
NHL['cluster'] = db.labels_
NHL.sort_values('cluster')

# review the cluster centers
NHL.groupby('cluster').mean()

# scatter plot matrix of DBSCAN cluster assignments (0=red, 1=green, 2=blue, -1=yellow)
pd.plotting.scatter_matrix(X, c=colors[NHL.cluster], figsize=(10,10), s=100)
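
# A common heuristic (not in the original notebook) for sanity-checking the eps=1
# choice above: plot the sorted distances to the k-th nearest neighbor (k matching
# min_samples) and place eps near the "elbow" of the curve. X_scaled is the scaled
# feature matrix used above.
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

k = 3  # match min_samples used above
nn = NearestNeighbors(n_neighbors=k).fit(X_scaled)
knn_distances, _ = nn.kneighbors(X_scaled)
plt.plot(np.sort(knn_distances[:, -1]))
plt.xlabel('points sorted by k-distance')
plt.ylabel('distance to %d-th nearest neighbor' % k)
plt.show()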
def show_dbscan():
    centers = [[1, 1], [-1, -1], [1, -1]]
    # Create the test samples
    X, labels_true = make_blobs(n_samples=750,
                                centers=centers,
                                cluster_std=0.4,
                                random_state=0)

    X = StandardScaler().fit_transform(X)
    # Main DBSCAN parameters:
    # eps: maximum distance between two samples for them to be considered neighbors
    # min_samples: minimum number of samples in a neighborhood for a core point
    # algorithm: one of 'auto', 'ball_tree', 'kd_tree', 'brute'
    # leaf_size: leaf size passed to the BallTree or cKDTree
    # n_jobs: number of parallel jobs
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" %
          metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" %
          metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" %
          metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, labels))
    # Black removed and is used for noise instead.
    unique_labels = set(labels)
    colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = 'k'

        class_member_mask = (labels == k)

        xy = X[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=col,
                 markeredgecolor='k',
                 markersize=14)

        xy = X[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0],
                 xy[:, 1],
                 'o',
                 markerfacecolor=col,
                 markeredgecolor='k',
                 markersize=6)

    plt.title('Estimated number of clusters: %d' % n_clusters_)
    plt.show()
Exemplo n.º 43
0
def do_dbs(ft):
    return DBSCAN(eps=0.3, min_samples=10).fit(ft).labels_
Exemplo n.º 44
0
print("Lower Bound:", lower_bound)

upper_bound = Q3 + 1.5 * IQR
print("Upper Bound:", upper_bound)

df_clean = df[(df['V13'] > lower_bound) & (df['V13'] < upper_bound)]

sns.boxplot(y=df_clean['V13'])
plt.show()

sns.scatterplot(x=df['V13'], y=df['V14'])

from sklearn.cluster import DBSCAN

X_train = df[['V13', 'V14']]
model = DBSCAN()
model.fit(X_train)

cluster_labels = model.labels_
plt.scatter(df["V13"], df["V14"], c=cluster_labels)

plt.show()

df['labels'] = cluster_labels
df_cluster_clean = df[df['labels'] != -1]

plt.scatter(df_cluster_clean["V13"], df_cluster_clean["V14"], c='r')
plt.xlabel('V13')
plt.ylabel('V14')
plt.show()
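
# Quick check (not in the original snippet): DBSCAN() above runs with scikit-learn's
# defaults (eps=0.5, min_samples=5), so it is worth seeing how many points were
# flagged as noise before trusting the cleaned data.
import numpy as np

n_outliers = int(np.sum(cluster_labels == -1))
print("%d of %d points flagged as outliers (%.1f%%)" %
      (n_outliers, len(cluster_labels), 100.0 * n_outliers / len(cluster_labels)))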
Exemplo n.º 45
0
def main_worker(args):
    global start_epoch, best_mAP

    cudnn.benchmark = True

    sys.stdout = Logger(osp.join(args.logs_dir, 'log.txt'))
    print("==========\nArgs:{}\n==========".format(args))

    # Create data loaders
    iters = args.iters if (args.iters > 0) else None
    ncs = [int(x) for x in args.ncs.split(',')]
    # ncs_dbscan=ncs.copy()
    dataset_target, label_dict = get_data(args.dataset_target, args.data_dir,
                                          len(ncs))
    test_loader_target = get_test_loader(dataset_target, args.height,
                                         args.width, args.batch_size,
                                         args.workers)
    tar_cluster_loader = get_test_loader(dataset_target,
                                         args.height,
                                         args.width,
                                         args.batch_size,
                                         args.workers,
                                         testset=dataset_target.train)

    dataset_source, _ = get_data(args.dataset_source, args.data_dir, len(ncs))
    sour_cluster_loader = get_test_loader(dataset_source,
                                          args.height,
                                          args.width,
                                          args.batch_size,
                                          args.workers,
                                          testset=dataset_source.train)
    train_loader_source = get_train_loader(dataset_source, args.height,
                                           args.width, 0, args.batch_size,
                                           args.workers, args.num_instances,
                                           args.iters, dataset_source.train)

    source_classes = dataset_source.num_train_pids
    distribution, _ = write_sta_im(dataset_source.train)

    fc_len = 3500
    model_1, _, model_1_ema, _ = create_model(
        args, [fc_len for _ in range(len(ncs))])
    # print(model_1)

    epoch = 0
    target_features_dict, _ = extract_features(model_1_ema,
                                               tar_cluster_loader,
                                               print_freq=100)
    target_features = F.normalize(torch.stack(
        list(target_features_dict.values())),
                                  dim=1)

    # Calculate distance
    print('==> Create pseudo labels for unlabeled target domain')

    rerank_dist = compute_jaccard_distance(target_features,
                                           k1=args.k1,
                                           k2=args.k2)
    del target_features
    if (epoch == 0):
        # DBSCAN cluster
        eps = 0.6  # 0.6
        print('Clustering criterion: eps: {:.3f}'.format(eps))
        cluster = DBSCAN(eps=eps,
                         min_samples=4,
                         metric='precomputed',
                         n_jobs=-1)

    # select & cluster images as training set of this epochs
    pseudo_labels = cluster.fit_predict(rerank_dist)

    # num_ids = len(set(pseudo_labels)) - (1 if -1 in pseudo_labels else 0)
    plabel = []
    new_dataset = []
    for i, (item, label) in enumerate(zip(dataset_target.train,
                                          pseudo_labels)):

        if label == -1:
            continue
        plabel.append(label)
        new_dataset.append((item[0], label, item[-1]))

    target_label = [plabel]
    ncs = [len(set(plabel)) + 1]
    print('new class are {}, length of new dataset is {}'.format(
        ncs, len(new_dataset)))
    model_1.module.classifier0_3500 = nn.Linear(2048,
                                                ncs[0] + source_classes,
                                                bias=False).cuda()
    model_1_ema.module.classifier0_3500 = nn.Linear(2048,
                                                    ncs[0] + source_classes,
                                                    bias=False).cuda()

    model_1.module.classifier3_0_3500 = nn.Linear(1024,
                                                  ncs[0] + source_classes,
                                                  bias=False).cuda()
    model_1_ema.module.classifier3_0_3500 = nn.Linear(1024,
                                                      ncs[0] + source_classes,
                                                      bias=False).cuda()
    print(model_1.module.classifier0_3500)
    # if epoch !=0:
    #     model_1.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda())
    #     model_1_ema.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda())

    # Initialize source-domain class centroids
    print("==> Initialize source-domain class centroids in the hybrid memory")
    source_features, _ = extract_features(model_1,
                                          sour_cluster_loader,
                                          print_freq=50)
    sour_fea_dict = collections.defaultdict(list)
    print("==> Ending source-domain class centroids in the hybrid memory")
    for f, pid, _ in sorted(dataset_source.train):
        sour_fea_dict[pid].append(source_features[f].unsqueeze(0))
    source_centers = [
        torch.cat(sour_fea_dict[pid], 0).mean(0)
        for pid in sorted(sour_fea_dict.keys())
    ]
    source_centers = torch.stack(source_centers, 0)
    source_centers = F.normalize(source_centers, dim=1)
    del sour_fea_dict, source_features, sour_cluster_loader

    # Evaluator
    evaluator_1 = Evaluator(model_1)
    evaluator_1_ema = Evaluator(model_1_ema)

    clusters = [args.num_clusters] * args.epochs  # TODO: dropout clusters

    k_memory = 8192
    contrast = onlinememory(2048,
                            len(new_dataset),
                            sour_numclass=source_classes,
                            K=k_memory + source_classes,
                            index2label=target_label,
                            choice_c=args.choice_c,
                            T=0.07,
                            use_softmax=True).cuda()
    contrast.index_memory = torch.cat(
        (torch.arange(source_classes), -1 * torch.ones(k_memory).long()),
        dim=0).cuda()
    contrast.memory = torch.cat((source_centers, torch.rand(k_memory, 2048)),
                                dim=0).cuda()

    tar_selflabel_loader = get_test_loader(dataset_target,
                                           args.height,
                                           args.width,
                                           args.batch_size,
                                           args.workers,
                                           testset=new_dataset)

    o = Optimizer(target_label,
                  dis_gt=distribution,
                  m=model_1,
                  ncl=ncs,
                  t_loader=tar_selflabel_loader,
                  N=len(new_dataset),
                  fc_len=fc_len)

    uncertainty = collections.defaultdict(list)
    print("Training begining~~~~~~!!!!!!!!!")
    for epoch in range(len(clusters)):

        iters_ = 300 if epoch % 1 == 0 else iters
        if epoch % 6 == 0 and epoch != 0:
            target_features_dict, _ = extract_features(model_1_ema,
                                                       tar_cluster_loader,
                                                       print_freq=50)

            target_features = torch.stack(list(target_features_dict.values()))
            target_features = F.normalize(target_features, dim=1)

            print('==> Create pseudo labels for unlabeled target domain with')
            rerank_dist = compute_jaccard_distance(target_features,
                                                   k1=args.k1,
                                                   k2=args.k2)

            # select & cluster images as training set of this epochs
            pseudo_labels = cluster.fit_predict(rerank_dist)
            plabel = []

            new_dataset = []

            for i, (item, label) in enumerate(
                    zip(dataset_target.train, pseudo_labels)):
                if label == -1: continue
                plabel.append(label)
                new_dataset.append((item[0], label, item[-1]))

            target_label = [plabel]
            ncs = [len(set(plabel)) + 1]

            tar_selflabel_loader = get_test_loader(dataset_target,
                                                   args.height,
                                                   args.width,
                                                   args.batch_size,
                                                   args.workers,
                                                   testset=new_dataset)
            o = Optimizer(target_label,
                          dis_gt=distribution,
                          m=model_1,
                          ncl=ncs,
                          t_loader=tar_selflabel_loader,
                          N=len(new_dataset),
                          fc_len=fc_len)

            contrast.index_memory = torch.cat(
                (torch.arange(source_classes),
                 -1 * torch.ones(k_memory).long()),
                dim=0).cuda()

            model_1.module.classifier0_3500 = nn.Linear(2048,
                                                        ncs[0] +
                                                        source_classes,
                                                        bias=False).cuda()
            model_1_ema.module.classifier0_3500 = nn.Linear(2048,
                                                            ncs[0] +
                                                            source_classes,
                                                            bias=False).cuda()

            model_1.module.classifier3_0_3500 = nn.Linear(1024,
                                                          ncs[0] +
                                                          source_classes,
                                                          bias=False).cuda()
            model_1_ema.module.classifier3_0_3500 = nn.Linear(
                1024, ncs[0] + source_classes, bias=False).cuda()
            print(model_1.module.classifier0_3500)
            # if epoch !=0:
            #     model_1.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda())
            #     model_1_ema.module.classifier0_3500.weight.data.copy_(torch.from_numpy(normalize(target_centers,axis=1)).float().cuda())

        target_label_o = o.L
        target_label = [
            list(np.asarray(target_label_o[0].data.cpu()) + source_classes)
        ]
        contrast.index2label = [[i for i in range(source_classes)] +
                                target_label[0]]

        # change pseudo labels
        for i in range(len(new_dataset)):
            new_dataset[i] = list(new_dataset[i])
            for j in range(len(ncs)):
                new_dataset[i][j + 1] = int(target_label[j][i])
            new_dataset[i] = tuple(new_dataset[i])

        cc = args.choice_c  #(args.choice_c+1)%len(ncs)
        train_loader_target = get_train_loader(dataset_target, args.height,
                                               args.width, cc, args.batch_size,
                                               args.workers,
                                               args.num_instances, iters_,
                                               new_dataset)

        # Optimizer
        params = []
        flag = 1.0
        # if 20<epoch<=40 or 60<epoch<=80 or 120<epoch:
        #     flag=0.1
        # else:
        #     flag=1.0

        for key, value in model_1.named_parameters():
            if not value.requires_grad:
                print(key)
                continue
            params += [{
                "params": [value],
                "lr": args.lr * flag,
                "weight_decay": args.weight_decay
            }]

        optimizer = torch.optim.Adam(params)

        # Trainer
        trainer = DbscanBaseTrainer(model_1,
                                    model_1_ema,
                                    contrast,
                                    num_cluster=ncs,
                                    alpha=args.alpha,
                                    fc_len=fc_len)

        train_loader_target.new_epoch()
        train_loader_source.new_epoch()

        trainer.train(epoch,
                      train_loader_target,
                      train_loader_source,
                      optimizer,
                      args.choice_c,
                      print_freq=args.print_freq,
                      train_iters=iters_)

        def save_model(model_ema, is_best, best_mAP, mid):
            save_checkpoint(
                {
                    'state_dict': model_ema.state_dict(),
                    'epoch': epoch + 1,
                    'best_mAP': best_mAP,
                },
                is_best,
                fpath=osp.join(args.logs_dir,
                               'model' + str(mid) + '_checkpoint.pth.tar'))

        if epoch == 20:
            args.eval_step = 2
        elif epoch == 40:
            args.eval_step = 1
        if ((epoch + 1) % args.eval_step == 0 or (epoch == args.epochs - 1)):
            mAP_1 = 0  #evaluator_1.evaluate(test_loader_target, dataset_target.query, dataset_target.gallery,
            #          cmc_flag=False)

            mAP_2 = evaluator_1_ema.evaluate(test_loader_target,
                                             dataset_target.query,
                                             dataset_target.gallery,
                                             cmc_flag=False)
            is_best = (mAP_1 > best_mAP) or (mAP_2 > best_mAP)
            best_mAP = max(mAP_1, mAP_2, best_mAP)
            save_model(model_1, (is_best), best_mAP, 1)
            save_model(model_1_ema, (is_best and (mAP_1 <= mAP_2)), best_mAP,
                       2)

            print(
                '\n * Finished epoch {:3d}  model no.1 mAP: {:5.1%} model no.2 mAP: {:5.1%}  best: {:5.1%}{}\n'
                .format(epoch, mAP_1, mAP_2, best_mAP,
                        ' *' if is_best else ''))
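
# Standalone sketch (not part of main_worker) of the key DBSCAN pattern used above:
# clustering on a precomputed distance matrix via metric='precomputed'. Here an
# ordinary Euclidean distance matrix stands in for the Jaccard re-ranking distance.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
features = rng.normal(size=(200, 16))  # stand-in for extracted embeddings
dist = pairwise_distances(features, metric='euclidean')

labels_demo = DBSCAN(eps=5.0, min_samples=4, metric='precomputed',
                     n_jobs=-1).fit_predict(dist)
print('clusters:', len(set(labels_demo)) - (1 if -1 in labels_demo else 0),
      'noise points:', list(labels_demo).count(-1))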
Exemplo n.º 46
0
import matplotlib.pyplot as plt
import numpy
from matplotlib.colors import ListedColormap
from sklearn.cluster import KMeans, DBSCAN

f = open("dataset3.txt", 'r', encoding='utf-8')
data = []
labels_true = []
for line in f.readlines():
    item = line.split(',')
    x = float(item[0])
    y = float(item[1])
    labels_true.append(int(item[2]) - 1)
    t = [x, y]
    data.append(t)
X = numpy.array(data)
k = KMeans(n_clusters=5).fit(X)
label = k.labels_
print(label)
print(labels_true)
colors = ListedColormap(
    ['#FF0000', '#00FF00', '#0000FF', '#000000', '#ffcb00'])
plt.scatter(X[:, 0:1], X[:, 1:2], c=label, cmap=colors)
plt.show()
# Ground-truth classes provided with the dataset
plt.scatter(X[:, 0:1], X[:, 1:2], c=labels_true, cmap=colors)
plt.show()
# DBSCAN algorithm
y_pred = DBSCAN(eps=0.1, min_samples=15).fit_predict(X)
plt.scatter(X[:, 0:1], X[:, 1:2], c=y_pred, cmap=colors)
plt.show()
Exemplo n.º 47
0
from sklearn.cluster import DBSCAN
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

df_125 = pd.read_csv(
    r'C:\Users\Max\Desktop\Hackathon_Data\Task 2 - Leadec - StarterKit\IU\Time-series\1_rms_125_2_test.csv'
)
df_125['date'] = pd.to_datetime(df_125['timestamp'])
del df_125['timestamp']

clustering1 = DBSCAN(eps=0.05, min_samples=3).fit(
    np.array(df_125['max_audio']).reshape(-1, 1))
labels = clustering1.labels_
outlier_pos = np.where(labels == -1)[0]

n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print(labels)
print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

x = []
y = []
for pos in outlier_pos:
    x.append(np.array(df_125['max_audio'])[pos])
    y.append(df_125['max_audio'].index[pos])
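
# Possible continuation (not from the original script): overlay the points DBSCAN
# labeled as noise (-1) on the max_audio series, using the x values and outlier
# positions collected above.
plt.plot(df_125['date'], df_125['max_audio'], label='max_audio')
plt.scatter(df_125['date'].iloc[outlier_pos], x, color='red', label='DBSCAN outliers')
plt.legend()
plt.show()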
def runClusterer(clusterer_name,params,data,param_scale='',metricstring=''):
    #print('S2 runClusterer>>>')
    from time import time

    #----------------------------------s1 load the data
    # If data[0] is a string, load data[0], data[1]: the training data and the label location
    if isinstance(data[0],str):
        X,y,size = loadPictureData(data[0],data[1],data[2])
        SX = X
    # Otherwise the data are already usable vectors, so just unpack the components directly
    else:
        X,SX,y,size = data
    #print('S2 data load done')
    #----------------------------------s2 parameter scaling
    # params: (5,10,) param_scale: (1,100,)
    # true params: (5,0.1,)
    # Suggestion: divide the meanshift / dbscan eps by a factor of 10
    if param_scale != '':
        params = list(params)
        for i in range(0,len(params)):
            params[i] /= param_scale[i]

    #s2 choose the clusterer
    #kmeans requires k to be specified
    if clusterer_name == 'kmeans':
        from sklearn.cluster import KMeans
        clusterer = KMeans(init='k-means++', n_clusters=int(params[0]), n_init=10)
        ms = 'sc'
    elif clusterer_name == 'dbscan':        
        from sklearn.cluster import DBSCAN
        # 0.5, 10 -- note: eps has already been scaled down by param_scale
        clusterer = DBSCAN(eps=params[0], min_samples=int(params[1]))
        ms = 'sc'
    #birch requires k to be specified
    elif clusterer_name == 'birch':
        # None,0.5,50
        from sklearn.cluster import Birch
        clusterer = Birch(n_clusters = params[0], threshold = params[1], branching_factor = params[2])
        ms = 'sc'
    #optics 
    elif clusterer_name == 'optics':
        from sklearn.cluster import OPTICS
        clusterer = OPTICS(min_samples=int(params[0]))#,xi=params[1],min_cluster_size=params[2])
        #OPTICS(min_samples = 10, xi = 0.05, min_cluster_size = 0.05)
        ms = 'sc'
    #Spectral requires k to be specified
    elif clusterer_name == 'spectral':
        pass
        #clusterer = SpectralClustering(n_clusters = params[0], assign_labels = params[1], random_state = params[2])
    elif clusterer_name == 'hierarch':
        from sklearn.cluster import AgglomerativeClustering
        #clusterer = AgglomerativeClustering(n_clusters=params[0],affinity=params[1],linkage=params[2])#'canberra',linkage='complete')
        clusterer = AgglomerativeClustering(n_clusters=int(params[0]), affinity='euclidean', memory=None, connectivity=None, compute_full_tree='auto', linkage='average')#, distance_threshold=None)
        ms = 'sc'
    elif clusterer_name == 'meanshift':
        from sklearn.cluster import MeanShift,estimate_bandwidth
        #0.2,500
        bandwidth = estimate_bandwidth(X, quantile=params[0], n_samples=params[1])
        clusterer = MeanShift(bandwidth=bandwidth, bin_seeding=True) 
        ms = 'sc'
    else:
        print('no cluster name specify')
        import sys
        sys.exit(0)

    if metricstring == '':
        metricstring = ms
    #s3 run the clustering
    t0 = time()
    clusterer.fit(X)
    t1 = time()
    
    infoDict = {'clusterer':clusterer,'clusterer_name':clusterer_name,'params':params,'metricstring':metricstring}
    # the clusterer, its name, its parameters, and the metric string
    dataDict = {'X':X,'SX':SX,'y':y,'size':size}
    # dictionary storing the data (X, SX, y, size)
    performanceDict = {'time':t1-t0,'clusters_num':max(clusterer.labels_)+1}
    # dictionary storing performance; for now the runtime and number of clusters
    clusterer_container = {'info':infoDict ,'data':dataDict,'performance':performanceDict}    
    #print('S4 done.<<<')
    return clusterer_container
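
# A usage sketch for runClusterer (not in the original module). The data tuple follows
# the (X, SX, y, size) unpacking in the function, and param_scale=(10, 1) exercises the
# eps-scaling convention described in the comments: params=(5, 10) becomes eps=0.5,
# min_samples=10.
from sklearn.datasets import make_blobs

X_demo, y_demo = make_blobs(n_samples=300, centers=3, cluster_std=0.5, random_state=0)
demo_data = (X_demo, X_demo, y_demo, (300, 2))

container = runClusterer('dbscan', (5, 10), demo_data, param_scale=(10, 1))
print(container['performance'])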
Exemplo n.º 49
0
def clustering(feature, eps, minPoints):
    dbscan = DBSCAN(eps=eps, min_samples=minPoints)
    dbscan.fit(feature)
    pred = dbscan.labels_
    return pred
Exemplo n.º 50
0
def touchdowns(image, n):
    """
    Function to obtain the locations of the touchdown passes from the image
    of the pass chart, using k-means together with DBSCAN to handle the difficulty of
    extracting touchdown passes, which are the same color as both the line of
    scrimmage and the attached touchdown trajectory lines.
    
    Input: 
        image: image from the folder 'Cleaned_Pass_Charts'
        n: number of touchdowns, from the corresponding data of the image
    Return:
        call to map_pass_locations:
            centers: list of pass locations in pixels
            col: width of image from which the pass locations were extracted
            pass_type: "TOUCHDOWN"
    """

    im = Image.open(image)
    pix = im.load()
    col, row = im.size

    img = Image.new('RGB', (col, row), 'black')
    p = img.load()

    for i in range(col):
        for j in range(row):
            r = pix[i, j][0]
            g = pix[i, j][1]
            b = pix[i, j][2]
            if (col < 1370) and (j < row - 105) and (j > row - 111):
                if (b > 2 * g) and (b > 60):
                    p[i, j] = (0, 0, 0)
            elif (col > 1370) and (j < row - 81) and (j > row - 86):
                if (b > 2 * g) and (b > 60):
                    p[i, j] = (0, 0, 0)
            else:
                p[i, j] = pix[i, j]
            r = p[i, j][0]
            g = p[i, j][1]
            b = p[i, j][2]
            f = ((r - 20)**2 + (g - 80)**2 + (b - 200)**2)**0.5
            if f < 32 and b > 100:
                p[i, j] = (255, 255, 0)

    scipy.misc.imsave('temp.jpg', img)
    imag = cv2.imread('temp.jpg')
    os.remove('temp.jpg')
    hsv = cv2.cvtColor(imag, cv2.COLOR_BGR2HSV)
    lower = np.array([20, 100, 100])
    upper = np.array([30, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)
    res = cv2.bitwise_and(imag, imag, mask=mask)
    res = cv2.cvtColor(res, cv2.COLOR_HSV2RGB)
    res = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
    res = cv2.fastNlMeansDenoising(res, h=10)
    x = np.where(res != 0)[0]
    y = np.where(res != 0)[1]
    pairs = list(zip(x, y))
    X = [list(p) for p in pairs]

    if (len(pairs) != 0):
        db = DBSCAN(eps=10, min_samples=n).fit(X)
        labels = db.labels_
        coords = pd.DataFrame([x, y, labels]).T
        coords.columns = ['x', 'y', 'label']
        clusters = Counter(labels).most_common(n)
        td_labels = np.array([clust[0] for clust in clusters])
        km_coords = coords.loc[coords['label'].isin(td_labels)]
        km = [list(p) for p in zip(km_coords.iloc[:, 0], km_coords.iloc[:, 1])]

        kmeans = KMeans(n_clusters=n, random_state=0).fit(km)
        centers = kmeans.cluster_centers_

        return map_pass_locations(centers, col, "TOUCHDOWN")

    else:
        return map_pass_locations([], col, "TOUCHDOWN", n)
Exemplo n.º 51
0
def cps(img, points, lines):
    """
    Chessboard position search in the given image.

    :param img: Image to search.
    :param points: Points obtained in laps.
    :param lines: Lines detected by slid.
    :return: The four inner points of the detected chessboard.
    """
    ptp_cache = {}

    def ptp_distance(a, b):
        """
        Distance from point to point with a cache to avoid multiple
        calculations.
        """
        idx = hash("__dis" + str(a) + str(b))
        if idx in ptp_cache:
            return ptp_cache[idx]
        ptp_cache[idx] = math.sqrt((a[0] - b[0])**2 + (a[1] - b[1])**2)
        return ptp_cache[idx]

    points = __check_correctness(__normalize(points), img.shape)

    # Clustering
    __points = {}
    points = __sort_points(points)
    __max = 0
    __points_max = []
    alfa = math.sqrt(cv2.contourArea(np.array(points)) / 49)
    X = DBSCAN(eps=alfa * 4).fit(points)
    for i in range(len(points)):
        __points[i] = []
    for i in range(len(points)):
        if X.labels_[i] != -1:
            __points[X.labels_[i]].append(points[i])
    for i in range(len(points)):
        if len(__points[i]) > __max:
            __max = len(__points[i])
            __points_max = __points[i]

    if len(__points) > 0 and len(points) > 49 / 2:
        points = __points_max

    n = len(points)
    beta = n * (5 / 100)  # beta = n * (100 - (CPS effectiveness))
    alfa = math.sqrt(cv2.contourArea(np.array(points)) / 49)

    # We are looking for the focal point of the cluster
    x = [p[0] for p in points]
    y = [p[1] for p in points]
    centroid = (sum(x) / len(points), sum(y) / len(points))

    def __v(l):
        y_0, x_0 = l[0][0], l[0][1]
        y_1, x_1 = l[1][0], l[1][1]

        x_2 = 0
        t = (x_0 - x_2) / (x_0 - x_1 + 0.0001)
        a = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)][::-1]

        x_2 = img.shape[0]
        t = (x_0 - x_2) / (x_0 - x_1 + 0.0001)
        b = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)][::-1]

        poly1 = __sort_points([[0, 0], [0, img.shape[0]], a, b])
        s1 = __polyscore(np.array(poly1), points, centroid, alfa / 2, beta)
        poly2 = __sort_points(
            [a, b, [img.shape[1], 0], [img.shape[1], img.shape[0]]])
        s2 = __polyscore(np.array(poly2), points, centroid, alfa / 2, beta)

        return [a, b], s1, s2

    def __h(l):
        x_0, y_0 = l[0][0], l[0][1]
        x_1, y_1 = l[1][0], l[1][1]

        x_2 = 0
        t = (x_0 - x_2) / (x_0 - x_1 + 0.0001)
        a = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)]

        x_2 = img.shape[1]
        t = (x_0 - x_2) / (x_0 - x_1 + 0.0001)
        b = [int((1 - t) * x_0 + t * x_1), int((1 - t) * y_0 + t * y_1)]

        poly1 = __sort_points([[0, 0], [img.shape[1], 0], a, b])
        s1 = __polyscore(np.array(poly1), points, centroid, alfa / 2, beta)
        poly2 = __sort_points(
            [a, b, [0, img.shape[0]], [img.shape[1], img.shape[0]]])
        s2 = __polyscore(np.array(poly2), points, centroid, alfa / 2, beta)

        return [a, b], s1, s2

    pregroup = [[], []]  # Division into 2 groups (for the frame)
    for l in lines:  # We will review all of the lines
        # We reject lines that pass through the center of the cluster
        if __ptl_distance(l, centroid, ptp_distance(*l)) > alfa * 2.5:
            for p in points:
                # We check that the line passes near a good point
                if __ptl_distance(l, p, ptp_distance(*l)) < alfa:
                    # The line belongs to the ring
                    tx, ty = l[0][0] - l[1][0], l[0][1] - l[1][1]
                    if abs(tx) < abs(ty):
                        ll, s1, s2 = __v(l)
                        orientation = 0
                    else:
                        ll, s1, s2 = __h(l)
                        orientation = 1
                    if s1 == 0 and s2 == 0:
                        continue
                    pregroup[orientation].append(ll)

    pregroup[0] = __remove_duplicates(pregroup[0])
    pregroup[1] = __remove_duplicates(pregroup[1])

    if debug.DEBUG:
        # We create an outer ring
        def convex_approx(points, alfa=0.01):
            points = np.array(points)
            hull = ConvexHull(points).vertices
            cnt = points[hull]
            approx = cv2.approxPolyDP(cnt, alfa * cv2.arcLength(cnt, True),
                                      True)
            return __normalize(itertools.chain(*approx))

        ring = convex_approx(__sort_points(points))

        debug.DebugImage(img) \
            .lines(lines, color=(0, 0, 255)) \
            .points(points, color=(0, 0, 255)) \
            .points(ring, color=(0, 255, 0)) \
            .points([centroid], color=(255, 0, 0)) \
            .save("cps_debug")

        debug.DebugImage(img) \
            .lines(pregroup[0], color=(0, 0, 255)) \
            .lines(pregroup[1], color=(255, 0, 0)) \
            .save("cps_pregroups")

    score = {}  # Frame ranking with the result
    for v in itertools.combinations(pregroup[0], 2):  # Horizontal
        for h in itertools.combinations(pregroup[1], 2):  # Vertical
            poly = [
                __intersection(v[0], v[1]),
                __intersection(v[0], h[0]),
                __intersection(v[0], h[1]),
                __intersection(v[1], h[0]),
                __intersection(v[1], h[1]),
                __intersection(h[0], h[1])
            ]
            poly = __check_correctness(poly, img.shape)
            if len(poly) != 4:
                continue
            poly = np.array(__sort_points(__normalize(poly)))
            if not cv2.isContourConvex(poly):
                continue
            score[-__polyscore(poly, points, centroid, alfa / 2, beta)] = poly

    score = collections.OrderedDict(sorted(score.items()))
    K = next(iter(score))

    inner_points = __normalize(score[K])
    inner_points = __order_points(inner_points)

    debug.DebugImage(img) \
        .points(points, color=(0, 255, 0)) \
        .points(inner_points, color=(0, 0, 255)) \
        .points([centroid], color=(255, 0, 0)) \
        .lines([[inner_points[0], inner_points[1]],
                [inner_points[1], inner_points[2]],
                [inner_points[2], inner_points[3]],
                [inner_points[3], inner_points[0]]],
               color=(255, 255, 255)) \
        .save("cps_debug_2")

    return __padcrop(img, inner_points)
Exemplo n.º 52
0
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn import datasets

x, y = datasets.make_moons(n_samples=1500, noise=0.09)
plt.scatter(x[:, 0], x[:, 1], s=5)

cores = np.array(['red', 'blue'])

# KMeans
kmeans = KMeans(n_clusters=2)
previsoes = kmeans.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], color=cores[previsoes])

# Hierarchical (agglomerative)
hc = AgglomerativeClustering(n_clusters=2,
                             affinity='euclidean',
                             linkage='ward')
previsoes = hc.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], color=cores[previsoes])

# DBSCAN
dbscan = DBSCAN(eps=0.1)
previsoes = dbscan.fit_predict(x)
plt.scatter(x[:, 0], x[:, 1], color=cores[previsoes])
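
# Quantifying what the three scatter plots show (not in the original snippet):
# adjusted Rand index of each clustering against the generated moon labels y.
from sklearn.metrics import adjusted_rand_score

print('KMeans ARI:        %.2f' % adjusted_rand_score(y, kmeans.labels_))
print('Agglomerative ARI: %.2f' % adjusted_rand_score(y, hc.labels_))
print('DBSCAN ARI:        %.2f' % adjusted_rand_score(y, dbscan.labels_))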
Exemplo n.º 53
0
def plot_cluster_map(X,
                     quantile_group,
                     range_low,
                     range_high,
                     range_axis,
                     data_column_i,
                     OUTPUT_CHARTS_DIR,
                     number_to_name_dict={},
                     eps=4,
                     min_samples=10,
                     save_plot=True):

    import numpy as np
    import pandas as pd
    from sklearn.cluster import DBSCAN
    import os
    from sklearn import metrics
    from sklearn.preprocessing import StandardScaler
    # #############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    n_clusters_points_ = len(X) - n_noise_

    if n_clusters_ > 0:
        print('Data Column Name: %s' % data_column_i)
        print('Quantile Group: %s' % quantile_group)
        print('Estimated number of clusters: %d' % n_clusters_)
        print('Estimated number of clusters points: %d' % n_clusters_points_)
        print('Estimated number of noise points: %d' % n_noise_)
        print('Range between : %s and %s' % (range_low, range_high))
        # #############################################################################
        # Plot result
        if save_plot:
            import matplotlib.pyplot as plt

            # Black removed and is used for noise instead.
            unique_labels = set(labels)
            colors = [
                plt.cm.Spectral(each)
                for each in np.linspace(0, 1, len(unique_labels))
            ]
            for k, col in zip(unique_labels, colors):
                if k == -1:
                    # Black used for noise.
                    col = [0, 0, 0, 1]

                class_member_mask = (labels == k)

                xy = X[class_member_mask & core_samples_mask]
                plt.plot(xy[:, 0],
                         xy[:, 1],
                         '.',
                         color=tuple(col),
                         markersize=3)

            # xy = X[class_member_mask & ~core_samples_mask]
            # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
            #		 markeredgecolor='k', markersize=1)

            plt.title('Between %s and %s : %d clusters and % d points' %
                      (round(range_low, 3), round(
                          range_high, 3), n_clusters_, n_clusters_points_))
            plt.axis(range_axis)

            F = plt.gcf()
            chart_name = data_column_i
            if bool(number_to_name_dict):
                if str(data_column_i) in number_to_name_dict:
                    chart_name = number_to_name_dict[str(data_column_i)]

            F.savefig(os.path.join(
                OUTPUT_CHARTS_DIR,
                '{0}_quantile_group_{1}.png'.format(chart_name,
                                                    quantile_group)),
                      dpi=(500))
            # plt.show()
            plt.clf()
    return (labels)
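
# A usage sketch for plot_cluster_map (not in the original module); the two-blob data
# and every argument value below are placeholders chosen to match the signature above.
import numpy as np

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.normal(loc=0.0, scale=1.0, size=(200, 2)),
                    rng.normal(loc=20.0, scale=1.0, size=(200, 2))])

demo_labels = plot_cluster_map(X_demo,
                               quantile_group=1,
                               range_low=float(X_demo.min()),
                               range_high=float(X_demo.max()),
                               range_axis=[-5, 25, -5, 25],
                               data_column_i='demo_column',
                               OUTPUT_CHARTS_DIR='.',
                               eps=1.0,
                               min_samples=10,
                               save_plot=False)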
Exemplo n.º 54
0
def ji_cluster_dbscan(df_trades,
                      cccy,
                      cdate,
                      alltime,
                      allorderIdNum,
                      wstart=3,
                      wend=8,
                      max_dist=0.25,
                      showplot=1,
                      showtext=0):
    """Clustering by time and orderIdNum(converted to AscII). Converting to AcsII will make to clusters that have similar
       IDs on the left-side.
    - Density-Based Spatial Clustering and Application with Noise (DBSCAN) is used since it is appropriate for finding
       dense trades clusters. Clan is defined around the center based on distance btwn samples. So it will cluster samples
       that are close each other.
    - It seems to work well with small and large sample sizes, thus more robust. """
    res = matlab_like()

    # alltime = df_trades.orderstart.apply(ji_time_str2sec).values # Apply outside once for efficiency
    allorderId = df_trades.orderid.values
    # allorderIdNum = df_trades.orderid.apply(ji_nlp_word2asc,n=wend-wstart+1).values

    # Particular currency and date
    mask = (df_trades.ccy == cccy) & (df_trades.trdate == cdate)
    masked_orders = allorderIdNum[mask]
    corderIds = [s[wstart:wend] for s in allorderId[mask]]  # for visualization

    masked_time = alltime[mask]
    X = [[i, j]
         for i, j in zip(masked_time, masked_orders)]  # seconds, Asc(orderId)

    # #############################################################################
    # Generate sample data
    # centers = [[1, 1], [-1, -1], [1, -1]]
    # X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
    #                            random_state=0)

    # Standardize
    do_scale = 1
    X0 = np.asarray(X)
    if do_scale:
        X = StandardScaler().fit_transform(X)
    else:
        X = np.asarray(X)

    # #############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=max_dist, min_samples=3).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Store
    res.labels = labels
    res.label_info = '-1 is for the noise, not counted as cluster'
    res.n_clusters = n_clusters_
    res.db = db

    if showplot:
        print(
            '******************* DBSCAN using feats: time, orderIdNum ********************'
        )
        # #############################################################################
        # Plot result

        # Black removed and is used for noise instead.
        unique_labels = set(labels)
        # colors = [plt.cm.Spectral(each) # Sample colormaps: 'PiYG', 'PRGn', 'BrBG', 'PuOr', 'RdGy', 'RdBu',
        #                               'RdYlBu', 'RdYlGn', 'Spectral', 'coolwarm', 'bwr', 'seismic'
        colors = [
            plt.cm.RdYlBu(each)
            for each in np.linspace(0, 1, len(unique_labels))
        ]
        plt.figure(figsize=(20, 10))
        cont = 0
        mycolors = [
            '#ff2ff0', '#ff880f', '#ff0000', '#00ff00', '#0000ff', '#ff00ff',
            '#00ffff', '#ff0088', '#ff8800', '#0088ff'
        ]
        for k, col in zip(unique_labels, colors):
            if k == -1:
                # Black used for noise.
                col = [0, 0, 0, 1]

            class_member_mask = (labels == k)

            # k>=0) standard clusters
            xy = X[class_member_mask & core_samples_mask]
            xy0 = X0[class_member_mask & core_samples_mask]
            # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
            #         markeredgecolor='k', markersize=14)

            if showtext:
                plt.plot(xy0[:, 0] / 3600,
                         xy[:, 1],
                         '.',
                         markerfacecolor=mycolors[cont % 10],
                         markeredgecolor='y',
                         markersize=1)
                # add annotation
                # print('k:',k,xy0[:, 0]/3600, xy[:, 1])
                for cx, cy in zip(xy0[:, 0] / 3600, xy[:, 1]):
                    plt.annotate(k, (cx, cy),
                                 horizontalalignment='center',
                                 verticalalignment='center',
                                 fontsize=20,
                                 color=mycolors[cont % 10])
            else:
                plt.plot(xy0[:, 0] / 3600,
                         xy[:, 1],
                         'o',
                         markerfacecolor=mycolors[cont % 10],
                         markeredgecolor='k',
                         markersize=14)

            # -1) noise clusters
            xy = X[class_member_mask & ~core_samples_mask]
            xy0 = X0[class_member_mask & ~core_samples_mask]
            plt.plot(xy0[:, 0] / 3600,
                     xy[:, 1],
                     'o',
                     markerfacecolor=tuple(col),
                     markeredgecolor='k',
                     markersize=6)

            # plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=mycolors[cont%8],
            #         markeredgecolor='k', markersize=6)
            cont += 1

        # plt.title('Estimated number of clusters: %d' % n_clusters_)
        plt.xlabel('time(hours)', size=20)
        plt.ylabel('orderId Group (ascII)', size=20)
        plt.title('%s: %s (%d trades, %d clusters) - DBSCAN(time,orderId)' %
                  (cccy, cdate, len(masked_time), n_clusters_),
                  size=20)
        plt.show()

        print('******************')
        for i, j, k in zip(corderIds, labels, masked_time):
            print('%s: %d (%1.1f hours)' % (i, j, k / 3600))
        print('****************** (END) ********************** \n')

    return res
Exemplo n.º 55
0
    readCSV = csv.reader(csvdata, delimiter=',')
    next(readCSV)  # Skipping Header content
    for row in readCSV:
        for i in entityIndex:
            if row[i] not in Attributes[str(i)]:
                Attributes[str(i)][row[i]] = len(Attributes[str(i)].keys())
        datarow = []
        try:
            for i in entityIndex:
                datarow.append(Attributes[str(i)][row[i]])
            dataAttributes.append(numpy.asarray(datarow))
        except ValueError:
            pass
feats = ["Age", "Number of Cigarettes per Day"]

print("Features: ", feats)

print("Done Loading Data\n")

# seperating test and training data
dataAttributes = numpy.asarray(dataAttributes)

print("DBSCAN Clustering Started")
DBSCluster = DBSCAN(eps=1, min_samples=150).fit(dataAttributes)
print("DBSCAN Clustering Finished\n")

# adding 10 as the first key it will take to b 0
pyplot.scatter(dataAttributes.T[0] + 10,
               dataAttributes.T[1],
               c=DBSCluster.labels_)
pyplot.show()
Exemplo n.º 56
0
    data_scale = 50

    gm_centers = np.random.rand(class_num, feat_dim) * data_scale # uniform, (0 ,1)
    gm_stds = np.random.rand(class_num) * 0.5 + 1
    gm_colors = np.random.rand(class_num*3, 3) * 0.8 + 0.2

    gm_X, gm_y = make_blobs(n_samples = data_amout, n_features = feat_dim, centers = gm_centers,
                  cluster_std = gm_stds , random_state = 9)

    gm_X = gm_X[:, :select_feat_dim]
    print("[info] data amout: %04d data dim: %04d"%(gm_X.shape[0], gm_X.shape[1]))

    plt.scatter(gm_X[:, 0], gm_X[:, 1], marker='o', s = 10) 
    plt.scatter(gm_centers[:, 0], gm_centers[:, 1], marker='x', s = 25, c = "r") 
    plt.title("Orignal Blob Dist(First 2 dims). ")
    plt.show()

    gm_cluster = DBSCAN(eps = 9.5, min_samples = 10)
    # gm_cluster.fit(gm_X) # training 

    y_pred = gm_cluster.fit_predict(gm_X) # or gm_cluster.labels_
    n_clusters_pred = len(set(y_pred)) - (1 if -1 in y_pred else 0)  # exclude the noise label
    # center_pred = gm_cluster.cluster_centers_

    plt.scatter(gm_X[:, 0], gm_X[:, 1], c = gm_colors[y_pred.tolist()])
    plt.scatter(gm_centers[:, 0], gm_centers[:, 1], marker='x', s = 25, c = "r") 
    # plt.scatter(center_pred[:, 0], center_pred[:, 1], marker='x', s = 25, c = "k") 
    plt.title("DBSCAN Cluster. use feat_dim = %d, eps = %.1f, #class = %d"%(select_feat_dim, gm_cluster.eps, n_clusters_pred))
    plt.show()
Exemplo n.º 57
0
]

to_revert = test.groupby(['Labels'])[['1stPolYear', 'BirthYear',
                                      'GrossMthSalary', 'GeoLivArea',
                                      'HasChild']].mean()
#to_revert = to_revert.loc[:,-'Index']

my_scaler.inverse_transform(X=to_revert)
test['Labels'].value_counts()

#### DBSCAN

from sklearn.cluster import DBSCAN
from sklearn import metrics

db = DBSCAN(eps=0.2, min_samples=5).fit(test)

labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

unique_clusters, counts_clusters = np.unique(db.labels_, return_counts=True)
print(np.asarray((unique_clusters, counts_clusters)))

from sklearn.decomposition import PCA
pca = PCA(n_components=None).fit(test)
pca_2d = pca.transform(test)
explained_variance = pca.explained_variance_ratio_

from sklearn.decomposition import PCA
Exemplo n.º 58
0
plt.scatter(X[:, 0], X[:, 1], marker='o', c=label_color, s=25, edgecolor='k')
plt.show()

print('### KMEANS CORRECTNESS')
f.benchmark(labels, y)

NN = NearestNeighbors(n_neighbors=int(np.log(len(X)))).fit(X)
distances, indices = NN.kneighbors(X)

fig = plt.figure()
plt.plot(np.sort(distances[:, distances.shape[1] - 1]), color='red')

###### S1 DBSCAN ######
eps = 27000
min_samples = np.log(len(X))
db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
labels = db.labels_
label_color = [f.LABEL_COLOR_MAP[l] for l in labels]
unique, counts = np.unique(labels, return_counts=True)
print('#FINAL:' + str(dict(zip(unique, counts))))
fig = plt.figure()
plt.scatter(X[:, 0], X[:, 1], marker='o', c=label_color, s=25, edgecolor='k')
plt.show()

print('### DBSCAN CORRECTNESS')
f.benchmark(labels, y)

###### S1 MYALG #####
n = 25
labels = f.squareBFS(X, n)
print('### MYALG CORRECTNESS')
Exemplo n.º 59
0
    print("***   DBSCAN clustering   ***")
    print("---------------------------------")
    import matplotlib.pyplot as plt
    from sklearn.cluster import DBSCAN

    print("Image shape: ", np.shape(the_image))
    the_image_list = the_image

    print("Image code got from autoencoder")
    image_autoencoded = [
        my_net.getCode(torch.Tensor(point)).detach().numpy()
        for point in the_image_list
    ]

    print("Runing fit function for DBSCAN clustering")
    clust = DBSCAN(eps=3, min_samples=2).fit(image_autoencoded)

    print("Creating list for clastered data")
    clustered_data = np.zeros((100, 100))

    print("Clustered data shape:  ", np.shape(clustered_data))

    x = 0
    y = 0
    for i in range(np.shape(clustered_data)[0] * np.shape(clustered_data)[1]):
        clustered_data[x][y] = clust.labels_[i]
        x = x + 1
        if x == 100:
            x = 0
            y = y + 1
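
    # Vectorized equivalent of the fill loop above (a sketch): the loop advances x
    # fastest and wraps into y, i.e. column-major (Fortran) order over the 100x100 grid.
    clustered_data_vec = clust.labels_[:100 * 100].reshape((100, 100), order='F')
    assert np.array_equal(clustered_data_vec, clustered_data)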
Exemplo n.º 60
0
                 title="sklearn库中kmeans分类后的点集")

    # Classify dots with the hand-written kmeans model
    pred_tags = my_kmeans(dots, class_num)
    scatter_dots(dots,
                 pred_tags,
                 new_plot=False,
                 subplot=223,
                 title="自己训练得到的kmeans分类后的点集")
    plt.show()

    dots1, tags1 = generate_dots2(200)
    scatter_dots(dots1, tags1, new_plot=True, subplot=221, title="Original points")

    # Classify dots with sklearn's DBSCAN
    model1 = DBSCAN(eps=2, min_samples=1)
    db_pred_tags = model1.fit_predict(dots1)
    scatter_dots(dots1,
                 db_pred_tags,
                 new_plot=False,
                 subplot=222,
                 title="sklearn库中dbscan分类后的点集")

    # Classify dots with the hand-written dbscan
    mydb_pred_tags = my_dbscan(dots1, epsilon=2, minSamples=1)
    scatter_dots(dots1,
                 mydb_pred_tags,
                 new_plot=False,
                 subplot=223,
                 title="自己训练得到的dbscan分类后的点集")
    plt.show()