import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Read two position columns from the CSV, skipping the header row.
with open('../../inputfiles/Kaohsiung2014_case.csv') as file:
    lines = file.readlines()

position = []
for data in lines[1:]:
    data = data.split(',')
    if data[9] != "" and data[10] != "":
        position.append([float(data[9]), float(data[10])])
print(position)

position = np.array(position)
print(position)
position = StandardScaler().fit_transform(position)

# DBSCAN
db = DBSCAN(eps=0.15, min_samples=5).fit(position)
# db = DBSCAN().fit(position)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
print(labels)

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(n_clusters_)
# print('Estimated number of clusters: %d' % n_clusters_)
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
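If the two extracted columns are geographic latitude/longitude, standard-scaling them before DBSCAN distorts distances. A minimal alternative sketch (an assumption, not from the original script; the coordinate values and radius are made up) clusters the raw coordinates with the haversine metric, with eps expressed in radians:

import numpy as np
from sklearn.cluster import DBSCAN

# Hypothetical [lat, lon] pairs in degrees standing in for the CSV columns.
coords_deg = np.array([[22.630, 120.300],
                       [22.631, 120.301],
                       [22.632, 120.302],
                       [25.030, 121.560]])

kms_per_radian = 6371.0088            # mean Earth radius in km
eps_km = 1.5                          # hypothetical neighbourhood radius in km
db = DBSCAN(eps=eps_km / kms_per_radian, min_samples=2,
            metric='haversine', algorithm='ball_tree').fit(np.radians(coords_deg))
print(db.labels_)                     # -1 marks noise, as in the script above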
def main(): input_file = args.IN output = args.OUT window_size = args.WINDOW cut_name = args.CUT min_seed_number = args.MINNSEED if cut_name != '99perc' and cut_name != 'weights' and cut_name != 'dummy_cut': cut_file = pkl.load(open(args.CUTfile, 'rb')) df_cut = pd.DataFrame(cut_file) file_root = r.TFile(args.IN, "READ") tree_sig = file_root.Get("events") nentries = tree_sig.GetEntries() print('Entries before ' + str(cut_name), nentries) time, alpha, pitch, L, energy, event_idx, cut = [], [], [], [], [], [], [] for i in np.arange(0, nentries): tree_sig.GetEntry(i) day = float(str(tree_sig.day)[-2:]) if cut_name == '99perc': cut_val = tree_sig.rms99_of_99 cut.append(cut_val) if tree_sig.counts < cut_val: continue elif cut_name == 'weights': cut_val = tree_sig.weight cut.append(cut_val) if cut_val < 2: continue elif cut_name == 'dummy_cut': cut_val = 0 cut.append(cut_val) else: cut_val = df_cut[(df_cut.day.values == day) & (df_cut.L.values == tree_sig.L) & (df_cut.alpha.values == tree_sig.alpha) & (df_cut.energy.values == tree_sig.energy)][cut_name].values[0] cut.append(cut_val) if tree_sig.counts < cut_val: continue time_hour = float(tree_sig.time) + 24. * (day - 1) time_sec = time_hour * 3600. # variales for clustering time.append(time_sec) alpha.append(tree_sig.alpha) L.append(tree_sig.L) energy.append(tree_sig.energy) event_idx.append(i) print('Entries after ' + str(cut_name), len(time)) # clustering algorithm lines X = np.stack([ np.array(time), np.array(alpha) * 10000., np.array(L) * 10000., np.array(energy) * 10000 ], axis=1) clustering = DBSCAN(eps=window_size, metric='euclidean', min_samples=1, n_jobs=-1).fit(X) y_temp = clustering.labels_ y = [] for i_y in y_temp: if i_y != -1: if len(y_temp[y_temp == i_y]) < min_seed_number: y.append(-1) else: y.append(i_y) else: y.append(i_y) y = np.array(y) n_clusters = len(set(y)) - (1 if -1 in y else 0) print('Nclusters ', n_clusters, np.unique(y)) y = y.reshape([len(X), 1]) Xy = np.concatenate((X, np.array(event_idx).reshape([len(X), 1])), axis=1) Xy = np.concatenate((Xy, y), axis=1) # nnumber of good clusters n_clusters = len(set( clustering.labels_)) - (1 if -1 in clustering.labels_ else 0) n_noise = list(clustering.labels_).count(-1) good_cluster_list = np.unique(Xy[Xy[:, -1] != -1][:, -1]) good_cluster_index = np.arange(0, len(good_cluster_list)) cluster_dict = dict(zip(good_cluster_list, good_cluster_index)) start_cluster = [] end_cluster = [] L_cluster = [] alpha_cluster = [] cluster_index = -1 * np.ones(nentries, dtype=int) for cls_i in good_cluster_list: cluster_entries = Xy[Xy[:, -1] == cls_i] start_cluster = cluster_entries[0, 4] end_cluster = cluster_entries[-1, 4] alpha_cluster = cluster_entries[0, 1] / 10000 L_cluster = cluster_entries[0, 2] / 10000 energy_cluster = cluster_entries[0, 3] / 10000 for cls_ev in np.arange(start_cluster, end_cluster + 1): tree_sig.GetEntry(int(cls_ev)) if (tree_sig.L == L_cluster) and ( tree_sig.alpha == alpha_cluster) and (tree_sig.energy == energy_cluster): cluster_index[int(cls_ev)] = int(cluster_dict[int(cls_i)]) file_root.Close() clsnr_b = array('i', [-1]) thr_b = array('d', [0.]) newroot = r.TFile(input_file, "update") t = newroot.Get("events") clsnr_new = t.Branch('cls_idx', clsnr_b, 'cls_idx/I') thr_new = t.Branch('thr_cut', thr_b, 'thr_cut/D') for i in np.arange(0, nentries): t.GetEntry(i) clsnr_b[0] = cluster_index[i] thr_b[0] = cut[i] clsnr_new.Fill() thr_new.Fill() newroot.Write("", r.TObject.kOverwrite) newroot.Close() '''
def check_flights(): URL = "https://www.google.com/flights/explore/#explore;f=JFK,EWR,LGA;t=HND,NRT,TPE,HKG,KIX;s=1;li=8;lx=12;d=2018-04-01" driver = webdriver.PhantomJS() dcap = dict(DesiredCapabilities.PHANTOMJS) dcap["phantomjs.page.settings.userAgent"] = ( "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36" ) driver = webdriver.PhantomJS(desired_capabilities=dcap, executable_path="/usr/local/bin/phantomjs") driver.implicitly_wait(20) driver.get(URL) wait = WebDriverWait(driver, 20) wait.until( EC.visibility_of_element_located((By.CSS_SELECTOR, "div.CTPFVNB-w-e"))) s = BeautifulSoup(driver.page_source, "lxml") best_price_tags = s.findAll('div', 'CTPFVNB-w-e') # check if scrape worked - alert if it fails and shutdown if len(best_price_tags) < 4: print('Failed to Load Page Data') requests.post( 'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN', data={ "value1": "script", "value2": "failed", "value3": "" }) sys.exit(0) else: print('Successfully Loaded Page Data') best_prices = [] for tag in best_price_tags: best_prices.append(int(tag.text.replace('$', ''))) best_price = best_prices[0] best_height_tags = s.findAll('div', 'CTPFVNB-w-f') best_heights = [] for t in best_height_tags: best_heights.append( float(t.attrs['style'].split('height:')[1].replace('px;', ''))) best_height = best_heights[0] # price per pixel of height pph = np.array(best_price) / np.array(best_height) cities = s.findAll('div', 'CTPFVNB-w-o') hlist = [] for bar in cities[0]\ .findAll('div', 'CTPFVNB-w-x'): hlist.append( float(bar['style'].split('height: ')[1].replace('px;', '')) * pph) fares = pd.DataFrame(hlist, columns=['price']) px = [x for x in fares['price']] ff = pd.DataFrame(px, columns=['fare']).reset_index() # begin the clustering X = StandardScaler().fit_transform(ff) db = DBSCAN(eps=1.5, min_samples=1).fit(X) labels = db.labels_ clusters = len(set(labels)) pf = pd.concat([ff, pd.DataFrame(db.labels_, columns=['cluster'])], axis=1) rf = pf.groupby('cluster')['fare'].agg(['min', 'count' ]).sort_values('min', ascending=True) # set up our rules # must have more than one cluster # cluster min must be equal to lowest price fare # cluster size must be less than 10th percentile # cluster must be $100 less the next lowest-priced cluster if clusters > 1 and ff['fare'].min() == rf.iloc[0]['min']\ and rf.iloc[0]['count'] < rf['count'].quantile(.10)\ and rf.iloc[0]['fare'] + 100 < rf.iloc[1]['fare']: city = s.find('span', 'CTPFVNB-v-c').text fare = s.find('div', 'CTPFVNB-w-e').text r = requests.post( 'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN', data={ "value1": city, "value2": fare, "value3": "" }) else: print('no alert triggered')
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from matplotlib import pyplot

# define dataset
X, _ = make_classification(n_samples=1000, n_features=2, n_informative=2,
                           n_redundant=0, n_clusters_per_class=1, random_state=4)
# define the model
model = DBSCAN(eps=0.30, min_samples=9)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()
def avg(x):
    # Guard against an empty list; the original indexed x[0] here,
    # which would raise IndexError.
    if len(x) < 1:
        return 0
    return sum(x) / len(x)


for i in vin_list[:20]:
    X = auto[auto.vin == i]
    Y = X[['vin', 'ignition_time']]
    X = X[['stop_longitude', 'stop_latitude']]
    try:
        db = DBSCAN(eps=0.001, min_samples=5)
        db.fit(X)
        labels = db.labels_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique) - (1 if -1 in labels else 0)
        print("number of estimated clusters : %d" % n_clusters_)
    except Exception:
        print("number of estimated clusters : 0")

# =============================================================================
# # Noise-point evaluation
# ratio = len(labels[labels[:] == -1]) / len(labels)
# print('Noise ratio:', format(ratio, '.2%'))
# =============================================================================
''' Tune the clustering parameters and plot the clusters
rgb_im1 = im1.convert('HSV')
color = rgb_im1.getcolors()

my_list = []
for x in range(im1.size[0]):
    for y in range(im1.size[1]):
        (h, s, v) = rgb_im1.getpixel((x, y))
        if h == 26:
            continue
        if s < 75:
            continue
        if v < 75:
            my_list.append([x, y])

X = np.array(my_list)
clustering = DBSCAN(eps=3, min_samples=2).fit(X)
labels = clustering.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# print(n_clusters_)

clusters = pd.Series([X[labels == n] for n in range(n_clusters_)])
centers = []
for x in clusters:
    center = MultiPoint(x).centroid
    (x, y) = (int(center.x), int(center.y))
    centers.append([x, y])

print('click')
xy = pyautogui.position()
pyautogui.moveTo(random.choice(centers))
pyautogui.click(button='left')
def SampleSelection_v3(setOfPoints,nSamples,returnIndicies=False, nTrials=10, debug=False): """Separating into clusters. Using Convex Hull to select boundary points. Filling the rest by performing random selections """ # from sklearn.mixture import GaussianMixture # model = GaussianMixture(n_components=4) # model.fit(setOfPoints) # yhat =model.predict(setOfPoints) nPoints = setOfPoints.shape[0] from sklearn.preprocessing import MinMaxScaler scaler = MinMaxScaler() data = scaler.fit_transform(setOfPoints) from sklearn.cluster import DBSCAN model = DBSCAN(eps=0.1, min_samples=10) yhat = model.fit_predict(data) clusters=np.unique(yhat) Gindicies = [];GboundaryPoints=[] for cluster in clusters: row_ix = np.where(yhat==cluster) clusterPoints = np.squeeze(setOfPoints[row_ix,:]) if np.unique(clusterPoints,axis=0).shape[0] < 3: GboundaryPoints.append(setOfPoints[row_ix[0][0]]) Gindicies.append(row_ix[0][0]) continue hull = ConvexHull(clusterPoints) indicies = hull.vertices.tolist() boundaryPoints = [];removeIndicies=[] for idx in indicies: if not arreqclose_in_list(setOfPoints[row_ix[0][idx]],GboundaryPoints): GboundaryPoints.append(setOfPoints[row_ix[0][idx]]) else: removeIndicies.append(idx) for idx in removeIndicies: indicies.remove(idx) for idx in indicies: Gindicies.append(row_ix[0][idx]) if debug:print("Finished Calculating Convex Hull of the set. Number of boundary points " + str(len(GboundaryPoints))) if len(Gindicies) >= nSamples: #Perform prunning operation #Removing the entry that lowers the entropy the least while len(Gindicies) != nSamples: worstDist=0 for i in range(len(GboundaryPoints)): dist = TotalAverageDistance(GboundaryPoints.copy().pop(i)) if dist > worstDist: worstDist=dist idx = i GboundaryPoints.pop(idx) Gindicies.pop(idx) if returnIndicies: return Gindicies return GboundaryPoints else: maxDist = 0 for trial in range(nTrials): if debug:print("Begining sampling trial " + str(trial)) points = GboundaryPoints.copy() idx = Gindicies.copy() while len(points) < nSamples: x = randint(0,nPoints-1) if x in idx: continue if arreqclose_in_list(setOfPoints[x],points): continue idx = np.append(idx,x) points.append(setOfPoints[x]) dist = TotalAverageDistance(points) if dist >= maxDist: maxDist=dist bestPoints = points.copy() bestIndicies = idx.copy() if debug: print(maxDist,len(bestPoints),len(bestIndicies)) if returnIndicies: return bestIndicies return bestPoints
D = np.sort(D, axis=0)
minPts = 10
nearest = D[1:(minPts + 1), :]
nearest = nearest.reshape(1, nearest.size)
sort_nearest = np.sort(nearest)
plt.plot(range(len(sort_nearest[0, :])), sort_nearest[0, :],
         linewidth=1.0, marker='x')
# plt.axis([-2, len(sort_nearest[0,:])+1000, -2, max(sort_nearest[0,:])+2])
plt.savefig(cur_file_dir + 'result/' + 'nearest.png')
plt.cla()
plt.clf()
plt.close()

# db = DBSCAN(eps=0.90, min_samples=minPts).fit(X)  # high-dimensional data
db = DBSCAN(eps=30, min_samples=minPts).fit(reduced_data)  # low-dimensional data
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(labels)
print(n_clusters_)
print(len(labels[labels >= 0]))

## Scatter plot
# colors = [c[int(i) % len(c)] for i in labels]
# colors = labels
# plt.scatter(reduced_data[:, 0], reduced_data[:, 1], 20, colors)  # 20 is the marker size
## Show the figure
unique_labels = set(labels)
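The block above builds the k-distance curve by sorting a precomputed distance matrix D. A minimal sketch of the same idea with sklearn.neighbors.NearestNeighbors (an alternative, not part of the original script; the synthetic reduced_data stand-in is hypothetical) avoids materialising the full matrix:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

minPts = 10
# Hypothetical stand-in for the real reduced_data array used above.
reduced_data = np.random.RandomState(0).rand(300, 2) * 100

nn = NearestNeighbors(n_neighbors=minPts + 1).fit(reduced_data)
distances, _ = nn.kneighbors(reduced_data)   # column 0 is each point itself
k_dist = np.sort(distances[:, minPts])       # distance to the minPts-th neighbour
plt.plot(k_dist, marker='x', linewidth=1.0)  # the "elbow" suggests a value for eps
plt.show()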
weight.append(50)
we = np.asarray(weight)
print(we)
print(we.shape)
print(newInp.shape)
print(meantrain.shape)

res = list()
for row in newInp:
    curres = distance.euclidean(row, meantrain, we)
    res.append(curres)
    print("euclidean {}".format(curres))

dbinput = np.asarray(res)
dbinput = [[x, 1] for x in dbinput]
print(dbinput)
stdv = math.ceil(np.std(dbinput))
print(stdv)

model = DBSCAN(eps=int(stdv), min_samples=3).fit(dbinput)
print(model.labels_)

clust = list()
i = 0
for row in model.labels_:
    if row == 0:
        clust.append(res[i])
    i = i + 1
print(clust)
random_state = np.random.RandomState(seed=0)
random_clusters = random_state.randint(low=0, high=2, size=len(X))

# Plot the random assignment
axes[0].scatter(X_scaled[:, 0], X_scaled[:, 1], c=random_clusters,
                cmap=mglearn.cm3, s=60)
axes[0].set_title("Random assignment: {:.2f}".format(
    silhouette_score(X_scaled, random_clusters)))

algorithms = [KMeans(n_clusters=2), AgglomerativeClustering(n_clusters=2), DBSCAN()]
for ax, algorithm in zip(axes[1:], algorithms):
    clusters = algorithm.fit_predict(X_scaled)
    # Plot the cluster assignments and cluster centers
    ax.scatter(X_scaled[:, 0], X_scaled[:, 1], c=clusters,
               cmap=mglearn.cm3, s=60)
    ax.set_title("{} : {:.2f}".format(algorithm.__class__.__name__,
                                      silhouette_score(X_scaled, clusters)))
plt.show()
def cluster(frame, sift): #frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) kps = sift.detect(frame, None) kps.sort(key=attrgetter('octave'), reverse=False) # group keypoints from different scale to cluster them seperately groups = collections.defaultdict(list) for kp in kps: groups[get_keypoint_attrs(kp)[2]].append(kp) # for each of the groups get multiple clusters(list of keypoints) clusters = [] avg_response_of_clusters = [] for item in groups.items(): #print(str(item[0]) +": "+ str(len(item[1])) + "\n") #build cluster X = [] kp_index = 0 for kp in item[1]: #creating histogram region = crop(frame, kp.pt, 5) bgr_hist = [] histb = cv2.calcHist([region], [0], None, [64], [0, 256]) histg = cv2.calcHist([region], [1], None, [64], [0, 256]) histr = cv2.calcHist([region], [2], None, [64], [0, 256]) bgr_hist.append(histb) bgr_hist.append(histg) bgr_hist.append(histr) a = np.array(bgr_hist) bgr_hist = a.flatten() bgr_hist_max = max(bgr_hist) bgr_hist_norm = [float(i) / bgr_hist_max for i in bgr_hist] #print(bgr_hist_norm) #print(bgr_hist) # normalize weight and add histogram as feature x = [kp.pt[0] / frame.shape[1], kp.pt[1] / frame.shape[0]] x = [i * 100 for i in x] x += bgr_hist_norm #adding histogram as features with position X.append(x) kp.class_id = kp_index kp_index = kp_index + 1 #pprint(dir(kp)) if item[0] == 2.0: db = DBSCAN(eps=3, min_samples=15).fit(X) elif item[0] == 1.0: db = DBSCAN(eps=5, min_samples=10).fit(X) elif item[0] == 0.5: db = DBSCAN(eps=7, min_samples=4).fit(X) else: db = DBSCAN(eps=10, min_samples=2).fit(X) labels = db.labels_ # Number of clusters in labels, ignoring noise if present n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # assigning corresponding cluster id to the keypoints for kp in item[1]: kp.class_id = labels[kp.class_id] # calculate average response of each cluster for clstr_no in list(range(0, n_clusters_)): clstr = [kp for kp in item[1] if kp.class_id == clstr_no] avg_response = np.average([k.response for k in clstr]) clusters.append(clstr) avg_response_of_clusters.append(avg_response) # Parallel sorting of clusters using their avg. response if n_clusters_ > 0: avg_response_of_clusters, clusters = zip( *sorted(zip(avg_response_of_clusters, clusters), reverse=True)) avg_response_of_clusters = list(avg_response_of_clusters) clusters = list(clusters) #print(clusters) #print(avg_response_of_clusters) #quit() best_keypoints = [] frame_attention_window = None # find best avaiable cluster for c in clusters: aw = cluster_to_window(c) octave, layer, scale = get_keypoint_attrs(c[0]) if window_history.add_if_new(aw, scale): frame_attention_window = aw if len(sys.argv) > 3: best_keypoints += kps # returning all keypoints for visualization else: best_keypoints += c # returning only the best cluster break ''' for i in range(len(kps)): aw = keypoint_to_window(kps[i]) octave, layer, scale= get_keypoint_attrs(kps[i]) kp = kps[i] #cv2.imshow("Test1", frame[int(kp.pt[1]-5):int(kp.pt[1]+5), int(kp.pt[0]-5):int(kp.pt[0]+5)]) if window_history.add_if_new(aw, scale): frame_attention_window = aw best_keypoints += groups[2.0] print(scale, groups[scale][3].class_id) break ''' return (frame_attention_window, best_keypoints)
df_abnormal = finalDf.query('actual == 1')
print("normal :", df_normal.shape[0])
print("abnormal :", df_abnormal.shape[0])

df = pd.concat([df_normal, df_abnormal])
names = df['timestamp']
df.drop(columns=['timestamp'], inplace=True)
df['avg'] = df.apply(lambda row: get_weighted_avg(row), axis=1)

df_scaler = StandardScaler().fit(df[['avg']].to_numpy())
newDf = df_scaler.transform(df[['avg']].to_numpy())
# newDf = df[['avg']]

outlier_detection = DBSCAN(eps=0.9, min_samples=100, metric='euclidean')
clusters = outlier_detection.fit_predict(newDf)
df['scores'] = clusters
df['names'] = names
df['pred'] = df.apply(lambda row: 1 if row['scores'] == -1 else 0, axis=1)
outliers = df.query('pred == 1')
normal = df.query('pred == 0')

threedee = plt.figure().gca(projection='3d')
threedee.scatter(normal['pitch'], normal['roll'], normal['yaw'],
                 color="#00FF00", label="normal points")
# - *eps* is the max distance between two samples for one to be considered
#   in the 'neighborhood' of the other (so roughly the radius of a cluster)
# - *min_samples* is the number of points in a neighborhood required for a
#   central point to count as a 'core point'
# - *metric* chooses the type of distance
# - *p* is the power parameter in the Minkowski distance equation
#   (a small numeric check of this metric follows this snippet)

# In[142]:
get_ipython().run_cell_magic(
    'latex', '',
    '\\begin{align}\n\\mathrm{Minkowski \\,distance} = \\left( \n \\sum_{i=1}^{n}|X_i - Y_i|^p\n \\right)^\\frac{1}{p}\n\\end{align}')

# In[143]:
X = StandardScaler().fit_transform(np.asarray(df2.accTotal).reshape(-1, 1))
model = DBSCAN(eps=0.5, min_samples=110, metric='minkowski', p=1.5).fit(X)
model.labels_

true_false = []
for item in model.labels_:
    if item == 0:
        true_false.append(False)
    else:
        true_false.append(True)

anomalies = df2[true_false]
actuals = df2[[not i for i in true_false]]

# In[144]:
plt.plot(anomalies.index, anomalies.accTotal, 'r.')
plt.plot(actuals.index, actuals.accTotal, 'b.')
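Since the cell above passes metric='minkowski' with p=1.5 to DBSCAN, here is a minimal sanity-check sketch (not from the original notebook; the vectors u and v are made up) comparing scipy's Minkowski distance with the formula rendered above:

import numpy as np
from scipy.spatial.distance import minkowski

u = np.array([0.0, 3.0])
v = np.array([4.0, 0.0])
p = 1.5

d_scipy = minkowski(u, v, p)                       # scipy's implementation
d_manual = np.sum(np.abs(u - v) ** p) ** (1.0 / p)  # the formula written out
print(d_scipy, d_manual)   # both ~5.58 here; with p=2 this would be the Euclidean 5.0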
    type=int, default=-1,
    help="# of parallel jobs to run (-1 will use all CPUs)")
args = vars(ap.parse_args())

# load the serialized face encodings + bounding box locations from
# disk, then extract the set of encodings so we can cluster on them
print("[INFO] loading encodings...")
data = pickle.loads(open(args["encodings"], "rb").read())
data = np.array(data)
encodings = [d["encoding"] for d in data]

# cluster the embeddings
print("[INFO] clustering...")
clt = DBSCAN(metric="euclidean", n_jobs=args["jobs"])
clt.fit(encodings)

# determine the total number of unique faces found in the dataset
labelIDs = np.unique(clt.labels_)
numUniqueFaces = len(np.where(labelIDs > -1)[0])
print("[INFO] # unique faces: {}".format(numUniqueFaces))

# loop over the unique face integers
for labelID in labelIDs:
    # find all indexes into the `data` array that belong to the
    # current label ID, then randomly sample a maximum of 25 indexes
    # from the set
    print("[INFO] faces for face ID: {}".format(labelID))
    idxs = np.where(clt.labels_ == labelID)[0]
    idxs = np.random.choice(idxs, size=min(25, len(idxs)), replace=False)
    zaxis=dict(
        range=[-5, 10],
        title='PC_3',
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)',
        showticklabels=False,
        ticks=''
    )
)

centers = [[1, 1], [-1, -1], [1, -1]]
X = x_pca
y = clust_df_region['Region']

estimators = {'dbscan': DBSCAN(eps=1.9, min_samples=15).fit(X)}

fignum = 1
for name, est in estimators.items():
    est.fit(X)
    labels = est.labels_
    trace = go.Scatter3d(x=X[:, 0], y=X[:, 1], z=X[:, 2],
                         showlegend=False,
                         mode='markers',
                         marker=dict(
                             color=labels.astype(np.float),
                             line=dict(color='black', width=1)
                         ))
    fig.append_trace(trace, 1, fignum)
def touchdowns(image, n): """ Function to obtain the locations of the touchdown passes from the image of the pass chart using k-means, and DBSCAN to account for difficulties in extracting touchdown passes, since they have the are the same color as both the line of scrimmage and the attached touchdown trajectory lines. Input: image: image from the folder 'Cleaned_Pass_Charts' n: number of toucndowns, from the corresponding data of the image Return: call to map_pass_locations: centers: list of pass locations in pixels col: width of image from which the pass locations were extracted pass_type: "TOUCHDOWN" """ im = Image.open(image) pix = im.load() col, row = im.size img = Image.new('RGB', (col, row), 'black') p = img.load() for i in range(col): for j in range(row): r = pix[i, j][0] g = pix[i, j][1] b = pix[i, j][2] if (col < 1370) and (j < row - 105) and (j > row - 111): if (b > 2 * g) and (b > 60): p[i, j] = (0, 0, 0) elif (col > 1370) and (j < row - 81) and (j > row - 86): if (b > 2 * g) and (b > 60): p[i, j] = (0, 0, 0) else: p[i, j] = pix[i, j] r = p[i, j][0] g = p[i, j][1] b = p[i, j][2] f = ((r - 20)**2 + (g - 80)**2 + (b - 200)**2)**0.5 if f < 32 and b > 100: p[i, j] = (255, 255, 0) #scipy.misc.imsave('temp.jpg', img) imageio.imwrite('temp.jpg', img) imag = cv2.imread('temp.jpg') os.remove('temp.jpg') hsv = cv2.cvtColor(imag, cv2.COLOR_BGR2HSV) lower = np.array([20, 100, 100]) upper = np.array([30, 255, 255]) mask = cv2.inRange(hsv, lower, upper) res = cv2.bitwise_and(imag, imag, mask=mask) res = cv2.cvtColor(res, cv2.COLOR_HSV2RGB) res = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY) res = cv2.fastNlMeansDenoising(res, h=10) x = np.where(res != 0)[0] y = np.where(res != 0)[1] pairs = list(zip(x, y)) X = list(map(list, pairs)) if (len(pairs) != 0): db = DBSCAN(eps=10, min_samples=n).fit(X) labels = db.labels_ coords = pd.DataFrame([x, y, labels]).T coords.columns = ['x', 'y', 'label'] clusters = Counter(labels).most_common(n) td_labels = np.array([clust[0] for clust in clusters]) km_coords = coords.loc[coords['label'].isin(td_labels)] km = list(map(list, zip(km_coords.iloc[:, 0], km_coords.iloc[:, 1]))) kmeans = KMeans(n_clusters=n, random_state=0).fit(km) centers = kmeans.cluster_centers_ return map_pass_locations(centers, col, "TOUCHDOWN") else: return map_pass_locations([], col, "TOUCHDOWN", n)
embNump[i] = val.detach().numpy()
print(embNump.size)

if input("Save features to file? y/n") == 'y':
    filename = username + '.npy'
    np.save(filename, embNump)
elif input("Load features from data? y/n") == 'y':
    inFile = input("Filename: ")
    embNump = np.load(inFile)

print("Number of Faces: ")
print(len(embNump))
print("Clustering now")

# compute embeddings
# FIND distance matrix:
# dists = [[(e1 - e2).norm().item() for e1 in embeddings] for e2 in embeddings]
db = DBSCAN(eps=0.8).fit(embNump)
print("Labels: ")
print(db.labels_)

userProfiles = np.empty(0)
userNames = np.empty(0)
if input("load userProfiles? y/n") == 'y':
    userProfiles = np.load('faceDictionary.npy')
    userNames = np.load('userNames.npy')

userProfiles, userNames = addUserToList(db, embNump, userProfiles, userNames, username)
print(len(userProfiles))

# call function
print("Saving new user profiles...")
np.save('faceDictionary.npy', userProfiles)
np.save('userNames.npy', userNames)
print("Completed adding: " + username)
def main(self): # check innput path = self.e1.get() try: image_RGB = plt.imread(path) cluster_tolerance = float(self.e2.get()) pass except: self.state.set('ERROR') self.lstate.config(bg='#FF7F7F') self.face.update_idletasks() messagebox.showinfo(title='ERROR', message='输入错误!') return None self.lstate.config(bg='#7FFF7F') self.state.set('正常') self.face.update_idletasks() # show image self.state.set('显示图片中。。。') self.face.update_idletasks() img_open = Image.open(path) img = img_open.resize((128, 64)) img = ImageTk.PhotoImage(img) self.lp1.config(image=img) self.lp1.image = img self.face.update_idletasks() # resize to array image_RGB = Image.open(path) w_resize = 96 h_resize = int(w_resize*image_RGB.size[1]/image_RGB.size[0]) image_RGB = image_RGB.resize((w_resize, h_resize)) image_RGB = np.array(image_RGB) # to lab self.state.set('转换RGB为LAB中。。。') self.face.update_idletasks() image_LAB = cv2.cvtColor(image_RGB, cv2.COLOR_RGB2LAB) # cluster self.state.set('图片聚类中。。。') self.face.update_idletasks() dbscan = DBSCAN(eps=cluster_tolerance, min_samples=1) h_1, w_1, c_1 = image_LAB.shape image_data = image_LAB.reshape((h_1*w_1, c_1)) image_lab_data = [] # uint8 to true lab for data in image_data: image_lab_data.append([data[0]*100/255, data[1]-128, data[2]-128]) pass image_lab_data = np.array(image_lab_data) dbscan.fit(image_lab_data) labels = dbscan.labels_ n_clusters = len(set(labels)) - (1 if -1 in labels else 0) # find the cluster center themes = [] clusters_area = [] for i in range(n_clusters): one_cluster = image_lab_data[labels == i] if len(one_cluster)!=1: km = KMeans(n_clusters=1, max_iter=300) km.fit(one_cluster) themes.append(np.squeeze(km.cluster_centers_)) pass else: themes.append(one_cluster[0]) pass clusters_area.append(len(one_cluster)/len(image_lab_data)) pass themes = np.array(themes) # show themes image uint8_themes = [] for theme in themes: uint8_themes.append([theme[0]*255/100, theme[1]+128, theme[2]+128]) pass uint8_themes = np.array(uint8_themes) pic_array = cv2.cvtColor(np.uint8(uint8_themes.reshape(1, len(uint8_themes), 3)), \ cv2.COLOR_LAB2RGB)[0] self.themes_pic_array = pic_array pic_array = self.make_themes_image(pic_array) pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB') img = ImageTk.PhotoImage(pic) self.lp1c.config(image=img) self.lp1c.image = img self.face.update_idletasks() self.state.set('聚类完成') self.face.update_idletasks() # sort themes_areas = list(zip(clusters_area, themes)) sorted_themes_areas = sorted(themes_areas, key=lambda x:(x[0]), reverse=True) self.listbox.delete(0, tk.END) self.face.update_idletasks() # write to list box sum_area = 0.0 #info = ' '*5 + '序号' + ' '*28 + 'LAB' + ' '*51 + '面积占比' + ' '*26 + '前n行面积占比之和' info = ' '*5 + '序号' + ' '*28 + 'LAB' + ' '*80 + '面积占比' self.listbox.insert(tk.END, info) self.face.update_idletasks() self.state.set('处理列表中。。。请稍后') self.face.update_idletasks() count = 0 # for excel self.results = [] for area, theme in sorted_themes_areas: count = count + 1 sum_area += area L, A, B = theme L = round(L, 3) A = round(A, 3) B = round(B, 3) self.results.append((count, [L, A, B], round(100*area, 3))) info = ' '*(8-len(str(count))) + str(count) + ' '*18 + str([L, A, B]) #info += ' '*(60-len(str([L, A, B]))) + str(round(100*area, 3)) + '%' #info += ' '*(125-len(info)) + str(round(100*sum_area, 3)) + '%' info += ' '*(90-len(str([L, A, B]))) + str(round(100*area, 3)) + '%' self.listbox.insert(tk.END, info) self.face.update_idletasks() pass self.scrollbar.config(command=self.listbox.yview) self.face.update_idletasks() 
        self.state.set('选取完成')
        self.face.update_idletasks()
        pass
def separate_watershed( vdf_temp, min_distance=1, min_size=1, max_size=np.inf, max_number_of_grains=np.inf, marker_radius=1, threshold=False, exclude_border=False, plot_on=False, ): """Separate segments from one VDF image using edge-detection by the sobel transform and the watershed segmentation implemented in scikit-image. See [1,2] for examples from scikit-image. Parameters ---------- vdf_temp : np.array One VDF image. min_distance: int Minimum distance (in pixels) between markers for them to be considered separate markers for the watershed segmentation. min_size : float Grains with size (i.e. total number of pixels) below min_size are discarded. max_size : float Grains with size (i.e. total number of pixels) above max_size are discarded. max_number_of_grains : int Maximum number of grains included in the returned separated grains. If it is exceeded, those with highest peak intensities will be returned. marker_radius : float If 1 or larger, each marker for watershed is expanded to a disk of radius marker_radius. marker_radius should not exceed 2*min_distance. threshold : bool If True, a mask is calculated by thresholding the VDF image by the Li threshold method in scikit-image. If False (default), the mask is the boolean VDF image. exclude_border : int or True, optional If non-zero integer, peaks within a distance of exclude_border from the boarder will be discarded. If True, peaks at or closer than min_distance of the boarder, will be discarded. plot_on : bool If True, the VDF, the mask, the distance transform and the separated grains will be plotted in one figure window. Returns ------- sep : np.array Array containing segments from VDF images (i.e. separated grains). Shape: (image size x, image size y, number of grains) References ---------- [1] http://scikit-image.org/docs/dev/auto_examples/segmentation/ plot_watershed.html [2] http://scikit-image.org/docs/dev/auto_examples/xx_applications/ plot_coins_segmentation.html#sphx-glr-auto-examples-xx- applications-plot-coins-segmentation-py """ # Create a mask from the input VDF image. if threshold: th = threshold_li(vdf_temp) mask = np.zeros_like(vdf_temp) mask[vdf_temp > th] = True else: mask = vdf_temp.astype("bool") # Calculate the Eucledian distance from each point in the mask to the # nearest background point of value 0. distance = distance_transform_edt(mask) # If exclude_boarder is given, the edge of the distance is removed # by erosion. The distance image is used to find markers, and so the # erosion is done to avoid that markers are located at the edge # of the mask. if exclude_border > 0: distance_mask = binary_erosion(distance, structure=disk(exclude_border)) distance = distance * distance_mask.astype("bool") # Find the coordinates of the local maxima of the distance transform. local_maxi = peak_local_max( distance, indices=False, min_distance=1, num_peaks=max_number_of_grains, exclude_border=exclude_border, threshold_rel=None, ) maxi_coord1 = np.where(local_maxi) # Discard maxima that are found at pixels that are connected to a # smaller number of pixels than min_size. Used as markers, these would lead # to segments smaller than min_size and should therefore not be # considered when deciding which maxima to use as markers. 
if min_size > 1: labels_check = label(mask)[0] delete_indices = [] for i in np.arange(np.shape(maxi_coord1)[1]): index = np.transpose(maxi_coord1)[i] label_value = labels_check[index[0], index[1]] if len(labels_check[labels_check == label_value]) < min_size: delete_indices.append(i) local_maxi[index[0], index[1]] = False maxi_coord1 = np.delete(maxi_coord1, delete_indices, axis=1) # Cluster the maxima by DBSCAN based on min_distance. For each # cluster, only the maximum closest to the average maxima position is # used as a marker. if min_distance > 1 and np.shape(maxi_coord1)[1] > 1: clusters = DBSCAN( eps=min_distance, metric="euclidean", min_samples=1, ).fit(np.transpose(maxi_coord1)) local_maxi = np.zeros_like(local_maxi) for n in np.arange(clusters.labels_.max() + 1): maxi_coord1_n = np.transpose(maxi_coord1)[clusters.labels_ == n] com = np.average(maxi_coord1_n, axis=0).astype("int") index = distance_matrix([com], maxi_coord1_n).argmin() index = maxi_coord1_n[index] local_maxi[index[0], index[1]] = True # Use the resulting maxima as markers. Each marker should have a # unique label value. For each maximum, generate markers with the same # label value in a radius given by marker_radius centered at the # maximum position. This is done to make the segmentation more robust # to local changes in pixel values around the marker. markers = label(local_maxi)[0] if marker_radius >= 1: disk_mask = disk(marker_radius) for mm in np.arange(1, np.max(markers) + 1): im = np.zeros_like(markers) im[np.where(markers == mm)] = markers[np.where(markers == mm)] markers_temp = convolve2d(im, disk_mask, boundary="fill", mode="same", fillvalue=0) markers[np.where(markers_temp)] = mm markers = markers * mask # Find the edges of the VDF image using the Sobel transform. elevation = sobel(vdf_temp) # 'Flood' the elevation (i.e. edge) image from basins at the marker # positions. Find the locations where different basins meet, i.e. # the watershed lines (segment boundaries). Only search for segments # (labels) in the area defined by mask. labels = watershed(elevation, markers=markers, mask=mask) sep = np.zeros( (np.shape(vdf_temp)[0], np.shape(vdf_temp)[1], (np.max(labels))), dtype="int32") n, i = 1, 0 while (np.max(labels)) > n - 1: sep_temp = labels * (labels == n) / n sep_temp = np.nan_to_num(sep_temp) # Discard a segment if it is too small or too large, or else add # it to the list of separated segments. if (np.sum(sep_temp, axis=(0, 1)) < min_size) or np.sum( sep_temp, axis=(0, 1)) > max_size: sep = np.delete(sep, ((n - i) - 1), axis=2) i = i + 1 else: sep[:, :, (n - i) - 1] = sep_temp n = n + 1 # Put the intensity from the input VDF image into each segmented area. vdf_sep = np.broadcast_to(vdf_temp.T, np.shape(sep.T)) * (sep.T == 1) if plot_on: # pragma: no cover # If segments have been discarded, make new labels that do not # include the discarded segments. if np.max(labels) != (np.shape(sep)[2]) and (np.shape(sep)[2] != 0): labels = sep[:, :, 0] for i in range(1, np.shape(sep)[2]): labels = labels + sep[..., i] * (i + 1) # If no separated particles were found, set all elements in # labels to 0. 
elif np.shape(sep)[2] == 0: labels = np.zeros(np.shape(labels)) seps_img_sum = np.zeros_like(vdf_temp).astype("float64") for lbl, vdf in zip(np.arange(1, np.max(labels) + 1), vdf_sep): mask_l = np.zeros_like(labels).astype("bool") _idx = np.where(labels == lbl) mask_l[_idx] = 1 seps_img_sum += vdf_temp * mask_l / np.max(vdf_temp[_idx]) seps_img_sum[_idx] += lbl maxi_coord = np.where(local_maxi) fig, axes = plt.subplots(2, 3, sharex=True, sharey=True) ax = axes.ravel() ax[0].imshow(vdf_temp, cmap=plt.cm.magma_r) ax[0].axis("off") ax[0].set_title("VDF") ax[1].imshow(mask, cmap=plt.cm.gray_r) ax[1].axis("off") ax[1].set_title("Mask") ax[2].imshow(distance, cmap=plt.cm.gray_r) ax[2].axis("off") ax[2].set_title("Distance and markers") ax[2].imshow(masked_where(markers == 0, markers), cmap=plt.cm.gist_rainbow) ax[2].plot(maxi_coord1[1], maxi_coord1[0], "k+") ax[2].plot(maxi_coord[1], maxi_coord[0], "gx") ax[3].imshow(elevation, cmap=plt.cm.magma_r) ax[3].axis("off") ax[3].set_title("Elevation") ax[4].imshow(labels, cmap=plt.cm.gnuplot2_r) ax[4].axis("off") ax[4].set_title("Labels") ax[5].imshow(seps_img_sum, cmap=plt.cm.magma_r) ax[5].axis("off") ax[5].set_title("Segments") return vdf_sep
def main(self): image_1_path = self.e1.get() image_2_path = self.e2.get() try: image_1_RGB = plt.imread(image_1_path) image_2_RGB = plt.imread(image_2_path) cluster_tolerance = float(self.e3.get()) color_tolerance = float(self.e4.get()) pass except: self.state.set('ERROR') self.lstate.config(bg='#FF7F7F') self.face.update_idletasks() messagebox.showinfo(title='ERROR', message='输入错误!') return None self.lstate.config(bg='#7FFF7F') self.face.update_idletasks() # show images self.state.set('显示图片中。。。') self.face.update_idletasks() img_open = Image.open(self.e1.get()) img = img_open.resize((128, 64)) img = ImageTk.PhotoImage(img) self.lp1.config(image=img) self.lp1.image = img self.face.update_idletasks() img_open = Image.open(self.e2.get()) img = img_open.resize((128, 64)) img = ImageTk.PhotoImage(img) self.lp2.config(image=img) self.lp2.image = img self.face.update_idletasks() # resize to speed up image_1_RGB = Image.open(image_1_path) w_resize = 96 h_resize = int(w_resize*image_1_RGB.size[1]/image_1_RGB.size[0]) image_1_RGB = image_1_RGB.resize((w_resize, h_resize)) image_1_RGB = np.array(image_1_RGB) # resize to speed up image_2_RGB = Image.open(image_2_path) w_resize = 96 h_resize = int(w_resize*image_2_RGB.size[1]/image_2_RGB.size[0]) image_2_RGB = image_2_RGB.resize((w_resize, h_resize)) image_2_RGB = np.array(image_2_RGB) # to lab self.state.set('转换RGB为LAB中。。。') self.face.update_idletasks() image_1_LAB = cv2.cvtColor(image_1_RGB, cv2.COLOR_RGB2LAB) image_2_LAB = cv2.cvtColor(image_2_RGB, cv2.COLOR_RGB2LAB) # image 1 self.state.set('第一张图片聚类中。。。') self.face.update_idletasks() dbscan1 = DBSCAN(eps=cluster_tolerance, min_samples=1) h_1, w_1, c_1 = image_1_LAB.shape image_1_data = image_1_LAB.reshape((h_1*w_1, c_1)) image_1_lab_data = [] for data in image_1_data: image_1_lab_data.append([data[0]*100/255, data[1]-128, data[2]-128]) pass image_1_lab_data = np.array(image_1_lab_data) dbscan1.fit(image_1_lab_data) labels = dbscan1.labels_ n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0) # find the cluster center themes_1 = [] clusters_area_1 = [] for i in range(n_clusters_1): one_cluster = image_1_lab_data[labels == i] if len(one_cluster)!=1: km = KMeans(n_clusters=1, max_iter=300) km.fit(one_cluster) themes_1.append(np.squeeze(km.cluster_centers_)) pass else: themes_1.append(one_cluster[0]) pass clusters_area_1.append(len(one_cluster)/len(image_1_lab_data)) pass themes_1 = np.array(themes_1) # show image uint8_themes_1 = [] for theme in themes_1: uint8_themes_1.append([theme[0]*255/100, theme[1]+128, theme[2]+128]) pass uint8_themes_1 = np.array(uint8_themes_1) pic_array = cv2.cvtColor(np.uint8(uint8_themes_1.reshape(1, len(uint8_themes_1), 3)), \ cv2.COLOR_LAB2RGB) pic_array = self.make_themes_image(pic_array[0]) pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB') img = ImageTk.PhotoImage(pic) self.lp1c.config(image=img) self.lp1c.image = img self.face.update_idletasks() # image 2 self.state.set('第二张图片聚类中。。。') self.face.update_idletasks() dbscan2 = DBSCAN(eps=cluster_tolerance, min_samples=1) h_2, w_2, c_2 = image_2_LAB.shape image_2_data = image_2_LAB.reshape((h_2*w_2, c_2)) image_2_lab_data = [] for data in image_2_data: image_2_lab_data.append([data[0]*100/255, data[1]-128, data[2]-128]) pass image_2_lab_data = np.array(image_2_lab_data) dbscan2.fit(image_2_lab_data) labels = dbscan2.labels_ n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0) # find the cluster center themes_2 = [] clusters_area_2 = [] for i in range(n_clusters_2): one_cluster = 
image_2_lab_data[labels == i] if len(one_cluster)!=1: km = KMeans(n_clusters=1, max_iter=300) km.fit(one_cluster) themes_2.append(np.squeeze(km.cluster_centers_)) pass else: themes_2.append(one_cluster[0]) pass clusters_area_2.append(len(one_cluster)/len(image_2_lab_data)) pass themes_2 = np.array(themes_2) # show image uint8_themes_2 = [] for theme in themes_2: uint8_themes_2.append([theme[0]*255/100, theme[1]+128, theme[2]+128]) pass uint8_themes_2 = np.array(uint8_themes_2) pic_array = cv2.cvtColor(np.uint8(uint8_themes_2.reshape(1, len(uint8_themes_2), 3)), \ cv2.COLOR_LAB2RGB) pic_array = self.make_themes_image(pic_array[0]) pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB') img = ImageTk.PhotoImage(pic) self.lp2c.config(image=img) self.lp2c.image = img self.face.update_idletasks() self.state.set('聚类完成') self.face.update_idletasks() # select common color Image_1_Area = clusters_area_1[:] Image_2_Area = clusters_area_2[:] self.state.set('共同色选取中。。。') self.face.update_idletasks() common_color_infos = [] for i in range(n_clusters_1): L1 = themes_1[i][0] A1 = themes_1[i][1] B1 = themes_1[i][2] LAB1 = [L1, A1, B1] for j in range(n_clusters_2): L2 = themes_2[j][0] A2 = themes_2[j][1] B2 = themes_2[j][2] LAB2 = [L2, A2, B2] deltaE = self.calc_chromatism(LAB1, LAB2) if deltaE <= color_tolerance: S1 = Image_1_Area[i] / (Image_1_Area[i] + Image_2_Area[j]) S2 = Image_2_Area[j] / (Image_1_Area[i] + Image_2_Area[j]) L3 = L1 * S1 + L2 * S2 A3 = A1 * S1 + A2 * S2 B3 = B1 * S1 + B2 * S2 L1 = round(L1, 3) A1 = round(A1, 3) B1 = round(B1, 3) L2 = round(L2, 3) A2 = round(A2, 3) B2 = round(B2, 3) L3 = round(L3, 3) A3 = round(A3, 3) B3 = round(B3, 3) LAB1 = [L1, A1, B1] LAB2 = [L2, A2, B2] LAB3 = [L3, A3, B3] selected_std_color = select_std_color(LAB3) selected_std_color_lab = std_colors[selected_std_color] # knn label = colour_classify(selected_std_color_lab, \ image_1_lab_data, np.squeeze(dbscan1.labels_), k=10) # area std_color_area_1 = clusters_area_1[label] # knn label = colour_classify(selected_std_color_lab, \ image_2_lab_data, np.squeeze(dbscan2.labels_), k=10) # area std_color_area_2 = clusters_area_2[label] # info info = (LAB3, LAB1, Image_1_Area[i], LAB2, Image_2_Area[j], \ selected_std_color, std_color_area_1, std_color_area_2) common_color_infos.append(info) pass pass pass self.state.set('共同色选取完成') self.face.update_idletasks() selected_std_colors = [] # keys: num appears values: selected std colors dict_selected_std_colors = {} # num appears std_colors_nums = [] for i in range(len(common_color_infos)): # std color index: -3 selected_std_colors.append(common_color_infos[i][-3]) selected_std_colors_set = set(selected_std_colors) pass for selected_std_color in selected_std_colors_set: num = selected_std_colors.count(selected_std_color) if str(num) not in dict_selected_std_colors.keys(): std_colors_nums.append(num) dict_selected_std_colors[str(num)] = [selected_std_color] pass else: dict_selected_std_colors[str(num)].append(selected_std_color) pass pass std_colors_nums.sort(reverse=True) # list box index = 0 self.listbox.delete(0, tk.END) self.face.update_idletasks() info = ' '*2 + str('') + ' '*(7-len(str(''))) info += ' '*8 + str('') + ' '*(17-len(str(''))) info += ' '*12 info += ' '*15 + str('背景图片一') + ' '*(20-len(str('背景图片一'))) info += ' '*15 + str('背景图片二') + ' '*(20-len(str('背景图片二'))) info += ' '*16 + str('军标色') + ' '*(14-len(str('军标色'))) self.listbox.insert(tk.END, info) self.face.update_idletasks() info = ' '*1 + str('序号') + ' '*(8-len(str('序号'))) info += ' '*8 + str('共同色') + ' 
'*(17-len(str('共同色'))) info += ' '*10 + str('LAB1') + ' '*(15-len(str('LAB1'))) info += ' '*5 + str('Area1') + ' '*(5-len(str('Area1'))) info += ' '*10 + str('LAB2') + ' '*(15-len(str('LAB2'))) info += ' '*5 + str('Area2') + ' '*(5-len(str('Area2'))) info += ' '*6 + str('色号') + ' '*(4-len(str('色号'))) info += ' '*5 + str('图片一占比') + ' '*(5-len(str('图片一占比'))-1) info += ' '*5 + str('图片二占比') + ' '*(5-len(str('图片二占比'))-1) self.listbox.insert(tk.END, info) self.face.update_idletasks() self.state.set('处理列表中。。。请稍后') self.face.update_idletasks() self.results = [] for num in std_colors_nums: for color in dict_selected_std_colors[str(num)]: count = 0 for color_info in common_color_infos: # std color index: -3 if color_info[-3] == color: index += 1 count += 1 c3, c1, a1, c2, a2, \ sc, sca1, sca2 = color_info a1 = round(100*a1, 3) a2 = round(100*a2, 3) sca1 = round(100*sca1, 3) sca2 = round(100*sca2, 3) if count<=1: self.results.append((index, c3, c1, a1, c2, a2, sc, \ sca1, sca2)) pass else: self.results.append((index, c3, c1, a1, c2, a2, sc, \ None, None)) pass info = ' '*2 + str(index) + ' '*(7-len(str(index))) info += str(c3) + ' '*(25-len(str(c3))) info += str(c1) + ' '*(25-len(str(c1))) info += str(a1) + '%' + ' '*(10-len(str(a1))-1) info += str(c2) + ' '*(25-len(str(c2))) info += str(a2) + '%' + ' '*(10-len(str(a2))-1) info += str(sc) + ' '*(10-len(str(sc))) if count<=1: info += str(sca1) + '%' + ' '*(10-len(str(sca1))-1) info += str(sca2) + '%' + ' '*(10-len(str(sca2))-1) pass self.listbox.insert(tk.END, info) self.face.update_idletasks() pass pass pass pass self.scrollbar.config(command=self.listbox.yview) self.face.update_idletasks() self.state.set('选取完成') self.face.update_idletasks() pass
    count += 1
    if count > 300:
        break
    idx += 6000

# from list to array
sel_label = np.array(sel_label)
sel_data = np.array(sel_data)

# t-SNE (2-D)
tsne = TSNE(n_components=2, perplexity=30).fit_transform(sel_data)

# PCA (2-D)
pca = PCA(n_components=2)
pca = pca.fit_transform(sel_data, sel_label)

# Clustering
clustering = DBSCAN(eps=4, min_samples=20).fit(tsne)
clustering_pca = DBSCAN(eps=60, min_samples=8).fit(pca)

# Plot
DR_Plot(sel_label, tsne, 'tsne')
DR_Plot(sel_label, pca, 'pca')
DR_Plot(clustering.labels_, tsne, 'clustering_tsne')
DR_Plot(clustering_pca.labels_, pca, 'clustering_pca')
DR_Plot_black(tsne, 'tsne_black')
DR_Plot_black(pca, 'pca_black')
def compute_lenta_dbscan(self, algorithm='auto', allowed_hierarchic_iterations=10): """ Extract the silhouette value for every cluster, and iterate the clustering algorithms over those cluster with negative silhouette values""" logging.debug("Starting computing ITERATIVE-DBSCAN," + "at {0}".format( datetime.datetime.fromtimestamp(time.time()).strftime( '%Y-%m-%d %H:%M:%S'))) self.select_automatic_epsilon() # compute classic dbscan in first instance first_db = DBSCAN( metric='precomputed', # Use a pre-computed distance matrix eps=self.epsilon, min_samples=self.min_points, algorithm=algorithm).fit(self.D) # extract infos on core points self.core_samples_mask = np.zeros_like(first_db.labels_, dtype=bool) self.core_samples_mask[first_db.core_sample_indices_] = True self.labels = np.asarray(first_db.labels_.astype(int)) initial_max_label = max(self.labels) logging.debug("Initially extracted {0} clusters".format( sum(x is not -1 for x in set(self.labels)))) self.compute_silhouette() if max(self.labels) > 0: flag = True label = 0 while flag: cluster_matrix = np.array([]) file_mmap = None # create a mask for the elements belonging to that label index_mask = self.labels == label cl_indexes = np.array(range(len(self.labels)))[index_mask] # compute silhouette formula logging.debug("CLUSTER {0}".format(label)) logging.debug("Number of elements in the cluster:" + "%i" % np.count_nonzero(self.labels == label)) # Try to group differently points in clusters with bad silhouette if self.mean_silhouette_labels[label] < self.smin: if allowed_hierarchic_iterations > 0 and np.count_nonzero( self.labels == label) > self.min_points: # Re-apply DBSCAN only over the elements of the cluster cluster_matrix = self.extract_sub_matrix( cl_indexes, index_mask, self.D) print(cluster_matrix) logging.debug( os.listdir(self.output_folder + '/results')) elected_epsilon = self.compute_kdist_graph( cluster_matrix, self.min_points) try: db_hier = DBSCAN( metric= 'precomputed', # Use a pre-computed distance matrix eps=elected_epsilon, min_samples=self.min_points, algorithm=algorithm).fit(cluster_matrix) db_hier_labels = db_hier.labels_ logging.debug("From cluster {0} ".format(label) + "extracted {0} clusters".format( max(db_hier_labels) + 1)) # if we are in a no-end loop if max(db_hier_labels) + 1 == 1 and label == max( self.labels): for cl_index in cl_indexes: self.labels[cl_index] = -1 allowed_hierarchic_iterations = 0 logging.debug( "From cluster {0} ".format(label) + "is not possible to extract more clusters". 
format(max(db_hier_labels) + 1)) #del self.mean_silhouette_values[label] self.compute_silhouette() else: allowed_hierarchic_iterations -= 1 logging.debug("Extracting new clusters") logging.debug( os.listdir(self.output_folder + '/results')) db_hier_labels = np.array([ lab + (max(self.labels) + 1) if lab != -1 else lab for lab in db_hier_labels ]) logging.debug("Updating the system") # Update the labels' list with the new labels for i, cl_index in enumerate(cl_indexes): self.labels[cl_index] = db_hier_labels[i] logging.debug("re-compute silhouette") logging.debug( os.listdir(self.output_folder + '/results')) self.compute_silhouette() finally: logging.debug("Finally " + str( os.listdir(self.output_folder + '/results'))) cluster_matrix_filename = cluster_matrix.filename del cluster_matrix os.remove(cluster_matrix_filename) else: for cl_index in cl_indexes: self.labels[cl_index] = -1 label += 1 if label > max(self.labels): flag = False logging.debug("Finished computing ITERATIVE-DBSCAN," + "at {0}".format( datetime.datetime.fromtimestamp(time.time()).strftime( '%Y-%m-%d %H:%M:%S'))) return self.labels
# #############################################################################
# Generate sample data
# #############################################################################
# Compute DBSCAN
eps_range = []
for i in range(1, 50):
    eps_range.append(i * 0.1)

for eps in eps_range:
    for min_pts in range(1, 10):  # min_samples must be at least 1
        X = np.array(values)
        db = DBSCAN(eps=eps, min_samples=min_pts).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        label = db.labels_
        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(label)) - (1 if -1 in label else 0)
        if n_clusters_ == 4:
            print('Estimated number of clusters: %d' % n_clusters_)
            print("Eps value :" + str(eps) + " min_pts: " + str(min_pts))

# #############################################################################
# Plot result
'''import matplotlib.pyplot as plt
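Rather than keeping only the parameter pairs that yield exactly four clusters, a hedged alternative sketch (not part of the original script; the make_blobs data is a stand-in for values) scores each (eps, min_samples) pair with the silhouette coefficient on the non-noise points and keeps the best:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Stand-in for the `values` array used above (assumption for illustration).
X, _ = make_blobs(n_samples=500, centers=4, cluster_std=0.6, random_state=0)

best = (None, None, -1.0)
for eps in np.arange(0.1, 5.0, 0.1):
    for min_pts in range(2, 10):
        labels = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(X)
        mask = labels != -1                    # drop noise points
        if len(set(labels[mask])) < 2:         # silhouette needs >= 2 clusters
            continue
        score = silhouette_score(X[mask], labels[mask])
        if score > best[2]:
            best = (eps, min_pts, score)

print("best eps=%.1f, min_samples=%d, silhouette=%.3f" % best)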
def dbscan_func(all_pos, mol_list, col_pos, col_neg): X = all_pos # = StandardScaler().fit_transform(all_pos) # ############################################################################# # Compute DBSCAN db = DBSCAN(eps=0.8, min_samples=4).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) print('Estimated number of clusters: %d' % n_clusters_) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels)) # ############################################################################# # Plot result new_xyz = [] # np.zeros((X.shape[0],3)) new_cols = [] # np.zeros((X.shape[0], 4)) new_labels = [] new_op = [] fig = plt.figure() ax = Axes3D(fig) # Black removed and is used for noise instead. plotly_data = [] unique_labels = set(labels) colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))] # print("Color variety: ", len(colors)) # print(colors) for k, col in zip(unique_labels, colors): if k == -1: # Black used for noise. col = [0, 0, 0, 0.3] class_member_mask = (labels == k) xy = X[class_member_mask & core_samples_mask] for tmp_xy in xy: _id, dh, tds, dg = find_mol_id(tmp_xy, mol_list) # new_xyz.append(tmp_xy) if k == -1: pass # new_cols.append(tuple(col)) # new_labels.append('Unclustered') # new_op.append(float(0.3)) else: #col_pos = [255, 204, 0, 1] #col_neg = [204, 0, 153, 1] # new_cols.append(tuple(col)) if float(dg) < 0.0: new_cols.append(col_pos) else: new_cols.append(col_neg) new_xyz.append(tmp_xy) new_labels.append('Cluster: ' + str(k) + '<br>id: ' + _id \ + '<br>dH: ' + dh + '<br>TdS: ' + tds + '<br>dG: ' + dg) new_op.append(float(1.0)) ax.scatter(xy[:, 0], xy[:, 1], xy[:, 2], 'o', c=tuple(col), edgecolors='k', s=14) xy = X[class_member_mask & ~core_samples_mask] for tmp_xy in xy: # print(tmp_xy) _id, dh, tds, dg = find_mol_id(tmp_xy, mol_list) # new_xyz.append(tmp_xy) if k == -1: pass # new_cols.append(tuple(col)) # new_labels.append('Unclustered') # new_op.append(float(0.3)) else: #col_pos = [255, 204, 0, 1] #col_neg = [204, 0, 153, 1] # new_cols.append(tuple(col)) if float(dg) < 0.0: new_cols.append(col_pos) else: new_cols.append(col_neg) new_xyz.append(tmp_xy) new_labels.append('Cluster: ' + str(k) + '<br>id: ' + _id \ + '<br>dH: ' + dh + '<br>TdS: ' + tds + '<br>dG: ' + dg) new_op.append(float(1.0)) ax.scatter(xy[:, 0], xy[:, 1], xy[:, 2], 'o', c=tuple(col), edgecolors='k', s=6) new_xyz = np.asarray(new_xyz) print(len(new_op)) # names = set_mol_info(X) remove_idx = [] for lbl_tmp, opa_tmp, col_tmp, xyz_tmp, idx_tmp in zip(new_labels, new_op, new_cols, new_xyz, range(len(new_labels))): if lbl_tmp != 'Unclustered': remove_idx.append(idx_tmp) # new_labels = new_labels.pop(for r in remove_idx) # new_xyz = # new_cols = # new_op = return new_xyz, new_cols, new_labels, new_op
def main(args, pnet, rnet, onet): _pnet = pnet _rnet = rnet _onet = onet # Instantiate the class containing the functions to call the Reddit API functions = reddit_functions.Functions() # Open the facenet model with tf.gfile.FastGFile(args.model, 'rb') as f: graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) _ = tf.import_graph_def(graph_def, name='') with tf.Session() as sess: # Open the text file containing the names into a variable with open(args.names) as file: # Read the text file containing the names line by line for name in file.readlines(): # Get the posts from a sub Reddit, strip is so the enter at the end of a line is removed and the backspace # has to be removed because most sub Reddits are "NameLastname" posts = functions.get_posts(name.strip(), str(args.limit)) # Check if the Reddit API returned something if posts is not None: # Get the images from the posts from the sub Reddit images = functions.get_images(posts) # Check if the image list is not empty if images is not None: # Align the image data images_aligned = align_data(images, args.image_size, args.margin, _pnet, _rnet, _onet) # Get the required input and output tensors images_placeholder = sess.graph.get_tensor_by_name( "input:0") embeddings = sess.graph.get_tensor_by_name( "embeddings:0") phase_train_placeholder = sess.graph.get_tensor_by_name( "phase_train:0") feed_dict = { images_placeholder: images_aligned, phase_train_placeholder: False } emb = sess.run(embeddings, feed_dict=feed_dict) # Get number of faces in the list after alignment nrof_images = len(images_aligned) print(nrof_images) # Create empty distance matrix matrix = np.zeros((nrof_images, nrof_images)) for i in range(nrof_images): for j in range(nrof_images): # Calc distance and fill the matrix dist = np.sqrt( np.sum( np.square( np.subtract(emb[i, :], emb[j, :])))) matrix[i][j] = dist # Instantiate the cluster algorithm, eps = the min distance to cluster db = DBSCAN(eps=1, min_samples=5, metric='precomputed') # Fit the distance matrix to the algorithm db.fit(matrix) labels = db.labels_ # Find how many clusters there are no_clusters = len( set(labels)) - (1 if -1 in labels else 0) # Check if there is more than 1 cluster if no_clusters > 0: print('No of clusters:', no_clusters) biggest_cluster = 0 len_biggest_cluster = 0 for i in range(no_clusters): print('Cluster ' + str(i) + ' : ', np.nonzero(labels == i)[0]) # Find the biggest cluster if len(np.nonzero( labels == i)[0]) > len_biggest_cluster: biggest_cluster = i len_biggest_cluster = len( np.nonzero(labels == i)[0]) print('Biggest cluster: ' + str(biggest_cluster)) cnt = 1 # Putting the full path in a variable to make it easy path = os.path.join(args.out_dir, str(name.strip())) if not os.path.exists(path): # Create a dir in the chosen output location with the name of the persons sub Reddit if it doesn't exist os.makedirs(path) # Loop over the images array positions in the largest dir for j in np.nonzero( labels == biggest_cluster)[0]: # Save the image to the output dir misc.imsave( os.path.join( path, name.strip() + '_' + str('%0*d' % (4, cnt)) + '.png'), images_aligned[j]) cnt += 1 else: for j in np.nonzero( labels == biggest_cluster)[0]: misc.imsave( os.path.join( path, name.strip() + '_ ' + str('%0*d' % (4, cnt)) + '.png'), images_aligned[j]) cnt += 1
def classifierData(self, pipelineDict): self.__preprocessData__() if pipelineDict["scaler"] == "MinMaxScaler": scaler = EachMinMaxScaler(self.scaleList) elif pipelineDict["scaler"] == "RobustScaler": scaler = RobustScaler() elif pipelineDict["scaler"] == "Normalizer": scaler = Normalizer() elif pipelineDict["scaler"] == "StandardScaler": scaler = StandardScaler() elif pipelineDict["scaler"] == "None": scaler = None else: raise TypeError if pipelineDict["reduceDim"] == "TSNE": reduceDimension = TSNE(random_state=0) elif pipelineDict["reduceDim"] == "PCA": reduceDimension = PCA(n_components=2) elif pipelineDict["reduceDim"] == "None": reduceDimension = None else: raise TypeError if pipelineDict["cluster"] == "DBSCAN": eps = pipelineDict["params"]["eps"] min_samples = pipelineDict["params"]["min_samples"] cluster = DBSCAN(eps=float(eps), min_samples=int(min_samples)) else: if pipelineDict["cluster"] == "KMeans": n_clusters = pipelineDict["params"]["n_clusters"] cluster = KMeans(n_clusters=int(n_clusters)) elif pipelineDict["cluster"] == "Agglomerative": n_clusters = pipelineDict["params"]["n_clusters"] cluster = AgglomerativeClustering(n_clusters=int(n_clusters)) else: raise TypeError pipe = chain([("scaler", scaler), ("reduceDim", reduceDimension), ("cluster", cluster)]) labels = pipe.fit_predict(self.theFinalData) eachScaledData = pipe.named_steps("scaler_output") cluster = pipe.named_steps("cluster") championClusters = [] spellClusters = [] try: for index in range(cluster.n_clusters): championClusters.append([]) spellClusters.append([]) except AttributeError: maxLabel = 0 for label in labels: if label > maxLabel: maxLabel = label for index in range(maxLabel + 2): championClusters.append([]) spellClusters.append([]) labels.tolist() for i, label in enumerate(labels.tolist()): tableData = [] tableData.append(self.championNameList[i]) tableData.extend( self.theFinalData[i][0:len(self.data_feature_names[0:-3])]) tableData.extend(self.spellNameList[i]) championClusters[label].append(tableData) spellClusters[label].append(self.spellNameList[i]) dim2Output = pipe.named_steps("reduceDim_output") try: if dim2Output == None: reduceDimension = TSNE(random_state=0) scaledData = pipe.named_steps("scaler_output") try: if scaledData == None: scaledData = self.theFinalData except ValueError: pass finally: dim2Output = reduceDimension.fit_transform(scaledData) except ValueError: pass plt.clf() plt.xlim(dim2Output[:, 0].min(), dim2Output[:, 0].max() + 1) plt.ylim(dim2Output[:, 1].min(), dim2Output[:, 1].max() + 1) colors = [] for _ in championClusters: rgbValue = random.randrange(0, 16777216 - 1) rgbValue = "%X" % (rgbValue) rgbString = "" for _ in range(6 - len(rgbValue)): rgbString = "0" + rgbString rgbString = "#" + rgbString + rgbValue colors.append(rgbString) colors[-1] = "#000000" for i in range(len(dim2Output)): plt.text(dim2Output[i, 0], dim2Output[i, 1], str(self.championNameList[i]), color=colors[labels[i]]) return championClusters
print(d)

for i, d in enumerate(l_docs):
    if type(d) == list:
        l_docs[i] = ""

vectorizer = CountVectorizer(strip_accents="unicode", max_df=0.8,
                             stop_words=get_stop_words())
counts = vectorizer.fit_transform(l_docs)
tfidf_transformer = TfidfTransformer().fit_transform(counts)
l_target_en = target_encode(l_target)

centers = [[1, 1], [-1, -1], [1, -1]]
X = StandardScaler().fit_transform(tfidf_transformer.todense())

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(l_target_en, labels))
print("Completeness: %0.3f" % metrics.completeness_score(l_target_en, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(l_target_en, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(l_target_en, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(l_target_en, labels))
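Densifying the TF-IDF matrix with .todense() and standard-scaling it can be costly for large corpora; a minimal alternative sketch (an assumption, not part of the original script; the docs list is made up) keeps the TF-IDF vectors sparse and lets DBSCAN use cosine distance directly:

from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the cat sat on the mat",
        "a cat and a dog",
        "stock markets fell sharply",
        "markets rallied after the fall"]   # hypothetical stand-in for l_docs

tfidf = TfidfVectorizer().fit_transform(docs)          # stays sparse
db = DBSCAN(eps=0.5, min_samples=2, metric="cosine").fit(tfidf)
print(db.labels_)  # cosine distance lies in [0, 1] for non-negative tf-idf vectors,
                   # so eps is chosen on that scale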
import argparse
import pickle
import cv2

print("[INFO] loading encodings...")
data = pickle.loads(open("/content/face_encodings.pickle", "rb").read())
data = np.array(data)
encodings = [d["encoding"] for d in data]

import tensorflow as tf

# cluster the embeddings
# print('Enter number of clusters:')
# n_clusters = input('Enter number of clusters')
print("[INFO] clustering...")
# for comparison purposes...
clt = DBSCAN(metric="euclidean", n_jobs=100)
# clt = KMeans(n_clusters=5)
clt.fit(encodings)
labels = clt.labels_

# determine the total number of unique faces found in the dataset
labelIDs = np.unique(clt.labels_)
numUniqueFaces = len(np.where(labelIDs > -1)[0])
print("[INFO] # unique faces: {}".format(numUniqueFaces))

# shell commands (run in a separate notebook cell or terminal):
# pip install pytest-shutil
# pip install python-resize-image
from resizeimage import resizeimage
import shutil
import numpy as np
import matplotlib.pyplot as plt
import cv2
from sklearn.cluster import DBSCAN
from k_means_playground import cluster_gen

# Generate some clusters of data
n_clusters = 50
clusters_x, clusters_y = cluster_gen(n_clusters)

# Convert to a single dataset in OpenCV format
data = np.float32((np.concatenate(clusters_x),
                   np.concatenate(clusters_y))).transpose()

# Define max_distance (eps parameter in DBSCAN())
max_distance = 1
db = DBSCAN(eps=max_distance, min_samples=10).fit(data)

# Extract a mask of core cluster members
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Extract labels (-1 is used for the outliers)
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
unique_labels = set(labels)

# Plot up the results
min_x = np.min(data[:, 0])
max_x = np.max(data[:, 0])
min_y = np.min(data[:, 1])
max_y = np.max(data[:, 1])  # was data[:, 0]; the y-limit should use the y column
partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train_propagated[partially_propagated]

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)
print(log_reg.score(X_test, y_test))

# DBSCAN
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)
plt.plot(X[:, 0], X[:, 1], 'b.')

from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

# Commonly used attributes
print(dbscan.labels_[:10])
print(dbscan.core_sample_indices_[:10])
print(np.unique(dbscan.labels_))

dbscan2 = DBSCAN(eps=0.2, min_samples=5)
dbscan2.fit(X)

# Plotting
def plot_dbscan(dbscan, X, size, show_xlabels=True, show_ylabels=True):