Example No. 1
0
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

file = open('../../inputfiles/Kaohsiung2014_case.csv')
lines = file.readlines()
file.close()

position = []
for data in lines[1:]:
    data = data.split(',')
    if data[9] != "" and data[10] != "":
        position.append([float(data[9]), float(data[10])])
print(position)
position = np.array(position)
print(position)

position = StandardScaler().fit_transform(position)
# dbscan
db = DBSCAN(eps=0.15, min_samples=5).fit(position)
# db = DBSCAN().fit(position)

core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
print(labels)

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(n_clusters_)
# print('Estimated number of clusters: %d' % n_clusters_)
# print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
# print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
# print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
# print("Adjusted Rand Index: %0.3f"
Example No. 2
0
def main():
    input_file = args.IN
    output = args.OUT
    window_size = args.WINDOW
    cut_name = args.CUT
    min_seed_number = args.MINNSEED

    if cut_name != '99perc' and cut_name != 'weights' and cut_name != 'dummy_cut':
        cut_file = pkl.load(open(args.CUTfile, 'rb'))
        df_cut = pd.DataFrame(cut_file)

    file_root = r.TFile(args.IN, "READ")
    tree_sig = file_root.Get("events")

    nentries = tree_sig.GetEntries()

    print('Entries before ' + str(cut_name), nentries)

    time, alpha, pitch, L, energy, event_idx, cut = [], [], [], [], [], [], []

    for i in np.arange(0, nentries):
        tree_sig.GetEntry(i)

        day = float(str(tree_sig.day)[-2:])

        if cut_name == '99perc':
            cut_val = tree_sig.rms99_of_99
            cut.append(cut_val)
            if tree_sig.counts < cut_val:
                continue

        elif cut_name == 'weights':
            cut_val = tree_sig.weight
            cut.append(cut_val)
            if cut_val < 2:
                continue

        elif cut_name == 'dummy_cut':
            cut_val = 0
            cut.append(cut_val)

        else:
            cut_val = df_cut[(df_cut.day.values == day)
                             & (df_cut.L.values == tree_sig.L) &
                             (df_cut.alpha.values == tree_sig.alpha) &
                             (df_cut.energy.values
                              == tree_sig.energy)][cut_name].values[0]
            cut.append(cut_val)
            if tree_sig.counts < cut_val:
                continue

        time_hour = float(tree_sig.time) + 24. * (day - 1)

        time_sec = time_hour * 3600.

        # variables for clustering
        time.append(time_sec)
        alpha.append(tree_sig.alpha)
        L.append(tree_sig.L)
        energy.append(tree_sig.energy)
        event_idx.append(i)

    print('Entries after ' + str(cut_name), len(time))

    # clustering algorithm lines
    X = np.stack([
        np.array(time),
        np.array(alpha) * 10000.,
        np.array(L) * 10000.,
        np.array(energy) * 10000
    ],
                 axis=1)
    clustering = DBSCAN(eps=window_size,
                        metric='euclidean',
                        min_samples=1,
                        n_jobs=-1).fit(X)
    y_temp = clustering.labels_

    y = []
    for i_y in y_temp:
        if i_y != -1:
            if len(y_temp[y_temp == i_y]) < min_seed_number:
                y.append(-1)
            else:
                y.append(i_y)
        else:
            y.append(i_y)

    y = np.array(y)
    n_clusters = len(set(y)) - (1 if -1 in y else 0)
    print('Nclusters ', n_clusters, np.unique(y))
    y = y.reshape([len(X), 1])

    Xy = np.concatenate((X, np.array(event_idx).reshape([len(X), 1])), axis=1)
    Xy = np.concatenate((Xy, y), axis=1)

    # number of good clusters
    n_clusters = len(set(
        clustering.labels_)) - (1 if -1 in clustering.labels_ else 0)
    n_noise = list(clustering.labels_).count(-1)

    good_cluster_list = np.unique(Xy[Xy[:, -1] != -1][:, -1])
    good_cluster_index = np.arange(0, len(good_cluster_list))
    cluster_dict = dict(zip(good_cluster_list, good_cluster_index))

    start_cluster = []
    end_cluster = []
    L_cluster = []
    alpha_cluster = []

    cluster_index = -1 * np.ones(nentries, dtype=int)

    for cls_i in good_cluster_list:
        cluster_entries = Xy[Xy[:, -1] == cls_i]
        start_cluster = cluster_entries[0, 4]
        end_cluster = cluster_entries[-1, 4]
        alpha_cluster = cluster_entries[0, 1] / 10000
        L_cluster = cluster_entries[0, 2] / 10000
        energy_cluster = cluster_entries[0, 3] / 10000
        for cls_ev in np.arange(start_cluster, end_cluster + 1):
            tree_sig.GetEntry(int(cls_ev))
            if (tree_sig.L == L_cluster) and (
                    tree_sig.alpha == alpha_cluster) and (tree_sig.energy
                                                          == energy_cluster):
                cluster_index[int(cls_ev)] = int(cluster_dict[int(cls_i)])

    file_root.Close()

    clsnr_b = array('i', [-1])
    thr_b = array('d', [0.])
    newroot = r.TFile(input_file, "update")
    t = newroot.Get("events")
    clsnr_new = t.Branch('cls_idx', clsnr_b, 'cls_idx/I')
    thr_new = t.Branch('thr_cut', thr_b, 'thr_cut/D')

    for i in np.arange(0, nentries):
        t.GetEntry(i)

        clsnr_b[0] = cluster_index[i]
        thr_b[0] = cut[i]

        clsnr_new.Fill()
        thr_new.Fill()

    newroot.Write("", r.TObject.kOverwrite)
    newroot.Close()
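# A more compact, vectorized alternative to the small-cluster filtering loop in
# main() above (a sketch, reusing y_temp and min_seed_number from that scope):
import numpy as np

unique_labels, counts = np.unique(y_temp, return_counts=True)
small = unique_labels[(counts < min_seed_number) & (unique_labels != -1)]
y = np.where(np.isin(y_temp, small), -1, y_temp)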
Example No. 3
0
def check_flights():
    URL = "https://www.google.com/flights/explore/#explore;f=JFK,EWR,LGA;t=HND,NRT,TPE,HKG,KIX;s=1;li=8;lx=12;d=2018-04-01"

    driver = webdriver.PhantomJS()

    dcap = dict(DesiredCapabilities.PHANTOMJS)

    dcap["phantomjs.page.settings.userAgent"] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"
    )
    driver = webdriver.PhantomJS(desired_capabilities=dcap,
                                 executable_path="/usr/local/bin/phantomjs")
    driver.implicitly_wait(20)
    driver.get(URL)
    wait = WebDriverWait(driver, 20)
    wait.until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, "div.CTPFVNB-w-e")))

    s = BeautifulSoup(driver.page_source, "lxml")

    best_price_tags = s.findAll('div', 'CTPFVNB-w-e')

    # check if scrape worked - alert if it fails and shutdown
    if len(best_price_tags) < 4:
        print('Failed to Load Page Data')
        requests.post(
            'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN',
            data={
                "value1": "script",
                "value2": "failed",
                "value3": ""
            })
        sys.exit(0)
    else:
        print('Successfully Loaded Page Data')

    best_prices = []
    for tag in best_price_tags:
        best_prices.append(int(tag.text.replace('$', '')))

    best_price = best_prices[0]

    best_height_tags = s.findAll('div', 'CTPFVNB-w-f')
    best_heights = []
    for t in best_height_tags:
        best_heights.append(
            float(t.attrs['style'].split('height:')[1].replace('px;', '')))

    best_height = best_heights[0]

    # price per pixel of height
    pph = np.array(best_price) / np.array(best_height)

    cities = s.findAll('div', 'CTPFVNB-w-o')

    hlist = []
    for bar in cities[0]\
            .findAll('div', 'CTPFVNB-w-x'):
        hlist.append(
            float(bar['style'].split('height: ')[1].replace('px;', '')) * pph)

    fares = pd.DataFrame(hlist, columns=['price'])
    px = [x for x in fares['price']]
    ff = pd.DataFrame(px, columns=['fare']).reset_index()

    # begin the clustering
    X = StandardScaler().fit_transform(ff)
    db = DBSCAN(eps=1.5, min_samples=1).fit(X)

    labels = db.labels_
    clusters = len(set(labels))

    pf = pd.concat([ff, pd.DataFrame(db.labels_, columns=['cluster'])], axis=1)

    rf = pf.groupby('cluster')['fare'].agg(['min', 'count'
                                            ]).sort_values('min',
                                                           ascending=True)

    # set up our rules
    # must have more than one cluster
    # cluster min must be equal to lowest price fare
    # cluster size must be less than 10th percentile
    # cluster must be $100 less the next lowest-priced cluster
    if clusters > 1 and ff['fare'].min() == rf.iloc[0]['min']\
            and rf.iloc[0]['count'] < rf['count'].quantile(.10)\
            and rf.iloc[0]['min'] + 100 < rf.iloc[1]['min']:
        city = s.find('span', 'CTPFVNB-v-c').text
        fare = s.find('div', 'CTPFVNB-w-e').text
        r = requests.post(
            'https://maker.ifttt.com/trigger/fare_alert/with/key/API_TOKEN',
            data={
                "value1": city,
                "value2": fare,
                "value3": ""
            })
    else:
        print('no alert triggered')

from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from matplotlib import pyplot
# define dataset
X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           random_state=4)
# define the model
model = DBSCAN(eps=0.30, min_samples=9)
# fit model and predict clusters
yhat = model.fit_predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()
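# Picking eps by eye is common in these examples; a k-distance curve is a more
# principled starting point. A minimal sketch using scikit-learn's
# NearestNeighbors (not part of the original snippet), reusing X from above:
import numpy as np
from sklearn.neighbors import NearestNeighbors

k = 9  # match min_samples used above
nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
distances, _ = nn.kneighbors(X)          # column 0 is each point's distance to itself
kth_dist = np.sort(distances[:, -1])     # distance to the k-th nearest neighbour
pyplot.plot(kth_dist)
pyplot.ylabel('distance to %d-th nearest neighbour' % k)
pyplot.show()                            # the "elbow" of this curve suggests a reasonable eps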

Example No. 5
0

def avg(x):
    if len(x) < 1:
        return 0
    return sum(x) / len(x)


for i in vin_list[:20]:
    X = auto[auto.vin == i]
    Y = X[['vin', 'ignition_time']]
    X = X[['stop_longitude', 'stop_latitude']]
    try:

        db = DBSCAN(eps=0.001, min_samples=5)
        db.fit(X)
        labels = db.labels_
        labels_unique = np.unique(labels)
        n_clusters_ = len(labels_unique) - (1 if -1 in labels else 0)
        print("number of estimated clusters : %d" % n_clusters_)
    except:
        print("number of estimated clusters : 0")

# =============================================================================
# # Noise point evaluation
# ratio = len(labels[labels[:] == -1]) / len(labels)
# print('Noise ratio:', format(ratio, '.2%'))
# =============================================================================
    '''
    Cluster plotting and parameter tuning
Example No. 6
0
    rgb_im1 = im1.convert('HSV')
    color = rgb_im1.getcolors()
    my_list = []

    for x in range(im1.size[0]):
        for y in range(im1.size[1]):
            (h, s, v) = rgb_im1.getpixel((x, y))
            if h == 26:
                continue
            if s < 75:
                continue
            if v < 75:
                continue
            my_list.append([x, y])

    X = np.array(my_list)
    clustering = DBSCAN(eps=3, min_samples=2).fit(X)
    labels = clustering.labels_
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # print(n_clusters_)

    clusters = pd.Series( [X[labels==n] for n in range(n_clusters_)])
    centers = []
    for x in clusters:
        center = MultiPoint(x).centroid
        (x, y) = (int(center.x), int(center.y))
        centers.append([x,y])

    print ('click')
    xy=pyautogui.position()
    pyautogui.moveTo(random.choice(centers))
    pyautogui.click(button='left')
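# The shapely MultiPoint centroid above could equally be computed with a plain
# NumPy mean per cluster; a minimal sketch reusing X, labels and n_clusters_:
import numpy as np

centers = [X[labels == n].mean(axis=0).astype(int).tolist()
           for n in range(n_clusters_)]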
Example No. 7
0
def SampleSelection_v3(setOfPoints,nSamples,returnIndicies=False, nTrials=10, debug=False):
    """Separating into clusters. Using Convex Hull to select boundary points. Filling the rest by performing random selections  """
    # from sklearn.mixture import GaussianMixture
    # model = GaussianMixture(n_components=4)
    # model.fit(setOfPoints)
    # yhat =model.predict(setOfPoints)
    nPoints = setOfPoints.shape[0]
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    data = scaler.fit_transform(setOfPoints)
    from sklearn.cluster import DBSCAN
    model = DBSCAN(eps=0.1, min_samples=10)
    yhat = model.fit_predict(data)
    clusters=np.unique(yhat)


    Gindicies = [];GboundaryPoints=[]
    for cluster in clusters:
        row_ix = np.where(yhat==cluster)
        clusterPoints = np.squeeze(setOfPoints[row_ix,:])
        if np.unique(clusterPoints,axis=0).shape[0] < 3:
            GboundaryPoints.append(setOfPoints[row_ix[0][0]])
            Gindicies.append(row_ix[0][0])
            continue
        hull = ConvexHull(clusterPoints)
        indicies = hull.vertices.tolist()
        boundaryPoints = [];removeIndicies=[]
        for idx in indicies:
            if not arreqclose_in_list(setOfPoints[row_ix[0][idx]],GboundaryPoints):
                GboundaryPoints.append(setOfPoints[row_ix[0][idx]])
            else:
                removeIndicies.append(idx)
        for idx in removeIndicies:
            indicies.remove(idx)
        for idx in indicies:
            Gindicies.append(row_ix[0][idx])
        if debug:print("Finished Calculating Convex Hull of the set. Number of boundary points " + str(len(GboundaryPoints)))

    if len(Gindicies) >= nSamples: # Perform pruning operation
        #Removing the entry that lowers the entropy the least
        while len(Gindicies) != nSamples:
            worstDist=0
            for i in range(len(GboundaryPoints)):
                remaining = GboundaryPoints.copy()
                remaining.pop(i)
                dist = TotalAverageDistance(remaining)
                if dist > worstDist:
                    worstDist=dist
                    idx = i
            GboundaryPoints.pop(idx)
            Gindicies.pop(idx)
        if returnIndicies:
            return Gindicies
        return GboundaryPoints
    else:

        maxDist = 0
        for trial in range(nTrials):
            if debug:print("Begining sampling trial " + str(trial))

            points = GboundaryPoints.copy()
            idx = Gindicies.copy()
            while len(points) < nSamples:
                x = randint(0,nPoints-1)
                if x in idx:
                    continue
                if arreqclose_in_list(setOfPoints[x],points):
                    continue
                idx = np.append(idx,x)
                points.append(setOfPoints[x])
            dist = TotalAverageDistance(points)
            if dist >= maxDist:
                maxDist=dist
                bestPoints = points.copy()
                bestIndicies = idx.copy()
            if debug: print(maxDist,len(bestPoints),len(bestIndicies))
        if returnIndicies:
            return bestIndicies
        return bestPoints
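# SampleSelection_v3 relies on helpers that are not shown in this excerpt.
# A plausible reading of TotalAverageDistance, assuming it means the mean
# pairwise Euclidean distance of the candidate point set (a sketch, not the
# original implementation):
import numpy as np
from scipy.spatial.distance import pdist

def TotalAverageDistance(points):
    pts = np.asarray(points, dtype=float)
    if len(pts) < 2:
        return 0.0
    return pdist(pts).mean()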
Example No. 8
0
D = np.sort(D, axis=0)
minPts = 10
nearest = D[1:(minPts + 1), :]
nearest = nearest.reshape(1, nearest.size)
sort_nearest = np.sort(nearest)
plt.plot(range(len(sort_nearest[0, :])),
         sort_nearest[0, :],
         linewidth=1.0,
         marker='x')
#plt.axis([-2, len(sort_nearest[0,:])+1000, -2, max(sort_nearest[0,:])+2])
plt.savefig(cur_file_dir + 'result/' + 'nearest.png')
plt.cla()
plt.clf()
plt.close()
#db = DBSCAN(eps=0.90, min_samples=minPts).fit(X) # high-dimensional data
db = DBSCAN(eps=30, min_samples=minPts).fit(reduced_data)  # low-dimensional data
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print(labels)
print(n_clusters_)
print(len(labels[labels >= 0]))

## scatter plot
#colors = [c[int(i) % len(c)] for i in labels]
#colors = labels
#plt.scatter(reduced_data[:, 0], reduced_data[:, 1], 20, colors) # 20 is the marker diameter

## show the plot
unique_labels = set(labels)
Example No. 9
0
weight.append(50)

we = np.asarray(weight)
print(we)
print(we.shape)
print(newInp.shape)
print(meantrain.shape)

res = list()
for row in newInp:
    curres = distance.euclidean(row, meantrain, we)
    res.append(curres)
    print "euclidean {}".format(curres)

dbinput = np.asarray(res)
dbinput = [[x, 1] for x in dbinput]
print(dbinput)
stdv = math.ceil(np.std(dbinput))
print(stdv)
model = DBSCAN(eps=int(stdv), min_samples=3).fit(dbinput)
print(model.labels_)

clust = list()
i = 0
for row in model.labels_:
    if row == 0:
        clust.append(res[i])
    i = i + 1

print(clust)
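# scikit-learn also accepts a single feature directly via reshape(-1, 1); the
# padded constant column above contributes nothing to the Euclidean distances.
# A sketch of the equivalent call, reusing res and stdv from above:
import numpy as np
from sklearn.cluster import DBSCAN

model = DBSCAN(eps=int(stdv), min_samples=3).fit(np.asarray(res).reshape(-1, 1))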
Example No. 10
0
random_state = np.random.RandomState(seed=0)
random_clusters = random_state.randint(low=0, high=2, size=len(X))

# plot the random assignment
axes[0].scatter(X_scaled[:, 0],
                X_scaled[:, 1],
                c=random_clusters,
                cmap=mglearn.cm3,
                s=60)
axes[0].set_title("Random assignment: {:.2f}".format(
    silhouette_score(X_scaled, random_clusters)))

algorithms = [
    KMeans(n_clusters=2),
    AgglomerativeClustering(n_clusters=2),
    DBSCAN()
]

for ax, algorithm in zip(axes[1:], algorithms):
    clusters = algorithm.fit_predict(X_scaled)
    # plot cluster assignments and cluster centers
    ax.scatter(X_scaled[:, 0],
               X_scaled[:, 1],
               c=clusters,
               cmap=mglearn.cm3,
               s=60)
    ax.set_title("{} : {:.2f}".format(algorithm.__class__.__name__,
                                      silhouette_score(X_scaled, clusters)))

plt.show()
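# One caveat for the comparison loop above (not in the original snippet):
# silhouette_score requires at least two distinct labels, which DBSCAN does not
# guarantee. A defensive variant of the title line, reusing the loop variables:
import numpy as np
from sklearn.metrics import silhouette_score

if len(np.unique(clusters)) > 1:
    title = "{} : {:.2f}".format(algorithm.__class__.__name__,
                                 silhouette_score(X_scaled, clusters))
else:
    title = "{} : single cluster".format(algorithm.__class__.__name__)
ax.set_title(title)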
def cluster(frame, sift):
    #frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    kps = sift.detect(frame, None)
    kps.sort(key=attrgetter('octave'), reverse=False)

    # group keypoints from different scales to cluster them separately
    groups = collections.defaultdict(list)
    for kp in kps:
        groups[get_keypoint_attrs(kp)[2]].append(kp)

    # for each of the groups get multiple clusters(list of keypoints)
    clusters = []
    avg_response_of_clusters = []
    for item in groups.items():
        #print(str(item[0]) +": "+ str(len(item[1])) + "\n")
        #build cluster
        X = []
        kp_index = 0

        for kp in item[1]:
            #creating histogram
            region = crop(frame, kp.pt, 5)
            bgr_hist = []
            histb = cv2.calcHist([region], [0], None, [64], [0, 256])
            histg = cv2.calcHist([region], [1], None, [64], [0, 256])
            histr = cv2.calcHist([region], [2], None, [64], [0, 256])
            bgr_hist.append(histb)
            bgr_hist.append(histg)
            bgr_hist.append(histr)
            a = np.array(bgr_hist)
            bgr_hist = a.flatten()
            bgr_hist_max = max(bgr_hist)
            bgr_hist_norm = [float(i) / bgr_hist_max for i in bgr_hist]
            #print(bgr_hist_norm)

            #print(bgr_hist)
            # normalize weight and add histogram as feature
            x = [kp.pt[0] / frame.shape[1], kp.pt[1] / frame.shape[0]]
            x = [i * 100 for i in x]
            x += bgr_hist_norm  #adding histogram as features with position
            X.append(x)
            kp.class_id = kp_index
            kp_index = kp_index + 1
            #pprint(dir(kp))
        if item[0] == 2.0:
            db = DBSCAN(eps=3, min_samples=15).fit(X)
        elif item[0] == 1.0:
            db = DBSCAN(eps=5, min_samples=10).fit(X)
        elif item[0] == 0.5:
            db = DBSCAN(eps=7, min_samples=4).fit(X)
        else:
            db = DBSCAN(eps=10, min_samples=2).fit(X)
        labels = db.labels_
        # Number of clusters in labels, ignoring noise if present
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        # assigning corresponding cluster id to the keypoints
        for kp in item[1]:
            kp.class_id = labels[kp.class_id]

        # calculate average response of each cluster
        for clstr_no in list(range(0, n_clusters_)):
            clstr = [kp for kp in item[1] if kp.class_id == clstr_no]
            avg_response = np.average([k.response for k in clstr])
            clusters.append(clstr)
            avg_response_of_clusters.append(avg_response)

        # Parallel sorting of clusters using their avg. response
        if n_clusters_ > 0:
            avg_response_of_clusters, clusters = zip(
                *sorted(zip(avg_response_of_clusters, clusters), reverse=True))
            avg_response_of_clusters = list(avg_response_of_clusters)
            clusters = list(clusters)
            #print(clusters)
            #print(avg_response_of_clusters)

    #quit()

    best_keypoints = []
    frame_attention_window = None

    # find the best available cluster
    for c in clusters:
        aw = cluster_to_window(c)
        octave, layer, scale = get_keypoint_attrs(c[0])
        if window_history.add_if_new(aw, scale):
            frame_attention_window = aw
            if len(sys.argv) > 3:
                best_keypoints += kps  # returning all keypoints for visualization
            else:
                best_keypoints += c  # returning only the best cluster
            break
    '''
	for i in range(len(kps)):
		aw = keypoint_to_window(kps[i])
		octave, layer, scale= get_keypoint_attrs(kps[i])
		kp = kps[i]
		#cv2.imshow("Test1", frame[int(kp.pt[1]-5):int(kp.pt[1]+5), int(kp.pt[0]-5):int(kp.pt[0]+5)])
		if window_history.add_if_new(aw, scale):
			frame_attention_window = aw
			best_keypoints += groups[2.0]
			print(scale, groups[scale][3].class_id)
			break
	'''
    return (frame_attention_window, best_keypoints)
Example No. 12
0
df_abnormal = finalDf.query('actual == 1')

print("normal :", df_normal.shape[0])
print("abnormal :", df_abnormal.shape[0])

df = pd.concat([df_normal, df_abnormal])

names = df['timestamp']
df.drop(columns=['timestamp'], inplace=True)
df['avg'] = df.apply(lambda row: get_weighted_avg(row), axis=1)

df_scaler = StandardScaler().fit(df[['avg']].to_numpy())
newDf = df_scaler.transform(df[['avg']].to_numpy())
# newDf = df[['avg']]

outlier_detection = DBSCAN(eps=0.9, min_samples=100, metric='euclidean')
clusters = outlier_detection.fit_predict(newDf)
df['scores'] = clusters

df['names'] = names
df['pred'] = df.apply(lambda row: 1 if row['scores'] == -1 else 0, axis=1)

outliers = df.query('pred == 1')
normal = df.query('pred == 0')

threedee = plt.figure().gca(projection='3d')
threedee.scatter(normal['pitch'],
                 normal['roll'],
                 normal['yaw'],
                 color="#00FF00",
                 label="normal points")
Example No. 13
0
# - *eps* is the max distance between two samples for one to be considered a 'neighborhood' of the other (so kind of like radius of the cluster)
# - *min_samples* is the number of points in a neighborhood required for a point to be considered a 'core point'
# - *metric* chooses the type of distance
# - *p* is the power parameter in the minkowski distance equation

# In[142]:

get_ipython().run_cell_magic(
    'latex', '',
    '\\begin{align}\n\\mathrm{Minkowski \\,distance} = \\left( \n    \\sum_{i=1}^{n}|X_i - Y_i|^p\n    \\right)^\\frac{1}{p}\n\\end{align}'
)
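# A quick numeric check of the formula above (not part of the original
# notebook): the Minkowski distance reduces to the Manhattan distance at p=1
# and the Euclidean distance at p=2.
from scipy.spatial.distance import cityblock, euclidean, minkowski

a, b = [0.0, 0.0], [3.0, 4.0]
print(minkowski(a, b, p=1), cityblock(a, b))  # 7.0 7.0
print(minkowski(a, b, p=2), euclidean(a, b))  # 5.0 5.0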

# In[143]:

X = StandardScaler().fit_transform(np.asarray(df2.accTotal).reshape(-1, 1))
model = DBSCAN(eps=0.5, min_samples=110, metric='minkowski', p=1.5).fit(X)
model.labels_
true_false = []
for item in model.labels_:
    if item == 0:
        true_false.append(False)
    else:
        true_false.append(True)

anomalies = df2[true_false]
actuals = df2[[not i for i in true_false]]

# In[144]:

plt.plot(anomalies.index, anomalies.accTotal, 'r.')
plt.plot(actuals.index, actuals.accTotal, 'b.')
Example No. 14
0
                type=int,
                default=-1,
                help="# of parallel jobs to run (-1 will use all CPUs)")
args = vars(ap.parse_args())

# load the serialized face encodings + bounding box locations from
# disk, then extract the set of encodings so we can cluster on
# them
print("[INFO] loading encodings...")
data = pickle.loads(open(args["encodings"], "rb").read())
data = np.array(data)
encodings = [d["encoding"] for d in data]

# cluster the embeddings
print("[INFO] clustering...")
clt = DBSCAN(metric="euclidean", n_jobs=args["jobs"])
clt.fit(encodings)

# determine the total number of unique faces found in the dataset
labelIDs = np.unique(clt.labels_)
numUniqueFaces = len(np.where(labelIDs > -1)[0])
print("[INFO] # unique faces: {}".format(numUniqueFaces))

# loop over the unique face integers
for labelID in labelIDs:
    # find all indexes into the `data` array that belong to the
    # current label ID, then randomly sample a maximum of 25 indexes
    # from the set
    print("[INFO] faces for face ID: {}".format(labelID))
    idxs = np.where(clt.labels_ == labelID)[0]
    idxs = np.random.choice(idxs, size=min(25, len(idxs)), replace=False)
    zaxis=dict(
        range=[-5,10],
        title='PC_3',
        gridcolor='rgb(255, 255, 255)',
        zerolinecolor='rgb(255, 255, 255)',
        showbackground=True,
        backgroundcolor='rgb(230, 230,230)',
        showticklabels=False, ticks=''
    )
)

centers = [[1, 1], [-1, -1], [1, -1]]
X = x_pca
y = clust_df_region['Region']

estimators = {'dbscan': DBSCAN(eps=1.9, min_samples=15).fit(X)
              }
fignum = 1
for name, est in estimators.items():
    est.fit(X)
    labels = est.labels_

    trace = go.Scatter3d(x=X[:, 0], y=X[:, 1], z=X[:, 2],
                         showlegend=False,
                         mode='markers',
                         marker=dict(
                                color=labels.astype(float),
                                line=dict(color='black', width=1)
        ))
    fig.append_trace(trace, 1, fignum)
    
def touchdowns(image, n):
    """
    Function to obtain the locations of the touchdown passes from the image
    of the pass chart using k-means, and DBSCAN to account for difficulties in
    extracting touchdown passes, since they are the same color as both the line
    of scrimmage and the attached touchdown trajectory lines.

    Input:
        image: image from the folder 'Cleaned_Pass_Charts'
        n: number of touchdowns, from the corresponding data of the image
    Return:
        call to map_pass_locations:
            centers: list of pass locations in pixels
            col: width of image from which the pass locations were extracted
            pass_type: "TOUCHDOWN"
    """

    im = Image.open(image)
    pix = im.load()
    col, row = im.size

    img = Image.new('RGB', (col, row), 'black')
    p = img.load()

    for i in range(col):
        for j in range(row):
            r = pix[i, j][0]
            g = pix[i, j][1]
            b = pix[i, j][2]
            if (col < 1370) and (j < row - 105) and (j > row - 111):
                if (b > 2 * g) and (b > 60):
                    p[i, j] = (0, 0, 0)
            elif (col > 1370) and (j < row - 81) and (j > row - 86):
                if (b > 2 * g) and (b > 60):
                    p[i, j] = (0, 0, 0)
            else:
                p[i, j] = pix[i, j]
            r = p[i, j][0]
            g = p[i, j][1]
            b = p[i, j][2]
            f = ((r - 20)**2 + (g - 80)**2 + (b - 200)**2)**0.5
            if f < 32 and b > 100:
                p[i, j] = (255, 255, 0)

    #scipy.misc.imsave('temp.jpg', img)
    imageio.imwrite('temp.jpg', img)
    imag = cv2.imread('temp.jpg')
    os.remove('temp.jpg')
    hsv = cv2.cvtColor(imag, cv2.COLOR_BGR2HSV)
    lower = np.array([20, 100, 100])
    upper = np.array([30, 255, 255])
    mask = cv2.inRange(hsv, lower, upper)
    res = cv2.bitwise_and(imag, imag, mask=mask)
    res = cv2.cvtColor(res, cv2.COLOR_HSV2RGB)
    res = cv2.cvtColor(res, cv2.COLOR_BGR2GRAY)
    res = cv2.fastNlMeansDenoising(res, h=10)
    x = np.where(res != 0)[0]
    y = np.where(res != 0)[1]
    pairs = list(zip(x, y))
    X = list(map(list, pairs))

    if (len(pairs) != 0):
        db = DBSCAN(eps=10, min_samples=n).fit(X)
        labels = db.labels_
        coords = pd.DataFrame([x, y, labels]).T
        coords.columns = ['x', 'y', 'label']
        clusters = Counter(labels).most_common(n)
        td_labels = np.array([clust[0] for clust in clusters])
        km_coords = coords.loc[coords['label'].isin(td_labels)]
        km = list(map(list, zip(km_coords.iloc[:, 0], km_coords.iloc[:, 1])))

        kmeans = KMeans(n_clusters=n, random_state=0).fit(km)
        centers = kmeans.cluster_centers_

        return map_pass_locations(centers, col, "TOUCHDOWN")

    else:
        return map_pass_locations([], col, "TOUCHDOWN", n)
Example No. 17
0
        embNump[i] = val.detach().numpy()
        print(embNump.size)
    if input("Save features to file? y/n") == 'y':
        filename = username + '.npy'
        np.save(filename, embNump)
elif input("Load features from data? y/n") == 'y':
    inFile = input("Filename: ")
    embNump = np.load(inFile)

print("Number of Faces: ")
print(len(embNump))
print("Clustering now")
#compute embeddings
#FIND distance matrix:
#dists =  [[(e1 - e2).norm().item() for e1 in embeddings]for e2 in embeddings]
db = DBSCAN(eps=0.8).fit(embNump)
print("Labels: ")
print(db.labels_)
userProfiles = np.empty(0)
userNames = np.empty(0)
if input("load userProfiles? y/n") == 'y':
    userProfiles = np.load('faceDictionary.npy')
    userNames = np.load('userNames.npy')
userProfiles, userNames = addUserToList(db, embNump, userProfiles, userNames,
                                        username)
print(len(userProfiles))
#call function
print("Saving new user profiles...")
np.save('faceDictionary.npy', userProfiles)
np.save('userNames.npy', userNames)
print("Completed adding: " + username)
Example No. 18
0
 def main(self):
     # check input
     path = self.e1.get()
     try:
         image_RGB = plt.imread(path)
         cluster_tolerance = float(self.e2.get())
         pass
     except:
         self.state.set('ERROR')
         self.lstate.config(bg='#FF7F7F')
         self.face.update_idletasks()
         messagebox.showinfo(title='ERROR', message='输入错误!')
         return None
     self.lstate.config(bg='#7FFF7F')
     self.state.set('正常')
     self.face.update_idletasks()
     # show image
     self.state.set('显示图片中。。。')
     self.face.update_idletasks()
     img_open = Image.open(path)
     img = img_open.resize((128, 64))
     img = ImageTk.PhotoImage(img)
     self.lp1.config(image=img)
     self.lp1.image = img
     self.face.update_idletasks()
     # resize to array
     image_RGB = Image.open(path)
     w_resize = 96
     h_resize = int(w_resize*image_RGB.size[1]/image_RGB.size[0])
     image_RGB = image_RGB.resize((w_resize, h_resize))
     image_RGB = np.array(image_RGB)
     # to lab
     self.state.set('转换RGB为LAB中。。。')
     self.face.update_idletasks()
     image_LAB = cv2.cvtColor(image_RGB, cv2.COLOR_RGB2LAB)
     # cluster
     self.state.set('图片聚类中。。。')
     self.face.update_idletasks()
     dbscan = DBSCAN(eps=cluster_tolerance, min_samples=1)
     h_1, w_1, c_1 = image_LAB.shape
     image_data = image_LAB.reshape((h_1*w_1, c_1))
     image_lab_data = []
     # uint8 to true lab
     for data in image_data:
         image_lab_data.append([data[0]*100/255, data[1]-128, data[2]-128])
         pass
     image_lab_data = np.array(image_lab_data)
     dbscan.fit(image_lab_data)
     labels = dbscan.labels_
     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
     # find the cluster center
     themes = []
     clusters_area = []
     for i in range(n_clusters):
         one_cluster = image_lab_data[labels == i]
         if len(one_cluster)!=1:
             km = KMeans(n_clusters=1, max_iter=300)
             km.fit(one_cluster)
             themes.append(np.squeeze(km.cluster_centers_))
             pass
         else:
             themes.append(one_cluster[0])
             pass
         clusters_area.append(len(one_cluster)/len(image_lab_data))
         pass
     themes = np.array(themes)
     # show themes image
     uint8_themes = []
     for theme in themes:
         uint8_themes.append([theme[0]*255/100, theme[1]+128, theme[2]+128])
         pass
     uint8_themes = np.array(uint8_themes)
     pic_array = cv2.cvtColor(np.uint8(uint8_themes.reshape(1, len(uint8_themes), 3)), \
                              cv2.COLOR_LAB2RGB)[0]
     self.themes_pic_array = pic_array
     pic_array = self.make_themes_image(pic_array)
     pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB')
     img = ImageTk.PhotoImage(pic)
     self.lp1c.config(image=img)
     self.lp1c.image = img
     self.face.update_idletasks()
     self.state.set('聚类完成')
     self.face.update_idletasks()
     # sort
     themes_areas = list(zip(clusters_area, themes))
     sorted_themes_areas = sorted(themes_areas, key=lambda x:(x[0]), reverse=True)
     self.listbox.delete(0, tk.END)
     self.face.update_idletasks()
     # write to list box
     sum_area = 0.0
     #info = ' '*5 + '序号' + ' '*28 + 'LAB' + ' '*51 + '面积占比' + ' '*26 + '前n行面积占比之和'
     info = ' '*5 + '序号' + ' '*28 + 'LAB' + ' '*80 + '面积占比'
     self.listbox.insert(tk.END, info)
     self.face.update_idletasks()
     self.state.set('处理列表中。。。请稍后')
     self.face.update_idletasks()
     count = 0
     # for excel
     self.results = []
     for area, theme in sorted_themes_areas:
         count = count + 1
         sum_area += area
         L, A, B = theme
         L = round(L, 3)
         A = round(A, 3)
         B = round(B, 3)
         self.results.append((count, [L, A, B], round(100*area, 3)))
         info = ' '*(8-len(str(count))) + str(count) + ' '*18 + str([L, A, B])
         #info += ' '*(60-len(str([L, A, B]))) + str(round(100*area, 3)) + '%'
         #info += ' '*(125-len(info)) + str(round(100*sum_area, 3)) + '%'
         info += ' '*(90-len(str([L, A, B]))) + str(round(100*area, 3)) + '%'
         self.listbox.insert(tk.END, info)
         self.face.update_idletasks()
         pass
     self.scrollbar.config(command=self.listbox.yview)
     self.face.update_idletasks()
     self.state.set('选取完成')
     self.face.update_idletasks()
     pass
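# The snippet above converts OpenCV's uint8 Lab encoding (all channels in
# 0..255) to true Lab ranges (L in 0..100, a and b in -128..127) element by
# element. A small vectorized helper pair expressing the same mapping (a
# sketch, not part of the original class):
import numpy as np

def cv_lab_to_true_lab(lab_u8):
    lab = np.asarray(lab_u8, dtype=float).reshape(-1, 3)
    return np.column_stack([lab[:, 0] * 100.0 / 255.0,
                            lab[:, 1] - 128.0,
                            lab[:, 2] - 128.0])

def true_lab_to_cv_lab(lab):
    lab = np.asarray(lab, dtype=float).reshape(-1, 3)
    return np.column_stack([lab[:, 0] * 255.0 / 100.0,
                            lab[:, 1] + 128.0,
                            lab[:, 2] + 128.0]).astype(np.uint8)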
Example No. 19
0
def separate_watershed(
    vdf_temp,
    min_distance=1,
    min_size=1,
    max_size=np.inf,
    max_number_of_grains=np.inf,
    marker_radius=1,
    threshold=False,
    exclude_border=False,
    plot_on=False,
):
    """Separate segments from one VDF image using edge-detection by the
    sobel transform and the watershed segmentation implemented in
    scikit-image. See [1,2] for examples from scikit-image.

    Parameters
    ----------
    vdf_temp : np.array
        One VDF image.
    min_distance: int
        Minimum distance (in pixels) between markers for them to be
        considered separate markers for the watershed segmentation.
    min_size : float
        Grains with size (i.e. total number of pixels) below min_size
        are discarded.
    max_size : float
        Grains with size (i.e. total number of pixels) above max_size
        are discarded.
    max_number_of_grains : int
        Maximum number of grains included in the returned separated
        grains. If it is exceeded, those with highest peak intensities
        will be returned.
    marker_radius : float
        If 1 or larger, each marker for watershed is expanded to a disk
        of radius marker_radius. marker_radius should not exceed
        2*min_distance.
    threshold : bool
        If True, a mask is calculated by thresholding the VDF image by
        the Li threshold method in scikit-image. If False (default), the
        mask is the boolean VDF image.
    exclude_border : int or True, optional
        If non-zero integer, peaks within a distance of exclude_border
        from the border will be discarded. If True, peaks at or closer
        than min_distance from the border will be discarded.
    plot_on : bool
        If True, the VDF, the mask, the distance transform
        and the separated grains will be plotted in one figure window.

    Returns
    -------
    sep : np.array
        Array containing segments from VDF images (i.e. separated
        grains). Shape: (image size x, image size y, number of grains)

    References
    ----------
    [1] http://scikit-image.org/docs/dev/auto_examples/segmentation/
        plot_watershed.html
    [2] http://scikit-image.org/docs/dev/auto_examples/xx_applications/
        plot_coins_segmentation.html#sphx-glr-auto-examples-xx-
        applications-plot-coins-segmentation-py
    """

    # Create a mask from the input VDF image.
    if threshold:
        th = threshold_li(vdf_temp)
        mask = np.zeros_like(vdf_temp)
        mask[vdf_temp > th] = True
    else:
        mask = vdf_temp.astype("bool")

    # Calculate the Euclidean distance from each point in the mask to the
    # nearest background point of value 0.
    distance = distance_transform_edt(mask)

    # If exclude_border is given, the edge of the distance is removed
    # by erosion. The distance image is used to find markers, and so the
    # erosion is done to avoid that markers are located at the edge
    # of the mask.
    if exclude_border > 0:
        distance_mask = binary_erosion(distance,
                                       structure=disk(exclude_border))
        distance = distance * distance_mask.astype("bool")

    # Find the coordinates of the local maxima of the distance transform.
    local_maxi = peak_local_max(
        distance,
        indices=False,
        min_distance=1,
        num_peaks=max_number_of_grains,
        exclude_border=exclude_border,
        threshold_rel=None,
    )
    maxi_coord1 = np.where(local_maxi)

    # Discard maxima that are found at pixels that are connected to a
    # smaller number of pixels than min_size. Used as markers, these would lead
    # to segments smaller than min_size and should therefore not be
    # considered when deciding which maxima to use as markers.
    if min_size > 1:
        labels_check = label(mask)[0]
        delete_indices = []
        for i in np.arange(np.shape(maxi_coord1)[1]):
            index = np.transpose(maxi_coord1)[i]
            label_value = labels_check[index[0], index[1]]
            if len(labels_check[labels_check == label_value]) < min_size:
                delete_indices.append(i)
                local_maxi[index[0], index[1]] = False
        maxi_coord1 = np.delete(maxi_coord1, delete_indices, axis=1)

    # Cluster the maxima by DBSCAN based on min_distance. For each
    # cluster, only the maximum closest to the average maxima position is
    # used as a marker.
    if min_distance > 1 and np.shape(maxi_coord1)[1] > 1:
        clusters = DBSCAN(
            eps=min_distance,
            metric="euclidean",
            min_samples=1,
        ).fit(np.transpose(maxi_coord1))
        local_maxi = np.zeros_like(local_maxi)
        for n in np.arange(clusters.labels_.max() + 1):
            maxi_coord1_n = np.transpose(maxi_coord1)[clusters.labels_ == n]
            com = np.average(maxi_coord1_n, axis=0).astype("int")
            index = distance_matrix([com], maxi_coord1_n).argmin()
            index = maxi_coord1_n[index]
            local_maxi[index[0], index[1]] = True

    # Use the resulting maxima as markers. Each marker should have a
    # unique label value. For each maximum, generate markers with the same
    # label value in a radius given by marker_radius centered at the
    # maximum position. This is done to make the segmentation more robust
    # to local changes in pixel values around the marker.
    markers = label(local_maxi)[0]
    if marker_radius >= 1:
        disk_mask = disk(marker_radius)
        for mm in np.arange(1, np.max(markers) + 1):
            im = np.zeros_like(markers)
            im[np.where(markers == mm)] = markers[np.where(markers == mm)]
            markers_temp = convolve2d(im,
                                      disk_mask,
                                      boundary="fill",
                                      mode="same",
                                      fillvalue=0)
            markers[np.where(markers_temp)] = mm
    markers = markers * mask

    # Find the edges of the VDF image using the Sobel transform.
    elevation = sobel(vdf_temp)

    # 'Flood' the elevation (i.e. edge) image from basins at the marker
    # positions. Find the locations where different basins meet, i.e.
    # the watershed lines (segment boundaries). Only search for segments
    # (labels) in the area defined by mask.
    labels = watershed(elevation, markers=markers, mask=mask)

    sep = np.zeros(
        (np.shape(vdf_temp)[0], np.shape(vdf_temp)[1], (np.max(labels))),
        dtype="int32")
    n, i = 1, 0
    while (np.max(labels)) > n - 1:
        sep_temp = labels * (labels == n) / n
        sep_temp = np.nan_to_num(sep_temp)
        # Discard a segment if it is too small or too large, or else add
        # it to the list of separated segments.
        if (np.sum(sep_temp, axis=(0, 1)) < min_size) or np.sum(
                sep_temp, axis=(0, 1)) > max_size:
            sep = np.delete(sep, ((n - i) - 1), axis=2)
            i = i + 1
        else:
            sep[:, :, (n - i) - 1] = sep_temp
        n = n + 1
    # Put the intensity from the input VDF image into each segmented area.
    vdf_sep = np.broadcast_to(vdf_temp.T, np.shape(sep.T)) * (sep.T == 1)

    if plot_on:  # pragma: no cover
        # If segments have been discarded, make new labels that do not
        # include the discarded segments.
        if np.max(labels) != (np.shape(sep)[2]) and (np.shape(sep)[2] != 0):
            labels = sep[:, :, 0]
            for i in range(1, np.shape(sep)[2]):
                labels = labels + sep[..., i] * (i + 1)
        # If no separated particles were found, set all elements in
        # labels to 0.
        elif np.shape(sep)[2] == 0:
            labels = np.zeros(np.shape(labels))

        seps_img_sum = np.zeros_like(vdf_temp).astype("float64")
        for lbl, vdf in zip(np.arange(1, np.max(labels) + 1), vdf_sep):
            mask_l = np.zeros_like(labels).astype("bool")
            _idx = np.where(labels == lbl)
            mask_l[_idx] = 1
            seps_img_sum += vdf_temp * mask_l / np.max(vdf_temp[_idx])
            seps_img_sum[_idx] += lbl

        maxi_coord = np.where(local_maxi)

        fig, axes = plt.subplots(2, 3, sharex=True, sharey=True)
        ax = axes.ravel()

        ax[0].imshow(vdf_temp, cmap=plt.cm.magma_r)
        ax[0].axis("off")
        ax[0].set_title("VDF")

        ax[1].imshow(mask, cmap=plt.cm.gray_r)
        ax[1].axis("off")
        ax[1].set_title("Mask")

        ax[2].imshow(distance, cmap=plt.cm.gray_r)
        ax[2].axis("off")
        ax[2].set_title("Distance and markers")
        ax[2].imshow(masked_where(markers == 0, markers),
                     cmap=plt.cm.gist_rainbow)
        ax[2].plot(maxi_coord1[1], maxi_coord1[0], "k+")
        ax[2].plot(maxi_coord[1], maxi_coord[0], "gx")

        ax[3].imshow(elevation, cmap=plt.cm.magma_r)
        ax[3].axis("off")
        ax[3].set_title("Elevation")

        ax[4].imshow(labels, cmap=plt.cm.gnuplot2_r)
        ax[4].axis("off")
        ax[4].set_title("Labels")

        ax[5].imshow(seps_img_sum, cmap=plt.cm.magma_r)
        ax[5].axis("off")
        ax[5].set_title("Segments")

    return vdf_sep
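# A minimal usage sketch for separate_watershed on a synthetic VDF image
# (assumes the scikit-image / scipy helpers imported by the original module):
import numpy as np

vdf = np.zeros((64, 64))
vdf[10:20, 10:20] = 1.0     # two artificial "grains"
vdf[40:55, 35:50] = 0.5
segments = separate_watershed(vdf, min_distance=3, min_size=10, marker_radius=2)
print(segments.shape)        # (n_grains, 64, 64): one masked copy of vdf per grain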
Example No. 20
0
 def main(self):    
     image_1_path = self.e1.get()
     image_2_path = self.e2.get()
     try:
         image_1_RGB = plt.imread(image_1_path)
         image_2_RGB = plt.imread(image_2_path)
         cluster_tolerance = float(self.e3.get())
         color_tolerance = float(self.e4.get())
         pass
     except:
         self.state.set('ERROR')
         self.lstate.config(bg='#FF7F7F')
         self.face.update_idletasks()
         messagebox.showinfo(title='ERROR', message='输入错误!')
         return None
     self.lstate.config(bg='#7FFF7F')
     self.face.update_idletasks()
     # show images
     self.state.set('显示图片中。。。')
     self.face.update_idletasks()
     img_open = Image.open(self.e1.get())
     img = img_open.resize((128, 64))
     img = ImageTk.PhotoImage(img)
     self.lp1.config(image=img)
     self.lp1.image = img
     self.face.update_idletasks()
     img_open = Image.open(self.e2.get())
     img = img_open.resize((128, 64))
     img = ImageTk.PhotoImage(img)
     self.lp2.config(image=img)
     self.lp2.image = img
     self.face.update_idletasks()
     # resize to speed up
     image_1_RGB = Image.open(image_1_path)
     w_resize = 96
     h_resize = int(w_resize*image_1_RGB.size[1]/image_1_RGB.size[0])
     image_1_RGB = image_1_RGB.resize((w_resize, h_resize))
     image_1_RGB = np.array(image_1_RGB)
     # resize to speed up
     image_2_RGB = Image.open(image_2_path)
     w_resize = 96
     h_resize = int(w_resize*image_2_RGB.size[1]/image_2_RGB.size[0])
     image_2_RGB = image_2_RGB.resize((w_resize, h_resize))
     image_2_RGB = np.array(image_2_RGB)
     # to lab
     self.state.set('转换RGB为LAB中。。。')
     self.face.update_idletasks()
     image_1_LAB = cv2.cvtColor(image_1_RGB, cv2.COLOR_RGB2LAB)
     image_2_LAB = cv2.cvtColor(image_2_RGB, cv2.COLOR_RGB2LAB)  
     # image 1
     self.state.set('第一张图片聚类中。。。')
     self.face.update_idletasks()
     dbscan1 = DBSCAN(eps=cluster_tolerance, min_samples=1)
     h_1, w_1, c_1 = image_1_LAB.shape
     image_1_data = image_1_LAB.reshape((h_1*w_1, c_1))
     image_1_lab_data = []
     for data in image_1_data:
         image_1_lab_data.append([data[0]*100/255, data[1]-128, data[2]-128])
         pass
     image_1_lab_data = np.array(image_1_lab_data)
     dbscan1.fit(image_1_lab_data)
     labels = dbscan1.labels_
     n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)
     # find the cluster center
     themes_1 = []
     clusters_area_1 = []
     for i in range(n_clusters_1):
         one_cluster = image_1_lab_data[labels == i]
         if len(one_cluster)!=1:
             km = KMeans(n_clusters=1, max_iter=300)
             km.fit(one_cluster)
             themes_1.append(np.squeeze(km.cluster_centers_))
             pass
         else:
             themes_1.append(one_cluster[0])
             pass
         clusters_area_1.append(len(one_cluster)/len(image_1_lab_data))
         pass
     themes_1 = np.array(themes_1)
     # show image
     uint8_themes_1 = []
     for theme in themes_1:
         uint8_themes_1.append([theme[0]*255/100, theme[1]+128, theme[2]+128])
         pass
     uint8_themes_1 = np.array(uint8_themes_1)   
     pic_array = cv2.cvtColor(np.uint8(uint8_themes_1.reshape(1, len(uint8_themes_1), 3)), \
                              cv2.COLOR_LAB2RGB)
     pic_array = self.make_themes_image(pic_array[0])
     pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB')
     img = ImageTk.PhotoImage(pic)
     self.lp1c.config(image=img)
     self.lp1c.image = img
     self.face.update_idletasks()
     # image 2
     self.state.set('第二张图片聚类中。。。')
     self.face.update_idletasks()
     dbscan2 = DBSCAN(eps=cluster_tolerance, min_samples=1)
     h_2, w_2, c_2 = image_2_LAB.shape
     image_2_data = image_2_LAB.reshape((h_2*w_2, c_2))
     image_2_lab_data = []
     for data in image_2_data:
         image_2_lab_data.append([data[0]*100/255, data[1]-128, data[2]-128])
         pass
     image_2_lab_data = np.array(image_2_lab_data)
     dbscan2.fit(image_2_lab_data)
     labels = dbscan2.labels_
     n_clusters_2 = len(set(labels)) - (1 if -1 in labels else 0)
     # find the cluster center
     themes_2 = []
     clusters_area_2 = []
     for i in range(n_clusters_2):
         one_cluster = image_2_lab_data[labels == i]
         if len(one_cluster)!=1:
             km = KMeans(n_clusters=1, max_iter=300)
             km.fit(one_cluster)
             themes_2.append(np.squeeze(km.cluster_centers_))
             pass
         else:
             themes_2.append(one_cluster[0])
             pass
         clusters_area_2.append(len(one_cluster)/len(image_2_lab_data))
         pass
     themes_2 = np.array(themes_2)
     # show image
     uint8_themes_2 = []
     for theme in themes_2:
         uint8_themes_2.append([theme[0]*255/100, theme[1]+128, theme[2]+128])
         pass
     uint8_themes_2 = np.array(uint8_themes_2)
     pic_array = cv2.cvtColor(np.uint8(uint8_themes_2.reshape(1, len(uint8_themes_2), 3)), \
                              cv2.COLOR_LAB2RGB)
     pic_array = self.make_themes_image(pic_array[0])
     pic = Image.fromarray(pic_array.astype('uint8')).convert('RGB')
     img = ImageTk.PhotoImage(pic)
     self.lp2c.config(image=img)
     self.lp2c.image = img
     self.face.update_idletasks()
     self.state.set('聚类完成')
     self.face.update_idletasks()
     # select common color
     Image_1_Area = clusters_area_1[:]
     Image_2_Area = clusters_area_2[:]
     self.state.set('共同色选取中。。。')
     self.face.update_idletasks()
     common_color_infos = []
     for i in range(n_clusters_1):
         L1 = themes_1[i][0]
         A1 = themes_1[i][1]
         B1 = themes_1[i][2]
         LAB1 = [L1, A1, B1]
         for j in range(n_clusters_2):
             L2 = themes_2[j][0]
             A2 = themes_2[j][1]
             B2 = themes_2[j][2]
             LAB2 = [L2, A2, B2]
             deltaE = self.calc_chromatism(LAB1, LAB2)
             if deltaE <= color_tolerance:
                 S1 = Image_1_Area[i] / (Image_1_Area[i] + Image_2_Area[j])
                 S2 = Image_2_Area[j] / (Image_1_Area[i] + Image_2_Area[j])
                 L3 = L1 * S1 + L2 * S2
                 A3 = A1 * S1 + A2 * S2
                 B3 = B1 * S1 + B2 * S2
                 L1 = round(L1, 3)
                 A1 = round(A1, 3)
                 B1 = round(B1, 3)
                 L2 = round(L2, 3)
                 A2 = round(A2, 3)
                 B2 = round(B2, 3)
                 L3 = round(L3, 3)
                 A3 = round(A3, 3)
                 B3 = round(B3, 3)
                 LAB1 = [L1, A1, B1]
                 LAB2 = [L2, A2, B2]
                 LAB3 = [L3, A3, B3]
                 selected_std_color = select_std_color(LAB3)
                 selected_std_color_lab = std_colors[selected_std_color]
                 # knn
                 label = colour_classify(selected_std_color_lab, \
                                         image_1_lab_data, np.squeeze(dbscan1.labels_), k=10)
                 # area
                 std_color_area_1 = clusters_area_1[label]
                 # knn
                 label = colour_classify(selected_std_color_lab, \
                                         image_2_lab_data, np.squeeze(dbscan2.labels_), k=10)
                 # area
                 std_color_area_2 = clusters_area_2[label]
                 # info
                 info = (LAB3, LAB1, Image_1_Area[i], LAB2, Image_2_Area[j], \
                         selected_std_color, std_color_area_1, std_color_area_2)
                 common_color_infos.append(info)
                 pass
             pass
         pass
     self.state.set('共同色选取完成')
     self.face.update_idletasks()
     selected_std_colors = []
     # keys: number of appearances; values: selected std colors
     dict_selected_std_colors = {}
     # number of appearances
     std_colors_nums = []
     for i in range(len(common_color_infos)):
         # std color index: -3
         selected_std_colors.append(common_color_infos[i][-3])
         selected_std_colors_set = set(selected_std_colors)
         pass
     for selected_std_color in selected_std_colors_set:
         num = selected_std_colors.count(selected_std_color)
         if str(num) not in dict_selected_std_colors.keys():  
             std_colors_nums.append(num)
             dict_selected_std_colors[str(num)] = [selected_std_color]
             pass
         else:
             dict_selected_std_colors[str(num)].append(selected_std_color)
             pass
         pass
     std_colors_nums.sort(reverse=True)
     # list box
     index = 0
     self.listbox.delete(0, tk.END)
     self.face.update_idletasks()
     info = ' '*2 + str('') + ' '*(7-len(str('')))
     info += ' '*8 + str('') + ' '*(17-len(str('')))
     info += ' '*12
     info += ' '*15 + str('背景图片一') + ' '*(20-len(str('背景图片一')))
     info += ' '*15 + str('背景图片二') + ' '*(20-len(str('背景图片二')))
     info += ' '*16 + str('军标色') + ' '*(14-len(str('军标色'))) 
     self.listbox.insert(tk.END, info)
     self.face.update_idletasks()   
     info = ' '*1 + str('序号') + ' '*(8-len(str('序号')))
     info += ' '*8 + str('共同色') + ' '*(17-len(str('共同色')))
     info += ' '*10 + str('LAB1') + ' '*(15-len(str('LAB1')))
     info += ' '*5 + str('Area1') + ' '*(5-len(str('Area1')))
     info += ' '*10 + str('LAB2') + ' '*(15-len(str('LAB2')))
     info += ' '*5 + str('Area2') + ' '*(5-len(str('Area2')))
     info += ' '*6 + str('色号') + ' '*(4-len(str('色号'))) 
     info += ' '*5 + str('图片一占比') + ' '*(5-len(str('图片一占比'))-1)
     info += ' '*5 + str('图片二占比') + ' '*(5-len(str('图片二占比'))-1)
     self.listbox.insert(tk.END, info)
     self.face.update_idletasks()
     self.state.set('处理列表中。。。请稍后')
     self.face.update_idletasks()
     self.results = []
     for num in std_colors_nums:
         for color in dict_selected_std_colors[str(num)]:
             count = 0
             for color_info in common_color_infos:
                 # std color index: -3
                 if color_info[-3] == color:
                     index += 1
                     count += 1
                     c3, c1, a1, c2, a2, \
                     sc, sca1, sca2 = color_info
                     a1 = round(100*a1, 3)
                     a2 = round(100*a2, 3)
                     sca1 = round(100*sca1, 3)
                     sca2 = round(100*sca2, 3)
                     if count<=1:
                         self.results.append((index, c3, c1, a1, c2, a2, sc, \
                                             sca1, sca2))
                         pass
                     else:
                         self.results.append((index, c3, c1, a1, c2, a2, sc, \
                                             None, None))
                         pass
                     info = ' '*2 + str(index) + ' '*(7-len(str(index)))
                     info += str(c3) + ' '*(25-len(str(c3)))
                     info += str(c1) + ' '*(25-len(str(c1)))
                     info += str(a1) + '%' + ' '*(10-len(str(a1))-1)
                     info += str(c2) + ' '*(25-len(str(c2)))
                     info += str(a2) + '%' + ' '*(10-len(str(a2))-1)
                     info += str(sc) + ' '*(10-len(str(sc))) 
                     if count<=1:
                         info += str(sca1) + '%' + ' '*(10-len(str(sca1))-1)
                         info += str(sca2) + '%' + ' '*(10-len(str(sca2))-1)
                         pass
                     self.listbox.insert(tk.END, info)
                     self.face.update_idletasks()
                     pass
                 pass
             pass
         pass
     self.scrollbar.config(command=self.listbox.yview)
     self.face.update_idletasks()
     self.state.set('选取完成')
     self.face.update_idletasks()
     pass
Example No. 21
0
        count += 1
        if count > 300:
            break
    idx += 6000

# from list to array
sel_label = np.array(sel_label)
sel_data = np.array(sel_data)

# t-SNE (2-D)
tsne = TSNE(n_components=2, perplexity=30).fit_transform(sel_data)

# PCA (2-D)
pca = PCA(n_components=2)
pca = pca.fit_transform(sel_data, sel_label)

# Clustering
clustering = DBSCAN(eps=4, min_samples=20).fit(tsne)
clustering_pca = DBSCAN(eps=60, min_samples=8).fit(pca)

# Plot
DR_Plot(sel_label, tsne, 'tsne')
DR_Plot(sel_label, pca, 'pca')

DR_Plot(clustering.labels_, tsne, 'clustering_tsne')
DR_Plot(clustering_pca.labels_, pca, 'clustering_pca')

DR_Plot_black(tsne, 'tsne_black')
DR_Plot_black(pca, 'pca_black')
Example No. 22
0
    def compute_lenta_dbscan(self,
                             algorithm='auto',
                             allowed_hierarchic_iterations=10):
        """ Extract the silhouette value for every cluster, and iterate the
        clustering algorithms over those cluster with negative silhouette values"""

        logging.debug("Starting computing ITERATIVE-DBSCAN," + "at {0}".format(
            datetime.datetime.fromtimestamp(time.time()).strftime(
                '%Y-%m-%d %H:%M:%S')))

        self.select_automatic_epsilon()

        # compute classic dbscan in first instance
        first_db = DBSCAN(
            metric='precomputed',  # Use a pre-computed distance matrix
            eps=self.epsilon,
            min_samples=self.min_points,
            algorithm=algorithm).fit(self.D)

        # extract infos on core points
        self.core_samples_mask = np.zeros_like(first_db.labels_, dtype=bool)
        self.core_samples_mask[first_db.core_sample_indices_] = True

        self.labels = np.asarray(first_db.labels_.astype(int))

        initial_max_label = max(self.labels)

        logging.debug("Initially extracted {0} clusters".format(
            sum(x != -1 for x in set(self.labels))))

        self.compute_silhouette()

        if max(self.labels) > 0:
            flag = True
            label = 0
            while flag:
                cluster_matrix = np.array([])
                file_mmap = None
                # create a mask for the elements belonging to that label
                index_mask = self.labels == label
                cl_indexes = np.array(range(len(self.labels)))[index_mask]

                # compute silhouette formula
                logging.debug("CLUSTER {0}".format(label))

                logging.debug("Number of elements in the cluster:" +
                              "%i" % np.count_nonzero(self.labels == label))

                # Try to group differently points in clusters with bad silhouette
                if self.mean_silhouette_labels[label] < self.smin:
                    if allowed_hierarchic_iterations > 0 and np.count_nonzero(
                            self.labels == label) > self.min_points:

                        # Re-apply DBSCAN only over the elements of the cluster
                        cluster_matrix = self.extract_sub_matrix(
                            cl_indexes, index_mask, self.D)
                        print(cluster_matrix)
                        logging.debug(
                            os.listdir(self.output_folder + '/results'))
                        elected_epsilon = self.compute_kdist_graph(
                            cluster_matrix, self.min_points)
                        try:
                            db_hier = DBSCAN(
                                metric='precomputed',  # Use a pre-computed distance matrix
                                eps=elected_epsilon,
                                min_samples=self.min_points,
                                algorithm=algorithm).fit(cluster_matrix)

                            db_hier_labels = db_hier.labels_

                            logging.debug("From cluster {0} ".format(label) +
                                          "extracted {0} clusters".format(
                                              max(db_hier_labels) + 1))

                            # if we are in a no-end loop
                            if max(db_hier_labels) + 1 == 1 and label == max(
                                    self.labels):
                                for cl_index in cl_indexes:
                                    self.labels[cl_index] = -1
                                allowed_hierarchic_iterations = 0
                                logging.debug(
                                    "From cluster {0} it is not possible "
                                    "to extract more clusters".format(label))
                                #del self.mean_silhouette_values[label]
                                self.compute_silhouette()
                            else:
                                allowed_hierarchic_iterations -= 1
                                logging.debug("Extracting new clusters")
                                logging.debug(
                                    os.listdir(self.output_folder +
                                               '/results'))
                                db_hier_labels = np.array([
                                    lab + (max(self.labels) + 1)
                                    if lab != -1 else lab
                                    for lab in db_hier_labels
                                ])
                                logging.debug("Updating the system")
                                # Update the labels' list with the new labels
                                for i, cl_index in enumerate(cl_indexes):
                                    self.labels[cl_index] = db_hier_labels[i]
                                logging.debug("re-compute silhouette")
                                logging.debug(
                                    os.listdir(self.output_folder +
                                               '/results'))
                                self.compute_silhouette()
                        finally:
                            logging.debug("Finally " + str(
                                os.listdir(self.output_folder + '/results')))
                            cluster_matrix_filename = cluster_matrix.filename
                            del cluster_matrix
                            os.remove(cluster_matrix_filename)

                    else:
                        for cl_index in cl_indexes:
                            self.labels[cl_index] = -1

                label += 1
                if label > max(self.labels):
                    flag = False

        logging.debug("Finished computing ITERATIVE-DBSCAN," + "at {0}".format(
            datetime.datetime.fromtimestamp(time.time()).strftime(
                '%Y-%m-%d %H:%M:%S')))

        return self.labels
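# compute_silhouette() and mean_silhouette_labels are not shown above. The criterion the
# method relies on can be sketched with scikit-learn's silhouette_samples on the same
# precomputed distance matrix D (the function name and threshold handling are assumptions):
import numpy as np
from sklearn.metrics import silhouette_samples

def mean_silhouette_per_cluster(D, labels):
    # requires at least two distinct label values, otherwise the silhouette is undefined
    labels = np.asarray(labels)
    sil = silhouette_samples(D, labels, metric='precomputed')
    return {lab: sil[labels == lab].mean() for lab in set(labels) if lab != -1}

# clusters whose mean silhouette falls below smin are the ones compute_lenta_dbscan re-clusters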
Exemplo n.º 23
0



# #############################################################################
# Generate sample data

# #############################################################################
# Compute DBSCAN
eps_range=[]
for i in range(1,50):
    eps_range.append(i*0.1)
for eps in eps_range:
    for min_pts in range(1, 10):  # DBSCAN's min_samples must be at least 1
        X = np.array(values)
        db = DBSCAN(eps=eps, min_samples=min_pts).fit(X)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        label = db.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(label)) - (1 if -1 in label else 0)

        
        if n_clusters_ == 4:
            print('Estimated number of clusters: %d' % n_clusters_)
            print("Eps value :"+str(eps)+" min_pts: "+str(min_pts))
# #############################################################################
# Plot result
'''import matplotlib.pyplot as plt
Exemplo n.º 24
0
def dbscan_func(all_pos, mol_list, col_pos, col_neg):
    X = all_pos
    # = StandardScaler().fit_transform(all_pos)

    # #############################################################################
    # Compute DBSCAN

    db = DBSCAN(eps=0.8, min_samples=4).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(X, labels))

    # #############################################################################
    # Plot result

    new_xyz = []  # np.zeros((X.shape[0],3))
    new_cols = []  # np.zeros((X.shape[0], 4))
    new_labels = []
    new_op = []

    fig = plt.figure()
    ax = Axes3D(fig)
    # Black removed and is used for noise instead.
    plotly_data = []
    unique_labels = set(labels)
    colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]

    # print("Color variety: ", len(colors))
    # print(colors)

    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise.
            col = [0, 0, 0, 0.3]

        class_member_mask = (labels == k)

        xy = X[class_member_mask & core_samples_mask]

        for tmp_xy in xy:
            _id, dh, tds, dg = find_mol_id(tmp_xy, mol_list)
            # new_xyz.append(tmp_xy)

            if k == -1:
                pass
                # new_cols.append(tuple(col))
                # new_labels.append('Unclustered')
                # new_op.append(float(0.3))
            else:
                #col_pos = [255, 204, 0, 1]
                #col_neg = [204, 0, 153, 1]
                # new_cols.append(tuple(col))
                if float(dg) < 0.0:
                    new_cols.append(col_pos)
                else:
                    new_cols.append(col_neg)
                new_xyz.append(tmp_xy)
                new_labels.append('Cluster: ' + str(k) + '<br>id: ' + _id \
                                  + '<br>dH: ' + dh + '<br>TdS: ' + tds + '<br>dG: ' + dg)
                new_op.append(float(1.0))

        ax.scatter(xy[:, 0], xy[:, 1], xy[:, 2], marker='o', c=tuple(col),
                   edgecolors='k', s=14)

        xy = X[class_member_mask & ~core_samples_mask]

        for tmp_xy in xy:
            # print(tmp_xy)
            _id, dh, tds, dg = find_mol_id(tmp_xy, mol_list)

            # new_xyz.append(tmp_xy)

            if k == -1:
                pass
                # new_cols.append(tuple(col))
                # new_labels.append('Unclustered')
                # new_op.append(float(0.3))
            else:

                #col_pos = [255, 204, 0, 1]
                #col_neg = [204, 0, 153, 1]
                # new_cols.append(tuple(col))
                if float(dg) < 0.0:
                    new_cols.append(col_pos)
                else:
                    new_cols.append(col_neg)
                new_xyz.append(tmp_xy)
                new_labels.append('Cluster: ' + str(k) + '<br>id: ' + _id \
                                  + '<br>dH: ' + dh + '<br>TdS: ' + tds + '<br>dG: ' + dg)
                new_op.append(float(1.0))

        ax.scatter(xy[:, 0], xy[:, 1], xy[:, 2], marker='o', c=tuple(col),
                   edgecolors='k', s=6)

    new_xyz = np.asarray(new_xyz)
    print(len(new_op))

    # names = set_mol_info(X)
    remove_idx = []
    for lbl_tmp, opa_tmp, col_tmp, xyz_tmp, idx_tmp in zip(new_labels, new_op, new_cols, new_xyz,
                                                           range(len(new_labels))):
        if lbl_tmp != 'Unclustered':
            remove_idx.append(idx_tmp)

    #    new_labels = new_labels.pop(for r in remove_idx)
    #    new_xyz =
    #    new_cols =
    #    new_op =
    return new_xyz, new_cols, new_labels, new_op
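# Caveat on the silhouette print in dbscan_func above: metrics.silhouette_score raises a
# ValueError when DBSCAN returns fewer than two distinct label values (e.g. everything is
# noise). A hedged guard, sketched with a hypothetical helper name:
from sklearn import metrics

def safe_silhouette(X, labels):
    if len(set(labels)) > 1:
        return metrics.silhouette_score(X, labels)
    return float('nan')  # not enough distinct labels for a silhouette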
Exemplo n.º 25
0
def main(args, pnet, rnet, onet):
    _pnet = pnet
    _rnet = rnet
    _onet = onet

    # Instantiate the class containing the functions to call the Reddit API
    functions = reddit_functions.Functions()

    # Open the facenet model
    with tf.gfile.FastGFile(args.model, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')

    with tf.Session() as sess:
        # Open the text file containing the names into a variable
        with open(args.names) as file:
            # Read the text file containing the names line by line
            for name in file.readlines():
                # Get the posts from a subreddit; strip the name so the trailing newline
                # is removed (most subreddits are named like "NameLastname")
                posts = functions.get_posts(name.strip(), str(args.limit))

                # Check if the Reddit API returned something
                if posts is not None:
                    # Get the images from the posts from the sub Reddit
                    images = functions.get_images(posts)

                    # Check if the image list is not empty
                    if images is not None:
                        # Align the image data
                        images_aligned = align_data(images, args.image_size,
                                                    args.margin, _pnet, _rnet,
                                                    _onet)

                        # Get the required input and output tensors
                        images_placeholder = sess.graph.get_tensor_by_name(
                            "input:0")
                        embeddings = sess.graph.get_tensor_by_name(
                            "embeddings:0")
                        phase_train_placeholder = sess.graph.get_tensor_by_name(
                            "phase_train:0")
                        feed_dict = {
                            images_placeholder: images_aligned,
                            phase_train_placeholder: False
                        }
                        emb = sess.run(embeddings, feed_dict=feed_dict)

                        # Get number of faces in the list after alignment
                        nrof_images = len(images_aligned)

                        print(nrof_images)

                        # Create empty distance matrix
                        matrix = np.zeros((nrof_images, nrof_images))
                        for i in range(nrof_images):
                            for j in range(nrof_images):
                                # Calc distance and fill the matrix
                                dist = np.sqrt(
                                    np.sum(
                                        np.square(
                                            np.subtract(emb[i, :],
                                                        emb[j, :]))))
                                matrix[i][j] = dist

                        # Instantiate the cluster algorithm; eps is the maximum distance for two samples to be considered neighbours
                        db = DBSCAN(eps=1, min_samples=5, metric='precomputed')
                        # Fit the distance matrix to the algorithm
                        db.fit(matrix)
                        labels = db.labels_

                        # Find how many clusters there are
                        no_clusters = len(
                            set(labels)) - (1 if -1 in labels else 0)

                        # Check if there is more than 1 cluster
                        if no_clusters > 0:
                            print('No of clusters:', no_clusters)
                            biggest_cluster = 0
                            len_biggest_cluster = 0
                            for i in range(no_clusters):
                                print('Cluster ' + str(i) + ' : ',
                                      np.nonzero(labels == i)[0])
                                # Find the biggest cluster
                                if len(np.nonzero(
                                        labels == i)[0]) > len_biggest_cluster:
                                    biggest_cluster = i
                                    len_biggest_cluster = len(
                                        np.nonzero(labels == i)[0])

                            print('Biggest cluster: ' + str(biggest_cluster))
                            cnt = 1
                            # Putting the full path in a variable to make it easy
                            path = os.path.join(args.out_dir,
                                                str(name.strip()))
                            if not os.path.exists(path):
                                # Create a dir in the chosen output location named after the person's subreddit if it doesn't exist
                                os.makedirs(path)
                                # Loop over the image array positions in the largest cluster
                                for j in np.nonzero(
                                        labels == biggest_cluster)[0]:
                                    # Save the image to the output dir
                                    misc.imsave(
                                        os.path.join(
                                            path,
                                            name.strip() + '_' +
                                            str('%0*d' % (4, cnt)) + '.png'),
                                        images_aligned[j])
                                    cnt += 1
                            else:
                                for j in np.nonzero(
                                        labels == biggest_cluster)[0]:
                                    misc.imsave(
                                        os.path.join(
                                            path,
                                            name.strip() + '_' +
                                            str('%0*d' % (4, cnt)) + '.png'),
                                        images_aligned[j])
                                    cnt += 1
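# Side note on the example above: the nested i/j loop builds the pairwise Euclidean
# distance matrix one entry at a time. A sketch of the same matrix in a single vectorised
# call (emb is the (n_faces, embedding_dim) array computed above; results match the loop
# up to floating-point noise):
from sklearn.metrics.pairwise import euclidean_distances

matrix = euclidean_distances(emb)
db = DBSCAN(eps=1, min_samples=5, metric='precomputed').fit(matrix)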
Exemplo n.º 26
0
    def classifierData(self, pipelineDict):

        self.__preprocessData__()
        if pipelineDict["scaler"] == "MinMaxScaler":
            scaler = EachMinMaxScaler(self.scaleList)
        elif pipelineDict["scaler"] == "RobustScaler":
            scaler = RobustScaler()
        elif pipelineDict["scaler"] == "Normalizer":
            scaler = Normalizer()
        elif pipelineDict["scaler"] == "StandardScaler":
            scaler = StandardScaler()
        elif pipelineDict["scaler"] == "None":
            scaler = None
        else:
            raise TypeError

        if pipelineDict["reduceDim"] == "TSNE":
            reduceDimension = TSNE(random_state=0)
        elif pipelineDict["reduceDim"] == "PCA":
            reduceDimension = PCA(n_components=2)
        elif pipelineDict["reduceDim"] == "None":
            reduceDimension = None
        else:
            raise TypeError

        if pipelineDict["cluster"] == "DBSCAN":
            eps = pipelineDict["params"]["eps"]
            min_samples = pipelineDict["params"]["min_samples"]
            cluster = DBSCAN(eps=float(eps), min_samples=int(min_samples))
        else:
            if pipelineDict["cluster"] == "KMeans":
                n_clusters = pipelineDict["params"]["n_clusters"]
                cluster = KMeans(n_clusters=int(n_clusters))
            elif pipelineDict["cluster"] == "Agglomerative":
                n_clusters = pipelineDict["params"]["n_clusters"]
                cluster = AgglomerativeClustering(n_clusters=int(n_clusters))
            else:
                raise TypeError

        pipe = chain([("scaler", scaler), ("reduceDim", reduceDimension),
                      ("cluster", cluster)])

        labels = pipe.fit_predict(self.theFinalData)

        eachScaledData = pipe.named_steps("scaler_output")

        cluster = pipe.named_steps("cluster")

        championClusters = []
        spellClusters = []
        try:
            for index in range(cluster.n_clusters):
                championClusters.append([])
                spellClusters.append([])
        except AttributeError:
            maxLabel = 0
            for label in labels:
                if label > maxLabel:
                    maxLabel = label
            for index in range(maxLabel + 2):
                championClusters.append([])
                spellClusters.append([])

        for i, label in enumerate(labels.tolist()):
            tableData = []
            tableData.append(self.championNameList[i])
            tableData.extend(
                self.theFinalData[i][0:len(self.data_feature_names[0:-3])])
            tableData.extend(self.spellNameList[i])
            championClusters[label].append(tableData)
            spellClusters[label].append(self.spellNameList[i])

        dim2Output = pipe.named_steps("reduceDim_output")
        try:
            if dim2Output is None:
                reduceDimension = TSNE(random_state=0)
                scaledData = pipe.named_steps("scaler_output")
                try:
                    if scaledData is None:
                        scaledData = self.theFinalData
                except ValueError:
                    pass
                finally:
                    dim2Output = reduceDimension.fit_transform(scaledData)
        except ValueError:
            pass

        plt.clf()
        plt.xlim(dim2Output[:, 0].min(), dim2Output[:, 0].max() + 1)
        plt.ylim(dim2Output[:, 1].min(), dim2Output[:, 1].max() + 1)

        colors = []
        for _ in championClusters:
            rgbValue = random.randrange(0, 16777216 - 1)
            rgbValue = "%X" % (rgbValue)
            rgbString = ""
            for _ in range(6 - len(rgbValue)):
                rgbString = "0" + rgbString
            rgbString = "#" + rgbString + rgbValue
            colors.append(rgbString)
        colors[-1] = "#000000"

        for i in range(len(dim2Output)):
            plt.text(dim2Output[i, 0],
                     dim2Output[i, 1],
                     str(self.championNameList[i]),
                     color=colors[labels[i]])
        return championClusters
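# chain(...) and the "*_output" steps above are project-specific helpers, not scikit-learn's
# Pipeline. A rough sketch of a comparable flow with stock scikit-learn objects (the input
# name X_some_data and the DBSCAN parameters are hypothetical, and this is not the original
# chain implementation):
from sklearn.pipeline import make_pipeline

sketch_pipe = make_pipeline(StandardScaler(), PCA(n_components=2),
                            DBSCAN(eps=0.5, min_samples=5))
sketch_labels = sketch_pipe.fit_predict(X_some_data)  # X_some_data: hypothetical feature matrix
# note: TSNE only implements fit_transform, so a TSNE step here could be fitted once but
# never applied to new data, which is one possible reason a custom chain is used instead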
Exemplo n.º 27
0
            print(d)
    for i, d in enumerate(l_docs):
        if type(d) == list:
            l_docs[i] = ""

    vectorizer = CountVectorizer(strip_accents="unicode", max_df=0.8, stop_words=get_stop_words())
    counts = vectorizer.fit_transform(l_docs)
    tfidf_transformer = TfidfTransformer().fit_transform(counts)
    l_target_en = target_encode(l_target)
    centers = [[1, 1], [-1, -1], [1, -1]]

    X = StandardScaler().fit_transform(tfidf_transformer.todense())

    # #############################################################################
    # Compute DBSCAN
    db = DBSCAN(eps=0.3, min_samples=10).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    print('Estimated number of clusters: %d' % n_clusters_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(l_target_en, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(l_target_en, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(l_target_en, labels))
    print("Adjusted Rand Index: %0.3f"
          % metrics.adjusted_rand_score(l_target_en, labels))
    print("Adjusted Mutual Information: %0.3f"
          % metrics.adjusted_mutual_info_score(l_target_en, labels))
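    # A hedged alternative for the tf-idf case above: DBSCAN can run directly on the sparse
    # tf-idf matrix with a cosine metric, skipping the densify/standardise step (eps=0.3 is a
    # placeholder, not a tuned value; note that tfidf_transformer above is, despite its name,
    # the transformed sparse matrix):
    db_cos = DBSCAN(eps=0.3, min_samples=10, metric='cosine').fit(tfidf_transformer)
    print('Estimated clusters with cosine metric: %d'
          % (len(set(db_cos.labels_)) - (1 if -1 in db_cos.labels_ else 0)))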
import argparse
import pickle
import cv2

print("[INFO] loading encodings...")
data = pickle.loads(open("/content/face_encodings.pickle", "rb").read())
data = np.array(data)
encodings = [d["encoding"] for d in data]

import tensorflow as tf
# cluster the embeddings
#print('Enter number of clusters:')
#n_clusters=input('Enter number of clusters')
print("[INFO] clustering...")
#for comparision purpose...
clt = DBSCAN(metric="euclidean", n_jobs=100)
#clt =KMeans(n_clusters=5)
clt.fit(encodings)
labels=clt.labels_ 
# determine the total number of unique faces found in the dataset
labelIDs = np.unique(clt.labels_)
numUniqueFaces = len(np.where(labelIDs > -1)[0])
print("[INFO] # unique faces: {}".format(numUniqueFaces))

!pip install pytest-shutil

!pip install python-resize-image

from resizeimage import resizeimage
import shutil
import matplotlib.pyplot as plt
import cv2

from sklearn.cluster import DBSCAN
from k_means_playground import cluster_gen

# generate some clustered sample data
n_clusters = 50
clusters_x, clusters_y = cluster_gen(n_clusters)

# convert to a single dataset in OpenCV format
data = np.float32((np.concatenate(clusters_x), np.concatenate(clusters_y))).transpose()

# Define max_distance (eps parameter in DBSCAN())
max_distance = 1
db = DBSCAN(eps=max_distance, min_samples=10).fit(data)

# Extract a mask of core cluster members
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Extract labels (-1 is used for the outliers)
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
unique_labels = set(labels)

# Plot up the results
min_x = np.min(data[:, 0])
max_x = np.max(data[:, 0])
min_y = np.min(data[:, 1])
max_y = np.max(data[:, 1])
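# The snippet stops after computing the axis limits. A minimal sketch of how such results
# are typically rendered (noise label -1 drawn in black), assuming the matplotlib import above:
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    if k == -1:
        col = [0, 0, 0, 1]  # black for noise
    class_member_mask = (labels == k)
    xy = data[class_member_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
             markeredgecolor='k', markersize=6)
plt.xlim(min_x, max_x)
plt.ylim(min_y, max_y)
plt.show()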
Exemplo n.º 30
0
partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train_propagated[partially_propagated]

log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)

print(log_reg.score(X_test, y_test))

# DBSCAN
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=1000, noise=0.05, random_state=42)
plt.plot(X[:, 0], X[:, 1], 'b.')

from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)

# commonly used attributes
print(dbscan.labels_[:10])

print(dbscan.core_sample_indices_[:10])

print(np.unique(dbscan.labels_))

dbscan2 = DBSCAN(eps=0.2, min_samples=5)
dbscan2.fit(X)


# plotting
def plot_dbscan(dbscan, X, size, show_xlabels=True, show_ylabels=True):