def test_clustering_tree(directory=None):
    """Fit a hierarchical tree on seven toy series and render the result.

    The dendrogram (png) and a Graphviz dot file are written either into
    ``directory`` or next to a freshly created temporary file.
    """
    series = np.array([[0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1],
                       [0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1]])

    def check_merge(from_idx, to_idx, distance):
        # Every merge performed by the model must be one of the expected pairs.
        assert (from_idx, to_idx) in [(3, 0), (4, 1), (5, 2), (6, 2), (1, 0), (2, 0)]

    base_model = clustering.Hierarchical(dtw.distance_matrix_fast, {},
                                         merge_hook=check_merge,
                                         show_progress=False)
    tree_model = clustering.HierarchicalTree(base_model)
    cluster_idx = tree_model.fit(series)
    # After fitting, everything has been merged into a single cluster.
    assert cluster_idx[0] == {0, 1, 2, 3, 4, 5, 6}

    if directory:
        hierarchy_fn = os.path.join(directory, "hierarchy.png")
        graphviz_fn = os.path.join(directory, "hierarchy.dot")
    else:
        tmp = tempfile.NamedTemporaryFile()
        hierarchy_fn = tmp.name + "_hierarchy.png"
        graphviz_fn = tmp.name + "_hierarchy.dot"

    tree_model.plot(hierarchy_fn)
    print("Figure saved to", hierarchy_fn)
    with open(graphviz_fn, "w") as ofile:
        print(tree_model.to_dot(), file=ofile)
    print("Dot saved to", graphviz_fn)
def main():
    """Cluster seven toy series, plot the hierarchy, and display the image."""
    series = np.array([
        np.flip([0., 0, 1, 2, 1, 0, 1, 0, 0, 1]),
        [0., 1, 2, 0, 0, 0, 0, 0, 0, 1],
        np.flip([1., 2, 0, 0, 0, 0, 0, 1, 1, 1], 0),
        [0., 0, 1, 2, 1, 0, 1, 0, 0, 1],
        [0., 1, 2, 0, 0, 0, 0, 0, 0, 1],
        np.flip([1., 2, 0, 0, 0, 0, 0, 1, 1, 1], 0),
        np.flip([1., 2, 0, 0, 0, 0, 0, 1, 1, 1], 0),
    ])

    # Plain hierarchical clustering with a custom DTW distance matrix.
    hier_model = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    clusters = hier_model.fit(series)

    # Wrap the model so the full merge tree is retained for plotting.
    tree_model = clustering.HierarchicalTree(hier_model)
    clusters = tree_model.fit(series)

    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))

    def label_ts(idx):
        return "ts-" + str(idx)

    tree_model.plot('hierarchy.jpg', axes=axes, show_ts_label=label_ts,
                    show_tr_label=True, ts_label_margin=-10,
                    ts_left_margin=10, ts_sample_length=1)

    # Read the rendered image back and show it.
    image = img.imread('hierarchy.jpg')
    plt.imshow(image)
def test_clustering_tree_ndim():
    """Hierarchical clustering over 2-dimensional (multivariate) series."""
    with util_numpy.test_uses_numpy() as np:
        series = np.array([
            [[0., 0.], [0, 0], [1, 0], [2, 0], [1, 0], [0, 0], [1, 0], [0, 0], [0, 0]],
            [[0., 0.], [1, 0], [2, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
            [[1., 0.], [2, 0], [0, 0], [0, 0], [0, 0], [0, 0], [0, 0], [1, 0], [1, 0]],
        ])
        clusterer = clustering.Hierarchical(dtw_ndim.distance_matrix_fast,
                                            {'ndim': 2},
                                            show_progress=False)
        result = clusterer.fit(series)
        # All three series end up merged into one cluster.
        assert result[0] == {0, 1, 2}
def test_clustering():
    """Stop merging early and verify the resulting two-cluster partition."""
    series = np.array([[0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1],
                       [0., 0, 1, 2, 1, 0, 1, 0, 0],
                       [0., 1, 2, 0, 0, 0, 0, 0, 0],
                       [1., 2, 0, 0, 0, 0, 0, 1, 1]])

    def check_merge(from_idx, to_idx, distance):
        # Only these merges are expected while reducing to two clusters.
        assert (from_idx, to_idx) in [(3, 0), (4, 1), (5, 2), (1, 0)]

    clusterer = clustering.Hierarchical(dtw.distance_matrix_fast, {}, 2,
                                        merge_hook=check_merge,
                                        show_progress=False)
    result = clusterer.fit(series)
    assert result[0] == {0, 1, 3, 4}
    assert result[2] == {2, 5}
def d():
    """Hierarchically cluster the series and return the cluster mapping.

    NOTE(review): relies on a name ``s`` defined outside this function
    (presumably a module-level array of series) — confirm against callers.
    """
    clusterer = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    return clusterer.fit(s)
def get_cluster():
    """Cluster the spindle-load time series of one operation/program pair.

    Reads the segment boundaries from the auxiliary CSV, extracts one
    spindle-load series per segment from the 2018 data CSV, and runs both
    the custom hierarchical clustering and the SciPy linkage clustering,
    saving a plot for each.
    """
    collected = []
    aux_file_path = r'C:\TFM\auxdata\hist_protected.csv'
    data_path = r'C:\TFM\data\2018\2018.csv'
    hierarchical_plot = r'C:\TFM\dtw\hierarchical_cluster.png'
    linkage_plot = r'C:\TFM\dtw\linkage_cluster.png'

    aux_df = pd.read_csv(aux_file_path, header=0, delimiter=',',
                         parse_dates=[SEGMENT_BEGIN, SEGMENT_END])
    data_df = pd.read_csv(data_path, header=0, delimiter=',',
                          parse_dates=[DATE])

    op_no = 28
    program_number = 1108805036

    # Begin/end timestamps of every segment belonging to this operation
    # and program.
    mask = ((aux_df[OPERATION_ID_NUMBER] == op_no)
            & (aux_df[PROGRAM_NAME] == program_number))
    start_dates = aux_df[mask][SEGMENT_BEGIN]
    end_dates = aux_df[mask][SEGMENT_END]

    for idx in start_dates.index:
        if idx > YEAR_INDEX_LIMIT:
            break
        window_begin = start_dates[idx]
        window_end = end_dates[idx]
        window = data_df.loc[(data_df[DATE] >= window_begin)
                             & (data_df[DATE] <= window_end)]
        if not window.empty:
            collected.append(np.array(window[SPINDLE_LOAD]))

    # Custom hierarchical clustering.
    base_model = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    cluster_idx = base_model.fit(collected)

    # Best effort: keep the full tree and plot it; report failures only.
    try:
        tree_model = clustering.HierarchicalTree(base_model)
        cluster_idx = tree_model.fit(collected)
        tree_model.plot(hierarchical_plot, show_tr_label=True)
    except Exception as ex:
        print(ex)

    # Best effort: SciPy linkage clustering and its plot.
    try:
        linkage_model = clustering.LinkageTree(dtw.distance_matrix_fast, {})
        cluster_idx = linkage_model.fit(collected)
        linkage_model.plot(linkage_plot, show_tr_label=True)
    except Exception as ex:
        print(ex)
# Load the Scania machine data (header=0 is pandas' default).
df = pd.read_csv("Scania_Data_Clustering.csv", header=0)
head = df.columns.tolist()  # machine names
print("head", head)
df = df.T.values  # one row per machine for the distance computation

# Pairwise DTW distance matrix; replace every infinity with 0, then
# persist the matrix to an xlsx file.
ds = dtw.distance_matrix_fast(df)
ds[ds == inf] = 0
pd.DataFrame(ds).to_excel("ds.xlsx")

# Clustering starts here.
# Custom hierarchical clustering.
model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
# Augmented model that keeps track of the full merge tree.
model2 = clustering.HierarchicalTree(model1)
# SciPy linkage clustering.
model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
cluster_idx = model3.fit(df)

# Plot the dendrogram next to the series.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(15, 15))
model3.plot("hierarchy.png", axes=ax, show_ts_label=head, show_tr_label=True,
            ts_label_margin=-10, ts_left_margin=10, ts_sample_length=1)

# Candidate cluster counts used to find the number of clusters.
NumberOfClusters = range(2, 30)
def clusterTest(pq, XTest, distParams, generalClusterParams, pqClusterParams,
                doAll=True, groundTruth=None):
    """Benchmark exact hierarchical clustering against quantizer approximations.

    :param pq: pre-trained product quantizer handed to the approximate models
    :param XTest: series to cluster
    :param distParams: keyword arguments for the DTW distance function
    :param generalClusterParams: clustering kwargs; ``min_clusters`` is forced
        to 20 (NOTE: this mutates the caller's dict)
    :param pqClusterParams: quantizer-specific clustering kwargs
    :param doAll: False -> one exact + one approximate run and return scores;
        True -> exact single/complete/prototype runs followed by a sweep of
        approximate configurations (results are printed, nothing returned)
    :param groundTruth: optional reference labels; when given (doAll=True),
        scores against it are printed and the function returns early
    :returns: dict with jaccard/ari/DTWTime/PQTime when ``doAll`` is False,
        otherwise None
    """
    # Quantizer model setup.
    generalClusterParams['min_clusters'] = 20
    n_clust = generalClusterParams['min_clusters']
    # Exact ("normal") model.
    modelN = clustering.Hierarchical(
        dtaidistance.dtw.distance_matrix_fast, distParams, **generalClusterParams)
    if not doAll:
        print('Exact clustering')
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for wall-clock interval timing.
        start = time.perf_counter()
        cluster_idxN = modelN.fit(XTest)
        end = time.perf_counter()
        dtwTime = end - start
        print('dtw time', dtwTime)
    else:
        # Exact run with single-linkage merging.
        generalClusterParams = {'dists_merger': clustering.singleLinkageUpdater,
                                'min_clusters': n_clust}
        modelS = clustering.Hierarchical(
            dtaidistance.dtw.distance_matrix_fast, distParams, **generalClusterParams)
        print('Exact single')
        start = time.perf_counter()
        cluster_idxS = modelS.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        print('time', pqTime)

        # Exact run with complete-linkage merging.
        generalClusterParams = {'dists_merger': clustering.completeLinkageUpdater,
                                'min_clusters': n_clust}
        modelC = clustering.Hierarchical(
            dtaidistance.dtw.distance_matrix_fast, distParams, **generalClusterParams)
        print('Exact complete')
        start = time.perf_counter()
        cluster_idxC = modelC.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        print('time', pqTime)

        # Exact run with prototype merging (no dists_merger).
        generalClusterParams = {'dists_merger': None, 'min_clusters': n_clust}
        modelP = clustering.Hierarchical(
            dtaidistance.dtw.distance_matrix_fast, distParams, **generalClusterParams)
        print('Exact prototypes')
        start = time.perf_counter()
        cluster_idxP = modelP.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        print('time', pqTime)

        if groundTruth is not None:
            # Score each exact run against the supplied ground truth and stop.
            jaccards, aris = equaliseClusterLabelsAndCalculateScores(groundTruth, cluster_idxS, XTest)
            print(jaccards, aris)
            jaccardc, aric = equaliseClusterLabelsAndCalculateScores(groundTruth, cluster_idxC, XTest)
            print(jaccardc, aric)
            jaccardp, arip = equaliseClusterLabelsAndCalculateScores(groundTruth, cluster_idxP, XTest)
            print(jaccardp, arip)
            return

    if not doAll:
        # Single approximate run, scored against the exact clustering above.
        model = clustering.HierarchicalWithQuantizer(
            dtaidistance.dtw.distance_fast, distParams,
            **generalClusterParams, **pqClusterParams)
        model.setQuantizer(pq)
        print('Approximate clustering')
        start = time.perf_counter()
        cluster_idx = model.fit(XTest)
        end = time.perf_counter()
        pqTime = end - start
        jaccard, ari = equaliseClusterLabelsAndCalculateScores(cluster_idxN, cluster_idx, XTest)
        print(jaccard, ari, dtwTime)
        return {'jaccard': jaccard, 'ari': ari, 'DTWTime': dtwTime, 'PQTime': pqTime}
    else:
        def performCluster(trueResults, pq, XTest, distParams,
                           generalClusterParams, pqClusterParams):
            # Run one approximate clustering and print its scores vs trueResults.
            model = clustering.HierarchicalWithQuantizer(
                dtaidistance.dtw.distance_fast, distParams,
                **generalClusterParams, **pqClusterParams)
            model.setQuantizer(pq)
            start = time.perf_counter()
            cluster_idx = model.fit(XTest)
            end = time.perf_counter()
            pqTime = end - start
            jaccard, ari = equaliseClusterLabelsAndCalculateScores(trueResults, cluster_idx, XTest)
            print({'jaccard': jaccard, 'ari': ari, 'PQTime': pqTime})

        tot = len(XTest)
        # Number of pairwise distance calculations for an exact run.
        calcs = (tot * tot - tot) / 2
        # Percentages of pre-calculated distances / calculations per merge.
        testkpermerge = [0.5, 2.0, 5.0, 10.0, 20.0]
        testperc = [2, 5, 10, 25, 50]
        print(calcs, tot)
        n_clust = generalClusterParams['min_clusters']

        print('approx', 'single')
        generalClusterParams = {'dists_merger': clustering.singleLinkageUpdater,
                                'min_clusters': n_clust}
        pqClusterParams = {'k': 199800,
                           'quantizer_usage': clustering.QuantizerUsage.ONLY_APPROXIMATES}
        performCluster(cluster_idxS, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testperc:
            print('percent', k, 'single', k)
            pqClusterParams = {'k': int(k * calcs / 100),
                               'quantizer_usage': clustering.QuantizerUsage.TOP_K_ONLY_AT_INITIALISATION}
            performCluster(cluster_idxS, pq, XTest, distParams, generalClusterParams, pqClusterParams)

        print('approx', 'complete')
        generalClusterParams = {'dists_merger': clustering.completeLinkageUpdater,
                                'min_clusters': n_clust}
        pqClusterParams = {'k': 199800,
                           'quantizer_usage': clustering.QuantizerUsage.ONLY_APPROXIMATES}
        performCluster(cluster_idxC, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testperc:
            print('percent', k, 'complete', k)
            pqClusterParams = {'k': int(k * calcs / 100.0),
                               'quantizer_usage': clustering.QuantizerUsage.TOP_K_ONLY_AT_INITIALISATION}
            performCluster(cluster_idxC, pq, XTest, distParams, generalClusterParams, pqClusterParams)

        print('approx', 'proto')
        generalClusterParams = {'dists_merger': None, 'min_clusters': n_clust}
        pqClusterParams = {'k': 199800,
                           'quantizer_usage': clustering.QuantizerUsage.ONLY_APPROXIMATES}
        performCluster(cluster_idxP, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testperc:
            print('percent', k, 'proto', k)
            pqClusterParams = {'k': int(k * calcs / 100.0),
                               'quantizer_usage': clustering.QuantizerUsage.TOP_K_ONLY_AT_INITIALISATION}
            performCluster(cluster_idxP, pq, XTest, distParams, generalClusterParams, pqClusterParams)
        for k in testkpermerge:
            print('partEachMerge', k, 'proto', k)
            pqClusterParams = {'k': int(k * tot / 100),
                               'quantizer_usage': clustering.QuantizerUsage.TOP_K}
            performCluster(cluster_idxP, pq, XTest, distParams, generalClusterParams, pqClusterParams)
def cluster(time_series_set, name):
    """Cluster the stored series (plus ``time_series_set`` if new) with DTW.

    Reads ./static/cluster_data.csv (one row per named series: name followed
    by samples), z-normalizes every series, appends the new series under
    ``name`` when that name is not stored yet, then runs hierarchical and
    linkage clustering and saves the dendrogram to hierarchy.png.

    :param time_series_set: iterable of rows whose second element (row[1])
        is a sample value of the new series
    :param name: label under which the new series is stored
    """
    path = "./static/cluster_data.csv"
    name_list = []
    series_list = []
    # Use a context manager: the original passed open() straight to
    # csv.reader and never closed the file handle.
    with open(path, 'r') as csv_file:
        for row in csv.reader(csv_file):
            name_list.append(row[0])
            np_series = np.array([float(v) for v in row[1:]])
            series_list.append(stats.zscore(np_series))

    if name not in name_list:
        # time_series holds the performance-metric sequence.
        time_series = [row[1] for row in time_series_set]
        # Persist the new series. newline='' per the csv module docs so the
        # writer does not produce blank lines on Windows; no explicit close
        # needed inside `with`.
        with open(path, 'a', newline='') as f:
            csv.writer(f).writerow([name] + time_series)
        name_list.append(name)
        np_series = np.array([float(v) for v in time_series])
        series_list.append(stats.zscore(np_series))

    # Custom hierarchical clustering.
    model1 = clustering.Hierarchical(dtw.distance_matrix_fast, {})
    cluster_idx = model1.fit(series_list)
    # Augment the model to keep track of the full merge tree for plotting.
    model2 = clustering.HierarchicalTree(model1)
    cluster_idx = model2.fit(series_list)
    # SciPy linkage clustering.
    model3 = clustering.LinkageTree(dtw.distance_matrix_fast, {})
    cluster_idx = model3.fit(series_list)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 10))
    show_ts_label = lambda idx: name_list[idx]
    model2.plot("hierarchy.png", axes=ax, show_ts_label=show_ts_label,
                show_tr_label=True, ts_label_margin=-10, ts_left_margin=10,
                ts_sample_length=1)