def __init__(
        self,
        isExtract=True,
        root_dir="C:\\Users\\DELL\\Projects\\VHR_CD\\image-v2-timeseries\\newest",
        filename="4Band_Subtracted_20040514_20050427"):
    """Load an image, optionally keep only the extracted area, and min-max scale it.

    Args:
        isExtract (bool): when True, a selection raster is opened and only
            the positions it marks with -1 are kept for training.
        root_dir (str): directory that contains the image.
        filename (str): name of the image handed to ``oi.open_tiff``.

    Attributes set:
        H, W, n_bands: taken from items 1, 2 and 3 of the ``oi.open_tiff`` result.
        npdataset: vectorised image (restricted to the selection when
            ``isExtract`` is True), reshaped to ``(-1, n_bands)`` in that case.
        nmin, nmax: per-column minimum / maximum of ``npdataset``.
        norm_data: ``npdataset`` scaled column-wise with ``nmin`` / ``nmax``.
        target_data: zeros shaped like ``norm_data``.
    """
    self.isExtract = isExtract
    self.root_dir = root_dir
    self.file_name = filename

    self.dataset = oi.open_tiff(root_dir, filename)
    self.H = self.dataset[1]
    self.W = self.dataset[2]
    self.n_bands = self.dataset[3]
    # flatten and transform the array
    self.npdataset = art.tif2vec(self.dataset[0])

    if self.isExtract:
        # extract out the changed area
        self.select_path = "C:\\Users\\DELL\\Projects\\VHR_CD\\image-v2-timeseries\\EXTRACT"
        self.select_img = "SOMOCLU_20_20_HDBSCAN_cl_2_2004_2005_min_cluster_size_4_alg_best_"
        self.simg = oi.open_tiff(self.select_path, self.select_img)
        # original note: (2720000) -- presumably the flattened pixel count
        self.select = self.simg[0]
        # Positions of value -1 (stacked per band / not stacked) and of value 0.
        self.changePos = DataProcess.selectArea(
            self.select, self.n_bands, -1, isStack=True)
        self.ns_changePos = DataProcess.selectArea(
            self.select, self.n_bands, -1, isStack=False)
        self.ns_nonChangePos = DataProcess.selectArea(
            self.select, self.n_bands, 0, isStack=False)
        self.npdataset = self.npdataset[self.changePos].reshape(
            -1, self.n_bands)

    # normalization
    self.nmax = self.npdataset.max(axis=0)
    self.nmin = self.npdataset.min(axis=0)
    value_range = self.nmax - self.nmin
    # A constant column has a zero range; dividing by it would fill the
    # column with NaN/inf. Divide by 1 instead so the column becomes 0.
    value_range = np.where(value_range == 0, 1, value_range)
    self.norm_data = (self.npdataset - self.nmin) / value_range

    # TODO: don't know what's for yet, add only to be compatible to TensorDataset
    self.target_data = np.zeros_like(self.norm_data)

    # clear the memory
    # self.simg=None
    self.dataset = None
def extract_compareClustering(clusterClass):
    """Fit several clustering models on the extracted area and log their quality.

    For every entry of ``clusterClass`` the model is fitted on the pixels
    selected by the extraction raster, its labels are merged back into a
    full-size label array, a sampled silhouette score is computed, the label
    map is visualised, and the score plus the fit time are appended to a text
    file.

    Args:
        clusterClass (dict): maps a display name to a clustering object that
            exposes ``fit(X)`` and ``labels_``.
    """
    # Get data, n_bands=4
    norm_img_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\newest"
    img = "4Band_Subtracted_20040514_20050427"
    dataset = oi.open_tiff(norm_img_path, img)
    H = dataset[1]
    W = dataset[2]
    n_bands = dataset[3]
    # NOTE: this step is really important
    org_data = art.tif2vec(dataset[0])

    select_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\EXTRACT"
    select_img = "SOMOCLU_20_20_HDBSCAN_cl_2_2004_2005_min_cluster_size_4_alg_best_"
    simg = oi.open_tiff(select_path, select_img)
    # original note: (2720000) -- presumably the flattened pixel count
    select = simg[0]
    changePos = DataProcess.selectArea(select, n_bands, -1, isStack=True)
    ns_changePos = DataProcess.selectArea(select, n_bands, -1, isStack=False)
    ns_nonChangePos = DataProcess.selectArea(select, n_bands, 0, isStack=False)
    X_train = org_data[changePos].reshape(-1, n_bands)
    result = np.zeros_like(select.reshape(-1, 1))

    # Loop-invariant output directory.
    save_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\sklearn_clustering\\compare"

    for cls_name, cls_class in clusterClass.items():
        print("running", cls_name, "...")
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        t0 = time.perf_counter()
        cls_class.fit(X_train)
        usingTime = time.perf_counter() - t0

        # combine the result: fitted labels on the selected positions, one
        # extra label (max + 1) on the positions selected with value 0
        result[ns_changePos] = cls_class.labels_
        result[ns_nonChangePos] = np.max(cls_class.labels_) + 1
        evaluation = silhouette_score(X=org_data,
                                      labels=result,
                                      metric='euclidean',
                                      sample_size=10000)

        DataProcess.visualize_class(
            result.reshape(H, W),
            save_path + '\\' + cls_name + "_change_area_class")

        # save using time
        print("save the information to txt file...")
        with open(
                save_path + '/' +
                "Outlier Detection Algorithms Running Time.txt", 'a') as f:
            f.write("detetion algorithm: " + cls_name +
                    "\nsilhouette_score:" + str(evaluation) +
                    "\ndetection using time: " + str(usingTime))
            f.write("\n----------------------------------------------\n")
def anomaly(self, scores):
    """Scale anomaly scores, derive outlier labels and export them.

    The scores are rescaled with ``DataProcess.scaleNormalize(scores, (0, 500))``.
    When the instance was built with extraction enabled, the scaled scores
    are written to the selected positions of a full-size array and the
    non-selected positions are set to 0; otherwise the scaled scores are used
    as they are. Labels come from ``highRank.getOutliers(..., 99)`` and are
    passed to ``GeoProcess.getSHP``.

    Args:
        scores: raw anomaly scores, one per training sample.

    Side effects:
        Sets ``self.score_result`` and ``self.outlier_result``.
    """
    scaled_scores = DataProcess.scaleNormalize(scores, (0, 500)).reshape(-1,)

    # The flag lives on the instance; the bare name ``isExtract`` is not
    # defined inside this method and raised NameError.
    if self.isExtract:
        self.score_result = np.empty_like(self.select.reshape(-1, 1))
        # scale the scores
        self.score_result[self.ns_changePos] = scaled_scores
        self.score_result[self.ns_nonChangePos] = 0
    else:
        self.score_result = scaled_scores

    # give labels
    self.outlier_result = highRank.getOutliers(self.score_result, 99)

    # generate picture
    GeoProcess.getSHP(
        img_path=self.root_dir,
        img_name=self.file_name,
        save_path="C:\\Users\\DELL\\Projects\\VHR_CD\\repository\\code-v2",
        extend_name="VAE_noEXT_",
        result_array=self.outlier_result)
def RunPyodOutlier(classifiers, outlier_save_path, isExtract=True):
    """Run a set of outlier detectors on the image and save labels, scores and timings.

    Args:
        classifiers (dict): maps a display name to a detector exposing
            ``fit(X)``, ``labels_`` and ``decision_scores_``.
        outlier_save_path (str): directory that receives the CSV, shapefile,
            heat map and the running-time text file.
        isExtract (bool): when True, detectors are fitted only on the
            positions the extraction raster marks with -1, and results are
            merged back into full-size arrays (0 on the value-0 positions).
    """
    # Get data, n_bands=4
    norm_img_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\newest"
    img = "4Band_Subtracted_20040514_20050427"
    dataset = oi.open_tiff(norm_img_path, img)
    H = dataset[1]
    W = dataset[2]
    n_bands = dataset[3]
    # NOTE: this step is really important
    org_data = art.tif2vec(dataset[0])
    # NOTE: Normalize the scale of the original data
    org_data = org_data / org_data.max(axis=0)
    # TODO: normalize the data?

    if isExtract:
        # extract out the changed area
        select_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\EXTRACT"
        select_img = "SOMOCLU_20_20_HDBSCAN_cl_2_2004_2005_min_cluster_size_4_alg_best_"
        simg = oi.open_tiff(select_path, select_img)
        # original note: (2720000) -- presumably the flattened pixel count
        select = simg[0]
        changePos = DataProcess.selectArea(select, n_bands, -1, isStack=True)
        ns_changePos = DataProcess.selectArea(select, n_bands, -1, isStack=False)
        ns_nonChangePos = DataProcess.selectArea(select, n_bands, 0, isStack=False)
        X_train = org_data[changePos].reshape(-1, n_bands)
        print("shape of original data: ", org_data.shape)
        print("shape of extracted data: ", X_train.shape)
        # to save the final result
        # NOTE(review): both buffers inherit the dtype of ``select``; confirm
        # it can hold the scaled scores without truncation.
        outlier_result = np.zeros_like(select.reshape(-1, 1))
        score_result = np.empty_like(select.reshape(-1, 1))
    else:
        X_train = org_data.reshape(-1, n_bands)
        print("shape of training data: ", X_train.shape)

    for clf_name, clf in classifiers.items():
        if not isExtract:
            clf_name = "no_extract_" + clf_name
        print("running " + clf_name + "...")

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for measuring elapsed time.
        t0 = time.perf_counter()
        clf.fit(X_train)
        usingTime = time.perf_counter() - t0

        # get the prediction labels and outlier scores of the training data
        y_train_pred = clf.labels_  # binary labels (0: inliers, 1: outliers)
        y_train_scores = clf.decision_scores_  # raw outlier scores

        if isExtract:
            # combine the extraction non-changed label&&scores and the algorithm result
            outlier_result[ns_changePos] = y_train_pred
            outlier_result[ns_nonChangePos] = 0
            score_result[ns_changePos] = DataProcess.scaleNormalize(
                y_train_scores, (0, 500)).reshape(-1, )
            score_result[ns_nonChangePos] = 0
        else:
            # no extraction: the algorithm result already covers every pixel
            outlier_result = y_train_pred
            score_result = DataProcess.scaleNormalize(
                y_train_scores, (0, 500)).reshape(-1, )

        print("the scale of the y_train_score is:", y_train_scores.min(),
              y_train_scores.max())
        print("the scale of the score_result is:", score_result.min(),
              score_result.max())

        # save the outlier detection result as .csv and .shp file
        DataProcess.int_to_csv(outlier_save_path, img, outlier_result,
                               clf_name + "_outliers")
        GeoProcess.getSHP(norm_img_path, img, outlier_save_path,
                          clf_name + "_outliers", outlier_result)
        # save the outlier scores as heatmap
        DataProcess.saveHeatMap(score_result.reshape(H, W),
                                outlier_save_path + "\\" + clf_name)

        print("save the information to txt file...")
        with open(
                outlier_save_path + '/' +
                "Outlier Detection Algorithms Running Time.txt", 'a') as f:
            f.write("detetion algorithm: " + clf_name +
                    "\ndetection using time: " + str(usingTime))
            f.write("\n----------------------------------------------\n")
def runClusteringBased(img_path, img_name, data_path, data_name,
                       outlier_save_path, clusteringPara, outlierPara,
                       o_filter="highRank"):
    """Cluster the data, score outliers from the clusters, and save every result.

    Args:
        img_path (str): directory of the source image (used for exports).
        img_name (str): name of the source image.
        data_path (str): directory of the CSV data; the run log is appended
            to ``runningstatus.txt`` in this directory.
        data_name (str): name of the CSV data set.
        outlier_save_path (str): directory that receives labels and shapes.
        clusteringPara: ``clusteringPara[0]`` is the algorithm name and
            ``clusteringPara[1]`` the positional arguments forwarded to its
            ``getCluster``.
        outlierPara: ``outlierPara[0]`` is the scoring method (only
            ``"LDCOF"`` is implemented); items 1-3 are forwarded to it.
        o_filter (str): how scores become labels (only ``"highRank"`` is
            implemented).

    Returns:
        tuple: ``(clusteringScore, usingTime)``.

    Raises:
        SystemExit: if the clustering algorithm name is not supported.
        ValueError: if the scoring method or the filter is not supported.
    """
    # TODO: change score and filter para
    supported_algorithms = ("kMeans", "Affinity", "MeanShift", "Spectral",
                            "Agglomerative", "DBSCAN", "BIRCH")

    org_data = DataProcess.csv_to_array(data_path, data_name)
    AlgorithmName = clusteringPara[0]
    print("running " + AlgorithmName + " for clustering...")

    if AlgorithmName not in supported_algorithms:
        print("algorithm name ilegal")
        # The site-provided exit() is not guaranteed to exist and reported
        # success (status 0); leave with an explicit failure status instead.
        raise SystemExit(1)

    # Reject unsupported settings before the expensive clustering. Previously
    # they surfaced later as an UnboundLocalError on d_score / outlier_label.
    if outlierPara[0] != "LDCOF":
        raise ValueError("unsupported outlier score: " + str(outlierPara[0]))
    if o_filter != "highRank":
        raise ValueError("unsupported outlier filter: " + str(o_filter))

    t0 = time.time()
    # Each supported name is an attribute of ``cl`` exposing ``getCluster``.
    d_label = getattr(cl, AlgorithmName).getCluster(org_data,
                                                    *(clusteringPara[1]))
    if AlgorithmName == "Agglomerative":
        # the 7th clustering argument is appended to the output name
        AlgorithmName = AlgorithmName + '_' + clusteringPara[1][6]
    AlgorithmName += '_'

    # save the cluster information
    saveclass_extend_name = '_' + AlgorithmName + "cluster_label"
    DataProcess.int_to_csv(outlier_save_path, img_name, d_label,
                           saveclass_extend_name)
    DataProcess.visualize_class(img_path, img_name, outlier_save_path,
                                img_name + saveclass_extend_name)
    # NOTE(review): t1 is taken after the label export above, so the reported
    # clustering time also includes saving and visualising -- confirm intent.
    t1 = time.time()

    print("running " + outlierPara[0] +
          " for calculating the outlier scores...")
    d_score = cb.calLDCOF.findLDCOF(org_data, d_label, outlierPara[1],
                                    outlierPara[2], outlierPara[3])
    outlier_label = sc2r.highRank.getOutliers(d_score, 98)

    # save the label information for further usage
    savelabel_extend_name = '_' + AlgorithmName + "outlier_label"
    DataProcess.int_to_csv(outlier_save_path, img_name, outlier_label,
                           savelabel_extend_name)
    GeoProcess.getSHP(img_path, img_name, outlier_save_path, AlgorithmName,
                      outlier_label)
    # FIXME: the .tif file could not be specified the path
    # DataProcess.visualize_class(img_path,img_name,outlier_save_path,img_name+savelabel_extend_name)

    # calculate the Silhouette Coefficient as a reference of the performance of the outcome
    # NOTE: due to the limited memory, the sample_size is set to 10000, which may make the score less reliable
    print("calculating Silhouette Coefficients...")
    clusteringScore = cl.Silhouette.getSilhouette(org_data,
                                                  d_label,
                                                  sample_size=10000)
    usingTime = t1 - t0

    print("save the information to txt file...")
    with open(data_path + '/' + "runningstatus.txt", 'a') as f:
        f.write("clustering algorithm: " + AlgorithmName +
                "\nsilhouette coefficient: " + str(clusteringScore) +
                "\nclstering using time: " + str(usingTime))
        f.write("\n----------------------------------------------\n")

    return clusteringScore, usingTime
# Script: convert one subtracted image into CSV form via DataProcess.img_to_csv.
# Runs on import/execution; there is no entry-point guard.
from mypackages.processing import DataProcess
from mypackages.processing import GeoProcess
from mypackages import clustering as cl
from mypackages import clusteringBased as cb
from mypackages import scoresToResults as sc2r
from mypackages.processing import open_image as oi
import numpy as np

# First argument of img_to_csv: directory holding the input image.
norm_img_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\Encoded_dataset\\Encoded_models_2018-10-03_1337\\subtracted_norm_from_norm"
# Second argument of img_to_csv -- presumably the directory the CSV is written to; confirm.
norm_data_path = "C:\\Users\\DELL\\Projects\\MLS_cluster\\image-v2-timeseries\\raw_data\\1337_sub"
# Name of the image to convert.
img = "Subtracted_20040514_20050427"
# Not referenced in the visible part of this script.
raw = "Subtracted_20040514_20050427_raw_data"

# Disabled: get the paths and names in the dir
# (refers to norm_subtracted_path / norm_subtracted_save, which are not defined here)
# img_path_name,img_f_names = DataProcess.file_name(norm_subtracted_path,".TIF")
# data_path_name,data_f_names = DataProcess.file_name(norm_subtracted_save,".csv")

# Convert the single image named above.
DataProcess.img_to_csv(norm_img_path, norm_data_path, img)

# Disabled: transform all the images in the path into csv (done)
# for name in img_f_names:
#     DataProcess.img_to_csv(norm_subtracted_path,norm_subtracted_save,name)