def classificationWineRed():
    """Test classification on the WineRed (winequality-red) dataset.

    Trains a multilayer NN classifier with k-fold cross validation and
    prints the per-fold classification scores.
    """
    seed(1)
    # Hyper-parameters.
    n_folds = 5
    l_rate = 0.3
    n_epoch = 1000
    n_hidden = [10]  # one hidden layer with 10 neurons
    mlp = MultilayerNnClassifier()
    activationFunction = Sigmoid()
    dp = DataPreparation()
    evaluator = ClassificationEvaluator()
    splitting = Splitting()
    # Load and prepare data.
    filename = '../Datasets/winequality-red.csv'
    dataset = dp.load_csv(filename)
    for i in range(len(dataset[0]) - 1):
        dp.str_column_to_float(dataset, i)
    # Convert class column to integers.
    dp.str_column_to_int(dataset, len(dataset[0]) - 1)
    # Normalize input variables.
    minmax = dp.dataset_minmax(dataset)
    dp.normalize_dataset_classification(dataset, minmax)
    # Evaluate algorithm with k-fold cross validation.
    scores = evaluator.evaluate_algorithm(dataset, splitting, mlp.back_propagation, n_folds, l_rate, n_epoch,
                                          n_hidden, activationFunction)
    print_classification_scores(scores)
def regressionWineWhite():
    """Test regression on the WineWhite (winequality-white) dataset.

    Trains a multilayer NN regressor with k-fold cross validation and
    prints the per-fold regression scores. (Docstring previously said
    "Classification", which was wrong for this regressor.)
    """
    seed(1)
    # Hyper-parameters.
    n_folds = 5
    l_rate = 0.3
    n_epoch = 1000
    n_hidden = [10, 5]  # two hidden layers
    mlp = MultilayerNnRegressor()
    activationFunction = Sigmoid()
    dp = DataPreparation()
    evaluator = RegressionEvaluator()
    splitting = Splitting()
    # Load and prepare data.
    filename = '../Datasets/winequality-white.csv'
    dataset = dp.load_csv(filename)
    for i in range(len(dataset[0])):
        dp.str_column_to_float(dataset, i)
    # Normalize input variables including the target; the target's min/max
    # is kept so predictions can be de-normalized later.
    minmax = dp.dataset_minmax(dataset)
    target_minmax = minmax[-1]
    dp.normalize_dataset_regression(dataset, minmax)
    # Evaluate algorithm.
    # NOTE(review): target_minmax is passed twice (after n_folds and again at
    # the end) — verify RegressionEvaluator.evaluate_algorithm's signature;
    # one of the two is likely redundant.
    scores = evaluator.evaluate_algorithm(dataset, splitting, mlp.back_propagation, n_folds, target_minmax,
                                          l_rate, n_epoch, n_hidden, activationFunction, target_minmax)
    print_regression_scores(scores)
class PreprocessingManager:
    """Drives the tweet preprocessing pipeline: preparation, cleaning,
    then transformation, storing the final result for retrieval."""

    def __init__(self):
        # Result of the full pipeline; stays empty until setDataProcesses runs.
        self._dataProcessed = []
        # The three pipeline stages, in execution order.
        self._dataPreparation = DataPreparation()
        self._dataCleaning = DataCleaning()
        self._dataTransforming = DataTransformation()

    def setDataProcesses(self, tweets, pathvectorizer):
        """Run tweets through prepare -> clean -> transform and cache the result.

        Args:
            tweets: raw tweet collection fed to the preparation stage.
            pathvectorizer: path handed to the transformation stage.
        """
        # Stage 1: preparation.
        self._dataPreparation.setDataPrepared(tweets)
        prepared = self._dataPreparation.getDataPrepared()
        # Stage 2: cleaning consumes the prepared data.
        self._dataCleaning.setDataCleaned(prepared)
        cleaned = self._dataCleaning.getDataCleaned()
        # Stage 3: transformation consumes the cleaned data.
        self._dataTransforming.setDataTransformed(cleaned, pathvectorizer)
        self._dataProcessed = self._dataTransforming.getDataTransformed()

    def getDataProcesses(self):
        """Return the output of the last setDataProcesses() run."""
        return self._dataProcessed
def classificationPokemon():
    """Test classification on the Pokemon combat dataset.

    Columns: id_combat, then for each of the two pokemon (pk1_*/pk2_*):
    ID, Name, Type1, Type2, HP, Attack, Defense, SpAtk, SpDef, Speed,
    Generation, Legendary and one-hot type flags (Grass, Fire, Water, Bug,
    Normal, Poison, Electric, Ground, Fairy, Fighting, Psychic, Rock,
    Ghost, Ice, Dragon, Dark, Steel, Flying); final column is `winner`.
    """
    seed(1)
    # Hyper-parameters.
    n_folds = 5
    l_rate = 0.1
    n_epoch = 500
    n_hidden = [5]  # one hidden layer with 5 neurons
    mlp = MultilayerNnClassifier()
    activationFunction = Sigmoid()
    dp = DataPreparation()
    evaluator = ClassificationEvaluator()
    splitting = Splitting()
    # Load and prepare data.
    filename = '../Datasets/pkmn.csv'
    dataset = dp.load_csv(filename)
    for i in range(len(dataset[0]) - 1):
        dp.str_column_to_float(dataset, i)
    # Convert class column to integers.
    dp.str_column_to_int(dataset, len(dataset[0]) - 1)
    # Normalize input variables.
    minmax = dp.dataset_minmax(dataset)
    dp.normalize_dataset_classification(dataset, minmax)
    # Evaluate algorithm with k-fold cross validation.
    scores = evaluator.evaluate_algorithm(dataset, splitting, mlp.back_propagation, n_folds, l_rate, n_epoch,
                                          n_hidden, activationFunction)
    print_classification_scores(scores)
def KFold_MFSSEL(featureNameDirPath):
    """K-fold supervised baseline for the MFSSEL experiments.

    For every fold produced by DataPreparation, trains one RBF-SVM per
    feature view (HOG, 81-dim "hsv", 30-dim "lbp") plus one SVM on the
    concatenated features ("whole"), then records per-class and overall
    test accuracies for each fold.

    Args:
        featureNameDirPath: directory that holds the per-view feature files.
    """
    # Initialise the DataPreparation helper.
    dataPreparation = DataPreparation()
    train_test_tuple_list = dataPreparation.getLabelAndNameTupleList_KFold(featureNameDirPath)
    # Per-class accuracy for the HOG view (one entry per fold).
    hog_class_1_accuracy_list = []
    hog_class_2_accuracy_list = []
    hog_class_3_accuracy_list = []
    hog_class_4_accuracy_list = []
    hog_class_5_accuracy_list = []
    hog_class_6_accuracy_list = []
    hog_class_7_accuracy_list = []
    hog_class_8_accuracy_list = []
    hog_class_9_accuracy_list = []
    hog_class_10_accuracy_list = []
    hog_class_11_accuracy_list = []
    # Per-class accuracy for the 81-dim ("hsv") view (one entry per fold).
    hsv_class_1_accuracy_list = []
    hsv_class_2_accuracy_list = []
    hsv_class_3_accuracy_list = []
    hsv_class_4_accuracy_list = []
    hsv_class_5_accuracy_list = []
    hsv_class_6_accuracy_list = []
    hsv_class_7_accuracy_list = []
    hsv_class_8_accuracy_list = []
    hsv_class_9_accuracy_list = []
    hsv_class_10_accuracy_list = []
    hsv_class_11_accuracy_list = []
    # Per-class accuracy for the 30-dim ("lbp") view (one entry per fold).
    lbp_class_1_accuracy_list = []
    lbp_class_2_accuracy_list = []
    lbp_class_3_accuracy_list = []
    lbp_class_4_accuracy_list = []
    lbp_class_5_accuracy_list = []
    lbp_class_6_accuracy_list = []
    lbp_class_7_accuracy_list = []
    lbp_class_8_accuracy_list = []
    lbp_class_9_accuracy_list = []
    lbp_class_10_accuracy_list = []
    lbp_class_11_accuracy_list = []
    for train_test_tuple in train_test_tuple_list:
        print("-----------------------------------------------------------------")
        # 1-based index of the current fold.
        print(train_test_tuple_list.index(train_test_tuple) + 1)
        print("-----------------------------------------------------------------")
        accuracy_max = 0
        # Sub-directories holding each feature view.
        _81FeatureDir = "/_81videoFeature/"
        _30FeatureDir = "/_30videoFeature/"
        hogFeatureDir = "/hogvideoFeature/"
        # Labelled training-set sizes recorded for this fold.
        label_data_num_list = []
        label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = train_test_tuple
        # HOG labelled training set.
        hog_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                                label_name_labeled_train_tuple_list)
        hog_labeled_train_Y, hog_labeled_train_X, hog_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_labeled_train_tuple_list)
        # HOG unlabelled training set.
        hog_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        hog_unlabeled_Y, hog_unlabeled_X, hog_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_unlabeled_tuple_list)
        # HOG test set.
        hog_test_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir, label_name_test_tuple_list)
        hog_test_Y, hog_test_X, hog_test_Name = Utilities.get_Y_X_Name_list_from_tuple(hog_test_tuple_list)
        # 81-dim labelled training set.
        _81_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _81_labeled_train_Y, _81_labeled_train_X, _81_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_labeled_train_tuple_list)
        # 81-dim unlabelled training set.
        _81_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _81_unlabeled_Y, _81_unlabeled_X, _81_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_unlabeled_tuple_list)
        # 81-dim test set.
        _81_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir, label_name_test_tuple_list)
        _81_test_Y, _81_test_X, _81_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_81_test_tuple_list)
        # 30-dim labelled training set.
        _30_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _30_labeled_train_Y, _30_labeled_train_X, _30_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_labeled_train_tuple_list)
        # 30-dim unlabelled training set.
        _30_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _30_unlabeled_Y, _30_unlabeled_X, _30_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_unlabeled_tuple_list)
        # 30-dim test set.
        _30_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir, label_name_test_tuple_list)
        _30_test_Y, _30_test_X, _30_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_30_test_tuple_list)
        # Train one RBF-SVM per view on the labelled data.
        hog_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
        hog_svc_1.fit(hog_labeled_train_X, hog_labeled_train_Y)
        _81_svc_1 = SVC(C=4, kernel='rbf', gamma=14, probability=True)  # c:2 gamma=6
        _81_svc_1.fit(_81_labeled_train_X, _81_labeled_train_Y)
        _30_svc_1 = SVC(C=32, kernel='rbf', gamma=8, probability=True)  # c:32 gamma=12
        _30_svc_1.fit(_30_labeled_train_X, _30_labeled_train_Y)
        # Build the concatenated ("whole") feature matrix column-wise.
        whole_labeled_train_X = hog_labeled_train_X.copy()
        whole_labeled_train_X = np.concatenate([whole_labeled_train_X, _81_labeled_train_X], axis=1)
        whole_labeled_train_X = np.concatenate([whole_labeled_train_X, _30_labeled_train_X], axis=1)
        # SVM on the concatenated features.
        whole_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
        whole_svc_1.fit(whole_labeled_train_X, hog_labeled_train_Y)
        whole_test_X = hog_test_X.copy()
        whole_test_X = np.concatenate([whole_test_X, _81_test_X], axis=1)
        whole_test_X = np.concatenate([whole_test_X, _30_test_X], axis=1)
        label_data_num_list.append(len(hog_labeled_train_Y))
        # Test accuracy of each single-view SVM.
        hog_accuracy = hog_svc_1.score(hog_test_X, hog_test_Y)
        _81_accuracy = _81_svc_1.score(_81_test_X, _81_test_Y)
        _30_accuracy = _30_svc_1.score(_30_test_X, _30_test_Y)
        # NOTE(review): scored against _30_test_Y while other views use their
        # own label lists — presumably the label lists are identical across
        # views; confirm.
        whole_accuracy = whole_svc_1.score(whole_test_X, _30_test_Y)
        # Class-probability matrices and hard predictions on the test set,
        # per view.
        hog_svc_1_test_probility = hog_svc_1.predict_proba(hog_test_X)
        hog_svc_1_test_predict_Y = hog_svc_1.predict(hog_test_X)
        _81_svc_1_test_probility = _81_svc_1.predict_proba(_81_test_X)
        _81_svc_1_test_predict_Y = _81_svc_1.predict(_81_test_X)
        _30_svc_1_test_probility = _30_svc_1.predict_proba(_30_test_X)
        _30_svc_1_test_predict_Y = _30_svc_1.predict(_30_test_X)
        # Per-class accuracies; hog_test_Y is used as ground truth for all
        # three views (labels assumed identical across views — see note above).
        each_class_hog_accuracy_list = Utilities.get_each_class_accuracy(hog_svc_1_test_predict_Y, hog_test_Y)
        each_class_hsv_accuracy_list = Utilities.get_each_class_accuracy(_81_svc_1_test_predict_Y, hog_test_Y)
        each_class_lbp_accuracy_list = Utilities.get_each_class_accuracy(_30_svc_1_test_predict_Y, hog_test_Y)
        # Record this fold's per-class accuracies for each view.
        hog_class_1_accuracy_list.append(each_class_hog_accuracy_list[0])
        hog_class_2_accuracy_list.append(each_class_hog_accuracy_list[1])
        hog_class_3_accuracy_list.append(each_class_hog_accuracy_list[2])
        hog_class_4_accuracy_list.append(each_class_hog_accuracy_list[3])
        hog_class_5_accuracy_list.append(each_class_hog_accuracy_list[4])
        hog_class_6_accuracy_list.append(each_class_hog_accuracy_list[5])
        hog_class_7_accuracy_list.append(each_class_hog_accuracy_list[6])
        hog_class_8_accuracy_list.append(each_class_hog_accuracy_list[7])
        hog_class_9_accuracy_list.append(each_class_hog_accuracy_list[8])
        hog_class_10_accuracy_list.append(each_class_hog_accuracy_list[9])
        hog_class_11_accuracy_list.append(each_class_hog_accuracy_list[10])
        hsv_class_1_accuracy_list.append(each_class_hsv_accuracy_list[0])
        hsv_class_2_accuracy_list.append(each_class_hsv_accuracy_list[1])
        hsv_class_3_accuracy_list.append(each_class_hsv_accuracy_list[2])
        hsv_class_4_accuracy_list.append(each_class_hsv_accuracy_list[3])
        hsv_class_5_accuracy_list.append(each_class_hsv_accuracy_list[4])
        hsv_class_6_accuracy_list.append(each_class_hsv_accuracy_list[5])
        hsv_class_7_accuracy_list.append(each_class_hsv_accuracy_list[6])
        hsv_class_8_accuracy_list.append(each_class_hsv_accuracy_list[7])
        hsv_class_9_accuracy_list.append(each_class_hsv_accuracy_list[8])
        hsv_class_10_accuracy_list.append(each_class_hsv_accuracy_list[9])
        hsv_class_11_accuracy_list.append(each_class_hsv_accuracy_list[10])
        lbp_class_1_accuracy_list.append(each_class_lbp_accuracy_list[0])
        lbp_class_2_accuracy_list.append(each_class_lbp_accuracy_list[1])
        lbp_class_3_accuracy_list.append(each_class_lbp_accuracy_list[2])
        lbp_class_4_accuracy_list.append(each_class_lbp_accuracy_list[3])
        lbp_class_5_accuracy_list.append(each_class_lbp_accuracy_list[4])
        lbp_class_6_accuracy_list.append(each_class_lbp_accuracy_list[5])
        lbp_class_7_accuracy_list.append(each_class_lbp_accuracy_list[6])
        lbp_class_8_accuracy_list.append(each_class_lbp_accuracy_list[7])
        lbp_class_9_accuracy_list.append(each_class_lbp_accuracy_list[8])
        lbp_class_10_accuracy_list.append(each_class_lbp_accuracy_list[9])
        lbp_class_11_accuracy_list.append(each_class_lbp_accuracy_list[10])
        # max_each_class_hog_accuracy_list = max(each_class_hog_accuracy_list)
        # min_each_class_hog_accuracy_list = min(each_class_hog_accuracy_list)
        # max_each_class_hsv_accuracy_list = max(each_class_hsv_accuracy_list)
        # min_each_class_hsv_accuracy_list = min(each_class_hsv_accuracy_list)
        # max_each_class_lbp_accuracy_list = max(each_class_lbp_accuracy_list)
        # min_each_class_lbp_accuracy_list = min(each_class_lbp_accuracy_list)
        # Report this fold's overall accuracies (as percentages).
        print("SVM-hog准确率:")
        print(hog_accuracy * 100)
        print("SVM-hsv准确率:")
        print(_81_accuracy * 100)
        print("SVM-lbp准确率:")
        print(_30_accuracy * 100)
        print("SVM-whole准确率:")
        print(whole_accuracy * 100)
        print("有标签数据集大小:")
        print(label_data_num_list)
[plt.setp(item.yaxis.get_label(), 'size', 40) for item in axes.ravel()] [plt.setp(item.xaxis.get_label(), 'size', 40) for item in axes.ravel()] corr = df.corr().as_matrix() for i, j in zip(*plt.np.triu_indices_from(axes, k=1)): axes[i, j].annotate("%.3f" %corr[i,j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center', size=45) plt.show() if __name__ == '__main__': # data preparation file = '3222' data_preparation = DataPreparation(file) logs = data_preparation.logs_preparation() rules = data_preparation.rules_preparation() indicators = data_preparation.indexes_preparation(rules) rules_ids = data_preparation.rules_to_ids(rules) # frequent patterns spam = SPAM(file) spam.set_max_gap(1) spam.set_min_pattern_length(2) spam.set_max_pattern_length(3) spam.spam_algorithm(logs, 0.5) #unusual patterns ind = str(indicators.loc[0][0]) logs = logs.reset_index()
w = csv.writer(f) for row in DS.dataset_names: w.writerow([row]) # Read in the data set names so they can be used to # label the chip stacks. dataset_names = [] for row in open('dataset_names.csv'): dataset_names.append(row.rstrip('\n')) # Example usage of DataLabel class to create and label # the individual chip stacks. DL = DataLabel(path_to_directory_for_storage, name_of_hdf5_file_for_storage, dataset_names, 50, 50) # Plot a single chip from the first chip stack. plt.imshow(DL.images[0][0], cmap=plt.get_cmap('gray')) plt.colorbar() plt.show() # Example usage of the DataPreparation class to convert # the labeled chip stacks to .tfrecords. number_training = 80 number_validation = 5 number_testing = 0 DP = DataPreparation('astrodetection', DL.images, DL.labels, number_training, number_validation, number_testing, path_to_directory_for_storage)
def __init__(self):
    """Initialise the preprocessing pipeline stages."""
    # Final pipeline output; empty until the processing steps are run.
    self._dataProcessed = list()
    # Pipeline stages, in execution order: preparation, cleaning, transformation.
    self._dataPreparation = DataPreparation()
    self._dataCleaning = DataCleaning()
    self._dataTransforming = DataTransformation()
def KFold_MFSSEL(featureNameDirPath, savePath=None, trainAndTestFlag="train"):
    """K-fold semi-supervised MFSSEL training and evaluation.

    For every fold: iteratively (self-training loop) trains one RBF-SVM per
    feature view (HOG, 81-dim, 30-dim), moves the most confident
    pseudo-labelled samples from the unlabelled pool into the labelled pool,
    tracks the best-performing models, persists first-iteration and best
    models to savePath, then evaluates the fused multi-view predictions on
    the test set. Finally prints per-class and overall averages across folds.

    Args:
        featureNameDirPath: directory holding the per-view feature files.
        savePath: directory where .model files are dumped/loaded.
            NOTE(review): used both as a raw prefix (savePath + "first_hog.model")
            and with an extra slash (savePath + "/first_hog.model") — confirm
            savePath ends with a path separator or the two spellings resolve
            to the same file.
        trainAndTestFlag: not referenced in this function body — TODO confirm
            intended use.
    """
    # Initialise the DataPreparation helper.
    dataPreparation = DataPreparation()
    train_test_tuple_list = dataPreparation.getLabelAndNameTupleList_KFold(featureNameDirPath)
    # Initial (first-iteration) per-class accuracy, one entry per fold.
    first_class_1_accuracy_list = []
    first_class_2_accuracy_list = []
    first_class_3_accuracy_list = []
    first_class_4_accuracy_list = []
    first_class_5_accuracy_list = []
    first_class_6_accuracy_list = []
    first_class_7_accuracy_list = []
    first_class_8_accuracy_list = []
    first_class_9_accuracy_list = []
    first_class_10_accuracy_list = []
    first_class_11_accuracy_list = []
    # Final per-class accuracy, one entry per fold.
    class_1_accuracy_list = []
    class_2_accuracy_list = []
    class_3_accuracy_list = []
    class_4_accuracy_list = []
    class_5_accuracy_list = []
    class_6_accuracy_list = []
    class_7_accuracy_list = []
    class_8_accuracy_list = []
    class_9_accuracy_list = []
    class_10_accuracy_list = []
    class_11_accuracy_list = []
    # Overall initial accuracy, one entry per fold.
    whole_first_accuracy_list = []
    # Overall final accuracy, one entry per fold.
    whole_accuracy_list = []
    for train_test_tuple in train_test_tuple_list:
        print("-----------------------------------------------------------------")
        # 1-based index of the current fold.
        print(train_test_tuple_list.index(train_test_tuple) + 1)
        print("-----------------------------------------------------------------")
        # Best models seen so far in this fold's self-training loop.
        model_max_hog = None
        model_max_81 = None
        model_max_30 = None
        accuracy_max = 0
        # Sub-directories holding each feature view.
        _81FeatureDir = "/_81videoFeature/"
        _30FeatureDir = "/_30videoFeature/"
        hogFeatureDir = "/hogvideoFeature/"
        # Best single-view accuracy per self-training iteration.
        accuracy_list = []
        # Total number of classes.
        whole_class = 11
        # Take the top-k most confident samples per iteration.
        topk = 5
        # Maximum number of self-training iterations.
        loop_num = 100
        # Confidence / fusion parameters passed to Utilities helpers.
        num = 0.7
        para = 0.7
        # Labelled training-set size, recorded at each iteration.
        label_data_num_list = []
        label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = train_test_tuple
        # HOG labelled training set.
        hog_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                                label_name_labeled_train_tuple_list)
        hog_labeled_train_Y, hog_labeled_train_X, hog_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_labeled_train_tuple_list)
        # HOG unlabelled training set.
        hog_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        hog_unlabeled_Y, hog_unlabeled_X, hog_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_unlabeled_tuple_list)
        # HOG test set.
        hog_test_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir, label_name_test_tuple_list)
        hog_test_Y, hog_test_X, hog_test_Name = Utilities.get_Y_X_Name_list_from_tuple(hog_test_tuple_list)
        # 81-dim labelled training set.
        _81_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _81_labeled_train_Y, _81_labeled_train_X, _81_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_labeled_train_tuple_list)
        # 81-dim unlabelled training set.
        _81_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _81_unlabeled_Y, _81_unlabeled_X, _81_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_unlabeled_tuple_list)
        # 81-dim test set.
        _81_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir, label_name_test_tuple_list)
        _81_test_Y, _81_test_X, _81_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_81_test_tuple_list)
        # 30-dim labelled training set.
        _30_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _30_labeled_train_Y, _30_labeled_train_X, _30_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_labeled_train_tuple_list)
        # 30-dim unlabelled training set.
        _30_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _30_unlabeled_Y, _30_unlabeled_X, _30_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_unlabeled_tuple_list)
        # 30-dim test set.
        _30_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir, label_name_test_tuple_list)
        _30_test_Y, _30_test_X, _30_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_30_test_tuple_list)
        # Self-training loop.
        for h in range(1, loop_num + 1):
            label_data_num_list.append(len(hog_labeled_train_Y))
            # Train one RBF-SVM per view on the current labelled pool.
            hog_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
            hog_svc_1.fit(hog_labeled_train_X, hog_labeled_train_Y)
            _81_svc_1 = SVC(C=4, kernel='rbf', gamma=14, probability=True)  # c:2 gamma=6
            _81_svc_1.fit(_81_labeled_train_X, _81_labeled_train_Y)
            _30_svc_1 = SVC(C=32, kernel='rbf', gamma=8, probability=True)  # c:32 gamma=12
            _30_svc_1.fit(_30_labeled_train_X, _30_labeled_train_Y)
            # Test accuracy of each view's current model.
            hog_accuracy = hog_svc_1.score(hog_test_X, hog_test_Y)
            _81_accuracy = _81_svc_1.score(_81_test_X, _81_test_Y)
            _30_accuracy = _30_svc_1.score(_30_test_X, _30_test_Y)
            if h == 1:
                # Persist the first-iteration models as the "initial" baselines.
                if hog_svc_1 is not None:
                    print("正在保存first_hog.model...")
                    joblib.dump(hog_svc_1, savePath + "first_hog.model")
                    print("保存first_hog.model完毕。")
                if _81_svc_1 is not None:
                    print("正在保存first_81.model...")
                    joblib.dump(_81_svc_1, savePath + "first_81.model")
                    print("保存first_81.model完毕。")
                if _30_svc_1 is not None:
                    print("正在保存first_30.model...")
                    joblib.dump(_30_svc_1, savePath + "first_30.model")
                    print("保存first_30.model完毕。")
            # Best accuracy among the three single-view models this iteration.
            accuracy_max_from_single = max(hog_accuracy, _81_accuracy, _30_accuracy)
            accuracy_list.append(accuracy_max_from_single * 100)
            # hog_accuracy_list.append(hog_accuracy * 100)
            # _81_accuracy_list.append(_81_accuracy * 100)
            # _30_accuracy_list.append(_30_accuracy * 100)
            # Keep the models from the best iteration so far.
            if accuracy_max_from_single > accuracy_max:
                accuracy_max = accuracy_max_from_single
                model_max_hog = hog_svc_1
                model_max_81 = _81_svc_1
                model_max_30 = _30_svc_1
            # Stop when the unlabelled pool is exhausted or on the last iteration.
            if len(hog_unlabeled_X) == 0 or len(_81_unlabeled_X) == 0 or len(_30_unlabeled_X) == 0:
                break
            if h == loop_num:
                break
            # print(accuracy_max * 100)
            # Class probabilities and hard predictions on the unlabelled pool,
            # per view.
            hog_svc_1_probility = hog_svc_1.predict_proba(hog_unlabeled_X)
            hog_svc_1_predict_Y = hog_svc_1.predict(hog_unlabeled_X)
            _81_svc_1_probility = _81_svc_1.predict_proba(_81_unlabeled_X)
            _81_svc_1_predict_Y = _81_svc_1.predict(_81_unlabeled_X)
            _30_svc_1_probility = _30_svc_1.predict_proba(_30_unlabeled_X)
            _30_svc_1_predict_Y = _30_svc_1.predict(_30_unlabeled_X)
            probility_list_1 = [hog_svc_1_probility, _81_svc_1_probility, _30_svc_1_probility]
            unlabeled_Y_list_1 = [hog_unlabeled_Y, _81_unlabeled_Y, _30_unlabeled_Y]
            predict_Y_list_1 = [hog_svc_1_predict_Y, _81_svc_1_predict_Y, _30_svc_1_predict_Y]
            # voted_index_predict_Y_list = Utilities.vote(predict_Y_list_1, unlabeled_Y_list_1, whole_class, topk)
            #
            # voted_Y_list, voted_index_list = Utilities.get_voted_confidence(probility_list_1,
            #                                                                 voted_index_predict_Y_list[0],
            #                                                                 voted_index_predict_Y_list[1], whole_class,
            #                                                                 topk)
            # Pick the most confident pseudo-labelled samples across the views.
            selected_ind_list, selected_pesudo_label_list = Utilities.get_pesudo_label(probility_list_1,
                                                                                       predict_Y_list_1,
                                                                                       unlabeled_Y_list_1,
                                                                                       whole_class, topk, num, para)
            # a = []
            # for i in selected_ind_list:
            #     a.append(_30_unlabeled_Y[i])
            # print(selected_pesudo_label_list)
            # print(a)
            # Move the selected samples from the unlabelled to the labelled pool
            # (same indices across all three views).
            for i in range(len(selected_ind_list)):
                hog_labeled_train_X.append(hog_unlabeled_X[selected_ind_list[i]])
                hog_labeled_train_Y.append(selected_pesudo_label_list[i])
                _81_labeled_train_X.append(_81_unlabeled_X[selected_ind_list[i]])
                _81_labeled_train_Y.append(selected_pesudo_label_list[i])
                _30_labeled_train_X.append(_30_unlabeled_X[selected_ind_list[i]])
                _30_labeled_train_Y.append(selected_pesudo_label_list[i])
            hog_unlabeled_X = [i for j, i in enumerate(hog_unlabeled_X) if j not in selected_ind_list]
            hog_unlabeled_Y = [i for j, i in enumerate(hog_unlabeled_Y) if j not in selected_ind_list]
            _81_unlabeled_X = [i for j, i in enumerate(_81_unlabeled_X) if j not in selected_ind_list]
            _81_unlabeled_Y = [i for j, i in enumerate(_81_unlabeled_Y) if j not in selected_ind_list]
            _30_unlabeled_X = [i for j, i in enumerate(_30_unlabeled_X) if j not in selected_ind_list]
            _30_unlabeled_Y = [i for j, i in enumerate(_30_unlabeled_Y) if j not in selected_ind_list]
        # print(accuracy_max * 100)
        # print("有标签数据集大小:")
        # print(label_data_num_list)
        # print("accuracy_list:")
        # print(accuracy_list)
        # Persist the best models seen during this fold's self-training.
        if model_max_hog is not None:
            print("正在保存hog.model...")
            joblib.dump(model_max_hog, savePath + "hog.model")
            print("保存hog.model完毕。")
        if model_max_81 is not None:
            print("正在保存81.model...")
            joblib.dump(model_max_81, savePath + "81.model")
            print("保存81.model完毕。")
        if model_max_30 is not None:
            print("正在保存30.model...")
            joblib.dump(model_max_30, savePath + "30.model")
            print("保存30.model完毕。")
        if os.path.exists(savePath):
            # Evaluate the first-iteration (baseline) models on the test set.
            first_hogModel = joblib.load(savePath + "/first_hog.model")
            first_81Model = joblib.load(savePath + "/first_81.model")
            first_30Model = joblib.load(savePath + "/first_30.model")
            first_hog_svc_test_probility = first_hogModel.predict_proba(hog_test_X)
            first_hog_svc_test_predict_Y = first_hogModel.predict(hog_test_X)
            first_81_svc_test_probility = first_81Model.predict_proba(_81_test_X)
            first_81_svc_test_predict_Y = first_81Model.predict(_81_test_X)
            first_30_svc_test_probility = first_30Model.predict_proba(_30_test_X)
            first_30_svc_test_predict_Y = first_30Model.predict(_30_test_X)
            first_probility_list = first_hog_svc_test_probility, first_81_svc_test_probility, first_30_svc_test_probility
            first_predict_Y_list = first_hog_svc_test_predict_Y, first_81_svc_test_predict_Y, first_30_svc_test_predict_Y
            # Fuse the three views into one prediction per test sample.
            first_mfssel_predict_Y_list = Utilities.predict_Y_test(first_probility_list, first_predict_Y_list,
                                                                   whole_class, para)
            first_each_class_accuracy_list = Utilities.get_each_class_accuracy(first_mfssel_predict_Y_list, hog_test_Y)
            first_class_1_accuracy_list.append(first_each_class_accuracy_list[0])
            first_class_2_accuracy_list.append(first_each_class_accuracy_list[1])
            first_class_3_accuracy_list.append(first_each_class_accuracy_list[2])
            first_class_4_accuracy_list.append(first_each_class_accuracy_list[3])
            first_class_5_accuracy_list.append(first_each_class_accuracy_list[4])
            first_class_6_accuracy_list.append(first_each_class_accuracy_list[5])
            first_class_7_accuracy_list.append(first_each_class_accuracy_list[6])
            first_class_8_accuracy_list.append(first_each_class_accuracy_list[7])
            first_class_9_accuracy_list.append(first_each_class_accuracy_list[8])
            first_class_10_accuracy_list.append(first_each_class_accuracy_list[9])
            first_class_11_accuracy_list.append(first_each_class_accuracy_list[10])
            # Overall initial accuracy for this fold.
            if len(first_mfssel_predict_Y_list) == len(hog_test_Y):
                rightNum = 0
                for i in range(len(first_mfssel_predict_Y_list)):
                    if first_mfssel_predict_Y_list[i] == hog_test_Y[i]:
                        rightNum = rightNum + 1
                whole_first_accuracy_list.append(rightNum / len(hog_test_Y))
                print("整体初始准确率:")
                print(rightNum / len(hog_test_Y))
            else:
                print("出错!!")
            print("每一类的初始准确率:")
            print(first_each_class_accuracy_list)
            print("初始平均准确率:")
            print(sum(first_each_class_accuracy_list) / len(first_each_class_accuracy_list))
            # Evaluate the best (final) models on the test set.
            hogModel = joblib.load(savePath + "/hog.model")
            _81Model = joblib.load(savePath + "/81.model")
            _30Model = joblib.load(savePath + "/30.model")
            hog_svc_test_probility = hogModel.predict_proba(hog_test_X)
            hog_svc_test_predict_Y = hogModel.predict(hog_test_X)
            _81_svc_test_probility = _81Model.predict_proba(_81_test_X)
            _81_svc_test_predict_Y = _81Model.predict(_81_test_X)
            _30_svc_test_probility = _30Model.predict_proba(_30_test_X)
            _30_svc_test_predict_Y = _30Model.predict(_30_test_X)
            probility_list = hog_svc_test_probility, _81_svc_test_probility, _30_svc_test_probility
            predict_Y_list = hog_svc_test_predict_Y, _81_svc_test_predict_Y, _30_svc_test_predict_Y
            mfssel_predict_Y_list = Utilities.predict_Y_test(probility_list, predict_Y_list, whole_class, para)
            each_class_accuracy_list = Utilities.get_each_class_accuracy(mfssel_predict_Y_list, hog_test_Y)
            class_1_accuracy_list.append(each_class_accuracy_list[0])
            class_2_accuracy_list.append(each_class_accuracy_list[1])
            class_3_accuracy_list.append(each_class_accuracy_list[2])
            class_4_accuracy_list.append(each_class_accuracy_list[3])
            class_5_accuracy_list.append(each_class_accuracy_list[4])
            class_6_accuracy_list.append(each_class_accuracy_list[5])
            class_7_accuracy_list.append(each_class_accuracy_list[6])
            class_8_accuracy_list.append(each_class_accuracy_list[7])
            class_9_accuracy_list.append(each_class_accuracy_list[8])
            class_10_accuracy_list.append(each_class_accuracy_list[9])
            class_11_accuracy_list.append(each_class_accuracy_list[10])
            print("每一类的最终准确率:")
            print(each_class_accuracy_list)
            print("平均准确率:")
            print(sum(each_class_accuracy_list) / len(each_class_accuracy_list))
            # Overall final accuracy for this fold.
            if len(mfssel_predict_Y_list) == len(hog_test_Y):
                rightNum = 0
                for i in range(len(mfssel_predict_Y_list)):
                    if mfssel_predict_Y_list[i] == hog_test_Y[i]:
                        rightNum = rightNum + 1
                whole_accuracy_list.append(rightNum / len(hog_test_Y))
                print("整体准确率:")
                print(rightNum / len(hog_test_Y))
            else:
                print("出错!!")
    print("---------------------------------------------------")
    # Average initial per-class accuracies across all folds.
    avg_first_class_1_accuracy = sum(first_class_1_accuracy_list) / len(first_class_1_accuracy_list)
    avg_first_class_2_accuracy = sum(first_class_2_accuracy_list) / len(first_class_2_accuracy_list)
    avg_first_class_3_accuracy = sum(first_class_3_accuracy_list) / len(first_class_3_accuracy_list)
    avg_first_class_4_accuracy = sum(first_class_4_accuracy_list) / len(first_class_4_accuracy_list)
    avg_first_class_5_accuracy = sum(first_class_5_accuracy_list) / len(first_class_5_accuracy_list)
    avg_first_class_6_accuracy = sum(first_class_6_accuracy_list) / len(first_class_6_accuracy_list)
    avg_first_class_7_accuracy = sum(first_class_7_accuracy_list) / len(first_class_7_accuracy_list)
    avg_first_class_8_accuracy = sum(first_class_8_accuracy_list) / len(first_class_8_accuracy_list)
    avg_first_class_9_accuracy = sum(first_class_9_accuracy_list) / len(first_class_9_accuracy_list)
    avg_first_class_10_accuracy = sum(first_class_10_accuracy_list) / len(first_class_10_accuracy_list)
    avg_first_class_11_accuracy = sum(first_class_11_accuracy_list) / len(first_class_11_accuracy_list)
    print("shooting初始平均准确率:")
    print(avg_first_class_1_accuracy)
    print("biking初始平均准确率:")
    print(avg_first_class_2_accuracy)
    print("diving初始平均准确率:")
    print(avg_first_class_3_accuracy)
    print("golf初始平均准确率:")
    print(avg_first_class_4_accuracy)
    print("riding初始平均准确率:")
    print(avg_first_class_5_accuracy)
    print("juggle初始平均准确率:")
    print(avg_first_class_6_accuracy)
    print("swing初始平均准确率:")
    print(avg_first_class_7_accuracy)
    print("tennis初始平均准确率:")
    print(avg_first_class_8_accuracy)
    print("jumping初始平均准确率:")
    print(avg_first_class_9_accuracy)
    print("spiking初始平均准确率:")
    print(avg_first_class_10_accuracy)
    print("walk初始平均准确率:")
    print(avg_first_class_11_accuracy)
    print("---------------------------------------------------")
    # Average final per-class accuracies across all folds.
    avg_class_1_accuracy = sum(class_1_accuracy_list) / len(class_1_accuracy_list)
    avg_class_2_accuracy = sum(class_2_accuracy_list) / len(class_2_accuracy_list)
    avg_class_3_accuracy = sum(class_3_accuracy_list) / len(class_3_accuracy_list)
    avg_class_4_accuracy = sum(class_4_accuracy_list) / len(class_4_accuracy_list)
    avg_class_5_accuracy = sum(class_5_accuracy_list) / len(class_5_accuracy_list)
    avg_class_6_accuracy = sum(class_6_accuracy_list) / len(class_6_accuracy_list)
    avg_class_7_accuracy = sum(class_7_accuracy_list) / len(class_7_accuracy_list)
    avg_class_8_accuracy = sum(class_8_accuracy_list) / len(class_8_accuracy_list)
    avg_class_9_accuracy = sum(class_9_accuracy_list) / len(class_9_accuracy_list)
    avg_class_10_accuracy = sum(class_10_accuracy_list) / len(class_10_accuracy_list)
    avg_class_11_accuracy = sum(class_11_accuracy_list) / len(class_11_accuracy_list)
    print("shooting最终平均准确率:")
    print(avg_class_1_accuracy)
    print("biking最终平均准确率:")
    print(avg_class_2_accuracy)
    print("diving最终平均准确率:")
    print(avg_class_3_accuracy)
    print("golf最终平均准确率:")
    print(avg_class_4_accuracy)
    print("riding最终平均准确率:")
    print(avg_class_5_accuracy)
    print("juggle最终平均准确率:")
    print(avg_class_6_accuracy)
    print("swing最终平均准确率:")
    print(avg_class_7_accuracy)
    print("tennis最终平均准确率:")
    print(avg_class_8_accuracy)
    print("jumping最终平均准确率:")
    print(avg_class_9_accuracy)
    print("spiking最终平均准确率:")
    print(avg_class_10_accuracy)
    print("walk最终平均准确率:")
    print(avg_class_11_accuracy)
    print("---------------------------------------------------")
    # Overall averages: macro (mean over classes) and micro (mean over folds).
    print("整体初始平均准确率:")
    a = [avg_first_class_1_accuracy, avg_first_class_2_accuracy, avg_first_class_3_accuracy,
         avg_first_class_4_accuracy, avg_first_class_5_accuracy, avg_first_class_6_accuracy,
         avg_first_class_7_accuracy, avg_first_class_8_accuracy, avg_first_class_9_accuracy,
         avg_first_class_10_accuracy, avg_first_class_11_accuracy]
    print(sum(a)/len(a))
    print(sum(whole_first_accuracy_list) / len(whole_first_accuracy_list))
    print("---------------------------------------------------")
    print("整体平均准确率:")
    b = [avg_class_1_accuracy, avg_class_2_accuracy, avg_class_3_accuracy, avg_class_4_accuracy,
         avg_class_5_accuracy, avg_class_6_accuracy, avg_class_7_accuracy, avg_class_8_accuracy,
         avg_class_9_accuracy, avg_class_10_accuracy, avg_class_11_accuracy]
    print(sum(b) / len(b))
    print(sum(whole_accuracy_list) / len(whole_accuracy_list))
def MFSSEL(featureNameDirPath, savePath=None, trainAndTestFlag="train"):
    """Multi-Feature Semi-Supervised Ensemble Learning driver.

    Trains three SVM views (HOG features, 81-dim features, 30-dim features)
    on a small labelled set, then iteratively pseudo-labels the most
    confident unlabelled samples (via ``Utilities.get_pesudo_label``) and
    moves them into the labelled set, re-training each round.  The models of
    the best-scoring round are saved with joblib.

    Args:
        featureNameDirPath: path handed to
            ``DataPreparation.getLabelAndNameTupleList`` to build the
            labelled/unlabelled/test splits.
        savePath: directory prefix where ``hog.model`` / ``81.model`` /
            ``30.model`` are dumped (train) or loaded (test).
            NOTE(review): the default ``None`` would make
            ``savePath + "hog.model"`` raise TypeError when a model is
            saved — callers presumably always pass it; confirm.
        trainAndTestFlag: ``"train"`` runs the self-training loop,
            ``"test"`` only loads previously saved models.
    """
    # Best models seen so far across the self-training iterations.
    model_max_hog = None
    model_max_81 = None
    model_max_30 = None
    accuracy_max = 0
    accuracy_max_from_single = 0
    featureDir = "/home/sunbite/MFSSEL/features/"
    # NOTE(review): only used by a removed K-fold variant; currently unused.
    featureDir_KFold = "/home/sunbite/MFSSEL/features_new_2/"
    train_81FeatureDir = "/train/_81videoFeature/"
    train_30FeatureDir = "/train/_30videoFeature/"
    train_hogFeatureDir = "/train/hogvideoFeature/"
    test_81FeatureDir = "/test/_81videoFeature/"
    test_30FeatureDir = "/test/_30videoFeature/"
    test_hogFeatureDir = "/test/hogvideoFeature/"
    # Per-class test accuracy, one entry appended per iteration.
    class_1_accuracy_list = []
    class_2_accuracy_list = []
    class_3_accuracy_list = []
    class_4_accuracy_list = []
    class_5_accuracy_list = []
    class_6_accuracy_list = []
    class_7_accuracy_list = []
    class_8_accuracy_list = []
    class_9_accuracy_list = []
    class_10_accuracy_list = []
    class_11_accuracy_list = []
    # Overall accuracy (best single view, in percent) per iteration.
    accuracy_list = []
    # Total number of action classes.
    whole_class = 11
    # Select the top-k most confident samples per round.
    topk = 5
    # Maximum number of self-training iterations.
    loop_num = 100
    # Size of the labelled set at the start of each iteration.
    label_data_num_list = []
    # Confidence-selection parameters forwarded to
    # Utilities.get_pesudo_label — semantics defined there; TODO confirm.
    num = 0.8
    para = 0.9
    # ---------------------------------------------------------------- train
    if trainAndTestFlag == "train":
        # Prepare the data-loading helper.
        dataPreparation = DataPreparation()
        # Labelled-train / unlabelled-train / test splits as
        # (label, name) tuple lists.
        label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = dataPreparation.getLabelAndNameTupleList(
            featureNameDirPath)
        # ---- HOG view: labelled train, unlabelled train, test sets.
        hog_labeled_train_tuple_list = dataPreparation.loadData(
            featureDir, train_hogFeatureDir, label_name_labeled_train_tuple_list)
        hog_labeled_train_Y, hog_labeled_train_X, hog_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_labeled_train_tuple_list)
        hog_unlabeled_tuple_list = dataPreparation.loadData(
            featureDir, train_hogFeatureDir, label_name_unlabeled_train_tuple_list)
        hog_unlabeled_Y, hog_unlabeled_X, hog_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_unlabeled_tuple_list)
        hog_test_tuple_list = dataPreparation.loadData(
            featureDir, test_hogFeatureDir, label_name_test_tuple_list)
        hog_test_Y, hog_test_X, hog_test_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_test_tuple_list)
        # ---- 81-dim view: labelled train, unlabelled train, test sets.
        _81_labeled_train_tuple_list = dataPreparation.loadData(
            featureDir, train_81FeatureDir, label_name_labeled_train_tuple_list)
        _81_labeled_train_Y, _81_labeled_train_X, _81_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_labeled_train_tuple_list)
        _81_unlabeled_tuple_list = dataPreparation.loadData(
            featureDir, train_81FeatureDir, label_name_unlabeled_train_tuple_list)
        _81_unlabeled_Y, _81_unlabeled_X, _81_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_unlabeled_tuple_list)
        _81_test_tuple_list = dataPreparation.loadData(
            featureDir, test_81FeatureDir, label_name_test_tuple_list)
        _81_test_Y, _81_test_X, _81_test_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_test_tuple_list)
        # ---- 30-dim view: labelled train, unlabelled train, test sets.
        _30_labeled_train_tuple_list = dataPreparation.loadData(
            featureDir, train_30FeatureDir, label_name_labeled_train_tuple_list)
        _30_labeled_train_Y, _30_labeled_train_X, _30_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_labeled_train_tuple_list)
        _30_unlabeled_tuple_list = dataPreparation.loadData(
            featureDir, train_30FeatureDir, label_name_unlabeled_train_tuple_list)
        _30_unlabeled_Y, _30_unlabeled_X, _30_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_unlabeled_tuple_list)
        _30_test_tuple_list = dataPreparation.loadData(
            featureDir, test_30FeatureDir, label_name_test_tuple_list)
        _30_test_Y, _30_test_X, _30_test_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_test_tuple_list)
        # -------------------------------------------- self-training loop
        for h in range(1, loop_num + 1):
            print("有标签数据集大小:")
            label_data_num_list.append(len(hog_labeled_train_Y))
            print(label_data_num_list)
            # One RBF-kernel SVM per feature view; hyper-parameters were
            # hand-tuned (the trailing comments record earlier candidates).
            hog_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
            hog_svc_1.fit(hog_labeled_train_X, hog_labeled_train_Y)
            _81_svc_1 = SVC(C=4, kernel='rbf', gamma=14, probability=True)  # c:2 gamma=6
            _81_svc_1.fit(_81_labeled_train_X, _81_labeled_train_Y)
            _30_svc_1 = SVC(C=32, kernel='rbf', gamma=8, probability=True)  # c:32 gamma=12
            _30_svc_1.fit(_30_labeled_train_X, _30_labeled_train_Y)
            # Per-view accuracy on the held-out test set.
            hog_accuracy = hog_svc_1.score(hog_test_X, hog_test_Y)
            _81_accuracy = _81_svc_1.score(_81_test_X, _81_test_Y)
            _30_accuracy = _30_svc_1.score(_30_test_X, _30_test_Y)
            # Class-probability matrices and hard predictions on the test
            # set, one pair per view, fused by Utilities.predict_Y below.
            hog_svc_1_test_probility = hog_svc_1.predict_proba(hog_test_X)
            hog_svc_1_test_predict_Y = hog_svc_1.predict(hog_test_X)
            _81_svc_1_test_probility = _81_svc_1.predict_proba(_81_test_X)
            _81_svc_1_test_predict_Y = _81_svc_1.predict(_81_test_X)
            _30_svc_1_test_probility = _30_svc_1.predict_proba(_30_test_X)
            _30_svc_1_test_predict_Y = _30_svc_1.predict(_30_test_X)
            # Ensemble prediction combining the three views.
            mfssel_predict_Y_list_ = Utilities.predict_Y(
                hog_svc_1_test_probility, _81_svc_1_test_probility,
                _30_svc_1_test_probility, hog_svc_1_test_predict_Y,
                _81_svc_1_test_predict_Y, _30_svc_1_test_predict_Y)
            # Track per-class accuracy of the fused prediction.
            each_class_accuracy_list = Utilities.get_each_class_accuracy(
                mfssel_predict_Y_list_, hog_test_Y)
            class_1_accuracy_list.append(each_class_accuracy_list[0])
            class_2_accuracy_list.append(each_class_accuracy_list[1])
            class_3_accuracy_list.append(each_class_accuracy_list[2])
            class_4_accuracy_list.append(each_class_accuracy_list[3])
            class_5_accuracy_list.append(each_class_accuracy_list[4])
            class_6_accuracy_list.append(each_class_accuracy_list[5])
            class_7_accuracy_list.append(each_class_accuracy_list[6])
            class_8_accuracy_list.append(each_class_accuracy_list[7])
            class_9_accuracy_list.append(each_class_accuracy_list[8])
            class_10_accuracy_list.append(each_class_accuracy_list[9])
            class_11_accuracy_list.append(each_class_accuracy_list[10])
            # Best accuracy among the three single views this round.
            accuracy_max_from_single = max(hog_accuracy, _81_accuracy, _30_accuracy)
            accuracy_list.append(accuracy_max_from_single * 100)
            print("accuracy_list:")
            print(accuracy_list)
            # Keep the model triple of the best round seen so far.
            if accuracy_max_from_single > accuracy_max:
                accuracy_max = accuracy_max_from_single
                model_max_hog = hog_svc_1
                model_max_81 = _81_svc_1
                model_max_30 = _30_svc_1
            # Stop when the unlabelled pool is exhausted or the iteration
            # budget is spent (the h == loop_num check skips a useless
            # final pseudo-labelling pass).
            if len(hog_unlabeled_X) == 0 or len(_81_unlabeled_X) == 0 or len(
                    _30_unlabeled_X) == 0:
                break
            if h == loop_num:
                break
            # Probabilities / predictions on the *unlabelled* pool, used
            # to pick the next pseudo-labelled batch.
            hog_svc_1_probility = hog_svc_1.predict_proba(hog_unlabeled_X)
            hog_svc_1_predict_Y = hog_svc_1.predict(hog_unlabeled_X)
            _81_svc_1_probility = _81_svc_1.predict_proba(_81_unlabeled_X)
            _81_svc_1_predict_Y = _81_svc_1.predict(_81_unlabeled_X)
            _30_svc_1_probility = _30_svc_1.predict_proba(_30_unlabeled_X)
            _30_svc_1_predict_Y = _30_svc_1.predict(_30_unlabeled_X)
            probility_list_1 = [
                hog_svc_1_probility, _81_svc_1_probility, _30_svc_1_probility
            ]
            unlabeled_Y_list_1 = [
                hog_unlabeled_Y, _81_unlabeled_Y, _30_unlabeled_Y
            ]
            predict_Y_list_1 = [
                hog_svc_1_predict_Y, _81_svc_1_predict_Y, _30_svc_1_predict_Y
            ]
            # Indices of the most confident unlabelled samples and their
            # pseudo labels.
            selected_ind_list, selected_pesudo_label_list = Utilities.get_pesudo_label(
                probility_list_1, predict_Y_list_1, unlabeled_Y_list_1,
                whole_class, topk, num, para)
            # Debug: print pseudo labels next to the true labels of the
            # selected samples.
            a = []
            for i in selected_ind_list:
                a.append(_30_unlabeled_Y[i])
            print(selected_pesudo_label_list)
            print(a)
            # Move the selected samples from the unlabelled pool into the
            # labelled training set of every view, using the pseudo label.
            for i in range(len(selected_ind_list)):
                hog_labeled_train_X.append(
                    hog_unlabeled_X[selected_ind_list[i]])
                hog_labeled_train_Y.append(selected_pesudo_label_list[i])
                _81_labeled_train_X.append(
                    _81_unlabeled_X[selected_ind_list[i]])
                _81_labeled_train_Y.append(selected_pesudo_label_list[i])
                _30_labeled_train_X.append(
                    _30_unlabeled_X[selected_ind_list[i]])
                _30_labeled_train_Y.append(selected_pesudo_label_list[i])
            # Drop the selected indices from all six parallel pool lists.
            hog_unlabeled_X = [
                i for j, i in enumerate(hog_unlabeled_X)
                if j not in selected_ind_list
            ]
            hog_unlabeled_Y = [
                i for j, i in enumerate(hog_unlabeled_Y)
                if j not in selected_ind_list
            ]
            _81_unlabeled_X = [
                i for j, i in enumerate(_81_unlabeled_X)
                if j not in selected_ind_list
            ]
            _81_unlabeled_Y = [
                i for j, i in enumerate(_81_unlabeled_Y)
                if j not in selected_ind_list
            ]
            _30_unlabeled_X = [
                i for j, i in enumerate(_30_unlabeled_X)
                if j not in selected_ind_list
            ]
            _30_unlabeled_Y = [
                i for j, i in enumerate(_30_unlabeled_Y)
                if j not in selected_ind_list
            ]
        # ---------------------------------------- report and save models
        print(accuracy_max * 100)
        print(class_1_accuracy_list)
        print(class_2_accuracy_list)
        print(class_3_accuracy_list)
        print(class_4_accuracy_list)
        print(class_5_accuracy_list)
        print(class_6_accuracy_list)
        print(class_7_accuracy_list)
        print(class_8_accuracy_list)
        print(class_9_accuracy_list)
        print(class_10_accuracy_list)
        print(class_11_accuracy_list)
        if model_max_hog is not None:
            print("正在保存hog.model...")
            joblib.dump(model_max_hog, savePath + "hog.model")
            print("保存hog.model完毕。")
        if model_max_81 is not None:
            print("正在保存81.model...")
            joblib.dump(model_max_81, savePath + "81.model")
            print("保存81.model完毕。")
        if model_max_30 is not None:
            print("正在保存30.model...")
            joblib.dump(model_max_30, savePath + "30.model")
            print("保存30.model完毕。")
    # ----------------------------------------------------------------- test
    # Load the previously trained models for evaluation.
    elif trainAndTestFlag == "test":
        # NOTE(review): savePath is a filename *prefix*, not a directory;
        # os.path.exists on the bare prefix may not behave as intended —
        # confirm against how callers lay out the model files.
        if os.path.exists(savePath):
            model_max_hog = joblib.load(savePath + "hog.model")
            model_max_81 = joblib.load(savePath + "81.model")
            model_max_30 = joblib.load(savePath + "30.model")
from keras.preprocessing.sequence import pad_sequences from keras.utils import to_categorical #Modeling #from keras.models import Sequential #from keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout #%% Loading data #https://www.kaggle.com/wcukierski/enron-email-dataset emails = pd.read_csv('emails.csv', skiprows=lambda x: x % 9) #%% to see how emails look like print(emails['message'][1]) #%% object of class DataPreparation prep = DataPreparation(emails) #to extract emails' body then add it to a new column emails['Email'] = prep.bodyExtraction(emails['message']) #labeling, adding label to each column. emails['Sentiment'] = prep.labeling(emails['Email']) #creating a separate dataset with just two columns body and sentiment df = prep.newData(emails['Email'], emails['Sentiment']) #%% df.info() #%% #after extracting the body
plt.title('Support Vector Regression') plt.legend() plt.show() def fit(self, x, y): parameters = {'kernel': ('rbf', ), 'C': [1e3, 1e2, 1, 10]} svr = svm.SVR() self.estimator = GridSearchCV(svr, parameters) self.estimator.fit(x, y.values.ravel()) print self.estimator def predict(self, x, real_y): pred_y = self.estimator.predict(x) self.plotData(real_y, pred_y) if __name__ == "__main__": mysvm = MySVM() # trainning dp = DataPreparation(7, 'mid') x, y = dp.generateData() print "=====fit====" mysvm.fit(x, y) # predict dp = DataPreparation(8, 'mid') x, y = dp.generateData() print "=====predict====" mysvm.predict(x, y)
prefix_s_step, s_temp_bitmap) # recursively try to extend that pattern if self.max_pattern_length > size_current_prefix: self.dfs_pruning(prefix_s_step, s_temp_bitmap, s_temp, s_temp, s_temp[pos], size_current_prefix + 1) # Save the results in a file def write_to_file(self, file): results_to_strings = self.result.apply( lambda sequence: " -1 ".join(str(x) for x in sequence['Pattern']) + " -1 #SUP: " + str(sequence['SUP']), axis=1) results_to_strings.to_csv('../results-SPAM/' + self.file + '-out.txt', index=False, header=None) if __name__ == '__main__': f = 'test1' l = DataPreparation(f) logs = l.logs_preparation() s = SPAM(f) #s.set_max_gap(1) s.set_min_pattern_length(2) s.set_max_pattern_length(3) s.spam_algorithm(logs, 0.4) s.write_to_file(f)