def classificationWineRed():
    '''
    Test classification on the WineRed dataset.

    Loads the red-wine quality CSV, converts the feature columns to floats
    and the class column to integers, normalizes the inputs, then evaluates
    a multilayer-perceptron classifier with k-fold cross validation and
    prints the per-fold scores.
    '''
    seed(1)

    # Hyper-parameters for the MLP and the evaluation protocol.
    n_folds = 5
    l_rate = 0.3
    n_epoch = 1000
    n_hidden = [10]  # one hidden layer with 10 units

    # Fixed: removed non-idiomatic trailing semicolons.
    mlp = MultilayerNnClassifier()
    activationFunction = Sigmoid()
    dp = DataPreparation()
    evaluator = ClassificationEvaluator()
    splitting = Splitting()

    # Load and prepare the data.
    # (Fixed stale comment: this is the wine dataset, not the Seeds dataset.)
    filename = '../Datasets/winequality-red.csv'
    dataset = dp.load_csv(filename)
    for i in range(len(dataset[0]) - 1):
        dp.str_column_to_float(dataset, i)
    # Convert the class column (last column) to integers.
    dp.str_column_to_int(dataset, len(dataset[0]) - 1)
    # Normalize the input variables.
    minmax = dp.dataset_minmax(dataset)
    dp.normalize_dataset_classification(dataset, minmax)
    # Evaluate the algorithm with k-fold cross validation and print scores.
    scores = evaluator.evaluate_algorithm(dataset, splitting, mlp.back_propagation, n_folds, l_rate, n_epoch, n_hidden, activationFunction)
    print_classification_scores(scores)
def regressionWineWhite():
    '''
    Test regression on the WineWhite dataset.

    Loads the white-wine quality CSV, converts all columns (including the
    target) to floats, normalizes the whole dataset, then evaluates a
    multilayer-perceptron regressor with k-fold cross validation and prints
    the per-fold scores.
    '''
    # Fixed: docstring previously said "Classification" although this
    # function uses MultilayerNnRegressor / RegressionEvaluator.
    seed(1)

    # Hyper-parameters for the MLP and the evaluation protocol.
    n_folds = 5
    l_rate = 0.3
    n_epoch = 1000
    n_hidden = [10,5]  # two hidden layers: 10 then 5 units

    # Fixed: removed non-idiomatic trailing semicolons.
    mlp = MultilayerNnRegressor()
    activationFunction = Sigmoid()
    dp = DataPreparation()
    evaluator = RegressionEvaluator()
    splitting = Splitting()

    # Load and prepare the data; unlike the classification variants, the
    # target column is also converted and normalized.
    filename = '../Datasets/winequality-white.csv'
    dataset = dp.load_csv(filename)
    for i in range(len(dataset[0])):
        dp.str_column_to_float(dataset, i)
    # Normalize input variables including the target; keep the target's
    # min/max so predictions can be de-normalized later.
    minmax = dp.dataset_minmax(dataset)
    target_minmax = minmax[-1]
    dp.normalize_dataset_regression(dataset, minmax)
    # Evaluate the algorithm.
    # NOTE(review): target_minmax is passed twice (positions 5 and 10) —
    # looks like a copy/paste slip; confirm against
    # RegressionEvaluator.evaluate_algorithm's signature before removing one.
    scores = evaluator.evaluate_algorithm(dataset, splitting , mlp.back_propagation, n_folds, target_minmax, l_rate, n_epoch, n_hidden, activationFunction, target_minmax)
    print_regression_scores(scores)
class PreprocessingManager:
    """Drives the tweet pipeline: preparation, then cleaning, then transformation."""

    def __init__(self):
        # Pipeline output; stays empty until setDataProcesses() has run.
        self._dataProcessed = []
        self._dataPreparation = DataPreparation()
        self._dataCleaning = DataCleaning()
        self._dataTransforming = DataTransformation()

    def setDataProcesses(self, tweets, pathvectorizer):
        """Push *tweets* through all three stages and cache the final result."""
        self._dataPreparation.setDataPrepared(tweets)
        prepared = self._dataPreparation.getDataPrepared()
        self._dataCleaning.setDataCleaned(prepared)
        cleaned = self._dataCleaning.getDataCleaned()
        self._dataTransforming.setDataTransformed(cleaned, pathvectorizer)
        self._dataProcessed = self._dataTransforming.getDataTransformed()

    def getDataProcesses(self):
        """Return the data produced by the most recent pipeline run."""
        return self._dataProcessed
def classificationPokemon():
    '''
    Test classification on the Pokemon combats dataset.

    Each row describes one combat: the stats and one-hot-encoded types of two
    pokemon (pk1_* and pk2_* column groups: HP, Attack, Defense, SpAtk,
    SpDef, Speed, Generation, Legendary, plus 18 type indicator columns
    each), with the last column `winner` as the class label.
    '''
    # Fixed: the original docstring ended in several kilobytes of trailing
    # whitespace; the column list is summarized instead of dumped verbatim.
    seed(1)

    # Hyper-parameters for the MLP and the evaluation protocol.
    n_folds = 5
    l_rate = 0.1
    n_epoch = 500
    n_hidden = [5]  # one hidden layer with 5 units

    # Fixed: removed non-idiomatic trailing semicolons.
    mlp = MultilayerNnClassifier()
    activationFunction = Sigmoid()
    dp = DataPreparation()
    evaluator = ClassificationEvaluator()
    splitting = Splitting()

    # Load and prepare the data.
    filename = '../Datasets/pkmn.csv'
    dataset = dp.load_csv(filename)
    for i in range(len(dataset[0]) - 1):
        dp.str_column_to_float(dataset, i)
    # Convert the class column (last column, `winner`) to integers.
    dp.str_column_to_int(dataset, len(dataset[0]) - 1)
    # Normalize the input variables.
    minmax = dp.dataset_minmax(dataset)
    dp.normalize_dataset_classification(dataset, minmax)
    # Evaluate the algorithm with k-fold cross validation and print scores.
    scores = evaluator.evaluate_algorithm(dataset, splitting, mlp.back_propagation, n_folds, l_rate, n_epoch, n_hidden, activationFunction)
    print_classification_scores(scores)
Пример #5
0
def KFold_MFSSEL(featureNameDirPath):
    """
    K-fold evaluation of SVM classifiers over three feature views of a video
    dataset (HOG, 81-dim, 30-dim) plus an SVM trained on the concatenated
    features.

    For each fold this trains one RBF-kernel SVC per view on the labeled
    training split, scores each on the test split, accumulates per-class
    accuracies (11 classes) per view across folds, prints the fold
    accuracies, and draws an annotated correlation plot.

    :param featureNameDirPath: root directory containing the per-view
        feature subdirectories (/hogvideoFeature/, /_81videoFeature/,
        /_30videoFeature/)
    """
    # Initialize the data-preparation helper.
    dataPreparation = DataPreparation()
    train_test_tuple_list = dataPreparation.getLabelAndNameTupleList_KFold(featureNameDirPath)
    # Per-class accuracy lists for the HOG view (11 classes).
    # NOTE(review): the "hsv"/"lbp" list names below actually track the
    # 81-dim and 30-dim views respectively (see the appends further down);
    # the names look inherited from an earlier feature set — confirm.
    hog_class_1_accuracy_list = []
    hog_class_2_accuracy_list = []
    hog_class_3_accuracy_list = []
    hog_class_4_accuracy_list = []
    hog_class_5_accuracy_list = []
    hog_class_6_accuracy_list = []
    hog_class_7_accuracy_list = []
    hog_class_8_accuracy_list = []
    hog_class_9_accuracy_list = []
    hog_class_10_accuracy_list = []
    hog_class_11_accuracy_list = []

    # Per-class accuracy lists for the 81-dim view.
    hsv_class_1_accuracy_list = []
    hsv_class_2_accuracy_list = []
    hsv_class_3_accuracy_list = []
    hsv_class_4_accuracy_list = []
    hsv_class_5_accuracy_list = []
    hsv_class_6_accuracy_list = []
    hsv_class_7_accuracy_list = []
    hsv_class_8_accuracy_list = []
    hsv_class_9_accuracy_list = []
    hsv_class_10_accuracy_list = []
    hsv_class_11_accuracy_list = []
    # Per-class accuracy lists for the 30-dim view.
    lbp_class_1_accuracy_list = []
    lbp_class_2_accuracy_list = []
    lbp_class_3_accuracy_list = []
    lbp_class_4_accuracy_list = []
    lbp_class_5_accuracy_list = []
    lbp_class_6_accuracy_list = []
    lbp_class_7_accuracy_list = []
    lbp_class_8_accuracy_list = []
    lbp_class_9_accuracy_list = []
    lbp_class_10_accuracy_list = []
    lbp_class_11_accuracy_list = []
    for train_test_tuple in train_test_tuple_list:
        # Print a banner with the 1-based fold number.
        print("-----------------------------------------------------------------")
        print(train_test_tuple_list.index(train_test_tuple) + 1)
        print("-----------------------------------------------------------------")
        accuracy_max = 0  # unused in this variant — TODO confirm / remove
        # Sub-directory (per view) where the feature files live.
        _81FeatureDir = "/_81videoFeature/"
        _30FeatureDir = "/_30videoFeature/"
        hogFeatureDir = "/hogvideoFeature/"

        # Sizes of the labeled training set (one entry appended per fold).
        label_data_num_list = []
        label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = train_test_tuple

        # Load the HOG-view labeled training set.
        hog_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                                label_name_labeled_train_tuple_list)
        hog_labeled_train_Y, hog_labeled_train_X, hog_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_labeled_train_tuple_list)
        # Load the HOG-view unlabeled training set.
        # NOTE(review): the unlabeled splits are loaded for all three views
        # but never used in this variant of the function — TODO confirm.
        hog_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        hog_unlabeled_Y, hog_unlabeled_X, hog_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_unlabeled_tuple_list)
        # Load the HOG-view test set.
        hog_test_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir, label_name_test_tuple_list)
        hog_test_Y, hog_test_X, hog_test_Name = Utilities.get_Y_X_Name_list_from_tuple(hog_test_tuple_list)

        # Load the 81-dim-view labeled training set.
        _81_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _81_labeled_train_Y, _81_labeled_train_X, _81_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_labeled_train_tuple_list)
        # Load the 81-dim-view unlabeled training set.
        _81_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _81_unlabeled_Y, _81_unlabeled_X, _81_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_unlabeled_tuple_list)
        # Load the 81-dim-view test set.
        _81_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir, label_name_test_tuple_list)
        _81_test_Y, _81_test_X, _81_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_81_test_tuple_list)

        # Load the 30-dim-view labeled training set.
        _30_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _30_labeled_train_Y, _30_labeled_train_X, _30_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_labeled_train_tuple_list)
        # Load the 30-dim-view unlabeled training set.
        _30_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _30_unlabeled_Y, _30_unlabeled_X, _30_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_unlabeled_tuple_list)
        # Load the 30-dim-view test set.
        _30_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir, label_name_test_tuple_list)
        _30_test_Y, _30_test_X, _30_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_30_test_tuple_list)

        # Train the HOG-view SVM (probability=True so predict_proba works).
        hog_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
        hog_svc_1.fit(hog_labeled_train_X, hog_labeled_train_Y)

        # Train the 81-dim-view SVM.
        _81_svc_1 = SVC(C=4, kernel='rbf', gamma=14, probability=True)  # c:2 gamma=6
        _81_svc_1.fit(_81_labeled_train_X, _81_labeled_train_Y)

        # Train the 30-dim-view SVM.
        _30_svc_1 = SVC(C=32, kernel='rbf', gamma=8, probability=True)  # c:32 gamma=12
        _30_svc_1.fit(_30_labeled_train_X, _30_labeled_train_Y)

        # Concatenate the three views feature-wise (axis=1) for the
        # combined "whole" classifier.
        whole_labeled_train_X = hog_labeled_train_X.copy()
        whole_labeled_train_X = np.concatenate([whole_labeled_train_X, _81_labeled_train_X], axis=1)
        whole_labeled_train_X = np.concatenate([whole_labeled_train_X, _30_labeled_train_X], axis=1)

        # Train the combined-feature SVM (labels taken from the HOG view;
        # presumably identical across views — TODO confirm).
        whole_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
        whole_svc_1.fit(whole_labeled_train_X, hog_labeled_train_Y)

        # Build the concatenated test matrix the same way.
        whole_test_X = hog_test_X.copy()
        whole_test_X = np.concatenate([whole_test_X, _81_test_X], axis=1)
        whole_test_X = np.concatenate([whole_test_X, _30_test_X], axis=1)
        label_data_num_list.append(len(hog_labeled_train_Y))
        # Test accuracy of the HOG-view SVM.
        hog_accuracy = hog_svc_1.score(hog_test_X, hog_test_Y)
        # Test accuracy of the 81-dim-view SVM.
        _81_accuracy = _81_svc_1.score(_81_test_X, _81_test_Y)
        # Test accuracy of the 30-dim-view SVM.
        _30_accuracy = _30_svc_1.score(_30_test_X, _30_test_Y)

        # Test accuracy of the combined SVM (scored against the 30-dim
        # view's labels; presumably the same labels as hog_test_Y — verify).
        whole_accuracy = whole_svc_1.score(whole_test_X, _30_test_Y)

        # Per-class probabilities of the HOG SVM on the test set.
        # NOTE(review): these *_probility arrays are computed but never used
        # below in this variant — TODO confirm.
        hog_svc_1_test_probility = hog_svc_1.predict_proba(hog_test_X)
        # Predicted labels of the HOG SVM on the test set.
        hog_svc_1_test_predict_Y = hog_svc_1.predict(hog_test_X)

        # Per-class probabilities of the 81-dim SVM on the test set.
        _81_svc_1_test_probility = _81_svc_1.predict_proba(_81_test_X)
        # Predicted labels of the 81-dim SVM on the test set.
        _81_svc_1_test_predict_Y = _81_svc_1.predict(_81_test_X)

        # Per-class probabilities of the 30-dim SVM on the test set.
        _30_svc_1_test_probility = _30_svc_1.predict_proba(_30_test_X)
        # Predicted labels of the 30-dim SVM on the test set.
        _30_svc_1_test_predict_Y = _30_svc_1.predict(_30_test_X)

        # Per-class accuracies; all three are compared against hog_test_Y
        # (the views presumably share labels — verify against loadData).
        each_class_hog_accuracy_list = Utilities.get_each_class_accuracy(hog_svc_1_test_predict_Y, hog_test_Y)
        each_class_hsv_accuracy_list = Utilities.get_each_class_accuracy(_81_svc_1_test_predict_Y, hog_test_Y)
        each_class_lbp_accuracy_list = Utilities.get_each_class_accuracy(_30_svc_1_test_predict_Y, hog_test_Y)

        # Accumulate this fold's per-class accuracies for the HOG view.
        hog_class_1_accuracy_list.append(each_class_hog_accuracy_list[0])
        hog_class_2_accuracy_list.append(each_class_hog_accuracy_list[1])
        hog_class_3_accuracy_list.append(each_class_hog_accuracy_list[2])
        hog_class_4_accuracy_list.append(each_class_hog_accuracy_list[3])
        hog_class_5_accuracy_list.append(each_class_hog_accuracy_list[4])
        hog_class_6_accuracy_list.append(each_class_hog_accuracy_list[5])
        hog_class_7_accuracy_list.append(each_class_hog_accuracy_list[6])
        hog_class_8_accuracy_list.append(each_class_hog_accuracy_list[7])
        hog_class_9_accuracy_list.append(each_class_hog_accuracy_list[8])
        hog_class_10_accuracy_list.append(each_class_hog_accuracy_list[9])
        hog_class_11_accuracy_list.append(each_class_hog_accuracy_list[10])

        # Accumulate this fold's per-class accuracies for the 81-dim view.
        hsv_class_1_accuracy_list.append(each_class_hsv_accuracy_list[0])
        hsv_class_2_accuracy_list.append(each_class_hsv_accuracy_list[1])
        hsv_class_3_accuracy_list.append(each_class_hsv_accuracy_list[2])
        hsv_class_4_accuracy_list.append(each_class_hsv_accuracy_list[3])
        hsv_class_5_accuracy_list.append(each_class_hsv_accuracy_list[4])
        hsv_class_6_accuracy_list.append(each_class_hsv_accuracy_list[5])
        hsv_class_7_accuracy_list.append(each_class_hsv_accuracy_list[6])
        hsv_class_8_accuracy_list.append(each_class_hsv_accuracy_list[7])
        hsv_class_9_accuracy_list.append(each_class_hsv_accuracy_list[8])
        hsv_class_10_accuracy_list.append(each_class_hsv_accuracy_list[9])
        hsv_class_11_accuracy_list.append(each_class_hsv_accuracy_list[10])

        # Accumulate this fold's per-class accuracies for the 30-dim view.
        lbp_class_1_accuracy_list.append(each_class_lbp_accuracy_list[0])
        lbp_class_2_accuracy_list.append(each_class_lbp_accuracy_list[1])
        lbp_class_3_accuracy_list.append(each_class_lbp_accuracy_list[2])
        lbp_class_4_accuracy_list.append(each_class_lbp_accuracy_list[3])
        lbp_class_5_accuracy_list.append(each_class_lbp_accuracy_list[4])
        lbp_class_6_accuracy_list.append(each_class_lbp_accuracy_list[5])
        lbp_class_7_accuracy_list.append(each_class_lbp_accuracy_list[6])
        lbp_class_8_accuracy_list.append(each_class_lbp_accuracy_list[7])
        lbp_class_9_accuracy_list.append(each_class_lbp_accuracy_list[8])
        lbp_class_10_accuracy_list.append(each_class_lbp_accuracy_list[9])
        lbp_class_11_accuracy_list.append(each_class_lbp_accuracy_list[10])

        # max_each_class_hog_accuracy_list = max(each_class_hog_accuracy_list)
        # min_each_class_hog_accuracy_list = min(each_class_hog_accuracy_list)
        # max_each_class_hsv_accuracy_list = max(each_class_hsv_accuracy_list)
        # min_each_class_hsv_accuracy_list = min(each_class_hsv_accuracy_list)
        # max_each_class_lbp_accuracy_list = max(each_class_lbp_accuracy_list)
        # min_each_class_lbp_accuracy_list = min(each_class_lbp_accuracy_list)

        # Report this fold's accuracies as percentages.
        print("SVM-hog准确率:")
        print(hog_accuracy * 100)
        print("SVM-hsv准确率:")
        print(_81_accuracy * 100)
        print("SVM-lbp准确率:")
        print(_30_accuracy * 100)
        print("SVM-whole准确率:")
        print(whole_accuracy * 100)
        print("有标签数据集大小:")
        print(label_data_num_list)
        # NOTE(review): the plotting code below references `axes` and `df`,
        # which are not defined anywhere in this function or visible at
        # module level in this file — as written it will raise NameError.
        # It looks pasted in from a scatter-matrix script; confirm intent.
        [plt.setp(item.yaxis.get_label(), 'size', 40) for item in axes.ravel()]
        [plt.setp(item.xaxis.get_label(), 'size', 40) for item in axes.ravel()]

        # NOTE(review): DataFrame.as_matrix() was removed in pandas 1.0;
        # modern pandas needs .to_numpy() (or .values) — confirm the pinned
        # pandas version before running.
        corr = df.corr().as_matrix()
        for i, j in zip(*plt.np.triu_indices_from(axes, k=1)):
            axes[i, j].annotate("%.3f" %corr[i,j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center', size=45)

        plt.show()
    
    
    
if __name__ == '__main__':

    # Data preparation: load the logs and rules for dataset '3222' and
    # derive indicator indexes and rule ids from the rules.
    file = '3222'
    data_preparation = DataPreparation(file)
    logs = data_preparation.logs_preparation()
    rules = data_preparation.rules_preparation()
    indicators = data_preparation.indexes_preparation(rules)
    # NOTE(review): rules_ids is never used in the visible code — confirm.
    rules_ids = data_preparation.rules_to_ids(rules)

    # Frequent patterns: run the SPAM sequential-pattern miner over the
    # logs with max gap 1, pattern length 2..3 and min support 0.5.
    spam = SPAM(file)
    spam.set_max_gap(1)
    spam.set_min_pattern_length(2)
    spam.set_max_pattern_length(3)
    spam.spam_algorithm(logs, 0.5)

    # Unusual patterns.
    # NOTE(review): the script appears truncated here — `ind` is computed
    # but never used in the visible lines; confirm against the full source.
    ind = str(indicators.loc[0][0])
    logs = logs.reset_index()
Пример #7
0
    w = csv.writer(f)

    for row in DS.dataset_names:

        w.writerow([row])

# Read in the data set names so they can be used to
# label the chip stacks.
# NOTE(review): the file handle from open() is never closed; a
# `with open(...) as f:` block would be safer — confirm before changing.
dataset_names = []
for row in open('dataset_names.csv'):

    dataset_names.append(row.rstrip('\n'))

# Example usage of DataLabel class to create and label
# the individual chip stacks.
# NOTE(review): path_to_directory_for_storage and
# name_of_hdf5_file_for_storage must be defined earlier in the full script;
# they are not visible in this chunk.
DL = DataLabel(path_to_directory_for_storage, name_of_hdf5_file_for_storage,
               dataset_names, 50, 50)
# Plot a single chip from the first chip stack.
plt.imshow(DL.images[0][0], cmap=plt.get_cmap('gray'))
plt.colorbar()
plt.show()

# Example usage of the DataPreparation class to convert
# the labeled chip stacks to .tfrecords.
# Split sizes: 80 training / 5 validation / 0 testing chip stacks.
number_training = 80
number_validation = 5
number_testing = 0
DP = DataPreparation('astrodetection', DL.images, DL.labels, number_training,
                     number_validation, number_testing,
                     path_to_directory_for_storage)
 def __init__(self):
     """Create the three pipeline-stage objects; processed data starts empty."""
     # Output of the processing pipeline; filled in by a later setter.
     self._dataProcessed = list()
     self._dataPreparation = DataPreparation()
     self._dataCleaning = DataCleaning()
     self._dataTransforming = DataTransformation()
Пример #9
0
def KFold_MFSSEL(featureNameDirPath, savePath=None, trainAndTestFlag="train"):

    # 初始化dataPreparation对象
    dataPreparation = DataPreparation()
    train_test_tuple_list = dataPreparation.getLabelAndNameTupleList_KFold(featureNameDirPath)
    # 每个类别的初始准确率
    first_class_1_accuracy_list = []
    first_class_2_accuracy_list = []
    first_class_3_accuracy_list = []
    first_class_4_accuracy_list = []
    first_class_5_accuracy_list = []
    first_class_6_accuracy_list = []
    first_class_7_accuracy_list = []
    first_class_8_accuracy_list = []
    first_class_9_accuracy_list = []
    first_class_10_accuracy_list = []
    first_class_11_accuracy_list = []
    # 每个类别的最终准确率
    class_1_accuracy_list = []
    class_2_accuracy_list = []
    class_3_accuracy_list = []
    class_4_accuracy_list = []
    class_5_accuracy_list = []
    class_6_accuracy_list = []
    class_7_accuracy_list = []
    class_8_accuracy_list = []
    class_9_accuracy_list = []
    class_10_accuracy_list = []
    class_11_accuracy_list = []
    # 整体初始准确率
    whole_first_accuracy_list = []
    # 整体准确率
    whole_accuracy_list = []
    for train_test_tuple in train_test_tuple_list:
        print("-----------------------------------------------------------------")
        print(train_test_tuple_list.index(train_test_tuple) + 1)
        print("-----------------------------------------------------------------")
        model_max_hog = None
        model_max_81 = None
        model_max_30 = None
        accuracy_max = 0
        _81FeatureDir = "/_81videoFeature/"
        _30FeatureDir = "/_30videoFeature/"
        hogFeatureDir = "/hogvideoFeature/"

        # 准确率
        accuracy_list = []
        # 整体类别
        whole_class = 11
        # 选取置信度前topk个样本
        topk = 5
        # 迭代次数
        loop_num = 100
        num = 0.7
        para = 0.7
        # 有标签数据集大小list
        label_data_num_list = []
        label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = train_test_tuple

        # 获取hog维有标签训练集
        hog_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                                label_name_labeled_train_tuple_list)
        hog_labeled_train_Y, hog_labeled_train_X, hog_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_labeled_train_tuple_list)
        # 获取hog维无标签训练集
        hog_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        hog_unlabeled_Y, hog_unlabeled_X, hog_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_unlabeled_tuple_list)
        # 获取hog维测试集
        hog_test_tuple_list = dataPreparation.loadData(featureNameDirPath, hogFeatureDir, label_name_test_tuple_list)
        hog_test_Y, hog_test_X, hog_test_Name = Utilities.get_Y_X_Name_list_from_tuple(hog_test_tuple_list)

        # 获取81维有标签训练集
        _81_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _81_labeled_train_Y, _81_labeled_train_X, _81_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_labeled_train_tuple_list)
        # 获取81维无标签训练集
        _81_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _81_unlabeled_Y, _81_unlabeled_X, _81_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_unlabeled_tuple_list)
        # 获取81维测试集
        _81_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _81FeatureDir, label_name_test_tuple_list)
        _81_test_Y, _81_test_X, _81_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_81_test_tuple_list)

        # 获取30维有标签训练集
        _30_labeled_train_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                                label_name_labeled_train_tuple_list)
        _30_labeled_train_Y, _30_labeled_train_X, _30_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_labeled_train_tuple_list)
        # 获取30维无标签训练集
        _30_unlabeled_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir,
                                                            label_name_unlabeled_train_tuple_list)
        _30_unlabeled_Y, _30_unlabeled_X, _30_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_unlabeled_tuple_list)
        # 获取30维测试集
        _30_test_tuple_list = dataPreparation.loadData(featureNameDirPath, _30FeatureDir, label_name_test_tuple_list)
        _30_test_Y, _30_test_X, _30_test_Name = Utilities.get_Y_X_Name_list_from_tuple(_30_test_tuple_list)

        for h in range(1, loop_num + 1):
            label_data_num_list.append(len(hog_labeled_train_Y))
            # hog维svm训练
            hog_svc_1 = SVC(C=4, kernel='rbf', gamma=2, probability=True)  # c:4 gamma=2
            hog_svc_1.fit(hog_labeled_train_X, hog_labeled_train_Y)

            # 81维svm训练
            _81_svc_1 = SVC(C=4, kernel='rbf', gamma=14, probability=True)  # c:2 gamma=6
            _81_svc_1.fit(_81_labeled_train_X, _81_labeled_train_Y)

            # 30维svm训练
            _30_svc_1 = SVC(C=32, kernel='rbf', gamma=8, probability=True)  # c:32 gamma=12
            _30_svc_1.fit(_30_labeled_train_X, _30_labeled_train_Y)

            # 获得准确率
            hog_accuracy = hog_svc_1.score(hog_test_X, hog_test_Y)
            # 获得准确率
            _81_accuracy = _81_svc_1.score(_81_test_X, _81_test_Y)
            # 获得准确率
            _30_accuracy = _30_svc_1.score(_30_test_X, _30_test_Y)

            if h == 1:
                if hog_svc_1 is not None:
                    print("正在保存first_hog.model...")
                    joblib.dump(hog_svc_1, savePath + "first_hog.model")
                    print("保存first_hog.model完毕。")
                if _81_svc_1 is not None:
                    print("正在保存first_81.model...")
                    joblib.dump(_81_svc_1, savePath + "first_81.model")
                    print("保存first_81.model完毕。")
                if _30_svc_1 is not None:
                    print("正在保存first_30.model...")
                    joblib.dump(_30_svc_1, savePath + "first_30.model")
                    print("保存first_30.model完毕。")

            # hog_accuracy,_81_accuracy,_30_accuracy中最大的accuracy
            accuracy_max_from_single = max(hog_accuracy, _81_accuracy, _30_accuracy)
            accuracy_list.append(accuracy_max_from_single * 100)
            # hog_accuracy_list.append(hog_accuracy * 100)
            # _81_accuracy_list.append(_81_accuracy * 100)
            # _30_accuracy_list.append(_30_accuracy * 100)

            if accuracy_max_from_single > accuracy_max:
                accuracy_max = accuracy_max_from_single
                model_max_hog = hog_svc_1
                model_max_81 = _81_svc_1
                model_max_30 = _30_svc_1

            if len(hog_unlabeled_X) == 0 or len(_81_unlabeled_X) == 0 or len(_30_unlabeled_X) == 0:
                break
            if h == loop_num:
                break
            # print(accuracy_max * 100)
            # hog特征下的测试数据集的所对应的各个类别的概率
            hog_svc_1_probility = hog_svc_1.predict_proba(hog_unlabeled_X)
            # hog特征下测试数据集的预测标签
            hog_svc_1_predict_Y = hog_svc_1.predict(hog_unlabeled_X)

            # 81维特征下的测试数据集的所对应的各个类别的概率
            _81_svc_1_probility = _81_svc_1.predict_proba(_81_unlabeled_X)
            # 81维特征下测试数据集的预测标签
            _81_svc_1_predict_Y = _81_svc_1.predict(_81_unlabeled_X)

            # 30维特征下的测试数据集的所对应的各个类别的概率
            _30_svc_1_probility = _30_svc_1.predict_proba(_30_unlabeled_X)
            # 30维特征下测试数据集的预测标签
            _30_svc_1_predict_Y = _30_svc_1.predict(_30_unlabeled_X)

            probility_list_1 = [hog_svc_1_probility, _81_svc_1_probility, _30_svc_1_probility]
            unlabeled_Y_list_1 = [hog_unlabeled_Y, _81_unlabeled_Y, _30_unlabeled_Y]
            predict_Y_list_1 = [hog_svc_1_predict_Y, _81_svc_1_predict_Y, _30_svc_1_predict_Y]

            # voted_index_predict_Y_list = Utilities.vote(predict_Y_list_1, unlabeled_Y_list_1, whole_class, topk)
            #
            # voted_Y_list, voted_index_list = Utilities.get_voted_confidence(probility_list_1,
            #                                                                 voted_index_predict_Y_list[0],
            #                                                                 voted_index_predict_Y_list[1], whole_class,
            #                                                                 topk)

            selected_ind_list, selected_pesudo_label_list = Utilities.get_pesudo_label(probility_list_1,
                                                                                       predict_Y_list_1,
                                                                                       unlabeled_Y_list_1,
                                                                                       whole_class,
                                                                                       topk, num, para)
            # a = []
            # for i in selected_ind_list:
            #     a.append(_30_unlabeled_Y[i])
            # print(selected_pesudo_label_list)
            # print(a)

            for i in range(len(selected_ind_list)):
                hog_labeled_train_X.append(hog_unlabeled_X[selected_ind_list[i]])
                hog_labeled_train_Y.append(selected_pesudo_label_list[i])
                _81_labeled_train_X.append(_81_unlabeled_X[selected_ind_list[i]])
                _81_labeled_train_Y.append(selected_pesudo_label_list[i])
                _30_labeled_train_X.append(_30_unlabeled_X[selected_ind_list[i]])
                _30_labeled_train_Y.append(selected_pesudo_label_list[i])

            hog_unlabeled_X = [i for j, i in enumerate(hog_unlabeled_X) if j not in selected_ind_list]
            hog_unlabeled_Y = [i for j, i in enumerate(hog_unlabeled_Y) if j not in selected_ind_list]
            _81_unlabeled_X = [i for j, i in enumerate(_81_unlabeled_X) if j not in selected_ind_list]
            _81_unlabeled_Y = [i for j, i in enumerate(_81_unlabeled_Y) if j not in selected_ind_list]
            _30_unlabeled_X = [i for j, i in enumerate(_30_unlabeled_X) if j not in selected_ind_list]
            _30_unlabeled_Y = [i for j, i in enumerate(_30_unlabeled_Y) if j not in selected_ind_list]

        # print(accuracy_max * 100)
        # print("有标签数据集大小:")
        # print(label_data_num_list)
        # print("accuracy_list:")
        # print(accuracy_list)

        if model_max_hog is not None:
            print("正在保存hog.model...")
            joblib.dump(model_max_hog, savePath + "hog.model")
            print("保存hog.model完毕。")
        if model_max_81 is not None:
            print("正在保存81.model...")
            joblib.dump(model_max_81, savePath + "81.model")
            print("保存81.model完毕。")
        if model_max_30 is not None:
            print("正在保存30.model...")
            joblib.dump(model_max_30, savePath + "30.model")
            print("保存30.model完毕。")
        if os.path.exists(savePath):

            # 加载model文件
            first_hogModel = joblib.load(savePath + "/first_hog.model")
            first_81Model = joblib.load(savePath + "/first_81.model")
            first_30Model = joblib.load(savePath + "/first_30.model")

            # hog特征下的无标签数据集的所对应的各个类别的概率
            first_hog_svc_test_probility = first_hogModel.predict_proba(hog_test_X)
            # hog特征下无标签数据的预测标签
            first_hog_svc_test_predict_Y = first_hogModel.predict(hog_test_X)

            # 81维特征下的无标签数据集的所对应的各个类别的概率
            first_81_svc_test_probility = first_81Model.predict_proba(_81_test_X)
            # 81维特征下无标签数据的预测标签
            first_81_svc_test_predict_Y = first_81Model.predict(_81_test_X)

            # 30维特征下的无标签数据集的所对应的各个类别的概率
            first_30_svc_test_probility = first_30Model.predict_proba(_30_test_X)
            # 30维特征下无标签数据的预测标签
            first_30_svc_test_predict_Y = first_30Model.predict(_30_test_X)

            first_probility_list = first_hog_svc_test_probility, first_81_svc_test_probility, first_30_svc_test_probility
            first_predict_Y_list = first_hog_svc_test_predict_Y, first_81_svc_test_predict_Y, first_30_svc_test_predict_Y
            first_mfssel_predict_Y_list = Utilities.predict_Y_test(first_probility_list, first_predict_Y_list,
                                                                   whole_class, para)
            first_each_class_accuracy_list = Utilities.get_each_class_accuracy(first_mfssel_predict_Y_list, hog_test_Y)
            first_class_1_accuracy_list.append(first_each_class_accuracy_list[0])
            first_class_2_accuracy_list.append(first_each_class_accuracy_list[1])
            first_class_3_accuracy_list.append(first_each_class_accuracy_list[2])
            first_class_4_accuracy_list.append(first_each_class_accuracy_list[3])
            first_class_5_accuracy_list.append(first_each_class_accuracy_list[4])
            first_class_6_accuracy_list.append(first_each_class_accuracy_list[5])
            first_class_7_accuracy_list.append(first_each_class_accuracy_list[6])
            first_class_8_accuracy_list.append(first_each_class_accuracy_list[7])
            first_class_9_accuracy_list.append(first_each_class_accuracy_list[8])
            first_class_10_accuracy_list.append(first_each_class_accuracy_list[9])
            first_class_11_accuracy_list.append(first_each_class_accuracy_list[10])
            if len(first_mfssel_predict_Y_list) == len(hog_test_Y):
                rightNum = 0
                for i in range(len(first_mfssel_predict_Y_list)):
                    if first_mfssel_predict_Y_list[i] == hog_test_Y[i]:
                        rightNum = rightNum + 1
                whole_first_accuracy_list.append(rightNum / len(hog_test_Y))
                print("整体初始准确率:")
                print(rightNum / len(hog_test_Y))
            else:
                print("出错!!")
            print("每一类的初始准确率:")
            print(first_each_class_accuracy_list)
            print("初始平均准确率:")
            print(sum(first_each_class_accuracy_list) / len(first_each_class_accuracy_list))
            # 加载model文件
            hogModel = joblib.load(savePath + "/hog.model")
            _81Model = joblib.load(savePath + "/81.model")
            _30Model = joblib.load(savePath + "/30.model")
            # hog特征下的无标签数据集的所对应的各个类别的概率
            hog_svc_test_probility = hogModel.predict_proba(hog_test_X)
            # hog特征下无标签数据的预测标签
            hog_svc_test_predict_Y = hogModel.predict(hog_test_X)

            # 81维特征下的无标签数据集的所对应的各个类别的概率
            _81_svc_test_probility = _81Model.predict_proba(_81_test_X)
            # 81维特征下无标签数据的预测标签
            _81_svc_test_predict_Y = _81Model.predict(_81_test_X)

            # 30维特征下的无标签数据集的所对应的各个类别的概率
            _30_svc_test_probility = _30Model.predict_proba(_30_test_X)
            # 30维特征下无标签数据的预测标签
            _30_svc_test_predict_Y = _30Model.predict(_30_test_X)

            probility_list = hog_svc_test_probility, _81_svc_test_probility, _30_svc_test_probility
            predict_Y_list = hog_svc_test_predict_Y, _81_svc_test_predict_Y, _30_svc_test_predict_Y
            mfssel_predict_Y_list = Utilities.predict_Y_test(probility_list, predict_Y_list, whole_class, para)
            each_class_accuracy_list = Utilities.get_each_class_accuracy(mfssel_predict_Y_list, hog_test_Y)
            class_1_accuracy_list.append(each_class_accuracy_list[0])
            class_2_accuracy_list.append(each_class_accuracy_list[1])
            class_3_accuracy_list.append(each_class_accuracy_list[2])
            class_4_accuracy_list.append(each_class_accuracy_list[3])
            class_5_accuracy_list.append(each_class_accuracy_list[4])
            class_6_accuracy_list.append(each_class_accuracy_list[5])
            class_7_accuracy_list.append(each_class_accuracy_list[6])
            class_8_accuracy_list.append(each_class_accuracy_list[7])
            class_9_accuracy_list.append(each_class_accuracy_list[8])
            class_10_accuracy_list.append(each_class_accuracy_list[9])
            class_11_accuracy_list.append(each_class_accuracy_list[10])
            print("每一类的最终准确率:")
            print(each_class_accuracy_list)
            print("平均准确率:")
            print(sum(each_class_accuracy_list) / len(each_class_accuracy_list))
            if len(mfssel_predict_Y_list) == len(hog_test_Y):
                rightNum = 0
                for i in range(len(mfssel_predict_Y_list)):
                    if mfssel_predict_Y_list[i] == hog_test_Y[i]:
                        rightNum = rightNum + 1
                whole_accuracy_list.append(rightNum / len(hog_test_Y))
                print("整体准确率:")
                print(rightNum / len(hog_test_Y))
            else:
                print("出错!!")
    print("---------------------------------------------------")
    avg_first_class_1_accuracy = sum(first_class_1_accuracy_list) / len(first_class_1_accuracy_list)
    avg_first_class_2_accuracy = sum(first_class_2_accuracy_list) / len(first_class_2_accuracy_list)
    avg_first_class_3_accuracy = sum(first_class_3_accuracy_list) / len(first_class_3_accuracy_list)
    avg_first_class_4_accuracy = sum(first_class_4_accuracy_list) / len(first_class_4_accuracy_list)
    avg_first_class_5_accuracy = sum(first_class_5_accuracy_list) / len(first_class_5_accuracy_list)
    avg_first_class_6_accuracy = sum(first_class_6_accuracy_list) / len(first_class_6_accuracy_list)
    avg_first_class_7_accuracy = sum(first_class_7_accuracy_list) / len(first_class_7_accuracy_list)
    avg_first_class_8_accuracy = sum(first_class_8_accuracy_list) / len(first_class_8_accuracy_list)
    avg_first_class_9_accuracy = sum(first_class_9_accuracy_list) / len(first_class_9_accuracy_list)
    avg_first_class_10_accuracy = sum(first_class_10_accuracy_list) / len(first_class_10_accuracy_list)
    avg_first_class_11_accuracy = sum(first_class_11_accuracy_list) / len(first_class_11_accuracy_list)
    print("shooting初始平均准确率:")
    print(avg_first_class_1_accuracy)
    print("biking初始平均准确率:")
    print(avg_first_class_2_accuracy)
    print("diving初始平均准确率:")
    print(avg_first_class_3_accuracy)
    print("golf初始平均准确率:")
    print(avg_first_class_4_accuracy)
    print("riding初始平均准确率:")
    print(avg_first_class_5_accuracy)
    print("juggle初始平均准确率:")
    print(avg_first_class_6_accuracy)
    print("swing初始平均准确率:")
    print(avg_first_class_7_accuracy)
    print("tennis初始平均准确率:")
    print(avg_first_class_8_accuracy)
    print("jumping初始平均准确率:")
    print(avg_first_class_9_accuracy)
    print("spiking初始平均准确率:")
    print(avg_first_class_10_accuracy)
    print("walk初始平均准确率:")
    print(avg_first_class_11_accuracy)
    print("---------------------------------------------------")
    avg_class_1_accuracy = sum(class_1_accuracy_list) / len(class_1_accuracy_list)
    avg_class_2_accuracy = sum(class_2_accuracy_list) / len(class_2_accuracy_list)
    avg_class_3_accuracy = sum(class_3_accuracy_list) / len(class_3_accuracy_list)
    avg_class_4_accuracy = sum(class_4_accuracy_list) / len(class_4_accuracy_list)
    avg_class_5_accuracy = sum(class_5_accuracy_list) / len(class_5_accuracy_list)
    avg_class_6_accuracy = sum(class_6_accuracy_list) / len(class_6_accuracy_list)
    avg_class_7_accuracy = sum(class_7_accuracy_list) / len(class_7_accuracy_list)
    avg_class_8_accuracy = sum(class_8_accuracy_list) / len(class_8_accuracy_list)
    avg_class_9_accuracy = sum(class_9_accuracy_list) / len(class_9_accuracy_list)
    avg_class_10_accuracy = sum(class_10_accuracy_list) / len(class_10_accuracy_list)
    avg_class_11_accuracy = sum(class_11_accuracy_list) / len(class_11_accuracy_list)
    print("shooting最终平均准确率:")
    print(avg_class_1_accuracy)
    print("biking最终平均准确率:")
    print(avg_class_2_accuracy)
    print("diving最终平均准确率:")
    print(avg_class_3_accuracy)
    print("golf最终平均准确率:")
    print(avg_class_4_accuracy)
    print("riding最终平均准确率:")
    print(avg_class_5_accuracy)
    print("juggle最终平均准确率:")
    print(avg_class_6_accuracy)
    print("swing最终平均准确率:")
    print(avg_class_7_accuracy)
    print("tennis最终平均准确率:")
    print(avg_class_8_accuracy)
    print("jumping最终平均准确率:")
    print(avg_class_9_accuracy)
    print("spiking最终平均准确率:")
    print(avg_class_10_accuracy)
    print("walk最终平均准确率:")
    print(avg_class_11_accuracy)
    print("---------------------------------------------------")
    print("整体初始平均准确率:")
    a = [avg_first_class_1_accuracy, avg_first_class_2_accuracy, avg_first_class_3_accuracy, avg_first_class_4_accuracy,
     avg_first_class_5_accuracy, avg_first_class_6_accuracy, avg_first_class_7_accuracy, avg_first_class_8_accuracy,
     avg_first_class_9_accuracy, avg_first_class_10_accuracy, avg_first_class_11_accuracy]
    print(sum(a)/len(a))
    print(sum(whole_first_accuracy_list) / len(whole_first_accuracy_list))
    print("---------------------------------------------------")
    print("整体平均准确率:")
    b = [avg_class_1_accuracy, avg_class_2_accuracy, avg_class_3_accuracy, avg_class_4_accuracy,
         avg_class_5_accuracy, avg_class_6_accuracy, avg_class_7_accuracy, avg_class_8_accuracy,
         avg_class_9_accuracy, avg_class_10_accuracy, avg_class_11_accuracy]
    print(sum(b) / len(b))
    print(sum(whole_accuracy_list) / len(whole_accuracy_list))
Пример #10
0
def MFSSEL(featureNameDirPath, savePath=None, trainAndTestFlag="train"):
    """Multi-Feature Semi-Supervised Ensemble Learning (MFSSEL) driver.

    Trains three per-view SVMs (HOG features, 81-dim features, 30-dim
    features) in a self-training loop: each round the three classifiers
    score the unlabeled pool, the most confident pseudo-labeled samples
    are moved into the labeled pool for all three views, and the model
    triple from the best-scoring round is checkpointed under *savePath*.

    Parameters:
        featureNameDirPath: path passed to
            DataPreparation.getLabelAndNameTupleList to enumerate
            (label, sample-name) tuples for the three splits —
            presumably a directory of feature-name files; TODO confirm.
        savePath: directory/prefix used when dumping or loading the
            '*.model' files (string-concatenated, so it should end with
            a separator — NOTE(review): verify trailing '/').
        trainAndTestFlag: "train" runs the self-training loop and saves
            the best models; "test" only reloads previously saved models.

    NOTE(review): in the "test" branch the loaded models are bound to
    locals but the visible code does nothing further with them and the
    function returns None — confirm the function body is complete.
    """
    # Best model triple / best single-view accuracy seen so far.
    model_max_hog = None
    model_max_81 = None
    model_max_30 = None
    accuracy_max = 0
    accuracy_max_from_single = 0
    # On-disk feature layout: one root, then per-split/per-view subdirs.
    featureDir = "/home/sunbite/MFSSEL/features/"
    featureDir_KFold = "/home/sunbite/MFSSEL/features_new_2/"  # unused in this function
    train_81FeatureDir = "/train/_81videoFeature/"
    train_30FeatureDir = "/train/_30videoFeature/"
    train_hogFeatureDir = "/train/hogvideoFeature/"
    test_81FeatureDir = "/test/_81videoFeature/"
    test_30FeatureDir = "/test/_30videoFeature/"
    test_hogFeatureDir = "/test/hogvideoFeature/"

    # Per-class accuracy history: one list per class, appended every round.
    class_1_accuracy_list = []
    class_2_accuracy_list = []
    class_3_accuracy_list = []
    class_4_accuracy_list = []
    class_5_accuracy_list = []
    class_6_accuracy_list = []
    class_7_accuracy_list = []
    class_8_accuracy_list = []
    class_9_accuracy_list = []
    class_10_accuracy_list = []
    class_11_accuracy_list = []

    # Best-single-view accuracy per round (stored as percentages).
    accuracy_list = []
    # Total number of classes.
    whole_class = 11
    # Take the top-k most confident samples per selection step.
    topk = 5
    # Maximum number of self-training iterations.
    loop_num = 100
    # Labeled-set size recorded at the start of each round.
    label_data_num_list = []

    # Thresholds forwarded to Utilities.get_pesudo_label — semantics are
    # project-defined; presumably confidence cut-offs. TODO confirm.
    num = 0.8
    para = 0.9
    # ---- training branch ----
    if trainAndTestFlag == "train":
        # Initialize the data-preparation helper.
        dataPreparation = DataPreparation()
        # label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = dataPreparation.getLabelAndNameTupleList_KFold(
        #     featureDir_KFold)
        # Get (label, name) tuple lists for the labeled-train, unlabeled-train
        # and test splits.
        label_name_labeled_train_tuple_list, label_name_unlabeled_train_tuple_list, label_name_test_tuple_list = dataPreparation.getLabelAndNameTupleList(
            featureNameDirPath)

        # # Bootstrap-resampled labeled training (label, name) tuple lists:
        # bootstrapped_Labeled_train_tuple_list_1 = dataPreparation.getBootstrapSample(label_name_labeled_train_tuple_list, 1)
        # bootstrapped_Labeled_train_tuple_list_2 = dataPreparation.getBootstrapSample(label_name_labeled_train_tuple_list,
        #                                                                              40)
        # Load the HOG-feature labeled training set.
        hog_labeled_train_tuple_list = dataPreparation.loadData(
            featureDir, train_hogFeatureDir,
            label_name_labeled_train_tuple_list)
        hog_labeled_train_Y, hog_labeled_train_X, hog_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_labeled_train_tuple_list)
        # Load the HOG-feature unlabeled training set.
        hog_unlabeled_tuple_list = dataPreparation.loadData(
            featureDir, train_hogFeatureDir,
            label_name_unlabeled_train_tuple_list)
        hog_unlabeled_Y, hog_unlabeled_X, hog_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_unlabeled_tuple_list)
        # Load the HOG-feature test set.
        hog_test_tuple_list = dataPreparation.loadData(
            featureDir, test_hogFeatureDir, label_name_test_tuple_list)
        hog_test_Y, hog_test_X, hog_test_Name = Utilities.get_Y_X_Name_list_from_tuple(
            hog_test_tuple_list)

        # Load the 81-dim-feature labeled training set.
        _81_labeled_train_tuple_list = dataPreparation.loadData(
            featureDir, train_81FeatureDir,
            label_name_labeled_train_tuple_list)
        _81_labeled_train_Y, _81_labeled_train_X, _81_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_labeled_train_tuple_list)
        # Load the 81-dim-feature unlabeled training set.
        _81_unlabeled_tuple_list = dataPreparation.loadData(
            featureDir, train_81FeatureDir,
            label_name_unlabeled_train_tuple_list)
        _81_unlabeled_Y, _81_unlabeled_X, _81_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_unlabeled_tuple_list)
        # Load the 81-dim-feature test set.
        _81_test_tuple_list = dataPreparation.loadData(
            featureDir, test_81FeatureDir, label_name_test_tuple_list)
        _81_test_Y, _81_test_X, _81_test_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _81_test_tuple_list)

        # Load the 30-dim-feature labeled training set.
        _30_labeled_train_tuple_list = dataPreparation.loadData(
            featureDir, train_30FeatureDir,
            label_name_labeled_train_tuple_list)
        _30_labeled_train_Y, _30_labeled_train_X, _30_labeled_train_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_labeled_train_tuple_list)
        # Load the 30-dim-feature unlabeled training set.
        _30_unlabeled_tuple_list = dataPreparation.loadData(
            featureDir, train_30FeatureDir,
            label_name_unlabeled_train_tuple_list)
        _30_unlabeled_Y, _30_unlabeled_X, _30_unlabeled_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_unlabeled_tuple_list)
        # Load the 30-dim-feature test set.
        _30_test_tuple_list = dataPreparation.loadData(
            featureDir, test_30FeatureDir, label_name_test_tuple_list)
        _30_test_Y, _30_test_X, _30_test_Name = Utilities.get_Y_X_Name_list_from_tuple(
            _30_test_tuple_list)

        # Self-training loop: retrain, evaluate, select pseudo-labels, repeat.
        for h in range(1, loop_num + 1):
            print("有标签数据集大小:")
            label_data_num_list.append(len(hog_labeled_train_Y))
            print(label_data_num_list)
            # Train the SVM on HOG features.
            hog_svc_1 = SVC(C=4, kernel='rbf', gamma=2,
                            probability=True)  # c:4 gamma=2
            hog_svc_1.fit(hog_labeled_train_X, hog_labeled_train_Y)

            # Train the SVM on 81-dim features.
            _81_svc_1 = SVC(C=4, kernel='rbf', gamma=14,
                            probability=True)  # c:2 gamma=6
            _81_svc_1.fit(_81_labeled_train_X, _81_labeled_train_Y)

            # Train the SVM on 30-dim features.
            _30_svc_1 = SVC(C=32, kernel='rbf', gamma=8,
                            probability=True)  # c:32 gamma=12
            _30_svc_1.fit(_30_labeled_train_X, _30_labeled_train_Y)

            # Test-set accuracy of the HOG-view model.
            hog_accuracy = hog_svc_1.score(hog_test_X, hog_test_Y)
            # Test-set accuracy of the 81-dim-view model.
            _81_accuracy = _81_svc_1.score(_81_test_X, _81_test_Y)
            # Test-set accuracy of the 30-dim-view model.
            _30_accuracy = _30_svc_1.score(_30_test_X, _30_test_Y)

            # Per-class probabilities on the test set under the HOG view.
            hog_svc_1_test_probility = hog_svc_1.predict_proba(hog_test_X)
            # Predicted test labels under the HOG view.
            hog_svc_1_test_predict_Y = hog_svc_1.predict(hog_test_X)

            # Per-class probabilities on the test set under the 81-dim view.
            _81_svc_1_test_probility = _81_svc_1.predict_proba(_81_test_X)
            # Predicted test labels under the 81-dim view.
            _81_svc_1_test_predict_Y = _81_svc_1.predict(_81_test_X)

            # Per-class probabilities on the test set under the 30-dim view.
            _30_svc_1_test_probility = _30_svc_1.predict_proba(_30_test_X)
            # Predicted test labels under the 30-dim view.
            _30_svc_1_test_predict_Y = _30_svc_1.predict(_30_test_X)

            # Fuse the three views into ensemble test predictions.
            mfssel_predict_Y_list_ = Utilities.predict_Y(
                hog_svc_1_test_probility, _81_svc_1_test_probility,
                _30_svc_1_test_probility, hog_svc_1_test_predict_Y,
                _81_svc_1_test_predict_Y, _30_svc_1_test_predict_Y)
            each_class_accuracy_list = Utilities.get_each_class_accuracy(
                mfssel_predict_Y_list_, hog_test_Y)
            class_1_accuracy_list.append(each_class_accuracy_list[0])
            class_2_accuracy_list.append(each_class_accuracy_list[1])
            class_3_accuracy_list.append(each_class_accuracy_list[2])
            class_4_accuracy_list.append(each_class_accuracy_list[3])
            class_5_accuracy_list.append(each_class_accuracy_list[4])
            class_6_accuracy_list.append(each_class_accuracy_list[5])
            class_7_accuracy_list.append(each_class_accuracy_list[6])
            class_8_accuracy_list.append(each_class_accuracy_list[7])
            class_9_accuracy_list.append(each_class_accuracy_list[8])
            class_10_accuracy_list.append(each_class_accuracy_list[9])
            class_11_accuracy_list.append(each_class_accuracy_list[10])
            # Best accuracy among the three single-view models this round.
            accuracy_max_from_single = max(hog_accuracy, _81_accuracy,
                                           _30_accuracy)
            accuracy_list.append(accuracy_max_from_single * 100)
            # hog_accuracy_list.append(hog_accuracy * 100)
            # _81_accuracy_list.append(_81_accuracy * 100)
            # _30_accuracy_list.append(_30_accuracy * 100)
            print("accuracy_list:")
            print(accuracy_list)

            # Checkpoint the model triple whenever this round beats the best.
            if accuracy_max_from_single > accuracy_max:
                accuracy_max = accuracy_max_from_single
                model_max_hog = hog_svc_1
                model_max_81 = _81_svc_1
                model_max_30 = _30_svc_1

            # Stop when the unlabeled pool is exhausted in any view,
            # or after the final iteration.
            if len(hog_unlabeled_X) == 0 or len(_81_unlabeled_X) == 0 or len(
                    _30_unlabeled_X) == 0:
                break
            if h == loop_num:
                break

            # Per-class probabilities on the unlabeled pool under the HOG view.
            hog_svc_1_probility = hog_svc_1.predict_proba(hog_unlabeled_X)
            # Predicted labels for the unlabeled pool under the HOG view.
            hog_svc_1_predict_Y = hog_svc_1.predict(hog_unlabeled_X)

            # Per-class probabilities on the unlabeled pool under the 81-dim view.
            _81_svc_1_probility = _81_svc_1.predict_proba(_81_unlabeled_X)
            # Predicted labels for the unlabeled pool under the 81-dim view.
            _81_svc_1_predict_Y = _81_svc_1.predict(_81_unlabeled_X)

            # Per-class probabilities on the unlabeled pool under the 30-dim view.
            _30_svc_1_probility = _30_svc_1.predict_proba(_30_unlabeled_X)
            # Predicted labels for the unlabeled pool under the 30-dim view.
            _30_svc_1_predict_Y = _30_svc_1.predict(_30_unlabeled_X)

            probility_list_1 = [
                hog_svc_1_probility, _81_svc_1_probility, _30_svc_1_probility
            ]
            unlabeled_Y_list_1 = [
                hog_unlabeled_Y, _81_unlabeled_Y, _30_unlabeled_Y
            ]
            predict_Y_list_1 = [
                hog_svc_1_predict_Y, _81_svc_1_predict_Y, _30_svc_1_predict_Y
            ]

            # voted_index_predict_Y_list = Utilities.vote(predict_Y_list_1, unlabeled_Y_list_1, whole_class, topk)
            #
            # voted_Y_list, voted_index_list = Utilities.get_voted_confidence(probility_list_1,
            #                                                                 voted_index_predict_Y_list[0],
            #                                                                 voted_index_predict_Y_list[1], whole_class,
            #                                                                 topk)
            # Select confident samples and their pseudo-labels ("pesudo" spelling
            # kept to match the project API).
            selected_ind_list, selected_pesudo_label_list = Utilities.get_pesudo_label(
                probility_list_1, predict_Y_list_1, unlabeled_Y_list_1,
                whole_class, topk, num, para)
            # Debug output: compare pseudo-labels with the held ground truth.
            a = []
            for i in selected_ind_list:
                a.append(_30_unlabeled_Y[i])
            print(selected_pesudo_label_list)
            print(a)
            # Promote the selected samples into the labeled pool of all views.
            for i in range(len(selected_ind_list)):
                hog_labeled_train_X.append(
                    hog_unlabeled_X[selected_ind_list[i]])
                hog_labeled_train_Y.append(selected_pesudo_label_list[i])
                _81_labeled_train_X.append(
                    _81_unlabeled_X[selected_ind_list[i]])
                _81_labeled_train_Y.append(selected_pesudo_label_list[i])
                _30_labeled_train_X.append(
                    _30_unlabeled_X[selected_ind_list[i]])
                _30_labeled_train_Y.append(selected_pesudo_label_list[i])

            # Drop the promoted samples from every unlabeled pool, keeping
            # the three views index-aligned.
            hog_unlabeled_X = [
                i for j, i in enumerate(hog_unlabeled_X)
                if j not in selected_ind_list
            ]
            hog_unlabeled_Y = [
                i for j, i in enumerate(hog_unlabeled_Y)
                if j not in selected_ind_list
            ]
            _81_unlabeled_X = [
                i for j, i in enumerate(_81_unlabeled_X)
                if j not in selected_ind_list
            ]
            _81_unlabeled_Y = [
                i for j, i in enumerate(_81_unlabeled_Y)
                if j not in selected_ind_list
            ]
            _30_unlabeled_X = [
                i for j, i in enumerate(_30_unlabeled_X)
                if j not in selected_ind_list
            ]
            _30_unlabeled_Y = [
                i for j, i in enumerate(_30_unlabeled_Y)
                if j not in selected_ind_list
            ]

        # Report the best accuracy (as a percentage) and per-class histories.
        print(accuracy_max * 100)
        print(class_1_accuracy_list)
        print(class_2_accuracy_list)
        print(class_3_accuracy_list)
        print(class_4_accuracy_list)
        print(class_5_accuracy_list)
        print(class_6_accuracy_list)
        print(class_7_accuracy_list)
        print(class_8_accuracy_list)
        print(class_9_accuracy_list)
        print(class_10_accuracy_list)
        print(class_11_accuracy_list)
        # Persist the best model of each view, if one was found.
        if model_max_hog is not None:
            print("正在保存hog.model...")
            joblib.dump(model_max_hog, savePath + "hog.model")
            print("保存hog.model完毕。")
        if model_max_81 is not None:
            print("正在保存81.model...")
            joblib.dump(model_max_81, savePath + "81.model")
            print("保存81.model完毕。")
        if model_max_30 is not None:
            print("正在保存30.model...")
            joblib.dump(model_max_30, savePath + "30.model")
            print("保存30.model完毕。")
    # ---- test branch: reload previously trained models ----
    elif trainAndTestFlag == "test":
        if os.path.exists(savePath):
            # Load the saved model files.
            model_max_hog = joblib.load(savePath + "hog.model")
            model_max_81 = joblib.load(savePath + "81.model")
            model_max_30 = joblib.load(savePath + "30.model")
Пример #11
0
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

#Modeling
#from keras.models import Sequential
#from keras.layers import LSTM,Bidirectional,Dense,Embedding,Dropout

#%% Loading data
#https://www.kaggle.com/wcukierski/enron-email-dataset
# skiprows callable skips every row whose index is NOT a multiple of 9,
# i.e. keeps roughly 1/9 of the file as a subsample (row 0, the header,
# is kept because 0 % 9 == 0).
emails = pd.read_csv('emails.csv', skiprows=lambda x: x % 9)

#%% Print one raw message to see what the email format looks like.
print(emails['message'][1])

#%% Build the project's DataPreparation helper around the dataframe.
# NOTE(review): DataPreparation's methods below are project-defined;
# the descriptions here follow the original comments — confirm behavior.
prep = DataPreparation(emails)

# Extract each email's body from the raw message and store it in a new column.
emails['Email'] = prep.bodyExtraction(emails['message'])

# Derive a sentiment label for each email body.
emails['Sentiment'] = prep.labeling(emails['Email'])

# Create a separate two-column dataset: body text and sentiment label.
df = prep.newData(emails['Email'], emails['Sentiment'])

#%% Inspect dtypes / non-null counts of the reduced dataset.
df.info()

#%%
#after extracting the body
Пример #12
0
        plt.title('Support Vector Regression')
        plt.legend()
        plt.show()

    def fit(self, x, y):
        """Grid-search an RBF support-vector regressor and keep the best fit.

        Parameters:
            x: feature matrix accepted by sklearn's SVR.fit.
            y: target values with a ``.values`` attribute (pandas
               Series/DataFrame); flattened via ``.values.ravel()``.

        Side effects:
            Stores the fitted GridSearchCV wrapper in ``self.estimator``
            and prints it.
        """
        # Only the RBF kernel is searched; C candidates span 1 .. 1000.
        parameters = {'kernel': ('rbf', ), 'C': [1e3, 1e2, 1, 10]}
        svr = svm.SVR()
        self.estimator = GridSearchCV(svr, parameters)
        self.estimator.fit(x, y.values.ravel())

        # Fix: the original used the Python 2-only statement form
        # `print self.estimator`, a SyntaxError on Python 3; the
        # parenthesized single-argument call behaves identically on both.
        print(self.estimator)

    def predict(self, x, real_y):
        """Predict targets for *x* with the fitted estimator, then plot the
        predictions against the ground-truth values *real_y*."""
        predictions = self.estimator.predict(x)
        self.plotData(real_y, predictions)


if __name__ == "__main__":
    mysvm = MySVM()
    # Training phase.
    # NOTE(review): DataPreparation(7, 'mid') presumably selects a data
    # split by index and granularity — confirm against DataPreparation.
    dp = DataPreparation(7, 'mid')
    x, y = dp.generateData()
    # Fix: the original used Python 2-only `print "..."` statements,
    # which are SyntaxErrors on Python 3; the parenthesized calls print
    # the identical text on both Python 2 and 3.
    print("=====fit====")
    mysvm.fit(x, y)

    # Prediction phase on the next split.
    dp = DataPreparation(8, 'mid')
    x, y = dp.generateData()
    print("=====predict====")
    mysvm.predict(x, y)
                        prefix_s_step, s_temp_bitmap)
                # recursively try to extend that pattern
                if self.max_pattern_length > size_current_prefix:
                    self.dfs_pruning(prefix_s_step, s_temp_bitmap, s_temp,
                                     s_temp, s_temp[pos],
                                     size_current_prefix + 1)

    # Save the results in a file
    def write_to_file(self, file):
        """Write every mined pattern to '../results-SPAM/<self.file>-out.txt'.

        Each row of ``self.result`` becomes one SPMF-style line: the
        pattern's items joined by " -1 ", terminated by " -1 #SUP: <support>".

        NOTE(review): the *file* parameter is unused (and shadows the
        builtin); the output name comes from ``self.file`` — confirm intent.
        """
        def _to_line(sequence):
            # Render one result row in SPMF output format.
            pattern_text = " -1 ".join(str(x) for x in sequence['Pattern'])
            return pattern_text + " -1 #SUP: " + str(sequence['SUP'])

        lines = self.result.apply(_to_line, axis=1)
        lines.to_csv('../results-SPAM/' + self.file + '-out.txt',
                     index=False,
                     header=None)


if __name__ == '__main__':
    f = 'test1'
    # Prepare the event logs for the chosen input file.
    preparer = DataPreparation(f)
    logs = preparer.logs_preparation()

    # Mine sequential patterns with SPAM: pattern lengths 2..3 at a
    # minimum relative support of 0.4.
    miner = SPAM(f)
    # miner.set_max_gap(1)
    miner.set_min_pattern_length(2)
    miner.set_max_pattern_length(3)
    miner.spam_algorithm(logs, 0.4)

    # Persist the mined patterns to the results directory.
    miner.write_to_file(f)