def train_and_predict():
    """Cross-validate a random forest on the labeled CSV and print the scores.

    The feature matrix is the column-wise concatenation of the outputs of
    ``text_to_number``, ``data_normalization`` and ``n_gram_v2`` (n=2), all
    computed from the same CSV file.  Rows are shuffled with a fixed seed and
    the classifier is evaluated with 10-fold cross-validation; the per-fold
    scores and their mean are printed.

    Despite its name, this function does not fit or persist a model: the
    hold-out fit/dump/metrics path was already disabled (wrapped in a string
    literal), so it and its unused train/test split have been dropped.

    Returns:
        None.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)

    # Feature groups produced by project helpers.
    # NOTE(review): presumably these are user-agent n-grams, encoded
    # categorical fields and normalized mouse coordinates -- confirm against
    # the helper implementations.
    user_agent = n_gram_v2(filename, 2)
    class_features = text_to_number(filename)
    mouse_features = data_normalization(filename)

    # Labels as a 1-D NumPy array.  Series.ravel() is deprecated since
    # pandas 2.2 and a Series is already 1-D, so convert directly.
    target = numpy.asarray(data.label)

    all_features = numpy.concatenate(
        (class_features, mouse_features, user_agent), axis=1)

    # Shuffle samples (fixed seed keeps runs reproducible).
    all_features, target = shuffle(all_features, target, random_state=0)

    classifier = RandomForestClassifier(
        n_jobs=-1,
        max_depth=47,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features=31,
        n_estimators=180,
    )

    scores = cross_val_score(classifier, all_features, target, cv=10)
    print(scores)
    print(numpy.mean(scores))
def cross_validate():
    """Run 10-fold cross-validation of logistic regression on the labeled CSV.

    The feature matrix is the column-wise concatenation of the outputs of
    ``text_to_number`` and ``data_normalization`` for the CSV file.  Rows are
    shuffled with a fixed seed, a default ``LogisticRegression`` is
    cross-validated, and the per-fold scores and their mean are printed.

    Returns:
        None.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)

    # NOTE(review): the original also computed n_gram_v2(filename, 3) here but
    # never added it to the feature matrix, so the result was thrown away.
    # The unused call has been removed (assuming n_gram_v2 has no side
    # effects -- confirm); if those features were meant to be part of the
    # model, add them to the concatenate() call below.
    class_features = text_to_number(filename)
    mouse_features = data_normalization(filename)

    # Labels as a 1-D NumPy array.  Series.ravel() is deprecated since
    # pandas 2.2 and a Series is already 1-D, so convert directly.
    target = numpy.asarray(data.label)

    all_features = numpy.concatenate((class_features, mouse_features), axis=1)

    # Shuffle samples (fixed seed keeps runs reproducible).
    all_features, target = shuffle(all_features, target, random_state=0)

    classifier = LogisticRegression()
    scores = cross_val_score(classifier, all_features, target, cv=10)
    print(scores)
    print(numpy.mean(scores))
def train_and_predict():
    """Fit a k-nearest-neighbours model, save it, and report hold-out metrics.

    The feature matrix is the column-wise concatenation of the outputs of
    ``text_to_number``, ``data_normalization`` and ``n_gram_v2`` (n=1) for
    the CSV file.  Rows are shuffled with a fixed seed and split 60/40 into
    train and test sets.  A ``KNeighborsClassifier`` with 200 neighbours is
    fitted on the training part, dumped to ``../models/KNN.m`` and evaluated
    on the test part through ``do_metrics``.

    Returns:
        None.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)

    user_agent = n_gram_v2(filename, 1)
    class_features = text_to_number(filename)
    mouse_features = data_normalization(filename)

    # Labels as a 1-D NumPy array.  Series.ravel() is deprecated since
    # pandas 2.2 and a Series is already 1-D, so convert directly.
    target = numpy.asarray(data.label)

    all_features = numpy.concatenate(
        (class_features, mouse_features, user_agent), axis=1)

    # Shuffle samples (fixed seed keeps runs reproducible).
    all_features, target = shuffle(all_features, target, random_state=0)

    x_train, x_test, y_train, y_test = train_test_split(
        all_features, target, test_size=0.4, random_state=0)

    classifier = KNeighborsClassifier(n_neighbors=200)
    classifier.fit(x_train, y_train)
    joblib.dump(classifier, '../models/KNN.m')

    y_predict = classifier.predict(x_test)
    y_predict_proba = classifier.predict_proba(x_test)
    # Column 1 holds the probability of the second class in
    # classifier.classes_ -- presumably the positive label; confirm.
    do_metrics(y_test, y_predict, y_predict_proba[:, 1])
def write_features_to_file():
    """Assemble all features into one table and write it to a CSV file.

    Starts from the object returned by ``n_gram_v2`` (n=2), which is used
    here as a DataFrame, and adds one column per entry of the class-feature
    name list plus ``mouse_x`` / ``mouse_y``.  The result is written to
    ``../data/6.5w_distinct1_features.csv``.

    Returns:
        None.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'

    # Collect every feature in a single DataFrame, starting from the n-gram
    # table.  (The original also read the CSV into an unused variable; that
    # read has been removed.)
    all_features = n_gram_v2(filename, 2)

    class_features = text_to_number(filename)
    class_features_names = [
        'event_action_account_input',
        'event_action_pwd_input',
        'event_action_login_click',
        'java_enable',
        'java_disable',
        'cookie_enable',
        'cookie_disenable',
    ]
    # assumes class_features has one column per name, in this order
    # -- TODO confirm against text_to_number
    for column, name in enumerate(class_features_names):
        all_features[name] = class_features[:, column]

    # assumes data_normalization returns columns (mouse_x, mouse_y)
    # -- TODO confirm
    mouse_features = data_normalization(filename)
    all_features['mouse_x'] = mouse_features[:, 0]
    all_features['mouse_y'] = mouse_features[:, 1]

    all_features.to_csv('../data/6.5w_distinct1_features.csv')
def random_forest():
    """Fit a default random forest on all features and print importances.

    Starts from the object returned by ``n_gram_v2`` (n=1), which is used
    here as a DataFrame, and adds the class-feature columns and
    ``mouse_x`` / ``mouse_y``.  A default ``RandomForestClassifier`` is
    fitted on the whole table, and one line per feature is printed as
    ``<position>. <column name> <importance>``.

    Features are listed in ascending order of importance, because
    ``numpy.argsort`` sorts ascending: position 0 is the least important.

    Returns:
        None.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)

    # Collect every feature in a single DataFrame, starting from the n-gram
    # table.
    all_features = n_gram_v2(filename, 1)

    class_features = text_to_number(filename)
    class_features_names = [
        'event_action_account_input',
        'event_action_pwd_input',
        'event_action_login_click',
        'java_enable',
        'java_disable',
        'cookie_enable',
        'cookie_disenable',
    ]
    # assumes class_features has one column per name, in this order
    # -- TODO confirm against text_to_number
    for column, name in enumerate(class_features_names):
        all_features[name] = class_features[:, column]

    # assumes data_normalization returns columns (mouse_x, mouse_y)
    # -- TODO confirm
    mouse_features = data_normalization(filename)
    all_features['mouse_x'] = mouse_features[:, 0]
    all_features['mouse_y'] = mouse_features[:, 1]

    # Labels as a 1-D NumPy array.  Series.ravel() is deprecated since
    # pandas 2.2 and a Series is already 1-D, so convert directly.
    target = numpy.asarray(data.label)

    classifier = RandomForestClassifier()
    classifier.fit(all_features, target)

    # Read the importances once instead of on every loop iteration.
    importances = classifier.feature_importances_
    feature_importances_indices = numpy.argsort(importances)
    column_names = all_features.columns.values

    for num, index in enumerate(feature_importances_indices):
        print('%d. %s %f' % (num, column_names[index], importances[index]))