예제 #1
0
def train_and_predict():
    """Cross-validate a random forest on the labeled CSV dataset.

    The feature matrix is the column-wise concatenation of the outputs of
    ``text_to_number``, ``data_normalization`` and ``n_gram_v2`` (n=2) for
    the same CSV file; the target is the file's ``label`` column. Rows are
    shuffled with a fixed seed, then a 10-fold ``cross_val_score`` run is
    executed and the per-fold scores and their mean are printed.

    Returns nothing and writes no model to disk. The classifier is created
    without a ``random_state``, so scores can differ between runs.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)
    user_agent = n_gram_v2(filename, 2)
    class_features = text_to_number(filename)
    mouse_features = data_normalization(filename)
    # Labels as a plain 1-D ndarray. numpy.asarray is used instead of
    # Series.ravel(), which pandas has deprecated (2.2+).
    target = numpy.asarray(data.label)
    # All three feature blocks must have one row per sample, as required by
    # concatenation along axis 1.
    all_features = numpy.concatenate(
        (class_features, mouse_features, user_agent), axis=1)
    # Shuffle the samples (fixed seed) before they are cut into folds.
    all_features, target = shuffle(all_features, target, random_state=0)
    classifier = RandomForestClassifier(n_jobs=-1,
                                        max_depth=47,
                                        min_samples_split=2,
                                        min_samples_leaf=1,
                                        max_features=31,
                                        n_estimators=180)
    # NOTE: evaluation is by cross-validation only; the classifier is never
    # fitted on a single hold-out split and no model file is dumped here.
    scores = cross_val_score(classifier, all_features, target, cv=10)
    print(scores)
    print(numpy.mean(scores))
예제 #2
0
def cross_validate():
    """Run 10-fold cross-validation of a default ``LogisticRegression``.

    The feature matrix is the column-wise concatenation of the outputs of
    ``text_to_number`` and ``data_normalization`` for the CSV file; the
    target is the file's ``label`` column. Rows are shuffled with a fixed
    seed before scoring. The per-fold scores and their mean are printed;
    nothing is returned.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)
    # NOTE(review): the 3-gram features are computed but are not part of the
    # concatenated feature matrix below. The call is kept as-is because
    # n_gram_v2 is defined elsewhere; confirm whether these features were
    # meant to be included or the call can be dropped.
    user_agent = n_gram_v2(filename, 3)
    class_features = text_to_number(filename)
    mouse_features = data_normalization(filename)
    # Labels as a plain 1-D ndarray. numpy.asarray is used instead of
    # Series.ravel(), which pandas has deprecated (2.2+).
    target = numpy.asarray(data.label)
    all_features = numpy.concatenate((class_features, mouse_features), axis=1)
    # Shuffle the samples (fixed seed) before they are cut into folds.
    all_features, target = shuffle(all_features, target, random_state=0)
    classifier = LogisticRegression()
    scores = cross_val_score(classifier, all_features, target, cv=10)
    print(scores)
    print(numpy.mean(scores))
예제 #3
0
def train_and_predict():
    """Fit a 200-neighbour KNN classifier and evaluate it on a hold-out split.

    The feature matrix is the column-wise concatenation of the outputs of
    ``text_to_number``, ``data_normalization`` and ``n_gram_v2`` (n=1) for
    the CSV file; the target is the file's ``label`` column. Rows are
    shuffled with a fixed seed and split 60/40 into train and test sets.

    Side effects: the fitted classifier is written to ``../models/KNN.m``
    with ``joblib.dump``, and ``do_metrics`` is called with the test labels,
    the predicted labels and the second column of ``predict_proba``.
    Returns nothing.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)
    user_agent = n_gram_v2(filename, 1)
    class_features = text_to_number(filename)
    mouse_features = data_normalization(filename)
    # Labels as a plain 1-D ndarray. numpy.asarray is used instead of
    # Series.ravel(), which pandas has deprecated (2.2+).
    target = numpy.asarray(data.label)
    # All three feature blocks must have one row per sample, as required by
    # concatenation along axis 1.
    all_features = numpy.concatenate(
        (class_features, mouse_features, user_agent), axis=1)
    # Shuffle the samples (fixed seed) before splitting.
    all_features, target = shuffle(all_features, target, random_state=0)
    x_train, x_test, y_train, y_test = train_test_split(
        all_features, target, test_size=0.4, random_state=0)
    classifier = KNeighborsClassifier(n_neighbors=200)
    classifier.fit(x_train, y_train)
    joblib.dump(classifier, '../models/KNN.m')
    y_predict = classifier.predict(x_test)
    y_predict_proba = classifier.predict_proba(x_test)
    # Column 1 of predict_proba is passed as the score; this presumes a
    # two-class label column.
    do_metrics(y_test, y_predict, y_predict_proba[:, 1])
예제 #4
0
def write_features_to_file():
    """Assemble every feature column into one table and save it as CSV.

    Starts from the object returned by ``n_gram_v2`` (n=2), which is used
    here as a column-assignable table, adds the first seven columns of
    ``text_to_number`` under fixed names, adds the first two columns of
    ``data_normalization`` as ``mouse_x`` / ``mouse_y``, and writes the
    result to ``../data/6.5w_distinct1_features.csv``.

    The table returned by ``n_gram_v2`` is extended in place. Returns nothing.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    # The n-gram table doubles as the container for all other features.
    all_features = n_gram_v2(filename, 2)
    class_features = text_to_number(filename)
    class_features_names = [
        'event_action_account_input', 'event_action_pwd_input',
        'event_action_login_click', 'java_enable', 'java_disable',
        'cookie_enable', 'cookie_disenable'
    ]
    # Column i of class_features is stored under the i-th name.
    for column, name in enumerate(class_features_names):
        all_features[name] = class_features[:, column]
    mouse_features = data_normalization(filename)
    all_features['mouse_x'] = mouse_features[:, 0]
    all_features['mouse_y'] = mouse_features[:, 1]
    all_features.to_csv('../data/6.5w_distinct1_features.csv')
예제 #5
0
def random_forest():
    """Fit a default random forest on all features and print importances.

    The feature table starts from the object returned by ``n_gram_v2``
    (n=1), to which the first seven columns of ``text_to_number`` (under
    fixed names) and the first two columns of ``data_normalization``
    (``mouse_x`` / ``mouse_y``) are added. The target is the ``label``
    column of the CSV file. The rows are not shuffled.

    One line per feature is printed as ``"<position>. <name>  <importance>"``
    in ascending order of importance, so position 0 is the least important
    feature and the most important one is printed last. Returns nothing.
    """
    filename = '../data/6.5w_distinct1_labeled.csv'
    data = pandas.read_csv(filename)
    # The n-gram table doubles as the container for all other features.
    all_features = n_gram_v2(filename, 1)
    class_features = text_to_number(filename)
    class_features_names = [
        'event_action_account_input', 'event_action_pwd_input',
        'event_action_login_click', 'java_enable', 'java_disable',
        'cookie_enable', 'cookie_disenable'
    ]
    # Column i of class_features is stored under the i-th name.
    for column, name in enumerate(class_features_names):
        all_features[name] = class_features[:, column]
    mouse_features = data_normalization(filename)
    all_features['mouse_x'] = mouse_features[:, 0]
    all_features['mouse_y'] = mouse_features[:, 1]
    # Labels as a plain 1-D ndarray. numpy.asarray is used instead of
    # Series.ravel(), which pandas has deprecated (2.2+).
    target = numpy.asarray(data.label)

    classifier = RandomForestClassifier()
    classifier.fit(all_features, target)
    # Read the importances and column names once instead of on every loop
    # iteration; on scikit-learn forests feature_importances_ is a property
    # that is computed from the fitted trees each time it is accessed.
    importances = classifier.feature_importances_
    feature_names = all_features.columns.values
    # argsort is ascending: least important feature first.
    for position, index in enumerate(numpy.argsort(importances)):
        print('%d. %s  %f' % (
            position, feature_names[index], importances[index]))