示例#1
0
def AdaBoostClassifier_predict(features, classes, unknown):
    """
        Predict the most likely author for each unknown text using a
        boosted ensemble (AdaBoost) trained on the labelled features.
    """
    # Fit the preprocessor on the training data, then apply the same
    # normalization to both training and unknown vectors.
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    query_vectors = preprocessor.batch_normalize(unknown)

    booster = AdaBoostClassifier(n_estimators=10, learning_rate=0.998, algorithm='SAMME.R', random_state=1)
    booster.fit(train_vectors, classes)

    return booster.predict(query_vectors)
示例#2
0
def DecisionTreeClassifier_predict(features, classes, unknown):
    """
        Predict the most likely author for each unknown text using a
        single decision tree trained on the labelled features.
    """
    # Normalize training and unknown vectors with the same preprocessor
    # (fitted on the training features only).
    preprocessor = Feature_Preprocessor(features, True, True, 30)
    train_vectors = preprocessor.batch_normalize(features)
    query_vectors = preprocessor.batch_normalize(unknown)

    tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=2, splitter='best')
    tree.fit(train_vectors, classes)

    return tree.predict(query_vectors)
示例#3
0
def KNeighborsClassifier_predict(features, classes, unknown):
    """
        Predict the most likely author for each unknown text using
        distance-weighted k-nearest-neighbours (k=4, Manhattan metric).
    """
    # Fit the preprocessor on the training data and reuse it for the
    # unknown texts so both live in the same normalized space.
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    query_vectors = preprocessor.batch_normalize(unknown)

    knn = KNeighborsClassifier(n_neighbors=4,  weights='distance', algorithm='brute', metric='minkowski', p=1)
    knn.fit(train_vectors, classes)

    return knn.predict(query_vectors)
示例#4
0
def SVM_predict(features, classes, unknown):
    """
        Predict the most likely author for each unknown text using an
        RBF-kernel support vector machine.
    """
    # Normalize training and unknown vectors with one shared preprocessor.
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    query_vectors = preprocessor.batch_normalize(unknown)

    # gamma is scaled by the feature-vector length, as in the other
    # SVM-based predictors in this file.
    svm = SVC(kernel='rbf', C=2.4, degree=1, gamma=0.7/len(train_vectors[0]))
    svm.fit(train_vectors, classes)

    return svm.predict(query_vectors)
示例#5
0
def AdaBoostClassifier_predict_texttype(features, classes, unknown):
    """
        Predict the type of a text (binary classification).
        Parameters optimized for natural vs obfuscated.
    """
    from sklearn.ensemble import AdaBoostClassifier

    # Fit the preprocessor on the labelled data; apply the same
    # normalization to the texts being classified.
    preprocessor = Feature_Preprocessor(features, True, False, 30)
    train_vectors = preprocessor.batch_normalize(features)
    query_vectors = preprocessor.batch_normalize(unknown)

    # More estimators than the authorship variant: tuned for the
    # binary natural-vs-obfuscated task.
    booster = AdaBoostClassifier(n_estimators=80, learning_rate=0.998, algorithm='SAMME.R', random_state=1)
    booster.fit(train_vectors, classes)

    return booster.predict(query_vectors)
示例#6
0
def SVM_predict_rank(features, classes, unknown, actual_classes):
    """
        Provides a ranking of the different authors by likelihood of having
        authored each unknown text.

        Parameters:
            features       -- training feature vectors
            classes        -- author label for each training vector
            unknown        -- feature vectors of the texts to rank
            actual_classes -- true author of each unknown text

        Returns:
            A list with, for each unknown text, the 0-based position of the
            true author in the probability-ordered candidate list (0 means
            the true author was ranked most likely).
    """
    FP = Feature_Preprocessor(features, True, False, 30)
    features = FP.batch_normalize(features)
    unknown = FP.batch_normalize(unknown)

    clf = SVC(probability=True, kernel='rbf', C=2.4, degree=1, gamma=0.7/len(features[0]))
    clf.fit(features, classes)

    rankings = []
    for log_probs, actual in zip(clf.predict_log_proba(unknown), actual_classes):
        # Order candidate authors by descending log-probability, then
        # record where the true author landed in that ordering.
        ordered = [author for author, _ in
                   sorted(zip(clf.classes_, log_probs),
                          key=lambda pair: pair[1], reverse=True)]
        rankings.append(ordered.index(actual))

    return rankings

if __name__ == '__main__':
    from matplotlib import pyplot as plt
    print "Loading data.."
    from feature_extraction.Cached_Features import data
    print "Normalizing..."

    # Select features
    data = data_select_specific_features(data, ['bi_char_dist', 'legomena', 'word_length', 'tri_char_dist', 'mono_tag_dist', 'sentence_length', 'readability'])

    # Get the data separated in features and classes
    features, classes = get_feature_vectors_from_data(data)

    # Compres the features to two numbers (points)
    FP = Feature_Preprocessor(features, False, True, 2)
    features = FP.batch_normalize(features)


    print "Data processed, now plotting..."

    # Convert a list of points to two lists of x and y points (fortran style)
    x = [ p[0] for p in features ]
    y = [ p[1] for p in features ]

    # Split into points of interest and normal points (obfuscated texts ad natural texts)
    interest = [ p for p in zip(x,y,classes) if p[2] == 1 ]
    normal = [ p for p in zip(x,y,classes) if p[2] == 0 ]

    # scatterplot the points
    norml = plt.scatter(zip(*normal)[0], zip(*normal)[1], marker='x', c='b', s=40)