set_f.append(f)
                if name_has_substring(n, interest):
                    set_c.append(1)
                else:
                    set_c.append(0)

    return set_f, set_c

if __name__ == '__main__':
    # Script entry point: load the cached feature data, keep a fixed subset of
    # features, run them through Feature_Preprocessor and split the resulting
    # points into separate x and y coordinate lists.
    from matplotlib import pyplot as plt
    print "Loading data.."
    # Imported here rather than at module level, so it only happens when the
    # file is run as a script.
    from feature_extraction.Cached_Features import data
    print "Normalizing..."

    # Keep only the features named in the list below
    data = data_select_specific_features(data, ['bi_char_dist', 'legomena', 'word_length', 'tri_char_dist', 'mono_tag_dist', 'sentence_length', 'readability'])

    # Split the data into feature vectors and their matching classes
    features, classes = get_feature_vectors_from_data(data)

    # Compress the features to two numbers (points).
    # NOTE(review): the meaning of the positional arguments (False, True, 2) is
    # defined by Feature_Preprocessor, which is not visible here; the 2 is
    # presumably the number of output dimensions -- confirm.
    FP = Feature_Preprocessor(features, False, True, 2)
    features = FP.batch_normalize(features)


    print "Data processed, now plotting..."

    # Convert the list of points into two parallel lists: all first
    # coordinates (x) and all second coordinates (y)
    x = [ p[0] for p in features ]
    y = [ p[1] for p in features ]
示例#2
0
        Decide based on _pairs_ if text described as _outset_f_ is obfuscated or not
    """
    regular = []
    obfuscated = []
    for (reg, obf) in pairs:
        obfuscated.append(obf)
        for r in reg:
            regular.append(r)
    features = regular + obfuscated
    classes = [ 0 for _ in xrange(len(regular)) ] + [ 1 for _ in xrange(len(obfuscated)) ]
    return AdaBoostClassifier_predict_texttype(features, classes, outset_f)[0] == 1


if __name__ == '__main__':
    # Script entry point: load the cached feature data, keep a fixed subset of
    # features, build the splits and report, for each de-obfuscation mode,
    # the values returned by get_precision_at_rank together with their mean.
    #
    # Every print below takes a single parenthesised argument, which produces
    # the same output on Python 2 and also parses on Python 3.
    print("Loading data..")
    # Imported here rather than at module level, so it only happens when the
    # file is run as a script.
    from feature_extraction.Cached_Features import data
    print("Working...")

    features = [
        'mono_char_dist',
        'mono_chunk_dist',
        'bi_tag_dist',
        'word_length',
        'legomena',
        'bi_char_dist',
        'readability',
        'mono_tag_dist',
    ]

    samples = 1  # Set to a high number for more accurate measurements
    data = data_select_specific_features(data, features)
    sets = create_splits(data, samples=samples)

    def average(values):
        """Return the arithmetic mean of *values* as a float.

        Raises ZeroDivisionError when *values* is empty.
        """
        # Divide by a float: under Python 2 a plain `/` between integers
        # floors the result, which would silently truncate the mean.
        return sum(values) / float(len(values))

    for deobf in ['never', 'detect', 'always']:
        ranks = get_precision_at_rank(sets, deobf=deobf)
        # Joining the str() of each item with single spaces reproduces the
        # output of the original comma-separated print statement.
        print(" ".join(["deobf:" + deobf, ",  ave(recall):", str(average(ranks)), str(ranks)]))

    print("Done")