Пример #1
0
def check_features(allfeatures, smallset, star, classifier):
    """Score every candidate feature by the F1 of a voting net built with it.

    For each feature, the feature is enabled via ``enable_feature``, a voting
    net is built on the training part of ``smallset.index`` and the held-out
    tail of the index is predicted with that net.

    Note: ``test_size`` is not a parameter; it is read from the enclosing
    (module) scope and determines how many trailing index entries are held
    out for testing.

    Args:
        allfeatures: iterable of features to evaluate one at a time.
        smallset: object exposing ``.index`` (the ordered sample ids).
        star: base feature mapping handed to ``enable_feature``.
        classifier: callable mapping a sample id to a value that ``vote``
            turns into the true label.

    Returns:
        dict mapping each feature to the F1 score obtained with it.
    """
    # Split the index once: the last `test_size` ids are the test set, the
    # rest form the training set. A set gives O(1) membership checks below
    # instead of rebuilding and scanning a list for every key.
    index = list(smallset.index)
    test_set = index[-test_size:]
    train_apns = set(index[:-test_size])

    true_values = [vote(classifier(apn)) for apn in test_set]
    pres_value = -1  # best F1 seen so far, only used for progress output
    res = dict()
    for feature in allfeatures:
        enabled = enable_feature(star, feature, smallset)
        train_star = {
            apn: fea
            for apn, fea in enabled.items()
            if apn in train_apns
        }
        # The net is built with distances over the training entries only.
        vn = create_voting_net(
            gamma=0.7,
            apns=list(train_star.keys()),
            distance=lambda x, y: mjaccard(x, y, train_star),
            classifier=classifier)

        # Prediction needs distances between test and training samples, so
        # it uses the full feature-enabled mapping rather than `train_star`.
        predictions = predict(net=vn,
                              test_set=test_set,
                              distance=lambda x, y: mjaccard(x, y, enabled))

        # NOTE(review): f1_score is presumably sklearn.metrics.f1_score -
        # confirm against the file's imports.
        r1 = f1_score(y_true=true_values, y_pred=predictions)

        if r1 > pres_value:
            print(f"new value detected {r1} {pres_value}")
            pres_value = r1

        res[feature] = r1
    return res
Пример #2
0
def exp(funcs, labels, test_size=10):
    """Build voting nets over a sweep of gamma values and save them.

    ``funcs`` is split into train/test parts (fixed ``random_state=42``); the
    test part is written to ``/tmp/test.csv``. For every gamma in the sweep a
    merged net (``make_and_merge``) and a reference voting net
    (``create_voting_net``) are built on the training part and stored as
    ``nets[gamma] = [merged, reference]``. The sweep stops early once the
    merged net has a single key. Finally all collected nets are passed to
    ``save_nets``.

    Args:
        funcs: data set handed to ``train_test_split`` and to ``distance``.
        labels: object supporting ``labels.loc[x]['malware_label']``.
        test_size: forwarded to ``train_test_split``.

    Returns:
        None. Results are persisted through ``save_nets``.
    """
    nets = dict()

    train, test = train_test_split(funcs, test_size=test_size, random_state=42)
    test.to_csv('/tmp/test.csv')

    def classifier(x):
        # Label 0 -> [0, 1], label 1 -> [1, 0]; a fresh list on every call.
        return [[0, 1], [1, 0]][int(labels.loc[x]['malware_label'])]

    def d(x, y):
        return distance(x, y, funcs)

    # gamma sweeps 0..1 inclusive in `intervals` equal steps.
    intervals = 18
    for gamma in tqdm([x / intervals for x in range(0, intervals + 1)]):
        print(f"Current {gamma=}")
        mv = make_and_merge(train, labels, gamma)
        print("Creating reference voting netwrok")
        start = time.time()
        reference_voting = create_voting_net(gamma=gamma,
                                             apns=train.index,
                                             distance=d,
                                             classifier=classifier)
        end = time.time()
        print(f"\tElapsed: {end-start}")

        nets[gamma] = [dict(mv), dict(reference_voting)]

        print(f"Anchor points: {len(mv.keys())}")
        # A single remaining key means larger gammas cannot shrink the
        # merged net any further, so the sweep ends here.
        if len(mv.keys()) == 1:
            break

    # save nets:
    save_nets(nets=nets, name=f"{len(train)}-jaccard-votingnets")
Пример #3
0
def rr(x, gamma, classifier):
    """Build a voting net over the keys of ``x`` and print the build time.

    Args:
        x: mapping whose keys are used as the net's ``apns`` and which is
            passed as the third argument to ``distance``.
        gamma: forwarded to ``create_voting_net``.
        classifier: forwarded to ``create_voting_net``.

    Returns:
        Whatever ``create_voting_net`` returns.
    """
    def pairwise(first, second):
        # Distance between two entries, looked up against `x`.
        return distance(first, second, x)

    began = time.time()
    voting_net = create_voting_net(
        gamma=gamma,
        apns=x.keys(),
        distance=pairwise,
        classifier=classifier,
    )
    elapsed = time.time() - began
    print(f"Creating voting network...{gamma=} {len(x)} Elapsed: {elapsed}")
    return voting_net
Пример #4
0
def run_euclid(train, test, distance, classifier):
    """Evaluate voting nets for a fixed list of gammas and dump a CSV.

    For each gamma a voting net is built on ``train`` and evaluated on
    ``test``; the evaluation pair and the net size are collected, and the
    summary table is written to ``res/euc<len(train)>.csv``. The table is
    rebuilt and the file overwritten on every iteration.

    Args:
        train: ids used to build each net; ``len(train)`` is the compression
            denominator and part of the output file name.
        test: ids passed to ``evaluate_voting_net``.
        distance: NOTE(review): this argument is ignored - the function
            always uses ``adf(x, y, afs)`` with ``adf``/``afs`` taken from
            the enclosing scope. Confirm whether that is intended.
        classifier: forwarded to net construction and evaluation.

    Returns:
        None.
    """
    def euclid_distance(x, y):
        return adf(x, y, afs)

    gammas = [0, 1, 2, 4, 8, 16, 32, 64, 128]
    scores = {}
    built_nets = {}

    for gamma in tqdm(gammas):
        net = create_voting_net(
            gamma=gamma,
            apns=train,
            distance=euclid_distance,
            classifier=classifier,
        )
        false_negative, false_positives = evaluate_voting_net(
            apns=test,
            classifier=classifier,
            net=net,
            distance=euclid_distance,
        )
        scores[gamma] = [false_negative, false_positives]
        built_nets[gamma] = net.copy()

        # Summary of everything collected so far.
        net_sizes = {}
        for g, stored in built_nets.items():
            net_sizes[g] = len(stored.keys())

        part_size = len(train)
        # NOTE(review): the column labels are in the opposite order to the
        # variable names stored above (false_negative, false_positives).
        # Verify against evaluate_voting_net's return order.
        perf_frame = pd.DataFrame.from_dict(
            scores, orient='index', columns=['fp', 'fn'])
        size_frame = pd.DataFrame.from_dict(
            net_sizes, orient='index', columns=['sizes'])

        table = perf_frame.join(size_frame)
        table['compression'] = table['sizes'] / part_size
        table.to_csv(f"res/euc{part_size}.csv")
Пример #5
0
def run_jaccard(train, test, distance, classifier):
    """Evaluate voting nets for a fixed list of gammas and dump a CSV.

    For each gamma a voting net is built on ``train`` with the supplied
    ``distance`` and evaluated on ``test``. After the sweep, the evaluation
    pairs and net sizes are joined into one table, a ``compression`` column
    (net size divided by ``len(train)``) is added, and the table is written
    to ``res/jac<len(train)>.csv``.

    Args:
        train: ids used to build each net.
        test: ids passed to ``evaluate_voting_net``.
        distance: distance callable forwarded to construction and evaluation.
        classifier: forwarded to construction and evaluation.

    Returns:
        None.
    """
    gammas = [0, 0.1, 0.2, 0.4, 0.5, 0.8, 0.85, 0.9, 1.0]
    scores = {}
    built_nets = {}

    for gamma in tqdm(gammas):
        net = create_voting_net(
            gamma=gamma,
            apns=train,
            distance=distance,
            classifier=classifier,
        )
        false_negative, false_positives = evaluate_voting_net(
            apns=test,
            classifier=classifier,
            net=net,
            distance=distance,
        )
        scores[gamma] = [false_negative, false_positives]
        built_nets[gamma] = net.copy()

    net_sizes = {}
    for g, stored in built_nets.items():
        net_sizes[g] = len(stored.keys())

    part_size = len(train)
    # NOTE(review): the column labels are in the opposite order to the
    # variable names stored above (false_negative, false_positives).
    # Verify against evaluate_voting_net's return order.
    perf_frame = pd.DataFrame.from_dict(
        scores, orient='index', columns=['fp', 'fn'])
    size_frame = pd.DataFrame.from_dict(
        net_sizes, orient='index', columns=['sizes'])

    table = perf_frame.join(size_frame)
    table['compression'] = table['sizes'] / part_size
    table.to_csv(f"res/jac{part_size}.csv")