def check_features(allfeatures, smallset, star, classifier):
    """Score every candidate feature by the F1 of a voting net built with it.

    The last ``test_size`` entries of ``smallset.index`` are held out as the
    test set and the remaining entries form the training ids. ``test_size``
    is not a parameter; it is read from the enclosing (module) scope.

    For each feature, ``enable_feature(star, feature, smallset)`` yields a
    mapping; its entries whose key is a training id are used to build a
    voting net (``gamma=0.7``, ``mjaccard`` distance over that training
    mapping). Predictions for the held-out ids are then scored with
    ``f1_score`` against ``vote(classifier(id))``.

    Args:
        allfeatures: iterable of features to evaluate one at a time.
        smallset: object whose ``.index`` lists the ids to split.
        star: passed through to ``enable_feature``.
        classifier: callable applied to an id; also handed to
            ``create_voting_net``.

    Returns:
        dict mapping each feature to its F1 score.
    """
    ids = list(smallset.index)
    test_set = ids[-test_size:]
    # Built once as a set: the membership test below runs for every entry of
    # every feature's mapping. Assumes enable_feature leaves smallset.index
    # unchanged (the test split above already relies on that).
    train_ids = set(ids[:-test_size])
    true_values = [vote(classifier(a)) for a in test_set]

    pres_value = -1  # best F1 seen so far, only used for progress output
    res = {}
    for feature in allfeatures:
        enabled_star = enable_feature(star, feature, smallset)
        train_star = {
            apn: fea for apn, fea in enabled_star.items() if apn in train_ids
        }
        vn = create_voting_net(
            gamma=0.7,
            apns=list(train_star.keys()),
            distance=lambda x, y: mjaccard(x, y, train_star),
            classifier=classifier,
        )
        # Prediction needs distances to held-out ids as well, so it uses the
        # full mapping rather than the training subset.
        predictions = predict(
            net=vn,
            test_set=test_set,
            distance=lambda x, y: mjaccard(x, y, enabled_star),
        )
        r1 = f1_score(y_true=true_values, y_pred=predictions)
        if r1 > pres_value:
            print(f"new value detected {r1} {pres_value}")
            pres_value = r1
        res[feature] = r1
    return res
def exp(funcs, labels, test_size=10):
    """Build and save voting nets for a sweep of gamma values in [0, 1].

    ``funcs`` is split with ``train_test_split`` (``random_state=42``); the
    test part is only written to ``/tmp/test.csv``. For each gamma in
    ``0, 1/18, 2/18, ..., 1`` the function calls
    ``make_and_merge(train, labels, gamma)`` and builds a reference net with
    ``create_voting_net`` over ``train.index``. Both results are stored as
    plain dicts in ``nets[gamma]``. The sweep stops early as soon as the
    ``make_and_merge`` result has exactly one key. All collected nets are
    then handed to ``save_nets``.

    Args:
        funcs: data to split; also the third argument of every ``distance``
            call.
        labels: looked up as ``labels.loc[x]['malware_label']``; the value is
            converted with ``int`` and must be 0 or 1.
        test_size: forwarded to ``train_test_split``.
    """
    train, test = train_test_split(funcs, test_size=test_size, random_state=42)
    test.to_csv('/tmp/test.csv')

    def classifier(x):
        # Label 0 -> [0, 1], label 1 -> [1, 0]; a fresh list on every call.
        return [[0, 1], [1, 0]][int(labels.loc[x]['malware_label'])]

    def d(x, y):
        return distance(x, y, funcs)

    intervals = 18
    nets = {}
    for gamma in tqdm([step / intervals for step in range(intervals + 1)]):
        print(f"Current {gamma=}")
        mv = make_and_merge(train, labels, gamma)

        print("Creating reference voting netwrok")
        start = time.time()
        reference_voting = create_voting_net(
            gamma=gamma,
            apns=train.index,
            distance=d,
            classifier=classifier,
        )
        end = time.time()
        print(f"\tElapsed: {end-start}")

        nets[gamma] = [dict(mv), dict(reference_voting)]

        print(f"Anchor points: {len(mv.keys())}")
        if len(mv.keys()) == 1:
            # Larger gammas are skipped once a single key remains.
            break

    save_nets(nets=nets, name=f"{len(train)}-jaccard-votingnets")
def rr(x, gamma, classifier):
    """Build a voting net over the keys of ``x`` and print the elapsed time.

    Args:
        x: mapping; its keys are the ids handed to ``create_voting_net`` and
            the mapping itself is the third argument of every ``distance``
            call.
        gamma: forwarded to ``create_voting_net``.
        classifier: forwarded to ``create_voting_net``.

    Returns:
        Whatever ``create_voting_net`` returns.
    """

    def pairwise(x1, y1):
        return distance(x1, y1, x)

    start = time.time()
    net = create_voting_net(
        gamma=gamma,
        apns=x.keys(),
        distance=pairwise,
        classifier=classifier,
    )
    end = time.time()

    print(f"Creating voting network...{gamma=} {len(x)} Elapsed: {end-start}")
    return net
def run_euclid(train, test, distance, classifier, out_dir="res"):
    """Sweep gamma for the ``adf`` distance and write a CSV summary.

    For each gamma in ``0, 1, 2, 4, ..., 128`` a voting net is built from
    ``train`` and evaluated on ``test`` with ``evaluate_voting_net``. The two
    values it returns, the net size (number of keys) and
    ``size / len(train)`` (column ``compression``) are written to
    ``<out_dir>/euc<len(train)>.csv``, one row per gamma.

    Args:
        train: ids used to build each net; ``len(train)`` is the
            compression denominator and part of the file name.
        test: ids passed to ``evaluate_voting_net``.
        distance: ignored, see the note in the body. Kept so the signature
            stays parallel to ``run_jaccard``.
        classifier: forwarded to both net construction and evaluation.
        out_dir: directory for the CSV; created if missing.
    """
    import os

    # Create the output directory first, so a missing folder cannot discard
    # the results after the whole sweep has run.
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): the ``distance`` argument is discarded here; this
    # function always uses ``adf`` with the module-level ``afs``.
    def euclid_distance(x, y):
        return adf(x, y, afs)

    res = {}
    sizes = {}
    for gamma in tqdm([0, 1, 2, 4, 8, 16, 32, 64, 128]):
        mv = create_voting_net(
            gamma=gamma,
            apns=train,
            distance=euclid_distance,
            classifier=classifier,
        )
        false_negative, false_positives = evaluate_voting_net(
            apns=test,
            classifier=classifier,
            net=mv,
            distance=euclid_distance,
        )
        res[gamma] = [false_negative, false_positives]
        sizes[gamma] = len(mv.keys())

    # NOTE(review): values unpacked as (false_negative, false_positives) are
    # stored under the labels ['fp', 'fn'], so either the local names or the
    # labels are swapped. CSV layout left unchanged; confirm against
    # evaluate_voting_net.
    perfs_di = pd.DataFrame.from_dict(res, orient='index', columns=['fp', 'fn'])
    szs_di = pd.DataFrame.from_dict(sizes, orient='index', columns=['sizes'])
    gj_euc = perfs_di.join(szs_di)

    part_size = len(train)
    gj_euc['compression'] = gj_euc['sizes'] / part_size
    gj_euc.to_csv(os.path.join(out_dir, f"euc{part_size}.csv"))
def run_jaccard(train, test, distance, classifier, out_dir="res"):
    """Sweep gamma for the given distance and write a CSV summary.

    For each gamma in ``0, 0.1, 0.2, 0.4, 0.5, 0.8, 0.85, 0.9, 1.0`` a
    voting net is built from ``train`` and evaluated on ``test`` with
    ``evaluate_voting_net``. The two values it returns, the net size (number
    of keys) and ``size / len(train)`` (column ``compression``) are written
    to ``<out_dir>/jac<len(train)>.csv``, one row per gamma.

    Args:
        train: ids used to build each net; ``len(train)`` is the
            compression denominator and part of the file name.
        test: ids passed to ``evaluate_voting_net``.
        distance: two-argument callable forwarded to net construction and
            evaluation.
        classifier: forwarded to both net construction and evaluation.
        out_dir: directory for the CSV; created if missing.
    """
    import os

    # Create the output directory first, so a missing folder cannot discard
    # the results after the whole sweep has run.
    os.makedirs(out_dir, exist_ok=True)

    res = {}
    sizes = {}
    for gamma in tqdm([0, 0.1, 0.2, 0.4, 0.5, 0.8, 0.85, 0.9, 1.0]):
        mv = create_voting_net(
            gamma=gamma,
            apns=train,
            distance=distance,
            classifier=classifier,
        )
        false_negative, false_positives = evaluate_voting_net(
            apns=test,
            classifier=classifier,
            net=mv,
            distance=distance,
        )
        res[gamma] = [false_negative, false_positives]
        sizes[gamma] = len(mv.keys())

    # NOTE(review): values unpacked as (false_negative, false_positives) are
    # stored under the labels ['fp', 'fn'], so either the local names or the
    # labels are swapped. CSV layout left unchanged; confirm against
    # evaluate_voting_net.
    perfs = pd.DataFrame.from_dict(res, orient='index', columns=['fp', 'fn'])
    szs = pd.DataFrame.from_dict(sizes, orient='index', columns=['sizes'])
    gj = perfs.join(szs)

    part_size = len(train)
    gj['compression'] = gj['sizes'] / part_size
    gj.to_csv(os.path.join(out_dir, f"jac{part_size}.csv"))