def get_features(self, domain_list, psl):
    """Build one feature vector per domain from per-label character features.

    Each domain is stripped of its public suffix, split into dot-separated
    labels, and every label is featurized via extract_all_features().  The
    label features are min-max scaled, then recombined per domain: a domain
    with a single remaining label gets zeros(32) prepended to its label
    features; a domain with two labels gets both label-feature rows
    concatenated.

    :param domain_list: iterable of full domain names (strings)
    :param psl: PublicSuffixList instance used to find each domain's suffix
    :return: list of 1-D numpy arrays, intended as one entry per domain

    NOTE(review): a domain with 3+ labels produces MORE than one output row
    (the while-loop pairs the first two labels, then treats the third as a
    fresh single-label domain), so len(result) can exceed the number of
    domains — confirm whether inputs are guaranteed to have at most two
    labels after suffix stripping.
    """
    domain_labels = []    # every label across all domains, in order
    domain_indexes = []   # parallel list: index of the domain each label came from
    index = 0
    for d in domain_list:
        # drop the public suffix and the dot preceding it
        d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        if len(d) == 0:
            # domain was nothing but its public suffix; skip it entirely
            continue
        d_labels = d.split(".")
        for l in d_labels:
            domain_labels.append(l)
            domain_indexes.append(index)
        index = index + 1
    label_features = extract_all_features(domain_labels)
    # scale each feature column into [0, 1]
    minMax = MinMaxScaler()
    label_features = minMax.fit_transform(label_features)
    # padding used when a domain contributes only one label
    # (assumes extract_all_features yields 32 features per label — TODO confirm)
    zero_array = np.zeros(32)
    domain_features = []
    i = 0
    while i < len(label_features):
        # single label for this domain: either last row overall, or the
        # next row belongs to a different domain
        if i == len(label_features ) - 1 or domain_indexes[i] != domain_indexes[i + 1]:
            domain_features.append(np.append(zero_array, label_features[i]))
            i = i + 1
        else:
            # two consecutive rows from the same domain: concatenate them
            domain_features.append(
                np.append(label_features[i], label_features[i + 1]))
            i = i + 2
    return domain_features
def get_dataset(self, DGADomain, benignDomain):
    """Assemble a labelled dataset from DGA and benign domain lists.

    DGA domains are labelled 1, benign domains 0.  Features are computed
    per domain label and merged back into per-domain vectors.

    :param DGADomain: list of algorithmically-generated domain names
    :param benignDomain: list of benign domain names
    :return: (X, y) — feature vectors and the matching 1/0 label array
    """
    combined = DGADomain + benignDomain
    y = np.concatenate((np.ones(len(DGADomain)), np.zeros(len(benignDomain))))
    labels, idx = self.getAllDomainLabels(combined)
    X = self.unionFeature(char_feature.extract_all_features(labels), idx)
    # sanity check: feature rows must line up with the label array
    if len(X) != len(y):
        print("error")
    return X, y
def get_dataset(self, bfile):
    """Produce train/test feature matrices and labels via the pontus helper.

    Splits are obtained from pontus.getTrainTestDomains(); AGDs are
    labelled 1, benign domains 0.

    :param bfile: benign-domain file name inside ../data_sets/
    :return: (X_train, y_train, X_test, y_test)
    """
    def featurize(domains):
        # per-label features merged back into per-domain vectors
        labels, idx = self.pontus.getAllDomainLabels(domains)
        return self.pontus.unionFeature(extract_all_features(labels), idx)

    train_agd, test_agd, train_benign, test_benign = \
        self.pontus.getTrainTestDomains(
            benignFile="../data_sets/{}".format(bfile))
    y_train = np.concatenate(
        (np.ones(len(train_agd)), np.zeros(len(train_benign))))
    y_test = np.concatenate(
        (np.ones(len(test_agd)), np.zeros(len(test_benign))))
    X_train = featurize(train_agd + train_benign)
    X_test = featurize(test_agd + test_benign)
    return X_train, y_train, X_test, y_test
def createdataset(type="train", AGD_file="../data_sets/split_AGDs", BD_file="../data_sets/split_benign_nx.json", datasetname="nx_train_data"):
    """Build a balanced per-label dataset of AGD vs. benign domain labels.

    Reads the AGD json (family -> [train_list, test_list]) and the benign
    json (split name -> domain list), strips each domain's public suffix,
    collects the remaining dot-separated labels into two sets, truncates the
    benign set to the AGD set's size, featurizes everything, and saves the
    features (.npy) plus a domains/labels CSV under ../data_sets/.

    :param type: "train" or anything else for the test split
                 (name kept for backward compatibility although it shadows
                 the builtin)
    :param AGD_file: path to the AGD split json
    :param BD_file: path to the benign split json
    :param datasetname: base name for the saved .npy / .csv outputs
    """
    # the AGD json stores [train, test] per family; choose by split
    if type == "train":
        v_index = 0
    else:
        v_index = 1
    psl = PublicSuffixList()
    with open(AGD_file, "r") as f:
        AGD_dict = json.loads(f.read())
    with open(BD_file, "r") as f:
        bd_dict = json.loads(f.read())
    allAGDs = set()
    allBDs = set()
    for k, v in AGD_dict.items():
        for d in v[v_index]:
            # drop the public suffix and its preceding dot, keep the labels
            pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in pre_d.split("."):
                allAGDs.add(l)
    for d in bd_dict[type]:
        pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        for l in pre_d.split("."):
            allBDs.add(l)
    length = len(allAGDs)
    print(length)
    # cap benign labels at the AGD count for a (roughly) balanced dataset
    allBDs = list(allBDs)[:length]
    allAGDs = list(allAGDs)
    alldomains = allAGDs + allBDs
    # BUG FIX: labels previously assumed len(allBDs) == length, but the
    # benign set may hold FEWER than `length` labels after deduplication,
    # which misaligned every benign row with its label.  Use actual lengths.
    alllabels = list(np.ones(len(allAGDs))) + list(np.zeros(len(allBDs)))
    allfeatures = extract_all_features(alldomains)
    np.save("../data_sets/{}_features.npy".format(datasetname), allfeatures)
    data = dict()
    data["domains"] = pd.Series(alldomains, dtype='str')
    data["labels"] = pd.Series(alllabels, dtype='int32')
    df = pd.DataFrame(data=data)
    df.to_csv("../data_sets/{}.csv".format(datasetname), index=False)
def MY_expirement_process(root_dir="/home/yandingkui/dga_detection/result_data/", m_file="split_AGDs", benign_file="split_benign_ac.json", n=815, m=10, c='entropy'):
    """Train a RandomForest on AGD vs. benign domain labels and print the
    ranked feature importances.

    For each multi-label AGD the LONGEST label is used as the training
    sample; for benign domains, the private-suffix label is used.  Training
    order is shuffled before fitting.

    :param root_dir: directory holding the input json files
    :param m_file: malicious (AGD) split file name
    :param benign_file: benign split file name
    :param n: n_estimators for the RandomForestClassifier
    :param m: max_features for the RandomForestClassifier
    :param c: split criterion ('gini' or 'entropy')
    """
    psl = PublicSuffixList()
    with open(root_dir + m_file, "r") as f:
        malicious_data = json.loads(f.read())
    with open(root_dir + benign_file, "r") as f:
        benign_data = json.loads(f.read())
    train_domains = []
    train_labels = []
    pred_domains = []
    pred_labels = []
    for k, v in malicious_data.items():
        for d in v[0]:
            d_split = d[:d.index(psl.publicsuffix(d)) - 1].split(".")
            if len(d_split) == 1:
                train_domains.append(d_split[0])
            else:
                # BUG FIX: the original loop never updated its running max
                # (so it picked the LAST label, not the longest) and it
                # reused the name `m`, clobbering the max_features
                # parameter — RandomForestClassifier was then built with
                # max_features=0.  Use distinct locals and a real maximum.
                longest_len = 0
                longest = None
                for label in d_split:
                    if len(label) > longest_len:
                        longest_len = len(label)
                        longest = label
                train_domains.append(longest)
            train_labels.append(1)
        for d in v[1]:
            pred_domains.append(d)
            pred_labels.append(1)
    for d in benign_data.get("train"):
        # use the first label of the private suffix as the benign sample
        pri_d = psl.privatesuffix(d)
        lm = pri_d[:pri_d.index(psl.publicsuffix(pri_d)) - 1]
        train_domains.append(lm)
        train_labels.append(0)
    for d in benign_data.get("pred"):
        pred_domains.append(d)
        pred_labels.append(0)
    train_features = char_feature.extract_all_features(train_domains)
    # shuffle samples and labels together before fitting
    index = list(range(len(train_domains)))
    random.shuffle(index)
    real_train_features = []
    real_train_labels = []
    for i in index:
        real_train_features.append(train_features[i])
        real_train_labels.append(train_labels[i])
    clf = RandomForestClassifier(n_estimators=n, max_features=m, criterion=c, random_state=0)
    clf.fit(real_train_features, real_train_labels)
    print("Pontus:feature_importance_")
    im = clf.feature_importances_
    # rank features by importance, descending; report as (1-based index, score)
    feature_items = []
    for i in range(len(im)):
        feature_items.append((i + 1, im[i]))
    feature_items.sort(key=takeSecond, reverse=True)
    print(feature_items)
def getDomainLabelFeatures(self, domainlabels):
    """Compute the character-level feature matrix for a list of domain labels.

    :param domainlabels: list of domain label strings
    :return: feature matrix as produced by char_feature.extract_all_features
    """
    features = char_feature.extract_all_features(domainlabels)
    return features
def get_suspicious(year, month, day):
    """Collect suspicious (likely DGA) private-suffix domains for one day.

    If ../result_data/<YYYYMMDD>domains.txt already exists, its contents are
    reused; otherwise the day's 24 hourly capture files are read, each
    domain's labels (public suffix stripped) are featurized, the persisted
    "ac" model classifies every label, and any domain with at least one
    label predicted 1 contributes its private suffix to the result set,
    which is then written to the domains.txt file.  In both branches the
    final set is handed to check_active_domains().

    :param year: capture year (int)
    :param month: capture month (int, zero-padded in file names)
    :param day: capture day (int, zero-padded in file names)
    """
    timestring = "{}{:0>2d}{:0>2d}".format(year, month, day)
    suspicious_domains_set = set()
    if os.path.exists("../result_data/{}domains.txt".format(timestring)):
        # cached result: one suspicious domain per line
        with open("../result_data/{}domains.txt".format(timestring), "r") as f:
            for r in f:
                suspicious_domains_set.add(r.strip())
        check_active_domains(suspicious_domains_set, timestring)
    else:
        init_domain_set = set()
        # get all domains
        for hour in range(24):
            file_path = "{}{:0>2d}{:0>2d}{:0>2d}".format(year, month, day, hour)
            if not os.path.exists("../result_data/{}".format(file_path)):
                continue
            with open("../result_data/{}".format(file_path), "r") as f:
                for r in f:
                    # hourly files are CSV-ish; the domain is field 1
                    domain = r.strip().split(",")[1]
                    init_domain_set.add(domain)
        psl = PublicSuffixList()
        domain_labels = []   # all labels across all domains
        labels_labels = []   # parallel list: owning domain's index in domains_list
        i = 0
        # get labels
        domains_list = list(init_domain_set)
        for d in domains_list:
            # strip the public suffix and its preceding dot
            s = d[:d.index(psl.publicsuffix(d)) - 1]
            for l in s.split("."):
                if len(l) > 0:
                    domain_labels.append(l)
                    labels_labels.append(i)
            i = i + 1
        features_path = "../result_data/{}_features.npy".format(timestring)
        if os.path.exists(features_path):
            # reuse cached features for this day
            features = np.load(features_path)
        else:
            features = extract_all_features(domain_labels)
            np.save(features_path, features)
        # classifier identifies labels
        clf = joblib.load("../result_data/ac_model.m")
        pred_labels = clf.predict(features)
        domain_index = set()
        for i in range(len(labels_labels)):
            if pred_labels[i] == 1:
                domain_index.add(labels_labels[i])
        # get suspicious domains
        for index in domain_index:
            ps = psl.privatesuffix(domains_list[index])
            if ps is None:
                # e.g. the domain is itself a public suffix — nothing to report
                continue
            suspicious_domains_set.add(ps)
        print("{} domains".format(len(suspicious_domains_set)))
        with open("../result_data/{}domains.txt".format(timestring), "w") as f:
            f.write("\n".join(suspicious_domains_set))
        print("save finish")
        # dgarchive check
        check_active_domains(suspicious_domains_set, timestring)
                root_dir, t)))
            # NOTE(review): this chunk begins mid-method — the statements above
            # belong to a training method whose start is outside this view.
            self.save_model(features, labels, type=t, parameters=p)

    def test_model(self, test_data, real_labels, type):
        """Load the persisted model for `type` and print its metrics.

        :param test_data: feature matrix to classify
        :param real_labels: ground-truth labels (1 = DGA, 0 = benign)
        :param type: model kind, e.g. "ac" or "nx", selecting the .m file

        NOTE(review): sklearn metrics expect (y_true, y_pred); here the
        predictions are passed FIRST, which swaps precision and recall —
        confirm whether this ordering is intentional.
        """
        print("{} model test result:".format(type))
        clf = joblib.load("../result_data/{}_model.m".format(type))
        pred_labels = clf.predict(test_data)
        print(pred_labels)
        print("accuracy:{}\nrecall:{}\nprecision:{}\nf1-score:{}" \
              .format(accuracy_score(pred_labels, real_labels), \
                      recall_score(pred_labels, real_labels), \
                      precision_score(pred_labels, real_labels), \
                      f1_score(pred_labels, real_labels)))

    def test(self):
        """Evaluate both saved models ("ac" and "nx") on their pred datasets."""
        types = ["ac", "nx"]
        root_dir = "../data_sets/"
        for i in range(2):
            t = types[i]
            features, labels = self.get_data(
                os.path.abspath("{}{}_pred_data.csv".format(root_dir, t)),
                os.path.abspath("{}{}_pred_data_features.npy".format(
                    root_dir, t)))
            self.test_model(features, labels, t)

if __name__ == "__main__":
    modelextractor = ModelExtractor()
    # modelextractor.test()
    features = extract_all_features(["www", "xxfeee0d8", "validttu"])
    modelextractor.test_model(features, [0, 1, 0], "ac")
def getDomainFeatures(self, domains):
    """Convert full domain names into per-domain feature vectors.

    :param domains: list of domain name strings
    :return: per-domain feature vectors built from per-label features
    """
    labels, idx = self.getAllDomainLabels(domains)
    perLabel = char_feature.extract_all_features(labels)
    return self.unionFeature(perLabel, idx)