# Consolidated imports for the modules in this section. The third-party imports are
# standard; NsLog, domain_parser, rule_extraction and gib_detect_train are project-local
# helpers whose import paths are assumed to follow the repository layout.
import datetime
import json
import pickle
import pprint
import re

import numpy as np
import requests
import enchant
import editdistance

from io import StringIO
from traceback import format_exc

from scipy.io import arff
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# Project-local modules, assumed to be importable from the repository:
# NsLog, domain_parser, rule_extraction, gib_detect_train


class machine_learning_algorithm:

    def __init__(self, algorithm, train_data_name="gsb.arff"):
        self.logger = NsLog("log")
        self.path_output_arff = "../output/arff/"
        self.path_test_output = ""
        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.train_data_name = train_data_name
        self.rule_calculation = rule_extraction()
        self.time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")

        if algorithm == 'NB':
            self.model = self.create_model_NB()
        elif algorithm == 'RF':
            self.model = self.create_model_RF()

    def __txt_to_list(self, txt_object):
        lst = []
        for line in txt_object:
            lst.append(line.strip())
        txt_object.close()
        return lst

    def preparing_train_data(self, file_name="gsb.arff"):
        train = []
        target = []
        try:
            train_dataset, train_meta = arff.loadarff(
                open("{0}{1}".format(self.path_output_arff, file_name), "r"))
            train = train_dataset[train_meta.names()[:-1]]  # everything but the last column
            target = train_dataset[train_meta.names()[len(train_meta.names()) - 1]]  # last column
            train = np.asarray(train.tolist(), dtype=np.float32)  # convert structured rows to a float matrix
        except Exception:
            self.logger.debug("error while training with " + file_name)
            self.logger.error("Error : {0}".format(format_exc()))

        return train, target

    def preparing_test_data(self, test_dataset_list):
        try:
            feat_json = open("../output/test-output/json-" + self.time_now + ".txt", "w")
            feat_arff = open("../output/test-output/arff-" + self.time_now + ".arff", "w")

            # parse the test domains to JSON, without class labels
            self.test_parsed_domains = self.parser_object.parse_nonlabeled_samples(test_dataset_list)

            # rule calculation for test samples without class information -- output in JSON format
            test_features = self.rule_calculation.extraction(self.test_parsed_domains)

            # convert the JSON produced for the test samples to ARFF; there is no class column
            arff_test_str = self.json2arff_object.convert_for_test(test_features, '')

            # feat_json.write(json.dumps(test_features))
            feat_arff.write(arff_test_str)
            feat_arff.close()
            feat_json.close()

            arff_raw = StringIO(arff_test_str)
            test_dataset, test_meta = arff.loadarff(arff_raw)
            test = test_dataset[test_meta.names()]
            test = np.asarray(test.tolist(), dtype=np.float32)
        except Exception:
            self.logger.error("Error while preparing test data : {0}".format(format_exc()))

        return test, self.test_parsed_domains

    def create_model_NB(self):
        train, target = self.preparing_train_data()
        gnb = GaussianNB()
        model = gnb.fit(train, target)
        return model

    def create_model_RF(self):
        train, target = self.preparing_train_data()
        clf = RandomForestClassifier(n_estimators=10, random_state=0, verbose=1)
        model = clf.fit(train, target)
        return model

    def model_run(self, test):
        # use the model selected in __init__ instead of retraining a new RF on every call
        model = self.model
        model_pre = model.predict(test)
        model_probability = model.predict_proba(test)

        model_pre_list = []
        for p in model_pre:
            model_pre_list.append(str(p).replace("b'", "").replace("'", ""))

        model_probability = model_probability.tolist()

        return model_pre_list, model_probability

    def output(self, test_data):
        test, test_parsed_domains = self.preparing_test_data(test_data)
        model_pre, model_probability = self.model_run(test)
        test_parsed_domain = self.test_parsed_domains

        result_list = []
        for test_domain in test_parsed_domain:
            result = {}
            result['domain'] = test_domain['url']
            result['id'] = test_domain['id']
            result['predicted_class'] = model_pre[test_domain['id']]
            result['probability_phish'] = (model_probability[test_domain['id']][1] /
                                           sum(model_probability[test_domain['id']])) * 100
            result['probability_legitimate'] = (model_probability[test_domain['id']][0] /
                                                sum(model_probability[test_domain['id']])) * 100
            result_list.append(result)

        test_result = open("../output/test-output/result-" + self.time_now + ".txt", "w")
        test_result.write(json.dumps(result_list))
        test_result.close()

        return result_list

    def accuracy(self):
        model = self.model
        test_data, test_label = self.preparing_train_data()
        scores = cross_val_score(model, test_data, test_label, cv=10)
        return scores

    def confusion_matrix(self, name):
        """
        The training dataset (gsb.arff) is already embedded in the model. The dataset for which
        we want a confusion matrix is read in ARFF format via preparing_train_data and split into
        data and labels. The data is run through the model and the predicted labels are collected
        in model_pre. test_label is converted from a bytes array to unicode strings, and then the
        confusion matrix is computed.
        :param name: ARFF file name of the dataset to evaluate
        :return: confusion matrix with labels ['phish', 'legitimate']
        """
        test, test_label = self.preparing_train_data(file_name=name)
        model_pre, model_pro = self.model_run(test)

        test_label_unicode = []
        for t in test_label:
            test_label_unicode.append(str(t, 'utf-8'))

        return confusion_matrix(test_label_unicode, model_pre, labels=['phish', 'legitimate'])
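# Minimal usage sketch, not part of the original module: it assumes "../output/arff/gsb.arff"
# exists and that the project-local parser/rule modules are importable. It only illustrates
# how the class is driven end to end.
if __name__ == "__main__":
    ml = machine_learning_algorithm(algorithm='RF', train_data_name="gsb.arff")
    print(ml.accuracy())                    # 10-fold cross-validation scores on the training ARFF
    print(ml.confusion_matrix("gsb.arff"))  # confusion matrix for a chosen ARFF file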
class WordSplitterClass(object):

    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_brand_file = "All_Brand.txt"
        self.dictionary_en = enchant.DictWithPWL(
            "en_US", "{0}{1}".format(self.path_data, self.name_brand_file))
        # self.__file_capitalize(self.path_data, self.name_brand_file)
        self.pp = pprint.PrettyPrinter(indent=4)

    def _split(self, gt7_word_list):
        return_word_list = []

        for word in gt7_word_list:
            try:
                ss = {'raw': word, 'splitted': []}

                # strip any digits from the word
                word = re.sub(r"\d+", "", word)
                sub_words = []

                if not self.dictionary_en.check(word):
                    # if the word is in the dictionary it is returned as-is (see the else branch);
                    # otherwise move on to splitting it.

                    # generate candidate sub-words longer than 3 characters
                    for number in range(len(word), 3, -1):
                        for l in range(0, len(word) - number + 1):
                            if self.dictionary_en.check(self.__capitalize(word[l:l + number])):
                                # once a word is detected, it is replaced with '*' characters so it
                                # does not cause false positives while detecting the remaining words
                                w = self.__check_last_char(word[l:l + number])
                                sub_words.append(w)
                                word = word.replace(w, "*" * len(w))

                    rest = max(re.split(r"\*+", word), key=len)
                    if len(rest) > 3:
                        sub_words.append(rest)

                    split_w = sub_words

                    for l in split_w:
                        for w in reversed(split_w):
                            # if a detected word also occurs inside a longer detected word, it is a
                            # false positive and is removed, e.g. "secure" / "cure": "cure" is dropped.
                            if l != w:
                                # todo: add edit distance
                                if l.find(w) != -1 or l.find(w.lower()) != -1:
                                    sub_words.remove(w)

                    if len(sub_words) == 0:
                        # if no sub-word was found, the raw word is returned unchanged
                        sub_words.append(word.lower())
                else:
                    sub_words.append(word.lower())

                ss['splitted'] = sub_words
                return_word_list.append(ss)
            except Exception:
                self.logger.debug("error while processing |" + word + "|")
                self.logger.error(
                    "word_splitter.split() probably received an empty sequence / Error : {0}"
                    .format(format_exc()))

        return return_word_list

    def _splitl(self, gt7_word_list):
        result = []
        for val in self._split(gt7_word_list):
            result += val["splitted"]
        return result

    def _splitw(self, word):
        word_l = []
        word_l.append(word)
        result = self._split(word_l)
        return result

    def __check_last_char(self, word):
        confusing_char = ['s', 'y']
        last_char = word[len(word) - 1]
        word_except_last_char = word[0:len(word) - 1]

        if last_char in confusing_char:
            if self.dictionary_en.check(word_except_last_char):
                return word_except_last_char

        return word

    def __clear_fp(self, sub_words):
        length_check = 0
        for w in sub_words:
            if (length_check + len(w)) < self.length + 1:
                length_check = length_check + len(w)
            else:
                sub_words.remove(w)

        sub_words = self.__to_lower(sub_words)
        return sub_words

    def __to_lower(self, sub_words):
        lower_sub_list = []
        for w in sub_words:
            lower_sub_list.append(str(w.lower()))
        return lower_sub_list

    def __capitalize(self, word):
        return word[0].upper() + word[1:]

    def __file_capitalize(self, path, name):
        """
        For custom words to be recognized by the enchant package, they must start with a capital
        letter. Before a word is checked, its first letter is capitalized and it is then looked up
        in the dictionary. For this reason the words in the file were also capitalized, saved, and
        used in that form.
        :return:
        """
        personel_dict_txt = open("{0}{1}".format(path, name), "r")
        personel_dict = []

        for word in personel_dict_txt:
            personel_dict.append(self.__capitalize(word.strip()))
        personel_dict_txt.close()

        personel_dict_txt = open("{0}{1}-2".format(path, name), "w")
        for word in personel_dict:
            personel_dict_txt.write(word + "\n")
        personel_dict_txt.close()
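# Minimal usage sketch, not part of the original module: it assumes "../data/All_Brand.txt"
# exists and that pyenchant has an en_US dictionary installed. _splitw wraps a single word,
# _splitl flattens the per-word results into one list of sub-words.
if __name__ == "__main__":
    splitter = WordSplitterClass()
    print(splitter._splitw("securepaypal"))  # e.g. [{'raw': 'securepaypal', 'splitted': [...]}]
    print(splitter._splitl(["appleidlogin", "facebooksupport"]))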
class json2arff:

    def __init__(self):
        self.logger = NsLog("log")

    def convert_for_train(self, features, param):
        # ARFF header
        try:
            ArffStr = '''@relation weka-test\n\n'''
            features_keys_url = list(features[0]['url_features'].keys())
            features_keys_active = []

            if param == '-a':
                features_keys_active = list(features[0]['active_features'].keys())

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            if param == '-a':
                for line in features_keys_active:
                    ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            ArffStr = ArffStr + '@attribute class {phish, legitimate}' + "\n\n@data\n"
        except Exception:
            self.logger.debug("Error - number of samples passed to json2arff: " + str(len(features)) +
                              "\nurl_feature_keys: " + str(features_keys_url) +
                              "\nactive_features_key: " + str(features_keys_active))
            self.logger.error("Error Arff Header : {0}".format(format_exc()))
        # end of header

        for each_domain in features:
            try:
                tmp = ""
                for key in features_keys_url:
                    tmp = tmp + str(each_domain['url_features'][key]) + ","

                if param == '-a':
                    for key_a in features_keys_active:
                        tmp = tmp + str(each_domain['active_features'][key_a]) + ","

                tmp = tmp + each_domain['info']['class'] + "\n"
                ArffStr = ArffStr + tmp
            except Exception:
                self.logger.debug("Error in sample being converted to ARFF :\n" + str(each_domain))
                self.logger.error("Error Arff Body : {0}".format(format_exc()))

        return ArffStr

    def convert_for_test(self, features, param):
        # todo: update according to active rules
        # ARFF header
        ArffStr = '''@relation weka-test\n\n'''
        features_keys_url = features[0]['url_features'].keys()

        if param == '-dns':
            features_keys_dns = features[0]['dns_features'].keys()

        for line in features_keys_url:
            ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

        if param == '-dns':
            for key in features_keys_dns:
                ArffStr = ArffStr + '@attribute ' + key + " numeric\n"

        ArffStr = ArffStr + "\n@data\n"
        # end of header

        for each_domain in features:
            tmp = ""
            for key in features_keys_url:
                tmp = tmp + str(each_domain['url_features'][key]) + ","

            if param == '-dns':
                for key_dns in features_keys_dns:
                    tmp = tmp + str(each_domain['dns_features'][key_dns]) + ","

            tmp = tmp[0:len(tmp) - 1] + "\n"
            ArffStr = ArffStr + tmp

        return ArffStr

    def convert_for_NLP_without_features(self, features):
        # ARFF header
        try:
            ArffStr = '''@relation weka-test\n\n'''
            ArffStr += '@attribute words string\n'
            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += "'"
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "
                ArffStr = ArffStr.strip() + "',{0}\n".format(sample['info']['class'])
        except Exception:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr

    def convert_for_NLP_with_features(self, features):
        # ARFF header
        try:
            features_keys_url = list(features[0]['url_features'].keys())
            ArffStr = '''@relation weka-test\n\n'''
            ArffStr += '@attribute words string\n'

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += '"'
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "
                ArffStr = ArffStr.strip() + '",'

                for key in features_keys_url:
                    ArffStr += str(sample['url_features'][key]) + ","

                ArffStr = ArffStr.strip() + '{0}\n'.format(sample['info']['class'])
        except Exception:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr
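# Minimal usage sketch, not part of the original module, using a hypothetical two-sample
# feature list; the dict layout ('url_features', 'info' -> 'class') mirrors what the rule
# extraction step produces upstream.
if __name__ == "__main__":
    demo_features = [
        {'url_features': {'domain_length': 9, 'domain_digit_count': 0}, 'info': {'class': 'legitimate'}},
        {'url_features': {'domain_length': 23, 'domain_digit_count': 4}, 'info': {'class': 'phish'}},
    ]
    converter = json2arff()
    print(converter.convert_for_train(demo_features, ''))  # ARFF with a class column
    print(converter.convert_for_test(demo_features, ''))   # ARFF without a class column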
class url_rules:

    def __init__(self):
        print("initializing")
        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        self.path_alexa_files = "../data/alexa-tld/"
        self.nlp_manager = nlp_class()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.word_splitter = WordSplitterClass()

        allbrand_txt = open(self.path_data + self.name_brand_file, "r")
        self.allbrand = self.__txt_to_list(allbrand_txt)

        # trie
        # self.trie_alexa_tld = pygtrie.CharTrie(json.loads(open("constant/alexa_tld_json.txt", "r").read()))
        # self.trie_alexa_tldsiz = pygtrie.CharTrie(json.loads(open("constant/alexa_tldsiz_dct.json", "r").read()))

    def __txt_to_list(self, txt_object):
        lines = []
        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines

    def rules_main(self, domain, tld, subdomain, path, words_raw):
        features = {}
        info_nlp = {}
        try:
            features.update(self.digit_count(domain, subdomain, path))    # digit count
            features.update(self.length(domain, subdomain, path))         # length
            features.update(self.tld_check(tld))                          # tld check
            features.update(self.check_rule_5(words_raw))                 # www-com
            features.update(self.punny_code(domain))                      # punycode
            features.update(self.random_domain(domain))                   # random_domain
            features.update(self.subdomain_count(subdomain))              # subdomain count
            features.update(self.char_repeat(words_raw))                  # char_repeat
            # features.update(self.alexa_check(domain, tld))              # alexa1m check
            # features.update(self.alexa_trie(domain, tld))               # alexa1m check, trie
            features.update(self.special_chars(domain, subdomain, path))  # - . / @
            features.update(self.check_domain_in_list(domain))

            result_nlp = self.nlp_features(words_raw)
            features.update(result_nlp['features'])                       # words_info
            info_nlp = result_nlp['info']
        except Exception:
            self.logger.error("url_rules.main() Error : {0}".format(format_exc()))

        return info_nlp, features

    def digit_count(self, domain, subdomain, path):
        result = {'domain_digit_count': 0, 'subdomain_digit_count': 0, 'path_digit_count': 0}

        for letter in domain:
            if letter.isdigit():
                result['domain_digit_count'] = result['domain_digit_count'] + 1

        for letter in subdomain:
            if letter.isdigit():
                result['subdomain_digit_count'] = result['subdomain_digit_count'] + 1

        for letter in path:
            if letter.isdigit():
                result['path_digit_count'] = result['path_digit_count'] + 1

        return result

    def length(self, domain, subdomain, path):
        domain_uzunluk = len(domain)
        subdomain_uzunluk = len(subdomain)
        path_uzunluk = len(path)

        result = {}
        result['domain_length'] = domain_uzunluk
        result['subdomain_length'] = subdomain_uzunluk
        result['path_length'] = path_uzunluk
        return result

    def tld_check(self, tld):
        common_tld = ["com", "org", "net", "de", "edu", "gov"]
        result = {}

        if tld in common_tld:
            result["isKnownTld"] = 1
        else:
            result["isKnownTld"] = 0

        return result

    def check_rule_5(self, words_raw):
        result = {'www': 0, "com": 0}
        for word in words_raw:
            if not word.find('www') == -1:
                result['www'] = result['www'] + 1
            if not word.find('com') == -1:
                result['com'] = result['com'] + 1
        return result

    def punny_code(self, line):
        result = {}
        if line.startswith("xn--"):
            result['punnyCode'] = 1
            return result
        else:
            result['punnyCode'] = 0
            return result

    def random_domain(self, domain):
        result = {'random_domain': self.nlp_manager.check_word_random(domain)}
        return result

    def subdomain_count(self, line):
        sub = line.split(".")
        result = {}
        result['subDomainCount'] = len(sub)
        return result

    def __all_same(self, items):
        return all(x == items[0] for x in items)

    def char_repeat(self, words_raw):
        result = {'char_repeat': 0}
        repeat = {'2': 0, '3': 0, '4': 0, '5': 0}
        part = [2, 3, 4, 5]

        # sliding window: take as many characters as the repeat length and check
        # whether they are all the same; if so, increase the counter
        for word in words_raw:
            for char_repeat_count in part:
                for i in range(len(word) - char_repeat_count + 1):
                    sub_word = word[i:i + char_repeat_count]
                    if self.__all_same(sub_word):
                        repeat[str(char_repeat_count)] = repeat[str(char_repeat_count)] + 1

        result['char_repeat'] = sum(list(repeat.values()))
        return result

    def alexa_check(self, domain, tld):
        is_find_tld = 0
        is_find = 0
        line = domain + "." + tld
        letter = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                  "n", "o", "p", "r", "s", "t", "u", "v", "y", "z", "w", "x", "q",
                  "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]
        try:
            if line[0] in letter:
                alexa_txt = open("{0}{1}.txt".format(self.path_alexa_files, line[0]), "r")

                alexaList_tld = []  # with tld
                alexa_list = []     # without tld

                for alexa_line in alexa_txt:
                    alexaList_tld.append(alexa_line.strip())
                    alexa_list.append(alexa_line.strip().split(".")[0])

                alexa_txt.close()

                for alexa_line in alexaList_tld:
                    if line.strip() == alexa_line.strip():
                        is_find_tld = 1
                        break

                for alexa_line in alexa_list:
                    line_domain = line.split(".")[0]
                    if line_domain.strip() == alexa_line.strip():
                        is_find = 1
                        break
        except Exception:
            self.logger.debug("error while processing " + line + " due to its length")
            self.logger.error("url_rules.check_rule_11()-Alexa / Error : {0}".format(format_exc()))

        result = {}
        if is_find_tld == 1:
            result['alexa1m_tld'] = 1
        else:
            result['alexa1m_tld'] = 0

        if is_find == 1:
            result['alexa1m'] = 1
        else:
            result['alexa1m'] = 0

        return result

    def alexa_trie(self, domain, tld):
        line = domain + "." + tld
        result = {}
        try:
            # if self.alexa1mm[line[0].lower()].has_key(line):
            if self.trie_alexa_tld.has_key(line):
                result['alexa1m_tld_trie'] = 1
            else:
                result['alexa1m_tld_trie'] = 0

            if self.trie_alexa_tldsiz.has_key(domain):
                result['alexa1m_tldsiz_trie'] = 1
            else:
                result['alexa1m_tldsiz_trie'] = 0
        except Exception:
            self.logger.debug("error while checking " + line + " against Alexa")
            self.logger.error("url_rules.check_rule_11()-Alexa / Error : {0}".format(format_exc()))

        return result

    def special_chars(self, domain, subdomain, path):
        special_char = {'-': 0, ".": 0, "/": 0, '@': 0, '?': 0, '&': 0, '=': 0, "_": 0}
        special_char_letter = special_char.keys()

        for l in domain:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        for l in subdomain:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        for l in path:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        return special_char

    def check_domain_in_list(self, domain):
        result = {}
        if domain in self.allbrand:
            result['domain_in_brand_list'] = 1
        else:
            result['domain_in_brand_list'] = 0
        return result

    def nlp_features(self, words_raw):
        """
        keywords_in_words, brands_in_words, dga_in_words, len_lt_7, len_gt_7
        """
        grouped_words = self.nlp_manager.parse(words_raw)
        splitted_words = self.word_splitter._splitl(grouped_words['len_gt_7'])

        """
        found_keywords, found_brands, similar_to_keyword, similar_to_brand, other_words, target_words
        """
        fraud_analyze_result = self.nlp_manager.fraud_analysis(grouped_words, splitted_words)

        result = self.nlp_manager.evaluate(grouped_words, fraud_analyze_result, splitted_words)

        split = {'raw': grouped_words['len_gt_7'], 'splitted': splitted_words}
        result['info']['compoun_words'] = split

        return result
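# Minimal usage sketch, not part of the original module: it assumes "data/allbrand.txt",
# "../data/keywords.txt" and the gib_model.pki pickle are present, since url_rules
# instantiates nlp_class and WordSplitterClass. The URL is assumed to be pre-split by the
# caller into domain/tld/subdomain/path plus a raw word list.
if __name__ == "__main__":
    rules = url_rules()
    info, feats = rules.rules_main(domain="paypal-secure-login",
                                   tld="com",
                                   subdomain="www",
                                   path="/account/update",
                                   words_raw=["paypal", "secure", "login", "account", "update"])
    print(feats)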
class nlp_class:

    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_keywords = "keywords.txt"
        self.name_brand_file = "allbrand.txt"
        self.name_random_model = "gib_model.pki"

        model_data = pickle.load(open(self.name_random_model, 'rb'))
        self.model_mat = model_data['mat']
        self.threshold = model_data['thresh']

        self.allbrand = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_brand_file), "r"))
        self.keywords = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_keywords), "r"))

    def __txt_to_list(self, txt_object):
        lines = []
        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines

    def __is_similar_to_any_element(self, word, elements):
        target = ''
        for l in elements:
            if editdistance.eval(word, l) < 2:
                target = l

        if len(target) > 0:
            return word
        else:
            return 0

    def parse(self, words):
        keywords_in_words = []
        brands_in_words = []
        similar_to_brands = []
        similar_to_keywords = []
        dga_in_words = []
        len_gt_7 = []
        len_lt_7 = []

        try:
            for word in words:
                word = re.sub(r"\d+", "", word)

                if word in self.keywords:
                    keywords_in_words.append(word)
                elif word in self.allbrand:
                    brands_in_words.append(word)
                elif self.__is_similar_to_any_element(word, self.allbrand) != 0:
                    target = self.__is_similar_to_any_element(word, self.allbrand)
                    similar_to_brands.append(target)
                elif self.__is_similar_to_any_element(word, self.keywords) != 0:
                    target = self.__is_similar_to_any_element(word, self.keywords)
                    similar_to_keywords.append(target)
                elif len(word) > 3 and not word.isnumeric():
                    if not gib_detect_train.avg_transition_prob(word, self.model_mat) > self.threshold:
                        dga_in_words.append(word)  # todo: words that look like keywords should be cleaned out
                    elif len(word) < 7:
                        len_lt_7.append(word)
                    else:
                        len_gt_7.append(word)

            result = {'keywords_in_words': keywords_in_words,
                      'brands_in_words': brands_in_words,
                      'dga_in_words': dga_in_words,
                      'len_lt_7': len_lt_7,
                      'len_gt_7': len_gt_7,
                      'similar_to_brands': similar_to_brands,
                      'similar_to_keywords': similar_to_keywords}
        except Exception:
            self.logger.debug("error while processing " + str(words))
            self.logger.error("Error : {0}".format(format_exc()))

        return result

    def fraud_analysis(self, grouped_words, splitted_words):
        word_list = grouped_words['len_lt_7'] + grouped_words['similar_to_brands'] + \
                    grouped_words['similar_to_keywords'] + splitted_words

        word_list_nlp = grouped_words['len_lt_7'] + grouped_words['similar_to_brands'] + \
                        grouped_words['similar_to_keywords'] + grouped_words['brands_in_words'] + \
                        grouped_words['keywords_in_words'] + grouped_words['dga_in_words'] + splitted_words

        found_keywords = []
        found_brands = []
        similar_to_keyword = []
        similar_to_brand = []
        other_words = []
        target_words = {'brand': [], 'keyword': []}

        try:
            for word in word_list:
                word = re.sub(r"\d+", "", word)

                if word in self.keywords:
                    found_keywords.append(word)
                elif word in self.allbrand:
                    found_brands.append(word)
                else:
                    for brand in self.allbrand:
                        if editdistance.eval(word, brand) < 2:
                            target_words['brand'].append(brand)
                            similar_to_brand.append(word)

                    for keyword in self.keywords:
                        if editdistance.eval(word, keyword) < 2:
                            target_words['keyword'].append(keyword)
                            similar_to_keyword.append(word)

                    if word not in found_keywords + found_brands + similar_to_keyword + similar_to_brand:
                        other_words.append(word)

            result = {'found_keywords': found_keywords,
                      'found_brands': found_brands,
                      'similar_to_keywords': similar_to_keyword,
                      'similar_to_brands': similar_to_brand,
                      'other_words': other_words,
                      'target_words': target_words,
                      'words_nlp': word_list_nlp}
        except Exception:
            self.logger.debug("error while processing " + str(word_list))
            self.logger.error("Error : {0}".format(format_exc()))

        return result

    def evaluate(self, grouped_words, fraud_analyze_result, splitted_words):
        """
        grouped_words        : keywords_in_words, brands_in_words, dga_in_words, len_lt_7, len_gt_7
        fraud_analyze_result : found_keywords, found_brands, similar_to_keyword, similar_to_brand,
                               other_words, target_words
        """
        try:
            words_raw = grouped_words['keywords_in_words'] + grouped_words['brands_in_words'] + \
                        grouped_words['similar_to_brands'] + grouped_words['similar_to_keywords'] + \
                        grouped_words['dga_in_words'] + grouped_words['len_lt_7'] + grouped_words['len_gt_7']

            words_len = []
            compound_word_len = []

            for word in words_raw:
                words_len.append(len(word))

            for word in grouped_words['len_gt_7']:
                compound_word_len.append(len(word))

            all_keywords = grouped_words['keywords_in_words'] + fraud_analyze_result['found_keywords']
            all_brands = grouped_words['brands_in_words'] + fraud_analyze_result['found_brands']
            similar_brands = fraud_analyze_result['similar_to_brands']
            similar_keywords = fraud_analyze_result['similar_to_keywords']

            if len(compound_word_len) == 0:
                av_com = 0
            else:
                av_com = float(np.average(compound_word_len))

            if len(words_len) == 0:
                min_len = 0
                max_len = 0
                av_w = 0
                std = 0
            else:
                min_len = int(np.min(words_len))
                max_len = int(np.max(words_len))
                av_w = float(np.average(words_len))
                std = float(np.std(words_len))

            result = {
                'info': {
                    'keywords': all_keywords,
                    'brands': all_brands,
                    'dga_in_words': grouped_words['dga_in_words'],
                    'similar_to_keywords': similar_keywords,
                    'similar_to_brands': similar_brands,
                    'negligible_words': fraud_analyze_result['other_words'],
                    'target_words': fraud_analyze_result['target_words'],
                    'words_nlp': fraud_analyze_result['words_nlp']
                },
                'features': {
                    'raw_word_count': len(words_len),
                    'splitted_word_count': len(splitted_words),
                    'average_word_length': av_w,
                    'longest_word_length': max_len,
                    'shortest_word_length': min_len,
                    'std_word_length': std,
                    'compound_word_count': len(grouped_words['len_gt_7']),
                    'keyword_count': len(all_keywords),
                    'brand_name_count': len(all_brands),
                    'negligible_word_count': len(fraud_analyze_result['other_words']),
                    'target_brand_count': len(fraud_analyze_result['target_words']['brand']),
                    'target_keyword_count': len(fraud_analyze_result['target_words']['keyword']),
                    'similar_keyword_count': len(similar_keywords),
                    'similar_brand_count': len(similar_brands),
                    'average_compound_words': av_com,
                    'random_words': len(grouped_words['dga_in_words'])
                }
            }
        except Exception:
            self.logger.error("Error : {0}".format(format_exc()))

        return result

    def check_word_random(self, word):
        if gib_detect_train.avg_transition_prob(word, self.model_mat) < self.threshold:
            return 1
        else:
            return 0
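# Minimal usage sketch, not part of the original module: it assumes gib_model.pki is in the
# working directory and the keyword/brand lists are under ../data/. parse() groups the raw
# words, fraud_analysis() looks for brand/keyword look-alikes, evaluate() turns both into
# numeric features.
if __name__ == "__main__":
    nlp = nlp_class()
    grouped = nlp.parse(["secure", "paypa1", "login", "xkqzjw"])
    fraud = nlp.fraud_analysis(grouped, splitted_words=[])
    print(nlp.evaluate(grouped, fraud, splitted_words=[])['features'])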
class Train:

    def __init__(self):
        self.logger = NsLog("log")
        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"

    def txt_to_list(self, txt_object):
        lst = []
        for line in txt_object:
            lst.append(line.strip())
        txt_object.close()
        return lst

    def domain_parser(self, param):
        parsed_domains = []

        for i in range(1, len(param), 2):
            try:
                if param[i + 1] == 'phish' or param[i + 1] == 'legitimate':
                    # dataset = self.txt_to_list(open("{0}{1}".format(self.path_input, param[i]), "r"))  # txt read
                    dataset = json.loads(open("{0}{1}".format(self.path_input, param[i]), "r").read())  # json read
                    parsed_domains = parsed_domains + self.parser_object.parse(
                        dataset, param[i + 1], len(parsed_domains))
                else:
                    self.logger.debug("class labels must be entered as one of (phish, legitimate)")
            except Exception:
                self.logger.error("an error occurred : {0}".format(format_exc()))
                self.logger.debug("an error occurred while the | {0}.txt | file was being processed".format(param))

        self.logger.info("Domain Parse process is done, {0} unique URLs were parsed".format(len(parsed_domains)))

        return parsed_domains

    def json_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"

        file = open(path + file_name, "w")
        file.write(json.dumps(data))
        file.close()
        self.logger.info("{0} was written to file.".format(name))

    def arff_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"

        file = open(path + file_name, "w")
        file.write(data)
        file.close()
        self.logger.info("{0} was written to file.".format(name))
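# Minimal usage sketch, not part of the original module: domain_parser expects an argv-style
# list where items at odd indexes are input file names (under ../input/) and the following
# item is the class label. The file names below are hypothetical.
if __name__ == "__main__":
    trainer = Train()
    parsed = trainer.domain_parser(["train.py", "phish_samples.json", "phish",
                                    "legit_samples.json", "legitimate"])
    trainer.json_to_file("parsed_domains", trainer.path_parsed_domain, parsed)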
class active_rules:

    def __init__(self):
        self.pp = pprint.PrettyPrinter(indent=2)
        self.logger = NsLog("log")

    def goog_safe_browsing(self, domain_features):
        try:
            url_list = []
            updated_domain_features = domain_features

            for sample in domain_features:
                url_list.append(sample['info']['url'])

            sep_list = self.__seperate(url_list, 500)

            # phishing_url_list = self.get_urls(self.google_sb_query(list, sep_list.index(list), len(domain_features)))  # live query
            phishing_url_list = self.get_urls(json.loads(open("constant/gb_phish.json", "r").read()))  # from file

            updated_domain_features = []
            for each in domain_features:
                element = each
                if each['info']['url'] in phishing_url_list:
                    element.update({'active_features': {'google_safe_browsing': 1}})
                else:
                    element.update({'active_features': {'google_safe_browsing': 0}})

                updated_domain_features.append(element)
        except Exception:
            self.logger.error("Error : {0}".format(format_exc()))

        return updated_domain_features

    def __seperate(self, url_list, size):
        sep_urls = []
        k = int((len(url_list) / size) + 1)

        for i in range(1, k + 1):
            if (i * size) > len(url_list):
                sep_urls.append(url_list[(i - 1) * size: len(url_list)])
            else:
                sep_urls.append(url_list[(i - 1) * size: i * size])

        return sep_urls

    def google_sb_query(self, url_list, count, overall_count):
        query_url_list = self.sb_format(url_list)
        sep_list = self.__seperate(query_url_list, 500)
        phish_url_list = []

        for chunk in sep_list:
            time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
            api_key = 'AIzaSyCGmGpCMt-PNQTrWAsp3LqcM_UvCF6NJ1I'
            url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
            payload = {'client': {'clientId': "mycompany", 'clientVersion': "0.1"},
                       'threatInfo': {'threatTypes': ["SOCIAL_ENGINEERING", "MALWARE"],
                                      'platformTypes': ["ANY_PLATFORM"],
                                      'threatEntryTypes': ["URL"],
                                      'threatEntries': chunk}}
            params = {'key': api_key}
            r = requests.post(url, params=params, json=payload).json()

            if 'matches' in r.keys():
                for each in r['matches']:
                    phish_url_list.append(each['threat']['url'])
            elif 'error' in r.keys():
                self.logger.debug("Error in Google-SB query - total samples processed: " +
                                  str(overall_count) + "\nprocessed chunk (500): " + str(count))

        return phish_url_list

    def sb_format(self, url_list):
        sb_query = []
        for url in url_list:
            sb_query.append({'url': url})
        return sb_query

    def get_urls(self, ph_db_json):
        urls = []
        for obj in ph_db_json:
            urls.append(obj['url'])
        return urls

    def txt_to_list(self, txt_object):
        lines = []
        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines
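# Minimal usage sketch, not part of the original module: it assumes "constant/gb_phish.json"
# exists, since goog_safe_browsing currently reads cached Safe Browsing results from that file
# rather than querying the API. The input layout mirrors what the parser/rule pipeline
# produces: each sample carries 'info' -> 'url'. The URLs below are hypothetical.
if __name__ == "__main__":
    samples = [{'info': {'url': 'http://example-login-update.com/acct'}},
               {'info': {'url': 'http://example.org/'}}]
    checker = active_rules()
    print(checker.goog_safe_browsing(samples))  # each sample gains an 'active_features' dict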