Example no. 1
    def __init__(self, algorithm, train_data_name="gsb.arff"):

        self.logger = NsLog("log")

        self.path_output_arff = "../output/arff/"
        self.path_test_output = ""

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.train_data_name = train_data_name
        self.rule_calculation = rule_extraction()

        self.train, self.test, self.train_label, self.test_label = self.split_test_and_train_data()

        self.time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")

        if algorithm == 'NB':
            self.model = self.create_model_NB()
            self.active_model = self.active_create_model_NB()
        elif algorithm == 'RF':
            self.model = self.create_model_RF()
            self.active_model = self.active_create_model_RF()
        elif algorithm == 'GB':
            self.model = self.create_model_GB()
            self.active_model = self.active_create_model_GB()
Example no. 2
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_brand_file = "All_Brand.txt"
        self.dictionary_en = enchant.DictWithPWL(
            "en_US", "{0}{1}".format(self.path_data, self.name_brand_file))
        #self.__file_capitalize(self.path_data, self.name_brand_file)

        self.pp = pprint.PrettyPrinter(indent=4)
Example no. 3
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        self.dictionary_en = enchant.DictWithPWL(
            "en_US", self.path_data + self.name_brand_file)
        #self.__file_capitalize(self.path_data, self.name_brand_file)

        self.pp = pprint.PrettyPrinter(indent=4)
Example no. 4
    def __init__(self):
        self.logger = NsLog("log")

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"
Example no. 5
    def __init__(self):

        print("initializing")

        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        self.path_alexa_files = "../data/alexa-tld/"

        self.nlp_manager = nlp_class()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.word_splitter = WordSplitterClass()

        allbrand_txt = open(self.path_data + self.name_brand_file, "r")
        self.allbrand = self.__txt_to_list(allbrand_txt)
Example no. 6
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_keywords = "keywords.txt"
        self.name_brand_file = "allbrand.txt"
        self.name_random_model = "gib_model.pki"

        model_data = pickle.load(open(self.name_random_model, 'rb'))
        self.model_mat = model_data['mat']
        self.threshold = model_data['thresh']

        self.allbrand = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_brand_file), "r"))
        self.keywords = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_keywords), "r"))
Example no. 7
class rule_extraction:
    def __init__(self):
        self.logger = NsLog("log")
        self.url_rules_o = url_rules()
        self.active_rules_o = active_rules()

    def extraction(self, parsed_domains):

        self.logger.info("rule_extraction.extraction() is running")

        domain_features = []
        try:
            for line in tqdm(parsed_domains):  # self.bar(parsed_domains)

                info = line

                #  info['mail'] = 'to be fetched from whois'

                nlp_info, url_features = self.url_rules_o.rules_main(
                    info['domain'], info['tld'], info['subdomain'],
                    info['path'],
                    info['words_raw'])  # where the URL rules are applied

                info['nlp_info'] = nlp_info
                info['nlp_info']['words_raw'] = info['words_raw']
                info.pop("words_raw", None)

                #  domain_info, dns_features = self.dns_rules_o.rules_main(line_lst)  # dns rules

                outputDict = {}

                #  info['dns_records'] = domain_info

                outputDict['info'] = info
                outputDict['url_features'] = url_features

                #  outputDict['dns_features'] = dns_features

                domain_features.append(outputDict)

            #domain_features = self.active_rules_o.goog_safe_browsing(domain_features)  # where the active rules are applied
        except:
            self.logger.error("Error : {0}".format(format_exc()))

        return domain_features
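
A minimal, hypothetical driver for extraction(), for orientation only: the input dict mirrors the keys produced by domain_parser.parse() in Example no. 9, the sample URL and values are made up, and it assumes the data files needed by url_rules exist.

# Illustrative sketch, not part of the original module.
parsed = [{
    'url': 'http://login.example.com/verify',        # made-up sample
    'domain': 'example', 'registered_domain': 'example.com',
    'tld': 'com', 'subdomain': 'login', 'path': '/verify',
    'words_raw': ['example', 'verify', 'login'],
    'class': 'legitimate', 'id': 0, 'protocol': 'http',
}]

extractor = rule_extraction()
domain_features = extractor.extraction(parsed)
# Each element: {'info': {... original fields plus 'nlp_info' ...},
#                'url_features': {... numeric features from url_rules.rules_main() ...}}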
Example no. 8
    def __init__(self):

        self.logger = NsLog("log")
Example no. 9
class domain_parser(object):
    def __init__(self):

        self.logger = NsLog("log")

    def parse(self, domain_list, class_info, count):
        self.logger.info("domain_parser.parse() is running")

        parsed_domain_list = []
        registered_domain_lst = []

        for line in tqdm(domain_list):

            domain = {}
            line = line.strip().replace('"', "").replace("'", '')
            extracted_domain = tldextract.extract(line)

            registered_domain_lst.append(extracted_domain.registered_domain)

            domain['url'] = line
            domain['domain'] = extracted_domain.domain
            domain['registered_domain'] = extracted_domain.registered_domain
            domain['tld'] = extracted_domain.suffix
            domain['subdomain'] = extracted_domain.subdomain
            domain['class'] = class_info
            domain['id'] = count
            count = count + 1

            if line.find('://') == -1:
                domain['protocol'] = ''
            else:
                domain['protocol'] = line.split("://")[0]

            tmp = line[line.find(extracted_domain.suffix):]  # from the TLD onward; the first "/" after it starts the path
            pth = tmp.partition("/")

            domain['path'] = pth[1] + pth[2]

            domain['words_raw'] = self.words_raw_extraction(
                extracted_domain.domain, extracted_domain.subdomain, pth[2])

            parsed_domain_list.append(domain)

        return parsed_domain_list

    def parse_nonlabeled_samples(self, domain_list, count=0):
        self.logger.info("domain_parser.parse_nonlabeled_samples() is running")
        parsed_domain_list = []
        registered_domain_lst = []

        for line in tqdm(domain_list):
            domain = {}
            line = line.strip().replace('"', "").replace("'", '')

            extracted_domain = tldextract.extract(line)

            registered_domain_lst.append(extracted_domain.registered_domain)

            domain['url'] = line  #.strip()
            domain['domain'] = extracted_domain.domain
            domain['registered_domain'] = extracted_domain.registered_domain
            domain['tld'] = extracted_domain.suffix
            domain['subdomain'] = extracted_domain.subdomain
            domain['id'] = count
            count = count + 1

            if line.find('://') == -1:
                domain['protocol'] = ''
            else:
                domain['protocol'] = line.split("://")[0]

            tmp = line[line.find(extracted_domain.suffix):]  # from the TLD onward; the first "/" after it starts the path
            pth = tmp.partition("/")

            domain['path'] = pth[1] + pth[2]
            # domain['path'].append(pth[1] + pth[2])  # the path could be stored as a list to derive statistics later

            domain['words_raw'] = self.words_raw_extraction(
                extracted_domain.domain, extracted_domain.subdomain, pth[2])

            parsed_domain_list.append(domain)

        return parsed_domain_list

    def words_raw_extraction(self, domain, subdomain, path):

        w_domain = re.split(r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_", domain.lower())
        w_subdomain = re.split(r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_",
                               subdomain.lower())
        w_path = re.split(r"\-|\.|\/|\?|\=|\@|\&|\%|\:|\_", path.lower())

        raw_words = w_domain + w_path + w_subdomain
        #raw_words = list(set(raw_words))
        raw_words = list(filter(None, raw_words))

        return raw_words
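
A short sketch of what this parser produces; the URL is hypothetical, and the token list follows the domain + path + subdomain order coded in words_raw_extraction().

parser = domain_parser()

# Tokens are split on - . / ? = @ & % : _ and empty strings are filtered out.
tokens = parser.words_raw_extraction('paypal', 'login.secure', 'account/update?id=123')
# roughly: ['paypal', 'account', 'update', 'id', '123', 'login', 'secure']

# parse() yields one dict per URL with the keys 'url', 'domain', 'registered_domain',
# 'tld', 'subdomain', 'class', 'id', 'protocol', 'path' and 'words_raw'.
samples = parser.parse(['https://login.secure.paypal.com/account/update?id=123'], 'phish', 0)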
Example no. 10
class WordSplitterClass(object):
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_brand_file = "All_Brand.txt"
        self.dictionary_en = enchant.DictWithPWL(
            "en_US", "{0}{1}".format(self.path_data, self.name_brand_file))
        #self.__file_capitalize(self.path_data, self.name_brand_file)

        self.pp = pprint.PrettyPrinter(indent=4)

    def _split(self, gt7_word_list):

        return_word_list = []

        for word in gt7_word_list:
            try:
                ss = {'raw': word, 'splitted': []}

                # strip any digits from the word
                word = re.sub(r"\d+", "", word)
                sub_words = []

                if not self.dictionary_en.check(word):
                    #  if the word is already in the dictionary, return it unchanged; otherwise move on to splitting.

                    for number in range(
                            len(word), 3, -1
                    ):  # generate candidate sub-words longer than 3 characters
                        for l in range(0, len(word) - number + 1):
                            if self.dictionary_en.check(
                                    self.__capitalize(word[l:l + number])):

                                #  once a word is detected, replace it with '*' characters so it
                                #  does not cause false positives when detecting the other words
                                w = self.__check_last_char(word[l:l + number])
                                sub_words.append(w)
                                word = word.replace(w, "*" * len(w))

                    rest = max(re.split(r"\*+", word), key=len)
                    if len(rest) > 3:
                        sub_words.append(rest)

                    split_w = sub_words

                    for l in split_w:
                        for w in reversed(split_w):
                            """
                            tespit edilen bir kelime daha büyük olan bir kelimenin içerisinde de geçiyorsa o fp dir.
                            Bunları temizledim. Örn.  secure, cure.
                            Cure kelimesi listeden çıkarılır.
                            """

                            if l != w:  # todo: add an edit-distance check
                                if l.find(w) != -1 or l.find(w.lower()) != -1:
                                    sub_words.remove(w)

                    if len(sub_words) == 0:
                        #  if no sub-word was found, the raw word is returned unchanged.
                        sub_words.append(word.lower())
                else:
                    sub_words.append(word.lower())

                ss['splitted'] = sub_words
                return_word_list.append(ss)
            except:
                self.logger.debug("|" + word + "| işlenirken hata")
                self.logger.error(
                    "word_splitter.split()  muhtemelen boş dizi gelme hatası  /  Error : {0}"
                    .format(format_exc()))

        return return_word_list

    def _splitl(self, gt7_word_list):

        result = []

        for val in self._split(gt7_word_list):
            result += val["splitted"]

        return result

    def _splitw(self, word):

        word_l = []
        word_l.append(word)

        result = self._split(word_l)

        return result

    def __check_last_char(self, word):

        confusing_char = ['s', 'y']
        last_char = word[len(word) - 1]
        word_except_last_char = word[0:len(word) - 1]
        if last_char in confusing_char:
            if self.dictionary_en.check(word_except_last_char):
                return word_except_last_char

        return word

    def __clear_fp(self, sub_words):

        length_check = 0
        for w in sub_words:
            if (length_check + len(w)) < self.length + 1:
                length_check = length_check + len(w)
            else:
                sub_words.remove(w)

        sub_words = self.__to_lower(sub_words)
        return sub_words

    def __to_lower(self, sub_words):

        lower_sub_list = []

        for w in sub_words:
            lower_sub_list.append(str(w.lower()))

        return lower_sub_list

    def __capitalize(self, word):
        return word[0].upper() + word[1:]

    def __file_capitalize(self, path, name):
        """
        enchant paketinde özel kelimelerin kontrol edilebilmesi için baş harfinin büyük olması gerekiyor.
        bir kelime kontrol edilmeden önce baş harfi büyük hale gitirilip sonra sözlüğe sorduruyorum.
        Bu nedenle dosyadaki kelimelerin de baş harflerini büyük hale getirip kaydettim ve bu şekilde kullandım.
        :return: 
        """

        personel_dict_txt = open("{0}{1}".format(path, name), "r")

        personel_dict = []

        for word in personel_dict_txt:
            personel_dict.append(self.__capitalize(word.strip()))

        personel_dict_txt.close()

        personel_dict_txt = open("{0}{1}-2".format(path, name), "w")

        for word in personel_dict:
            personel_dict_txt.write(word + "\n")
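
A usage sketch for the splitter, assuming enchant and the All_Brand.txt personal word list resolve at the paths used in __init__; the example word and the split shown are illustrative, since the actual result depends on the dictionary contents.

splitter = WordSplitterClass()

# _splitw() wraps a single word; the result pairs the raw word with its detected parts.
print(splitter._splitw('securepaypal'))
# e.g. [{'raw': 'securepaypal', 'splitted': ['secure', 'paypal']}]

# _splitl() flattens the 'splitted' lists for a batch of long (> 7 character) words.
print(splitter._splitl(['securepaypal', 'accountupdate']))
# e.g. ['secure', 'paypal', 'account', 'update']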
Example no. 11
class machine_learning_algorithm():
    def __init__(self, algorithm, train_data_name="gsb.arff"):

        self.logger = NsLog("log")

        self.path_output_arff = "../output/arff/"
        self.path_test_output = ""

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.train_data_name = train_data_name
        self.rule_calculation = rule_extraction()

        self.time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")

        if algorithm == 'NB':
            self.model = self.create_model_NB()
        elif algorithm == 'RF':
            self.model = self.create_model_RF()

    def __txt_to_list(self, txt_object):

        lst = []

        for line in txt_object:
            lst.append(line.strip())

        txt_object.close()

        return lst

    def preparing_train_data(self, file_name="gsb.arff"):

        train = []
        target = []

        try:
            train_dataset, train_meta = arff.loadarff(
                open("{0}{1}".format(self.path_output_arff, file_name), "r"))

            train = train_dataset[train_meta.names()
                                  [:-1]]  # everything but the last column
            target = train_dataset[train_meta.names()[len(train_meta.names()) -
                                                      1]]  # last column

            train = np.asarray(train.tolist(), dtype=np.float32)  # the actual conversion to a float matrix happens here
        except:
            self.logger.debug("error while training with " + file_name)
            self.logger.error("Error : {0}".format(format_exc()))

        return train, target

    def preparing_test_data(self, test_dataset_list):

        try:
            feat_json = open(
                "../output/test-output/json-" + self.time_now + ".txt", "w")
            feat_arff = open(
                "../output/test-output/arff-" + self.time_now + ".arff", "w")

            "domain_parsed to json without class"
            self.test_parsed_domains = self.parser_object.parse_nonlabeled_samples(
                test_dataset_list)

            "rule calculation for test samples without class information -- output json format"
            test_features = self.rule_calculation.extraction(
                self.test_parsed_domains)

            "test sampleları için oluşturulan json -> arff e dönüştür. Class yok."
            arff_test_str = self.json2arff_object.convert_for_test(
                test_features, '')

            # feat_json.write(json.dumps(test_features))
            feat_arff.write(arff_test_str)

            feat_arff.close()
            feat_json.close()

            arff_raw = StringIO(arff_test_str)

            test_dataset, test_meta = arff.loadarff(arff_raw)

            test = test_dataset[test_meta.names()]
            test = np.asarray(test.tolist(), dtype=np.float32)
        except:
            self.logger.error(
                "Test verisi ayarlanırken hata  /  Error : {0}".format(
                    format_exc()))

        return test, self.test_parsed_domains

    def create_model_NB(self):

        train, target = self.preparing_train_data()
        gnb = GaussianNB()
        model = gnb.fit(train, target)

        return model

    def create_model_RF(self):
        train, target = self.preparing_train_data()
        clf = RandomForestClassifier(n_estimators=10,
                                     random_state=0,
                                     verbose=1)
        model = clf.fit(train, target)

        return model

    def model_run(self, test):

        model = self.create_model_RF()

        model_pre = model.predict(test)
        model_probability = model.predict_proba(test)

        model_pre_list = []
        for p in model_pre:
            model_pre_list.append(str(p).replace("b'", "").replace("'", ""))

        model_probability = model_probability.tolist()

        return model_pre_list, model_probability

    def output(self, test_data):

        test, test_parsed_domains = self.preparing_test_data(test_data)
        model_pre, model_probability = self.model_run(test)

        test_parsed_domain = self.test_parsed_domains
        result_list = []

        for test_domain in test_parsed_domain:
            result = {}
            result['domain'] = test_domain['url']
            result['id'] = test_domain['id']
            result['predicted_class'] = model_pre[test_domain['id']]
            result['probability_phish'] = (
                model_probability[test_domain['id']][1] /
                sum(model_probability[test_domain['id']])) * 100
            result['probability_legitimate'] = (
                model_probability[test_domain['id']][0] /
                sum(model_probability[test_domain['id']])) * 100
            result_list.append(result)

        test_result = open(
            "../output/test-output/result-" + self.time_now + ".txt", "w")
        test_result.write(json.dumps(result_list))
        test_result.close()

        return result_list

    def accuracy(self):
        model = self.model
        test_data, test_label = self.preparing_train_data()
        scores = cross_val_score(model, test_data, test_label, cv=10)
        return scores

    def confusion_matrix(self, name):
        """
        train dataseti gsb.arff model içerisinde bu dataset var.
        confisioun matris çıkarmayı istediğimiz datayı preparing_train_data fonksiyonu ile arff formatı okunur.
        okunan dosya data ve label olarak bölünür.
        data model üzerinde çalıştırılır.
        elde edilen tahmin sonuçlarına ilişkin labellar model_preye atılır.
        
        test_label--bytes array formatında unicode formatına dönüştürülür
        
        ardından confusion matrix çalıştırılır.
        :param name: 
        :return: 
        """

        test, test_label = self.preparing_train_data(file_name=name)
        model_pre, model_pro = self.model_run(test)

        test_label_unicode = []

        for t in test_label:
            test_label_unicode.append(str(t, 'utf-8'))

        return confusion_matrix(test_label_unicode,
                                model_pre,
                                labels=['phish', 'legitimate'])
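
An end-to-end sketch of how this class appears to be used; it assumes ../output/arff/gsb.arff and the ../output/test-output/ directory exist, since __init__ trains from that ARFF file and output() writes its results there, and the test URL is made up.

ml = machine_learning_algorithm('RF')   # or 'NB'

# Classify raw URLs: they are parsed, converted to features/ARFF, then predicted.
results = ml.output(['http://secure-login.example.com/verify'])
# results: [{'domain': ..., 'id': ..., 'predicted_class': 'phish' | 'legitimate',
#            'probability_phish': ..., 'probability_legitimate': ...}, ...]

scores = ml.accuracy()                  # 10-fold cross-validation on the training ARFF
cm = ml.confusion_matrix('gsb.arff')    # rows/columns ordered as ['phish', 'legitimate']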
Example no. 12
class active_rules:
    def __init__(self):
        self.pp = pprint.PrettyPrinter(indent=2)
        self.logger = NsLog("log")

    def goog_safe_browsing(self, domain_features):

        try:

            url_list = []
            updated_domain_features = domain_features
            for sample in domain_features:
                url_list.append(sample['info']['url'])

            sep_list = self.__seperate(url_list, 500)

            #phishing_url_list = self.get_urls(self.google_sb_query(list, sep_list.index(list), len(domain_features)))  # live query

            phishing_url_list = self.get_urls(json.loads(open("constant/gb_phish.json", "r").read()))  # from file

            updated_domain_features = []
            for each in domain_features:
                element = each
                if each['info']['url'] in phishing_url_list:
                    element.update({'active_features': {'google_safe_browsing': 1}})
                else:
                    element.update({'active_features': {'google_safe_browsing': 0}})

                updated_domain_features.append(element)
        except:
            self.logger.error("Error : {0}".format(format_exc()))

        return updated_domain_features

    def __seperate(self, url_list, size):

        sep_urls = []

        k = int((len(url_list)/size)+1)

        for i in range(1, k+1):
            if (i*size) > len(url_list):
                sep_urls.append(url_list[(i - 1) * size : len(url_list)])
            else:
                sep_urls.append(url_list[(i-1)*size: i*size])

        return sep_urls

    def google_sb_query(self, url_list, count, overall_count):

        query_url_list = self.sb_format(url_list)
        sep_list = self.__seperate(query_url_list, 500)

        phish_url_list = []

        for chunk in sep_list:
            time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")

            api_key = 'AIzaSyCGmGpCMt-PNQTrWAsp3LqcM_UvCF6NJ1I'
            url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
            payload = {'client': {'clientId': "mycompany", 'clientVersion': "0.1"},
                       'threatInfo': {'threatTypes': ["SOCIAL_ENGINEERING", "MALWARE"],
                                      'platformTypes': ["ANY_PLATFORM"],
                                      'threatEntryTypes': ["URL"],
                                      'threatEntries': chunk
                                      }}
            params = {'key': api_key}
            r = requests.post(url, params=params, json=payload).json()

            if 'matches' in r.keys():
                for each in r['matches']:
                    phish_url_list.append(each['threat']['url'])
            elif 'error' in r.keys():
                self.logger.debug("Google-SB sorgusunda hata - Toplam işlenen örnek sayısı: "+overall_count+"\nişlenen parça (500): "+count)

        return phish_url_list

    def sb_format(self, url_list):

        sb_query = []
        for url in url_list:
            sb_query.append({'url': url})

        return sb_query

    def get_urls(self, ph_db_json):

        urls = []

        for obj in ph_db_json:
            urls.append(obj['url'])

        return urls

    def txt_to_list(self, txt_object):
        list = []

        for line in txt_object:
            list.append(line.strip())
        txt_object.close()
        return list
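
For reference, a small sketch of the Safe Browsing helpers; the URLs are hypothetical. Note that, as written, goog_safe_browsing() reads its matches from constant/gb_phish.json (the live query is commented out), while google_sb_query() batches the entries in chunks of 500.

rules = active_rules()

urls = ['http://a.example.com', 'http://b.example.com']   # made-up URLs
entries = rules.sb_format(urls)
# [{'url': 'http://a.example.com'}, {'url': 'http://b.example.com'}]

matched = rules.get_urls([{'url': 'http://a.example.com'}])
# ['http://a.example.com']

# goog_safe_browsing() tags every sample with
# {'active_features': {'google_safe_browsing': 1}} if its URL is in the match list, else 0.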
Example no. 13
class url_rules:
    def __init__(self):

        print("initializing")

        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        self.path_alexa_files = "../data/alexa-tld/"

        self.nlp_manager = nlp_class()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.word_splitter = WordSplitterClass()

        allbrand_txt = open(self.path_data + self.name_brand_file, "r")
        self.allbrand = self.__txt_to_list(allbrand_txt)

        #  trie
        #self.trie_alexa_tld = pygtrie.CharTrie(json.loads(open("constant/alexa_tld_json.txt", "r").read()))
        #self.trie_alexa_tldsiz = pygtrie.CharTrie(json.loads(open("constant/alexa_tldsiz_dct.json", "r").read()))

    def __txt_to_list(self, txt_object):

        list = []

        for line in txt_object:
            list.append(line.strip())

        txt_object.close()

        return list

    def rules_main(self, domain, tld, subdomain, path, words_raw):

        features = {}
        info_nlp = {}

        try:
            features.update(self.digit_count(domain, subdomain,
                                             path))  # digitcount
            features.update(self.length(domain, subdomain, path))  # length
            features.update(self.tld_check(tld))  # tld check
            features.update(self.check_rule_5(words_raw))  # www-com
            features.update(self.punny_code(domain))  # punnycode
            features.update(self.random_domain(domain))  # random_domain
            features.update(self.subdomain_count(subdomain))  # subdomain count
            features.update(self.char_repeat(words_raw))  # char_repeat
            # features.update(self.alexa_check(domain, tld))                         # alexa1m  check
            #features.update(self.alexa_trie(domain, tld))                         # alexa1m check trie
            features.update(self.special_chars(domain, subdomain,
                                               path))  # - . / @
            features.update(self.check_domain_in_list(domain))

            result_nlp = self.nlp_features(words_raw)
            features.update(result_nlp['features'])  # words_info

            info_nlp = result_nlp['info']

        except:
            self.logger.error("url_rules.main() Error : {0}".format(
                format_exc()))

        return info_nlp, features

    def digit_count(self, domain, subdomain, path):

        result = {
            'domain_digit_count': 0,
            'subdomain_digit_count': 0,
            'path_digit_count': 0
        }

        for letter in domain:
            if letter.isdigit():
                result['domain_digit_count'] = result['domain_digit_count'] + 1

        for letter in subdomain:
            if letter.isdigit():
                result['subdomain_digit_count'] = result[
                    'subdomain_digit_count'] + 1

        for letter in path:
            if letter.isdigit():
                result['path_digit_count'] = result['path_digit_count'] + 1

        return result

    def length(self, domain, subdomain, path):

        domain_uzunluk = len(domain)
        subdomain_uzunluk = len(subdomain)
        path_uzunluk = len(path)

        result = {}

        result['domain_length'] = domain_uzunluk
        result['subdomain_length'] = subdomain_uzunluk
        result['path_length'] = path_uzunluk

        return result

    def tld_check(self, tld):

        common_tld = ["com", "org", "net", "de", "edu", "gov"]

        result = {}

        if tld in common_tld:
            result["isKnownTld"] = 1
        else:
            result["isKnownTld"] = 0

        return result

    def check_rule_5(self, words_raw):

        result = {'www': 0, "com": 0}

        for word in words_raw:
            if not word.find('www') == -1:
                result['www'] = result['www'] + 1

            if not word.find('com') == -1:
                result['com'] = result['com'] + 1

        return result

    def punny_code(self, line):

        result = {}

        if line.startswith("xn--"):

            result['punnyCode'] = 1
            return result

        else:
            result['punnyCode'] = 0
            return result

    def random_domain(self, domain):

        result = {'random_domain': self.nlp_manager.check_word_random(domain)}

        return result

    def subdomain_count(self, line):

        sub = line.split(".")

        result = {}
        result['subDomainCount'] = len(sub)

        return result

    def __all_same(self, items):
        return all(x == items[0] for x in items)

    def char_repeat(self, words_raw):

        result = {'char_repeat': 0}
        repeat = {'2': 0, '3': 0, '4': 0, '5': 0}
        part = [2, 3, 4, 5]

        "sliding window mantigi repeat sayisi kadar eleman al" \
        "hepsi ayni mi diye bak - ayni ise artir"

        for word in words_raw:
            for char_repeat_count in part:
                for i in range(len(word) - char_repeat_count + 1):
                    sub_word = word[i:i + char_repeat_count]
                    if self.__all_same(sub_word):
                        repeat[str(char_repeat_count
                                   )] = repeat[str(char_repeat_count)] + 1

        result['char_repeat'] = sum(list(repeat.values()))

        return result

    def alexa_check(self, domain, tld):

        is_find_tld = 0
        is_find = 0
        line = domain + "." + tld

        letter = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "r", "s", "t", "u", "v", "y", "z", "w", "x", "q",
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"
        ]

        try:
            if line[0] in letter:
                alexa_txt = open(
                    "{0}{1}.txt".format(self.path_alexa_files, line[0]), "r")
                alexaList_tld = []  # with TLD
                alexa_list = []  # without TLD

                for alexa_line in alexa_txt:
                    alexaList_tld.append(alexa_line.strip())
                    alexa_list.append(alexa_line.strip().split(".")[0])
                alexa_txt.close()

                for alexa_line in alexaList_tld:
                    if line.strip() == alexa_line.strip():
                        is_find_tld = 1
                        break

                for alexa_line in alexa_list:
                    line_domain = line.split(".")[0]
                    if line_domain.strip() == alexa_line.strip():
                        is_find = 1
                        break
        except:
            self.logger.debug("error while processing " + line + " due to its length")
            self.logger.error(
                "url_rules.check_rule_11()-Alexa  /  Error : {0}".format(
                    format_exc()))

        result = {}

        if is_find_tld == 1:
            result['alexa1m_tld'] = 1
        else:
            result['alexa1m_tld'] = 0

        if is_find == 1:
            result['alexa1m'] = 1
        else:
            result['alexa1m'] = 0

        return result

    def alexa_trie(self, domain, tld):

        line = domain + "." + tld

        result = {}

        try:
            #if self.alexa1mm[line[0].lower()].has_key(line):
            if self.trie_alexa_tld.has_key(line):
                result['alexa1m_tld_trie'] = 1
            else:
                result['alexa1m_tld_trie'] = 0

            if self.trie_alexa_tldsiz.has_key(domain):
                result['alexa1m_tldsiz_trie'] = 1
            else:
                result['alexa1m_tldsiz_trie'] = 0
        except:
            self.logger.debug("alexa error while processing " + line)
            self.logger.error(
                "url_rules.check_rule_11()-Alexa  /  Error : {0}".format(
                    format_exc()))

        return result

    def special_chars(self, domain, subdomain, path):

        special_char = {
            '-': 0,
            ".": 0,
            "/": 0,
            '@': 0,
            '?': 0,
            '&': 0,
            '=': 0,
            "_": 0
        }
        special_char_letter = special_char.keys()

        for l in domain:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        for l in subdomain:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        for l in path:
            if l in special_char_letter:
                special_char[l] = special_char[l] + 1

        return special_char

    def check_domain_in_list(self, domain):

        result = {}
        if domain in self.allbrand:
            result['domain_in_brand_list'] = 1
        else:
            result['domain_in_brand_list'] = 0

        return result

    def nlp_features(self, words_raw):
        """
        keywords_in_words, brands_in_words,
        dga_in_words, len_lt_7, len_gt_7 
        """
        grouped_words = self.nlp_manager.parse(words_raw)
        splitted_words = self.word_splitter._splitl(grouped_words['len_gt_7'])
        """
        found_keywords, found_brands,
        similar_to_keyword, similar_to_brand,
        other_words, target_words
        """

        fraud_analyze_result = self.nlp_manager.fraud_analysis(
            grouped_words, splitted_words)

        result = self.nlp_manager.evaluate(grouped_words, fraud_analyze_result,
                                           splitted_words)
        split = {'raw': grouped_words['len_gt_7'], 'splitted': splitted_words}
        result['info']['compoun_words'] = split

        return result
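
A sketch of calling rules_main() directly, assuming data/allbrand.txt and the NLP model files load in __init__; the URL parts are made up and only some of the returned feature keys are listed.

rules = url_rules()

nlp_info, features = rules.rules_main(
    domain='secure-paypal',                 # parts as produced by domain_parser
    tld='com',
    subdomain='login',
    path='/account/update',
    words_raw=['secure', 'paypal', 'login', 'account', 'update'])

# features contains numeric values such as 'domain_digit_count', 'domain_length',
# 'isKnownTld', 'punnyCode', 'random_domain', 'subDomainCount', 'char_repeat',
# the special-character counts and 'domain_in_brand_list', plus the word statistics
# from nlp_features(); nlp_info carries the keyword/brand breakdown from nlp_class.evaluate().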
Example no. 14
class nlp_class:
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_keywords = "keywords.txt"
        self.name_brand_file = "allbrand.txt"
        self.name_random_model = "gib_model.pki"

        model_data = pickle.load(open(self.name_random_model, 'rb'))
        self.model_mat = model_data['mat']
        self.threshold = model_data['thresh']

        self.allbrand = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_brand_file), "r"))
        self.keywords = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_keywords), "r"))

    def __txt_to_list(self, txt_object):
        list = []

        for line in txt_object:
            list.append(line.strip())
        txt_object.close()
        return list

    def __is_similar_to_any_element(self, word, list):

        target = ''
        for l in list:
            if editdistance.eval(word, l) < 2:
                target = l

        if len(target) > 0:
            return word
        else:
            return 0

    def parse(self, words):

        keywords_in_words = []
        brands_in_words = []
        similar_to_brands = []
        similar_to_keywords = []
        dga_in_words = []
        len_gt_7 = []
        len_lt_7 = []
        try:
            for word in words:

                word = re.sub(r"\d+", "", word)

                if word in self.keywords:
                    keywords_in_words.append(word)

                elif word in self.allbrand:
                    brands_in_words.append(word)

                elif self.__is_similar_to_any_element(word,
                                                      self.allbrand) != 0:
                    target = self.__is_similar_to_any_element(
                        word, self.allbrand)
                    similar_to_brands.append(target)

                elif self.__is_similar_to_any_element(word,
                                                      self.keywords) != 0:
                    target = self.__is_similar_to_any_element(
                        word, self.keywords)
                    similar_to_keywords.append(target)

                elif len(word) > 3 and not word.isnumeric():

                    if not (gib_detect_train.avg_transition_prob(
                            word, self.model_mat) > self.threshold):
                        dga_in_words.append(word)
                        # todo: words similar to keywords should be filtered out here
                    elif len(word) < 7:
                        len_lt_7.append(word)
                    else:
                        len_gt_7.append(word)

            result = {
                'keywords_in_words': keywords_in_words,
                'brands_in_words': brands_in_words,
                'dga_in_words': dga_in_words,
                'len_lt_7': len_lt_7,
                'len_gt_7': len_gt_7,
                'similar_to_brands': similar_to_brands,
                'similar_to_keywords': similar_to_keywords
            }
        except:
            self.logger.debug("error while processing " + str(words))
            self.logger.error("Error : {0}".format(format_exc()))

        return result

    def fraud_analysis(self, grouped_words, splitted_words):

        word_list = grouped_words['len_lt_7'] + grouped_words[
            'similar_to_brands'] + grouped_words[
                'similar_to_keywords'] + splitted_words

        word_list_nlp = grouped_words['len_lt_7'] + grouped_words['similar_to_brands'] + \
                        grouped_words['similar_to_keywords'] + grouped_words['brands_in_words'] + \
                        grouped_words['keywords_in_words'] + grouped_words['dga_in_words'] + splitted_words

        found_keywords = []
        found_brands = []
        similar_to_keyword = []
        similar_to_brand = []
        other_words = []
        target_words = {'brand': [], 'keyword': []}
        try:
            for word in word_list:

                word = re.sub(r"\d+", "", word)

                if word in self.keywords:
                    found_keywords.append(word)
                elif word in self.allbrand:
                    found_brands.append(word)
                else:

                    for brand in self.allbrand:
                        if editdistance.eval(word, brand) < 2:
                            target_words['brand'].append(brand)
                            similar_to_brand.append(word)

                    for keyword in self.keywords:
                        if editdistance.eval(word, keyword) < 2:
                            target_words['keyword'].append(keyword)
                            similar_to_keyword.append(word)

                if word not in found_keywords + found_brands + similar_to_keyword + similar_to_brand:
                    other_words.append(word)

            result = {
                'found_keywords': found_keywords,
                'found_brands': found_brands,
                'similar_to_keywords': similar_to_keyword,
                'similar_to_brands': similar_to_brand,
                'other_words': other_words,
                'target_words': target_words,
                'words_nlp': word_list_nlp
            }
        except:
            self.logger.debug("error while processing " + str(word_list))
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def evaluate(self, grouped_words, fraud_analyze_result, splitted_words):
        """
        grouped_words
        keywords_in_words, brands_in_words,
        dga_in_words, len_lt_7, len_gt_7 

        fraud_analyze_result
        found_keywords, found_brands,
        similar_to_keyword, similar_to_brand,
        other_words, target_words 
        """
        try:
            words_raw = grouped_words['keywords_in_words'] + grouped_words['brands_in_words'] + \
                        grouped_words['similar_to_brands'] + grouped_words['similar_to_keywords'] + \
                        grouped_words['dga_in_words'] + grouped_words['len_lt_7'] + grouped_words['len_gt_7']

            words_len = []
            compound_word_len = []

            for word in words_raw:
                words_len.append(len(word))

            for word in grouped_words['len_gt_7']:
                compound_word_len.append(len(word))

            all_keywords = grouped_words[
                'keywords_in_words'] + fraud_analyze_result['found_keywords']
            all_brands = grouped_words[
                'brands_in_words'] + fraud_analyze_result['found_brands']
            similar_brands = fraud_analyze_result['similar_to_brands']
            similar_keywords = fraud_analyze_result['similar_to_keywords']

            if len(compound_word_len) == 0:
                av_com = 0
            else:
                av_com = float(np.average(compound_word_len))

            if len(words_len) == 0:
                min = 0
                max = 0
                av_w = 0
                std = 0
            else:
                min = int(np.min(words_len))
                max = int(np.max(words_len))
                av_w = float(np.average(words_len))
                std = float(np.std(words_len))

            result = {
                'info': {
                    'keywords': all_keywords,
                    'brands': all_brands,
                    'dga_in_words': grouped_words['dga_in_words'],
                    'similar_to_keywords': similar_keywords,
                    'similar_to_brands': similar_brands,
                    'negligible_words': fraud_analyze_result['other_words'],
                    'target_words': fraud_analyze_result['target_words'],
                    'words_nlp': fraud_analyze_result['words_nlp']
                },
                'features': {
                    'raw_word_count':
                    len(words_len),
                    'splitted_word_count':
                    len(splitted_words),
                    'average_word_length':
                    av_w,
                    'longest_word_length':
                    max,
                    'shortest_word_length':
                    min,
                    'std_word_length':
                    std,
                    'compound_word_count':
                    len(grouped_words['len_gt_7']),
                    'keyword_count':
                    len(all_keywords),
                    'brand_name_count':
                    len(all_brands),
                    'negligible_word_count':
                    len(fraud_analyze_result['other_words']),
                    'target_brand_count':
                    len(fraud_analyze_result['target_words']['brand']),
                    'target_keyword_count':
                    len(fraud_analyze_result['target_words']['keyword']),
                    'similar_keyword_count':
                    len(similar_keywords),
                    'similar_brand_count':
                    len(similar_brands),
                    'average_compound_words':
                    av_com,
                    'random_words':
                    len(grouped_words['dga_in_words'])
                }
            }
        except:
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def check_word_random(self, word):

        if gib_detect_train.avg_transition_prob(
                word, self.model_mat) < self.threshold:
            return 1
        else:
            return 0
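
A short sketch of the gibberish check and the word grouping, assuming gib_model.pki is readable from the working directory and keywords.txt / allbrand.txt exist under ../data/; the example words are illustrative and the exact grouping depends on those files.

nlp = nlp_class()

# check_word_random() returns 1 when the average transition probability from
# gib_detect_train falls below the stored threshold (i.e. the word looks random), else 0.
print(nlp.check_word_random('xkqzjwtr'))   # e.g. 1
print(nlp.check_word_random('account'))    # e.g. 0

# parse() buckets raw tokens into keywords, brands, near-misses (edit distance < 2),
# gibberish words and short/long leftovers ('len_lt_7' / 'len_gt_7').
groups = nlp.parse(['paypal', 'secure', 'xkqzjwtr', 'verification'])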
Example no. 15
class Train:
    def __init__(self):
        self.logger = NsLog("log")

        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"

    def txt_to_list(self, txt_object):

        lst = []

        for line in txt_object:
            lst.append(line.strip())

        txt_object.close()

        return lst

    def domain_parser(self, param):

        parsed_domains = []

        for i in range(1, len(param), 2):
            try:
                if param[i + 1] == 'phish' or param[i + 1] == 'legitimate':

                    #dataset = self.txt_to_list(open("{0}{1}".format(self.path_input, param[i]), "r"))  # txt read
                    dataset = json.loads(
                        open("{0}{1}".format(self.path_input, param[i]),
                             "r").read())  # json read

                    parsed_domains = parsed_domains + self.parser_object.parse(
                        dataset, param[i + 1], len(parsed_domains))

                else:
                    self.logger.debug(
                        "class labels must be entered one of (phish, legitimate)"
                    )

            except:
                self.logger.error("an error is occurred  : {0}".format(
                    format_exc()))
                self.logger.debug(
                    "an error occurred when | {0}.txt | file was processing".
                    format(param))

        self.logger.info(
            "Domain Parse process is done {0} unique URLs are parsed".format(
                len(parsed_domains)))

        return parsed_domains

    def json_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(json.dumps(data))
        file.close()
        self.logger.info("{0} Dosyaya Yazıldı.".format(name))

    def arff_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(data)
        file.close()
        self.logger.info("{0} Dosyaya Yazıldı.".format(name))
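
A sketch of the argument list domain_parser() expects: a sys.argv-style list where index 0 is ignored and the rest alternates input file name and class label, with files read from ../input/ as JSON arrays of URLs. The file names below are hypothetical.

trainer = Train()

param = ['train.py', 'phish_urls.json', 'phish', 'legit_urls.json', 'legitimate']  # made-up files
parsed = trainer.domain_parser(param)

# Persist intermediate results under ../output/ with a timestamped file name.
trainer.json_to_file('parsed_domains', trainer.path_parsed_domain, parsed)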
Example no. 16
    def __init__(self):
        self.logger = NsLog("log")
        self.url_rules_o = url_rules()
        self.active_rules_o = active_rules()
Example no. 17
class json2arff:
    def __init__(self):
        self.logger = NsLog("log")

    def convert_for_train(self, features, param):

        # arff convert header
        try:
            ArffStr = '''@relation weka-test\n\n'''

            features_keys_url = list(features[0]['url_features'].keys())
            features_keys_active = []

            if param == '-a':
                features_keys_active = list(
                    features[0]['active_features'].keys())

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            if param == '-a':
                for line in features_keys_active:
                    ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            ArffStr = ArffStr + '@attribute class {phish, legitimate}' + "\n\n@data\n"
        except:
            self.logger.debug("Hata - Json_to_arff e gelen sample sayısı" +
                              str(len(features)) + "\nurl_feature_keys: " +
                              str(features_keys_url) +
                              "\nactive_features_key: " +
                              str(features_keys_active))
            self.logger.error("Error Arff Header : {0}".format(format_exc()))
        # end of header

        for each_domain in features:
            try:
                tmp = ""

                for key in features_keys_url:
                    tmp = tmp + str(each_domain['url_features'][key]) + ","

                if param == '-a':
                    for key_a in features_keys_active:
                        tmp = tmp + str(
                            each_domain['active_features'][key_a]) + ","

                tmp = tmp + each_domain['info']['class'] + "\n"
                ArffStr = ArffStr + tmp
            except:
                self.logger.debug("Arffe çevrilen sample da hata :\n" +
                                  str(each_domain))
                self.logger.error("Error Arff Body : {0}".format(format_exc()))

        return ArffStr

    def convert_for_test(self, features, param):

        # todo: update according to the active rules

        # arff convert header

        ArffStr = '''@relation weka-test\n\n'''

        features_keys_url = features[0]['url_features'].keys()

        if param == '-dns':
            features_keys_dns = features[0]['dns_features'].keys()

        for line in features_keys_url:
            ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

        if param == '-dns':
            for key in features_keys_dns:
                ArffStr = ArffStr + '@attribute ' + key + " numeric\n"

        ArffStr = ArffStr + "\n@data\n"

        # end of header

        for each_domain in features:
            tmp = ""

            for key in features_keys_url:
                tmp = tmp + str(each_domain['url_features'][key]) + ","

            if param == '-dns':
                for key_dns in features_keys_dns:
                    tmp = tmp + str(each_domain['dns_features'][key_dns]) + ","

            tmp = tmp[0:len(tmp) - 1] + "\n"
            ArffStr = ArffStr + tmp

        return ArffStr

    def convert_for_NLP_without_features(self, features):
        # arff convert header
        try:
            ArffStr = '''@relation weka-test\n\n'''

            ArffStr += '@attribute words string\n'

            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += "'"
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "
                ArffStr = ArffStr.strip() + "',{0}\n".format(
                    sample['info']['class'])

        except:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr

    def convert_for_NLP_with_features(self, features):
        # arff convert header
        try:
            features_keys_url = list(features[0]['url_features'].keys())

            ArffStr = '''@relation weka-test\n\n'''

            ArffStr += '@attribute words string\n'

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"

            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += '"'
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "

                ArffStr = ArffStr.strip() + '",'

                for key in features_keys_url:
                    ArffStr += str(sample['url_features'][key]) + ","
                ArffStr = ArffStr.strip() + '{0}\n'.format(
                    sample['info']['class'])

        except:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr
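
For orientation, the ARFF text assembled by convert_for_train() looks roughly like the snippet below; the attribute names and values are placeholders, while the @relation line, the class attribute and the @data layout follow the code above.

converter = json2arff()

# 'features' would normally come from rule_extraction.extraction(); a single minimal sample:
features = [{'info': {'class': 'phish'},
             'url_features': {'domain_length': 11, 'isKnownTld': 1}}]

arff_str = converter.convert_for_train(features, '')
# @relation weka-test
#
# @attribute domain_length numeric
# @attribute isKnownTld numeric
# @attribute class {phish, legitimate}
#
# @data
# 11,1,phish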
Example no. 18
    def __init__(self):
        self.pp = pprint.PrettyPrinter(indent=2)
        self.logger = NsLog("log")