def test_levenstein_check_no_match(self):
    """Check the result of ``levenstein_check`` when nothing should match.

    One random keyword is generated and compared against a domain built
    from a second, independently generated word followed by ``x.com``.
    The result is expected to be a tuple whose first field is falsy and
    whose remaining three fields are ``None``.
    """
    faker = Faker()

    # Arrange: one keyword, plus a domain built from a different random word.
    keywords = [faker.domain_word()]
    info("Generated keyword: {}".format(keywords))
    domain = "{}x.com".format(faker.domain_word())
    info("Requested domain - {}".format(domain))

    # Act
    result = levenstein_check(keywords, domain.split('.'))

    # Assert: the shape first, then every field in positional order.
    assert_type(result, tuple, "Check if proper tuple is returned")
    assert_false(result[0], "Check if keyword matches domain")
    empty_fields = (
        (1, "Check if no keyword matches domain"),
        (2, "Check if no keyword is returned"),
        (3, "Check if no levenstein distance is returned"),
    )
    for position, message in empty_fields:
        assert_none(result[position], message)
def test_levenstein_check(self):
    """Check the result of ``levenstein_check`` for a near-miss domain.

    Two random keywords are generated; the first one is embedded in a
    domain three times: followed by ``x``, on its own, and followed by a
    long suffix. The check is expected to report a match, a match count
    of 1, the first keyword as the matched keyword, and a distance of 1.
    """
    faker = Faker()

    # Arrange: two keywords, only the first of which appears in the domain.
    keywords = [faker.domain_word() for _ in range(2)]
    info("Generated keyword: {}".format(keywords))
    expected_keyword = keywords[0]
    domain = "{}x.{}.{}awdawdawdfawytdawdrawd.com".format(
        expected_keyword, expected_keyword, expected_keyword)
    info("Requested domain - {}".format(domain))

    # Act
    result = levenstein_check(keywords, domain.split('.'))

    # Assert: the shape first, then each field in positional order.
    assert_type(result, tuple, "Check if proper tuple is returned")
    assert_true(result[0], "Check if keyword matches domain")
    assert_equal(result[1], 1,
                 "Check if keyword matches domain only one time")
    assert_equal(result[2], expected_keyword,
                 "Check if proper keyword is returned")
    assert_equal(result[3], 1,
                 "Check if proper levenstein distance is returned")
def get_levenstein_details(url_body):
    """Build the Levenshtein-details response for the URL in *url_body*.

    The body is validated against ``details_url_schema``; its ``'url'``
    entry is converted to a domain, split on ``'.'`` and passed to
    ``levenstein.levenstein_check`` together with every ``good_keyword``
    from ``Goodies.get_all_goodies()``.

    Returns:
        ``_no_data_response()`` when the check yields no keyword,
        otherwise a 200 ``application/json`` ``Response`` carrying the
        matched keyword and its distance under ``"details"``.

    Raises:
        BadRequest: if *url_body* fails schema validation; the message
            is taken from the validation error.
    """
    try:
        jsonschema.validate(url_body, details_url_schema)
    except jsonschema.exceptions.ValidationError as exc:
        raise BadRequest(exc.message)

    domain = url_to_domain(url_body.get('url'))
    good_keywords = [
        goodie['good_keyword'] for goodie in Goodies.get_all_goodies()
    ]
    check_result = levenstein.levenstein_check(
        good_keywords, domain.split('.'))

    # Only the last two of the four fields are reported to the client.
    _, _, matched_keyword, distance = check_result
    if not matched_keyword:
        return _no_data_response()

    details = {
        "matched_keyword": matched_keyword,
        "levenstein_distance": distance,
    }
    payload = json.dumps({"details": details}, default=_default_json_model)
    return Response(payload, 200, mimetype="application/json")
def verify_levenstein(domain):
    """Return the verdict of the Levenshtein check for *domain*.

    This wrapper only prepares the inputs and forwards them:

    * the good keywords, e.g. ``['facebook', 'google', 'onet']``, read
      from ``Goodies.get_all_goodies()``;
    * the domain split on ``'.'``, e.g. ``['weka', 'pwr', 'edu', 'pl']``.

    The matching itself happens in ``levenstein_check``. The rules below
    are carried over from this function's earlier documentation and are
    not verifiable from this wrapper alone:

    * a per-keyword hit counter is kept, e.g. ``{'keyword': 0}``;
    * a keyword is not compared when it is the same as the domain;
    * a hit is counted when the keyword is shorter than twice the length
      of the domain phrase, the phrase is longer than 2 characters (to
      exclude e.g. ``'wp'`` or ``'pl'``) and the distance is more than 0
      and less than 3;
    * the domain is treated as malicious when the number of hits for a
      phrase is less than half of the number of phrases.

    Returns:
        The first field of the tuple returned by ``levenstein_check``.
    """
    good_keywords = [
        goodie['good_keyword'] for goodie in Goodies.get_all_goodies()
    ]
    phrases = domain.split('.')

    # Only the verdict is needed; the other three fields are discarded.
    is_match, _, _, _ = levenstein_check(good_keywords, phrases)
    return is_match
def create_baddie(domain):
    """Gather the indicators for *domain* and persist them via ``add_baddie``.

    Steps, in order:

    1. Register the domain's IP and certificate (``add_ip``, ``add_cert``).
    2. Run ``lev.levenstein_check`` against every ``good_keyword`` from
       ``Goodies.get_all_goodies()`` and, when a keyword matched, work out
       how close the nearest domain phrase is to it.
    3. Look for a contained keyword with ``match_keyword``.
    4. Compute the entropy of the domain with ``ent.get_entropy``.

    Missing keywords are stored as ``''`` and a missing distance as ``0``.

    Returns:
        Whatever ``add_baddie`` returns for the collected values.
    """
    _, ip_id = add_ip(domain)
    _, crt_id = add_cert(domain)

    good_keywords = [k['good_keyword'] for k in Goodies.get_all_goodies()]
    domain_phrases = domain.split('.')

    # Take the matched keyword by position instead of unpacking a fixed
    # number of names: the keyword is the third field, and unpacking into
    # three names raises ValueError as soon as the check returns a fourth
    # field (the distance).
    check_result = lev.levenstein_check(good_keywords, domain_phrases)
    lev_matched_keyword = check_result[2]

    lev_distance = 0
    if lev_matched_keyword:
        # Previously the running minimum started at 0 and therefore never
        # changed, and the value stored was simply the distance to the
        # *last* phrase (usually the TLD). Store the closest near miss
        # instead. The (0, 3) window keeps the original "3 >" bound and
        # skips exact matches, which would be indistinguishable from the
        # "no match" default of 0.
        # NOTE(review): confirm this window matches the rule applied
        # inside levenstein_check.
        near_misses = [
            distance
            for distance in (
                lev.calculate_levenstein(lev_matched_keyword, phrase)
                for phrase in domain_phrases
            )
            if 0 < distance < 3
        ]
        if near_misses:
            lev_distance = min(near_misses)
    else:
        lev_matched_keyword = ''

    _, contained_matched_keyword = match_keyword(domain)
    if not contained_matched_keyword:
        contained_matched_keyword = ''

    entropy = ent.get_entropy(domain)

    # NOTE(review): ip_id / crt_id are indexed with [1] exactly as before;
    # presumably add_ip / add_cert return a nested pair - verify.
    return add_baddie(domain, ip_id[1], crt_id[1], lev_distance,
                      lev_matched_keyword, contained_matched_keyword,
                      entropy)