def node(self, data, tags):
    """Check every tag of an object for unexpected characters.

    For each ``key``/``value`` pair the checks run in this order:

    1. non printable characters (``self.non_printable``) in the key, then
       in the value -- class 50702, subclass 0 / 1;
    2. symbol characters (``self.other_symbol``) in the key, then in the
       value -- class 50703, subclass 0 / 1;
    3. bidirectional-text control characters anywhere in the key or the
       value -- class 50702, subclass 2;
    4. characters left over in name-like tags once everything allowed by
       ``self.default`` (plain name keys) or by ``self.lang[<code>]``
       (``<name>:<code>`` keys) has been stripped -- class 50701,
       subclass 0 / 1.

    A hit in step 1 or 2 skips all remaining checks for that tag.

    Args:
        data: unused by this method.
        tags: mapping of tag key to tag value (both text).

    Returns:
        list of dict: one entry per problem found, each with ``"class"``,
        ``"subclass"`` and ``"text"``, plus a ``"fix"`` mapping when a
        replacement character could be suggested.
    """
    err = []
    for key, value in tags.items():
        m = self.non_printable.search(key)
        if m:
            err.append({"class": 50702, "subclass": 0, "text": T_f(u"\"{0}\" unexpected non printable char ({1}, 0x{2:04x}) in key at position {3}", key, unicodedata.name(m.group(0), ''), ord(m.group(0)), m.start() + 1)})
            continue

        m = self.non_printable.search(value)
        if m:
            err.append({"class": 50702, "subclass": 1, "text": T_f(u"\"{0}\"=\"{1}\" unexpected non printable char ({2}, 0x{3:04x}) in value at position {4}", key, value, unicodedata.name(m.group(0), ''), ord(m.group(0)), m.start() + 1)})
            continue

        m = self.other_symbol.search(key)
        if m:
            err.append({"class": 50703, "subclass": 0, "text": T_f(u"\"{0}\" unexpected symbol char ({1}, 0x{2:04x}) in key at position {3}", key, unicodedata.name(m.group(0), ''), ord(m.group(0)), m.start() + 1)})
            continue

        m = self.other_symbol.search(value)
        if m:
            err.append({"class": 50703, "subclass": 1, "text": T_f(u"\"{0}\"=\"{1}\" unexpected symbol char ({2}, 0x{3:04x}) in value at position {4}", key, value, unicodedata.name(m.group(0), ''), ord(m.group(0)), m.start() + 1)})
            continue

        # https://en.wikipedia.org/wiki/Bi-directional_text#Table_of_possible_BiDi-types
        for c in u"\u200E\u200F\u061C\u202A\u202D\u202B\u202E\u202C\u2066\u2067\u2068\u2069":
            # str.find() returns -1 when the character is absent and 0 when
            # it is the very first character, so the test must be ">= 0";
            # "> 0" silently ignored a control character at position 0.
            pos = key.find(c)
            if pos >= 0:
                err.append({"class": 50702, "subclass": 2, "text": T_f(u"\"{0}\" unexpected non printable char ({1}, 0x{2:04x}) in key at position {3}", key, unicodedata.name(c, ''), ord(c), pos + 1)})

            pos = value.find(c)
            if pos >= 0:
                err.append({"class": 50702, "subclass": 2, "text": T_f(u"\"{0}\"=\"{1}\" unexpected non printable char ({2}, 0x{3:04x}) in value at position {4}", key, value, unicodedata.name(c, ''), ord(c), pos + 1)})

        if self.default:
            if key in self.names:
                # Remove everything the helper patterns and the default
                # pattern match; whatever is left is unexpected.
                s = self.non_letter.sub(u" ", value)
                s = self.alone_char.sub(u"", s)
                s = self.roman_number.sub(u"", s)
                s = self.default.sub(u"", s)
                # Only report when the leftover is small compared to the
                # whole value, and never for a two-character value with a
                # single leftover character.
                if len(s) > 0 and not (len(value) == 2 and len(s) == 1) and len(s) <= len(value) / 10 + 1:
                    if len(s) == 1:
                        c = s[0]
                        # Suggest a replacement only when a single target
                        # script is configured.
                        u = self.uniq_script and confusables.unconfuse(c, self.uniq_script)
                        if u:
                            err.append({"class": 50701, "subclass": 0, "text": T_f(u"\"{0}\"=\"{1}\" unexpected char \"{2}\" ({3}, 0x{4:04x}). Means \"{5}\" ({6}, 0x{7:04x})?", key, value, s, unicodedata.name(c, ''), ord(c), u, unicodedata.name(u, ''), ord(u)),
                                "fix": {key: value.replace(c, u)}
                            })
                        else:
                            err.append({"class": 50701, "subclass": 0, "text": T_f(u"\"{0}\"=\"{1}\" unexpected char \"{2}\" ({3}, 0x{4:04x})", key, value, s, unicodedata.name(c, ''), ord(c))
                            })
                    else:
                        err.append({"class": 50701, "subclass": 0, "text": T_f(u"\"{0}\"=\"{1}\" unexpected \"{2}\"", key, value, s)})

        # Keys of the form "<name>:<code>" are checked against the pattern
        # registered for <code> in self.lang.
        parts = key.split(':')
        if len(parts) > 1 and parts[0] in self.names and parts[1] in self.lang:
            s = self.non_letter.sub(u" ", value)
            s = self.alone_char.sub(u"\\1", s)
            s = self.roman_number.sub(u"\\1", s)
            s = self.lang[parts[1]].sub(u"", s)
            if len(s) > 0:
                if len(s) == 1:
                    c = s[0]
                    script = self.uniq_scripts.get(parts[1])
                    u = script and confusables.unconfuse(c, script)
                    if u:
                        err.append({"class": 50701, "subclass": 1, "text": T_f(u"\"{0}\"=\"{1}\" unexpected char \"{2}\" ({3}, 0x{4:04x}). Means \"{5}\" ({6}, 0x{7:04x})?", key, value, s, unicodedata.name(c, ''), ord(c), u, unicodedata.name(u, ''), ord(u)),
                            "fix": {key: value.replace(c, u)}
                        })
                    else:
                        err.append({"class": 50701, "subclass": 1, "text": T_f(u"\"{0}\"=\"{1}\" unexpected char \"{2}\" ({3}, 0x{4:04x})", key, value, s, unicodedata.name(c, ''), ord(c))
                        })
                else:
                    err.append({"class": 50701, "subclass": 1, "text": T_f(u"\"{0}\"=\"{1}\" unexpected \"{2}\"", key, value, s)})

    return err
def node(self, data, tags):
    """Check every tag of an object for unexpected characters.

    For each ``key``/``value`` pair the checks run in this order:

    1. non printable characters (``self.non_printable``) in the key, then
       in the value -- class 50702, subclass 0 / 1;
    2. symbol characters (``self.other_symbol``) in the key, then in the
       value -- class 50703, subclass 0 / 1;
    3. bidirectional-text control characters anywhere in the key or the
       value -- class 50702, subclass 2;
    4. characters left over in name-like tags once everything allowed by
       ``self.default`` (plain name keys) or by ``self.lang[<code>]``
       (``<name>:<code>`` keys) has been stripped -- class 50701,
       subclass 0 / 1.

    A hit in step 1 or 2 skips all remaining checks for that tag.

    Args:
        data: unused by this method.
        tags: mapping of tag key to tag value (both text).

    Returns:
        list of dict: one entry per problem found, each with ``"class"``,
        ``"subclass"`` and ``"text"``, plus a ``"fix"`` mapping when a
        replacement character could be suggested.
    """
    err = []
    for key, value in tags.items():
        m = self.non_printable.search(key)
        if m:
            err.append({
                "class": 50702,
                "subclass": 0,
                "text": T_(
                    "\"%s\" unexpected non printable char (%s, 0x%04x) in key at position %s",
                    key, unicodedata.name(m.group(0), ''),
                    ord(m.group(0)), m.start() + 1)
            })
            continue

        m = self.non_printable.search(value)
        if m:
            err.append({
                "class": 50702,
                "subclass": 1,
                "text": T_(
                    "\"%s\"=\"%s\" unexpected non printable char (%s, 0x%04x) in value at position %s",
                    key, value, unicodedata.name(m.group(0), ''),
                    ord(m.group(0)), m.start() + 1)
            })
            continue

        m = self.other_symbol.search(key)
        if m:
            err.append({
                "class": 50703,
                "subclass": 0,
                "text": T_(
                    "\"%s\" unexpected symbol char (%s, 0x%04x) in key at position %s",
                    key, unicodedata.name(m.group(0), ''),
                    ord(m.group(0)), m.start() + 1)
            })
            continue

        m = self.other_symbol.search(value)
        if m:
            err.append({
                "class": 50703,
                "subclass": 1,
                "text": T_(
                    "\"%s\"=\"%s\" unexpected symbol char (%s, 0x%04x) in value at position %s",
                    key, value, unicodedata.name(m.group(0), ''),
                    ord(m.group(0)), m.start() + 1)
            })
            continue

        # https://en.wikipedia.org/wiki/Bi-directional_text#Table_of_possible_BiDi-types
        for c in u"\u200E\u200F\u061C\u202A\u202D\u202B\u202E\u202C\u2066\u2067\u2068\u2069":
            # str.find() returns -1 when the character is absent and 0 when
            # it is the very first character, so the test must be ">= 0";
            # "> 0" silently ignored a control character at position 0.
            pos = key.find(c)
            if pos >= 0:
                err.append({
                    "class": 50702,
                    "subclass": 2,
                    "text": T_(
                        "\"%s\" unexpected non printable char (%s, 0x%04x) in key at position %s",
                        key, unicodedata.name(c, ''), ord(c), pos + 1)
                })

            pos = value.find(c)
            if pos >= 0:
                err.append({
                    "class": 50702,
                    "subclass": 2,
                    "text": T_(
                        "\"%s\"=\"%s\" unexpected non printable char (%s, 0x%04x) in value at position %s",
                        key, value, unicodedata.name(c, ''), ord(c), pos + 1)
                })

        if self.default:
            if key in self.names:
                # Remove everything the helper patterns and the default
                # pattern match; whatever is left is unexpected.
                s = self.non_letter.sub(u" ", value)
                s = self.alone_char.sub(u"", s)
                s = self.roman_number.sub(u"", s)
                s = self.default.sub(u"", s)
                # Only report when the leftover is small compared to the
                # whole value, and never for a two-character value with a
                # single leftover character.
                if len(s) > 0 and \
                        not (len(value) == 2 and len(s) == 1) and \
                        len(s) <= len(value) / 10 + 1:
                    if len(s) == 1:
                        c = s[0]
                        # Suggest a replacement only when a single target
                        # script is configured.
                        u = self.uniq_script and confusables.unconfuse(
                            c, self.uniq_script)
                        if u:
                            err.append({
                                "class": 50701,
                                "subclass": 0,
                                "text": T_(
                                    "\"%s\"=\"%s\" unexpected char \"%s\" (%s, 0x%04x). Means \"%s\" (%s, 0x%04x)?",
                                    key, value, s, unicodedata.name(c, ''),
                                    ord(c), u, unicodedata.name(u, ''),
                                    ord(u)),
                                "fix": {
                                    key: value.replace(c, u)
                                }
                            })
                        else:
                            err.append({
                                "class": 50701,
                                "subclass": 0,
                                "text": T_(
                                    "\"%s\"=\"%s\" unexpected char \"%s\" (%s, 0x%04x)",
                                    key, value, s, unicodedata.name(c, ''),
                                    ord(c))
                            })
                    else:
                        err.append({
                            "class": 50701,
                            "subclass": 0,
                            "text": T_("\"%s\"=\"%s\" unexpected \"%s\"",
                                       key, value, s)
                        })

        # Keys of the form "<name>:<code>" are checked against the pattern
        # registered for <code> in self.lang.
        parts = key.split(':')
        if len(parts) > 1 and parts[0] in self.names and parts[1] in self.lang:
            s = self.non_letter.sub(u" ", value)
            s = self.alone_char.sub(u"\\1", s)
            s = self.roman_number.sub(u"\\1", s)
            s = self.lang[parts[1]].sub(u"", s)
            if len(s) > 0:
                if len(s) == 1:
                    c = s[0]
                    script = self.uniq_scripts.get(parts[1])
                    u = script and confusables.unconfuse(c, script)
                    if u:
                        err.append({
                            "class": 50701,
                            "subclass": 1,
                            "text": T_(
                                "\"%s\"=\"%s\" unexpected char \"%s\" (%s, 0x%04x). Means \"%s\" (%s, 0x%04x)?",
                                key, value, s, unicodedata.name(c, ''),
                                ord(c), u, unicodedata.name(u, ''), ord(u)),
                            "fix": {
                                key: value.replace(c, u)
                            }
                        })
                    else:
                        err.append({
                            "class": 50701,
                            "subclass": 1,
                            "text": T_(
                                "\"%s\"=\"%s\" unexpected char \"%s\" (%s, 0x%04x)",
                                key, value, s, unicodedata.name(c, ''),
                                ord(c))
                        })
                else:
                    err.append({
                        "class": 50701,
                        "subclass": 1,
                        "text": T_("\"%s\"=\"%s\" unexpected \"%s\"", key,
                                   value, s)
                    })

    return err
def score_domain(domain):
    """Score `domain`.

    The higher the score, the more probable it is that `domain` is a
    phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of `domain`.
    """
    score = 0

    # Each suspicious TLD suffix that matches adds its own penalty.
    for t in suspicious['tlds']:
        if domain.endswith(t):
            score += 20

    # Remove initial '*.' for wildcard certificates bug
    if domain.startswith('*.'):
        domain = domain[2:]

    # Removing TLD to catch inner TLD in subdomain (ie. paypal.com.domain.com)
    try:
        res = get_tld(domain, as_object=True, fail_silently=True, fix_protocol=True)
        domain = '.'.join([res.subdomain, res.domain])
    except Exception:
        # Deliberate best effort: if the TLD cannot be split off (including
        # get_tld() yielding no result object), keep scoring the domain
        # unchanged instead of failing.
        pass

    # Higher entropy is kind of suspicious
    score += int(round(entropy(domain) * 10))

    # Remove lookalike characters using list from http://www.unicode.org/reports/tr39
    domain = unconfuse(domain)

    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    # Empty tokens are dropped: when the domain starts with a separator
    # (e.g. the '.' left by an empty subdomain part) re.split() yields a
    # leading '' which would otherwise hide the real first word.
    words_in_domain = [w for w in re.split(r"\W+", domain) if w]

    # ie. detect fake .com (ie. *.com-account-management.info)
    if words_in_domain and words_in_domain[0] in ('com', 'net', 'org'):
        score += 10

    # Testing keywords
    for word, weight in suspicious['keywords'].items():
        if word in domain:
            score += weight

    # Testing Levenshtein distance for strong keywords (>= 70 points) (ie. paypol)
    strong_keywords = [k for (k, s) in suspicious['keywords'].items() if s >= 70]
    # Removing too generic keywords (ie. mail.domain.com)
    candidate_words = [
        w for w in words_in_domain if w not in ('email', 'mail', 'cloud')
    ]
    for key in strong_keywords:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com)
    dash_count = domain.count('-')
    if 'xn--' not in domain and dash_count >= 4:
        score += dash_count * 3

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    dot_count = domain.count('.')
    if dot_count >= 3:
        score += dot_count * 3

    return score