def redirects(href):
    """Resolve redirects for a single href and wrap the result in a RedirectEntry."""
    href_url, href_num_redirects, href_protocol, href_content = get_redirects(href)

    if href_content != -1:
        redirect_object = RedirectEntry(url=href_url,
                                        redirects=href_num_redirects,
                                        protocol=href_protocol,
                                        content=href_content)
        return redirect_object

    return None
def get_feature_entry():
    """Queue worker: resolve redirects for queued hrefs and collect the results."""
    while True:
        href = q.get()
        href_url, href_num_redirects, href_protocol, href_content = get_redirects(href)

        if href_content != -1:
            redirect_object = RedirectEntry(url=href_url,
                                            redirects=href_num_redirects,
                                            protocol=href_protocol,
                                            content=href_content)
            append_in_list(redirect_object)

        q.task_done()
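# Illustrative usage sketch (not part of the module): how the queue worker above
# could be driven. This assumes `q` is a module-level `queue.Queue` and that
# `append_in_list` appends under a lock; the worker count of 8 is hypothetical.
#
#   import queue
#   import threading
#
#   q = queue.Queue()
#   for _ in range(8):
#       threading.Thread(target=get_feature_entry, daemon=True).start()
#   for href in hrefs:          # `hrefs`: any iterable of candidate URLs
#       q.put(href)
#   q.join()                    # returns once task_done() was called per item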
def extract_features_from_website(url, label, predict):
    """
    Extract all features from a website. If predict is set to True,
    a pandas DataFrame is created.
    """
    try:
        global brand_list
        global phishy_list
        global login_list
        global tld_list

        # save original url for object instance
        url_orig = url

        # get different components of url
        components = get_url_components(url)
        fqdn = components[0]
        scheme = components[1]
        subdomain = components[2]
        domain = components[3]
        suffix = components[4]
        port = components[5]
        path = components[6]
        query = components[7]
        fragment = components[8]

        netloc = fqdn
        url_no_prot = url

        if scheme:
            netloc = scheme + "://" + fqdn
            if port:
                netloc = netloc + ":" + port
            url_no_prot = url.replace(scheme + "://", "", 1)

        # check for redirects of url
        resp_url, num_redirects, protocol, content = get_redirects(url)

        # try again if no connection could be established
        if content == -1:
            time.sleep(3)
            resp_url, num_redirects, protocol, content = get_redirects(url)

            if content == -1:
                return None

        # get content of the homepage
        hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
            "{}://www.{}.{}".format(scheme, domain, suffix))

        if hp_content == -1:
            time.sleep(3)
            hp_url, hp_num_redirects, hp_protocol, hp_content = get_redirects(
                "{}://www.{}.{}".format(scheme, domain, suffix))

        # read content into parser
        if hp_content != -1:
            hp_soup = bs4.BeautifulSoup(hp_content.lower(), 'html.parser')

        soup = bs4.BeautifulSoup(content.lower(), 'html.parser')
        url = resp_url

        # website performed redirects
        bool_redirect_website = num_redirects > 0

        # check if website has favicon
        bool_favicon_website = False
        try:
            favicon.get(url, timeout=3)
            bool_favicon_website = True
        except Exception:
            bool_favicon_website = False

        # website has links pointing to extern content / number of extern links
        bool_content_extern_website, int_links_extern_website = find_extern_links(
            content.lower(), domain, suffix, url)

        # check for custom status bar
        bool_custom_statusbar_website = "window.status=" in str(content).lower().replace(" ", "")

        # custom right click
        bool_disable_rightclick_website = False
        if "document.oncontextmenu=" in str(content).replace(" ", "").lower():
            bool_disable_rightclick_website = True

        res = soup.findAll("body")
        if res:
            for element in res:
                try:
                    right_click_arg = element['oncontextmenu']
                    if str(right_click_arg) == "return false":
                        bool_disable_rightclick_website = True
                except Exception:
                    continue

        # has pop up window
        bool_popup_website = False
        hidden_count = 0
        res = soup.findAll("div")
        if res:
            for tag in res:
                try:
                    arg = tag['class']
                    if "popup" in arg:
                        bool_popup_website = True
                except Exception:
                    pass
                try:
                    arg = str(tag['style']).replace(" ", "")
                    if "display:none" in arg or "visibility:hidden" in arg:
                        hidden_count += 1
                except Exception:
                    continue

        # has iframe
        bool_iframe_website = bool(soup.findAll("iframe"))

        # has action tag > custom 2nd feature - is action extern?
        bool_action_website = False
        bool_action_extern_website = False
        # has form with post method
        bool_form_post_website = False

        res = soup.findAll("form")
        if res:
            for element in res:
                try:
                    if element["action"]:
                        bool_action_website = True
                        action_url = element["action"]

                        if validate_url(action_url) or validate_url(urljoin(netloc, action_url)):
                            if validate_url(urljoin(netloc, action_url)):
                                action_url = urljoin(netloc, action_url)

                            extracted_action_url = get_url_components(action_url)
                            domain_action_url = extracted_action_url[3]
                            suffix_action_url = extracted_action_url[4]

                            if suffix != suffix_action_url or domain != domain_action_url:
                                bool_action_extern_website = True
                                break

                    if element["method"] == "post":
                        bool_form_post_website = True
                except Exception:
                    continue

        # has phishy tokens in visible content
        int_phishy_tokens_website = 0
        for text in soup.stripped_strings:
            int_phishy_tokens_website += sum(1 for word in phishy_list if word in text)

        # has input tag
        bool_input_website = get_element_count("input", soup) > 0

        # find meta description
        res = soup.find('meta', attrs={'name': 'og:description'})
        if not res:
            res = soup.find('meta', attrs={'property': 'description'})
        if not res:
            res = soup.find('meta', attrs={'name': 'description'})

        if hp_content != -1:
            hp_res = hp_soup.find('meta', attrs={'name': 'og:description'})
            if not hp_res:
                hp_res = hp_soup.find('meta', attrs={'property': 'description'})
            if not hp_res:
                hp_res = hp_soup.find('meta', attrs={'name': 'description'})

        float_description_sim_website = 0
        if hp_content == -1:
            float_description_sim_website = -1
        elif res and hp_res:
            try:
                hp_desc = hp_res['content']
                desc = res['content']
                # compute similarity of description of home and login page
                float_description_sim_website = string_similarity(desc, hp_desc)
            except Exception:
                pass

        # bond status of login page and homepage
        bool_bond_status_website = False
        # most frequent domain is extern > true/false
        bool_freq_domain_extern_website = False

        res = soup.findAll("a")
        domain_list = []
        link_list = []
        href_count = 0
        redirect_object_list = []

        if res:
            for a_tag in res:
                try:
                    href = a_tag.attrs.get("href")
                    href_count += 1

                    if validate_url(href) or validate_url(urljoin(netloc, href)):
                        if validate_url(urljoin(netloc, href)):
                            href = urljoin(netloc, href)

                        if href == hp_url:
                            bool_bond_status_website = True

                        components_href = get_url_components(href)
                        domain_href = components_href[3]
                        suffix_href = components_href[4]

                        if is_IP(domain):
                            continue

                        link_list.append(href)
                        domain_list.append("{},{}".format(domain_href, suffix_href))
                except Exception:
                    continue

        link_list = list(set(link_list))
        link_list = link_list[:10]

        if hp_content != -1:
            try:
                redirect_object_list = get_redirects_list(link_list)
            except Exception as e:
                log(action_logging_enum=ERROR, logging_text=str(e))

        if redirect_object_list:
            for redirect_object in redirect_object_list:
                if not bool_bond_status_website and hp_content != -1:
                    try:
                        website_sim = html_similarity.similarity(
                            str(hp_content).lower(),
                            str(redirect_object.content).lower(),
                            k=0.3)
                        if website_sim == 1:
                            bool_bond_status_website = True
                    except Exception:
                        continue

        if domain_list:
            occure_count = Counter(domain_list)
            most_freq = occure_count.most_common(1)[0][0]
            most_frq_domain, most_freq_suffix = most_freq.split(",", 1)

            if str(most_frq_domain) != domain or str(suffix) != most_freq_suffix:
                bool_freq_domain_extern_website = True

        # similarity between homepage and login page
        float_login_home_website = 0
        if hp_content != -1:
            try:
                float_login_home_website = html_similarity.similarity(
                    str(content).lower(), str(hp_content).lower(), k=0.3)
            except Exception:
                pass

        # website has copyright / similarity of copyright on login page and home page
        bool_copyright_website = False
        float_copyright_sim_website = 0
        copy = ""
        hp_copy = ""

        for text in soup.stripped_strings:
            if '©' in text:
                copy = re.sub(r'\s+', ' ', text)
                bool_copyright_website = True

        if hp_content != -1:
            for text in hp_soup.stripped_strings:
                if '©' in text:
                    hp_copy = re.sub(r'\s+', ' ', text)

            if copy and hp_copy:
                float_copyright_sim_website = string_similarity(copy, hp_copy)

        # similarity of titles on login page and home page
        float_title_sim_website = 0
        if hp_content != -1:
            try:
                title = soup.title.text
                hp_title = hp_soup.title.text
                float_title_sim_website = string_similarity(title, hp_title)
            except Exception:
                float_title_sim_website = 0

        # unique links / all links on page
        float_unique_links_website = 0
        if link_list:
            float_unique_links_website = len(list(set(link_list))) / len(link_list)

        # lexical analysis for all links on website
        bool_link_analysis_website = True
        # dataframe = pd.DataFrame()
        # try:
        #     redirect_object = RedirectEntry(url=url, redirects=num_redirects, content=content, protocol=protocol)
        #     dataframe = pd.DataFrame(extract_features_from_URL(redirect_object, "Predict", brand_list=brand_list,
        #                                                        tld_list=tld_list, phishy_list=phishy_list, predict=True))
        # except Exception as e:
        #     pass
        # if not dataframe.empty:
        #     try:
        #         df = pd.DataFrame(dataframe.iloc[0]).transpose()
        #         prediction = predict_url(df)
        #         if int(prediction) == 0:
        #             bool_link_analysis_website = False
        #     except Exception:
        #         pass

        # number of input elements
        int_input_website = 0
        # find form accompanied by labels with login words
        bool_input_login_website = False
        form = soup.find("form")
        try:
            if form:
                inputs = form.find_all("input")
                if inputs:
                    int_input_website = len(inputs)
                    for inp in inputs:
                        try:
                            if inp["type"] == "hidden":
                                hidden_count += 1
                        except Exception:
                            continue

                label_tags = form.findAll("label")
                if label_tags:
                    for label_entry in label_tags:
                        if any(word in str(label_entry.text) for word in login_list):
                            bool_input_login_website = True
        except Exception:
            pass

        # website has button
        button_count = get_element_count("button", soup)
        bool_button_website = button_count > 0

        # website has meta information
        bool_meta_website = bool(soup.find("meta"))

        # has hidden elements
        bool_hidden_element_website = hidden_count > 0

        # number of option tags
        int_option_website = get_element_count("option", soup)
        # number of select tags
        int_select_website = get_element_count("select", soup)
        # number of th tags
        int_th_website = get_element_count("th", soup)
        # number of tr tags
        int_tr_website = get_element_count("tr", soup)
        # number of table tags
        int_table_website = get_element_count("table", soup)
        # number of hrefs in anchor tags
        int_href_website = href_count
        # number of list item tags
        int_li_website = get_element_count("li", soup)
        # number of unordered list tags
        int_ul_website = get_element_count("ul", soup)
        # number of ordered list tags
        int_ol_website = get_element_count("ol", soup)
        # number of div tags
        int_div_website = get_element_count("div", soup)
        # number of span tags
        int_span_website = get_element_count("span", soup)
        # number of article tags
        int_article_website = get_element_count("article", soup)
        # number of p tags
        int_p_website = get_element_count("p", soup)
        # number of checkbox tags
        int_checkbox_website = get_element_count("input", soup, "type", "checkbox")
        # number of buttons
        int_button_website = button_count
        # number of images
        int_image_website = get_element_count("img", soup)

        if not predict:
            entry = FeatureEntryContent(
                bool_redirect_website=bool_redirect_website,
                bool_favicon_website=bool_favicon_website,
                bool_content_extern_website=bool_content_extern_website,
                int_links_extern_website=int_links_extern_website,
                bool_custom_statusbar_website=bool_custom_statusbar_website,
                bool_disable_rightclick_website=bool_disable_rightclick_website,
                bool_popup_website=bool_popup_website,
                bool_iframe_website=bool_iframe_website,
                bool_action_website=bool_action_website,
                bool_action_extern_website=bool_action_extern_website,
                bool_form_post_website=bool_form_post_website,
                int_phishy_tokens_website=int_phishy_tokens_website,
                bool_input_website=bool_input_website,
                float_description_sim_website=float_description_sim_website,
                bool_bond_status_website=bool_bond_status_website,
                bool_freq_domain_extern_website=bool_freq_domain_extern_website,
                float_login_home_website=float_login_home_website,
                bool_copyright_website=bool_copyright_website,
                float_copyright_sim_website=float_copyright_sim_website,
                float_title_sim_website=float_title_sim_website,
                float_unique_links_website=float_unique_links_website,
                # bool_link_analysis_website=bool_link_analysis_website,
                int_input_website=int_input_website,
                bool_input_login_website=bool_input_login_website,
                bool_button_website=bool_button_website,
                bool_meta_website=bool_meta_website,
                bool_hidden_element_website=bool_hidden_element_website,
                int_option_website=int_option_website,
                int_select_website=int_select_website,
                int_th_website=int_th_website,
                int_tr_website=int_tr_website,
                int_table_website=int_table_website,
                int_href_website=int_href_website,
                int_li_website=int_li_website,
                int_ul_website=int_ul_website,
                int_ol_website=int_ol_website,
                int_div_website=int_div_website,
                int_span_website=int_span_website,
                int_article_website=int_article_website,
                int_p_website=int_p_website,
                int_checkbox_website=int_checkbox_website,
                int_button_website=int_button_website,
                int_image_website=int_image_website,
                label=label,
                url=url_orig,
                final_url=url)

            log(action_logging_enum=INFO,
                logging_text="Processed datapoint. {}".format(url))
            return entry
        else:
            data = {
                "ID": [0],
                "Has Redirect": [bool_redirect_website],
                "Has Favicon": [bool_favicon_website],
                "Has Extern Content": [bool_content_extern_website],
                "Number Extern Links": [int_links_extern_website],
                "Has Custom StatusBar": [bool_custom_statusbar_website],
                "Has Disabled RightClick": [bool_disable_rightclick_website],
                "Has PopUp": [bool_popup_website],
                "Has iFrame": [bool_iframe_website],
                "Has Action": [bool_action_website],
                "Has Extern Action": [bool_action_extern_website],
                "Has Form with POST": [bool_form_post_website],
                "Number PhishyTokens": [int_phishy_tokens_website],
                "Has Input": [bool_input_website],
                "Ratio Description Sim": [float_description_sim_website],
                "Has Bond Status": [bool_bond_status_website],
                "Has Freq Domain Extern": [bool_freq_domain_extern_website],
                "Ratio Similarity": [float_login_home_website],
                "Has Copyright": [bool_copyright_website],
                "Ratio Copyright Sim": [float_copyright_sim_website],
                "Ratio Title Sim": [float_title_sim_website],
                "Ratio Unique Links": [float_unique_links_website],
                "Number Inputs": [int_input_website],
                "Has Input for Login": [bool_input_login_website],
                "Has Button": [bool_button_website],
                "Has Meta": [bool_meta_website],
                "Has Hidden Element": [bool_hidden_element_website],
                "Number Option": [int_option_website],
                "Number Select": [int_select_website],
                "Number TH": [int_th_website],
                "Number TR": [int_tr_website],
                "Number Table": [int_table_website],
                "Number HREF": [int_href_website],
                "Number LI": [int_li_website],
                "Number UL": [int_ul_website],
                "Number OL": [int_ol_website],
                "Number DIV": [int_div_website],
                "Number Span": [int_span_website],
                "Number Article": [int_article_website],
                "Number Paragr": [int_p_website],
                "Number Checkbox": [int_checkbox_website],
                "Number Button": [int_button_website],
                "Number Image": [int_image_website],
                "Label": [label],
                "URL": [url_orig],
                "Final URL": [url]
            }

            columns = list(CONTENT_FEATURE_LIST_COLUMN_NAMES)
            df = pd.DataFrame(data, columns=columns)
            return df

    except Exception as e:
        log(action_logging_enum=WARNING, logging_text=str(e))
        log(action_logging_enum=WARNING, logging_text=str(e.__traceback__))
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        log(
            ERROR,
            'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
                filename, lineno, line.strip(), exc_obj))
        log(action_logging_enum=WARNING,
            logging_text="Could not extract content features for {}".format(url))
        log(action_logging_enum=INFO,
            logging_text="Failed datapoint. {}".format(url))
        return None
def extract_features_from_URL(url, label="", predict=False): """ extract all features from url, if predict set to true a pandas dataframe is created """ # get components netloc, filepath, query, path, Subdomain, SLD (domainname), TLD (incl. ccTLD) df = pd.DataFrame() try: data = url orig_web = "" redirect_object = isinstance(data, RedirectEntry) if redirect_object: url = data.url int_redirect_url = data.redirects protocol = data.protocol content = data.content components_orig = get_url_components(url) domain_orig = components_orig[3] suffix_orig = components_orig[4] if suffix_orig: orig_web = "{}.{}".format(domain_orig, suffix_orig) else: orig_web = domain_orig else: components_orig = get_url_components(url) domain_orig = components_orig[3] suffix_orig = components_orig[4] if suffix_orig: orig_web = "{}.{}".format(domain_orig, suffix_orig) else: orig_web = domain_orig # number of redirects resp_url, int_redirect_url, protocol, content = get_redirects(url) final_url = resp_url components = get_url_components(url) fqdn = components[0] scheme = components[1] subdomain = components[2] domain = components[3] suffix = components[4] port = components[5] path = components[6] query = components[7] fragment = components[8] if suffix: new_web = "{}.{}".format(domain, suffix) else: new_web = domain # FEATURE EXTRACTION START netloc = fqdn url_no_prot = url if port: netloc = netloc + ":" + port if not scheme: scheme = "http" if protocol == 1: scheme = "https" url_no_prot = url.replace(scheme + "://", "", 1) # IP address in netloc existent bool_ip_netloc = is_IP(domain) if subdomain == "" and not bool_ip_netloc: if url.startswith("https://"): url = url.replace("https://", "https://www.", 1) if url.startswith("http://"): url = url.replace("http://", "http://www.", 1) # URL FEATURES # use of shortening service bool_shortening_url = False if not orig_web == new_web: bool_shortening_url = True # kullback-leibler divergence float_divergence_url = 0 if not content == -1: float_divergence_url = compute_divergence(url, content) # length of the URL int_length_url = len(url) # uses redirect (e.g. 
shortening service) bool_redirect_url = bool(fqdn.__contains__('//')) # https used as token in url bool_https_token_url = bool(fqdn.lower().__contains__("https")) # ratio of capital and non-capital letters cap = sum(1 for c in url if c.isupper()) non_cap = sum(1 for c in url if c.islower()) float_cap_noncap_letters_url = float(cap / non_cap) # number of dots in url int_dots_url = url.count(".") # number of queries in url > custom bool has query and number of values in query bool_query_url = False int_query_values_url = 0 if query: bool_query_url = True int_query_values_url = query.count("&") + 1 # validate tld bool_validate_tld_url = True if not suffix == '': suffix_tokens = suffix.split(".") for token in suffix_tokens: if token not in tld_list: bool_validate_tld_url = False break # number of comma in url int_comma_url = url.count(",") # number of stars in url int_star_url = url.count("*") # number of semicolon in url int_semicolon_url = url.count(";") # number of spaces in url int_plus_url = url.count(" ") # javascript in url bool_javascript_url = bool(url.lower().__contains__("javascript:")) # number of equal signs in url int_equals_url = url.count("=") # number of dashes in url int_dash_url = url.count("-") # number of hash in url > custom as for query bool has fragment and number of value pairs bool_fragment_url = True int_fragment_values_url = 0 if fragment: bool_fragment_url = False int_fragment_values_url = fragment.count("&") + 1 # number of ampersands in url int_ampersand_url = url.count("&") # usage of % in url bool_html_url = bool(url.__contains__("%")) # number of tilde in url int_tilde_url = url.count("~") # number of not alpha-numerical symbols | without protocol ... https:// int_symbols_url = sum( (1 for c in url_no_prot if not c.isalpha() and not c.isdigit()), 1) # entropy of the url float_entropy_url = entropy_of_string(url) # ratio of vowel to consonant vowel = sum(1 for char in url.lower() if char in ( 'a', 'e', 'i', 'o', 'u', ) and char.isalpha()) consonant = sum(1 for char in url if char.isalpha()) - vowel float_vowel_consonant_url = vowel / consonant # ratio of numbers to letters in url digits = sum(1 for char in url if char.isdigit()) letters = sum(1 for char in url if char.isalpha()) float_digits_letters_url = digits / letters # brand in url | not used because of brand token list consist of alexa list domains as tokens # bool_brand_url = any(url.__contains__(str(brand).lower()) for brand in brand_list) # OWN URL FEATURES # percentage length of netloc to url float_percent_netloc_url = round(len(netloc) / len(url), 2) # percentage length of path to url float_percent_path_url = round(len(path) / len(url), 2) # percentage length of query to url float_percent_query_url = round(len(query) / len(url), 2) # percentage length of fragment to url float_percent_fragment_url = round(len(fragment) / len(url), 2) # NETLOC FEATURES # uses @ in netloc bool_at_symbol_netloc = bool(str(url).split(fqdn)[0].__contains__("@")) # prefix or suffix in netloc used bool_prefix_suffix_netloc = bool(netloc.__contains__('-')) # contains subdomains > custom number of subdomains bool_subdomain_netloc = False int_subdomain_netloc = 0 if not components[4] == "www" and not components[4] == "": bool_subdomain_netloc = True int_subdomain_netloc = components[4].count(".") + 1 # usage of https or not if protocol == -1: bool_https_protocol_netloc = request_used_protocol(url) else: bool_https_protocol_netloc = protocol # usage of abnormal ports and identify port for further features bool_other_ports_netloc 
= False if bool_https_protocol_netloc == 1 and not port: port = 443 if bool_https_protocol_netloc == 1 and not port: port = 80 if fqdn.__contains__(':'): port = netloc.split(':')[1] if not port == 80 and not port == 443: bool_other_ports_netloc = True # length of netloc int_length_netloc = len(netloc) # number of domains in netloc int_domains_netloc = netloc.count(".") # number of dashes in netloc int_dash_netloc = netloc.count("-") # count tokens in netloc that are created via - and . int_domain_tokens_netloc = sum( (1 for c in fqdn if not c.isalpha() and not c.isdigit()), 1) # number of digits in netloc int_digits_netloc = sum(1 for c in netloc if c.isdigit()) # number of dots in netloc int_dots_netloc = netloc.count(".") # number of underscores in netloc int_underscores_netloc = netloc.count("_") # true if digits in netloc contained bool_digits_netloc = bool(any(char.isdigit() for char in netloc)) # PATH FEATURES # number digits in path int_digits_path = sum(1 for c in path if c.isdigit()) # number of phishy words in netloc int_phishy_tokens_netloc = sum(1 for word in phishy_list if netloc.__contains__(word)) # number of phishy tokens in path int_phishy_tokens_path = sum(1 for word in phishy_list if path.__contains__(word)) # brand in path bool_brand_path = bool( any(path.lower().__contains__(str(brand).lower()) for brand in brand_list)) # number of slashes in path int_slash_path = path.count("/") # number of dashes in path int_dash_path = path.count("-") # SUBDOMAIN FEATURES # brand in subdomain bool_brand_subdomain = bool( any(subdomain.lower().__contains__(str(brand).lower()) for brand in brand_list)) # HOST FEATURES # define todays date today_date = datetime.date(datetime.now()) # domain created longer than one month ago # certificate based bool_created_shortly_host = False bool_domain_restlive_host = False creation_date = None expire_date = None if bool_https_protocol_netloc: expire_date, creation_date = get_ssl_information(url) if not creation_date == None: num_months = (today_date.year - creation_date.year) * 12 + ( today_date.month - creation_date.month) if num_months < 2: bool_created_shortly_host = True # restlive of domain more than 3 months # all certificate based if not expire_date == None: num_months = (expire_date.year - today_date.year) * 12 + ( expire_date.month - today_date.month) if num_months >= 3: bool_domain_restlive_host = True if not predict: entry = FeatureEntryLexical( bool_ip_netloc=bool_ip_netloc, int_length_url=int_length_url, bool_redirect_url=bool_redirect_url, bool_at_symbol_netloc=bool_at_symbol_netloc, bool_prefix_suffix_netloc=bool_prefix_suffix_netloc, bool_subdomain_netloc=bool_subdomain_netloc, int_subdomain_netloc=int_subdomain_netloc, bool_https_protocol_netloc=bool_https_protocol_netloc, bool_other_ports_netloc=bool_other_ports_netloc, bool_https_token_url=bool_https_token_url, int_redirect_url=int_redirect_url, float_cap_noncap_letters_url=float_cap_noncap_letters_url, int_dots_url=int_dots_url, int_length_netloc=int_length_netloc, int_domains_netloc=int_domains_netloc, int_dash_netloc=int_dash_netloc, int_domain_tokens_netloc=int_domain_tokens_netloc, int_digits_netloc=int_digits_netloc, int_digits_path=int_digits_path, int_phishy_tokens_netloc=int_phishy_tokens_netloc, int_phishy_tokens_path=int_phishy_tokens_path, bool_brand_subdomain=bool_brand_subdomain, bool_brand_path=bool_brand_path, bool_query_url=bool_query_url, int_query_values_url=int_query_values_url, int_dots_netloc=int_dots_netloc, int_underscores_netloc=int_underscores_netloc, 
bool_validate_tld_url=bool_validate_tld_url, int_slash_path=int_slash_path, int_comma_url=int_comma_url, int_star_url=int_star_url, int_semicolon_url=int_semicolon_url, int_plus_url=int_plus_url, bool_javascript_url=bool_javascript_url, int_equals_url=int_equals_url, int_dash_url=int_dash_url, bool_fragment_url=bool_fragment_url, int_fragment_values_url=int_fragment_values_url, int_ampersand_url=int_ampersand_url, bool_html_url=bool_html_url, int_tilde_url=int_tilde_url, int_symbols_url=int_symbols_url, float_entropy_url=float_entropy_url, float_vowel_consonant_url=float_vowel_consonant_url, bool_digits_netloc=bool_digits_netloc, float_digits_letters_url=float_digits_letters_url, int_dash_path=int_dash_path, bool_domain_restlive_host=bool_domain_restlive_host, bool_created_shortly_host=bool_created_shortly_host, float_percent_netloc_url=float_percent_netloc_url, float_percent_path_url=float_percent_path_url, float_percent_query_url=float_percent_query_url, float_percent_fragment_url=float_percent_fragment_url, float_divergence_url=float_divergence_url, bool_shortening_url=bool_shortening_url, label=label, url=url, final_url=final_url) return entry elif predict: data = { 'ID': 0, 'Has IP': [bool_ip_netloc], 'Length URL': [int_length_url], 'Has Redirect': [bool_redirect_url], 'Has At Symbol': [bool_at_symbol_netloc], 'Has Token Netloc': [bool_prefix_suffix_netloc], 'Has Subdomains': [bool_subdomain_netloc], 'Number Subdomains': [int_subdomain_netloc], 'Has HTTPS': [bool_https_protocol_netloc], 'Has Other Port': [bool_other_ports_netloc], 'Has HTTPS Token': [bool_https_token_url], 'Number Redirects': [int_redirect_url], 'Ratio Cap/NonCap': [float_cap_noncap_letters_url], 'Number Dots': [int_dots_url], 'Length Netloc': [int_length_netloc], 'Number Dash Netloc': [int_dash_netloc], 'Number Tokens Netloc': [int_domain_tokens_netloc], 'Number Digits Netloc': [int_digits_netloc], 'Number Digits Path': [int_digits_path], 'Number PhishyTokens Netloc': [int_phishy_tokens_netloc], 'Number PhishyTokens Path': [int_phishy_tokens_path], 'Has Brand Subdomain': [bool_brand_subdomain], 'Has Brand Path': [bool_brand_path], 'Has Query': [bool_query_url], 'Number Query Parameters': [int_query_values_url], 'Number Dots Netloc': [int_dots_netloc], 'Number Underscore Netloc': [int_underscores_netloc], 'Has Valide TLD': [bool_validate_tld_url], 'Number Slash Path': [int_slash_path], 'Number Comma': [int_comma_url], 'Number Stars': [int_star_url], 'Number Semicolon': [int_semicolon_url], 'Number Plus': [int_plus_url], 'Has Javascript': [bool_javascript_url], 'Number Equals': [int_equals_url], 'Number Dash': [int_dash_url], 'Has Fragment': [bool_fragment_url], 'Number Fragment Values': [int_fragment_values_url], 'Number Ampersand': [int_ampersand_url], 'Has HTML Code': [bool_html_url], 'Number Tilde': [int_tilde_url], 'Number Symbols': [int_symbols_url], 'Entropy': [float_entropy_url], 'Ratio Vowel/Consonant': [float_vowel_consonant_url], 'Has Digits Netloc': [bool_digits_netloc], 'Ratio Digit/Letter': [float_digits_letters_url], 'Number Dash Path': [int_dash_path], 'Cert Restlive': [bool_domain_restlive_host], 'Cert Created Shortly': [bool_created_shortly_host], 'Ratio Netloc/URL': [float_percent_netloc_url], 'Ratio Path/URL': [float_percent_path_url], 'Ratio Query/URL': [float_percent_query_url], 'Ratio Fragment/URL': [float_percent_fragment_url], 'KL Divergence': [float_divergence_url], 'Has Shortening': [bool_shortening_url], 'Label': [label], 'URL': [url], 'Final URL': [final_url] } columns = 
list(LEXICAL_FEATURE_LIST_COLUMN_NAMES) df = pd.DataFrame(data, columns=columns) return df except Exception as e: exc_type, exc_obj, tb = sys.exc_info() f = tb.tb_frame lineno = tb.tb_lineno filename = f.f_code.co_filename linecache.checkcache(filename) line = linecache.getline(filename, lineno, f.f_globals) log( ERROR, 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format( filename, lineno, line.strip(), exc_obj)) log(action_logging_enum=ERROR, logging_text="Could not extract lexical features from url: {}". format(url)) if predict: return df return None
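# `entropy_of_string` is defined elsewhere in the project; as documentation for
# the 'Entropy' feature, here is a minimal sketch of the Shannon entropy it
# presumably computes (an assumption, not the project's actual helper):
#
#   import math
#   from collections import Counter
#
#   def entropy_of_string_sketch(s):
#       """Shannon entropy in bits per character; higher ~ more random-looking."""
#       if not s:
#           return 0.0
#       counts = Counter(s)
#       return -sum((n / len(s)) * math.log2(n / len(s)) for n in counts.values())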
def extract_features_from_signature(url, label):
    """
    Extract all signature features from a url.
    """
    common_ents = []
    common_terms = []
    cert_subject = ""
    search_vec = []

    try:
        final_url, redirects, protocol, content = get_redirects(url)
        lang_code = detect_website_language(content)

        # get entities if the language was detected
        if lang_code == "de" or lang_code == "en":
            text = get_website_text(content)
            common_ents = get_common_entities(text, n=5, lang_code=lang_code)
            common_terms = get_common_terms(text=text, n=5, lang_code=lang_code)
            cert_subject = get_ssl_subject(url)

        terms = [[None, 0] for _ in range(5)]
        ents = [[None, 0] for _ in range(5)]

        for i in range(len(common_ents)):
            ents[i] = common_ents[i]
        for i in range(len(common_terms)):
            terms[i] = common_terms[i]

        if not label == "PREDICT":
            entry = SignatureEntry(url=url,
                                   final_url=final_url,
                                   label=label,
                                   cert_subject=cert_subject,
                                   term1=terms[0][0],
                                   term2=terms[1][0],
                                   term3=terms[2][0],
                                   term4=terms[3][0],
                                   term5=terms[4][0],
                                   ent1=ents[0][0],
                                   ent2=ents[1][0],
                                   ent3=ents[2][0],
                                   ent4=ents[3][0],
                                   ent5=ents[4][0])

            log(action_logging_enum=INFO,
                logging_text="Signature extracted from {}.".format(url))
            return entry
        else:
            data = {
                "ID": [0],
                "Term1": [terms[0][0]],
                "Term2": [terms[1][0]],
                "Term3": [terms[2][0]],
                "Term4": [terms[3][0]],
                "Term5": [terms[4][0]],
                "Entity1": [ents[0][0]],
                "Entity2": [ents[1][0]],
                "Entity3": [ents[2][0]],
                "Entity4": [ents[3][0]],
                "Entity5": [ents[4][0]],
                "Label": [label],
                "URL": [url],
                "Final URL": [final_url]
            }

            columns = list(SIGNATURE_FEATURE_LIST_COLUMN_NAMES)
            entry = pd.DataFrame(data, columns=columns)

            log(action_logging_enum=INFO,
                logging_text="Signature extracted from {}.".format(url))
            return entry

    except Exception as e:
        log(action_logging_enum=ERROR,
            logging_text="Could not extract signature from {}. [{}]".format(url, str(e)))
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        log(
            ERROR,
            'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(
                filename, lineno, line.strip(), exc_obj))
        return None
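# Illustrative usage sketch (an assumption about the calling convention): the
# label "PREDICT" switches the extractor into DataFrame mode (columns from
# SIGNATURE_FEATURE_LIST_COLUMN_NAMES); any other label yields a SignatureEntry
# for dataset building.
#
#   entry = extract_features_from_signature("https://example.com", label="Legitimate")
#   df = extract_features_from_signature("https://example.com", label="PREDICT")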
def search_login_page(url, login_token_list, selenium_analysis=False):

    def do_selenium_search(driver, content):
        # search by driver for clickable links
        try:
            driver.get(url)
        except Exception as e:
            log(action_logging_enum=INFO, logging_text=str(e))
            return url, False

        for token in login_token_list:
            if token not in str(content.lower()):
                continue
            try:
                # find button/anchor that contains a login token (case-insensitive)
                xpath = ("//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', "
                         "'abcdefghijklmnopqrstuvwxyz'),'{f}')]").format(f=token)
                button = driver.find_element_by_xpath(xpath)
                button.click()
            except Exception as e:
                log(action_logging_enum=INFO, logging_text=str(e))
                continue

            login_url = str(driver.current_url)
            log(action_logging_enum=INFO,
                logging_text="Login page found by selenium analysis.")
            return login_url, True

        return url, False

    def do_content_search(content):
        soup = bs4.BeautifulSoup(content.lower(), 'html.parser')

        if soup.find('a', href=True):
            for link in soup.find_all('a', href=True):
                if any(link.text == token.lower() for token in login_token_list):
                    login_url = str(link['href']).strip()

                    if login_url.startswith('/') and url.endswith('/'):
                        login_url = url + login_url.replace('/', '', 1)
                    if login_url.startswith('./'):
                        if url.endswith('/'):
                            login_url = url + login_url.replace('./', '', 1)
                        else:
                            login_url = url + login_url.replace('.', '', 1)

                    if validate_url(login_url):
                        log(action_logging_enum=INFO,
                            logging_text="Login page found by content analysis.")
                        return login_url, True

        return url, False

    def do_bing_search(domain, ccTLD):
        return bingsearch.search("{d}.{s} login".format(d=domain, s=ccTLD),
                                 wait_after_429=False)

    def do_google_search(domain, ccTLD):
        return googlesearch.search("{d}.{s} login".format(d=domain, s=ccTLD),
                                   5, "de", wait_after_429=False)

    def do_search(domain, ccTLD, url):
        try:
            result = do_bing_search(domain, ccTLD)
        except Exception as e:
            result = -1
            log(action_logging_enum=WARNING, logging_text=str(e))

        if result == -1:
            try:
                result = do_google_search(domain, ccTLD)
            except Exception as e:
                result = -1
                log(action_logging_enum=ERROR, logging_text=str(e))

        if result == -1:
            return url, False

        for entry in result:
            orig_url = url
            extracted_search = extract(entry)
            search_domain = extracted_search.domain
            search_cctld = extracted_search.suffix
            found_url = str(entry)

            if cut_protocol(orig_url) != cut_protocol(found_url):
                orig_url = orig_url + "/"

            if str(search_domain) == domain and cut_protocol(found_url) != cut_protocol(orig_url):
                if str(ccTLD) != str(search_cctld) and "/" in cut_protocol(found_url):
                    path = cut_protocol(found_url).split("/", 1)[1]
                    if not path:
                        continue

                if entry.startswith("http://") and orig_url.startswith("https://"):
                    found_url = found_url.replace("http://", "https://", 1)
                    try:
                        requests.get(found_url, timeout=10, headers=headers)
                    except Exception:
                        found_url = found_url.replace("https://", "http://", 1)

                login_url = found_url
                log(action_logging_enum=INFO,
                    logging_text="Login page found by search engine.")
                return login_url, True

        return url, False

    if selenium_analysis:
        driver = webdriver.Chrome()
        driver.implicitly_wait(20)

    if is_url("https://www." + url):
        url = "https://www." + url
    if is_url("https://" + url):
        url = "https://" + url
    if not url.startswith("https://") and not url.startswith("http://"):
        url = "https://" + url

    if not is_url(url):
        log(action_logging_enum=ERROR,
            logging_text="URL does not provide the needed scheme. [{}]".format(url))
        return url, False

    extracted = extract(url)
    domain = extracted.domain
    ccTLD = extracted.suffix

    login_url, changed_status = do_search(domain, ccTLD, url)

    # get redirects for url
    try:
        url, num_redirects, protocol, content = get_redirects(url)
    except Exception as e:
        log(action_logging_enum=ERROR,
            logging_text="Error while getting redirects for url: {}".format(url))
        return None, False

    if protocol == -1 and content == -1:
        return None, False

    lang_website = detect_website_language(content)

    if lang_website is None:
        log(action_logging_enum=WARNING,
            logging_text="Website does not use German or English. [{}]".format(url))
        return None, False

    if not changed_status:
        login_url, changed_status = do_content_search(content)

        if not changed_status and selenium_analysis:
            login_url, changed_status = do_selenium_search(driver, content)

    if selenium_analysis:
        driver.close()

    if not changed_status:
        log(action_logging_enum=WARNING,
            logging_text="No login page found for: " + url)
        return url, False

    log(action_logging_enum=INFO, logging_text="Login page found: " + login_url)
    return login_url, changed_status