def write_signature_features_CSV(feature_list, file_name=""):
    """ write features for signature filter to csv """
    if not file_name:
        file_name = SIGNATURE_FEATURE_DATABASE

    id = 0
    with open(DATA_PATH + file_name, 'w') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(SIGNATURE_FEATURE_LIST_COLUMN_NAMES)

        for entry in feature_list:
            if entry is not None:
                writer.writerow([id, entry.url, entry.final_url, entry.label, entry.cert_subject,
                                 entry.ent1, entry.ent2, entry.ent3, entry.ent4, entry.ent5,
                                 entry.term1, entry.term2, entry.term3, entry.term4, entry.term5])
                id += 1

    log(action_logging_enum=INFO,
        logging_text="Feature list written to CSV file. [{f}]".format(f=file_name))
def do_search(domain, suffix, url, words):
    search_string = "{d}.{s}".format(d=domain, s=suffix)

    for word in words:
        if word is not None:
            search_string += " {w}".format(w=word)

    try:
        result = bingsearch.search(search_string, wait_after_429=False)
    except Exception as e:
        result = -1
        log(action_logging_enum=WARNING, logging_text=str(e))

    if result == -1:
        return None

    for entry in result:
        extracted_search = get_url_components(str(entry))
        search_domain = extracted_search[3]
        search_cctld = extracted_search[4]
        found_url = str(entry)

        if str(search_domain) == domain and str(suffix) == str(search_cctld):
            log(action_logging_enum=INFO, logging_text="Login page found by search engine.")
            return found_url, True

    return url, False
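# Hedged usage sketch (not part of the original module): do_search above returns None when the
# search itself fails, otherwise a (url, found) tuple. The domain, suffix, and word list below
# are placeholders.
def example_do_search():
    result = do_search(domain="example", suffix="com",
                       url="https://example.com/login", words=["login", "secure"])
    if result is not None:
        found_url, found = result
        if found:
            log(action_logging_enum=INFO, logging_text="Indexed login page: " + found_url)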
def save_model(knn):
    with open(SAVED_MODEL_FILE, 'wb') as file:
        pickle.dump(knn, file)

    log(action_logging_enum=INFO, logging_text="[K-NEAREST NEIGHBOR]: Model saved.")
def load_model():
    # load model from file
    logistic_regression = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    log(action_logging_enum=INFO, logging_text="[LOGISTIC REGRESSION]: Model loaded.")

    return logistic_regression
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)
    pd.set_option('display.max_columns', None)

    if len(data) == 0:
        data = pd.DataFrame(pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE))

    data = transform_data(data)
    data = data.replace("True", 1)
    data = data.replace("False", 0)

    y = data['Label'].copy()
    x = data.drop(["Label"], axis=1).copy()
    train, test = train_test_split(data, test_size=0.35)

    log(action_logging_enum=INFO, logging_text="[K-NEAREST NEIGHBOR] Data ready for use.")

    y_train = train['Label'].copy()
    x_train = train.drop(["Label"], axis=1).copy()

    params = {'n_neighbors': 9}
    knn = KNeighborsClassifier()  # KNeighborsClassifier(**params)

    f1 = print_scores(knn, x, y)

    log(action_logging_enum=INFO, logging_text="[K-NEAREST NEIGHBOR] Starting training.")
    knn.fit(x_train, y_train)

    save_model(knn=knn)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
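# Hedged usage sketch (not part of the original module): train_model above falls back to the
# lexical feature database when no DataFrame is passed, so a plain call retrains and persists
# the k-NN model; the returned value is the F1 score reported by print_scores.
def example_knn_training():
    f1 = train_model()
    log(action_logging_enum=INFO, logging_text="k-NN F1 score: {}".format(f1))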
def neighborhood_entropy(r, s=None, union=False):
    # get neighborhood entropy for subsets R and S or UNION of R and S
    feature_set = r
    sum_list = []

    if union:
        feature_set = pd.concat([feature_set, s], axis=1)

    n = len(feature_set)

    def do_get(i_):
        g_e = get_neighborhood(feature_set, i_, size_row, distance, feature_vec)
        return math.log((g_e / n), 2)

    try:
        feature_vec = feature_set.values.tolist()
        size_row = len(feature_set)
        distance = 0.15

        for i in range(n):
            sum_list.append(do_get(i))
    except Exception as e:
        log(ERROR, str(e))

    total = 0
    for i in sum_list:
        total += i

    return total * -(1 / n)
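# Added note (not original code): neighborhood_entropy above implements
#     NE(R) = -(1/n) * sum_{i=1..n} log2( |delta(x_i)| / n )
# where n is the number of samples and |delta(x_i)| is the neighborhood size returned by
# get_neighborhood for sample i at distance threshold 0.15. With union=True the entropy is
# computed on the column-wise union of R and S.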
def predict_url(url):
    try:
        if isinstance(url, str):
            features = mod_feature_extraction.extract_features_from_URL(url, "PREDICT", True)
            x_pred = pd.DataFrame(features)
        else:
            x_pred = url

        x_pred = transform_data(x_pred)
        random_forest = load_model()
        y_pred = random_forest.predict(x_pred)

        return y_pred.tolist()
    except Exception as e:
        exc_type, exc_obj, tb = sys.exc_info()
        f = tb.tb_frame
        lineno = tb.tb_lineno
        filename = f.f_code.co_filename
        linecache.checkcache(filename)
        line = linecache.getline(filename, lineno, f.f_globals)
        print('EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj))
        log(action_logging_enum=WARNING, logging_text=str(e))
        log(action_logging_enum=WARNING, logging_text=str(e.__traceback__))

    return None
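# Hedged usage sketch (not part of the original module): predict_url accepts either a raw URL
# string or an already-built feature DataFrame and returns the predicted labels as a list, or
# None on failure. The URL below is a placeholder.
def example_predict_url():
    prediction = predict_url("https://example.com/login")
    if prediction is not None:
        log(action_logging_enum=INFO, logging_text="Predicted label(s): {}".format(prediction))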
def check_status():
    nonlocal complete
    nonlocal failed
    nonlocal size_before

    while True:
        entry = q.get()
        url = entry.url
        is_up = False

        try:
            response = requests.get(url, timeout=10, headers=headers)
            response.raise_for_status()

            if 200 <= response.status_code < 400:
                is_up = True
        except Exception as e:
            log(action_logging_enum=ERROR, logging_text=str(e))
            is_up = False

        if is_up:
            complete += 1
            log(action_logging_enum=INFO,
                logging_text="Found {} of {}. (Failed: {})".format(complete, size_before, failed))
            modify_list(entry)
        else:
            failed += 1

        q.task_done()
def extract_features_from_signature_list(data):
    """ extract all signature features from list with urls and labels """

    @ray.remote
    def get_feature_entry(entry):
        new_entry = extract_features_from_signature(entry.url, entry.label)

        if new_entry is not None:
            return new_entry

    try:
        result_ids = []
        for entry in data:
            result_ids.append(get_feature_entry.remote(entry))

        feature_list = ray.get(result_ids)
    except KeyboardInterrupt:
        log(action_logging_enum=ERROR,
            logging_text="Process interrupted by keyboard signal. Returning the list.")
        feature_list = ray.get(result_ids)
        return feature_list

    return feature_list
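# Hedged usage sketch (not part of the original module): the extractor above fans work out via
# ray remote tasks; ray is typically initialized once before tasks are submitted. The Entry
# objects only need .url and .label attributes, matching how get_feature_entry uses them.
def example_signature_extraction(data):
    ray.init(ignore_reinit_error=True)
    return extract_features_from_signature_list(data)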
def save_model(adaptive_boosting):
    with open(SAVED_MODEL_FILE, 'wb') as file:
        pickle.dump(adaptive_boosting, file)

    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING]: Model saved.")
def extract_features_from_website_list_ray(data):
    """ extract all features from website list, ray used for parallelism """

    @ray.remote
    def get_feature_entry(entry):
        new_entry = extract_features_from_website(entry.url, entry.label, False)

        if new_entry is not None:
            return new_entry

    try:
        result_ids = []
        for entry in data:
            result_ids.append(get_feature_entry.remote(entry))

        feature_list = ray.get(result_ids)
    except KeyboardInterrupt:
        log(action_logging_enum=ERROR,
            logging_text="Process interrupted by keyboard signal. Returning the list.")
        feature_list = ray.get(result_ids)
        return feature_list

    return feature_list
def create_brand_list_by_alexa():
    alexa_list = open_dataset_XML_file(filename=ALEXA_FILE, iterateable="entry", label="Benign",
                                       url_label="url", label_label="label")
    brand_list = []

    for entry in alexa_list:
        url = entry.url
        brand_list.append(tldextract.extract(url).domain)

    root = et.Element("data")
    tree = et.ElementTree(root)

    for i in range(len(brand_list)):
        entry = et.SubElement(root, "entry")
        entrylabel = et.SubElement(entry, "brandname")
        entrylabel.text = str(brand_list[i])

    tree.write(open(DATA_PATH + BRAND_FILE, 'wb'), pretty_print=True)
    log(action_logging_enum=INFO,
        logging_text="Write process to XML finished for [{f}].".format(f=BRAND_FILE))
def train_model(do_optimize=False, data=pd.DataFrame()):
    log_module_start(MODULE_NAME=MODEL_NAME)

    if len(data) == 0:
        data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)

    data = transform_data(data)
    train, test = train_test_split(data, test_size=0.2)
    pd.set_option('display.max_columns', None)

    y = data['Label']
    x = data.drop(['Label'], axis=1).values

    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING] Data ready for use.")

    if do_optimize:
        optimize()

    params = {'n_estimators': 120, 'random_state': 0}

    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING] Starting training.")
    adaptive_boosting = AdaBoostClassifier()  # AdaBoostClassifier(**params)

    f1 = print_scores(adaptive_boosting, x, y)

    y_train = train['Label']
    x_train = train.drop(['Label'], axis=1)

    # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10,
    # max_features='sqrt', max_depth=17
    adaptive_boosting.fit(x_train, y_train)

    save_model(adaptive_boosting=adaptive_boosting)
    log_module_complete(MODULE_NAME=MODEL_NAME)

    return f1
def optimize():
    log(action_logging_enum=INFO,
        logging_text="[SUPPORT VECTOR MACHINE]: Starting to search for best C, gamma, kernel "
                     "by cross validating different values.")

    # read data
    train = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)
    train = transform_data(train)

    x_train = train.drop(['Label'], axis=1)
    y_train = train['Label'].copy()

    param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]

    # create grid search using 10-fold cross validation
    model = SVC()
    clf = GridSearchCV(model, param_grid, cv=10, verbose=0)
    best_model = clf.fit(x_train, y_train)

    # view best hyperparameters
    log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE]: Hyperparameter tuning completed.")
    log(INFO, 'Best Kernel: {}'.format(best_model.best_estimator_.get_params()['kernel']))
    log(INFO, 'Best C: {}'.format(best_model.best_estimator_.get_params()['C']))
    log(INFO, 'Best gamma: {}'.format(best_model.best_estimator_.get_params()['gamma']))
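# Hedged follow-up sketch (not part of the original module): GridSearchCV refits the estimator
# with the winning parameters, so the fitted search object already carries a tuned SVC. A
# minimal variant that returns it to the caller instead of only logging the values, assuming
# the same data preparation as optimize() above:
def optimize_and_return():
    train = transform_data(pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE))
    x_train = train.drop(['Label'], axis=1)
    y_train = train['Label'].copy()

    param_grid = [
        {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
        {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    best_model = GridSearchCV(SVC(), param_grid, cv=10, verbose=0).fit(x_train, y_train)

    return best_model.best_estimator_, best_model.best_params_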
def save_model(extreme_gradient):
    with open(SAVED_MODEL_FILE, 'wb') as file:
        pickle.dump(extreme_gradient, file)

    log(action_logging_enum=INFO, logging_text="[EXTREME GRADIENT BOOSTING]: Model saved.")
def load_model():
    # load model from file
    support_vector_machine = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE]: Model loaded.")

    return support_vector_machine
def load_model():
    # load model from file
    decision_tree = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    log(action_logging_enum=INFO, logging_text="[DECISION TREE]: Model loaded.")

    return decision_tree
def get_feature_contribution_subset(feature_, y_, coalition_, coal_size):
    # get contribution to coalition for feature_ (delta for feature i and coalition K_i)
    mutual_information = neighbor_mutual_information_coalition(coalition_, y_, feature_)

    if mutual_information < 0:
        log(INFO, "Return contribution.")
        return 0

    log(INFO, "Contribution bigger than 0.")

    # check for all features S in coalition and feature_ R -> NMI(S ; D | R) > NMI(S ; D)
    related = 0
    is_related = False

    for i in range(coal_size):
        nmi_cj_y_ci = neighbor_mutual_information_coalition(coalition_.iloc[:, i], y_, feature_)
        nmi_cj_y = neighbor_mutual_information_coalition(coalition_.iloc[:, i], y_, less=True)

        if nmi_cj_y_ci > nmi_cj_y:
            related += 1

    if related >= (coal_size / 2):
        log(INFO, "Contribution is related.")
        is_related = True

    if not is_related:
        log(INFO, "Return contribution.")
        return 0

    log(INFO, "Return contribution.")
    return 1
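# Added note (not original code): get_feature_contribution_subset above is a binary
# contribution test. It returns 0 when the coalition's neighborhood mutual information with
# the label given feature_ is negative, and otherwise returns 1 only if, for at least half of
# the coalition members S, NMI(S ; D | feature_) exceeds NMI(S ; D), i.e. conditioning on
# feature_ makes the coalition more informative about the label D.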
def get_input_to_lower():
    prompt = input("type a command:")
    prompt = prompt.lower()

    # log system commands
    if prompt != "":
        log(action_logging_enum=INFO, logging_text="--> " + prompt)

    return prompt
def do_get(i_, y, x, combinations, feature, c, lene):
    res = get_feature_contribution_subset(y_=y, coalition_=x.iloc[:, combinations],
                                          feature_=feature, coal_size=len(combinations))
    log(INFO, "Processed coalition {} of {}.".format(c, lene))

    return res
def load_model():
    try:
        # load model from file
        random_forest = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
        log(action_logging_enum=INFO, logging_text="[RANDOM FOREST]: Model loaded.")

        return random_forest
    except Exception:
        return None
def load_model():
    # load model from file
    try:
        adaptive_boosting = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    except Exception:
        return None

    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING]: Model loaded.")
    return adaptive_boosting
def load_model():
    # load model from file
    try:
        extreme_gradient = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    except Exception:
        return None

    log(action_logging_enum=INFO, logging_text="[EXTREME GRADIENT BOOSTING]: Model loaded.")
    return extreme_gradient
def open_dataset_XML_file(filename, iterateable, label=None, url_label="url", label_label=None, max_line_count=-1):
    """
    open xml file and write to list

    filename: xml filename
    iterateable: node containing data
    label: label for all data
    url_label: url specifier in xml data
    label_label: label specifier in xml data
    max_line_count: maximum entries to be written to list
    """
    PATH = ""

    if not os.path.isfile(DATA_PATH + filename):
        log(action_logging_enum=WARNING, logging_text="File [{f}] does not exist.".format(f=filename))
        log(action_logging_enum=INFO, logging_text="Trying in backup folder.")

        if not os.path.isfile(DATA_BACKUP_PATH + filename):
            log(action_logging_enum=ERROR,
                logging_text="File [{f}] does not exist in backup folder either.".format(f=filename))
            return None
        else:
            PATH = DATA_BACKUP_PATH
            log(action_logging_enum=INFO, logging_text="Found in backup folder.")
    else:
        PATH = DATA_PATH

    datalist = []
    parser = et.XMLParser(strip_cdata=False)
    xtree = et.parse(PATH + filename, parser=parser)
    root = xtree.getroot()
    index = 1

    for entry in root.iter(iterateable):
        url = entry.find(url_label).text
        e = Entry(entry.find(label_label).text, url) if label_label is not None else Entry(label, url)
        datalist.append(e)

        if index == max_line_count:
            break

        index += 1

    log(action_logging_enum=INFO, logging_text="XML File filled in list. FILE: [{f}].".format(f=filename))

    return datalist
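# Hedged usage sketch (not part of the original module): open_dataset_XML_file expects nodes
# that carry the url (and optionally the label) as child elements, e.g.
#   <data>
#     <entry><url>https://example.com</url><label>Benign</label></entry>
#   </data>
# The call below mirrors the one in create_brand_list_by_alexa above; ALEXA_FILE stands in for
# any dataset file under DATA_PATH.
def example_open_xml():
    entries = open_dataset_XML_file(filename=ALEXA_FILE, iterateable="entry",
                                    label="Benign", url_label="url", label_label="label")
    if entries is not None:
        log(action_logging_enum=INFO, logging_text="Loaded {} entries.".format(len(entries)))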
def load_model():
    # load model from file
    try:
        knn = pickle.load(open(SAVED_MODEL_FILE, 'rb'))
    except Exception:
        return None

    log(action_logging_enum=INFO, logging_text="[K-NEAREST NEIGHBOR]: Model loaded.")
    return knn
def delete_data(filename):
    """ delete file for param filename in data_path """
    if os.path.isfile(DATA_PATH + filename):
        os.remove(DATA_PATH + filename)
        log(action_logging_enum=INFO, logging_text="File [{f}] deleted.".format(f=DATA_PATH + filename))
    else:
        log(action_logging_enum=WARNING, logging_text="File [{f}] does not exist.".format(f=filename))
def save_last_score(prec_train, prec_test, recall_train, recall_test, f1_train, f1_test):
    with open(SCORE_FILE, 'w') as file:
        file.write("Precision Train: " + str(prec_train) + "\n")
        file.write("Precision Test: " + str(prec_test) + "\n")
        file.write("Recall Train: " + str(recall_train) + "\n")
        file.write("Recall Test: " + str(recall_test) + "\n")
        file.write("F1-Score Train: " + str(f1_train) + "\n")
        file.write("F1-Score Test: " + str(f1_test) + "\n")

    log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING]: score saved in [{f}]".format(f=SCORE_FILE))
def download_file(url, filename):
    """
    download database file from url and save by filename

    url: url where data has to be downloaded
    filename: filename for data to be saved
    """
    response = requests.get(url, timeout=10, headers=headers)

    with open(DATA_PATH + filename, 'wb') as file:
        file.write(response.content)

    log(action_logging_enum=INFO, logging_text="Download completed for [{f}].".format(f=filename))
def merge_dataframes(dataframe):
    nonlocal df

    try:
        if not pd.DataFrame(dataframe).empty:
            if df.empty:
                df = dataframe
            else:
                df = pd.concat([df, dataframe], ignore_index=True)
    except Exception as e:
        log(action_logging_enum=WARNING, logging_text=str(e))
def fetch_results(search_term, number_results, language_code):
    escaped_search_term = search_term.replace(' ', '+')
    google_url = 'https://www.google.de/search?q={}&num={}&hl={}'.format(
        escaped_search_term, number_results + 1, language_code)

    try:
        response = requests.get(google_url, headers=usr_agent, timeout=10)
        response.raise_for_status()
    except Exception:
        try:
            time.sleep(2)
            response = requests.get(google_url, headers=usr_agent, timeout=10)
        except Exception as e:
            log(action_logging_enum=ERROR,
                logging_text="[Function search]: An error occurred while querying the Google API.")
            log(action_logging_enum=INFO,
                logging_text="[Function search]: Error description: {err}".format(err=str(e)))
            return -1

    status_code = int(response.status_code)

    if 200 <= status_code < 400:
        return response.content

    if wait_after_429:
        while status_code < 200 or status_code >= 400:
            log(action_logging_enum=WARNING,
                logging_text="Google Search returned Status Code 429. Next check in: 5 minutes.")
            time.sleep(300)

            try:
                response = requests.get(google_url, headers=usr_agent, timeout=10)
                status_code = int(response.status_code)
            except Exception as e:
                log(action_logging_enum=ERROR, logging_text="An error occurred while querying the Google API.")
                log(action_logging_enum=INFO, logging_text="Error description: {err}".format(err=str(e)))

        return response.content

    return -1