def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) pd.set_option('display.max_columns', None) if len(data) == 0: data = pd.DataFrame(pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE)) data = transform_data(data) data = data.replace("True", 1) data = data.replace("False", 0) y = data['Label'].copy() x = data.drop(["Label"], axis=1).copy() train, test = train_test_split(data, test_size=0.35) log(action_logging_enum=INFO, logging_text="[K-NEAREST NEIGHBOR] Data ready for use.") y_train = train['Label'].copy() x_train = train.drop(["Label"], axis=1).copy() params = {'n_neighbor': 9} knn = KNeighborsClassifier() # params) f1 = print_scores(knn, x, y) log(action_logging_enum=INFO, logging_text="[K-NEAREST NEIGHBOR] Starting training.") knn.fit(x_train, y_train) save_model(knn=knn) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) if len(data) == 0: data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE) data = transform_data(data) train, test = train_test_split(data, test_size=0.2) pd.set_option('display.max_columns', None) y = data['Label'] x = data.drop(['Label'], axis=1).values log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING] Data ready for use.") if do_optimize == True: optimize() params = {'n_estimators': 120, 'random_state': 0} log(action_logging_enum=INFO, logging_text="[ADAPTIVE BOOSTING] Starting training.") adaptive_boosting = AdaBoostClassifier() #params) f1 = print_scores(adaptive_boosting, x, y) y_train = train['Label'] x_train = train.drop(['Label'], axis=1) # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10, # max_features='sqrt', max_depth=17 adaptive_boosting.fit(x_train, y_train) save_model(adaptive_boosting=adaptive_boosting) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) if len(data) == 0: return None data = transform_data(data) train, test = train_test_split(data, test_size=0.2) pd.set_option('display.max_columns', None) y = data['Label'] x = data.drop(['Label'], axis=1).values log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Data ready for use.") if do_optimize == True: optimize() params = { 'n_estimators': 800, 'max_features': 6, 'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 4 } log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Starting training.") random_forest = RandomForestClassifier(n_estimators=700, max_features='auto', min_samples_leaf=2, min_samples_split=3) f1 = print_scores(random_forest, x, y) # 'n_estimators': 1400, # 'max_features': 'sqrt', # 'max_depth': 20, # 'min_samples_leaf': 2, # 'min_samples_split': 4 #} # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10, # max_features='sqrt', max_depth=17 y_train = train['Label'] x_train = train.drop(['Label'], axis=1).values random_forest.fit(x_train, y_train) save_model(random_forest=random_forest) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): # log training starting log_module_start(MODULE_NAME=MODEL_NAME) # read data and split into test and train if len(data) == 0: print(len(data)) data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE) # transform data data = transform_data(data) train, test = train_test_split(data, test_size=0.2) pd.set_option('display.expand_frame_repr', False) # display all columns with head() pd.set_option('display.max_columns', None) y = data['Label'] x = data.drop(['Label'], axis=1).values log(action_logging_enum=INFO, logging_text="[DECISION TREE]: Data ready for use.") # divide data to inputs (x) and labels (y) y_train = train['Label'] x_train = train.drop(['Label'], axis=1).values if do_optimize == True: optimize() params = { 'min_samples_split': 3, 'min_samples_leaf': 1, 'random_state': 42, 'class_weight': 'balanced' } log(action_logging_enum=INFO, logging_text="[DECISION TREE]: Starting training.") # create classifier with specifications decision_tree = tree.DecisionTreeClassifier() #params) f1 = print_scores(decision_tree, x, y) decision_tree.fit(x_train, y_train) save_model(decision_tree=decision_tree) # log train complete log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) if len(data) == 0: data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE) data = transform_data(data) y = data['Label'] x = data.drop(['Label'], axis=1).values train, test = train_test_split(data, test_size=0.2) pd.set_option('display.max_columns', None) if do_optimize == True: optimize() log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE] Data ready for use.") # support vector machine g = 0.1 c = 0.1 params = { 'kernel': 'linear', 'random_state': 0, 'gamma': g, 'C': c } support_vector_machine = SVC()#params) # params=params) log(action_logging_enum=INFO, logging_text="[SUPPORT VECTOR MACHINE] Starting training.") f1 = print_scores(support_vector_machine=support_vector_machine, x=x, y=y) # , params=params) y_train = train['Label'] x_train = train.drop(['Label'], axis=1).values support_vector_machine.fit(x_train, y_train) save_model(support_vector_machine=support_vector_machine) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) pd.set_option('display.max_columns', None) if len(data) == 0: data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE) data = transform_data(data) y = data['Label'] x = data.drop(["Label"], axis=1) train, test = train_test_split(data, test_size=0.2) log(action_logging_enum=INFO, logging_text="[LOGISTIC REGRESSION]: Data ready for use.") params = { 'random_state': 1, 'C': 0.1 } logistic_regression = LogisticRegression()#params) # random_state=1, C=0.10) f1= print_scores(logistic_regression, x, y) if do_optimize == True: optimize() log(action_logging_enum=INFO, logging_text="[LOGISTIC REGRESSION]: Starting training.") y_train = train['Label'] x_train = train.drop(["Label"], axis=1) scaler = MinMaxScaler(feature_range=(0, 1)) scaler.fit(x_train) x_train = pd.DataFrame(scaler.transform(x_train)) logistic_regression.fit(x_train, y_train) save_model(logistic_regression=logistic_regression) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def run(content=False, lexical=False, signature=False, val_sets=False): # log module start log_module_start(MODULE_NAME=MODULE_NAME) content_feature_list = [] lexical_feature_list = [] # generate validation sets of all three databases if val_sets: generate_validation_sets() return # open data file and write to list (created in component database) data_list = open_dataset_XML_file(filename=DATABASE, iterateable="entry", label_label="label", url_label="url") if data_list == None: log(action_logging_enum=WARNING, logging_text= "[MODULE FEATURE EXTRACTION]: CSV File [data.csv] was not found. returning ..." ) return # binarize labels data_list = binarize_labels(data_list) log(action_logging_enum=INFO, logging_text="[MODULE FEATURE EXTRACTION]: Labels binarized") # create feature_list with FeatureEntries for all urls in list if lexical: # create lexical feauture list lexical_feature_list = f.extract_features_from_URL_list(data=data_list) if content: # extract content based features using ray # list is saved for each 1000 entries since the extraction lasts about 2 hours process = True index = 6000 append = False last_index = 5967 if index == 0: delete_data(filename=CONTENT_FEATURE_DATABASE) ray.init(num_cpus=6) while process: end_index = index + 1000 if end_index >= len(data_list): end_index = len(data_list) - 1 process = False if index > 0: append = True copy_data = data_list[index:end_index] content_feature_list = f.extract_features_from_website_list_ray( data=copy_data) if not len(content_feature_list) > 0: log( ERROR, "[MODULE FEATURE EXTRACTION]: Error while creating feature list for content filter. The list is empty" ) process = False break last_index += 1 last_index = write_content_features_CSV( feature_list=content_feature_list, append=append, new_index=last_index) log( INFO, "[MODULE FEATURE EXTRACTION]: Feature list for content filter was writen." ) index += 1000 log( INFO, "[MODULE FEATURE EXTRACTION]: Feature list for content filter was writen. (Next for index: {}" .format(index)) ray.shutdown() if signature: # extract features for signature based filter ray.init(num_cpus=6) signature_feature_list = f.extract_features_from_signature_list( data=data_list) ray.shutdown() write_signature_features_CSV(feature_list=signature_feature_list) # feature extraction completeted log(action_logging_enum=INFO, logging_text= "[MODULE FEATURE EXTRACTION]: Feature extraction completed.") # check whether the list has entries if len(lexical_feature_list) > 0 and lexical: log( INFO, "[MODULE FEATURE EXTRACTION]: Feature list for lexical filter successfully created." ) delete_data(filename=LEXICAL_FEATURE_DATABASE) # write lexical_feature_list to csv file write_lexical_features_CSV(feature_list=lexical_feature_list) elif lexical: log( ERROR, "[MODULE FEATURE EXTRACTION]: Error while creating feature list for lexical filter. The list is empty" ) # log module completion log_module_complete(MODULE_NAME=MODULE_NAME)
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) if len(data) == 0: data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE) data = transform_data(data) for index, col in enumerate(data.columns): if data[col].dtype == np.bool: name = data.iloc[:, index].name if name != "Label": data[name] = data[name].astype(int) y = data['Label'] x = data.drop(['Label'], axis=1).values train, test = train_test_split(data, test_size=0.2) train_y = train['Label'] train_x = train.drop(['Label'], axis=1).values test_y = test['Label'] test_x = test.drop(['Label'], axis=1).values pd.set_option('display.max_columns', None) log(action_logging_enum=INFO, logging_text="[EXTREME GRADIENT BOOSTING] Data ready for use.") if do_optimize == True: optimize(train_x, train_y, test_x, test_y) params = { 'silent': False, 'scale_pos_weight': 1, 'use_label_encoder': False, 'learning_rate': 0.04, 'colsample_bytree': 0.7, 'subsample': 0.7, 'n_estimators': 700, 'reg_alpha': 0.3, 'max_depth': 5, 'gamma': 10 } extreme_gradient = XGBClassifier(silent=False, scale_pos_weight=1, use_label_encoder=False, learning_rate=0.04, colsample_bytree=0.7, subsample=0.7, n_estimators=700, reg_alpha=0.3, max_depth=5, gamma=10, enable_categorical=True) f1 = print_scores(extreme_gradient, x, y) y_train = train['Label'] x_train = train.drop(['Label'], axis=1) for index, col in enumerate(data.columns): if data[col].dtype == np.bool: name = data.iloc[:, index] data[name] = data[name].astype(int) extreme_gradient.fit(x_train, y_train) save_model(extreme_gradient=extreme_gradient) # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10, # max_features='sqrt', max_depth=17 log(action_logging_enum=INFO, logging_text="[EXTREME GRADIENT BOOSTING] Starting training.") log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) if len(data) == 0: data = pd.read_csv(DATA_PATH + LEXICAL_FEATURE_DATABASE) data = transform_data(data) data = data[[ 'Label', 'Entropy', 'Ratio Netloc/URL', 'Length URL', 'Ratio Digit/Letter', 'Ratio Path/URL', 'Has HTTPS', 'Length Netloc', 'KL Divergence', 'Ratio Vowel/Consonant', 'Number Symbols', 'Number Dots', 'Number Tokens Netloc', 'Number Digits Path', 'Ratio Cap/NonCap', 'Number Dash', 'Number Dash Netloc', 'Has Token Netloc', 'Number Slash Path', 'Ratio Query/URL', 'Number Digits Netloc', 'Number Redirects', 'Number PhishyTokens Path', 'Has Digits Netloc', 'Number Query Parameters', 'Number Dots Netloc', 'Has Query', 'Number Equals', 'Number Semicolon', 'Number Ampersand', 'Cert Created Shortly', 'Number Stars' ]] train, test = train_test_split(data, test_size=0.2) pd.set_option('display.max_columns', None) y = data['Label'] x = data[[ 'Entropy', 'Ratio Netloc/URL', 'Length URL', 'Ratio Digit/Letter', 'Ratio Path/URL', 'Has HTTPS', 'Length Netloc', 'KL Divergence', 'Ratio Vowel/Consonant', 'Number Symbols', 'Number Dots', 'Number Tokens Netloc', 'Number Digits Path', 'Ratio Cap/NonCap', 'Number Dash', 'Number Dash Netloc', 'Has Token Netloc', 'Number Slash Path', 'Ratio Query/URL', 'Number Digits Netloc', 'Number Redirects', 'Number PhishyTokens Path', 'Has Digits Netloc', 'Number Query Parameters', 'Number Dots Netloc', 'Has Query', 'Number Equals', 'Number Semicolon', 'Number Ampersand', 'Cert Created Shortly', 'Number Stars' ]] log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Data ready for use.") if do_optimize == True: optimize() params = { 'n_estimators': 800, 'max_features': 6, 'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 4 } log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Starting training.") random_forest = RandomForestClassifier( ) #n_estimators=1400, max_features='sqrt', min_samples_leaf=2, min_samples_split=4) f1 = print_scores(random_forest, x, y) # 'n_estimators': 1400, # 'max_features': 'sqrt', # 'max_depth': 20, # 'min_samples_leaf': 2, # 'min_samples_split': 4 #} # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10, # max_features='sqrt', max_depth=17 y_train = train['Label'] x_train = train.drop(['Label'], axis=1).values random_forest.fit(x_train, y_train) save_model(random_forest=random_forest) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def train_model(do_optimize=False, data=pd.DataFrame()): log_module_start(MODULE_NAME=MODEL_NAME) if len(data) == 0: data = pd.read_csv(DATA_PATH + CONTENT_FEATURE_DATABASE) data = transform_data(data) data = data[[ 'Label', 'Ratio Similarity', 'Ratio Description Sim', 'Number HREF', 'Number DIV', 'Number LI', 'Ratio Title Sim', 'Number Span', 'Number UL', 'Has Bond Status', 'Number Image', 'Ratio Copyright Sim', 'Number PhishyTokens', 'Number Extern Links', 'Number Button', 'Number Inputs', 'Number Paragr', 'Ratio Unique Links', 'Has Freq Domain Extern', 'Has Copyright', 'Has Button', 'Has Redirect', 'Has iFrame', 'Has Extern Content', 'Has Meta', 'Has Input', 'Number Option', 'Has Action', 'Number OL', 'Number TR', 'Has Hidden Element', 'Number Checkbox' ]] train, test = train_test_split(data, test_size=0.2) pd.set_option('display.max_columns', None) y = data['Label'] x = data.drop(['Label'], axis=1) log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Data ready for use.") if do_optimize == True: optimize() #Best estimators: 1400 #Best samples leaf: 1 #Best samples split: 2 #Best features: sqrt #Best depth: 21 params = { 'n_estimators': 1400, 'max_features': 'sqrt', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 4 } log(action_logging_enum=INFO, logging_text="[RANDOM FOREST] Starting training.") random_forest = RandomForestClassifier( ) #n_estimators=600, max_features='auto', min_samples_leaf=1, min_samples_split=2, max_depth=None) f1 = print_scores(random_forest, x, y) # random_state=1, n_estimators=120, min_samples_leaf=5, min_samples_split=10, # max_features='sqrt', max_depth=17 y_train = train['Label'] x_train = train[[ 'Ratio Similarity', 'Ratio Description Sim', 'Number HREF', 'Number DIV', 'Number LI', 'Ratio Title Sim', 'Number Span', 'Number UL', 'Has Bond Status', 'Number Image', 'Ratio Copyright Sim', 'Number PhishyTokens', 'Number Extern Links', 'Number Button', 'Number Inputs', 'Number Paragr', 'Ratio Unique Links', 'Has Freq Domain Extern', 'Has Copyright', 'Has Button', 'Has Redirect', 'Has iFrame', 'Has Extern Content', 'Has Meta', 'Has Input', 'Number Option', 'Has Action', 'Number OL', 'Number TR', 'Has Hidden Element', 'Number Checkbox' ]] random_forest.fit(x_train, y_train) save_model(random_forest=random_forest) log_module_complete(MODULE_NAME=MODEL_NAME) return f1
def run(do_download_alexa=False, do_download_phish=False, do_query_alexa=False, check_status_phishing=False, check_status_benign=False): log_module_start(MODULE_NAME=MODULE_NAME) ################ ALEXA LIST ################## # download all list if do_download_alexa==True: db.download_file("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip", "alexa.csv.zip") db.extract_from_Zip(compressed_name="alexa.csv.zip", target_dir=DATA_PATH, new_name="alexa.csv") # read lists from file if do_download_alexa: alexa_list = db.open_dataset_CSV_file(filename="alexa.csv", pos_url=1, label="Benign", max_line_count=16000) else: alexa_list = db.open_dataset_XML_file(ALEXA_FILE, iterateable="entry", label="Benign", url_label="url") if do_query_alexa == True: alexa_list = db.crawl_list_login_page(data=alexa_list, selenium_analysis=False, number_threads=10) # delete downloaded file if do_download_alexa: db.delete_data("alexa.csv.zip") db.move_file("alexa.csv") ################ PHISHTANK LIST ################## # download from phishtank -> DEVELOPER KEY NEEDED if do_download_phish == True: db.download_file( "http://data.phishtank.com/data/[developer key needed]/online-valid.xml", PHISHTANK_FILE) # write extracted list to XML if not alexa_list == None: db.write_list_to_XML(filename=ALEXA_FILE, root="data", list1=alexa_list) # open downloaded phishtank file phishtank_list = db.open_dataset_XML_file(filename=PHISHTANK_FILE, iterateable="entry", label="Phish") # check if websites in list are reachable if check_status_phishing: phishtank_list = db.check_status_of_website(phishtank_list) if check_status_benign: alexa_list = db.check_status_of_website(alexa_list) # make balanced list for same number of phishing and benign entries if len(phishtank_list) != len(alexa_list): if len(phishtank_list) > len(alexa_list): diff = len(phishtank_list) - len(alexa_list) for i in range(diff): phishtank_list.pop(0) else: diff = len(alexa_list) - len(phishtank_list) for i in range(diff): alexa_list.pop(0) # write phishtank file to XML file if not phishtank_list == None: db.write_list_to_XML(filename=PHISHTANK_FILE, root="data", list1=phishtank_list) # kaggle database available at: https://www.kaggle.com/kunal4892/phishingandlegitimateurls # kaggle_list = db.openCSVFile(filename="kaggle.csv", pos_url=0, pos_label=11) #db.deleteData("kaggle.csv") # if not kaggle_list == None: db.writeListtoXML(filename=KAGGLE_FILE, root="data", list=kaggle_list) ################ FINAL LIST ################## # create mix of phishtank and alexa list final_list = db.mix_lists_randomly(alexa_list, phishtank_list) # safe final list to XML for feature extraction if not final_list == None: db.write_list_to_XML(filename=DATABASE, root="data", list1=final_list) log_module_complete(MODULE_NAME=MODULE_NAME)