from time import time import keras.backend as K import numpy as np import argparse from keras.preprocessing.sequence import pad_sequences from utils.clean_data import process_legit_phish_data from utils.preprocess_data import split_data, extract_labels, create_vocab, read_dataset, create_char_vocab from utils.general_utils import get_logger, padding_email_sequences, load_word_embedding_dict, build_embedd_table from models.baseline_models import build_lstm, build_cnn from evaluators.evaluator_word_only import Evaluator logger = get_logger("Train ...") def main(): parser = argparse.ArgumentParser(description="Word and char themis model") parser.add_argument('--num_epochs', type=int, default=20, help='Number of epochs for training') parser.add_argument('--batch_size', type=int, default=16, help='Number of emails in each batch') parser.add_argument('--embedding', type=str, default='glove', help='Word embedding type, word2vec, senna or glove') parser.add_argument('--embedding_dim', type=int, default=50, help='Dimension of embedding') parser.add_argument('--embedding_path', type=str, default='embeddings/glove.6B.50d.txt', help='Path to embedding vec file') parser.add_argument('--baby', action='store_true', help='Set to True for small data quantity for debug') parser.add_argument('--seed', type=int, default=42, help='Set seed for data split') parser.add_argument('--legit_path', type=str, default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/legit/', help='Path to legit emails folder') parser.add_argument('--phish_path', type=str, default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/phish/', help='Path to phish emails folder') parser.add_argument('--model_name', type=str, choices=['lstm', 'cnn'], required=True) args = parser.parse_args() legit_path = args.legit_path phish_path = args.phish_path embedding_path = args.embedding_path
from sklearn.model_selection import train_test_split from utils.general_utils import get_logger logger = get_logger("Pre-process data ...") def split_data(data, random_state=42): train, dev_test = train_test_split(data, test_size=0.4, random_state=random_state) dev, test = train_test_split(dev_test, test_size=0.5, random_state=random_state) return train, dev, test def extract_labels(data): x_data = [] y_data = [] for item in data: x_data.append(item[0]) y_data.append(item[1]) return x_data, y_data def create_vocab(data, vocab_size, to_lower): logger.info('Creating vocabulary') total_words, unique_words = 0, 0 word_freqs = {} for content in data: if to_lower:
import pandas as pd import re import numpy as np import argparse from time import time from sklearn.model_selection import train_test_split from keras.preprocessing.text import Tokenizer from keras.preprocessing.sequence import pad_sequences from models.themis_models import build_simple_themis from evaluators.evaluator_sentiment_word_only import Evaluator from utils.clean_data import process_legit_phish_data from utils.preprocess_data import split_data, extract_labels, create_vocab, read_dataset from utils.general_utils import get_logger, padding_email_sequences, load_word_embedding_dict, build_embedd_table TAG_RE = re.compile(r'<[^>]+>') logger = get_logger("Train sent class...") def remove_tags(text): return TAG_RE.sub('', text) def preprocess_text(sen): sentence = remove_tags(sen) sentence = re.sub('[^a-zA-Z]', ' ', sentence) sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) sentence = re.sub(r'\s+', ' ', sentence) return sentence def main():
import numpy as np from sklearn.metrics import accuracy_score from utils.metrics import precision, recall, f1_score, false_positive_rate from utils.general_utils import get_logger logger = get_logger("Evaluate ...") class Evaluator: def __init__(self, model, X_train, X_dev, X_test, Y_train, Y_dev, Y_test, batch_size): self.model = model self.batch_size = batch_size self.X_train, self.X_dev, self.X_test = X_train, X_dev, X_test self.Y_train, self.Y_dev, self.Y_test = Y_train, Y_dev, Y_test self.best_dev_acc = -1 self.best_dev_precision = -1 self.best_dev_recall = -1 self.best_dev_f1 = -1 self.best_dev_false_pos_rate = -1 self.best_test_acc = -1 self.best_test_precision = -1 self.best_test_recall = -1 self.best_test_f1 = -1 self.best_test_false_pos_rate = -1 self.dev_acc = None