def main():
    parser = argparse.ArgumentParser(description="Word and char themis model")
    parser.add_argument('--num_epochs', type=int, default=20,
                        help='Number of epochs for training')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Number of emails in each batch')
    parser.add_argument('--embedding', type=str, default='glove',
                        help='Word embedding type: word2vec, senna or glove')
    parser.add_argument('--embedding_dim', type=int, default=50,
                        help='Dimension of embedding')
    parser.add_argument('--embedding_path', type=str,
                        default='embeddings/glove.6B.50d.txt',
                        help='Path to embedding vec file')
    parser.add_argument('--baby', action='store_true',
                        help='Use a small data subset for debugging')
    parser.add_argument('--seed', type=int, default=42,
                        help='Seed for the data split')
    parser.add_argument('--legit_path', type=str,
                        default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/legit/',
                        help='Path to legit emails folder')
    parser.add_argument('--phish_path', type=str,
                        default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/phish/',
                        help='Path to phish emails folder')
    parser.add_argument('--model_name', type=str, choices=['lstm', 'cnn'],
                        required=True)
    args = parser.parse_args()

    legit_path = args.legit_path
    phish_path = args.phish_path
    embedding_path = args.embedding_path
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    epochs = args.num_epochs
    batch_size = args.batch_size
    baby = args.baby
    seed = args.seed
    model_name = args.model_name

    # Load, optionally subsample, and split the email corpus.
    all_data = process_legit_phish_data(legit_path=legit_path,
                                        phish_path=phish_path)
    if baby:
        all_data = all_data[:100]
    # Pass the seed through so --seed actually controls the split
    # (it was parsed but never used).
    train, dev, test = split_data(all_data, random_state=seed)
    x_train, y_train = extract_labels(train)
    x_dev, y_dev = extract_labels(dev)
    x_test, y_test = extract_labels(test)

    # Build the vocabulary on training data only, then index all splits.
    vocab = create_vocab(x_train, vocab_size=20000, to_lower=True)
    x_train, max_token_train = read_dataset(x_train, vocab, to_lower=True)
    x_dev, max_token_dev = read_dataset(x_dev, vocab, to_lower=True)
    x_test, max_token_test = read_dataset(x_test, vocab, to_lower=True)
    max_token = max(max_token_train, max_token_dev, max_token_test)
    logger.info('Max tokens train: {}'.format(max_token_train))
    logger.info('Max tokens dev: {}'.format(max_token_dev))
    logger.info('Max tokens test: {}'.format(max_token_test))
    logger.info('Max tokens: {}'.format(max_token))

    X_train, Y_train, train_mask = padding_email_sequences(
        x_train, y_train, max_token, post_padding=True)
    X_dev, Y_dev, dev_mask = padding_email_sequences(
        x_dev, y_dev, max_token, post_padding=True)
    X_test, Y_test, test_mask = padding_email_sequences(
        x_test, y_test, max_token, post_padding=True)
    logger.info('X train shape: {}'.format(X_train.shape))
    logger.info('X dev shape: {}'.format(X_dev.shape))
    logger.info('X test shape: {}'.format(X_test.shape))
    logger.info('Y train shape: {}'.format(Y_train.shape))
    logger.info('Y dev shape: {}'.format(Y_dev.shape))
    logger.info('Y test shape: {}'.format(Y_test.shape))

    # Default so embed_table is always defined; the original left it unbound
    # when no pretrained embedding was given.
    embed_table = None
    if embedding_path:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = build_embedd_table(vocab, embedd_dict, embedd_dim,
                                           logger, caseless=True)
    else:
        embedd_matrix = None
    if embedd_matrix is not None:
        embedd_dim = embedd_matrix.shape[1]
        embed_table = [embedd_matrix]

    if model_name == 'lstm':
        model = build_lstm(vocab, max_token, embedd_dim, embed_table)
    elif model_name == 'cnn':
        model = build_cnn(vocab, max_token, embedd_dim, embed_table)
    else:
        raise NotImplementedError

    evaluator = Evaluator(model, X_train, X_dev, X_test,
                          Y_train, Y_dev, Y_test, batch_size)
    logger.info("Initial evaluation: ")
    evaluator.predict()
    evaluator.print_eval()

    logger.info("Train model")
    for ii in range(epochs):
        logger.info('Epoch %s/%s' % (str(ii + 1), epochs))
        start_time = time()
        model.fit(X_train, Y_train, batch_size=batch_size, epochs=1,
                  verbose=0, shuffle=True)
        tt_time = time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evaluator.predict()
        evaluator.print_eval()
    evaluator.print_final_eval()
def main():
    parser = argparse.ArgumentParser(description="Word themis model")
    parser.add_argument('--num_epochs', type=int, default=20,
                        help='Number of epochs for training')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Number of emails in each batch')
    parser.add_argument('--embedding', type=str, default='glove',
                        help='Word embedding type: word2vec, senna or glove')
    parser.add_argument('--embedding_dim', type=int, default=50,
                        help='Dimension of embedding')
    parser.add_argument('--embedding_path', type=str,
                        default='embeddings/glove.6B.50d.txt',
                        help='Path to embedding vec file')
    parser.add_argument('--seed', type=int, default=42,
                        help='Seed for the data split')
    parser.add_argument('--legit_path', type=str,
                        default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/legit/',
                        help='Path to legit emails folder')
    parser.add_argument('--phish_path', type=str,
                        default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/phish/',
                        help='Path to phish emails folder')
    parser.add_argument('--baby', action='store_true',
                        help='Use a small data subset for debugging')
    args = parser.parse_args()

    epochs = args.num_epochs
    batch_size = args.batch_size
    embedding_path = args.embedding_path
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    legit_path = args.legit_path
    phish_path = args.phish_path
    seed = args.seed
    baby = args.baby

    # The vocabulary comes from the phishing training split; the model itself
    # is pretrained on IMDB sentiment data below.
    all_data = process_legit_phish_data(legit_path=legit_path,
                                        phish_path=phish_path)
    train, dev, test = split_data(all_data, random_state=seed)
    x_train_phish, y_train_phish = extract_labels(train)
    vocab = create_vocab(x_train_phish, vocab_size=20000, to_lower=True)

    data_path = 'ISWPA2.0 Train Data/IMDB Dataset.csv'
    if baby:
        movie_reviews = pd.read_csv(data_path)[:100]
    else:
        movie_reviews = pd.read_csv(data_path)
    # Note: the result of this null check is not used anywhere.
    movie_reviews.isnull().values.any()

    X = []
    sentences = list(movie_reviews['review'])
    for sen in sentences:
        X.append(preprocess_text(sen))
    X, max_token = read_dataset(X, vocab, to_lower=True)
    # Override the observed maximum: cap sequences at 300 tokens to match the
    # maxlen used in pad_sequences below.
    max_token = 300

    y = movie_reviews['sentiment']
    y = np.array(list(map(lambda x: 1 if x == "positive" else 0, y)))

    # 60/20/20 split; random_state is fixed at 42 here, independent of --seed.
    X_train, X_test_dev, y_train, y_test_dev = train_test_split(
        X, y, test_size=0.40, random_state=42)
    X_dev, X_test, y_dev, y_test = train_test_split(
        X_test_dev, y_test_dev, test_size=0.50, random_state=42)

    X_train = pad_sequences(X_train, maxlen=300, truncating='post')
    X_dev = pad_sequences(X_dev, maxlen=300, truncating='post')
    X_test = pad_sequences(X_test, maxlen=300, truncating='post')
    logger.info('X train shape: {}'.format(X_train.shape))
    logger.info('X dev shape: {}'.format(X_dev.shape))
    logger.info('X test shape: {}'.format(X_test.shape))
    logger.info('Y train shape: {}'.format(y_train.shape))
    logger.info('Y dev shape: {}'.format(y_dev.shape))
    logger.info('Y test shape: {}'.format(y_test.shape))

    # Default so embed_table is always defined, as in the other scripts.
    embed_table = None
    if embedding_path:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = build_embedd_table(vocab, embedd_dict, embedd_dim,
                                           logger, caseless=True)
    else:
        embedd_matrix = None
    if embedd_matrix is not None:
        embedd_dim = embedd_matrix.shape[1]
        embed_table = [embedd_matrix]

    model = build_simple_themis(vocab, max_token, embedd_dim, embed_table)
    save_path = 'saved_models/word_only_themis_seed' + str(seed)
    evaluator = Evaluator(model, X_train, X_dev, X_test,
                          y_train, y_dev, y_test, batch_size, save_path)
    logger.info("Initial evaluation: ")
    evaluator.predict()
    evaluator.print_eval()

    logger.info("Train model")
    for ii in range(epochs):
        logger.info('Epoch %s/%s' % (str(ii + 1), epochs))
        start_time = time()
        model.fit(X_train, y_train, batch_size=batch_size, epochs=1,
                  verbose=0, shuffle=True)
        tt_time = time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evaluator.predict()
        evaluator.print_eval()
    evaluator.print_final_eval()
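# preprocess_text above is assumed to be a standard IMDB-review cleaner
# (strip HTML tags, keep letters, collapse whitespace). A hypothetical sketch
# under those assumptions, not the repo's actual implementation:
import re

def preprocess_text_sketch(text):
    text = re.sub(r'<[^>]+>', ' ', text)      # drop HTML tags such as <br />
    text = re.sub(r'[^a-zA-Z]', ' ', text)    # keep alphabetic characters only
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    return text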
def main():
    parser = argparse.ArgumentParser(description="Shared Model")
    parser.add_argument('--test_prompt_id', type=int, default=1,
                        help='prompt id of test essay set')
    parser.add_argument('--seed', type=int, default=12,
                        help='set random seed')
    args = parser.parse_args()
    test_prompt_id = args.test_prompt_id
    seed = args.seed

    # Fix every relevant source of randomness for reproducibility.
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    print("Test prompt id is {} of type {}".format(test_prompt_id,
                                                   type(test_prompt_id)))
    print("Seed: {}".format(seed))

    configs = Configs()
    data_path = configs.DATA_PATH
    train_path = data_path + '/train.tsv'
    dev_path = data_path + '/dev.tsv'
    pretrained_embedding = configs.PRETRAINED_EMBEDDING
    embedding_path = configs.EMBEDDING_PATH
    embedding_dim = configs.EMBEDDING_DIM
    vocab_size = configs.VOCAB_SIZE
    epochs = configs.EPOCHS
    batch_size = configs.BATCH_SIZE

    read_configs = {
        'train_path': train_path,
        'dev_path': dev_path,
        'vocab_size': vocab_size
    }
    word_vocab = read_word_vocab(read_configs)
    print('vocab complete')
    train_data_src, train_data_tgt, dev_data_src, dev_data_tgt = \
        read_essays_words(read_configs, word_vocab, test_prompt_id)

    if pretrained_embedding:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(embedding_path)
        embedd_matrix = build_embedd_table(word_vocab, embedd_dict,
                                           embedd_dim, caseless=True)
        embed_table = [embedd_matrix]
    else:
        embed_table = None

    max_sentlen = max(train_data_src['max_sentlen'],
                      train_data_tgt['max_sentlen'],
                      dev_data_src['max_sentlen'],
                      dev_data_tgt['max_sentlen'])
    max_sentnum = max(train_data_src['max_sentnum'],
                      train_data_tgt['max_sentnum'],
                      dev_data_src['max_sentnum'],
                      dev_data_tgt['max_sentnum'])
    print('max sent length: {}'.format(max_sentlen))
    print('max sent num: {}'.format(max_sentnum))

    train_data_src['y_scaled'] = get_scaled_down_scores(
        train_data_src['data_y'], train_data_src['prompt_ids'])
    train_data_tgt['y_scaled'] = get_scaled_down_scores(
        train_data_tgt['data_y'], train_data_tgt['prompt_ids'])
    dev_data_src['y_scaled'] = get_scaled_down_scores(
        dev_data_src['data_y'], dev_data_src['prompt_ids'])
    dev_data_tgt['y_scaled'] = get_scaled_down_scores(
        dev_data_tgt['data_y'], dev_data_tgt['prompt_ids'])

    X_train_src = pad_hierarchical_text_sequences(
        train_data_src['words'], max_sentnum, max_sentlen)
    X_train_tgt = pad_hierarchical_text_sequences(
        train_data_tgt['words'], max_sentnum, max_sentlen)
    X_dev_src = pad_hierarchical_text_sequences(
        dev_data_src['words'], max_sentnum, max_sentlen)
    X_dev_tgt = pad_hierarchical_text_sequences(
        dev_data_tgt['words'], max_sentnum, max_sentlen)

    # Flatten (essays, sentences, tokens) to (essays, sentences * tokens).
    X_train_src = X_train_src.reshape(
        (X_train_src.shape[0], X_train_src.shape[1] * X_train_src.shape[2]))
    X_train_tgt = X_train_tgt.reshape(
        (X_train_tgt.shape[0], X_train_tgt.shape[1] * X_train_tgt.shape[2]))
    X_dev_src = X_dev_src.reshape(
        (X_dev_src.shape[0], X_dev_src.shape[1] * X_dev_src.shape[2]))
    X_dev_tgt = X_dev_tgt.reshape(
        (X_dev_tgt.shape[0], X_dev_tgt.shape[1] * X_dev_tgt.shape[2]))

    Y_train_src = np.array(train_data_src['y_scaled'])
    Y_train_tgt = np.array(train_data_tgt['y_scaled'])
    Y_dev_src = np.array(dev_data_src['y_scaled'])
    Y_dev_tgt = np.array(dev_data_tgt['y_scaled'])

    train_src_batches = batch_generator([X_train_src, Y_train_src], batch_size)
    train_tgt_batches = batch_generator([X_train_tgt, Y_train_tgt], batch_size)

    disc_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
    score_loss_fn = tf.keras.losses.MeanSquaredError()
    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

    shared_model = SharedModelV2(len(word_vocab), max_sentnum, max_sentlen,
                                 embedding_dim, embed_table)
    steps = (X_train_src.shape[0] // batch_size) * epochs
    evaluator = SharedModelEvaluatorV2(
        test_prompt_id, X_dev_src, X_train_tgt, X_dev_tgt,
        dev_data_src['prompt_ids'], train_data_tgt['prompt_ids'],
        dev_data_tgt['prompt_ids'], Y_dev_src, Y_train_tgt, Y_dev_tgt)
    evaluator.evaluate(shared_model, 0, print_info=True)

    for step in range(steps):
        # Domain labels: 0 for source-prompt essays, 1 for target-prompt essays.
        src_label = tf.zeros((batch_size, 1))
        tgt_label = tf.ones((batch_size, 1))
        X_train_src_batch, Y_train_src_batch = next(train_src_batches)
        X_train_tgt_batch, Y_train_tgt_batch = next(train_tgt_batches)
        X_both = tf.concat([X_train_src_batch, X_train_tgt_batch], axis=0)
        label_both = tf.concat([src_label, tgt_label], axis=0)
        fe_loss, score_loss, disc_loss = full_train_step(
            X_train_src_batch, Y_train_src_batch, X_both, label_both,
            shared_model, score_loss_fn, disc_loss_fn, optimizer)
        current_step = step + 1
        # Log and evaluate once per epoch's worth of steps.
        if current_step % (steps // epochs) == 0:
            print("fe loss (for one batch) at step %d: %.4f"
                  % (current_step, float(fe_loss)))
            print("score loss (for one batch) at step %d: %.4f"
                  % (current_step, float(score_loss)))
            print("disc loss (for one batch) at step %d: %.4f"
                  % (current_step, float(disc_loss)))
            print('steps', steps)
            print('step', current_step)
            print('epochs', epochs)
            print('batch_size', batch_size)
            print('Evaluating epoch', current_step / (steps // epochs))
            if step == 0:
                evaluator.evaluate(shared_model, 0)
            else:
                evaluator.evaluate(shared_model,
                                   current_step / (steps // epochs))
    evaluator.print_final_info()
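# full_train_step is not shown in this section. Given the three losses it
# returns (feature extractor, score, discriminator), it is presumably a
# domain-adversarial training step in the DANN style. A hypothetical sketch,
# assuming SharedModelV2 exposes .extractor, .scorer and .discriminator
# sub-models -- these attribute names and the loss combination are
# illustrative assumptions, not the repo's actual API:
import tensorflow as tf

@tf.function
def full_train_step_sketch(x_src, y_src, x_both, label_both, model,
                           score_loss_fn, disc_loss_fn, optimizer):
    with tf.GradientTape(persistent=True) as tape:
        # The score head trains on labelled source-prompt essays only.
        feats_src = model.extractor(x_src, training=True)
        score_loss = score_loss_fn(y_src, model.scorer(feats_src,
                                                       training=True))
        # The discriminator tries to tell source (0) from target (1) features.
        feats_both = model.extractor(x_both, training=True)
        disc_loss = disc_loss_fn(label_both,
                                 model.discriminator(feats_both,
                                                     training=True))
        # The extractor learns to score well while fooling the discriminator.
        fe_loss = score_loss - disc_loss
    for loss, weights in [(fe_loss, model.extractor.trainable_variables),
                          (score_loss, model.scorer.trainable_variables),
                          (disc_loss, model.discriminator.trainable_variables)]:
        grads = tape.gradient(loss, weights)
        optimizer.apply_gradients(zip(grads, weights))
    del tape  # release the persistent tape
    return fe_loss, score_loss, disc_loss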
def main():
    parser = argparse.ArgumentParser(description="PAES_attributes model")
    parser.add_argument('--test_prompt_id', type=int, default=1,
                        help='prompt id of test essay set')
    parser.add_argument('--seed', type=int, default=12,
                        help='set random seed')
    parser.add_argument('--attribute_name', type=str,
                        help='name of the attribute to be trained on')
    args = parser.parse_args()
    test_prompt_id = args.test_prompt_id
    attribute_name = args.attribute_name
    seed = args.seed

    # Fix every relevant source of randomness for reproducibility.
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    print("Test prompt id is {} of type {}".format(test_prompt_id,
                                                   type(test_prompt_id)))
    print("Attribute: {}".format(attribute_name))
    print("Seed: {}".format(seed))

    configs = Configs()
    data_path = configs.DATA_PATH
    train_path = data_path + str(test_prompt_id) + '/train.pk'
    dev_path = data_path + str(test_prompt_id) + '/dev.pk'
    test_path = data_path + str(test_prompt_id) + '/test.pk'
    features_path = configs.FEATURES_PATH
    pretrained_embedding = configs.PRETRAINED_EMBEDDING
    embedding_path = configs.EMBEDDING_PATH
    embedding_dim = configs.EMBEDDING_DIM
    readability_path = configs.READABILITY_PATH
    vocab_size = configs.VOCAB_SIZE
    epochs = configs.EPOCHS
    batch_size = configs.BATCH_SIZE

    read_configs = {
        'train_path': train_path,
        'dev_path': dev_path,
        'test_path': test_path,
        'features_path': features_path,
        'readability_path': readability_path,
        'vocab_size': vocab_size
    }
    word_vocab = read_word_vocab(read_configs)
    train_data, dev_data, test_data = read_essays_single_score_words(
        read_configs, word_vocab, attribute_name)

    if pretrained_embedding:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(embedding_path)
        embedd_matrix = build_embedd_table(word_vocab, embedd_dict,
                                           embedd_dim, caseless=True)
        embed_table = [embedd_matrix]
    else:
        embed_table = None

    max_sentlen = max(train_data['max_sentlen'], dev_data['max_sentlen'],
                      test_data['max_sentlen'])
    max_sentnum = max(train_data['max_sentnum'], dev_data['max_sentnum'],
                      test_data['max_sentnum'])
    print('max sent length: {}'.format(max_sentlen))
    print('max sent num: {}'.format(max_sentnum))

    train_data['y_scaled'] = get_single_scaled_down_score(
        train_data['data_y'], train_data['prompt_ids'], attribute_name)
    dev_data['y_scaled'] = get_single_scaled_down_score(
        dev_data['data_y'], dev_data['prompt_ids'], attribute_name)
    test_data['y_scaled'] = get_single_scaled_down_score(
        test_data['data_y'], test_data['prompt_ids'], attribute_name)

    X_train = pad_hierarchical_text_sequences(train_data['words'],
                                              max_sentnum, max_sentlen)
    X_dev = pad_hierarchical_text_sequences(dev_data['words'],
                                            max_sentnum, max_sentlen)
    X_test = pad_hierarchical_text_sequences(test_data['words'],
                                             max_sentnum, max_sentlen)

    # Flatten (essays, sentences, tokens) to (essays, sentences * tokens).
    X_train = X_train.reshape((X_train.shape[0],
                               X_train.shape[1] * X_train.shape[2]))
    X_dev = X_dev.reshape((X_dev.shape[0], X_dev.shape[1] * X_dev.shape[2]))
    X_test = X_test.reshape((X_test.shape[0],
                             X_test.shape[1] * X_test.shape[2]))

    X_train_linguistic_features = np.array(train_data['features_x'])
    X_dev_linguistic_features = np.array(dev_data['features_x'])
    X_test_linguistic_features = np.array(test_data['features_x'])
    X_train_readability = np.array(train_data['readability_x'])
    X_dev_readability = np.array(dev_data['readability_x'])
    X_test_readability = np.array(test_data['readability_x'])
    Y_train = np.array(train_data['y_scaled'])
    Y_dev = np.array(dev_data['y_scaled'])
    Y_test = np.array(test_data['y_scaled'])

    print('================================')
    print('X_train_pos: ', X_train.shape)
    print('X_train_readability: ', X_train_readability.shape)
    print('X_train_ling: ', X_train_linguistic_features.shape)
    print('Y_train: ', Y_train.shape)
    print('================================')
    print('X_dev_pos: ', X_dev.shape)
    print('X_dev_readability: ', X_dev_readability.shape)
    print('X_dev_ling: ', X_dev_linguistic_features.shape)
    print('Y_dev: ', Y_dev.shape)
    print('================================')
    print('X_test_pos: ', X_test.shape)
    print('X_test_readability: ', X_test_readability.shape)
    print('X_test_ling: ', X_test_linguistic_features.shape)
    print('Y_test: ', Y_test.shape)
    print('================================')

    # Note: only the word sequences are fed to this model; the readability and
    # linguistic feature arrays are loaded and printed but not used here.
    model = build_Hi_att(len(word_vocab), max_sentnum, max_sentlen, configs,
                         embed_table)
    dev_features_list = [X_dev]
    test_features_list = [X_test]
    evaluator = Evaluator(test_prompt_id, dev_data['prompt_ids'],
                          test_data['prompt_ids'], dev_features_list,
                          test_features_list, Y_dev, Y_test, attribute_name)
    evaluator.evaluate(model, -1, print_info=True)

    for ii in range(epochs):
        print('Epoch %s/%s' % (str(ii + 1), epochs))
        start_time = time.time()
        model.fit([X_train], Y_train, batch_size=batch_size, epochs=1,
                  verbose=0, shuffle=True)
        tt_time = time.time() - start_time
        print("Training one epoch in %.3f s" % tt_time)
        evaluator.evaluate(model, ii + 1)
    evaluator.print_final_info()
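# get_single_scaled_down_score is assumed to min-max scale each raw attribute
# score into [0, 1] using its prompt's score range, so a single MSE head can
# serve all prompts. A sketch with an illustrative range table (these are the
# standard ASAP overall-score ranges; per-attribute ranges may differ, and
# SCORE_RANGES_SKETCH is a hypothetical name):
SCORE_RANGES_SKETCH = {1: (2, 12), 2: (1, 6), 3: (0, 3), 4: (0, 3),
                       5: (0, 4), 6: (0, 4), 7: (0, 30), 8: (0, 60)}

def scale_down_score_sketch(scores, prompt_ids, attribute_name):
    """Map each raw score into [0, 1] using its prompt's (lo, hi) range."""
    scaled = []
    for score, pid in zip(scores, prompt_ids):
        lo, hi = SCORE_RANGES_SKETCH[pid]
        scaled.append((score - lo) / (hi - lo))
    return scaled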
def main():
    parser = argparse.ArgumentParser(description="Word and char themis model")
    parser.add_argument('--num_epochs', type=int, default=20,
                        help='Number of epochs for training')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Number of emails in each batch')
    parser.add_argument('--embedding', type=str, default='glove',
                        help='Word embedding type: word2vec, senna or glove')
    parser.add_argument('--embedding_dim', type=int, default=50,
                        help='Dimension of embedding')
    parser.add_argument('--embedding_path', type=str,
                        default='embeddings/glove.6B.50d.txt',
                        help='Path to embedding vec file')
    parser.add_argument('--baby', action='store_true',
                        help='Use a small data subset for debugging')
    parser.add_argument('--seed', type=int, default=42,
                        help='Seed for the data split')
    parser.add_argument('--legit_path', type=str,
                        default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/legit/',
                        help='Path to legit emails folder')
    parser.add_argument('--phish_path', type=str,
                        default='ISWPA2.0 Train Data/IWSPA2.0_Training_No_Header/phish/',
                        help='Path to phish emails folder')
    args = parser.parse_args()

    legit_path = args.legit_path
    phish_path = args.phish_path
    embedding_path = args.embedding_path
    embedding = args.embedding
    embedd_dim = args.embedding_dim
    epochs = args.num_epochs
    batch_size = args.batch_size
    baby = args.baby
    seed = args.seed

    all_data = process_legit_phish_data(legit_path=legit_path,
                                        phish_path=phish_path)
    if baby:
        all_data = all_data[:100]
    # Pass the seed through so --seed actually controls the split
    # (it was parsed but never used).
    train, dev, test = split_data(all_data, random_state=seed)
    x_train, y_train = extract_labels(train)
    x_dev, y_dev = extract_labels(dev)
    x_test, y_test = extract_labels(test)

    vocab = create_vocab(x_train, vocab_size=20000, to_lower=True)
    x_train, max_token_train = read_dataset(x_train, vocab, to_lower=True)
    x_dev, max_token_dev = read_dataset(x_dev, vocab, to_lower=True)
    x_test, max_token_test = read_dataset(x_test, vocab, to_lower=True)
    max_token = max(max_token_train, max_token_dev, max_token_test)
    logger.info('Max tokens train: {}'.format(max_token_train))
    logger.info('Max tokens dev: {}'.format(max_token_dev))
    logger.info('Max tokens test: {}'.format(max_token_test))
    logger.info('Max tokens: {}'.format(max_token))

    X_train, Y_train, train_mask = padding_email_sequences(
        x_train, y_train, max_token, post_padding=True)
    X_dev, Y_dev, dev_mask = padding_email_sequences(
        x_dev, y_dev, max_token, post_padding=True)
    X_test, Y_test, test_mask = padding_email_sequences(
        x_test, y_test, max_token, post_padding=True)
    logger.info('X train shape: {}'.format(X_train.shape))
    logger.info('X dev shape: {}'.format(X_dev.shape))
    logger.info('X test shape: {}'.format(X_test.shape))
    logger.info('Y train shape: {}'.format(Y_train.shape))
    logger.info('Y dev shape: {}'.format(Y_dev.shape))
    logger.info('Y test shape: {}'.format(Y_test.shape))

    # Default so embed_table is always defined, as in the other scripts.
    embed_table = None
    if embedding_path:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = build_embedd_table(vocab, embedd_dict, embedd_dim,
                                           logger, caseless=True)
    else:
        embedd_matrix = None
    if embedd_matrix is not None:
        embedd_dim = embedd_matrix.shape[1]
        embed_table = [embedd_matrix]

    # Reload the word-only themis model saved by the pretraining script and
    # fine-tune it on the email data.
    saved_path = 'saved_models/word_only_themis_seed' + str(seed)
    model_path = saved_path + '.json'
    weights_path = saved_path + '.h5'
    with open(model_path, 'r') as json_file:
        loaded_model = json_file.read()
    model = model_from_json(loaded_model)
    model.load_weights(weights_path)
    # The original called optimizers.adam, which is not a Keras class; the
    # class is Adam (newer Keras spells the argument learning_rate).
    adam = optimizers.Adam(lr=0.0001)
    model.compile(loss='binary_crossentropy', optimizer=adam,
                  metrics=['binary_accuracy'])
    model.summary()

    evaluator = Evaluator(model, X_train, X_dev, X_test,
                          Y_train, Y_dev, Y_test, batch_size)
    logger.info("Initial evaluation: ")
    evaluator.predict()
    evaluator.print_eval()

    logger.info("Train model")
    for ii in range(epochs):
        logger.info('Epoch %s/%s' % (str(ii + 1), epochs))
        start_time = time()
        model.fit(X_train, Y_train, batch_size=batch_size, epochs=1,
                  verbose=0, shuffle=True)
        tt_time = time() - start_time
        logger.info("Training one epoch in %.3f s" % tt_time)
        evaluator.predict()
        evaluator.print_eval()
    evaluator.print_final_eval()
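# Several of the email scripts share an Evaluator with predict / print_eval /
# print_final_eval. A minimal sketch of the assumed contract (track the best
# dev accuracy and report the test accuracy at that point); the repo's actual
# class may compute more metrics and handle model saving:
class EvaluatorSketch:
    def __init__(self, model, X_train, X_dev, X_test,
                 Y_train, Y_dev, Y_test, batch_size):
        self.model = model
        self.data = (X_train, X_dev, X_test, Y_train, Y_dev, Y_test)
        self.batch_size = batch_size
        self.best_dev = self.best_test = 0.0

    def predict(self):
        X_train, X_dev, X_test, Y_train, Y_dev, Y_test = self.data
        # evaluate() returns [loss, binary_accuracy] for the compiled models.
        self.dev_acc = self.model.evaluate(
            X_dev, Y_dev, batch_size=self.batch_size, verbose=0)[1]
        self.test_acc = self.model.evaluate(
            X_test, Y_test, batch_size=self.batch_size, verbose=0)[1]
        if self.dev_acc > self.best_dev:
            # Remember the test accuracy achieved at the best dev accuracy.
            self.best_dev, self.best_test = self.dev_acc, self.test_acc

    def print_eval(self):
        print('dev acc: %.4f, test acc: %.4f' % (self.dev_acc, self.test_acc))

    def print_final_eval(self):
        print('best dev acc: %.4f, test acc at best dev: %.4f'
              % (self.best_dev, self.best_test))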