def run_text():
    # Plain RNN over the text, with frozen pretrained embeddings.
    data = generate_data(split_point=15000, emb_size=50, maxlen=100)
    emb_matrix = data['emb_matrix']
    train_batches = data['train_batches']
    test_batches = data['test_batches']
    model = TextRNN(emb_matrix, trainable_embeddings=False)
    optimizer = Adam(model.params, 0.001)
    criterion = BCEWithLogitsLoss()
    train(model, train_batches, test_batches, optimizer, criterion, 50, 5)
def run_text():
    # Single-layer RNN with projected attention over the text.
    data = generate_data(emb_size=50, max_len=100)
    emb_matrix = data['emb_matrix']
    train_batches = data['train_batches']
    test_batches = data['test_batches']
    model = ProjectedAttentionTextRNN(emb_matrix, stacked_layers=1)
    optimizer = Adam(model.params, 0.001)
    criterion = BCEWithLogitsLoss()
    train(model, train_batches, test_batches, optimizer, criterion, 50, 5)
def run_text():
    # Projected-attention RNN on a larger sample with shorter sequences.
    data = generate_data(size=200000, split_point=160000, emb_size=50, max_len=25)
    emb_matrix = data['emb_matrix']
    train_batches = data['train_batches']
    test_batches = data['test_batches']
    model = ProjectedAttentionTextRNN(emb_matrix)
    optimizer = Adam(model.params, 0.001)
    criterion = BCEWithLogitsLoss()
    train(model, train_batches, test_batches, optimizer, criterion, 50, 5)
def run_text_author():
    # Text RNN combined with learned author embeddings (a2i maps authors to indices).
    data = generate_data(size=1000000, split_point=960000, emb_size=25, max_len=25)
    a2i = data['a2i']
    emb_matrix = data['emb_matrix']
    train_batches = data['train_batches']
    test_batches = data['test_batches']
    model = TextAuthorRNN(emb_matrix, len(a2i))
    optimizer = Adam(model.params, 0.001)
    criterion = BCEWithLogitsLoss()
    train(model, train_batches, test_batches, optimizer, criterion, 50, 5)
def run_text_title_author():
    # Full model: projected attention over text and title, plus author embeddings.
    data = generate_data(split_point=15000, emb_size=50, maxlen=100)
    a2i = data['a2i']
    emb_matrix = data['emb_matrix']
    train_batches = data['train_batches']
    test_batches = data['test_batches']
    model = ProjectedAttentionTextTitleAuthorRNN(
        emb_matrix,
        author_embeddings_input_size=len(a2i),
        embeddings_dropout=0.5,
        top_mlp_dropout=0.5,
        text_stacked_layers=1,
        text_cell_hidden_size=128,
        title_cell_hidden_size=32,
        top_mlp_outer_activation=None,
        top_mlp_layers=2)
    optimizer = Adam(model.params, 0.001)
    criterion = BCEWithLogitsLoss()
    train(model, train_batches, test_batches, optimizer, criterion, 50, 5)
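# Every runner above delegates to train(...), whose implementation is not part
# of this excerpt. The sketch below is an assumption about its contract: the
# trailing arguments (50, 5) are read as an epoch budget and an early-stopping
# patience. Names and structure are illustrative, not the repo's actual code.
import torch

def train_sketch(model, train_batches, test_batches, optimizer, criterion,
                 epochs=50, patience=5):
    best_loss, epochs_without_improvement = float('inf'), 0
    for epoch in range(epochs):
        model.train()
        for x, y in train_batches:
            optimizer.zero_grad()
            loss = criterion(model(x), y)
            loss.backward()
            optimizer.step()
        # Evaluate on the held-out batches after each epoch.
        model.eval()
        with torch.no_grad():
            val_loss = sum(criterion(model(x), y).item()
                           for x, y in test_batches) / len(test_batches)
        if val_loss < best_loss:
            best_loss, epochs_without_improvement = val_loss, 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break  # no improvement for `patience` epochs: stop early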
import sys
import string

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

from utils_deepwalk import deepwalk

sys.path.append("../")
from preprocess import get_train_data, import_texts, generate_data, clean_host_texts

# Generating train data (without duplicates) and test data
data = "../data/"
train_file = data + "train_noduplicates.csv"
train_hosts, y_train = get_train_data(train_file)
texts_path = "../text/text"
texts = import_texts(texts_path)

with open(data + "test.csv", "r") as f:
    test_hosts = f.read().splitlines()

train_data = generate_data(train_hosts, texts)
test_data = generate_data(test_hosts, texts)

# Preprocessing texts: tokenize, then strip French/English stopwords and punctuation
tokenizer = TweetTokenizer()
punctuation = string.punctuation + "’“”.»«…°"
stpwords_fr = stopwords.words("french")
stpwords_en = stopwords.words("english")
cleaned_train_data = clean_host_texts(data=train_data, tok=tokenizer,
                                      stpwds=stpwords_fr + stpwords_en,
                                      punct=punctuation)
cleaned_test_data = clean_host_texts(data=test_data, tok=tokenizer,
                                     stpwds=stpwords_fr + stpwords_en,
                                     punct=punctuation)
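# clean_host_texts is imported from preprocess and not shown in this excerpt.
# Judging only from its arguments (a tokenizer, a stopword list, a punctuation
# string), a plausible sketch of the cleaning step looks like the following;
# this is an assumption about the helper, not its actual source.
def clean_host_texts_sketch(data, tok, stpwds, punct):
    cleaned = []
    for text in data:
        tokens = tok.tokenize(text.lower())
        # Drop stopwords, then trim leading/trailing punctuation from each token.
        kept = (t.strip(punct) for t in tokens if t not in stpwds)
        cleaned.append(" ".join(t for t in kept if t))
    return cleaned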
        'sexual_precision': sexual_precision_scores,
        'physical_f1_score': physical_f1_scores,
        'physical_recall': physical_recall_scores,
        'physical_precision': physical_precision_scores,
    }
    df = pd.DataFrame.from_dict(results_dict)
    # Append to an existing results file rather than overwriting it.
    if "results.csv" in os.listdir(RESULTS_DIR):
        df_old = pd.read_csv(RESULTS_DIR + "results.csv")
        df = pd.concat([df_old, df])
    df.to_csv(RESULTS_DIR + "results.csv", index=False)


if __name__ == "__main__":
    if not os.path.exists(MODELS_DIR):
        os.makedirs(MODELS_DIR)
    if not os.path.exists(RESULTS_DIR):
        os.makedirs(RESULTS_DIR)
    data = generate_data(embs_path=GLOVE_EMBEDDINGS_PATH,
                         maxlen=CONFIG['maxlen'],
                         batch_size=CONFIG['batch_size'])
    # Train and evaluate every model variant on the same data split.
    run_model("vanilla_last", data, False)
    run_model("vanilla_projected_last", data, False)
    run_model("vanilla_avg", data, False)
    run_model("vanilla_projected_avg", data, False)
    run_model("multi_attention", data, False)
    run_model("multi_projected_attention", data, False)
    run_model("projected_attention", data, False)
    run_model("attention", data, False)
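# MODELS_DIR, RESULTS_DIR, GLOVE_EMBEDDINGS_PATH and CONFIG are referenced
# above but defined elsewhere (presumably a config module). Hypothetical
# values consistent with their usage (note RESULTS_DIR must end in a path
# separator, since it is concatenated directly with "results.csv"):
#
#     CONFIG = {'maxlen': 100, 'batch_size': 32}
#     MODELS_DIR = "models/"
#     RESULTS_DIR = "results/"
#     GLOVE_EMBEDDINGS_PATH = "embeddings/glove.6B.50d.txt"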