def main():
    """Train fold GRU models on pre-tokenized .npz data and write a submission.

    Expects ``embedding_path`` to be a directory containing ``embeddings.npz``,
    ``train.npz``, ``test.npz`` and ``label.npz`` (each holding one ``arr_0``
    array).  Trains ``--fold-count`` models, blends the per-fold test
    predictions with a geometric mean plus a power calibration, and writes
    ``<result-path>/submit``.
    """
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and "
                    "classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print("Loading data...")
    # Only the test ids are needed from the CSVs here; the features, labels
    # and embedding matrix all come from the pre-built .npz files below.
    # (The old in-script tokenization path and the unused train CSV read were
    # removed as dead code.)
    test_data = pd.read_csv(args.test_file_path)

    embedding_matrix = np.load(
        os.path.join(args.embedding_path, 'embeddings.npz'))['arr_0']
    X_train = np.load(os.path.join(args.embedding_path, 'train.npz'))['arr_0']
    X_test = np.load(os.path.join(args.embedding_path, 'test.npz'))['arr_0']
    y_train = np.load(os.path.join(args.embedding_path, 'label.npz'))['arr_0']

    def get_model_func():
        # One fresh model per fold (train_folds calls this once per fold).
        return get_GRU_GlobalMax_Ave(
            embedding_matrix,
            args.sentences_length,
            args.dropout_rate,
            args.recurrent_units,
            args.dense_size)

    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func)

    # makedirs(exist_ok=True) also creates missing parent directories, unlike
    # the old exists()+mkdir pair, and has no check-then-create race.
    os.makedirs(args.result_path, exist_ok=True)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(
            args.result_path, "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())

        test_predicts_path = os.path.join(
            args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    # Geometric mean over folds, then a power-calibration step.
    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict
    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_data["id"].values.reshape((-1, 1))
    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    submit_path = os.path.join(args.result_path, "submit")
    test_predicts.to_csv(submit_path, index=False)
def main():
    """Train (or reload) fold models and write four blended submissions.

    The per-fold test predictions are combined four ways — score-softmax
    weighting, plain mean, best-single-fold, and a normalized softmax — and
    one timestamped CSV per blend is written into ``--result-path``.
    """
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and "
                    "classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    # BUG FIX: these two flags used ``type=bool``, which converts any
    # non-empty string (including "False") to True; store_true gives the
    # intended flag semantics.
    parser.add_argument('--test-mode', action='store_true', default=False)
    parser.add_argument("--embedding-path", default=None)
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.5)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument('--epoch', type=int, default=5)
    parser.add_argument('--load-pretrained', action='store_true',
                        default=False)
    parser.add_argument('--sample', type=float, default=0.1)
    args = parser.parse_args()

    print("Loading data...")
    vocab_size = 50000
    embedding_dim = 64
    X_train, y_train, X_test, embedding_matrix = get_train_test_and_embedding(
        train_csv=args.train_file_path,
        test_csv=args.test_file_path,
        sequence_length=args.sentences_length,
        vocab_size=vocab_size,
        embedding_file=args.embedding_path,
        embedding_dim=embedding_dim)

    test_data = pd.read_csv(args.test_file_path)
    test_ids = test_data["id"].values

    def get_model_func():
        # One fresh model per fold.
        return get_model(
            embedding_matrix,
            args.sentences_length,
            args.dropout_rate,
            args.recurrent_units,
            args.dense_size)

    if args.test_mode:
        print('in test mode!')
        # Subsample both train and test so a full run finishes quickly.
        X_size = X_train.shape[0]
        sub_train_indices = np.random.choice(
            range(X_size), size=int(X_size * args.sample), replace=False)
        X_test_size = X_test.shape[0]
        sub_test_indices = np.random.choice(
            range(X_test_size), size=int(X_test_size * 0.1), replace=False)
        X_test = X_test[sub_test_indices]
        test_ids = test_ids[sub_test_indices]
    else:
        print(' not in test mode')
        # Full-size "sample": still shuffles the training rows.
        X_size = X_train.shape[0]
        sub_train_indices = np.random.choice(
            range(X_size), size=int(X_size * 1), replace=False)
    X_train = X_train[sub_train_indices]
    y_train = y_train[sub_train_indices]

    print("Starting to train models...")
    if not args.load_pretrained:
        models, scores = train_folds(
            X=X_train,
            y=y_train,
            epoch=args.epoch,
            fold_count=args.fold_count,
            batch_size=args.batch_size,
            get_model_func=get_model_func)
    else:
        # Rebuild the architectures and reuse the fold AUCs recorded from the
        # run that produced the saved weights.
        models = [get_model_func() for _ in range(args.fold_count)]
        scores = [0.98729337078270507, 0.98741052541081709,
                  0.98875435673905765, 0.98888426426103615,
                  0.98921313005210798, 0.98900847797211722,
                  0.98849751432495514, 0.98839252464184923,
                  0.98844750778401702, 0.98633935039731879]
    print('the fold score is : {}'.format(scores))
    validation_scores = np.mean(scores)

    os.makedirs(args.result_path, exist_ok=True)

    # BUG FIX: ``probabilities`` was referenced below but its definition was
    # commented out, so prediction crashed with a NameError.  Recompute the
    # softmax of the fold scores here (max-shifted for numerical stability).
    score_arr = np.asarray(scores, dtype=np.float64)
    exp_scores = np.exp(score_arr - score_arr.max())
    probabilities = exp_scores / exp_scores.sum()
    max_probability_index = np.argmax(scores)

    n_rows = X_test.shape[0]
    test_predicts_softmax = np.zeros(shape=(n_rows, len(CLASSES)))
    test_predicts_mean = np.zeros(shape=(n_rows, len(CLASSES)))
    test_predicts_max = np.zeros(shape=(n_rows, len(CLASSES)))

    print('predicate test set!')
    # First pass: restore or persist the per-fold weights.
    for fold_id, model in enumerate(models):
        print('predicate with fold_id {}'.format(fold_id))
        model_path = os.path.join(
            args.result_path, 'model{0}_weights.npy'.format(fold_id))
        if args.load_pretrained:
            # BUG FIX: the filename was never formatted (every fold tried to
            # load the literal 'model{0}_weights.npy'); load from model_path.
            # allow_pickle=True is required because the weight list was saved
            # as an object array.
            weights = np.load(model_path, allow_pickle=True)
            model.set_weights(weights)
        else:
            np.save(model_path, model.get_weights())

    # Second pass: accumulate the blended predictions.
    for fold_id, model in enumerate(models):
        fold_probability = probabilities[fold_id]
        fold_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_softmax += fold_probability * fold_predicts
        test_predicts_mean += fold_predicts * (1 / len(models))
        if fold_id == max_probability_index:
            test_predicts_max = fold_predicts
    test_predicts_normalized = (
        test_predicts_softmax / PROBABILITIES_NORMALIZE_COEFFICIENT)

    results_with_label = {
        'softmax': test_predicts_softmax,
        'max': test_predicts_max,
        'mean': test_predicts_mean,
        'normalized': test_predicts_normalized,
    }
    for method, predictions in results_with_label.items():
        test_predicts = pd.DataFrame(data=predictions, columns=CLASSES)
        test_predicts["id"] = test_ids
        test_predicts = test_predicts[["id"] + CLASSES]
        now = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
        # BUG FIX: the old pattern '[(cbow)|(skip)]-(\d+)-' used a character
        # class instead of alternation, and crashed on [0] when nothing
        # matched (or when --embedding-path was omitted).
        size_match = re.search(r'(?:cbow|skip)-(\d+)-',
                               args.embedding_path or '')
        embedding_size = size_match.group(1) if size_match else 'unknown'
        parameters = ("{}-emb-{}-batch_size-{}-sen_len-{}-RUNIT-{}-dense_s-{}"
                      .format(method, embedding_size, args.batch_size,
                              args.sentences_length, args.recurrent_units,
                              args.dense_size))
        submit_path = os.path.join(
            args.result_path,
            "{}_{}_submission_lstm_{}.csv".format(parameters, now,
                                                  validation_scores))
        test_predicts.to_csv(submit_path, index=False)
def main():
    """Tokenize the toxic-comment corpora, build an embedding matrix, free
    the large intermediates, and train the fold models.

    NOTE(review): unlike the sibling pipelines in this file, this variant
    stops after ``train_folds`` — no prediction/submission step is visible
    here, and ``train_folds`` also receives ``X_test`` (a different
    signature).  Confirm this is intentional.
    """
    parser = argparse.ArgumentParser(
        description=
        "Recurrent neural network for identifying and classifying toxic online comments"
    )
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    args = parser.parse_args()
    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")
    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)
    # Missing comments are replaced with the NAN_WORD sentinel token.
    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values
    print("Tokenizing sentences in train set...")
    # words_dict starts empty for the train pass and the same dict is then
    # extended while tokenizing the test set — order matters here.
    tokenized_sentences_train, words_dict = tokenize_sentences(
        list_sentences_train, {})
    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(
        list_sentences_test, words_dict)
    words_dict[UNKNOWN_WORD] = len(words_dict)
    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(
        args.embedding_path)
    embedding_size = len(embedding_list[0])
    print("Preparing data...")
    # Keep only embeddings for words that occur in the corpus, then append
    # two special rows: zeros for unknown words and -1s for end-of-text.
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)
    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)
    embedding_matrix = np.array(embedding_list)
    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_train,
                                                    id_to_word,
                                                    embedding_word_dict,
                                                    args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_test,
                                                   id_to_word,
                                                   embedding_word_dict,
                                                   args.sentences_length)
    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)
    # One fresh model per fold.
    get_model_func = lambda: get_model(embedding_matrix,
                                       args.sentences_length,
                                       args.dropout_rate,
                                       args.recurrent_units,
                                       args.dense_size)
    # Free the large intermediate structures before training allocates its
    # own buffers; everything needed from here on lives in X_train / X_test /
    # y_train / embedding_matrix.
    import gc
    del train_data, test_data, list_sentences_train, list_sentences_test
    del tokenized_sentences_train, tokenized_sentences_test, words_dict
    del embedding_list, embedding_word_dict
    del train_list_of_token_ids, test_list_of_token_ids
    gc.collect()
    print("Starting to train models...")
    models = train_folds(X_train, y_train, X_test, args.fold_count,
                         args.batch_size, get_model_func)
def main():
    """Train LSTM+GRU fold models on pre-tokenized .npz data (optionally with
    augmentation) and write test submissions plus out-of-fold predictions.

    ``embedding_path`` must be a directory holding ``embeddings.npz``,
    ``train.npz``, ``test.npz`` and ``label.npz`` (one ``arr_0`` array each).
    Outputs under ``--result-path``: per-fold weights and predictions,
    ``submit`` (blended test predictions) and ``valid`` (out-of-fold train
    predictions for local validation / stacking).
    """
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and "
                    "classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument("--aug-count", type=int, default=1)
    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print("Loading data...")
    # The CSVs are only needed for their id columns; features and labels come
    # from the pre-built .npz files.
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    embedding_matrix = np.load(
        os.path.join(args.embedding_path, 'embeddings.npz'))['arr_0']
    X_train = np.load(os.path.join(args.embedding_path, 'train.npz'))['arr_0']
    X_test = np.load(os.path.join(args.embedding_path, 'test.npz'))['arr_0']
    y_train = np.load(os.path.join(args.embedding_path, 'label.npz'))['arr_0']

    def get_model_func():
        # One fresh model per fold.
        return get_LSTMGRU_GlobalMaxAve(
            embedding_matrix,
            args.sentences_length,
            args.dropout_rate,
            args.recurrent_units,
            args.dense_size)

    print("Starting to train models...")
    models, train_preds = train_folds(X_train, y_train, args.fold_count,
                                      args.batch_size, get_model_func,
                                      aug=args.aug_count)

    # makedirs(exist_ok=True) also creates missing parent directories, unlike
    # exists()+mkdir, and avoids the check-then-create race.
    os.makedirs(args.result_path, exist_ok=True)

    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        model_path = os.path.join(
            args.result_path, "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())

        test_predicts_path = os.path.join(
            args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)

    # Geometric mean over folds, then a power-calibration step.
    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict
    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT

    test_ids = test_data["id"].values.reshape((-1, 1))
    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    test_predicts.to_csv(os.path.join(args.result_path, "submit"),
                         index=False)

    # Out-of-fold train predictions for local validation / stacking.
    train_ids = train_data["id"].values.reshape((-1, 1))
    train_predicts = pd.DataFrame(data=train_preds, columns=CLASSES)
    train_predicts["id"] = train_ids
    train_predicts = train_predicts[["id"] + CLASSES]
    train_predicts.to_csv(os.path.join(args.result_path, "valid"),
                          index=False)
def main():
    """Train fold models on the toxic-comment data, write a Kaggle
    submission, then score a local file of discussion posts with the same
    models.
    """
    parser = argparse.ArgumentParser(
        description=
        "Recurrent neural network for identifying and classifying toxic online comments"
    )
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    args = parser.parse_args()
    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")
    print("Loading data...")
    train_data = pd.read_csv(args.train_file_path)
    test_data = pd.read_csv(args.test_file_path)

    # Disabled preprocessing experiments: language detection, translation of
    # non-English comments to English, and text cleaning (the results were
    # cached to CSV when enabled).
    #train_data['language'] = train_data['comment_text'].apply(detect_language)
    #test_data['language'] = test_data['comment_text'].apply(detect_language)
    #train_data['comment_text'] = train_data.apply(lambda x: translate(x.comment_text, x.language),axis=1)
    #test_data['comment_text'] = test_data.apply(lambda x: translate(x.comment_text, x.language),axis=1)
    #train_data.to_csv("train_data_translated.csv")
    #test_data.to_csv("test_data_translated.csv")
    #train_data['comment_text'] = train_data.apply(lambda x: clean(x.comment_text), axis=1)
    #train_data['comment_text'] = train_data.apply(lambda x: clean(x.comment_text), axis=1)
    #train_data.to_csv("train_data_cleaned_after_translate.csv")
    #test_data.to_csv("test_data_cleaned_after_translate.csv")

    # Missing comments are replaced with the NAN_WORD sentinel token.
    list_sentences_train = train_data["comment_text"].fillna(NAN_WORD).values
    list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
    y_train = train_data[CLASSES].values
    print("Tokenizing sentences in train set...")
    # words_dict starts empty for the train pass; the same dict is then
    # extended while tokenizing the test set — order matters here.
    tokenized_sentences_train, words_dict = tokenize_sentences(
        list_sentences_train, {})
    print("Tokenizing sentences in test set...")
    tokenized_sentences_test, words_dict = tokenize_sentences(
        list_sentences_test, words_dict)
    words_dict[UNKNOWN_WORD] = len(words_dict)
    print("Loading embeddings...")
    embedding_list, embedding_word_dict = read_embedding_list(
        args.embedding_path)
    embedding_size = len(embedding_list[0])
    print("Preparing data...")
    # Keep only embeddings for words that occur in the corpus, then append
    # two special rows: zeros for unknown words and -1s for end-of-text.
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)
    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)
    embedding_matrix = np.array(embedding_list)
    # NOTE(review): these np.save calls write into args.result_path, but the
    # directory is only created further down (just before prediction) — a
    # fresh run with a missing result dir would fail here; confirm.
    embedding_matrix_path = os.path.join(args.result_path,
                                         "embedding_matrix.npy")
    np.save(embedding_matrix_path, embedding_matrix)
    words_dict_path = os.path.join(args.result_path, "words_dict.npy")
    np.save(words_dict_path, words_dict)
    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_train,
                                                    id_to_word,
                                                    embedding_word_dict,
                                                    args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(tokenized_sentences_test,
                                                   id_to_word,
                                                   embedding_word_dict,
                                                   args.sentences_length)
    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)
    print(embedding_matrix.shape)
    print(embedding_matrix.shape[0])
    print(embedding_matrix.shape[1])
    # One fresh model per fold.
    get_model_func = lambda: get_model(embedding_matrix,
                                       args.sentences_length,
                                       args.dropout_rate,
                                       args.recurrent_units,
                                       args.dense_size)
    print("Starting to train models...")
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func)
    if not os.path.exists(args.result_path):
        os.mkdir(args.result_path)
    print("Predicting results...")
    test_predicts_list = []
    for fold_id, model in enumerate(models):
        # Weights are stored twice: as a pickled object array (.npy, in the
        # result dir) and in Keras HDF5 format (.h5, in the working dir).
        model_path = os.path.join(args.result_path,
                                  "model{0}_weights.npy".format(fold_id))
        np.save(model_path, model.get_weights())
        model.save_weights("model{0}_weights.h5".format(fold_id))
        test_predicts_path = os.path.join(
            args.result_path, "test_predicts{0}.npy".format(fold_id))
        test_predicts = model.predict(X_test, batch_size=args.batch_size)
        test_predicts_list.append(test_predicts)
        np.save(test_predicts_path, test_predicts)
    # Geometric mean of the fold predictions, then a power calibration.
    test_predicts = np.ones(test_predicts_list[0].shape)
    for fold_predict in test_predicts_list:
        test_predicts *= fold_predict
    test_predicts **= (1. / len(test_predicts_list))
    test_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT
    test_ids = test_data["id"].values
    test_ids = test_ids.reshape((len(test_ids), 1))
    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    submit_path = os.path.join(args.result_path, "submit")
    test_predicts.to_csv(submit_path, index=False)
    print("Predicting Discussion posts...")
    posts = pd.read_csv("posts_cleaned.csv")
    posts = posts.dropna()
    discussion_posts = posts['MSG_TEXT'].tolist()
    # NOTE(review): tokenizing the posts extends words_dict, but id_to_word
    # was frozen earlier, so tokens first seen here are absent from
    # id_to_word — verify convert_tokens_to_ids falls back to the
    # unknown-word id rather than raising.
    tokenized_discussion_posts, words_dict = tokenize_sentences(
        discussion_posts, words_dict)
    #id_to_word = dict((id, word) for word, id in words_dict.items())
    discussion_list_of_token_ids = convert_tokens_to_ids(
        tokenized_discussion_posts, id_to_word, embedding_word_dict,
        args.sentences_length)
    X_test = np.array(discussion_list_of_token_ids)
    discussion_predict_list = []
    for fold_id, model in enumerate(models):
        discussion_predicts = model.predict(X_test,
                                            batch_size=args.batch_size)
        discussion_predict_list.append(discussion_predicts)
    # Same geometric-mean blend as above, keyed by post text instead of id.
    discussion_predicts = np.ones(discussion_predict_list[0].shape)
    for fold_predict in discussion_predict_list:
        discussion_predicts *= fold_predict
    discussion_predicts **= (1. / len(discussion_predict_list))
    discussion_predicts **= PROBABILITIES_NORMALIZE_COEFFICIENT
    discussion_predicts = pd.DataFrame(data=discussion_predicts,
                                       columns=CLASSES)
    discussion_predicts['MSG_TEXT'] = discussion_posts
    discussion_predicts = discussion_predicts[["MSG_TEXT"] + CLASSES]
    discussion_predicts_path = os.path.join(args.result_path,
                                            "discussion_predicts.csv")
    discussion_predicts.to_csv(discussion_predicts_path, index=False)
def main():
    """End-to-end DPCNN training pipeline with cached preprocessing.

    Tokenized sentences and the word dictionary are pickled under
    ``--result-path`` on the first run and reloaded afterwards.  Depending on
    ``--cv`` the script either trains ``--fold-count`` models and blends
    their test predictions with a geometric mean, or trains a single model
    without cross validation.
    """
    parser = argparse.ArgumentParser(
        description="Recurrent neural network for identifying and "
                    "classifying toxic online comments")
    parser.add_argument("train_file_path")
    parser.add_argument("test_file_path")
    parser.add_argument("embedding_path")
    parser.add_argument("--result-path", default="toxic_results")
    parser.add_argument("--batch-size", type=int, default=256)
    parser.add_argument("--sentences-length", type=int, default=500)
    parser.add_argument("--recurrent-units", type=int, default=64)
    parser.add_argument("--dropout-rate", type=float, default=0.3)
    parser.add_argument("--dense-size", type=int, default=32)
    parser.add_argument("--fold-count", type=int, default=10)
    parser.add_argument("--modelname-prefix", type=str, default="")
    parser.add_argument("--cv", type=str, default="True")
    parser.add_argument("--use-roc", type=str, default="False")
    args = parser.parse_args()

    if args.fold_count <= 1:
        raise ValueError("fold-count should be more than 1")

    print('Input params')
    print(args)
    start = time.time()

    # BUG FIX: created up front — the preprocessing cache below writes into
    # this directory, but the original only created it after training, so a
    # cold run without the directory crashed on the pickle dumps.
    os.makedirs(args.result_path, exist_ok=True)

    def _cache_path(name):
        # Single place for preprocessing-cache file locations.
        return os.path.join(args.result_path, name)

    print('#' * 50)
    print("Loading data...")
    print('#' * 50)
    # BUG FIX: the original tested 'tokenized_sentences_train.pkl' three
    # times; a missing test/words_dict pickle then crashed the cache branch.
    if (os.path.exists(_cache_path('tokenized_sentences_train.pkl'))
            and os.path.exists(_cache_path('tokenized_sentences_test.pkl'))
            and os.path.exists(_cache_path('words_dict.pkl'))):
        print('Preprocessed files found. Reading preprocess files')
        train_data = pd.read_csv(args.train_file_path)
        test_data = pd.read_csv(args.test_file_path)
        y_train = train_data[CLASSES].values
        with open(_cache_path('tokenized_sentences_train.pkl'), 'rb') as f:
            tokenized_sentences_train = pickle.load(f)
        with open(_cache_path('tokenized_sentences_test.pkl'), 'rb') as f:
            tokenized_sentences_test = pickle.load(f)
        with open(_cache_path('words_dict.pkl'), 'rb') as f:
            words_dict = pickle.load(f)
    else:
        print('Preprocessed files not found.')
        train_data = pd.read_csv(args.train_file_path)
        test_data = pd.read_csv(args.test_file_path)
        # Missing comments are replaced with the NAN_WORD sentinel token.
        list_sentences_train = train_data["comment_text"].fillna(
            NAN_WORD).values
        list_sentences_test = test_data["comment_text"].fillna(NAN_WORD).values
        y_train = train_data[CLASSES].values
        print('#' * 50)
        print("Tokenizing sentences in train set...")
        print('#' * 50)
        tokenized_sentences_train, words_dict = tokenize_sentences(
            list_sentences_train, {})
        print('#' * 50)
        print("Tokenizing sentences in test set...")
        print('#' * 50)
        tokenized_sentences_test, words_dict = tokenize_sentences(
            list_sentences_test, words_dict)
        print('Saving preprocess files...')
        with open(_cache_path('tokenized_sentences_train.pkl'), 'wb') as f:
            pickle.dump(tokenized_sentences_train, f)
        with open(_cache_path('tokenized_sentences_test.pkl'), 'wb') as f:
            pickle.dump(tokenized_sentences_test, f)
        with open(_cache_path('words_dict.pkl'), 'wb') as f:
            pickle.dump(words_dict, f)

    print('total words', len(words_dict))
    words_dict[UNKNOWN_WORD] = len(words_dict)

    print('#' * 50)
    print("Loading embeddings...")
    print('#' * 50)
    if 'glove' in args.embedding_path:
        print('Reading Glove embedding')
        embedding_list, embedding_word_dict = read_embedding_list_glove(
            args.embedding_path)
    else:
        print('Reading Fasttext embedding')
        embedding_list, embedding_word_dict = read_embedding_list(
            args.embedding_path)
    embedding_size = len(embedding_list[0])
    print('Embedding size', embedding_size)

    print('#' * 50)
    print("Preparing data...")
    print('#' * 50)
    # Keep only embeddings for words that occur in the corpus, then append
    # two special rows: zeros for unknown words and -1s for end-of-text.
    embedding_list, embedding_word_dict = clear_embedding_list(
        embedding_list, embedding_word_dict, words_dict)
    embedding_word_dict[UNKNOWN_WORD] = len(embedding_word_dict)
    embedding_list.append([0.] * embedding_size)
    embedding_word_dict[END_WORD] = len(embedding_word_dict)
    embedding_list.append([-1.] * embedding_size)
    embedding_matrix = np.array(embedding_list)
    print('Embedding matrix shape:', embedding_matrix.shape)

    id_to_word = dict((id, word) for word, id in words_dict.items())
    train_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_train, id_to_word, embedding_word_dict,
        args.sentences_length)
    test_list_of_token_ids = convert_tokens_to_ids(
        tokenized_sentences_test, id_to_word, embedding_word_dict,
        args.sentences_length)
    X_train = np.array(train_list_of_token_ids)
    X_test = np.array(test_list_of_token_ids)

    # Other architectures tried at this spot (kept as a record): get_model,
    # get_model_pool, get_model_deepmoji_style, get_gru_model,
    # get_model_pool_gru_cnn, get_model_att_lstm, get_capsnet_model.
    # DPCNN is the current choice.
    def get_model_func():
        return get_dpcnn_model(
            embedding_matrix,
            args.sentences_length,
            args.dropout_rate,
            args.dense_size)

    print('#' * 50)
    print("Starting to train models...")
    print('#' * 50)
    models = train_folds(X_train, y_train, args.fold_count, args.batch_size,
                         get_model_func, args.cv, args.use_roc)

    print('#' * 50)
    print("Predicting results...")
    print('#' * 50)
    if args.cv == "True":
        test_predicts_list = []
        for fold_id, model in enumerate(models):
            model_path = os.path.join(
                args.result_path,
                "{0}_model{1}_weights.npy".format(args.modelname_prefix,
                                                  fold_id))
            np.save(model_path, model.get_weights())
            test_predicts_path = os.path.join(
                args.result_path,
                "{0}_test_predicts{1}.npy".format(args.modelname_prefix,
                                                  fold_id))
            # Inference tolerates a larger batch than training.
            test_predicts = model.predict(X_test,
                                          batch_size=args.batch_size * 2)
            test_predicts_list.append(test_predicts)
            np.save(test_predicts_path, test_predicts)
        # Geometric mean across folds (the power-calibration coefficient is
        # intentionally disabled for this pipeline).
        test_predicts = np.ones(test_predicts_list[0].shape)
        for fold_predict in test_predicts_list:
            test_predicts *= fold_predict
        test_predicts **= (1. / len(test_predicts_list))
        submit_name = "{}_submit".format(args.modelname_prefix)
    else:
        print('No Cross Validation')
        # With cv disabled, train_folds returns a single model.
        test_predicts = models.predict(X_test, batch_size=args.batch_size * 2)
        submit_name = "{}_submit_nocv".format(args.modelname_prefix)

    # Common submission writing for both branches (was duplicated).
    test_ids = test_data["id"].values.reshape((-1, 1))
    test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
    test_predicts["id"] = test_ids
    test_predicts = test_predicts[["id"] + CLASSES]
    test_predicts.to_csv(os.path.join(args.result_path, submit_name),
                         index=False)

    if args.cv == "True":
        print('#' * 50)
        print('Prediction Completed...')
        print('#' * 50)
    total_time = time.time() - start
    mins, sec = divmod(total_time, 60)
    hrs, mins = divmod(mins, 60)
    print('Total time taken : {:.0f}h {:.0f}m {:.0f}s'.format(hrs, mins, sec))