def test_folder(self):
    """End-to-end test of ktrain image classification from an image folder:
    load data, train a pretrained ResNet50, then exercise weight decay,
    top_losses, save/load, validation, and the Predictor API.
    """
    (trn, val, preproc) = vis.images_from_folder(
        datadir='image_data/image_folder',
        data_aug=vis.get_data_aug(horizontal_flip=True),
        classes=['cat', 'dog'],
        train_test_names=['train', 'valid'])
    model = vis.image_classifier('pretrained_resnet50', trn, val)
    learner = ktrain.get_learner(model=model, train_data=trn, val_data=val,
                                 batch_size=1)
    learner.freeze()
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # train
    hist = learner.autofit(1e-3, monitor=VAL_ACC_NAME)
    # test train
    self.assertAlmostEqual(max(hist.history['lr']), 1e-3)
    # an accuracy stuck at 0.5 on this two-class problem means the random
    # initialization never learned anything; re-running usually fixes it
    if max(hist.history[ACC_NAME]) == 0.5:
        raise Exception('unlucky initialization: please run test again')
    self.assertGreater(max(hist.history[ACC_NAME]), 0.8)
    # test top_losses
    obs = learner.top_losses(n=1, val_data=val)
    print(obs)
    if obs:
        self.assertIn(obs[0][0], list(range(U.nsamples_from_data(val))))
    else:
        # no misclassifications at all implies perfect validation accuracy
        self.assertEqual(max(hist.history[VAL_ACC_NAME]), 1)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    r = p.predict_folder('image_data/image_folder/train/')
    print(r)
    self.assertEqual(r[0][1], 'cat')
    r = p.predict_proba_folder('image_data/image_folder/train/')
    self.assertEqual(np.argmax(r[0][1]), 0)
    r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
    self.assertEqual(r, ['cat'])
    r = p.predict_proba_filename('image_data/image_folder/train/cat/cat.11737.jpg')
    self.assertEqual(np.argmax(r), 0)
    # predictor round-trips through save/load
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    r = p.predict_filename('image_data/image_folder/train/cat/cat.11737.jpg')
    self.assertEqual(r, ['cat'])
def mr_train(self, train_df, val_df):
    """Train a fresh Transformer classifier on *train_df*, report
    validation metrics, and store the resulting predictor on ``self.mr_c``.
    """
    # Rebuild the Transformer so every training run starts from scratch.
    self.mr_t = text.Transformer(
        self.model_name, maxlen=self.max_len, class_names=self.class_names
    )
    # Preprocess training and validation splits.
    train_data = self.mr_t.preprocess_train(
        train_df["Answer"].values, train_df["Score"].values
    )
    val_data = self.mr_t.preprocess_test(
        val_df["Answer"].values, val_df["Score"].values
    )
    # Build the classifier and its learner.
    classifier = self.mr_t.get_classifier()
    learner = ktrain.get_learner(
        classifier,
        train_data=train_data,
        val_data=val_data,
        batch_size=self.batch_size,
    )
    # Train with the 1cycle policy, then print validation results.
    learner.fit_onecycle(self.l_rate, self.train_iter)
    learner.validate(class_names=self.mr_t.get_classes())
    self.mr_c = ktrain.get_predictor(learner.model, preproc=self.mr_t)
def __init__(self, model_directory):
    """Load a trained ktrain predictor (model + preprocessor) from disk.

    Args:
        model_directory: path previously passed to ``Predictor.save``.
    """
    print("Initializing Roberta Model from:", model_directory + '...')
    self.model_directory = model_directory
    # FIX: load the saved predictor once instead of twice -- each
    # load_predictor call deserializes the full model from disk.
    loaded = ktrain.load_predictor(model_directory)
    self.model = loaded.model
    self.preproc = loaded.preproc
    self.predictor = ktrain.get_predictor(self.model, self.preproc)
    print("Initialization complete.")
def bertKtrainDataBalancing():
    """Build a class-balanced subset (up to 2000 rows per sentiment) of the
    module-level ``df_data``, split it 80/20, fine-tune a BERT classifier on
    it, and print predictions for the module-level sample sentences
    ``arr`` / ``arr1``.

    NOTE(review): assumes ``df_data`` has a default RangeIndex so that
    ``.iloc`` with label indexes is positionally correct -- confirm against
    how df_data is constructed.
    """
    posDataFrame = df_data[df_data.airline_sentiment=="positive"].airline_sentiment
    negDataFrame = df_data[df_data.airline_sentiment=="negative"].airline_sentiment
    neutralDataFrame = df_data[df_data.airline_sentiment=="neutral"].airline_sentiment
    posArray,negArray,neutArray = list(posDataFrame.index),list(negDataFrame.index),list(neutralDataFrame.index)
    # NOTE(review): only the negative indexes are shuffled; the positive and
    # neutral shuffles are commented out, so their first 2000 rows are taken
    # in dataset order -- confirm this is intentional.
    random.shuffle(negArray)#,random.shuffle(neutArray),random.shuffle(posArray)
    finalDf = pd.concat([df_data.iloc[posArray[:2000]],df_data.iloc[negArray[:2000]],df_data.iloc[neutArray[:2000]]])
    print(finalDf.airline_sentiment.value_counts())
    indexList_2 = list(finalDf.index)
    random.shuffle(indexList_2)
    # 80% train / 20% test split of the balanced subset.
    eightList_2 = [indexList_2[i] for i in range(0,len(indexList_2)*80//100)]
    data_train_2 = df_data.iloc[eightList_2]
    twentyList_2 = [indexList_2[i] for i in range(len(indexList_2)*80//100,len(indexList_2))]
    data_test_2 = df_data.iloc[twentyList_2]
    print(data_train_2.shape[0]+data_test_2.shape[0],finalDf.shape)
    print(finalDf.airline_sentiment.value_counts())
    # BERT preprocessing; texts longer than maxlen=50 tokens are truncated.
    (X_train_2,y_train_2), (X_text_2,y_test_2), preprocess2 = text.texts_from_df(data_train_2,'text','airline_sentiment',data_test_2,maxlen=50,preprocess_mode='bert')
    # NOTE(review): multilabel=True on a single-label 3-class problem --
    # confirm this is intended rather than multilabel=False.
    model2 = text.text_classifier('bert',(X_train_2,y_train_2), preproc= preprocess2,multilabel=True)
    learner2 = ktrain.get_learner(model2,(X_train_2,y_train_2),val_data=(X_text_2,y_test_2),batch_size=6)
    learner2.lr_find()
    learner2.lr_plot() #1e-6/1e-3
    learner2.fit_onecycle(lr=1e-6,epochs=1)
    predictor2 = ktrain.get_predictor(learner2.model,preprocess2)
    print("Normal Data : ",predictor2.predict(arr))
    print("Clean Data : ",predictor2.predict(arr1))
def test_transformers_api_1(self):
    """End-to-end test of the texts_from_array + text_classifier route for
    DistilBERT: train, then exercise top_losses, weight decay, save/load,
    validation, and the Predictor API.
    """
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="distilbert",
        maxlen=500,
        max_features=35000,
    )
    model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
    )
    # test weight decay
    # NOTE due to transformers and/or AdamW bug,
    # val_accuracy is missing in training history if setting weight decay prior to training
    # self.assertEqual(learner.get_weight_decay(), None)
    # learner.set_weight_decay(1e-2)
    # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # train
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay (set after training; see NOTE above)
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder)
    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def training(train_frame):
    """Fine-tune a BERT text classifier on *train_frame* and save the
    resulting predictor to ``./predictor``.

    The frame is shuffled, then split 90/10 into train/validation.
    """
    shuffled = train_frame.sample(frac=1)
    split_at = int(len(shuffled) * 0.9)
    train_df = shuffled[:split_at]
    self_test_df = shuffled[split_at:]
    # text.texts_from_df returns two (x, y) tuples plus the preprocessor.
    # Texts longer than maxlen=50 tokens are truncated, and
    # preprocess_mode='bert' selects BERT-style preprocessing.
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )
    # Build the BERT classifier and its learner.
    classifier = text.text_classifier(
        name='bert', train_data=(X_train, y_train), preproc=preprocess
    )
    learner = ktrain.get_learner(
        model=classifier,
        train_data=(X_train, y_train),
        val_data=(X_test, y_test),
        batch_size=32,
    )
    # fit_onecycle applies the one-cycle learning-rate policy.
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')
    # Wrap the trained model in a predictor and persist it.
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
def bertKtrain():
    """Fine-tune a BERT sentiment classifier on the module-level ``df_data``
    (80/20 split) and publish the trained predictor via the module-level
    ``predictor`` global.

    Returns a usage hint string.
    """
    global predictor
    import ktrain,random
    from ktrain import text
    import tensorflow as tf
    arr = ["the service is good", "The cost is expensive and customer service sucked","the flight was late but prices are ok","service is fine and cost is also fine"]
    # FIX: the comprehension previously used 'text' as its loop variable,
    # shadowing the just-imported ktrain 'text' module inside the
    # comprehension; use a distinct name.
    arr1 = [cleanSentence(sentence) for sentence in arr]
    # NOTE(review): this predicts with the PREVIOUS global predictor before
    # training; it fails if no predictor was set yet -- confirm intent.
    predictor.predict(arr)
    # 80% train / 20% test split of df_data.
    indexList = list(df_data.index)
    random.shuffle(indexList)
    eightList = [indexList[i] for i in range(0,len(indexList)*80//100)]
    data_train = df_data.iloc[eightList]
    twentyList = [indexList[i] for i in range(len(indexList)*80//100,len(indexList))]
    data_test = df_data.iloc[twentyList]
    print(data_train.shape[0]+data_test.shape[0],df_data.shape)
    # BERT preprocessing; texts longer than maxlen=100 tokens are truncated.
    (X_train,y_train), (X_text,y_test), preprocess = text.texts_from_df(data_train,'text','airline_sentiment',data_test,maxlen=100,preprocess_mode='bert')
    model = text.text_classifier('bert',(X_train,y_train), preproc= preprocess,multilabel=False)
    learner = ktrain.get_learner(model,(X_train,y_train),val_data=(X_text,y_test),batch_size=6)
    learner.lr_find()
    learner.lr_plot()
    learner.fit_onecycle(lr=1e-3,epochs=1) #learning rate 1e-3/1e-6
    predictor = ktrain.get_predictor(learner.model,preprocess)
    predictor.predict(arr)
    return "Use predictor.predict([]) to predict in future"
def fit_bert(self, train_docs, train_targets, labels):
    """Fine-tune a transformer classifier on title+abstract texts and
    store the resulting predictor on ``self.predictor``.

    Args:
        train_docs: dicts with 'title' and 'abstract' keys.
        train_targets: training labels aligned with train_docs.
        labels: list of class names.

    Raises:
        ValueError: if params['clf_model'] is empty.
    """
    import ktrain
    from ktrain import text
    # FIX: 'assert' is stripped under 'python -O'; validate explicitly.
    # (Also removed the unused 'from tensorflow import keras'.)
    if self.params['clf_model'] == '':
        raise ValueError("params['clf_model'] must be set to a model name")
    t = text.Transformer(self.params['clf_model'], maxlen=500, class_names=labels)
    # Each document is the title and abstract joined by a newline.
    train_texts = [d['title'] + "\n" + d['abstract'] for d in train_docs]
    trn = t.preprocess_train(train_texts, train_targets)
    model = t.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn,
                                 batch_size=self.params['clf_batch_size'])
    learner.fit_onecycle(self.params['clf_learning_rate'], self.params['clf_epochs'])
    #self.t = t
    #self.learner = learner
    self.predictor = ktrain.get_predictor(learner.model, preproc=t)
def test_ner(self):
    """End-to-end test of the bilstm-crf sequence tagger: train one epoch,
    then exercise top_losses, weight decay, save/load, and entity
    prediction via the Predictor API.
    """
    model = txt.sequence_tagger('bilstm-crf', self.preproc)
    learner = ktrain.get_learner(model, train_data=self.trn, val_data=self.val)
    lr = 0.001
    hist = learner.fit(lr, 1)
    # test training results
    #self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(learner.validate(), 0.65)
    # test top losses
    obs = learner.top_losses(n=1)
    self.assertIn(obs[0][0], list(range(len(self.val.x))))
    learner.view_top_losses(n=1)
    # test weight decay
    self.assertEqual(len(learner.get_weight_decay()), 2)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test predictor: 'Smith' (second-to-last token) should be tagged I-PER
    SENT = 'There is a man named John Smith.'
    p = ktrain.get_predictor(learner.model, self.preproc)
    self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(SENT)[-2][1], 'I-PER')
def test_cora(self):
    """End-to-end test of GraphSAGE node classification on the Cora
    citation dataset: train, then exercise top_losses, weight decay,
    save/load, validation, and transductive prediction.
    """
    (trn, val, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv(
        "graph_data/cora/cora.content",
        "graph_data/cora/cora.cites",
        sample_size=20,
        holdout_pct=0.1,
        holdout_for_inductive=True,
        train_pct=0.1,
        sep="\t",
    )
    learner = ktrain.get_learner(
        model=gr.graph_node_classifier(
            "graphsage",
            trn,
        ),
        train_data=trn,
        # val_data=val,
        batch_size=64,
    )
    lr = 0.01
    hist = learner.autofit(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[ACC_NAME]), 0.9)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate
    learner.validate(val_data=val)
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        if i == 5:
            continue  # many 5s are classified as 6s
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
    p.predict_transductive(val.ids[0:1])
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertIn(p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
def test_cora(self):
    """End-to-end test of GraphSAGE node classification on the Cora
    citation dataset (older variant): train, then exercise top_losses,
    weight decay, save/load, validation, and transductive prediction.
    """
    (trn, val, preproc, df_holdout, G_complete) = gr.graph_nodes_from_csv('graph_data/cora/cora.content',
                                                                          'graph_data/cora/cora.cites',
                                                                          sample_size=20,
                                                                          holdout_pct=0.1,
                                                                          holdout_for_inductive=True,
                                                                          train_pct=0.1,
                                                                          sep='\t')
    learner = ktrain.get_learner(
        model=gr.graph_node_classifier(
            'graphsage',
            trn,
        ),
        train_data=trn,
        #val_data=val,
        batch_size=64)
    lr = 0.01
    hist = learner.autofit(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['acc']), 0.9)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(len(learner.get_weight_decay()), 1)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate
    learner.validate(val_data=val)
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertIn(
        p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
    p.predict_transductive(val.ids[0:1])
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertIn(
        p.predict_transductive(val.ids[0:1])[0], preproc.get_classes())
def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves trained BERT model in NLP/BERT/log directory.

    :params
        epochs: number of epochs to train the network
        batchSize: size of batches for training
    :return N/A
    '''
    # blockPrint()
    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    ouput_msg = "Begin training the BERT network ..."
    print(colored(ouput_msg, 'cyan'))
    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')
    # BUG FIX: the function previously clobbered its parameters with
    # hard-coded values (batchSize = 4, epochs = 1), so callers could not
    # control training; the arguments are now honored.
    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])
    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)
    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)
    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    ouput_msg = "Saving the trained BERT model in NLP/log/model.h5 ..."
    print(colored(ouput_msg, 'cyan'))
    # FIX: save relative to this file rather than the process CWD
    # (the predictor was previously saved to '../log' relative to CWD).
    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save(save_dir)
    save_file = os.path.join(current_dir, '../log/bert_model.h5')
    learner.save_model(save_file)
def test_bigru(self):
    """End-to-end test of the bigru text classifier: train with autofit,
    then exercise top_losses, weight decay, save/load, validation, and
    the Predictor API.
    """
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="standard",
        maxlen=350,
        max_features=35000,
        ngram_range=1,
    )
    model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32, eval_batch_size=EVAL_BS)
    lr = 0.01
    hist = learner.autofit(lr, 1)
    # test training results (autofit uses triangular policy: momentum
    # should cycle between 0.85 and 0.95)
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
    self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
    self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def train_gru(x_train, y_train, x_test, y_test, preproc, bs=5):
    """Train a bigru text classifier, persist the predictor to
    ``models_path``, and print validation metrics.

    Args:
        x_train, y_train: preprocessed training data.
        x_test, y_test: preprocessed validation data.
        preproc: the ktrain preprocessor used to build the data.
        bs: training batch size.
    """
    model = text.text_classifier("bigru", (x_train, y_train), preproc=preproc)
    # BUG FIX: 'bs' was accepted but never used; pass it through as
    # batch_size (consistent with the sibling train_svm function).
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test),
        batch_size=bs
    )
    # Estimate a good learning rate, then train with the smallest suggestion.
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
    learner.validate(class_names=preproc.get_classes())
def run_kfold(clf=None, X_all=df.text, y_all=df.sentiment, mod_type='scikit-learn'):
    """Run 10-fold cross-validation with either a scikit-learn classifier
    or a freshly fine-tuned BERT model per fold.

    Args:
        clf: scikit-learn estimator (required when mod_type='scikit-learn').
        X_all, y_all: text and sentiment series (default: module-level df).
        mod_type: 'scikit-learn' or 'bert'.

    Returns:
        Tuple of (mean_accuracy, mean_precision, mean_recall, mean_f1,
        std_accuracy, std_precision, std_recall, std_f1), where
        precision/recall/f1 are the weighted averages per fold.
    """
    kf = KFold(n_splits=10)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    fold = 0
    for train_index, test_index in kf.split(X_all):
        fold += 1
        if mod_type == 'scikit-learn':
            # FIX: removed a stray non-ASCII character ('~') that made this
            # line a syntax error.
            X_train, X_test = X_all.values[train_index], X_all.values[test_index]
            y_train, y_test = y_all.values[train_index], y_all.values[test_index]
            clf.fit(X_train, y_train)
            predictions = clf.predict(X_test)
        elif mod_type == 'bert':
            X_train, y_train = df.iloc[train_index, 0], df.iloc[train_index, 1]
            # BUG FIX: the test split previously reused train_index, so the
            # model was evaluated on its own training data.
            X_test, y_test = df.iloc[test_index, 0], df.iloc[test_index, 1]
            MODEL_NAME = 'bert-base-multilingual-uncased'  # main model 1; check out https://towardsdatascience.com/text-classification-with-hugging-face-transformers-in-tensorflow-2-without-tears-ee50e4f3e7ed
            t = text.Transformer(MODEL_NAME, maxlen=500, classes=[0, 1])
            trn = t.preprocess_train(X_train, y_train)
            val = t.preprocess_test(X_test, y_test)
            model = t.get_classifier()
            learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                         batch_size=6)
            learner.lr_find(show_plot=False, max_epochs=2)
            # replace 5e-5 with optimal learning rate from above (i.e., apex of valley)
            learner.fit_onecycle(5e-5, 4)
            predictor = ktrain.get_predictor(learner.model, preproc=t)
            predictions = X_test.apply(lambda x: predictor.predict(x))
        # Accumulate weighted-average metrics for this fold.
        accuracy.append(accuracy_score(y_test, predictions))
        precision.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['precision'])
        recall.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['recall'])
        f1.append(classification_report(
            y_test, predictions, output_dict=True)['weighted avg']['f1-score'])
    mean_accuracy = np.mean(accuracy)
    mean_precision = np.mean(precision)
    mean_recall = np.mean(recall)
    mean_f1 = np.mean(f1)
    std_accuracy = np.std(accuracy)
    std_precision = np.std(precision)
    std_recall = np.std(recall)
    std_f1 = np.std(f1)
    return (mean_accuracy, mean_precision, mean_recall, mean_f1,
            std_accuracy, std_precision, std_recall, std_f1)
def train_svm(x_train, y_train, x_test, y_test, preproc, bs=5):
    """Train an NBSVM text classifier, report diagnostics, and persist the
    resulting predictor to ``models_path``.
    """
    clf = text.text_classifier("nbsvm", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        clf,
        train_data=(x_train, y_train),
        val_data=(x_test, y_test),
        batch_size=bs,
    )
    # Find a learning rate, then train with the smallest suggested value.
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    # Show the worst validation examples and the per-class report.
    learner.view_top_losses(n=10, preproc=preproc)
    learner.validate(class_names=preproc.get_classes())
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
def retrain(self, returned_output):
    """Fine-tune the current predictor's model on reviewed clauses and
    persist the refreshed predictor.

    *returned_output* is an iterable of dicts with 'clause' and
    'prediction' keys; a prediction of 'Unacceptable' maps to label 1,
    anything else to 0.
    """
    clauses = [item['clause'] for item in returned_output]
    targets = [
        1 if item['prediction'] == 'Unacceptable' else 0
        for item in returned_output
    ]
    current_model = self.predictor.model
    train_set = self.t.preprocess_train(clauses, targets)
    learner = ktrain.get_learner(current_model, train_data=train_set, batch_size=6)
    learner.fit_onecycle(3e-5, 6)
    # Replace the live predictor with the retrained one and save it.
    self.predictor = ktrain.get_predictor(learner.model, preproc=self.t)
    self.predictor.save('gsa_server/resources/xlnet_6epoch_3e-5')
def __init__(self):
    """Load the preprocessing pipeline and the trained ktrain predictor
    from the 'trained_predictor' directory.
    """
    # Load pre-trained model
    print("***Loading preprocessing pipeline")
    self.dpl = Datapipeline()
    print("***Preprocessing pipeline loaded")
    print("***Loading model***")
    weights_path = "trained_predictor"
    # FIX: load the saved predictor once instead of twice -- each
    # load_predictor call deserializes the full model from disk.
    loaded = ktrain.load_predictor(weights_path)
    self.model = loaded.model
    self.preproc = loaded.preproc
    self.predictor = ktrain.get_predictor(self.model, self.preproc)
    print("***Model loaded***")
def test_fasttext_chinese(self):
    """End-to-end test of the fasttext classifier on Chinese hotel reviews:
    train with autofit, then exercise top_losses, weight decay, save/load,
    validation, and the Predictor API.
    """
    trn, val, preproc = txt.texts_from_csv(
        "./text_data/chinese_hotel_reviews.csv",
        "content",
        label_columns=["pos", "neg"],
        max_features=30000,
        maxlen=75,
        preprocess_mode="standard",
        sep="|",
    )
    model = txt.text_classifier("fasttext", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate
    cm = learner.validate(class_names=preproc.get_classes())
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], "pos")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertEqual(p.predict(TEST_DOC), "pos")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_nbsvm(self):
    """End-to-end test of the nbsvm classifier: train with fit_onecycle,
    then exercise top_losses, weight decay, save/load, validation, and the
    Predictor API.
    """
    trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                             y_train=self.trn[1],
                                             x_test=self.val[0],
                                             y_test=self.val[1],
                                             class_names=self.classes,
                                             preprocess_mode='standard',
                                             maxlen=700,
                                             max_features=35000,
                                             ngram_range=3)
    model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 10)
    # test training results (1cycle policy: momentum cycles 0.85-0.95)
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.92)
    self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
    self.assertAlmostEqual(min(hist.history['momentum']), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay: nbsvm has no decayable layers, so this is a no-op
    self.assertEqual(len(learner.get_weight_decay()), 0)
    learner.set_weight_decay(1e-4)
    self.assertEqual(len(learner.get_weight_decay()), 0)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_cora(self):
    """End-to-end test of GraphSAGE link prediction on the Cora citation
    graph: train, then exercise top_losses, weight decay, save/load,
    validation, and edge prediction via the Predictor API.
    """
    (trn, val, preproc) = gr.graph_links_from_csv('graph_data/cora/cora.content',
                                                  'graph_data/cora/cora.cites',
                                                  sep='\t')
    learner = ktrain.get_learner(model=gr.graph_link_predictor(
        'graphsage', trn, preproc),
        train_data=trn,
        val_data=val)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 5)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.78)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.targets.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate
    learner.validate(val_data=val)
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertIn(
        p.predict(preproc.G, list(preproc.G.edges()))[:5][0],
        preproc.get_classes())
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(
        p.predict(preproc.G, list(preproc.G.edges()))[:5][0],
        preproc.get_classes()[1])
def get_predictor(self, directory, predictor=None):
    """This method obtains the predictor from the trained model, saves it,
    and uses it to carry out a new set of predictions."""
    if predictor is not None:
        # Reuse an externally supplied predictor.
        self.predictor = predictor
    else:
        # Build one from the trained learner and persist it.
        self.predictor = ktrain.get_predictor(self.learner.model,
                                              self.preprocessing)
        self.predictor.save(directory + '/predictor')
    columns = ['id', 'content', 'buy', 'sell', 'do_nothing', 'is_validation']
    self.predictions = self.data[columns].copy()
    contents = self.predictions['content'].tolist()
    self.predictions['prediction'] = self.predictor.predict(contents)
def test_fasttext_chinese(self):
    """End-to-end test of the fasttext classifier on Chinese hotel reviews
    (older variant): train with autofit, then exercise top_losses, weight
    decay, save/load, validation, and the Predictor API.
    """
    trn, val, preproc = txt.texts_from_csv('./text_data/chinese_hotel_reviews.csv',
                                           'content',
                                           label_columns=["pos", "neg"],
                                           max_features=30000,
                                           maxlen=75,
                                           preprocess_mode='standard',
                                           sep='|')
    # NOTE(review): unlike the sibling test, no preproc is passed to
    # text_classifier here -- confirm fasttext does not require it.
    model = txt.text_classifier('fasttext', train_data=trn)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.85)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(len(learner.get_weight_decay()), 2)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'pos')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_bert(self):
    """End-to-end test of the BERT classifier: train one epoch, then
    exercise top_losses, weight decay, save/load, validation, and the
    Predictor API.
    """
    trn, val, preproc = txt.texts_from_array(x_train=self.trn[0],
                                             y_train=self.trn[1],
                                             x_test=self.val[0],
                                             y_test=self.val[1],
                                             class_names=self.classes,
                                             preprocess_mode='bert',
                                             maxlen=350,
                                             max_features=35000)
    model = txt.text_classifier('bert', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, batch_size=6,
                                 eval_batch_size=EVAL_BS)
    lr = 2e-5
    hist = learner.fit_onecycle(lr, 1)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[ACC_NAME]), 0.7)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(len(val[0][0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')
    # test validate
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def fit(self, train_strings, y_train):
    """Fine-tune a BERT classifier on *train_strings*/*y_train*, plot the
    training loss, and keep the model, learner, and predictor on ``self``.
    """
    # Fixed seed for reproducible TF initialization.
    tf.random.set_random_seed(0)
    processed = text.texts_from_array(train_strings,
                                      y_train,
                                      class_names=["low", "high"],
                                      preprocess_mode="bert",
                                      maxlen=300,
                                      lang="en")
    (x_train, y_train), (x_test, y_test), preproc = processed
    self.model = text.text_classifier('bert', (x_train, y_train),
                                      preproc=preproc)
    learner = ktrain.get_learner(self.model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=12)
    self.learner = learner
    learner.fit_onecycle(1e-5, 1)
    # Show the loss curve for this run.
    learner.plot('loss')
    plt.show()
    self.predictor = ktrain.get_predictor(learner.model, preproc)
def test_transformers_api_2(self):
    """End-to-end test of the Transformer preprocessor route for
    DistilBERT: train, then exercise top_losses, weight decay, save/load,
    validation, and the Predictor API.
    """
    MODEL_NAME = 'distilbert-base-uncased'
    preproc = txt.Transformer(MODEL_NAME, maxlen=500, classes=self.classes)
    trn = preproc.preprocess_train(self.trn[0], self.trn[1])
    val = preproc.preprocess_test(self.val[0], self.val[1])
    model = preproc.get_classifier()
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=6, eval_batch_size=EVAL_BS)
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)
    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)
    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder)
    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_classification(self):
    """End-to-end test of the tabular MLP classifier on the Titanic data:
    train, then exercise top_losses, weight decay, save/load, evaluation,
    and the Predictor API.
    """
    train_df = pd.read_csv("tabular_data/train.csv", index_col=0)
    # FIX: the positional 'axis' argument to DataFrame.drop was deprecated
    # in pandas 1.x and removed in pandas 2.0; use the keyword form.
    train_df = train_df.drop("Name", axis=1)
    train_df = train_df.drop("Ticket", axis=1)
    trn, val, preproc = tabular.tabular_from_df(train_df, label_columns="Survived",
                                                random_state=42)
    model = tabular.tabular_classifier("mlp", trn)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 0.001
    hist = learner.fit_onecycle(lr, 30)
    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.8)
    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(val.df.shape[0])))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)
    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)
    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")
    # test validate
    cm = learner.evaluate(val)
    print(cm)
    for i, row in enumerate(cm):
        # diagonal of the confusion matrix should dominate each row
        self.assertEqual(np.argmax(row), i)
    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    predicted_label = p.predict(train_df)[0]
    self.assertIn(predicted_label, preproc.get_classes())
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    # predictions must be stable across a save/load round-trip
    self.assertEqual(p.predict(train_df)[0], predicted_label)
def test_regression(self):
    """Train a tabular MLP regressor on the adults dataset (target: age)
    and verify training, top losses, save/load, and prediction."""
    trn_data, val_data, prep = tabular.tabular_from_csv(
        "tabular_data/adults.csv",
        label_columns=["age"],
        is_regression=True,
        random_state=42,
    )
    reg_model = tabular.tabular_regression_model("mlp", trn_data)
    lrn = ktrain.get_learner(reg_model, train_data=trn_data, val_data=val_data, batch_size=128)
    lr = 0.001
    history = lrn.autofit(lr, 5)

    # training results: LR peaked at lr, validation MAE under 8 years
    self.assertAlmostEqual(max(history.history["lr"]), lr)
    self.assertLess(min(history.history["val_mae"]), 8.0)

    # top losses refer to a valid validation row index
    losses = lrn.top_losses(n=1, val_data=val_data)
    self.assertIn(losses[0][0], list(range(val_data.df.shape[0])))
    lrn.view_top_losses(preproc=prep, n=1, val_data=val_data)

    # weight-decay getter/setter round-trip
    self.assertEqual(lrn.get_weight_decay(), None)
    lrn.set_weight_decay(1e-2)
    self.assertAlmostEqual(lrn.get_weight_decay(), 1e-2)

    # model save/load round-trip
    lrn.save_model("/tmp/test_model")
    lrn.load_model("/tmp/test_model")

    # evaluate on validation data (smoke check only)
    cm = lrn.evaluate(val_data)

    # predictor: prediction is a plausible age and survives save/load
    predictor = ktrain.get_predictor(lrn.model, prep)
    train_df = pd.read_csv("tabular_data/adults.csv")
    age = predictor.predict(train_df)[0][0]
    self.assertLess(age, 100)
    predictor.save("/tmp/test_predictor")
    predictor = ktrain.load_predictor("/tmp/test_predictor")
    self.assertAlmostEqual(predictor.predict(train_df)[0][0], age)
def test_distilbert(self):
    """Text regression with a distilbert backbone: train one cycle,
    inspect losses, save/load model and predictor, predict."""
    train_set, valid_set, prep = txt.texts_from_array(x_train=self.trn[0], y_train=self.trn[1],
                                                      x_test=self.val[0], y_test=self.val[1],
                                                      preprocess_mode='distilbert',
                                                      maxlen=75)
    reg_model = txt.text_regression_model('distilbert', train_data=train_set, preproc=prep)
    lrn = ktrain.get_learner(reg_model, train_data=train_set, val_data=valid_set, batch_size=100)
    lr = 5e-5
    history = lrn.fit_onecycle(lr, 1)

    # training results: LR peaked at lr, validation MAE under 16
    self.assertAlmostEqual(max(history.history['lr']), lr)
    self.assertLess(min(history.history['val_mae']), 16)

    # top losses refer to a valid validation-sample index
    losses = lrn.top_losses(n=1, val_data=None)
    self.assertIn(losses[0][0], list(range(len(valid_set.x))))
    lrn.view_top_losses(preproc=prep, n=1, val_data=None)

    # weight-decay getter/setter round-trip
    self.assertEqual(lrn.get_weight_decay(), None)
    lrn.set_weight_decay(1e-2)
    self.assertAlmostEqual(lrn.get_weight_decay(), 1e-2)

    # model save/load round-trip (load needs the preprocessor)
    ckpt_dir = ktrain.imports.tempfile.mkdtemp()
    lrn.save_model(ckpt_dir)
    lrn.load_model(ckpt_dir, preproc=prep)

    # predictor: regression output before and after save/load; no explain
    predictor = ktrain.get_predictor(lrn.model, prep, batch_size=64)
    self.assertGreater(predictor.predict([TEST_DOC])[0], 1)
    pred_dir = ktrain.imports.tempfile.mkdtemp()
    predictor.save(pred_dir)
    predictor = ktrain.load_predictor(pred_dir, batch_size=64)
    self.assertGreater(predictor.predict([TEST_DOC])[0], 1)
    self.assertIsNone(predictor.explain(TEST_DOC))
def test_linreg(self):
    """Linear text regression on n-gram features: train, inspect losses,
    save/load model and predictor, predict."""
    train_set, valid_set, prep = txt.texts_from_array(x_train=self.trn[0], y_train=self.trn[1],
                                                      x_test=self.val[0], y_test=self.val[1],
                                                      preprocess_mode='standard',
                                                      ngram_range=3,
                                                      maxlen=200,
                                                      max_features=35000)
    lin_model = txt.text_regression_model('linreg', train_data=train_set, preproc=prep)
    lrn = ktrain.get_learner(lin_model, train_data=train_set, val_data=valid_set, batch_size=256)
    lr = 0.01
    history = lrn.fit_onecycle(lr, 10)

    # training results: LR peaked at lr, validation MAE under 12
    self.assertAlmostEqual(max(history.history['lr']), lr)
    self.assertLess(min(history.history['val_mae']), 12)

    # top losses refer to a valid validation-sample index
    losses = lrn.top_losses(n=1, val_data=None)
    self.assertIn(losses[0][0], list(range(len(valid_set[0]))))
    lrn.view_top_losses(preproc=prep, n=1, val_data=None)

    # weight-decay getter/setter round-trip
    self.assertEqual(lrn.get_weight_decay(), None)
    lrn.set_weight_decay(1e-2)
    self.assertAlmostEqual(lrn.get_weight_decay(), 1e-2)

    # model save/load round-trip
    lrn.save_model('/tmp/test_model')
    lrn.load_model('/tmp/test_model')

    # predictor: regression output before and after save/load; no explain
    predictor = ktrain.get_predictor(lrn.model, prep)
    self.assertGreater(predictor.predict([TEST_DOC])[0], 100)
    predictor.save('/tmp/test_predictor')
    predictor = ktrain.load_predictor('/tmp/test_predictor')
    self.assertGreater(predictor.predict([TEST_DOC])[0], 100)
    self.assertIsNone(predictor.explain(TEST_DOC))