def test_transformers_api_1(self):
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="distilbert",
        maxlen=500,
        max_features=35000,
    )
    model = txt.text_classifier("distilbert", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=trn, val_data=val, batch_size=6, eval_batch_size=EVAL_BS
    )

    # test weight decay
    # NOTE: due to a transformers and/or AdamW bug, val_accuracy is missing
    # from the training history if weight decay is set prior to training
    # self.assertEqual(learner.get_weight_decay(), None)
    # learner.set_weight_decay(1e-2)
    # self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # train
    lr = 5e-5
    hist = learner.fit_onecycle(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.9)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val.x))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    learner.save_model(tmp_folder)
    learner.load_model(tmp_folder)

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    tmp_folder = ktrain.imports.tempfile.mkdtemp()
    p.save(tmp_folder)
    p = ktrain.load_predictor(tmp_folder, batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def bertKtrain():
    global predictor
    import random

    import ktrain
    import tensorflow as tf
    from ktrain import text

    arr = [
        "the service is good",
        "The cost is expensive and customer service sucked",
        "the flight was late but prices are ok",
        "service is fine and cost is also fine",
    ]
    # use a loop variable that does not shadow the `ktrain.text` module
    arr1 = [cleanSentence(s) for s in arr]
    # predictor.predict(arr)  # would fail here: the global predictor is not
    #                         # defined until after training below

    # 80/20 train/test split on shuffled row indices
    indexList = list(df_data.index)
    random.shuffle(indexList)
    split = len(indexList) * 80 // 100
    data_train = df_data.iloc[indexList[:split]]
    data_test = df_data.iloc[indexList[split:]]
    print(data_train.shape[0] + data_test.shape[0], df_data.shape)

    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        data_train, 'text', 'airline_sentiment', data_test,
        maxlen=100, preprocess_mode='bert')
    model = text.text_classifier('bert', (X_train, y_train),
                                 preproc=preprocess, multilabel=False)
    learner = ktrain.get_learner(model, (X_train, y_train),
                                 val_data=(X_test, y_test), batch_size=6)
    learner.lr_find()
    learner.lr_plot()
    learner.fit_onecycle(lr=1e-3, epochs=1)  # learning rate 1e-3/1e-6
    predictor = ktrain.get_predictor(learner.model, preprocess)
    predictor.predict(arr)
    return "Use predictor.predict([]) to predict in future"
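# A minimal follow-up sketch (not part of the original function): persisting
# the predictor produced by bertKtrain() with ktrain's standard save/load API.
# The 'bert_predictor' path is an illustrative assumption.
import ktrain

predictor.save('bert_predictor')                    # writes weights + preprocessor
reloaded = ktrain.load_predictor('bert_predictor')  # restore in a later session
print(reloaded.predict(["the service is good"]))    # -> list with one label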
def training(train_frame):
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = (train_frame[:train_test_part],
                              train_frame[train_test_part:])

    # text.texts_from_df returns two tuples and a preprocessor
    # maxlen=50: anything longer gets truncated
    # preprocess_mode='bert': preprocess the text for the BERT model
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )

    # using BERT model
    model = text.text_classifier(name='bert', train_data=(X_train, y_train),
                                 preproc=preprocess)
    learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                                 val_data=(X_test, y_test), batch_size=32)

    # fit_onecycle uses the one cycle policy callback
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')

    # get predictor and save
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
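# Hedged usage sketch: training() above saves its predictor to the
# 'predictor' folder, so inference in a new process only needs
# ktrain.load_predictor. The sample sentence is an illustrative assumption.
import ktrain

predictor = ktrain.load_predictor('predictor')
print(predictor.predict('I am thrilled with this result'))  # -> an emotion label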
def bertKtrainDataBalancing():
    # balance the three sentiment classes by sampling 2000 rows of each
    posDataFrame = df_data[df_data.airline_sentiment == "positive"].airline_sentiment
    negDataFrame = df_data[df_data.airline_sentiment == "negative"].airline_sentiment
    neutralDataFrame = df_data[df_data.airline_sentiment == "neutral"].airline_sentiment
    posArray = list(posDataFrame.index)
    negArray = list(negDataFrame.index)
    neutArray = list(neutralDataFrame.index)
    random.shuffle(posArray)
    random.shuffle(negArray)
    random.shuffle(neutArray)
    finalDf = pd.concat([df_data.iloc[posArray[:2000]],
                         df_data.iloc[negArray[:2000]],
                         df_data.iloc[neutArray[:2000]]])
    print(finalDf.airline_sentiment.value_counts())

    # 80/20 train/test split on the balanced frame
    # (df_data.iloc with finalDf's labels assumes df_data has a default RangeIndex)
    indexList_2 = list(finalDf.index)
    random.shuffle(indexList_2)
    split = len(indexList_2) * 80 // 100
    data_train_2 = df_data.iloc[indexList_2[:split]]
    data_test_2 = df_data.iloc[indexList_2[split:]]
    print(data_train_2.shape[0] + data_test_2.shape[0], finalDf.shape)
    print(finalDf.airline_sentiment.value_counts())

    (X_train_2, y_train_2), (X_test_2, y_test_2), preprocess2 = text.texts_from_df(
        data_train_2, 'text', 'airline_sentiment', data_test_2,
        maxlen=50, preprocess_mode='bert')
    # the sentiment classes are mutually exclusive, so this is single-label
    model2 = text.text_classifier('bert', (X_train_2, y_train_2),
                                  preproc=preprocess2, multilabel=False)
    learner2 = ktrain.get_learner(model2, (X_train_2, y_train_2),
                                  val_data=(X_test_2, y_test_2), batch_size=6)
    learner2.lr_find()
    learner2.lr_plot()
    # 1e-6/1e-3
    learner2.fit_onecycle(lr=1e-6, epochs=1)
    predictor2 = ktrain.get_predictor(learner2.model, preprocess2)
    print("Normal Data : ", predictor2.predict(arr))
    print("Clean Data : ", predictor2.predict(arr1))
def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model.
    Saves the trained BERT model in the NLP/BERT/log directory.

    :params
        epochs: number of epochs to train the network
        batchSize: size of batches for training
    :return N/A
    '''
    # blockPrint()
    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    output_msg = "Begin training the BERT network ..."
    print(colored(output_msg, 'cyan'))
    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])
    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)
    predictor = ktrain.get_predictor(learner.model, preproc=preproc)
    predictor.save('../log')

    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    output_msg = "Saving the trained BERT model in NLP/log/bert_model.h5 ..."
    print(colored(output_msg, 'cyan'))
    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_file = os.path.join(current_dir, '../log/bert_model.h5')
    learner.save_model(save_file)
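# Hedged usage sketch: since train() calls predictor.save('../log'), the
# trained classifier can be restored for inference from that directory.
# The sample document is an illustrative assumption.
import ktrain

predictor = ktrain.load_predictor('../log')
print(predictor.get_classes())                  # -> ['0', '1']
print(predictor.predict('sample document text'))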
def test_bigru(self):
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode="standard",
        maxlen=350,
        max_features=35000,
        ngram_range=1,
    )
    model = txt.text_classifier("bigru", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=trn, val_data=val, batch_size=32, eval_batch_size=EVAL_BS
    )
    lr = 0.01
    hist = learner.autofit(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.89)
    self.assertAlmostEqual(max(hist.history["momentum"]), 0.95)
    self.assertAlmostEqual(min(hist.history["momentum"]), 0.85)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor", batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], "soc.religion.christian")
    self.assertEqual(p.predict(TEST_DOC), "soc.religion.christian")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def define_model_and_learner(self):
    """Once the training and testing data have been preprocessed,
    a ktrain model and a learner can be defined."""
    self.model = text.text_classifier(self.model_name,
                                      self.train_preprocessed,
                                      preproc=self.preprocessing,
                                      multilabel=False)
    self.learner = ktrain.get_learner(self.model,
                                      train_data=self.train_preprocessed,
                                      val_data=self.test_preprocessed,
                                      batch_size=self.batch_size)
def train_gru(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("bigru", (x_train, y_train), preproc=preproc)
    # pass bs through so the batch-size parameter is not silently ignored
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test),
        batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
    learner.validate(class_names=preproc.get_classes())
def train_svm(x_train, y_train, x_test, y_test, preproc, bs=5):
    model = text.text_classifier("nbsvm", (x_train, y_train), preproc=preproc)
    learner = ktrain.get_learner(
        model, train_data=(x_train, y_train), val_data=(x_test, y_test),
        batch_size=bs
    )
    learner.lr_find(suggest=True)
    grad_lr = learner.lr_estimate()
    learner.autofit(min(grad_lr), 10)
    learner.view_top_losses(n=10, preproc=preproc)
    learner.validate(class_names=preproc.get_classes())
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.save(str(models_path))
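# Hedged usage sketch for train_gru/train_svm above, assuming the inputs come
# from text.texts_from_folder with the standard train/test layout. The
# 'data/reviews' directory and the models_path global used inside the
# functions are illustrative assumptions.
from ktrain import text

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'data/reviews',                  # assumed folder containing train/ and test/
    preprocess_mode='standard',
    maxlen=350)
train_svm(x_train, y_train, x_test, y_test, preproc, bs=32)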
def test_fasttext_chinese(self):
    trn, val, preproc = txt.texts_from_csv(
        "./text_data/chinese_hotel_reviews.csv",
        "content",
        label_columns=["pos", "neg"],
        max_features=30000,
        maxlen=75,
        preprocess_mode="standard",
        sep="|",
    )
    model = txt.text_classifier("fasttext", train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)

    # test training results
    self.assertAlmostEqual(max(hist.history["lr"]), lr)
    self.assertGreater(max(hist.history[VAL_ACC_NAME]), 0.85)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    learner.save_model("/tmp/test_model")
    learner.load_model("/tmp/test_model")

    # test validate
    cm = learner.validate(class_names=preproc.get_classes())
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], "pos")
    p.save("/tmp/test_predictor")
    p = ktrain.load_predictor("/tmp/test_predictor")
    self.assertEqual(p.predict(TEST_DOC), "pos")
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_nbsvm(self):
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode='standard',
        maxlen=700,
        max_features=35000,
        ngram_range=3)
    model = txt.text_classifier('nbsvm', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32)
    lr = 0.01
    hist = learner.fit_onecycle(lr, 10)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.92)
    self.assertAlmostEqual(max(hist.history['momentum']), 0.95)
    self.assertAlmostEqual(min(hist.history['momentum']), 0.85)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(len(learner.get_weight_decay()), 0)
    learner.set_weight_decay(1e-4)
    self.assertEqual(len(learner.get_weight_decay()), 0)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def get_model_learner(train_data, val_data, preproc, name='bert', batch_size=6):
    model = text.text_classifier(name=name, train_data=train_data,
                                 preproc=preproc)
    learner = ktrain.get_learner(model=model,
                                 train_data=train_data,
                                 val_data=val_data,
                                 batch_size=batch_size)
    return model, learner
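# Hedged usage sketch for get_model_learner(), assuming BERT-preprocessed data
# from text.texts_from_df; the dataframe `df` and its column names are
# illustrative assumptions.
from ktrain import text

train_data, val_data, preproc = text.texts_from_df(
    train_df=df,                         # assumed pandas DataFrame
    text_column='text',
    label_columns='label',
    maxlen=350,
    preprocess_mode='bert')
model, learner = get_model_learner(train_data, val_data, preproc,
                                   name='bert', batch_size=6)
learner.fit_onecycle(2e-5, 1)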
def test_fasttext_chinese(self):
    trn, val, preproc = txt.texts_from_csv(
        './text_data/chinese_hotel_reviews.csv',
        'content',
        label_columns=["pos", "neg"],
        max_features=30000,
        maxlen=75,
        preprocess_mode='standard',
        sep='|')
    # pass preproc so the classifier is built against the right preprocessing
    model = txt.text_classifier('fasttext', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, val_data=val,
                                 batch_size=32)
    lr = 5e-3
    hist = learner.autofit(lr, 10)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history['val_acc']), 0.85)

    # test top losses
    obs = learner.top_losses(n=1, val_data=None)
    self.assertIn(obs[0][0], list(range(len(val[0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=None)

    # test weight decay
    self.assertEqual(len(learner.get_weight_decay()), 2)
    self.assertEqual(learner.get_weight_decay()[0], None)
    learner.set_weight_decay(1e-4)
    self.assertAlmostEqual(learner.get_weight_decay()[0], 1e-4)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test validate
    cm = learner.validate()
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc)
    self.assertEqual(p.predict([TEST_DOC])[0], 'pos')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor')
    self.assertEqual(p.predict(TEST_DOC), 'pos')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 0)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def test_bert(self):
    trn, val, preproc = txt.texts_from_array(
        x_train=self.trn[0],
        y_train=self.trn[1],
        x_test=self.val[0],
        y_test=self.val[1],
        class_names=self.classes,
        preprocess_mode='bert',
        maxlen=350,
        max_features=35000)
    model = txt.text_classifier('bert', train_data=trn, preproc=preproc)
    learner = ktrain.get_learner(model, train_data=trn, batch_size=6,
                                 eval_batch_size=EVAL_BS)
    lr = 2e-5
    hist = learner.fit_onecycle(lr, 1)

    # test training results
    self.assertAlmostEqual(max(hist.history['lr']), lr)
    self.assertGreater(max(hist.history[ACC_NAME]), 0.7)

    # test top losses
    obs = learner.top_losses(n=1, val_data=val)
    self.assertIn(obs[0][0], list(range(len(val[0][0]))))
    learner.view_top_losses(preproc=preproc, n=1, val_data=val)

    # test weight decay
    self.assertEqual(learner.get_weight_decay(), None)
    learner.set_weight_decay(1e-2)
    self.assertAlmostEqual(learner.get_weight_decay(), 1e-2)

    # test load and save model
    learner.save_model('/tmp/test_model')
    learner.load_model('/tmp/test_model')

    # test validate
    cm = learner.validate(val_data=val)
    print(cm)
    for i, row in enumerate(cm):
        self.assertEqual(np.argmax(row), i)

    # test predictor
    p = ktrain.get_predictor(learner.model, preproc, batch_size=EVAL_BS)
    self.assertEqual(p.predict([TEST_DOC])[0], 'soc.religion.christian')
    p.save('/tmp/test_predictor')
    p = ktrain.load_predictor('/tmp/test_predictor', batch_size=EVAL_BS)
    self.assertEqual(p.predict(TEST_DOC), 'soc.religion.christian')
    self.assertEqual(np.argmax(p.predict_proba([TEST_DOC])[0]), 3)
    self.assertEqual(type(p.explain(TEST_DOC)), IPython.core.display.HTML)
def classify_from_folder():
    DATADIR = './text_data/text_folder'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_folder(
        DATADIR,
        max_features=100,
        maxlen=10,
        ngram_range=3,
        classes=['pos', 'neg'])
    model = txt.text_classifier('nbsvm', (x_train, y_train))
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
def fit(self, train_strings, y_train):
    tf.random.set_random_seed(0)  # TF 1.x API; in TF 2.x use tf.random.set_seed
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        train_strings, y_train,
        class_names=["low", "high"],
        preprocess_mode="bert",
        maxlen=300,
        lang="en")
    self.model = text.text_classifier('bert', (x_train, y_train),
                                      preproc=preproc)
    learner = ktrain.get_learner(self.model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=12)
    self.learner = learner
    learner.fit_onecycle(1e-5, 1)
    learner.plot('loss')
    plt.show()
    self.predictor = ktrain.get_predictor(learner.model, preproc)
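# Hedged usage sketch for the fit() method above, assuming it belongs to a
# small wrapper class; the class name BertBinaryClassifier and the sample
# data are illustrative assumptions.
clf = BertBinaryClassifier()            # hypothetical enclosing class
texts = ["cheap and flimsy", "solid build quality",
         "broke after a week", "works flawlessly"]
labels = ["low", "high", "low", "high"]
clf.fit(texts, labels)
print(clf.predictor.predict("feels premium"))   # -> 'low' or 'high'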
def classify_from_csv():
    DATA_PATH = './text_data/texts.csv'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_csv(
        DATA_PATH,
        'text',
        val_filepath=DATA_PATH,
        label_columns=["pos", "neg"],
        max_features=100,
        maxlen=10,
        ngram_range=3)
    model = txt.text_classifier('nbsvm', (x_train, y_train))
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
def create():
    print("Preparing dataset")
    dataset = pd.read_csv("./drive/My Drive/NLP/EN/dataset.csv", sep=",",
                          encoding='ISO-8859-1')
    dataset.columns = ['id', 'sentiment', 'text']
    dataset = dataset.drop(labels=['id'], axis=1)
    dataset.sentiment = dataset.sentiment.replace([0, 0.5, 1],
                                                  ['neg', 'neu', 'pos'])
    data_train = dataset[(dataset.index > np.percentile(dataset.index, 0)) &
                         (dataset.index <= np.percentile(dataset.index, 50))]
    data_test = dataset[(dataset.index > np.percentile(dataset.index, 81)) &
                        (dataset.index <= np.percentile(dataset.index, 100))]

    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=data_train,
        text_column='text',
        label_columns='sentiment',
        val_df=data_test,
        maxlen=400,
        preprocess_mode='bert',
        verbose=0,
        lang='en')

    print("Creating model")
    model = text.text_classifier(name='bert', train_data=(X_train, y_train),
                                 preproc=preprocess, verbose=0)
    print("Creating learner")
    learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                                 val_data=(X_test, y_test), batch_size=6)
    print("Loading saved model")
    learner.load_model('./drive/My Drive/NLP/EN/model')
    print("Creating predictor")
    return ktrain.get_predictor(learner.model, preprocess)
def train(self, language):
    categories = ['alt.atheism', 'soc.religion.christian',
                  'comp.graphics', 'sci.med']
    train_b = fetch_20newsgroups(subset='train', categories=categories,
                                 shuffle=True, random_state=42)
    test_b = fetch_20newsgroups(subset='test', categories=categories,
                                shuffle=True, random_state=42)
    print('size of training set: %s' % (len(train_b['data'])))
    print('size of validation set: %s' % (len(test_b['data'])))
    print('classes: %s' % (train_b.target_names))
    x_train = train_b.data
    y_train = train_b.target
    x_test = test_b.data
    y_test = test_b.target
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        x_train=x_train, y_train=y_train,
        x_test=x_test, y_test=y_test,
        class_names=train_b.target_names,
        preprocess_mode='bert',
        maxlen=350,
        max_features=35000)
    model = text.text_classifier('bert', train_data=(x_train, y_train),
                                 preproc=preproc)
    learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                                 batch_size=6)
    learner.fit_onecycle(2e-5, 4)
    learner.validate(val_data=(x_test, y_test),
                     class_names=train_b.target_names)
    self.predictor = ktrain.get_predictor(learner.model, preproc)
def build_model(x_train, y_train, x_test, y_test, preproc):
    """
    Builds and initializes model

    :param x_train: preprocessed training dataset features (messages)
    :param y_train: preprocessed training dataset labels
    :param x_test: preprocessed testing dataset features
    :param y_test: preprocessed testing dataset labels
    :param preproc: preprocessor object

    Returns model and learner object
    """
    # instantiate model
    model = text.text_classifier('bert', (x_train, y_train),
                                 preproc=preproc, multilabel=True)

    # wrap model and data in learner object
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test))
    return model, learner
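# Hedged usage sketch for build_model(), assuming the preprocessing step uses
# text.texts_from_df in multilabel mode; `messages_df` and `category_columns`
# are illustrative assumptions.
from ktrain import text

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
    train_df=messages_df,                # assumed DataFrame of messages
    text_column='message',
    label_columns=category_columns,      # assumed list of 0/1 label columns
    maxlen=160,
    preprocess_mode='bert')
model, learner = build_model(x_train, y_train, x_test, y_test, preproc)
learner.fit_onecycle(2e-5, 1)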
def startKBert(x_train, y_train, x_test, y_test, typeList):
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
        x_train=x_train, y_train=y_train,
        x_test=x_test, y_test=y_test,
        class_names=typeList,
        preprocess_mode='bert',
        maxlen=250,
        max_features=40000)
    model = text.text_classifier('bert', train_data=(x_train, y_train),
                                 preproc=preproc)
    learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                                 batch_size=6)
    learner.fit_onecycle(2e-5, 4)
    learner.validate(val_data=(x_test, y_test), class_names=typeList)
    predictor = ktrain.get_predictor(learner.model, preproc)
    predictor.get_classes()
    predictor.save(
        'D:\\Mcgill\\U3 fall\\COMP 551\\p2\\tryBert\\tmp\\my03_ktrain_predictor'
    )
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_array(
    x_train=X_train, y_train=y_train,
    x_test=X_test, y_test=y_test,
    class_names=class_names,
    preprocess_mode='bert',
    maxlen=350,
    max_features=35000)

# ## 2. Training and validation

# Loading the pretrained BERT for text classification
model = text.text_classifier('bert', train_data=(x_train, y_train),
                             preproc=preproc)

# Wrap it in a Learner object
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_test, y_test), batch_size=6)

# Train the model. More about tuning learning rates
# [here](https://github.com/amaiya/ktrain/blob/master/tutorial-02-tuning-learning-rates.ipynb)
learner.fit_onecycle(2e-5, 3)

# Validation
learner.validate(val_data=(x_test, y_test), class_names=class_names)
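# A natural next step after validation (hedged sketch, not part of the
# original cell): wrap the trained model in a Predictor for raw-text
# inference; the save path is an illustrative assumption.
predictor = ktrain.get_predictor(learner.model, preproc)
print(predictor.predict('example document to classify'))
predictor.save('bert_predictor')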
        y_test.append(row[1])  # tail of a preceding read loop (snippet starts mid-block)

with open('clue_types_extra_test_na.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_test.append(row[0])
        y_test.append(row[1])

with open('clue_types_extra_train.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        x_train.append(row[0])
        y_train.append(row[1])

trn, val, preproc = txt.texts_from_array(
    x_train=x_train, y_train=y_train,
    x_test=x_test, y_test=y_test,
    class_names=['0', '1', '2', '3', '4'],
    preprocess_mode='distilbert',
    maxlen=30)
model = txt.text_classifier('distilbert', train_data=trn, preproc=preproc)
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(3e-5, 5)

"""
predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('category')
"""
import ktrain
from ktrain import text

## Load data
trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Ablation_Study_01/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'Left', 'Right'])

## Inspection of available models
text.print_text_classifiers()

## Apply the bigru model
bigru = text.text_classifier("bigru", trn, preproc=preproc)
learner_bigru = ktrain.get_learner(bigru, train_data=trn, val_data=val)

learner_bigru.lr_find(show_plot=True, max_epochs=5)
learner_bigru.lr_estimate()

learner_bigru.fit(learner_bigru.lr_estimate()[1], 5)
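# Hedged follow-up sketch: persisting the trained bigru model for later use;
# the 'bigru_predictor' path and sample text are illustrative assumptions.
predictor_bigru = ktrain.get_predictor(learner_bigru.model, preproc)
predictor_bigru.save('bigru_predictor')
print(predictor_bigru.predict('example news article text'))  # -> Center/Left/Right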
# preprocess_mode handles tokenization, embedding, and transformation of the
# text corpus (here it prepares the data for the BERT model)
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(
    train_df=data_train,
    text_column='Reviews',
    label_columns='Sentiment',
    val_df=data_test,
    maxlen=500,
    preprocess_mode='bert')

## Define Model
# name='bert' means we are using the BERT model
model = text.text_classifier(name='bert', train_data=(X_train, y_train),
                             preproc=preproc)

## Define Learner
# batch size 6 is recommended by the documentation when maxlen is 500
learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                             val_data=(X_test, y_test), batch_size=6)

## Fit Model
# fit is a basic training loop, whereas fit_onecycle uses the
# one cycle policy callback
learner.fit_onecycle(lr=2e-5, epochs=1)
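# Hedged follow-up sketch (not in the original snippet): evaluating the model
# after fit_onecycle and packaging it for inference; the sample review is an
# illustrative assumption.
learner.validate(val_data=(X_test, y_test))        # prints a classification report
predictor = ktrain.get_predictor(learner.model, preproc)
print(predictor.predict('The product was great'))  # -> a Sentiment label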
(x_train, y_train), (x_test, y_test), preproc = \
    text.texts_from_df(
        dataframe,
        text_column='text',
        label_columns=labels,
        maxlen=200,
        max_features=3500,
        preprocess_mode='bert',
        verbose=1
    )
model = text.text_classifier('bert', (x_train, y_train),
                             preproc=preproc,
                             multilabel=True,
                             metrics=['accuracy'],
                             verbose=1)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=8)

model_log_dir = '/home/felix/Desktop/Document_Scanner/text_classifier/logs/english_transformer_model'
tb_callback = TensorBoard(log_dir=model_log_dir, histogram_freq=1,
                          write_graph=True)

learner.lr_find(show_plot=True)
# the original snippet breaks off after "epochs=150,"; passing the TensorBoard
# callback defined above is the likely intended completion (assumed)
learner.autofit(lr=1e-4, epochs=150, callbacks=[tb_callback])
print("DONE NEG TEST") print("DONE PREPARING BERT FOLDER") print("START TRAINING") (x_train_small, y_train_small), (x_test_small, y_test_small), preproc_small = text.texts_from_folder( "BERT_folder", maxlen=199, preprocess_mode='bert', train_test_names=['train', 'test'], classes=['pos', 'neg']) model_small = text.text_classifier('bert', (x_train_small, y_train_small), preproc=preproc_small) learner_small = ktrain.get_learner(model_small, train_data=(x_train_small, y_train_small), val_data=(x_test_small, y_test_small), batch_size=10) learner_small.fit_onecycle(2e-5, 1) print("DONE WITH TRAINING") print("START TO PREDICT") predictor = ktrain.get_predictor(learner_small.model, preproc_small) tweets_test = tweets_txt_test("Datasets/twitter-datasets/test_data.txt")
# Average accuracy
average_accuracy = np.zeros(args.k)

# For each fold
for k in range(args.k):
    # Validation directory
    fold_dir = os.path.join(args.datadir, "k{}".format(k))
    fold_val_dir = os.path.join(fold_dir, "val")

    # Load training and validation data from a folder
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        fold_dir,
        maxlen=512,
        preprocess_mode='bert',
        classes=classes)

    # Load BERT (pass preproc so the classifier matches the preprocessing)
    learner = ktrain.get_learner(
        text.text_classifier('bert', (x_train, y_train), preproc=preproc),
        train_data=(x_train, y_train),
        val_data=(x_test, y_test),
        batch_size=16)

    # Get a good learning rate
    learner.lr_find()

    # Plot
    learner.lr_plot()

    # Train the model
    learner.fit(2e-5, 20, early_stopping=5)
    # learner.fit_onecycle(2e-5, 1)

    # Get the predictor
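    # Hedged continuation (the snippet breaks off here): build the predictor
    # and record this fold's validation accuracy, derived from the confusion
    # matrix that learner.validate() returns.
    predictor = ktrain.get_predictor(learner.model, preproc)
    cm = learner.validate(class_names=classes)
    average_accuracy[k] = np.trace(cm) / np.sum(cm)  # correct / total

# Mean accuracy across the k folds
print("average accuracy: {:.4f}".format(average_accuracy.mean()))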
trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Full_Experiment/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'LeanLeft', 'LeanRight', 'Left', 'Right'])

## Inspection of available classifiers
text.print_text_classifiers()

### Applying the fasttext model (mod_17):
fasttext = text.text_classifier("fasttext", trn, preproc=preproc)
learner_ft = ktrain.get_learner(fasttext, train_data=trn, val_data=val)

learner_ft.lr_find(show_plot=True, max_epochs=5)
learner_ft.lr_estimate()

learner_ft.fit(learner_ft.lr_estimate()[1], 5)
# Since val_loss still decreases, train for 5 more epochs
learner_ft.fit(learner_ft.lr_estimate()[1], 5)
# Since val_loss still decreases, train for 5 more epochs
learner_ft.fit(learner_ft.lr_estimate()[1], 5)
import ktrain
from ktrain import text

print(ktrain.__version__)

DATA_PATH = 'train_data.csv'
# VALID_DATA = 'validation_data.csv'
NUM_WORDS = 50000
MAXLEN = 500

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_csv(
    DATA_PATH,
    'message',
    label_columns=["class"],
    val_filepath=None,  # if None, 10% of data will be used for validation
    max_features=NUM_WORDS,
    maxlen=MAXLEN,
    ngram_range=1)

model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test))
learner.autofit(1e-2)

predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('predictor_fasttext')
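# Hedged usage sketch: restoring the saved fasttext predictor in a new
# session; the sample message is an illustrative assumption.
predictor = ktrain.load_predictor('predictor_fasttext')
print(predictor.get_classes())
print(predictor.predict('free entry to win a prize now'))  # -> a "class" label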