def bertKtrainDataBalancing():
    # Downsample each sentiment class to 2000 rows so the classes are balanced.
    posDataFrame = df_data[df_data.airline_sentiment == "positive"].airline_sentiment
    negDataFrame = df_data[df_data.airline_sentiment == "negative"].airline_sentiment
    neutralDataFrame = df_data[df_data.airline_sentiment == "neutral"].airline_sentiment
    posArray, negArray, neutArray = list(posDataFrame.index), list(negDataFrame.index), list(neutralDataFrame.index)
    # Shuffle all three index lists so the 2000 sampled rows are random
    # (the original only shuffled negArray and left the others commented out).
    random.shuffle(posArray)
    random.shuffle(negArray)
    random.shuffle(neutArray)
    finalDf = pd.concat([df_data.iloc[posArray[:2000]],
                         df_data.iloc[negArray[:2000]],
                         df_data.iloc[neutArray[:2000]]])
    print(finalDf.airline_sentiment.value_counts())
    # 80/20 train/test split over the balanced index list.
    indexList_2 = list(finalDf.index)
    random.shuffle(indexList_2)
    eightList_2 = [indexList_2[i] for i in range(0, len(indexList_2) * 80 // 100)]
    data_train_2 = df_data.iloc[eightList_2]
    twentyList_2 = [indexList_2[i] for i in range(len(indexList_2) * 80 // 100, len(indexList_2))]
    data_test_2 = df_data.iloc[twentyList_2]
    print(data_train_2.shape[0] + data_test_2.shape[0], finalDf.shape)
    print(finalDf.airline_sentiment.value_counts())
    (X_train_2, y_train_2), (X_test_2, y_test_2), preprocess2 = text.texts_from_df(
        data_train_2, 'text', 'airline_sentiment', data_test_2,
        maxlen=50, preprocess_mode='bert')
    # Note: airline_sentiment is single-label; multilabel=True forces sigmoid outputs.
    model2 = text.text_classifier('bert', (X_train_2, y_train_2),
                                  preproc=preprocess2, multilabel=True)
    learner2 = ktrain.get_learner(model2, (X_train_2, y_train_2),
                                  val_data=(X_test_2, y_test_2), batch_size=6)
    learner2.lr_find()
    learner2.lr_plot()  # suggested learning-rate range was roughly 1e-6 to 1e-3
    learner2.fit_onecycle(lr=1e-6, epochs=1)
    predictor2 = ktrain.get_predictor(learner2.model, preprocess2)
    # arr and arr1 are the sample sentences assumed to be defined at module scope
    # (see bertKtrain() below).
    print("Normal Data : ", predictor2.predict(arr))
    print("Clean Data : ", predictor2.predict(arr1))
def bertKtrain():
    global predictor
    import ktrain, random
    from ktrain import text
    import tensorflow as tf
    arr = ["the service is good",
           "The cost is expensive and customer service sucked",
           "the flight was late but prices are ok",
           "service is fine and cost is also fine"]
    # cleanSentence() is a helper defined elsewhere in this module.
    arr1 = [cleanSentence(s) for s in arr]
    # (The original called predictor.predict(arr) here, before the predictor
    # exists; that call fails on a fresh run and has been removed.)
    # 80/20 train/test split over the full dataset.
    indexList = list(df_data.index)
    random.shuffle(indexList)
    eightList = [indexList[i] for i in range(0, len(indexList) * 80 // 100)]
    data_train = df_data.iloc[eightList]
    twentyList = [indexList[i] for i in range(len(indexList) * 80 // 100, len(indexList))]
    data_test = df_data.iloc[twentyList]
    print(data_train.shape[0] + data_test.shape[0], df_data.shape)
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        data_train, 'text', 'airline_sentiment', data_test,
        maxlen=100, preprocess_mode='bert')
    model = text.text_classifier('bert', (X_train, y_train),
                                 preproc=preprocess, multilabel=False)
    learner = ktrain.get_learner(model, (X_train, y_train),
                                 val_data=(X_test, y_test), batch_size=6)
    learner.lr_find()
    learner.lr_plot()
    learner.fit_onecycle(lr=1e-3, epochs=1)  # lr_plot() suggested roughly 1e-6 to 1e-3
    predictor = ktrain.get_predictor(learner.model, preprocess)
    predictor.predict(arr)
    return "Use predictor.predict([]) to predict in future"
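Since bertKtrain() stores the trained predictor in a module-level global, later predictions only need a list of raw strings. A minimal usage sketch (the sentence below is illustrative, not from the original code):

bertKtrain()  # trains BERT once and populates the global `predictor`
predictor.predict(["the crew was friendly but the flight was delayed"])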
def training(train_frame):
    # Shuffle, then hold out 10% of the training frame for validation.
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = train_frame[:train_test_part], train_frame[train_test_part:]

    # text.texts_from_df returns two tuples plus a preprocessor.
    # maxlen=50: documents longer than 50 tokens are truncated.
    # preprocess_mode='bert': preprocess the text for a BERT model.
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )

    # Build the BERT classifier.
    model = text.text_classifier(name='bert', train_data=(X_train, y_train), preproc=preprocess)
    learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                                 val_data=(X_test, y_test), batch_size=32)

    # fit_onecycle uses the 1cycle learning-rate policy callback.
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')

    # Build the predictor and save it to disk.
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
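Once training() has run, the saved artifact can be restored without retraining via ktrain.load_predictor. A minimal sketch, assuming the 'predictor' folder written above:

import ktrain

# Reload the saved predictor and classify new text without retraining.
reloaded = ktrain.load_predictor('predictor')
print(reloaded.predict(["I can't believe how well this turned out!"]))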
def _dataset(dataset=dataset):
    df = pd.read_csv(dataset)
    if "cleaned" not in df.columns:
        raise ValueError("dataset must contain a 'cleaned' text column")
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
        df,
        "cleaned",  # name of the column containing the review text
        label_columns=["complaint", "not_complaint"],
        maxlen=75,
        max_features=100000,
        preprocess_mode="standard",
        val_pct=0.1,
        ngram_range=1,
    )
    return x_train, y_train, x_test, y_test, preproc
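Because this loader uses preprocess_mode="standard", its outputs can feed ktrain's conventional (non-transformer) classifiers such as 'fasttext'. A minimal sketch under that assumption; the batch size and learning rate here are illustrative:

import ktrain
from ktrain import text

x_train, y_train, x_test, y_test, preproc = _dataset()
# 'fasttext' is one of the standard-mode models text_classifier() supports.
model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_test, y_test), batch_size=32)
learner.autofit(1e-2)  # triangular policy with early stopping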
def texts_from_df(train_df, val_df, text_column, label_columns,
                  maxlen=400, preprocess_mode='bert'):
    # Thin wrapper around ktrain's text.texts_from_df; all arguments are
    # passed through by keyword, so the reordered signature is harmless.
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column=text_column,
        label_columns=label_columns,
        val_df=val_df,
        maxlen=maxlen,
        preprocess_mode=preprocess_mode)
    return (X_train, y_train), (X_test, y_test), preprocess
def preprocess_data(self):
    """Preprocess the split data according to the specified model and parameters."""
    # BERT-family models need their own preprocessing mode; everything else uses 'standard'.
    if self.model_name not in ['bert', 'distilbert']:
        preprocess_mode = 'standard'
    else:
        preprocess_mode = self.model_name
    self.train_preprocessed, self.test_preprocessed, self.preprocessing = text.texts_from_df(
        self.data_train,
        'content',
        label_columns=['buy', 'sell', 'do_nothing'],
        val_df=self.data_validation,
        maxlen=self.max_len,
        preprocess_mode=preprocess_mode,
        lang='en')
def create():
    print("Preparing dataset")
    dataset = pd.read_csv("./drive/My Drive/NLP/EN/dataset.csv", sep=",", encoding='ISO-8859-1')
    dataset.columns = ['id', 'sentiment', 'text']
    dataset = dataset.drop(labels=['id'], axis=1)
    dataset.sentiment = dataset.sentiment.replace([0, 0.5, 1], ['neg', 'neu', 'pos'])
    # Split by index percentile: first 50% for training, top 19% for validation.
    data_train = dataset[(dataset.index > np.percentile(dataset.index, 0)) &
                         (dataset.index <= np.percentile(dataset.index, 50))]
    data_test = dataset[(dataset.index > np.percentile(dataset.index, 81)) &
                        (dataset.index <= np.percentile(dataset.index, 100))]
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=data_train,
        text_column='text',
        label_columns='sentiment',
        val_df=data_test,
        maxlen=400,
        preprocess_mode='bert',
        verbose=0,
        lang='en')
    print("Creating model")
    model = text.text_classifier(name='bert', train_data=(X_train, y_train),
                                 preproc=preprocess, verbose=0)
    print("Creating learner")
    learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                                 val_data=(X_test, y_test), batch_size=6)
    print("Loading saved model")
    learner.load_model('./drive/My Drive/NLP/EN/model')
    print("Creating predictor")
    return ktrain.get_predictor(learner.model, preprocess)
def preprocess_data(train, val, label_cols, args):
    """
    Tokenizes and pads text (the 'message' column of a pandas dataframe)
    using the BERT tokenizer.

    Note: padding transforms lists of integers (tokenized inputs) into a 2D
    np array of shape (number of samples, max_len), padded with 0.

    :param train: training dataframe
    :param val: validation dataframe
    :param label_cols: name(s) of the label column(s)
    :param args: classifier arguments dictionary

    Returns arrays of token IDs (=0 for padded text) for x_train, y_train,
    x_test, and y_test, plus the BERT preprocessor object from ktrain.
    """
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
        train,                           # training dataframe
        'message',                       # text column
        label_cols,                      # label column(s)
        val,                             # validation dataframe
        max_features=args['NUM_WORDS'],  # max vocabulary size
        maxlen=args['MAX_LEN'],          # max sequence length
        ngram_range=args['NGRAM'],       # n-gram range
        preprocess_mode='bert')          # preprocess for BERT
    return x_train, y_train, x_test, y_test, preproc
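A sketch of how this function might be invoked; the dataframes, label column, and argument values below are illustrative assumptions, not from the original code:

# Hypothetical argument dictionary matching the keys the function reads.
args = {'NUM_WORDS': 35000, 'MAX_LEN': 128, 'NGRAM': 1}
# train_df and val_df are placeholder dataframes with 'message' and 'label' columns.
x_train, y_train, x_test, y_test, preproc = preprocess_data(train_df, val_df, 'label', args)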
import pandas as pd
import numpy as np
import ktrain
from ktrain import text
import tensorflow as tf

data_test = pd.read_excel('/flaskapi/IMDB-Movie-Reviews-Large-Dataset-50k/test.xlsx', dtype=str)
data_train = pd.read_excel('/flaskapi/IMDB-Movie-Reviews-Large-Dataset-50k/train.xlsx', dtype=str)

(train, val, preproc) = text.texts_from_df(train_df=data_train,
                                           text_column='Reviews',
                                           label_columns='Sentiment',
                                           val_df=data_test,
                                           maxlen=400,
                                           preprocess_mode='distilbert')

model = text.text_classifier(name='distilbert', train_data=train, preproc=preproc)
learner = ktrain.get_learner(model=model, train_data=train, val_data=val, batch_size=6)
learner.fit_onecycle(lr=2e-5, epochs=2)

predictor = ktrain.get_predictor(learner.model, preproc)
predictor.save('/flaskapi/distilbert')

data = ['this movie was really bad. acting was also bad. I will not watch again',
        'the movie was really great. I will see it again',
        'another great movie. must watch to everyone']
predictor.predict(data)
# TensorBoard comes from tf.keras; ktrain passes callbacks straight through to Keras.
from tensorflow.keras.callbacks import TensorBoard


def train_classifier(dataframe, labels, maxlen, max_features, batch_size,
                     epochs, model_save_dir, model_log_dir):
    """Train the multilabel classifier.

    Parameters
    ----------
    dataframe : DataFrame
        the dataframe generated from the data folder
    labels : list of str
        the extracted label column names
    maxlen : int
        each document is truncated to at most maxlen words
    max_features : int
        max number of words to consider in the vocabulary
    batch_size : int
        number of samples per training batch
    epochs : int
        number of training iterations
    model_save_dir : str
        path to save the model
    model_log_dir : str
        path for the logs tracked for TensorBoard
    """
    (x_train, y_train), (x_test, y_test), preproc = \
        text.texts_from_df(
            dataframe,
            text_column='text',
            label_columns=labels,
            maxlen=maxlen,
            max_features=max_features,
            preprocess_mode='bert',
            verbose=1
        )
    model = text.text_classifier('bert', (x_train, y_train),
                                 preproc=preproc,
                                 multilabel=True,
                                 metrics=['accuracy'],
                                 verbose=1)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batch_size)
    tb_callback = TensorBoard(log_dir=model_log_dir, histogram_freq=1, write_graph=True)
    learner.lr_find(show_plot=True)
    learner.autofit(lr=1e-4, epochs=epochs,
                    early_stopping=5,
                    reduce_on_plateau=3,
                    reduce_factor=0.95,
                    monitor='val_loss',
                    callbacks=[tb_callback],
                    verbose=1)
    predictor = ktrain.get_predictor(model=learner.model, preproc=preproc,
                                     batch_size=batch_size)
    predictor.save(model_save_dir)
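A sketch of how train_classifier might be invoked; the dataframe, label columns, paths, and hyperparameters below are illustrative assumptions:

# Hypothetical call; `df` must have a 'text' column plus one 0/1 column per label.
label_cols = ['toxic', 'obscene', 'insult']  # assumed label columns
train_classifier(df, label_cols,
                 maxlen=200, max_features=20000,
                 batch_size=8, epochs=10,
                 model_save_dir='models/bert_multilabel',
                 model_log_dir='logs/bert_multilabel')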
import tensorflow as tf
import pandas as pd
import numpy as np
import ktrain
from ktrain import text

# Load the train and test datasets.
df = pd.read_csv("Train.csv")
data_test = pd.read_csv("test.csv")

(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=df,
                                                                   text_column='tweet',
                                                                   label_columns='label',
                                                                   val_df=data_test,
                                                                   maxlen=500,
                                                                   preprocess_mode='bert')
model = text.text_classifier(name='bert', train_data=(X_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                             val_data=(X_test, y_test), batch_size=6)
learner.fit_onecycle(lr=2e-5, epochs=1)
predictor = ktrain.get_predictor(learner.model, preproc)
data_train.tail()

# Printing the head rows of the test dataset.
data_test.head()

## Train-Test Split
# text.texts_from_df returns two tuples plus a preprocessor.
# maxlen=500: only the first 500 words are considered; the rest are truncated.
# preprocess_mode='bert': tokenize, embed, and transform the text corpus for a BERT model.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=data_train,
                                                                   text_column='Reviews',
                                                                   label_columns='Sentiment',
                                                                   val_df=data_test,
                                                                   maxlen=500,
                                                                   preprocess_mode='bert')

## Define Model
# name='bert' means we are using the BERT model.
model = text.text_classifier(name='bert', train_data=(X_train, y_train), preproc=preproc)

## Define Learner
# A batch size of 6 is used, as the documentation recommends it with maxlen=500;
# see the completion sketch below.
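The snippet ends before the learner is actually created; a minimal completion following ktrain's usual pattern (the batch size of 6 comes from the comment above):

learner = ktrain.get_learner(model=model,
                             train_data=(X_train, y_train),
                             val_data=(X_test, y_test),
                             batch_size=6)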
from Dataset import Data

data = Data(language='en', creating_parquet=False)  # use 'de' for German
classes = data.get_num_classes()
data.preprocess_labels()
dataframe = data.dataframe

# Every column except the raw text and the original label holds a target class.
labels = list(dataframe.columns.values)
labels = [label for label in labels if label not in ['text', 'label']]

(x_train, y_train), (x_test, y_test), preproc = \
    text.texts_from_df(
        dataframe,
        text_column='text',
        label_columns=labels,
        maxlen=200,
        max_features=3500,
        preprocess_mode='bert',
        verbose=1
    )
model = text.text_classifier('bert', (x_train, y_train),
                             preproc=preproc,
                             multilabel=True,
                             metrics=['accuracy'],
                             verbose=1)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=8)
import ktrain.text as text
import ktrain
import pandas as pd

preprocessing = None
learner = None
model = None

# Repeat the train/evaluate cycle for runs 8 and 9.
for index_run in range(8, 10):
    # Renamed from `input` to avoid shadowing the builtin.
    input_df = pd.read_csv('../data/complete_ktrain.csv')
    (x_train, y_train), (x_val, y_val), preprocessing = text.texts_from_df(
        train_df=input_df,
        text_column='sentence',
        label_columns=[
            'joy', 'trust', 'fear', 'surprise', 'sadness',
            'disgust', 'anger', 'anticipation', 'neutral'
        ],
        preprocess_mode='bert',
        val_pct=0.2,
        max_features=1000,
        maxlen=75)
    model = text.text_classifier(name='bert',
                                 train_data=(x_train, y_train),
                                 preproc=preprocessing)
    learner = ktrain.get_learner(model=model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_val, y_val),
                                 batch_size=16)
- **DistilBERT**: distilbert-base-uncased, distilbert-base-multilingual-cased, distilbert-base-german-cased, and others
- **ALBERT**: albert-base-v2, albert-large-v2, and others
- **RoBERTa**: roberta-base, roberta-large, roberta-large-mnli
- **XLM**: xlm-mlm-xnli15-1024, xlm-mlm-100-1280, and others
- **XLNet**: xlnet-base-cased, xlnet-large-cased

# text.texts_from_df returns two tuples plus a preprocessor.
# maxlen=250: only the first 250 words are considered; the rest are truncated.
# preprocess_mode='bert': tokenize, embed, and transform the text corpus for a BERT model.
(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=data_train,
                                                                   text_column='Reviews',
                                                                   label_columns='Sentiment',
                                                                   val_df=data_test,
                                                                   maxlen=250,
                                                                   lang='zh-*',
                                                                   preprocess_mode='bert')  # or 'distilbert'

## size of data
print(X_train[0].shape, y_train.shape)
print(X_test[0].shape, y_test.shape)

## Define Model
# Use 'distilbert' here if you prefer the smaller model.
model = text.text_classifier(name='bert',  # or 'distilbert'
                             train_data=(X_train, y_train),
                             preproc=preproc)
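To switch the whole pipeline to DistilBERT, as the comments suggest, the preprocessing mode and the classifier name must change together. A minimal sketch following the DistilBERT example elsewhere in this section (in that mode the loader returns dataset objects rather than (X, y) tuples):

# DistilBERT variant: preprocess_mode and model name must match.
(trn, val, preproc) = text.texts_from_df(train_df=data_train,
                                         text_column='Reviews',
                                         label_columns='Sentiment',
                                         val_df=data_test,
                                         maxlen=250,
                                         preprocess_mode='distilbert')
model = text.text_classifier(name='distilbert', train_data=trn, preproc=preproc)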
from sklearn.model_selection import train_test_split

print("Preparing dataset")
dataset = pd.read_csv("./data/imdb-reviews-pt-br.csv")
# Keep the Portuguese text and rename it to 'text'; drop the English translation.
X = dataset.drop('text_en', axis=1).rename(columns={"text_pt": "text"})
y = dataset.sentiment
data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

(X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(train_df=data_train,
                                                                      text_column='text',
                                                                      label_columns='sentiment',
                                                                      val_df=data_test,
                                                                      maxlen=400,
                                                                      preprocess_mode='bert',
                                                                      verbose=0,
                                                                      lang='pt')
print("Creating model")
model = text.text_classifier(name='bert', train_data=(X_train, y_train),
                             preproc=preprocess, verbose=0)
print("Creating learner")
learner = ktrain.get_learner(model=model, train_data=(X_train, y_train),
                             val_data=(X_test, y_test), batch_size=6)
# Notebook-style inspection of the held-out list.
test_list

# Build the validation set from the four statement lists and shuffle it.
validation_list = corr_statement + incorr_statement + right_reason1 + right_reason2
random.shuffle(validation_list)
validation_list

column_labels = ['Sentence', 'Labels']
train_data_df = pd.DataFrame(data_list, columns=column_labels)
test_data_df = pd.DataFrame(test_list, columns=column_labels)
dev_data_df = pd.DataFrame(validation_list, columns=column_labels)
train_data_df.head()

(X_train, Y_train), (X_test, Y_test), preprocess = text.texts_from_df(train_df=train_data_df,
                                                                      text_column='Sentence',
                                                                      label_columns='Labels',
                                                                      val_df=dev_data_df,
                                                                      maxlen=400,
                                                                      preprocess_mode='bert')
X_train[0].shape

model = text.text_classifier(name='bert', train_data=(X_train, Y_train), preproc=preprocess)
learner = ktrain.get_learner(model=model, train_data=(X_train, Y_train),
                             val_data=(X_test, Y_test), batch_size=6)
learner.fit_onecycle(lr=2e-5, epochs=1)