Example #1
def bertKtrainDataBalancing():
	# Balance the three sentiment classes by downsampling each to 2,000 rows,
	# then train a BERT classifier on an 80/20 split of the balanced data.
	posDataFrame = df_data[df_data.airline_sentiment=="positive"].airline_sentiment
	negDataFrame = df_data[df_data.airline_sentiment=="negative"].airline_sentiment
	neutralDataFrame = df_data[df_data.airline_sentiment=="neutral"].airline_sentiment
	posArray,negArray,neutArray = list(posDataFrame.index),list(negDataFrame.index),list(neutralDataFrame.index)
	random.shuffle(posArray)
	random.shuffle(negArray)
	random.shuffle(neutArray)
	finalDf = pd.concat([df_data.iloc[posArray[:2000]],df_data.iloc[negArray[:2000]],df_data.iloc[neutArray[:2000]]])
	print(finalDf.airline_sentiment.value_counts())
	indexList_2 = list(finalDf.index)
	random.shuffle(indexList_2)
	eightList_2 = [indexList_2[i] for i in range(0,len(indexList_2)*80//100)]
	data_train_2 = df_data.iloc[eightList_2]
	twentyList_2 = [indexList_2[i] for i in range(len(indexList_2)*80//100,len(indexList_2))]
	data_test_2 = df_data.iloc[twentyList_2]
	print(data_train_2.shape[0]+data_test_2.shape[0],finalDf.shape)
	(X_train_2,y_train_2), (X_test_2,y_test_2), preprocess2 = text.texts_from_df(data_train_2,'text','airline_sentiment',data_test_2,maxlen=50,preprocess_mode='bert')
	# the three sentiment classes are mutually exclusive, so this is a single-label problem
	model2 = text.text_classifier('bert',(X_train_2,y_train_2), preproc=preprocess2, multilabel=False)
	learner2 = ktrain.get_learner(model2,(X_train_2,y_train_2),val_data=(X_test_2,y_test_2),batch_size=6)
	learner2.lr_find()
	learner2.lr_plot()  # the plot suggests a usable learning-rate range of roughly 1e-6 to 1e-3
	learner2.fit_onecycle(lr=1e-6,epochs=1)
	predictor2 = ktrain.get_predictor(learner2.model,preprocess2)
	# arr and arr1 are the raw and cleaned sample sentences defined in bertKtrain() below
	print("Normal Data : ",predictor2.predict(arr))
	print("Clean Data : ",predictor2.predict(arr1))
Example #2
def bertKtrain():
	global predictor
	import ktrain,random
	from ktrain import text
	import tensorflow as tf
	arr = ["the service is good", "The cost is expensive and customer service sucked","the flight was late but prices are ok","service is fine and cost is also fine"]
	# cleanSentence is assumed to be defined elsewhere; the loop variable is named
	# `s` so the ktrain.text module imported above is not shadowed
	arr1 = [cleanSentence(s) for s in arr]

	indexList = list(df_data.index)
	random.shuffle(indexList)
	eightList = [indexList[i] for i in range(0,len(indexList)*80//100)]
	data_train = df_data.iloc[eightList]
	twentyList = [indexList[i] for i in range(len(indexList)*80//100,len(indexList))]
	data_test = df_data.iloc[twentyList]
	print(data_train.shape[0]+data_test.shape[0],df_data.shape)
	(X_train,y_train), (X_test,y_test), preprocess = text.texts_from_df(data_train,'text','airline_sentiment',data_test,maxlen=100,preprocess_mode='bert')
	model = text.text_classifier('bert',(X_train,y_train), preproc= preprocess,multilabel=False)
	learner = ktrain.get_learner(model,(X_train,y_train),val_data=(X_test,y_test),batch_size=6)
	learner.lr_find()
	learner.lr_plot()
	learner.fit_onecycle(lr=1e-3,epochs=1)  # choose the learning rate from the lr_plot above (roughly 1e-6 to 1e-3)
	predictor = ktrain.get_predictor(learner.model,preprocess)
	print(predictor.predict(arr))
	return "Use predictor.predict([]) to predict in future"
Example #3
def training(train_frame):
    train_frame = train_frame.sample(frac=1)
    train_test_part = int(len(train_frame) * 0.9)
    train_df, self_test_df = train_frame[:train_test_part], train_frame[
        train_test_part:]

    # text.texts_from_df returns two tuples (train and validation) plus a preprocessor
    # maxlen=50: documents longer than 50 tokens are truncated
    # preprocess_mode='bert': preprocess the text for the BERT model
    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column='text',
        label_columns='emotion',
        val_df=self_test_df,
        maxlen=50,
        preprocess_mode='bert',
    )
    # using BERT model
    model = text.text_classifier(name='bert',
                                 train_data=(X_train, y_train),
                                 preproc=preprocess)
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=32)

    # fit one cycle uses the one cycle policy callback
    learner.fit_onecycle(lr=3e-5, epochs=2, checkpoint_folder='checkpoint')

    # get predictor and save
    predictor = ktrain.get_predictor(learner.model, preproc=preprocess)
    predictor.save('predictor')
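The saved predictor can later be restored without retraining; a minimal sketch (the input text is illustrative):

predictor = ktrain.load_predictor('predictor')
print(predictor.predict(['I am thrilled with how this turned out']))  # returns an emotion label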
Example #4
def _dataset(dataset=dataset):  # the default comes from a module-level `dataset` path
    df = pd.read_csv(dataset)
    if "cleaned" not in df.columns:
        raise ValueError("dataset must contain a 'cleaned' text column")
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
        df,
        "cleaned",  # name of column containing review text
        label_columns=["complaint", "not_complaint"],
        maxlen=75,
        max_features=100000,
        preprocess_mode="standard",
        val_pct=0.1,
        ngram_range=1,
    )
    return x_train, y_train, x_test, y_test, preproc
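Because this uses preprocess_mode="standard", the returned arrays can feed one of ktrain's non-BERT text classifiers; a sketch of a possible continuation (the 'fasttext' model choice is an assumption, not from the original):

x_train, y_train, x_test, y_test, preproc = _dataset()
model = text.text_classifier('fasttext', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(x_train, y_train),
                             val_data=(x_test, y_test), batch_size=32)
learner.autofit(1e-2)  # triangular learning-rate policy; trains until validation loss stops improving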
Example #5
def texts_from_df(train_df,
                  val_df,
                  text_column,
                  label_columns,
                  maxlen=400,
                  preprocess_mode='bert'):

    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=train_df,
        text_column=text_column,
        label_columns=label_columns,
        val_df=val_df,
        maxlen=maxlen,
        preprocess_mode=preprocess_mode)
    return (X_train, y_train), (X_test, y_test), preprocess
Example #6
    def preprocess_data(self):
        """This method preprocesses the split data, according to the specified model and parameters."""

        if self.model_name not in ['bert', 'distilbert']:
            preprocess_mode = 'standard'
        else:
            preprocess_mode = self.model_name

        self.train_preprocessed, self.test_preprocessed, self.preprocessing = text.texts_from_df(
            self.data_train,
            'content',
            label_columns=['buy', 'sell', 'do_nothing'],
            val_df=self.data_validation,
            maxlen=self.max_len,
            preprocess_mode=preprocess_mode,
            lang='en')
Example #7
def create():
    print("Preparing dataset")

    dataset = pd.read_csv("./drive/My Drive/NLP/EN/dataset.csv",
                          sep=",",
                          encoding='ISO-8859-1')
    dataset.columns = ['id', 'sentiment', 'text']
    dataset = dataset.drop(labels=['id'], axis=1)

    dataset.sentiment = dataset.sentiment.replace([0, 0.5, 1],
                                                  ['neg', 'neu', 'pos'])

    # roughly the first half of the rows for training, the last 19% for testing
    data_train = dataset[(dataset.index > np.percentile(dataset.index, 0))
                         & (dataset.index <= np.percentile(dataset.index, 50))]
    data_test = dataset[(dataset.index > np.percentile(dataset.index, 81))
                        & (dataset.index <= np.percentile(dataset.index, 100))]

    (X_train, y_train), (X_test, y_test), preprocess = text.texts_from_df(
        train_df=data_train,
        text_column='text',
        label_columns='sentiment',
        val_df=data_test,
        maxlen=400,
        preprocess_mode='bert',
        verbose=0,
        lang='en')

    print("Creating model")
    model = text.text_classifier(name='bert',
                                 train_data=(X_train, y_train),
                                 preproc=preprocess,
                                 verbose=0)

    print("Creating learner")
    learner = ktrain.get_learner(model=model,
                                 train_data=(X_train, y_train),
                                 val_data=(X_test, y_test),
                                 batch_size=6)

    print("Loading saved model")
    learner.load_model('./drive/My Drive/NLP/EN/model')

    print("Creating predictor")
    return ktrain.get_predictor(learner.model, preprocess)
Example #8
def preprocess_data(train, val, label_cols, args):
    """
    Tokenizes and pads text (the 'message' column of a pandas dataframe) using the BERT tokenizer.
        note: padding transforms lists of integers (tokenized inputs) into a 2D np array
              of shape (number of samples, max_len), padded with 0
    The training and validation dataframes are supplied pre-split.

    :param train: training dataframe to tokenize and pad
    :param val: validation dataframe
    :param label_cols: name(s) of the label column(s)
    :param args: classifier arguments dictionary

    Returns arrays of token IDs (0 for padded positions) for x_train, y_train, x_test, and y_test,
            and the BERT preprocessor object from ktrain
    """
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_df(
        train,  # training
        'message',  # text col                                            
        label_cols,  # label col
        val,  # validation
        max_features=args['NUM_WORDS'],  # max features
        maxlen=args['MAX_LEN'],  # max len
        ngram_range=args['NGRAM'],  # n_gram 
        preprocess_mode='bert')  # model

    return x_train, y_train, x_test, y_test, preproc
Example #9
import pandas as pd
import numpy as np
import ktrain
from ktrain import text
import tensorflow as tf

data_test = pd.read_excel('/flaskapi/IMDB-Movie-Reviews-Large-Dataset-50k/test.xlsx', dtype=str)
data_train = pd.read_excel('/flaskapi/IMDB-Movie-Reviews-Large-Dataset-50k/train.xlsx', dtype=str)

(train, val, preproc) = text.texts_from_df(train_df=data_train, text_column='Reviews', label_columns='Sentiment',
                   val_df = data_test,
                   maxlen = 400,
                   preprocess_mode = 'distilbert')

model = text.text_classifier(name = 'distilbert', train_data = train, preproc=preproc)
learner = ktrain.get_learner(model = model,
                             train_data = train,
                             val_data = val,
                             batch_size = 6)

learner.fit_onecycle(lr = 2e-5, epochs=2)
predictor = ktrain.get_predictor(learner.model, preproc)

predictor.save('/flaskapi/distilbert')

data = ['this movie was really bad. acting was also bad. I will not watch again',
        'the movie was really great. I will see it again', 'another great movie. must watch to everyone']

predictor.predict(data)
Example #10
from tensorflow.keras.callbacks import TensorBoard  # required by the callback below

def train_classifier(dataframe, labels, maxlen, max_features, batch_size,
                     epochs, model_save_dir, model_log_dir):
    """train the multilabel classifier

    Parameters
    ----------
    dataframe : dataframe
        the generated dataframe from data folder
    labels : list of str
        the extracted label column names
    maxlen : int
        each document is truncated to at most maxlen words
    max_features : int
        max number of words to consider in the vocabulary
    batch_size : int
        number of samples per training batch
    epochs : int
        number of training epochs
    model_save_dir : str
        path to save the model
    model_log_dir : str
        path to the tracked logs for tensorboard usage
    """
    (x_train, y_train), (x_test, y_test), preproc =\
        text.texts_from_df(
                            dataframe,
                            text_column='text',
                            label_columns=labels,
                            maxlen=maxlen,
                            max_features=max_features,
                            preprocess_mode='bert',
                            verbose=1
                            )

    model = text.text_classifier('bert', (x_train, y_train),
                                 preproc=preproc,
                                 multilabel=True,
                                 metrics=['accuracy'],
                                 verbose=1)
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batch_size)

    tb_callback = TensorBoard(log_dir=model_log_dir,
                              histogram_freq=1,
                              write_graph=True)

    learner.lr_find(show_plot=True)
    learner.autofit(lr=1e-4,
                    epochs=epochs,
                    early_stopping=5,
                    reduce_on_plateau=3,
                    reduce_factor=0.95,
                    monitor='val_loss',
                    callbacks=[tb_callback],
                    verbose=1)

    predictor = ktrain.get_predictor(model=learner.model,
                                     preproc=preproc,
                                     batch_size=batch_size)
    predictor.save(model_save_dir)
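The TensorBoard logs written under model_log_dir can then be inspected with the standard CLI:

tensorboard --logdir <model_log_dir>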
Example #11
import pandas as pd
import numpy as np
import ktrain
from ktrain import text
import tensorflow as tf

# load the train and test datasets
df = pd.read_csv("Train.csv")
data_test = pd.read_csv("test.csv")

(X_train,
 y_train), (X_test,
            y_test), preproc = text.texts_from_df(train_df=df,
                                                  text_column='tweet',
                                                  label_columns='label',
                                                  val_df=data_test,
                                                  maxlen=500,
                                                  preprocess_mode='bert')

model = text.text_classifier(name='bert',
                             train_data=(X_train, y_train),
                             preproc=preproc)

learner = ktrain.get_learner(model=model,
                             train_data=(X_train, y_train),
                             val_data=(X_test, y_test),
                             batch_size=6)

learner.fit_onecycle(lr=2e-5, epochs=1)

predictor = ktrain.get_predictor(learner.model, preproc)
Example #12
# print the last rows of the train dataset
data_train.tail()

# print the first rows of the test dataset
data_test.head()

## Train-Test Split

# text.texts_from_df returns two tuples (train and validation) plus a preprocessor
# maxlen: only the first maxlen words of each document are kept; the rest are truncated
# preprocess_mode: tokenizing, embedding, and transforming the text corpus (here for the BERT model)


(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=data_train,
                                                                   text_column = 'Reviews',
                                                                   label_columns = 'Sentiment',
                                                                   val_df = data_test,
                                                                   maxlen = 500,
                                                                   preprocess_mode = 'bert')

## Define Model

# name = "bert" means, here we are using BERT model.

model = text.text_classifier(name = 'bert',
                             train_data = (X_train, y_train),
                             preproc = preproc)

## Define Learner

# batch size 6 is used here because the documentation recommends it when maxlen is 500
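# The learner definition itself is missing from this snippet; a minimal
# reconstruction following the surrounding examples would be:
learner = ktrain.get_learner(model=model,
                             train_data=(X_train, y_train),
                             val_data=(X_test, y_test),
                             batch_size=6)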
Example #13
from Dataset import Data

data = Data(language='en', creating_parquet=False)  # use 'de' for German
classes = data.get_num_classes()
data.preprocess_labels()
dataframe = data.dataframe
labels = list(dataframe.columns.values)
labels = [label for label in labels if label not in ['text', 'label']]


(x_train, y_train), (x_test, y_test), preproc =\
     text.texts_from_df(
                        dataframe,
                        text_column='text',
                        label_columns=labels,
                        maxlen=200,
                        max_features=3500,
                        preprocess_mode='bert',
                        verbose=1
                        )

model = text.text_classifier('bert', (x_train, y_train),
                             preproc=preproc,
                             multilabel=True,
                             metrics=['accuracy'],
                             verbose=1)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=8)
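This snippet stops at the learner; a plausible next step, mirroring the train_classifier example above (the schedule is an assumption):

learner.autofit(1e-4, early_stopping=5)  # illustrative learning rate and stopping patience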
Example #14
import ktrain.text as text
import ktrain
import pandas as pd

preprocessing = None
learner = None
model = None
for index_run in range(8, 10):
    input_df = pd.read_csv('../data/complete_ktrain.csv')  # renamed to avoid shadowing the built-in input()

    (x_train, y_train), (x_val, y_val), preprocessing = text.texts_from_df(
        train_df=input_df,
        text_column='sentence',
        label_columns=[
            'joy', 'trust', 'fear', 'surprise', 'sadness', 'disgust', 'anger',
            'anticipation', 'neutral'
        ],
        preprocess_mode='bert',
        val_pct=0.2,
        max_features=1000,
        maxlen=75)

    model = text.text_classifier(name='bert',
                                 train_data=(x_train, y_train),
                                 preproc=preprocessing)

    learner = ktrain.get_learner(model=model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_val, y_val),
                                 batch_size=16)
Example #15
- **DistilBERT**: distilbert-base-uncased, distilbert-base-multilingual-cased, distilbert-base-german-cased, and others
- **ALBERT**: albert-base-v2, albert-large-v2, and others
- **RoBERTa**: roberta-base, roberta-large, roberta-large-mnli
- **XLM**: xlm-mlm-xnli15-1024, xlm-mlm-100-1280, and others
- **XLNet**: xlnet-base-cased, xlnet-large-cased
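These checkpoint names are used with ktrain's Transformer API rather than the preprocess_mode='bert' path shown below; a minimal sketch (the model name, data variables, and class names are illustrative):

import ktrain
from ktrain import text

t = text.Transformer('distilbert-base-uncased', maxlen=250,
                     class_names=['neg', 'pos'])
trn = t.preprocess_train(x_texts, y_labels)        # lists of raw texts and labels
val = t.preprocess_test(x_val_texts, y_val_labels)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)
learner.fit_onecycle(3e-5, 1)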


# text.texts_from_df returns two tuples (train and validation) plus a preprocessor
# maxlen: only the first maxlen words of each document are kept; the rest are truncated
# preprocess_mode: tokenizing, embedding, and transforming the text corpus (here for the BERT model)


(X_train, y_train), (X_test, y_test), preproc = text.texts_from_df(train_df=data_train,
                                                                   text_column = 'Reviews',
                                                                   label_columns = 'Sentiment',
                                                                   val_df = data_test,
                                                                   maxlen = 250,
                                                                   lang = 'zh-*',
                                                                   preprocess_mode = 'bert') # or distilbert

## size of data
print(X_train[0].shape, y_train.shape)
print(X_test[0].shape, y_test.shape)

## Define Model

## use 'distilbert' if you want
model = text.text_classifier(name = 'bert', # or distilbert
                             train_data = (X_train, y_train),
                             preproc = preproc)
Example #16
from sklearn.model_selection import train_test_split

print("Preparing dataset")
dataset = pd.read_csv("./data/imdb-reviews-pt-br.csv")

X = dataset.drop('text_en', axis=1).rename(columns={"text_pt": "text"})
y = dataset.sentiment

# y_train/y_test from this split are placeholders; texts_from_df rebuilds them from the dataframes below
data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

(X_train,
 y_train), (X_test,
            y_test), preprocess = text.texts_from_df(train_df=data_train,
                                                     text_column='text',
                                                     label_columns='sentiment',
                                                     val_df=data_test,
                                                     maxlen=400,
                                                     preprocess_mode='bert',
                                                     verbose=0,
                                                     lang='pt')

print("Creating model")
model = text.text_classifier(name='bert',
                             train_data=(X_train, y_train),
                             preproc=preprocess,
                             verbose=0)

print("Creating learner")
learner = ktrain.get_learner(model=model,
                             train_data=(X_train, y_train),
                             val_data=(X_test, y_test),
                             batch_size=6)
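This snippet stops at the learner; training would proceed as in the other examples, e.g.:

learner.fit_onecycle(lr=2e-5, epochs=1)
predictor = ktrain.get_predictor(learner.model, preprocess)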
Example #17
test_list  # inspect the assembled test examples

validation_list = corr_statement + incorr_statement + right_reason1 + right_reason2
random.shuffle(validation_list)
validation_list  # inspect the shuffled validation examples

column_labels = ['Sentence', 'Labels']
train_data_df = pd.DataFrame(data_list, columns = column_labels)
test_data_df = pd.DataFrame(test_list, columns = column_labels)
dev_data_df = pd.DataFrame(validation_list, columns = column_labels)

train_data_df.head()

(X_train, Y_train) , (X_test, Y_test), preprocess = text.texts_from_df(train_df= train_data_df, 
                                                                       text_column= 'Sentence', 
                                                                       label_columns= 'Labels',
                                                                       val_df = dev_data_df,
                                                                       maxlen = 400,
                                                                       preprocess_mode = 'bert')

X_train[0].shape  # token-ID matrix of shape (number of samples, maxlen)

model = text.text_classifier(name='bert', 
                             train_data = (X_train, Y_train),
                             preproc = preprocess)

learner = ktrain.get_learner(model = model,
                             train_data = (X_train, Y_train),
                             val_data = (X_test, Y_test),
                             batch_size = 6)

learner.fit_onecycle(lr = 2e-5, epochs=1)
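After training, the model can be evaluated on the held-out set and wrapped in a predictor; a minimal sketch using ktrain's built-in validation report:

learner.validate(class_names=preprocess.get_classes())  # per-class precision/recall/F1
predictor = ktrain.get_predictor(learner.model, preprocess)
predictor.predict(['An illustrative sentence to classify'])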