Example #1
import os

import ktrain
from ktrain import text
from termcolor import colored


def train(epochs=3, batchSize=8):
    '''
    Trains the BERT model. Saves the trained BERT model in the NLP/BERT/log
    directory.

    :param epochs: number of epochs to train the network
    :param batchSize: size of batches for training
    :return: N/A
    '''
    # blockPrint()

    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    ouput_msg = "Begin training the BERT network ..."
    print(colored(ouput_msg, 'cyan'))

    current_dir = os.path.dirname(os.path.abspath(__file__))
    datadir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        datadir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # ==================== TRAIN BERT MODEL ==================== #
    # ========================================================== #
    learner.fit_onecycle(2e-5, epochs)

    predictor = ktrain.get_predictor(learner.model, preproc=preproc)

    # ========================================================== #
    # ====================== SAVE MODEL ======================== #
    # ========================================================== #
    output_msg = "Saving the trained BERT model in NLP/log/bert_model.h5 ..."
    print(colored(output_msg, 'cyan'))

    save_dir = os.path.join(current_dir, '../log')
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    # Save the predictor and the model weights once the directory exists.
    predictor.save(save_dir)

    save_file = os.path.join(save_dir, 'bert_model.h5')
    learner.save_model(save_file)
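
A minimal usage sketch, assuming the imports above and the data/bert_data
folder layout the function expects (the argument values are illustrative):

if __name__ == '__main__':
    train(epochs=1, batchSize=4)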
Example #3
def texts_from_folder(preprocess_mode="standard"):
    DATADIR = "./text_data/text_folder"
    trn, val, preproc = txt.texts_from_folder(
        DATADIR,
        max_features=100,
        maxlen=10,
        ngram_range=3,
        classes=["pos", "neg"],
        train_test_names=["train", "test"],
        preprocess_mode=preprocess_mode,
    )

    return (trn, val, preproc)
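
A hypothetical driver for the wrapper above, using its default 'standard'
preprocessing:

trn, val, preproc = texts_from_folder()
print(preproc.get_classes())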
Example #4
import ktrain
from ktrain import text as txt


def classify_from_folder():
    DATADIR = './text_data/text_folder'
    (x_train, y_train), (x_test, y_test), preproc = txt.texts_from_folder(
        DATADIR,
        max_features=100,
        maxlen=10,
        ngram_range=3,
        classes=['pos', 'neg'])
    model = txt.text_classifier('nbsvm', (x_train, y_train))
    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=1)
    hist = learner.autofit(0.001, 250)
    return hist
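
A hypothetical driver that runs the nbsvm example and inspects the returned
Keras History object:

hist = classify_from_folder()
print(hist.history.keys())  # loss/metric curves recorded per epoch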
Example #5
import os

import tensorflow as tf
import ktrain
from ktrain import text

"""### Loading the IMDB dataset"""

dataset = tf.keras.utils.get_file(fname="aclImdb_v1.tar.gz",
                                  origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
                                  extract=True)
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), 'aclImdb')  # ACL = Association for Computational Linguistics

print(os.path.dirname(dataset))
print(IMDB_DATADIR)

"""### Creating the training and test sets"""

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(datadir=IMDB_DATADIR,
                                                                       classes=['pos','neg'],
                                                                       maxlen=500,
                                                                       train_test_names=['train','test'],
                                                                       preprocess_mode='bert')

"""## Part 2: Building the BERT model"""

model = text.text_classifier(name='bert',
                             train_data=(x_train, y_train),
                             preproc=preproc)

"""## Part 3: Training the BERT model"""

learner = ktrain.get_learner(model=model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)
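
"""The export stops after building the learner. The usual next steps, following the pattern of the other examples on this page, would be a one-cycle fit and a predictor (the learning rate and epoch count here are illustrative):"""

learner.fit_onecycle(2e-5, 1)
predictor = ktrain.get_predictor(learner.model, preproc)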
Example #6
import ktrain
from ktrain import text

## Load data

trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Ablation_Study_01/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'Left', 'Right'])

## Inspection of available models

text.print_text_classifiers()

## Apply the bigru model

bigru = text.text_classifier("bigru", trn, preproc=preproc)

learner_bigru = ktrain.get_learner(bigru, train_data=trn, val_data=val)

learner_bigru.lr_find(show_plot=True, max_epochs=5)

learner_bigru.lr_estimate()

learner_bigru.fit(learner_bigru.lr_estimate()[1], 5)
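
A sketch of how the fitted bigru model could be evaluated and persisted
afterwards; the save path is illustrative:

learner_bigru.validate(class_names=preproc.get_classes())
predictor_bigru = ktrain.get_predictor(learner_bigru.model, preproc)
predictor_bigru.save('/tmp/bigru_predictor')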
Example #7
# This file trains a BERT model to make predictions on the reviews. The
# feature has not been put into use due to hardware limitations.

import ktrain
from ktrain import text
import glob

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'aclImdb',
    maxlen=500,
    preprocess_mode='bert',
    train_test_names=['train', 'test'],
    classes=['pos', 'neg'])
model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model,
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)
learner.fit_onecycle(2e-5, 2)  # train for 2 epochs
predictor = ktrain.get_predictor(model, preproc)
predictor.save('/models/predictor')

predictor = ktrain.load_predictor('/models/predictor')
dataset = 'aclImdb/train/unsup'
file_list = glob.glob(dataset + "/*.txt")
results = open("train_labels.txt", "w")
for file in file_list:
    with open(file, "r", encoding="utf-8") as f:
        review_text = f.readlines()[0]
    predict = predictor.predict(review_text)
    results.write(predict + '\n')
results.close()
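
predictor.predict also accepts a list of texts (as Example #13 below shows),
so the per-file loop above can be batched, which is typically much faster on
GPU. A sketch under the same file layout:

reviews = []
for file in file_list:
    with open(file, "r", encoding="utf-8") as f:
        reviews.append(f.readlines()[0])
predictions = predictor.predict(reviews)
with open("train_labels.txt", "w") as results:
    results.write('\n'.join(predictions) + '\n')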
Example #8
import argparse
import os

import numpy as np
import ktrain
from ktrain import text

parser = argparse.ArgumentParser()
parser.add_argument("--datadir")
parser.add_argument("--k", default=10)
args = parser.parse_args()

# Average accuracy
average_accuracy = np.zeros(args.k)

# For each fold
for k in range(args.k):
    # Validation directory
    fold_dir = os.path.join(args.datadir, "k{}".format(k))
    fold_val_dir = os.path.join(fold_dir, "val")

    # Load training and validation data from a folder
    # (`classes` is assumed to be defined elsewhere as the list of label names)
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        fold_dir, maxlen=512, preprocess_mode='bert', classes=classes)

    # Load BERT
    learner = ktrain.get_learner(text.text_classifier('bert',
                                                      (x_train, y_train)),
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=16)

    # Get good learning rate
    learner.lr_find()

    # Plot
    learner.lr_plot()

    # Train the model
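    # The snippet breaks off here. A hypothetical completion of the fold loop,
    # recording each fold's validation accuracy (this assumes the model is
    # compiled with an accuracy metric, as ktrain's text classifiers are by
    # default):
    learner.fit_onecycle(2e-5, 1)
    _, average_accuracy[k] = learner.model.evaluate(x_test, y_test, verbose=0)

print("Mean cross-validation accuracy:", average_accuracy.mean())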
Example #9
import ktrain
from ktrain import text

## Loading data

trn, val, preproc = text.texts_from_folder(
    "/home/jupyter-ozkan_ma/data/TXT/Full_Experiment/",
    max_features=20000,
    maxlen=512,
    ngram_range=1,
    preprocess_mode='standard',
    classes=['Center', 'LeanLeft', 'LeanRight', 'Left', 'Right'])

## Inspection of available classifiers

text.print_text_classifiers()

### Applying the fasttext model (mod_17):

fasttext = text.text_classifier("fasttext", trn, preproc=preproc)

learner_ft = ktrain.get_learner(fasttext, train_data=trn, val_data=val)

learner_ft.lr_find(show_plot=True, max_epochs=5)

learner_ft.lr_estimate()

learner_ft.fit(learner_ft.lr_estimate()[1], 5)

# Since val_loss still decreases, train for 5 more epochs
learner_ft.fit(learner_ft.lr_estimate()[1], 5)
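
The "keep training while val_loss falls" pattern can also be automated: when
autofit is called without an epoch count, ktrain trains with early stopping
until validation loss stops improving. A sketch using the estimate above:

learner_ft.autofit(learner_ft.lr_estimate()[1])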
Example #10
    f = open("BERT_folder/test/neg/%d.txt" % i, "w+")
    f.write(t)
    f.close()
    i += 1

print("DONE NEG TEST")

print("DONE PREPARING BERT FOLDER")

print("START TRAINING")

(x_train_small,
 y_train_small), (x_test_small,
                  y_test_small), preproc_small = text.texts_from_folder(
                      "BERT_folder",
                      maxlen=199,
                      preprocess_mode='bert',
                      train_test_names=['train', 'test'],
                      classes=['pos', 'neg'])

model_small = text.text_classifier('bert', (x_train_small, y_train_small),
                                   preproc=preproc_small)
learner_small = ktrain.get_learner(model_small,
                                   train_data=(x_train_small, y_train_small),
                                   val_data=(x_test_small, y_test_small),
                                   batch_size=10)

learner_small.fit_onecycle(2e-5, 1)

print("DONE WITH TRAINING")

print("START TO PREDICT")
Example #11
cd ~/environments/f

virtualenv -p python py-keras
source py-keras/bin/activate

pip install ktrain
pip install keras
pip install tf-nightly-gpu 

python

import ktrain
from ktrain import text

# Model
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    'aclImdb',
    maxlen=100,
    preprocess_mode='bert',
    classes=['pos', 'neg'])  # see references for maxlen & batch size

# Test
learner = ktrain.get_learner(text.text_classifier('bert', (x_train, y_train)),
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)

learner.fit_onecycle(2e-5, 1)

# Predict, save & reload
predictor = ktrain.get_predictor(learner.model, preproc)
#See reference 2
predictor.save('/home/antony/environments/f/model')

References:
https://towardsdatascience.com/bert-text-classification-in-3-lines-of-code-using-keras-264db7e7a358
https://github.com/amaiya/ktrain/blob/master/tutorial-04-text-classification.ipynb
Example #12
########## TO USE GPU ###################################
#config =  tf.compat.v1.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.4
#sess = tf.compat.v1.Session(config=config)
#keras.backend.set_session(sess)
#########################################################

########## TO IGNORE GPU ################
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
#########################################

import ktrain
from ktrain import text

(x_train,
 y_train), (x_test,
            y_test), preproc = text.texts_from_folder("../datasets/aclImdb",
                                                      maxlen=500,
                                                      preprocess_mode="bert",
                                                      classes=["pos", "neg"])

learner = ktrain.get_learner(text.text_classifier("bert", (x_train, y_train),
                                                  preproc=preproc),
                             train_data=(x_train, y_train),
                             val_data=(x_test, y_test),
                             batch_size=6)

learner.fit_onecycle(2e-5, 1)
predictor = ktrain.get_predictor(learner.model, preproc)
data = [
    'This movie was horrible! The plot was boring. Acting was okay, though.',
    'The film really sucked. I want my money back.',
    'The plot had too many holes.',
    'What a beautiful romantic comedy. 10/10 would see again!',
Example #13
import json
import os

import ktrain
from ktrain import text
from termcolor import colored

import progress  # assumed project-local helper providing printProgressBar


def test(datadir, batchSize=6):
    '''
    Predicts whether or not an abstract indicates a new dataset.

    :param datadir: path to the eval.json file of evaluation examples
    :param batchSize: size of batches used by the learner
    :return: N/A (results are written to data/results.json)
    '''
    # ========================================================== #
    # ======================== PARAMS ========================== #
    # ========================================================== #
    current_dir = os.path.dirname(os.path.abspath(__file__))
    traindir = os.path.join(current_dir, '../../../data/bert_data')

    # ========================================================== #
    # ================= GET EVALUATION DATA ==================== #
    # ========================================================== #
    output_msg = 'Setting up BERT network for classification ...'
    print(colored(output_msg, 'cyan'))

    if not os.path.exists(traindir):
        error = ('Data in directory inDexDa/data/bert_data has either been'
                 ' deleted or is formatted incorrectly. Refer to the original'
                 ' data supplied in the repo for proper formatting.')
        print(colored(error, 'red'))
        raise Exception(error)

    if not os.path.exists(datadir):
        error = (
            'Data directory for evaluation data does not exist. Make sure'
            ' that the directory and eval.json file exist at: {}'.format(datadir))
        print(colored(error, 'red'))
        raise Exception(error)

    with open(datadir, 'r') as f:
        contents = f.read()
        raw = json.loads(contents)
        eval_papers = [paper["Abstract"] for paper in raw]

    # ========================================================== #
    # ================= SET UP BERT NETWORK ==================== #
    # ========================================================== #
    (x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
        traindir,
        maxlen=500,
        preprocess_mode='bert',
        train_test_names=['train', 'test'],
        classes=['0', '1'])

    model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

    learner = ktrain.get_learner(model,
                                 train_data=(x_train, y_train),
                                 val_data=(x_test, y_test),
                                 batch_size=batchSize)

    # ========================================================== #
    # =============== LOAD PRETRAINED BERT MODEL =============== #
    # ========================================================== #
    output_msg = 'Loading the pretrained BERT network ...'
    print(colored(output_msg, 'cyan'))

    load_file = os.path.join(current_dir, '../log/bert_model.h5')
    try:
        learner.load_model(load_file)
    except Exception:
        error = 'Something went wrong when trying to load the weights for the BERT model.'
        print(colored(error, 'red'))
        exit()

    predictor = ktrain.get_predictor(learner.model, preproc)

    # ========================================================== #
    # ======================== PREDICT ========================= #
    # ========================================================== #
    output_msg = 'Predicting if new datasets are presented ...'
    print(colored(output_msg, 'cyan'))

    prediction = predictor.predict(eval_papers)

    results = []
    for idx, paper in enumerate(eval_papers):
        if prediction[idx] == '0':
            results.append({"Abstract": paper, "Prediction": "No Dataset"})
        elif prediction[idx] == '1':
            results.append({
                "Abstract": paper,
                "Prediction": "Dataset Detected"
            })

    # ========================================================== #
    # ================== INFO ABOUT DATASETS =================== #
    # ========================================================== #
    output_msg = "Finalizing BERT Outputs ..."
    print(colored(output_msg, 'cyan'))

    dataset_papers = []
    for idx, result in enumerate(results):
        progress.printProgressBar(idx + 1,
                                  len(results),
                                  prefix='Progress :',
                                  suffix='Complete',
                                  length=30)
        for paper in raw:
            if result["Abstract"] == paper[
                    "Abstract"] and "Dataset Detected" in result["Prediction"]:
                paper.update({"Prediction": result["Prediction"]})
                dataset_papers.append(paper)

    # ========================================================== #
    # ========================= SAVE =========================== #
    # ========================================================== #
    output_msg = 'Saving results ...'
    print(colored(output_msg, 'cyan'))
    outputdir = os.path.join(current_dir, '../../../data/results.json')
    with open(outputdir, 'w') as f:
        json.dump(dataset_papers, f, indent=4)
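
A minimal driver sketch for this function; the eval.json path is hypothetical
and follows the directory conventions used in the function body:

if __name__ == '__main__':
    test(datadir='../../../data/eval.json', batchSize=6)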