def test_save_load_model():
    """Round-trip a fitted model through disk and verify the restored
    model scores identically to the original."""
    X_train, y_train, X_dev, y_dev = sst2_test_data()

    clf = BertClassifier()
    clf.max_seq_length = 64
    clf.train_batch_size = 8
    clf.epochs = 1
    clf.fit(X_train, y_train)
    score_before = clf.score(X_dev, y_dev)

    savefile = './test_model_save.bin'
    print("\nSaving model to ", savefile)
    clf.save(savefile)

    # reload from disk and re-score with the restored model
    restored = load_model(savefile)
    score_after = restored.score(X_dev, y_dev)

    # clean up the temporary model file
    print("Cleaning up model file: test_model_save.bin ")
    os.remove(savefile)

    assert score_before == score_after
def test_not_fitted_exception():
    """Test predicting with a model that has not been fitted.

    Configures a multilabel classifier but deliberately does NOT fit it,
    then asserts that scoring raises.
    """
    X_train, y_train, X_dev, y_dev, label_list = toxic_test_data()
    model = BertClassifier()
    model.max_seq_length = 64
    model.train_batch_size = 8
    model.epochs = 1
    model.multilabel = True
    model.label_list = label_list
    # BUG FIX: the original called model.fit(X_train, y_train) here, which
    # fitted the model and defeated the purpose of the test — score() would
    # then succeed and pytest.raises would fail. The model must stay
    # unfitted so that score() raises.
    with pytest.raises(Exception):
        model.score(X_dev, y_dev)
def bert_model():
    """Build and return an unfitted BertClassifier with the standard
    hyperparameter settings used by these experiments."""
    clf = BertClassifier()
    # alternatives tried: 'bert-base-uncased', 'scibert-basevocab-uncased'
    clf.bert_model = 'bert-large-uncased'
    clf.max_seq_length = 64
    clf.epochs = 4
    # previously tried: 4e-5
    clf.learning_rate = 2e-5
    clf.gradient_accumulation_steps = 1
    return clf
def test_bert():
    """Smoke-test: restore a finetuned model from disk and print the
    embedding (and its shape) for a sample sentence.

    NOTE(review): relies on hard-coded user-specific paths — runnable only
    on the original author's machine.
    """
    frame = pd.read_csv("~/Documents/OOA/v1.1/Data/train.csv")[:100]
    X, y = data_from_df(frame)

    model = BertClassifier()
    model.restore_finetuned_model(
        "/Users/oduwaedoosagie/Desktop/berts/baby_bert/v2/baby_bert2.bin")

    print(model.bert_embedding(["i like pie"]))
    print(model.bert_embedding(["i like pie"]).shape)
def _create_classifier(self, num_threads, y):
    """Instantiate a BertClassifier from this component's configuration.

    `num_threads` and `y` are accepted for interface compatibility but are
    not used by this implementation.
    """
    from bert_sklearn import BertClassifier

    cfg = self.component_config
    params = {
        key: cfg[key]
        for key in (
            "bert_model",
            "epochs",
            "max_seq_length",
            "train_batch_size",
            "validation_fraction",
        )
    }
    return BertClassifier(**params)
def train_model(train, model_file_to_save, epochs=3, val_frac=0.1, class_weight=None):
    """Fit a BertClassifier on `train` (columns 'sentence'/'label') and
    persist the fitted model to `model_file_to_save`.

    Returns the fitted model.
    """
    X_train = train['sentence']
    y_train = train['label']

    model = BertClassifier(
        bert_model=BERT_MODEL,
        random_state=RANDOM_STATE,
        class_weight=class_weight,
        max_seq_length=128,
        train_batch_size=32,
        learning_rate=2e-5,
        epochs=epochs,
        validation_fraction=val_frac,
    )
    print(model)

    model.fit(X_train, y_train)
    model.save(model_file_to_save)
    print(f'\n- model saved to: {model_file_to_save}\n')
    return model
plt.show()

if __name__ == "__main__":
    # paths to the pre-vectorized train/test data
    train_path = 'F:/code/tfcode/school_code/dataclean_code/train_data_vec.npz'
    test_path = 'F:/code/tfcode/school_code/dataclean_code/test_data_vec.npz'

    # load the train and test sets
    train_x, train_y = get_dataset(train_path)
    test_x, test_y = get_dataset(test_path)

    # shuffle features and labels with the same seed so pairs stay aligned
    seed = 1234
    random.seed(seed)
    random.shuffle(train_x)
    random.seed(seed)
    random.shuffle(train_y)

    seed = 2143
    random.seed(seed)
    random.shuffle(test_x)
    random.seed(seed)
    random.shuffle(test_y)

    model = BertClassifier()
    model.fit(train_x, train_y)

    pre_y = model.predict(test_x)
    # BUG FIX: score() expects (X, y) like sklearn estimators; the original
    # passed the predictions (pre_y) as X instead of the test features.
    score = model.score(test_x, test_y)
    print(score)
## 去除噪声标签,获得训练数据 data_df_not_na_label = data_df_not_na[data_df_not_na['label'].isin( ['0', '1', '-1'])] data_df_not_na_label['label'].value_counts() train_df, dev_df = train_test_split(data_df_not_na_label, test_size=0.2, shuffle=True) ## 准备模型的数据 X_train, y_train = train_df['content'], train_df['label'] X_dev, y_dev = dev_df['content'], dev_df['label'] # define model model = BertClassifier('bert-base-uncased') model.validation_fraction = 0.0 model.learning_rate = 3e-5 model.gradient_accumulation_steps = 1 model.max_seq_length = 64 model.train_batch_size = 1 model.eval_batch_size = 1 model.epochs = 1 # fit model.fit(X_train, y_train) # score accy = model.score(X_dev, y_dev) test_df = pd.read_csv(
def test_bert_sklearn_accy():
    """
    Test BERTss accuracy

    compare against huggingface run_classifier.py on 200 rows of SST-2 data.
    """
    print("Running bert-sklearn...")
    X_train, y_train, X_dev, y_dev = sst2_test_data()

    # define model
    model = BertClassifier()
    model.validation_fraction = 0.0
    model.learning_rate = 5e-5
    model.gradient_accumulation_steps = 2
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 2
    model.fit(X_train, y_train)
    bert_sklearn_accy = model.score(X_dev, y_dev)
    # score() reports a percentage; run_classifier.py reports a fraction
    bert_sklearn_accy /= 100

    # run huggingface BERT run_classifier and check we get the same accuracy
    cmd = r"python tests/run_classifier.py --task_name sst-2 \
    --data_dir ./tests/data/sst2 \
    --do_train --do_eval \
    --output_dir ./comptest \
    --bert_model bert-base-uncased \
    --do_lower_case \
    --learning_rate 5e-5 \
    --gradient_accumulation_steps 2 \
    --max_seq_length 64 \
    --train_batch_size 16 \
    --eval_batch_size 8 \
    --num_train_epochs 2"
    print("\nRunning huggingface run_classifier.py...\n")
    os.system(cmd)
    print("...finished run_classifier.py\n")

    # parse run_classifier.py output file and find the accy.
    # BUG FIX: the original used open(...).read() without closing the file
    # handle — use a context manager.
    with open("comptest/eval_results.txt") as f:
        accy = f.read().split("\n")[0]  # 'acc = 0.76'
    accy = float(accy.split("=")[1])

    print("BERTss accy: %.02f, run_classifier.py accy : %0.02f" %
          (bert_sklearn_accy, accy))

    # clean up
    print("\nCleaning up eval file: eval_results.txt")
    shutil.rmtree("comptest")

    # BUG FIX: exact float == after a division and a string parse is
    # fragile; compare with a tight absolute tolerance instead.
    assert abs(bert_sklearn_accy - accy) < 1e-9
def test_nonbinary_classify():
    """Test non-binary classification with different inputs.

    The same 5 rows presented as a DataFrame, an ndarray, and a list must
    all yield identical predictions.
    """
    train = pd.read_csv(DATADIR + "/mnli/train.csv")
    X_train = train[['text_a', 'text_b']]
    y_train = train['label']

    model = BertClassifier()
    model.validation_fraction = 0.0
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 1
    model.fit(X_train, y_train)
    accy = model.score(X_train, y_train)

    # pandas DataFrame input — baseline predictions
    head = X_train[:5]
    print("testing %s input" % (type(head)))
    baseline = model.predict(head)

    # numpy array input must agree with the baseline
    as_array = X_train[:5].values
    print("testing %s input" % (type(as_array)))
    assert list(model.predict(as_array)) == list(baseline)

    # plain list input must agree with the baseline
    as_list = list(X_train[:5].values)
    print("testing %s input" % (type(as_list)))
    assert list(model.predict(as_list)) == list(baseline)
from sklearn.model_selection import train_test_split
from preparacaoDados import tratamentoDados
from bert_sklearn import BertClassifier
from sklearn.metrics import f1_score

data, label = tratamentoDados("sem OHE")
tfidf = tratamentoDados("tfidf")

# 70/30 split, fixed seed (no stratification)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf, label, test_size=0.3, random_state=5)
del data, tfidf

# Define the text / text-pair classification model
model = BertClassifier()

# Train
model.fit(X_train, y_train.values.ravel())

# Predict on the held-out split
y_predito = model.predict(X_test)

# Report micro / macro F1
micro = f1_score(y_test, y_predito, average='micro')
macro = f1_score(y_test, y_predito, average='macro')
print("O f1Score micro do Bert é: ", micro)
print("O f1Score macro do Bert é: ", macro)
# NOTE(review): `tokenize`, `rdrsegmenter`, `df`, `train`, `test` and
# `text_normalize` are defined outside this chunk.
if tokenize:
    # NOTE(review): the result is bound to `text` but never used below —
    # looks like leftover notebook code; confirm before relying on it.
    text = df['comment'].apply(lambda x: rdrsegmenter.tokenize(x))

# normalize both splits in place (text_normalize presumably mutates its
# argument — its definition is not visible here)
text_normalize(train)
text_normalize(test)
train.head()

train_text = train['comment']
train_label = train['label']
# 80/20 train/validation split; note train_label is rebound to the split
train_sents, val_sents, train_label, val_labels = train_test_split(
    train_text, train_label, test_size=0.2)
train_sents.head()

model = BertClassifier(max_seq_length=128, train_batch_size=32, epochs=5,
                       bert_model='bert-base-multilingual-cased')
model

# Commented out IPython magic to ensure Python compatibility.
# %%time
# history = model.fit(train_text, train_label)
# NOTE(review): the fit call above is commented out, so score() below runs
# on an unfitted model unless fitting happens elsewhere. Also, it fits on
# the FULL train_text rather than train_sents, which would leak the
# validation split into training — confirm intent before re-enabling.

accy = model.score(val_sents, val_labels)

# make class probability predictions
y_prob = model.predict_proba(val_sents)
print("class prob estimates:\n", y_prob)

# make predictions
y_pred = model.predict(val_sents)
def bert(train_x, train_y, test_x, test_y):
    """Fit a BertClassifier with the module-level `bert_params` and print
    its accuracy on the test split."""
    # use a distinct local name instead of shadowing the function name
    model = BertClassifier(**bert_params)
    model.fit(train_x, train_y.values.ravel())
    accuracy = model.score(test_x, test_y.values.ravel())
    print('BERT Accuracy:', accuracy)