def test_bert_sklearn_accy():
    """Compare bert_sklearn accuracy against huggingface run_classifier.py.

    Trains a BertClassifier and then runs the reference
    ``tests/run_classifier.py`` with the same hyperparameters, asserting
    that both report (approximately) the same accuracy.

    NOTE(review): the command line targets SST-2 data, but the training
    data here comes from ``toxic_test_data()`` with ``multilabel=True`` —
    confirm these two are actually meant to be comparable.
    """
    print("Running bert-sklearn...")
    X_train, y_train, X_dev, y_dev, label_list = toxic_test_data()

    # define model (hyperparameters mirror the run_classifier.py flags below)
    model = BertClassifier()
    model.validation_fraction = 0.0
    model.learning_rate = 5e-5
    model.gradient_accumulation_steps = 2
    model.max_seq_length = 64
    model.train_batch_size = 16
    model.eval_batch_size = 8
    model.epochs = 2
    model.multilabel = True  # for multi-label classification
    model.label_list = label_list

    model.fit(X_train, y_train)
    bert_sklearn_accy = model.score(X_dev, y_dev)
    bert_sklearn_accy /= 100  # score() reports a percentage; normalize to [0, 1]

    # run huggingface BERT run_classifier and check we get the same accuracy
    cmd = r"python tests/run_classifier.py --task_name sst-2 \ --data_dir ./tests/data/sst2 \ --do_train --do_eval \ --output_dir ./comptest \ --bert_model bert-base-uncased \ --do_lower_case \ --learning_rate 5e-5 \ --gradient_accumulation_steps 2 \ --max_seq_length 64 \ --train_batch_size 16 \ --eval_batch_size 8 \ --num_train_epochs 2"

    print("\nRunning huggingface run_classifier.py...\n")
    os.system(cmd)
    print("...finished run_classifier.py\n")

    # parse run_classifier.py output file and find the accy;
    # first line looks like 'acc = 0.76'.
    # Use a context manager so the file handle is closed deterministically.
    with open("comptest/eval_results.txt") as results_file:
        accy = results_file.read().split("\n")[0]
    accy = float(accy.split("=")[1])

    print("bert_sklearn accy: %.02f, run_classifier.py accy : %0.02f" %
          (bert_sklearn_accy, accy))

    # clean up
    print("\nCleaning up eval file: eval_results.txt")
    shutil.rmtree("comptest")

    # Exact float equality is fragile after the /100 rescale; compare
    # within a tight tolerance instead of `==`.
    assert abs(bert_sklearn_accy - accy) < 1e-6
def bert_model():
    """Return a BertClassifier configured for bert-large fine-tuning.

    Hyperparameters: 64-token sequences, 4 epochs, learning rate 2e-5,
    no gradient accumulation.
    """
    clf = BertClassifier()
    # Checkpoints tried during experimentation:
    # 'bert-base-uncased', 'scibert-basevocab-uncased'
    clf.bert_model = 'bert-large-uncased'
    # clf.num_mlp_layers = 10
    clf.max_seq_length = 64
    clf.epochs = 4
    # Earlier run used 4e-5.
    clf.learning_rate = 2e-5
    clf.gradient_accumulation_steps = 1
    return clf
['0', '1', '-1'])]  # NOTE(review): tail of a statement cut off before this chunk — appears to keep rows whose label is '0', '1' or '-1'; confirm upstream
# Label distribution check (the result is neither stored nor printed here)
data_df_not_na_label['label'].value_counts()
# 80/20 shuffled train/dev split
train_df, dev_df = train_test_split(data_df_not_na_label, test_size=0.2, shuffle=True)
## Prepare the data for the model
X_train, y_train = train_df['content'], train_df['label']
X_dev, y_dev = dev_df['content'], dev_df['label']
# define model
model = BertClassifier('bert-base-uncased')
model.validation_fraction = 0.0  # no internal validation split
model.learning_rate = 3e-5
model.gradient_accumulation_steps = 1
model.max_seq_length = 64
model.train_batch_size = 1
model.eval_batch_size = 1
model.epochs = 1
# fit
model.fit(X_train, y_train)
# score
accy = model.score(X_dev, y_dev)
# Load the test set; row 0 is skipped (presumably a header row) and
# columns are assigned explicit names.
test_df = pd.read_csv(
    'data/nCov_10k_test.csv',
    skiprows=[0],
    names=['id', 'time', 'account', 'content', 'pic', 'video'])