def test_save_load(self):
    """
    Round-trip a fitted classifier through disk and compare predictions.

    The model is saved twice: once as-is and once after switching the saver
    to float16. Both files must stay below their size limits, and the model
    reloaded from the float16 file must reproduce the predictions made
    before saving.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    checkpoint_path_fp16 = "tests/saved-models/test-save-load_fp16"

    model = Classifier(**self.default_config(save_adam_vars=False))
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    expected = model.predict(valid_sample.Text)

    # File-size check for the checkpoint written with save_adam_vars=False.
    model.save(checkpoint_path)
    size_on_disk = os.stat(checkpoint_path).st_size
    self.assertLess(size_on_disk, 500000000)

    # File-size check after lowering the saver's dtype to float16.
    model.saver.save_dtype = np.float16
    model.save(checkpoint_path_fp16)
    size_on_disk_fp16 = os.stat(checkpoint_path_fp16).st_size
    self.assertLess(size_on_disk_fp16, 260000000)

    # Reload from the float16 file and compare element by element.
    model = Classifier.load(checkpoint_path_fp16)
    actual = model.predict(valid_sample.Text)
    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
def test_fit_lm_only(self):
    """
    Ensure LM only training does not error out.

    Fits on text alone, round-trips the model through save/load, then fits
    again with targets and checks the types returned by ``predict`` and
    ``predict_proba``.
    """
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)

    # Ensure model can still be fit with only text
    model.fit(train_sample.Text)

    # Save and reload check
    save_file = 'tests/saved-models/test-save-load'
    model.save(save_file)
    model = Classifier.load(save_file)

    # Ensure model can still be fit with text + targets
    model.fit(train_sample.Text, train_sample.Target)
    predictions = model.predict(valid_sample.Text)
    for prediction in predictions:
        # `np.int` was only an alias of the builtin `int`; it was deprecated
        # in NumPy 1.20 and removed in 1.24, where referencing it raises
        # AttributeError. The builtin keeps the exact same check.
        self.assertIsInstance(prediction, (int, np.int64))

    probabilities = model.predict_proba(valid_sample.Text)
    for proba in probabilities:
        self.assertIsInstance(proba, dict)
def test_save_load_language_model(self):
    """
    Exercise text generation before fitting, after fitting and after reload.

    Before fitting, the generated text must not contain the encoder's start
    token; after fitting and after a save/load round trip it must.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    model = Classifier()

    # Non finetuned models do not use extra tokens
    generated = model.generate_text("The quick brown fox", 6)
    encoder = model.input_pipeline.text_encoder
    start_token = encoder.decoder[encoder.start_token]
    self.assertNotIn(start_token, generated)

    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    generated = model.generate_text("", 5)
    self.assertIn(start_token, generated.lower())
    self.assertEqual(type(generated), str)

    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)

    # Both of these models use extra toks
    generated_after_reload = model.generate_text("Indico RULE")
    self.assertEqual(type(generated_after_reload), str)
    expected_fragment = "{}Indico RULE".format(start_token).lower()
    self.assertIn(expected_fragment, generated_after_reload.lower())
def test_save_load_language_model(self):
    """
    Ensure text generation works before and after a save/load round trip.

    Fits a classifier, checks that ``generate_text`` returns a ``str``,
    saves and reloads the model, then checks that generation seeded with
    "Indico RULE" returns a ``str`` containing the seed prefixed by
    ``_start_``.
    """
    save_file = 'tests/saved-models/test-save-load'
    model = Classifier(verbose=False)
    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    lm_out = model.generate_text("", 5)
    self.assertEqual(type(lm_out), str)

    model.save(save_file)
    model = Classifier.load(save_file)

    lm_out_2 = model.generate_text("Indico RULE")
    self.assertEqual(type(lm_out_2), str)
    # The expected fragment is lower-cased, so the generated text has to be
    # lower-cased as well; otherwise the check only passes when the model
    # happens to emit lower-case output.
    self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())
def test_save_load(self):
    """
    Check that a save/load round trip leaves predictions unchanged.

    Fits a classifier, records its predictions on a validation sample,
    saves and reloads the model, and compares the new predictions to the
    recorded ones element by element.
    """
    checkpoint_path = 'tests/saved-models/test-save-load'
    model = Classifier(config=self.default_config())

    train_sample = self.dataset.sample(n=self.n_sample)
    valid_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)
    expected = model.predict(valid_sample.Text)

    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)

    actual = model.predict(valid_sample.Text)
    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
def test_save_load_language_model(self):
    """
    Check text generation on a fitted model and again after reloading it.

    Both generations must return a ``str``; the one produced after the
    save/load round trip must contain the encoder's start token followed by
    the seed text, compared case-insensitively.
    """
    checkpoint_path = 'tests/saved-models/test-save-load'
    model = Classifier()
    train_sample = self.dataset.sample(n=self.n_sample)
    model.fit(train_sample.Text, train_sample.Target)

    generated = model.generate_text("", 5)
    self.assertEqual(type(generated), str)

    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)

    generated_after_reload = model.generate_text("Indico RULE")
    self.assertEqual(type(generated_after_reload), str)

    # Look the start token up on the reloaded model's encoder.
    encoder = model.input_pipeline.text_encoder
    start_token = encoder.decoder[encoder.start]
    expected_fragment = '{}Indico RULE'.format(start_token).lower()
    self.assertIn(expected_fragment, generated_after_reload.lower())
def test_save_load(self):
    """
    Check that a save/load round trip leaves context-aware predictions unchanged.

    Fits for one epoch on the fixture data with its context, predicts on the
    same data, then reloads the saved model and requires identical
    predictions.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    model = Classifier(**self.default_config(save_adam_vars=False, n_epochs=1))

    model.fit(self.trainX, self.trainY, context=self.train_context)
    expected = model.predict(self.trainX, context=self.train_context)

    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)

    actual = model.predict(self.trainX, context=self.train_context)
    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
def test_save_load(self):
    """
    Check that a save/load round trip leaves predictions unchanged.

    The fixture's training targets are replaced with random 0/1 labels
    before a one-epoch fit; predictions on the test inputs must be the same
    before saving and after reloading.
    """
    checkpoint_path = "tests/saved-models/test-save-load"
    model = Classifier(**self.default_config(save_adam_vars=False, n_epochs=1))

    trainX, testX, trainY, _ = self.dataset
    # One random binary label per original training target.
    label_count = len(trainY)
    trainY = [random.randint(0, 1) for _ in range(label_count)]

    model.fit(trainX, trainY)
    expected = model.predict(testX)

    model.save(checkpoint_path)
    model = Classifier.load(checkpoint_path)

    actual = model.predict(testX)
    for position, before in enumerate(expected):
        self.assertEqual(before, actual[position])
l2_reg=0.0, lr=6.25E-05, lm_loss_coef=0.25, # eval_acc = True, # doesn't work # oversample = True, # oversamples too much, so I am doing it separately params_device=0, autosave_path="/W210_Gov_Complaints_Portal/models/", verbose=True, ) model.fit(trainX_res_list, trainY_res_list) # Finetune base model on custom data duration = time.time() - start print("Training Done") print("It took :" + str(duration) + " seconds") model.save("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117" ) # Serialize the model to disk print("Model Saved") print("Starting testing") # model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117") print(testX.shape) print(model) start = time.time() predictions = model.predict(testX.tolist()) duration = time.time() - start print("Predictions done") print("It took :" + str(duration) + " seconds") print("Evaluating accuracy") mainPredictions = [] for pred in predictions:
DATA_PATH = Path('./data')
MODELS_PATH = Path('./models')
# Created at import time so the save target used by main() always exists.
MODELS_PATH.mkdir(exist_ok=True)


def build_parser():
    """Build the command-line parser for the language-model finetuning script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--nrows``, ``--name`` and
        ``--labeled``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--nrows', default=147618, type=int, help='Define number of posts to be used to perform unsupervised finetuning of language model, defaults to all posts available (147618)')
    # --name is required: without it args.name is None and the script would
    # only fail at `MODELS_PATH / None`, i.e. after the whole finetuning run.
    parser.add_argument('--name', type=str, required=True, help='Name of model to be saved in ./models directory')
    parser.add_argument('--labeled', action='store_true', help='Use only labeled posts for finetuning')
    return parser


def main(argv=None):
    """Finetune the language model on post texts and save it under ``./models``.

    Args:
        argv: Optional list of command-line arguments; ``None`` makes argparse
            read ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    # read in data and select sample based on CLI args
    posts_df = pd.read_csv(
        DATA_PATH / 'processed' / 'all_posts_data.csv',
        usecols=['post_id', 'cleaned_body', 'label', 'predict_me'],
    )
    if args.labeled:
        posts_sample = posts_df[(posts_df.label.notnull()) | posts_df.predict_me]
    else:
        # DataFrame.sample raises ValueError when n exceeds the number of rows
        # (sampling without replacement), so cap the request at what the file
        # actually holds. The count used is reported by the print below.
        n_rows = min(args.nrows, len(posts_df))
        posts_sample = posts_df.sample(n=n_rows, random_state=42)

    texts = list(posts_sample.cleaned_body.astype(str))
    print(f'{len(texts)} posts will be used to finetune the GPT language model')

    # Fit on text only (no targets are passed), then save under the given name.
    model = Classifier(batch_size=8)
    model.fit(texts)
    model.save(MODELS_PATH / args.name)


if __name__ == '__main__':
    main()
# Show the size of the raw frame and the row labelled 82480.
print(data3.shape)
print(data3.loc[82480])

# Keep only rows whose description is 20 to 512 characters long (inclusive).
description_length = data3['description'].str.len()
mask = (description_length >= 20) & (description_length <= 512)
dataFiltered = data3.loc[mask]
print(dataFiltered.shape)

# Lists the columns that contain NaN; the result is discarded here, so this
# only displays something when run interactively.
# ourLabel doesn't have NaN values, so that is good.
dataFiltered.columns[dataFiltered.isna().any()].tolist()

trainingData = dataFiltered[["description", "OurLabel"]]
print(type(trainingData))
print(trainingData.shape)

# Split in train and test 80/20
trainX, testX, trainY, testY = train_test_split(
    trainingData["description"],
    trainingData["OurLabel"],
    test_size=0.2,
    random_state=42,
)
print(trainX.shape)
print(type(trainX))
print(trainY.shape)

# Load base model
model = Classifier(max_length=512, val_interval=3000, verbose=True)
# Finetune base model on custom data
model.fit(trainX, trainY)
# Serialize the model to disk
model.save("newModel")
stratify=sampleY) print(trainX.shape) print("Split into train and test") print("Starting training") print(trainX.shape) start = time.time() model = Classifier(max_length=512, val_interval=3000, verbose=True) # Load base model model.fit(trainX.tolist(), trainY.tolist()) # Finetune base model on custom data duration = time.time() - start print("Training Done") print("It took :" + str(duration) + " seconds") model.save("combined_model_20181018") # Serialize the model to disk print("Model Saved") # model = Classifier.load("../models/combined_model_20181018") print(testX.shape) print(model) start = time.time() predictions = model.predict(testX.tolist()) duration = time.time() - start print("Predictions done") print("It took :" + str(duration) + " seconds") mainPredictions = [] for pred in predictions: mainPredictions.append(labelsMap[pred])
lr=6.25E-05, lm_loss_coef=0.25, # eval_acc = True, # doesn't work # oversample = True, # oversamples too much, so I am doing it separately params_device=0, autosave_path="/W210_Gov_Complaints_Portal/models/", verbose=True, ) model.fit(trainX.tolist(), trainY.tolist()) # Finetune base model on custom data duration = time.time() - start print("Training Done") print("It took :" + str(duration) + " seconds") model.save( "/W210_Gov_Complaints_Portal/models/combined_model_full_no_oversample_20181123" ) # Serialize the model to disk print("Model Saved") print("Starting testing") # model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_full_no_oversample_20181123") print(testX.shape) print(model) start = time.time() predictions = model.predict(testX.tolist()) duration = time.time() - start print("Predictions done") print("It took :" + str(duration) + " seconds") print("Evaluating accuracy") mainPredictions = []