Example #1
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        save_file_fp16 = "tests/saved-models/test-save-load_fp16"

        config = self.default_config(save_adam_vars=False)
        model = Classifier(**config)
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
        predictions = model.predict(valid_sample.Text)

        # testing file size reduction options
        model.save(save_file)
        self.assertLess(os.stat(save_file).st_size, 500000000)

        # reducing floating point precision
        model.saver.save_dtype = np.float16
        model.save(save_file_fp16)
        self.assertLess(os.stat(save_file_fp16).st_size, 260000000)

        model = Classifier.load(save_file_fp16)
        new_predictions = model.predict(valid_sample.Text)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
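A minimal sketch of the same size-reduction trick outside the test harness, assuming the finetune `Classifier` API used above; the paths and toy data are placeholders. Setting `saver.save_dtype` before saving casts the checkpoint weights to half precision, which is what makes the fp16 file in the test markedly smaller.

import numpy as np
from finetune import Classifier

model = Classifier()
model.fit(["great service", "terrible service"], ["positive", "negative"])

model.save("model-fp32")             # full-precision checkpoint
model.saver.save_dtype = np.float16  # cast weights to half precision on save
model.save("model-fp16")             # noticeably smaller file on disk

model = Classifier.load("model-fp16")  # loading works the same either way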
Example #2
    def test_fit_lm_only(self):
        """
        Ensure LM only training does not error out
        """
        model = Classifier()
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Ensure model can still be fit with only text
        model.fit(train_sample.Text)

        # Save and reload check
        save_file = 'tests/saved-models/test-save-load'
        model.save(save_file)
        model = Classifier.load(save_file)

        # Ensure model can still be fit with text + targets
        model.fit(train_sample.Text, train_sample.Target)
        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.int64))  # int covers the np.int alias removed in modern NumPy

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
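To make the two fit modes explicit (a sketch with placeholder data): calling `fit` with texts alone runs unsupervised language-model finetuning, while also passing targets trains the classifier; `predict` then returns one label per input and `predict_proba` one label-to-probability dict per input.

from finetune import Classifier

texts = ["the first example document", "the second example document"]
labels = ["a", "b"]

model = Classifier()
model.fit(texts)          # texts only: unsupervised LM finetuning
model.fit(texts, labels)  # texts + targets: supervised training

print(model.predict(texts))        # e.g. ['a', 'b']
print(model.predict_proba(texts))  # e.g. [{'a': 0.93, 'b': 0.07}, ...]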
Example #3
    def test_save_load_language_model(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        model = Classifier()

        lm_out = model.generate_text("The quick brown fox", 6)
        start_id = model.input_pipeline.text_encoder.start_token
        start_token = model.input_pipeline.text_encoder.decoder[start_id]
        self.assertNotIn(start_token, lm_out)  # Non-finetuned models do not use extra tokens
        
        train_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
        lm_out = model.generate_text("", 5)
        self.assertIn(start_token, lm_out.lower())
        self.assertEqual(type(lm_out), str)
        model.save(save_file)

        model = Classifier.load(save_file)
        lm_out_2 = model.generate_text("Indico RULE")
        self.assertEqual(type(lm_out_2), str)
        
        self.assertIn("{}Indico RULE".format(start_token).lower(), lm_out_2.lower())  # Both of these models use extra tokens
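The start-token lookup used above generalizes to any model instance; a short sketch, assuming the same `input_pipeline.text_encoder` attributes as this example (some finetune versions name the id attribute `start` instead, as Example #6 shows).

from finetune import Classifier

model = Classifier()
start_id = model.input_pipeline.text_encoder.start_token
start_token = model.input_pipeline.text_encoder.decoder[start_id]
print(start_token)  # the special token (e.g. '_start_') that finetuned models prepend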
Example #4
 def test_save_load_language_model(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(verbose=False)
     train_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     lm_out = model.generate_text("", 5)
     self.assertEqual(type(lm_out), str)
     model.save(save_file)
     model = Classifier.load(save_file)
     lm_out_2 = model.generate_text("Indico RULE")
     self.assertEqual(type(lm_out_2), str)
     self.assertIn('_start_Indico RULE'.lower(), lm_out_2)
Example #5
 def test_save_load(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(config=self.default_config())
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     predictions = model.predict(valid_sample.Text)
     model.save(save_file)
     model = Classifier.load(save_file)
     new_predictions = model.predict(valid_sample.Text)
     for i, prediction in enumerate(predictions):
         self.assertEqual(prediction, new_predictions[i])
Example #6
 def test_save_load_language_model(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier()
     train_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     lm_out = model.generate_text("", 5)
     self.assertEqual(type(lm_out), str)
     model.save(save_file)
     model = Classifier.load(save_file)
     lm_out_2 = model.generate_text("Indico RULE")
     self.assertEqual(type(lm_out_2), str)
     start_id = model.input_pipeline.text_encoder.start
     start_token = model.input_pipeline.text_encoder.decoder[start_id]
     self.assertIn('{}Indico RULE'.format(start_token).lower(), lm_out_2.lower())
Example #7
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        config = self.default_config(save_adam_vars=False, n_epochs=1)
        model = Classifier(**config)

        model.fit(self.trainX, self.trainY, context=self.train_context)
        predictions = model.predict(self.trainX, context=self.train_context)
        model.save(save_file)

        model = Classifier.load(save_file)
        new_predictions = model.predict(self.trainX,
                                        context=self.train_context)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
Example #8
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        config = self.default_config(save_adam_vars=False, n_epochs=1)
        model = Classifier(**config)

        (trainX, testX, trainY, _) = self.dataset
        trainY = [random.randint(0, 1) for _ in range(len(trainY))]
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        model.save(save_file)

        model = Classifier.load(save_file)
        new_predictions = model.predict(testX)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
Example #9
model = Classifier(  # leading keyword arguments are cut off in this snippet
    l2_reg=0.0,
    lr=6.25E-05,
    lm_loss_coef=0.25,
    # eval_acc = True,    # doesn't work
    # oversample = True,  # oversamples too much, so I am doing it separately
    params_device=0,
    autosave_path="/W210_Gov_Complaints_Portal/models/",
    verbose=True,
)
model.fit(trainX_res_list,
          trainY_res_list)  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

model.save("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117"
           )  # Serialize the model to disk
print("Model Saved")

print("Starting testing")
# model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print(testX.shape)
print(model)
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")

print("Evaluating accuracy")
mainPredictions = []
for pred in predictions:
    mainPredictions.append(labelsMap[pred])  # map predicted ids back to label names, as in Example #12
Example #10
import argparse
from pathlib import Path

import pandas as pd
from finetune import Classifier

DATA_PATH = Path('./data')
MODELS_PATH = Path('./models')
MODELS_PATH.mkdir(exist_ok=True)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--nrows', default=147618, type=int,
                        help='Number of posts used for unsupervised finetuning of the language model; defaults to all available posts (147618)')
    parser.add_argument('--name', type=str, 
                        help='Name of model to be saved in ./models directory')
    parser.add_argument('--labeled', action='store_true',
                        help='Use only labeled posts for finetuning')
    args = parser.parse_args()

    # read in data and select sample based on CLI args
    posts_df = pd.read_csv(DATA_PATH/'processed'/'all_posts_data.csv', usecols=['post_id', 'cleaned_body', 'label', 'predict_me'])

    if args.labeled:
        posts_sample = posts_df[(posts_df.label.notnull()) | posts_df.predict_me]
    else:
        posts_sample = posts_df.sample(n=args.nrows, random_state=42)     

    texts = list(posts_sample.cleaned_body.astype(str))
    print(f'{len(texts)} posts will be used to finetune the GPT language model')

    model = Classifier(batch_size=8)
    model.fit(texts)

    model.save(MODELS_PATH / args.name)
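The script above performs only the unsupervised stage, since `fit` receives texts without targets. A sketch of the follow-on supervised stage, assuming a checkpoint saved under ./models/lm_tuned (a hypothetical --name) and placeholder labeled data:

from pathlib import Path
from finetune import Classifier

model = Classifier.load(Path('./models') / 'lm_tuned')  # LM-finetuned checkpoint

labeled_texts = ["first labeled post", "second labeled post"]
labels = ["relevant", "irrelevant"]
model.fit(labeled_texts, labels)  # supervised training on top of the tuned LM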
Example #11
print(data3.shape)
print(data3.loc[82480])

mask = (data3['description'].str.len() >=
        20) & (data3['description'].str.len() <= 512)
dataFiltered = data3.loc[mask]
print(dataFiltered.shape)

print(dataFiltered.columns[dataFiltered.isna().any()].tolist())
# OurLabel doesn't have NaN values, so that is good.

trainingData = dataFiltered[["description", "OurLabel"]]
print(type(trainingData))
print(trainingData.shape)
trainX, testX, trainY, testY = train_test_split(trainingData.description,
                                                trainingData.OurLabel,
                                                test_size=0.2,
                                                random_state=42)
# bigMask = (trainingData["description"].str.len() >=1000)
# print(trainingData.loc[bigMask].shape)
# Split in train and test 80/20
print(trainX.shape)
print(type(trainX))
print(trainY.shape)

model = Classifier(max_length=512, val_interval=3000,
                   verbose=True)  # Load base model
model.fit(trainX, trainY)  # Finetune base model on custom data

model.save("newModel")  # Serialize the model to disk
Example #12
# Snippet truncated: the code below begins inside a train_test_split(...) call.
                                                stratify=sampleY)
print(trainX.shape)
print("Split into train and test")

print("Starting training")
print(trainX.shape)
start = time.time()
model = Classifier(max_length=512, val_interval=3000,
                   verbose=True)  # Load base model
model.fit(trainX.tolist(),
          trainY.tolist())  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

model.save("combined_model_20181018")  # Serialize the model to disk
print("Model Saved")

# model = Classifier.load("../models/combined_model_20181018")
print(testX.shape)
print(model)
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")

mainPredictions = []
for pred in predictions:
    mainPredictions.append(labelsMap[pred])
Example #13
model = Classifier(  # leading keyword arguments are cut off in this snippet
    lr=6.25E-05,
    lm_loss_coef=0.25,
    # eval_acc = True,    # doesn't work
    # oversample = True,  # oversamples too much, so I am doing it separately
    params_device=0,
    autosave_path="/W210_Gov_Complaints_Portal/models/",
    verbose=True,
)
model.fit(trainX.tolist(),
          trainY.tolist())  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

model.save(
    "/W210_Gov_Complaints_Portal/models/combined_model_full_no_oversample_20181123"
)  # Serialize the model to disk
print("Model Saved")

print("Starting testing")
# model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_full_no_oversample_20181123")
print(testX.shape)
print(model)
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")

print("Evaluating accuracy")
mainPredictions = []
for pred in predictions:
    mainPredictions.append(labelsMap[pred])  # as in Example #12