# Assumed imports and module-level state for this Sanic-style handler
# (inferred from the request.json / json(...) usage; the original excerpt
# does not show them).
from sanic.response import json
from finetune import Classifier

model = None


async def classifyOpen311Complaint(request):
    global model

    # Check that data was provided
    if request.json is None:
        return json({"result": "No data in request"})

    # Check for a 311 'description' (or batch 'descriptions') field
    if request.json.get('description') is None and request.json.get('descriptions') is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it (Classifier.load
    # returns a fully configured model, so no separate construction is needed)
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    if request.json.get('descriptions') is not None:
        # Batch request: pre-process and classify every description
        processedComplaints = [preProcess(x) for x in request.json.get('descriptions')]
        prediction = model.predict(processedComplaints).tolist()
    else:
        print("Doing simple prediction")
        prediction = model.predict([preProcess(request.json.get('description'))])[0]

    print("Prediction is: ", prediction)

    # If the incoming request carries a service_code we assume a full Open311
    # message, so we update the service_code and return the whole message.
    # Otherwise we send back a new message with the service_code only.
    if request.json.get('service_code') is None:
        print("No service code provided, returning one")
        return json({'service_code': prediction})
    else:
        print("Service_code was provided so updating it")
        request.json['service_code'] = prediction
        return json(request.json)
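
The handler above relies on a preProcess helper that is defined elsewhere in the project and never shown in these excerpts. Its real behavior is unknown; a minimal sketch, assuming it only normalizes case and whitespace, might look like this:

import re

def preProcess(text):
    # Hypothetical stand-in for the project's actual preProcess helper:
    # lowercase the complaint text and collapse runs of whitespace.
    return re.sub(r"\s+", " ", text.lower()).strip()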
Example #2
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        save_file_fp16 = "tests/saved-models/test-save-load_fp16"

        config = self.default_config(save_adam_vars=False)
        model = Classifier(**config)
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
        predictions = model.predict(valid_sample.Text)

        # testing file size reduction options
        model.save(save_file)
        self.assertLess(os.stat(save_file).st_size, 500000000)

        # reducing floating point precision
        model.saver.save_dtype = np.float16
        model.save(save_file_fp16)
        self.assertLess(os.stat(save_file_fp16).st_size, 260000000)

        model = Classifier.load(save_file_fp16)
        new_predictions = model.predict(valid_sample.Text)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
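
The tests throughout this page call self.default_config(...), a helper defined on the test base class rather than in these excerpts. A minimal sketch of what it plausibly returns (the specific keys and values here are assumptions chosen for fast test runs, not taken from the source):

def default_config(self, **kwargs):
    # Hypothetical reconstruction of the test-suite helper: a small,
    # fast-to-train configuration with per-test overrides applied last.
    config = dict(batch_size=2, max_length=128, n_epochs=1)
    config.update(kwargs)
    return config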
Example #3
 def test_classifier_auxiliary(self):
     """
     Ensure model training does not error out
     Ensure model returns predictions
     """
     model = Classifier(**self.default_config())
     model.fit(self.trainX, self.trainY, context=self.train_context)
     _ = model.predict(self.trainX, context=self.train_context)
     # test cached predict
     _ = model.predict(self.trainX, context=self.train_context)
Example #4
 def test_fit_predict_batch_size_1(self):
     """
     Ensure training is possible with batch size of 1
     """
     model = Classifier(**self.default_config())
     model.config.batch_size = 1
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text.values, train_sample.Target.values)
     model.predict(valid_sample.Text.values)
Example #5
 def test_classifier_no_auxiliary(self):
     """
     Ensure model training does not error out
     Ensure model returns predictions
     """
     config = self.default_config(use_auxiliary_info=False,
                                  context_dim=None,
                                  val_set=(self.trainX, self.trainY))
     model = Classifier(**config)
     model.fit(self.trainX, self.trainY)
     _ = model.predict(self.trainX)
     # test cached predict
     _ = model.predict(self.trainX)
Example #6
    def test_multiple_models_fit_predict(self):
        """
        Ensure second call to predict is faster than first
        """
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text.values, train_sample.Target.values)
        model.predict(valid_sample.Text.values)

        model2 = Classifier(**self.default_config())
        model2.fit(train_sample.Text.values, train_sample.Target.values)
        model2.predict(valid_sample.Text.values)
Example #7
    def post(self):
        global model
        print("Received POST request on Open311 interface")

        # If the classifier has not been loaded then load it (Classifier.load
        # returns a fully configured model, so no separate construction is needed)
        if model is None:
            model = Classifier.load("/root/combined_model_20181021")

        # Check that the JSON payload has a description field
        some_json = request.get_json()
        print("Received JSON: ", some_json)
        if some_json.get('description') is None:
            return {'service_code': 'unknown'}
        newTextDescription = some_json.get('description')
        print("received: ", newTextDescription)
        prediction = model.predict([newTextDescription])

        # If the input also carried a service_code, replace it and return the
        # original message; otherwise return the service_code alone.
        if some_json.get('service_code') is None:
            # No service code, so just return one
            print("No service code provided, returning one")
            return {'service_code': prediction[0]}
        else:
            print("Service_code was provided so updating it")
            some_json['service_code'] = prediction[0]
            return some_json
Example #8
    def test_fit_lm_only(self):
        """
        Ensure LM only training does not error out
        """
        model = Classifier()
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Ensure model can still be fit with only text
        model.fit(train_sample.Text)

        # Save and reload check
        save_file = 'tests/saved-models/test-save-load'
        model.save(save_file)
        model = Classifier.load(save_file)

        # Ensure model can still be fit with text + targets
        model.fit(train_sample.Text, train_sample.Target)
        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.integer))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #9
    def post(self):
        global model
        print("Received POST request on google interface")

        # If the classifier has not been loaded then load it (Classifier.load
        # returns a fully configured model, so no separate construction is needed)
        if model is None:
            model = Classifier.load("/root/combined_model_20181021")

        # Check that the JSON payload carries query text
        some_json = request.get_json()
        if some_json.get('queryResult') is None:
            print("Empty message text")
            return {'fulfillmentText': 'unknown'}
        queryResult = some_json.get('queryResult')
        if queryResult.get('queryText') is None:
            print("Empty message text")
            return {'fulfillmentText': 'unknown'}
        newTextDescription = queryResult.get('queryText')
        print("received: ", newTextDescription)

        # Predict the classification of the text
        prediction = model.predict([newTextDescription])

        # Return the result
        print("returning: ", prediction[0])
        return {'fulfillmentText': prediction[0]}
Example #10
 def test_save_load(self):
     """
     Ensure saving + loading does not cause errors
     Ensure saving + loading does not change predictions
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(config=self.default_config())
     train_sample = self.dataset.sample(n=self.n_sample)
     valid_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     predictions = model.predict(valid_sample.Text)
     model.save(save_file)
     model = Classifier.load(save_file)
     new_predictions = model.predict(valid_sample.Text)
     for i, prediction in enumerate(predictions):
         self.assertEqual(prediction, new_predictions[i])
Example #11
async def processGoogleActionRequest(request):
    global model
    print("Received POST request on google interface")

    # Check that data was provided
    if request.json is None:
        return json({"result": "No data in request"})
    some_json = request.json
    if some_json.get('queryResult') is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})
    queryResult = some_json.get('queryResult')
    if queryResult.get('queryText') is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})
    newTextDescription = queryResult.get('queryText')
    print("received: ", newTextDescription)
    processedDescription = preProcess(newTextDescription)
    print("pre-processed: ", processedDescription)

    # If the model is not already loaded then load it (Classifier.load
    # returns a fully configured model, so no separate construction is needed)
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    # Predict the classification of the text
    prediction = model.predict([processedDescription])

    # Return the result
    print("returning: ", prediction[0])
    return json({'fulfillmentText': prediction[0]})
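
For context, a client only needs to POST JSON with a description field to exercise these handlers. A minimal sketch using requests (the host, port, and route here are assumptions, since the excerpts do not show how the app is mounted):

import requests

# Hypothetical endpoint URL; the real host and path depend on deployment.
resp = requests.post(
    "http://localhost:8000/open311",
    json={"description": "Large pothole on Main Street near the school"},
)
print(resp.json())  # e.g. {'service_code': ...}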
Example #12
    def test_class_weights(self):
        # testing class weights
        train_sample = self.dataset.sample(n=self.n_sample * 3)
        valid_sample = self.dataset.sample(n=self.n_sample * 3)
        model = Classifier(**self.default_config())
        model.fit(train_sample.Text.values, train_sample.Target.values)
        predictions = model.predict(valid_sample.Text.values)
        recall = recall_score(valid_sample.Target.values, predictions, pos_label=1)
        model = Classifier(**self.default_config(class_weights={1: 100}))
        model.fit(train_sample.Text.values, train_sample.Target.values)
        predictions = model.predict(valid_sample.Text.values)
        new_recall = recall_score(valid_sample.Target.values, predictions, pos_label=1)
        self.assertTrue(new_recall >= recall)

        # test auto-inferred class weights function
        model = Classifier(**self.default_config(class_weights='log'))
        model.fit(train_sample.Text.values, train_sample.Target.values)
Example #13
    def _evaluate(self, session):
        try:
            with tf.Graph().as_default():
                from finetune import Classifier
                model = Classifier(**self._config_to_finetune)

                if self._current_finetune.saver.variables:
                    model.saver.variables = {
                        k: v.copy()
                        for k, v in
                        self._current_finetune.saver.variables.items()
                        if "global_step" not in k and "Adam" not in k
                    }

                model.saver.fallback_ = {
                    k: v
                    for k, v in self._current_finetune.saver.fallback.items()
                    if "global_step" not in k
                }
                train_x, train_y = self.train_data
                model.fit(train_x, train_y)
                test_x, test_y = self.test_data
                test_accuracy = np.mean(model.predict(test_x) == test_y)
                train_accuracy = np.mean(model.predict(train_x) == train_y)
        except IOError:
            traceback.print_exc(file=sys.stdout)
            test_accuracy = -1.0
            train_accuracy = -1.0

        global_step = session.run(tf.train.get_or_create_global_step())
        directory = os.path.join(self._eval_dir, "..", "finetuning")

        if not os.path.exists(directory):
            os.makedirs(directory)
        summary_writer = writer_cache.FileWriterCache.get(directory)
        summary_proto = summary_pb2.Summary()
        summary_proto.value.add(tag="finetuning/{}_train_accurary".format(
            self._name),
                                simple_value=float(train_accuracy))
        summary_proto.value.add(tag="finetuning/{}_test_accurary".format(
            self._name),
                                simple_value=float(test_accuracy))
        summary_writer.add_summary(summary_proto, global_step)
        summary_writer.flush()

        self._timer.update_last_triggered_step(self._iter_count)
Example #14
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        config = self.default_config(save_adam_vars=False, n_epochs=1)
        model = Classifier(**config)

        model.fit(self.trainX, self.trainY, context=self.train_context)
        predictions = model.predict(self.trainX, context=self.train_context)
        model.save(save_file)

        model = Classifier.load(save_file)
        new_predictions = model.predict(self.trainX,
                                        context=self.train_context)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
Example #15
    def test_save_load(self):
        """
        Ensure saving + loading does not cause errors
        Ensure saving + loading does not change predictions
        """
        save_file = "tests/saved-models/test-save-load"
        config = self.default_config(save_adam_vars=False, n_epochs=1)
        model = Classifier(**config)

        (trainX, testX, trainY, _) = self.dataset
        trainY = [random.randint(0, 1) for _ in range(len(trainY))]
        model.fit(trainX, trainY)
        predictions = model.predict(testX)
        model.save(save_file)

        model = Classifier.load(save_file)
        new_predictions = model.predict(testX)
        for i, prediction in enumerate(predictions):
            self.assertEqual(prediction, new_predictions[i])
Example #16
    def test_cached_predict(self):
        """
        Ensure second call to predict is faster than first
        """

        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text.values, train_sample.Target.values)

        with model.cached_predict():
            start = time.time()
            model.predict(valid_sample.Text[:1].values)
            first = time.time()
            model.predict(valid_sample.Text[:1].values)
            second = time.time()

        first_prediction_time = first - start
        second_prediction_time = second - first
        self.assertLess(second_prediction_time, first_prediction_time / 2.0)
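
Beyond this timing assertion, cached_predict() is the idiomatic way to serve many small predict calls against one fitted model, because it keeps the underlying session alive between calls instead of rebuilding it each time. A minimal usage sketch, assuming model has already been fit:

# Keep the session warm across repeated single-item predictions,
# e.g. when classifying requests one at a time in a serving loop.
with model.cached_predict():
    for text in ["first complaint", "second complaint"]:
        print(model.predict([text]))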
Example #17
 def test_auxiliary_classifier(self):
     """
     Ensure model training does not error out
     Ensure model returns predictions
     """
     (trainX, testX, trainY, _) = self.dataset
     trainY = [
         random.randint(0, 1) for _ in range(len(trainY))
     ]  # random labels, just to confirm nothing errors; reasonable-prediction tests live in sequence_label
     model = Classifier(**self.default_config())
     model.fit(trainX, trainY)
     _ = model.predict(testX)
Example #18
 def test_reasonable_predictions(self):
     """
     Ensure model converges to a reasonable solution for a trivial problem
     """
     model = Classifier(config=self.default_config())
     n_per_class = (self.n_sample * 5)
     trX = ['cat'] * n_per_class + ['finance'] * n_per_class
     trY = copy(trX)
     teX = ['feline'] * n_per_class + ['investment'] * n_per_class
     teY = ['cat'] * n_per_class + ['finance'] * n_per_class
     model.fit(trX, trY)
     predY = model.predict(teX)
     self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #19
 def test_reasonable_predictions_smaller_model(self):
     """
     Ensure model converges to a reasonable solution for a trivial problem
     """
     model = Classifier(base_model=GPTModelSmall)
     n_per_class = (self.n_sample * 5)
     trX = ['cat'] * n_per_class + ['finance'] * n_per_class
     np.random.shuffle(trX)
     trY = copy(trX)
     teX = ['feline'] * n_per_class + ['investment'] * n_per_class
     teY = ['cat'] * n_per_class + ['finance'] * n_per_class
     model.fit(trX, trY)
     predY = model.predict(teX)
     self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #20
    def test_explain(self):
        model = Classifier(**self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
        explanations = model.explain(valid_sample.Text)
        normal_predictions = model.predict(valid_sample.Text)
        explanation_preds = [e["prediction"] for e in explanations]

        # check that the process of turning on explain does not change the preds
        self.assertEqual(explanation_preds, list(normal_predictions))
        self.assertEqual(len(explanation_preds), len(valid_sample.Text))
        self.assertEqual(type(explanations[0]["token_ends"]), list)
        self.assertEqual(type(explanations[0]["token_starts"]), list)
        self.assertEqual(type(explanations[0]["explanation"]), dict)
        self.assertEqual(len(explanations[0]["token_starts"]), len(explanations[0]["explanation"][0]))
        self.assertEqual(len(explanations[0]["token_ends"]), len(explanations[0]["explanation"][0]))
Example #21
    def test_chunk_long_sequences(self):
        test_sequence = [
            "This is a sentence to test chunk_long_sequences in classification. " * 20,
            "Another example so now there are two different classes in the test. " * 20,
        ]
        labels = ["a", "b"]
        model = Classifier()
        model.config.chunk_long_sequences = True
        model.config.max_length = 18

        model.finetune(test_sequence * 10, labels * 10)

        predictions = model.predict(test_sequence * 10)
        probas = model.predict_proba(test_sequence * 10)

        self.assertEqual(len(predictions), 20)
        self.assertEqual(len(probas[0]), 2)
        np.testing.assert_almost_equal(np.sum(list(probas[0].values())), 1, decimal=4)
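
The same settings can also be passed at construction time instead of mutating model.config afterwards, since Classifier forwards keyword arguments to its configuration (as other examples on this page do with max_length and verbose). A minimal equivalent sketch:

# Equivalent construction-time configuration.
model = Classifier(chunk_long_sequences=True, max_length=18)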
Example #22
    def test_fit_predict(self):
        """
        Ensure model training does not error out
        Ensure model returns predictions of the right type
        """

        model = Classifier(config=self.default_config())
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)

        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.integer))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #23
    def test_fit_lm_only(self):
        """
        Ensure LM only training does not error out
        """
        model = Classifier()
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Ensure model can still be fit with text + targets
        model.fit(train_sample.Text, train_sample.Target)

        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            self.assertIsInstance(prediction, (int, np.integer))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
Example #24
    def test_reasonable_predictions(self):
        """
        Ensure model converges to a reasonable solution for a trivial problem
        """
        model = Classifier(**self.default_config(n_epochs=5))

        n_duplicates = 5

        trX = (
            ["cat", "kitten", "feline", "meow", "kitty"] * n_duplicates +
            ["finance", "investment", "investing", "dividends", "financial"] *
            n_duplicates)
        trY = (['cat'] * (len(trX) // 2) + ['finance'] * (len(trX) // 2))
        teX = ["furball", "fiduciary"]
        teY = ["cat"] + ["finance"]
        model.fit(trX, trY)
        predY = model.predict(teX)
        print(predY)
        self.assertEqual(accuracy_score(teY, predY), 1.00)
Example #25
class StanfordSentimentTreebank(Dataset):

    def __init__(self, filename=None, **kwargs):
        super().__init__(filename=(filename or DATA_PATH), **kwargs)

    def md5(self):
        return CHECKSUM
        
    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME
        )


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(verbose=True, n_epochs=2, val_size=0.01, val_interval=10, visible_gpus=[], tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(dataset.Text, dataset.Target, test_size=0.3, random_state=42)
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME)


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(debugging_logs=True,
                       interpolate_pos_embed=False,
                       n_epochs=3,
                       batch_size=2,
                       lr_warmup=0.1,
                       max_length=64,
                       base_model=GPTModel)
    trainX, testX, trainY, testY = train_test_split(dataset.Text.values,
                                                    dataset.Target.values,
                                                    test_size=0.3,
                                                    random_state=42)
    model.fit(trainX, trainY)
    preds = model.predict(testX)
    print(preds, testY)
    print(classification_report(testY, preds))
Example #27
model.fit(trainX_res_list, trainY_res_list)  # Finetune base model on custom data
duration = time.time() - start
print("Training Done")
print("It took :" + str(duration) + " seconds")

model.save("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117"
           )  # Serialize the model to disk
print("Model Saved")

print("Starting testing")
# model = Classifier.load("/W210_Gov_Complaints_Portal/models/combined_model_strat_20181117")
print(testX.shape)
print(model)
start = time.time()
predictions = model.predict(testX.tolist())
duration = time.time() - start
print("Predictions done")
print("It took :" + str(duration) + " seconds")

print("Evaluating accuracy")
mainPredictions = [labelsMap[pred] for pred in predictions]

mainTestY = [labelsMap[testLabel] for testLabel in testY.tolist()]

correctMain = 0
countMain = 0
Example #28
    def download(self):
        """
        Download Stanford Sentiment Treebank to data directory
        """
        path = Path(self.filename)
        path.parent.mkdir(parents=True, exist_ok=True)
        generic_download(
            url="https://s3.amazonaws.com/enso-data/SST-binary.csv",
            text_column="Text",
            target_column="Target",
            filename=SST_FILENAME)


if __name__ == "__main__":
    # Train and evaluate on SST
    dataset = StanfordSentimentTreebank(nrows=1000).dataframe
    model = Classifier(verbose=True,
                       n_epochs=2,
                       val_size=0.01,
                       val_interval=10,
                       visible_gpus=[],
                       tensorboard_folder='.tensorboard')
    trainX, testX, trainY, testY = train_test_split(dataset.Text,
                                                    dataset.Target,
                                                    test_size=0.3,
                                                    random_state=42)
    model.fit(trainX, trainY)
    accuracy = np.mean(model.predict(testX) == testY)
    print('Test Accuracy: {:0.2f}'.format(accuracy))
Example #29
"""GPT2imdb.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1_484wco-2YnrTKVJr5wN4qW4RDQIuKZD
"""

import pandas as pd
import finetune

url = 'https://raw.githubusercontent.com/BillGu19/Bass/master/name_genre_identifiers.csv'
name_genre = pd.read_csv(url)
name = name_genre['primaryName']
genre = name_genre['top genre']
#print(name)
#print(genre)
#print(name_genre)

from finetune.base_models import BERT, BERTLarge, GPT2, GPT2Medium, GPT2Large, TextCNN, TCN, RoBERTa, DistilBERT
from finetune import Classifier
from finetune import LanguageModel

#X = ['german shepherd', 'maine coon', 'persian', 'beagle']
#Y = ['dog', 'cat', 'cat', 'dog']
model = Classifier(base_model=GPT2)
model.fit(name, genre)

testX = ['Tom Cruise','Jamie Lee Curtis', 'Claire Danes', 'Geena Davis', 'Robert De Niro', 'John Denver', 'Johnny Depp', 'Leonardo DiCaprio', 'Clint Eastwood']
predictions = model.predict(testX)
print(predictions)
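
A natural follow-up, not part of the original notebook, is to persist the fitted model with the same save/load API exercised in the tests above; the file path and query name below are arbitrary:

# Save the fitted genre classifier and reload it without retraining.
model.save("genre_model")
reloaded = Classifier.load("genre_model")
print(reloaded.predict(["Meryl Streep"]))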