예제 #1
0
    def test_fit_lm_only(self):
        """
        Ensure language-model-only training does not error out.

        The model is first fit on text alone, saved and reloaded, then fit
        again with text + targets. Predictions from the final model must be
        integers and each entry of ``predict_proba`` must be a dict.
        """
        model = Classifier()
        train_sample = self.dataset.sample(n=self.n_sample)
        valid_sample = self.dataset.sample(n=self.n_sample)

        # Ensure model can still be fit with only text
        model.fit(train_sample.Text)

        # Save and reload check
        save_file = 'tests/saved-models/test-save-load'
        model.save(save_file)
        model = Classifier.load(save_file)

        # Ensure model can still be fit with text + targets
        model.fit(train_sample.Text, train_sample.Target)
        predictions = model.predict(valid_sample.Text)
        for prediction in predictions:
            # `np.int` was just an alias of the builtin `int` and no longer
            # exists in NumPy >= 1.24; `np.integer` covers `np.int64` and the
            # other NumPy integer scalar types.
            self.assertIsInstance(prediction, (int, np.integer))

        probabilities = model.predict_proba(valid_sample.Text)
        for proba in probabilities:
            self.assertIsInstance(proba, dict)
async def classifyOpen311Complaint(request):
    """
    Classify one or several Open311 complaint descriptions.

    The JSON body of ``request`` is inspected for:

    * ``descriptions`` -- an iterable of texts; every entry is pre-processed
      and the predictions are returned as a list.
    * ``description`` -- a single text, used only when ``descriptions`` is
      absent; yields a single prediction.

    Returns a JSON response:

    * ``{"result": "No data in request"}`` when the request has no JSON body.
    * ``{'service_code': 'unknown'}`` when neither description field is set.
    * ``{'service_code': <prediction>}`` when the request carries no
      ``service_code`` of its own.
    * otherwise the incoming message with ``service_code`` replaced by the
      prediction (the request's JSON object is updated in place).
    """
    global model

    payload = request.json

    # Check if data provided
    if payload is None:
        # Must be a dict (key: value); a comma here would build a set instead.
        return json({"result": "No data in request"})

    description = payload.get('description')
    descriptions = payload.get('descriptions')

    # Check if we have a 311 'description' field
    if description is None and descriptions is None:
        return json({'service_code': 'unknown'})

    # If the model is not already loaded then load it.
    # Classifier.load supplies the instance that is kept, so no throw-away
    # Classifier is constructed beforehand.
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    if descriptions is not None:
        processedComplaints = [preProcess(text) for text in descriptions]
        prediction = model.predict(processedComplaints).tolist()
    else:
        print("Doing simple prediction")
        prediction = model.predict([preProcess(description)])[0]

    print("Prediction is: ", prediction)

    # If we have a service_code in the incoming request then we assume an Open311 message,
    # so we update the service_code and return the full message.  Otherwise we just send
    # back a new message with the service_code only
    if payload.get('service_code') is None:
        print("No service code provided, returning one")
        return json({'service_code': prediction})

    print("Service_code was provided so updating it")
    payload['service_code'] = prediction
    return json(payload)
예제 #3
0
    def test_save_load(self):
        """
        Round-trip a fitted classifier through disk.

        Verifies that the regular save and the float16 save both stay below
        their size limits, and that a model reloaded from the float16 file
        reproduces the predictions made before saving.
        """
        save_file = "tests/saved-models/test-save-load"
        save_file_fp16 = "tests/saved-models/test-save-load_fp16"

        model = Classifier(**self.default_config(save_adam_vars=False))
        fit_rows = self.dataset.sample(n=self.n_sample)
        eval_rows = self.dataset.sample(n=self.n_sample)
        model.fit(fit_rows.Text, fit_rows.Target)
        expected = model.predict(eval_rows.Text)

        # Size check for the default save
        model.save(save_file)
        self.assertLess(os.path.getsize(save_file), 500000000)

        # Size check after lowering the saved floating point precision
        model.saver.save_dtype = np.float16
        model.save(save_file_fp16)
        self.assertLess(os.path.getsize(save_file_fp16), 260000000)

        # Predictions must survive the reload from the float16 file
        model = Classifier.load(save_file_fp16)
        actual = model.predict(eval_rows.Text)
        for position, value in enumerate(expected):
            self.assertEqual(value, actual[position])
def generate_GPT_feats(model_path, post_level=True):
    """
    Featurize post or sentence texts with a saved Classifier.

    With ``post_level=True`` the rows of ``all_posts_data.csv`` that are
    flagged ``predict_me`` or have a non-null ``label`` are featurized and the
    feature columns get the ``post_lvl-`` prefix. Otherwise every row of
    ``sentences.csv`` is featurized (its ``body`` column is used as the
    text), the columns get the ``sentence_lvl-`` prefix, and the features are
    aggregated per ``post_id`` with mean/max/min before being passed through
    ``flatten_cols``.

    Returns the resulting DataFrame, indexed by ``post_id``.
    """
    if post_level:
        posts = pd.read_csv(PROCESSED_PATH / 'all_posts_data.csv')
        wanted = posts.predict_me | (posts.label.notnull())
        df = posts.loc[wanted, ['post_id', 'cleaned_body']]
    else:
        sentences = pd.read_csv(PROCESSED_PATH / 'sentences.csv')
        df = sentences.rename(columns={'body': 'cleaned_body'})

    model = Classifier.load(model_path)
    features = model.featurize(list(df.cleaned_body.astype(str)))

    # One row per text, one column per feature, indexed by post_id
    embeddings = pd.DataFrame(features)
    embeddings.index = df.post_id

    if post_level:
        return embeddings.add_prefix('post_lvl-')

    # Sentence level: collapse all sentences of a post into summary statistics
    per_sentence = embeddings.add_prefix('sentence_lvl-')
    per_post = per_sentence.groupby('post_id').agg(['mean', 'max', 'min'])
    return flatten_cols(per_post)
예제 #5
0
    def test_save_load_language_model(self):
        """
        Check text generation before fitting, after fitting, and after a
        save + load round trip.

        The start token must be absent from the output of an un-finetuned
        model and present once the model has been fit.
        """
        save_file = "tests/saved-models/test-save-load"
        model = Classifier()

        # Non finetuned models do not use extra tokens
        lm_out = model.generate_text("The quick brown fox", 6)
        encoder = model.input_pipeline.text_encoder
        start_token = encoder.decoder[encoder.start_token]
        self.assertNotIn(start_token, lm_out)

        # After finetuning the start token shows up in generated text
        train_sample = self.dataset.sample(n=self.n_sample)
        model.fit(train_sample.Text, train_sample.Target)
        lm_out = model.generate_text("", 5)
        self.assertIn(start_token, lm_out.lower())
        self.assertIs(type(lm_out), str)
        model.save(save_file)

        # The reloaded model also uses the extra tokens
        model = Classifier.load(save_file)
        lm_out_2 = model.generate_text("Indico RULE")
        self.assertIs(type(lm_out_2), str)

        expected_prefix = "{}Indico RULE".format(start_token).lower()
        self.assertIn(expected_prefix, lm_out_2.lower())
예제 #6
0
	def post(self):
		"""
		Handle a POST on the google interface and classify its query text.

		Reads ``queryResult.queryText`` from the JSON request body, runs it
		through the classifier and returns the first prediction as
		``fulfillmentText``. If the body, ``queryResult`` or ``queryText`` is
		missing, ``{'fulfillmentText': 'unknown'}`` is returned instead.
		"""
		global model
		print("Received POST request on google interface")

		# if the classifier has not been loaded then load it.
		# Classifier.load supplies the instance that is kept, so no
		# throw-away Classifier is constructed beforehand.
		if model is None:
			model = Classifier.load("/root/combined_model_20181021")

		# check if the JSON description has been filled in
		some_json = request.get_json()
		# NOTE(review): assumes `request` is Flask's request object, whose
		# get_json() may return None when there is no JSON body; treat that
		# like an empty message rather than failing on `.get`.
		queryResult = None if some_json is None else some_json.get('queryResult')
		if queryResult is None:
			print("Empty message text")
			return {'fulfillmentText': 'unknown'}

		newTextDescription = queryResult.get('queryText')
		if newTextDescription is None:
			print("Empty message text")
			return {'fulfillmentText': 'unknown'}
		print("received: ", newTextDescription)

		# Predict the classification of the text
		prediction = model.predict([newTextDescription])

		# Return the result
		print("returning: ", prediction[0])
		return {'fulfillmentText': prediction[0]}
예제 #7
0
	def post(self):
		"""
		Handle a POST on the Open311 interface and classify its description.

		Reads ``description`` from the JSON request body and predicts a
		service code for it. If the incoming message has no ``service_code``
		only ``{'service_code': <prediction>}`` is returned; otherwise the
		incoming message is returned with its ``service_code`` replaced.
		If the body or ``description`` is missing,
		``{'service_code': 'unknown'}`` is returned.
		"""
		global model
		print("Received POST request on Open311 interface")

		# if the classifier has not been loaded then load it.
		# Classifier.load supplies the instance that is kept, so no
		# throw-away Classifier is constructed beforehand.
		if model is None:
			model = Classifier.load("/root/combined_model_20181021")

		# check if the JSON description has been filled in
		some_json = request.get_json()
		print("Received JSON: ", some_json)
		# NOTE(review): assumes `request` is Flask's request object, whose
		# get_json() may return None when there is no JSON body; answer
		# 'unknown' rather than failing on `.get`.
		if some_json is None:
			return {'service_code': 'unknown'}

		newTextDescription = some_json.get('description')
		if newTextDescription is None:
			return {'service_code': 'unknown'}
		print("received: ", newTextDescription)
		prediction = model.predict([newTextDescription])

		# check if the input data also contained the service code and if so replace it
		# and return the original message
		if some_json.get('service_code') is None:
			# No service code so just return that
			print("No service code provided, returning one")
			return {'service_code': prediction[0]}

		print("Service_code was provided so updating it")
		some_json['service_code'] = prediction[0]
		return some_json
예제 #8
0
async def processGoogleActionRequest(request):
    """
    Classify the query text of a Google action request.

    Reads ``queryResult.queryText`` from the JSON body of ``request``,
    pre-processes it, runs it through the classifier and returns the first
    prediction as ``fulfillmentText`` in a JSON response.

    Returns ``{"result": "No data in request"}`` when the request has no JSON
    body, and ``{'fulfillmentText': 'unknown'}`` when ``queryResult`` or
    ``queryText`` is missing.
    """
    global model
    print("Received POST request on google interface")

    # Check if data provided
    some_json = request.json
    if some_json is None:
        # Must be a dict (key: value); a comma here would build a set instead.
        return json({"result": "No data in request"})

    queryResult = some_json.get('queryResult')
    if queryResult is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})

    newTextDescription = queryResult.get('queryText')
    if newTextDescription is None:
        print("Empty message text")
        return json({'fulfillmentText': 'unknown'})
    print("received: ", newTextDescription)
    processedDescription = preProcess(newTextDescription)
    print("pre-processed: ", processedDescription)

    # If the model is not already loaded then load it.
    # Classifier.load supplies the instance that is kept, so no throw-away
    # Classifier is constructed beforehand.
    if model is None:
        model = Classifier.load("/root/combined_model_20181021")

    # Predict the classification of the text
    prediction = model.predict([processedDescription])

    # Return the result
    print("returning: ", prediction[0])
    return json({'fulfillmentText': prediction[0]})
예제 #9
0
    def setUpClass(cls):
        """
        Prepare the shared fixtures: datasets plus trained (or cached) models.

        Each model is first loaded from its configured path; only when that
        raises ``FileNotFoundError`` is it trained from scratch and saved.
        The comparison regressor and its train/test split are set up only
        when ``cls.do_comparison`` is truthy.
        """
        cls._download_data()

        # Dataset preparation
        cls.classifier_dataset = pd.read_csv(
            cls.classifier_dataset_path, nrows=cls.n_sample * 10
        )

        testdata_file = os.path.join(os.path.dirname(__file__), "data", "testdata.json")
        with open(testdata_file, 'rt') as handle:
            cls.texts, cls.labels = json.load(handle)

        cls.animals = [
            "dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken",
            "guinea pig", "donkey", "turkey", "duck", "camel", "goose",
            "llama", "rabbit", "fox",
        ]
        cls.numbers = [
            "one", "two", "three", "four", "five", "six", "seven", "eight",
            "nine", "ten", "eleven", "twelve", "thirteen", "fourteen",
            "fifteen", "sixteen",
        ]

        # Sequence labeler: reuse the saved one, else train and save it
        try:
            cls.s = SequenceLabeler.load(cls.sequence_labeler_path, **cls.default_seq_config(cls))
        except FileNotFoundError:
            cls.s = SequenceLabeler(**cls.default_seq_config(cls))
            cls.s.fit(cls.texts * 10, cls.labels * 10)
            cls.s.save(cls.sequence_labeler_path)

        # Classifier: reuse the saved one, else train and save it
        train_sample = cls.classifier_dataset.sample(n=cls.n_sample * 10)
        try:
            cls.cl = Classifier.load(cls.classifier_path)
        except FileNotFoundError:
            cls.cl = Classifier(**cls.default_config(cls))
            cls.cl.fit(train_sample.Text, train_sample.Target)
            cls.cl.save(cls.classifier_path)

        if not cls.do_comparison:
            return

        # Comparison regressor: build the pair data, then load or train it
        cls.cr = ComparisonRegressor()

        n_per = 150
        # Pairs drawn from a single word list count as "similar" ...
        similar = [
            [random.choice(words), random.choice(words)]
            for words in [cls.animals, cls.numbers]
            for _ in range(n_per // 2)
        ]
        # ... and pairs mixing both word lists count as "different".
        different = [
            [random.choice(cls.animals), random.choice(cls.numbers)]
            for _ in range(n_per)
        ]

        targets = np.asarray([1] * len(similar) + [0] * len(different))
        data = similar + different

        cls.x_tr, cls.x_te, cls.t_tr, cls.t_te = train_test_split(
            data, targets, test_size=0.3, random_state=42
        )

        try:
            cls.cr = ComparisonRegressor.load(cls.comparison_regressor_path, **cls.default_config(cls))
        except FileNotFoundError:
            cls.cr = ComparisonRegressor(**cls.default_config(cls))
            cls.cr.fit(cls.x_tr, cls.t_tr)
            cls.cr.save(cls.comparison_regressor_path)
예제 #10
0
 def test_save_load_language_model(self):
     """
     Ensure saving + loading does not cause errors.
     Ensure text can be generated both before saving and after reloading,
     and that text generated from a prompt starts with the start token
     followed by that prompt.
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(verbose=False)
     train_sample = self.dataset.sample(n=self.n_sample)
     model.fit(train_sample.Text, train_sample.Target)
     lm_out = model.generate_text("", 5)
     self.assertEqual(type(lm_out), str)
     model.save(save_file)
     model = Classifier.load(save_file)
     lm_out_2 = model.generate_text("Indico RULE")
     self.assertEqual(type(lm_out_2), str)
     # Lower-case BOTH sides: with only the expected text lower-cased the
     # check would depend on the casing of the generated text.
     self.assertIn('_start_Indico RULE'.lower(), lm_out_2.lower())
예제 #11
0
 def test_save_load(self):
     """
     Fit a classifier, save it, reload it, and check that the reloaded
     model reproduces the predictions made before saving.
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier(config=self.default_config())

     fit_rows = self.dataset.sample(n=self.n_sample)
     eval_rows = self.dataset.sample(n=self.n_sample)
     model.fit(fit_rows.Text, fit_rows.Target)
     before = model.predict(eval_rows.Text)

     # Round trip through disk
     model.save(save_file)
     model = Classifier.load(save_file)

     after = model.predict(eval_rows.Text)
     for position, value in enumerate(before):
         self.assertEqual(value, after[position])
예제 #12
0
 def test_save_load_language_model(self):
     """
     Check that a fitted model generates text before saving and after
     reloading, and that text generated from a prompt contains the start
     token followed by that prompt (compared case-insensitively).
     """
     save_file = 'tests/saved-models/test-save-load'
     model = Classifier()

     rows = self.dataset.sample(n=self.n_sample)
     model.fit(rows.Text, rows.Target)
     lm_out = model.generate_text("", 5)
     self.assertIs(type(lm_out), str)

     # Round trip through disk
     model.save(save_file)
     model = Classifier.load(save_file)

     lm_out_2 = model.generate_text("Indico RULE")
     self.assertIs(type(lm_out_2), str)

     # Look up the textual form of the encoder's start token
     encoder = model.input_pipeline.text_encoder
     start_token = encoder.decoder[encoder.start]
     expected = '{}Indico RULE'.format(start_token).lower()
     self.assertIn(expected, lm_out_2.lower())
예제 #13
0
    def test_save_load(self):
        """
        Fit a classifier with context, save it, reload it, and check that the
        reloaded model reproduces the predictions made before saving.
        """
        save_file = "tests/saved-models/test-save-load"
        model = Classifier(**self.default_config(save_adam_vars=False, n_epochs=1))

        model.fit(self.trainX, self.trainY, context=self.train_context)
        before = model.predict(self.trainX, context=self.train_context)

        # Round trip through disk
        model.save(save_file)
        model = Classifier.load(save_file)

        after = model.predict(self.trainX, context=self.train_context)
        for position, value in enumerate(before):
            self.assertEqual(value, after[position])
예제 #14
0
    def test_save_load(self):
        """
        Fit a classifier on randomly drawn binary targets, save it, reload
        it, and check that the reloaded model reproduces the predictions made
        before saving.
        """
        save_file = "tests/saved-models/test-save-load"
        model = Classifier(**self.default_config(save_adam_vars=False, n_epochs=1))

        trainX, testX, trainY, _ = self.dataset
        # Replace the dataset's targets by random 0/1 labels of the same length
        n_targets = len(trainY)
        trainY = [random.randint(0, 1) for _ in range(n_targets)]
        model.fit(trainX, trainY)
        before = model.predict(testX)

        # Round trip through disk
        model.save(save_file)
        model = Classifier.load(save_file)

        after = model.predict(testX)
        for position, value in enumerate(before):
            self.assertEqual(value, after[position])