Exemplo n.º 1
0
    def train_and_score_models(self, models=None):
        """Train each candidate model, print train/test classification
        reports, and persist the fitted artifacts via ModelStorage.

        Params:
            models (dict or None): mapping of model_name -> unfitted
                sklearn-style estimator. Defaults to a logistic regression
                and a multinomial naive bayes classifier.
        """
        # In development reuse a fixed id so each run overwrites the same
        # model files; otherwise stamp the job with the current timestamp.
        job_id = ("dev" if APP_ENV == "development" else
                  datetime.now().strftime("%Y-%m-%d-%H%M")
                  )  # overwrite same model in development
        models = models or {
            "logistic_regression": LogisticRegression(random_state=99),
            "multinomial_nb": MultinomialNB()
        }
        # Iterate name/estimator pairs directly instead of keys + re-lookup.
        for model_name, model in models.items():
            print("--------------------------")
            print("MODEL:")
            print(model)

            print("TRAINING...")
            model.fit(self.matrix_train, self.y_train)

            print("TRAINING SCORES:")
            y_pred_train = model.predict(self.matrix_train)
            scores_train = classification_report(self.y_train,
                                                 y_pred_train,
                                                 output_dict=True)
            print("ACCY:", scores_train["accuracy"])
            pprint(scores_train)

            print("TEST SCORES:")
            y_pred_test = model.predict(self.matrix_test)
            scores_test = classification_report(self.y_test,
                                                y_pred_test,
                                                output_dict=True)
            print("ACCY:", scores_test["accuracy"])
            pprint(scores_test)

            print("SAVING ...")
            storage = ModelStorage(
                dirpath=f"nlp_v2/models/{job_id}/{model_name}")
            storage.save_vectorizer(self.tv)
            storage.save_model(model)
            # NOTE(review): get_feature_names() was removed in sklearn 1.2
            # in favor of get_feature_names_out() -- confirm pinned version.
            storage.save_scores({
                "model_name": model_name,
                "job_id": job_id,
                "features": len(self.tv.get_feature_names()),
                "label_maker": self.label_maker.__name__,
                "matrix_train": self.matrix_train.shape,
                "matrix_test": self.matrix_test.shape,
                "scores_train": scores_train,
                "scores_test": scores_test
            })
Exemplo n.º 2
0
import os

from app.nlp.model_storage import ModelStorage

# Directory of the stored model to promote (overridable via env var).
MODEL_DIRPATH = os.getenv(
    "MODEL_DIRPATH",
    default="tweet_classifier/models/logistic_regression/2020-09-08-1229")

if __name__ == "__main__":
    # Promote the model artifacts found at MODEL_DIRPATH.
    ModelStorage(dirpath=MODEL_DIRPATH).promote_model()
Exemplo n.º 3
0
                                                training_predictions,
                                                output_dict=True)
        print("ACCY:", training_scores["accuracy"])
        pprint(training_scores)

        print("TEST SCORES...")
        test_predictions = model.predict(test_matrix)
        test_scores = classification_report(test_labels,
                                            test_predictions,
                                            output_dict=True)
        print("ACCY:", test_scores["accuracy"])
        pprint(test_scores)

        print("SAVING MODEL FILES...")
        model_id = ("dev" if APP_ENV == "development" else
                    datetime.now().strftime("%Y-%m-%d-%H%M")
                    )  # overwrite same model in development
        storage = ModelStorage(
            dirpath=f"{MODELS_DIRPATH}/{model_name}/{model_id}")
        storage.save_vectorizer(tv)
        storage.save_model(model)
        storage.save_scores({
            "model_name": model_name,
            "model_id": model_id,
            "features": len(tv.get_feature_names()),
            "training_matrix": training_matrix.shape,
            "test_matrix": test_matrix.shape,
            "training_scores": training_scores,
            "test_scores": test_scores
        })
Exemplo n.º 4
0
import os

from app import seek_confirmation
from app.job import Job
from app.bq_service import BigQueryService
from app.nlp.model_storage import ModelStorage, MODELS_DIRPATH

MODEL_NAME = os.getenv("MODEL_NAME", default="current_best")  # which stored model directory to load

LIMIT = os.getenv("LIMIT")  # optional fetch cap; string or None -- presumably parsed downstream, confirm
BATCH_SIZE = int(os.getenv("BATCH_SIZE", default="100000"))  # rows accumulated per save batch

if __name__ == "__main__":

    storage = ModelStorage(dirpath=f"{MODELS_DIRPATH}/{MODEL_NAME}")
    tv = storage.load_vectorizer()
    clf = storage.load_model()

    bq_service = BigQueryService()

    print("DESTROYING PREDICTIONS TABLE???")
    seek_confirmation()
    print("DESTROYING PREDICTIONS TABLE...")
    bq_service.destructively_migrate_2_community_predictions_table()

    job = Job()
    job.start()

    ids_batch = []
    statuses_batch = []
    for row in bq_service.fetch_unlabeled_statuses_in_batches(limit=LIMIT):
Exemplo n.º 5
0
import os

from app.nlp.model_storage import ModelStorage

# Source and destination model directories (overridable via env vars).
SOURCE = os.getenv("SOURCE", default="nlp_v2/models/dev/multinomial_nb")
DESTINATION = os.getenv("DESTINATION",
                        default="nlp_v2/models/best/multinomial_nb")

if __name__ == "__main__":
    # Promote the model stored at SOURCE into DESTINATION.
    ModelStorage(dirpath=SOURCE).promote_model(destination=DESTINATION)
Exemplo n.º 6
0
import os

from app.nlp.model_storage import ModelStorage, BEST_MODEL_DIRPATH

if __name__ == "__main__":

    storage = ModelStorage(dirpath=BEST_MODEL_DIRPATH)

    # Load the fitted vectorizer and classifier from storage.
    tv = storage.load_vectorizer()
    print(type(tv))
    print("FEATURES / TOKENS:", len(tv.get_feature_names()))  #> 3842

    clf = storage.load_model()
    print(type(clf))

    # Interactive prediction loop: an empty line exits.
    while True:
        status_text = input("Status Text: ")
        if status_text:
            matrix = tv.transform([status_text])
            result = clf.predict(matrix)
            print("PREDICTED COMMUNITY ID:", result[0])
        else:
            print("THANKS! COME AGAIN!")
            break
Exemplo n.º 7
0
            if job.counter % BATCH_SIZE == 0:
                save_batch(batch)
                batch = []
                job.progress_report()

        if len(batch) > 0:
            save_batch(batch)
            batch = []
        job.end()

    seek_confirmation()
    #exit()

    for model_name in ["logistic_regression", "multinomial_nb"]:

        storage = ModelStorage(dirpath=f"nlp_v2/models/best/{model_name}")
        tv = storage.load_vectorizer()
        clf = storage.load_model()

        print(f"DESTROY PREDICTIONS TABLE? ({model_name})")
        seek_confirmation()
        bq_service.nlp_v2_destructively_migrate_predictions_table(model_name)
        predictions_table = bq_service.nlp_v2_get_predictions_table(model_name) # API call. cache it here once.

        job.start()

        for chunk_df in read_csv(CSV_FILEPATH, chunksize=BATCH_SIZE): # FYI: this will include the last chunk even if it is not a full batch
            status_ids = chunk_df["status_id"].tolist()
            status_texts = chunk_df["status_text"].tolist()

            preds = clf.predict(tv.transform(status_texts))