class DecisionTree:
    """Train a ``DecisionTreeClassifier`` and report its test-set metrics."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.helper = Helper()
        self.metrics = Metrics()

    @staticmethod
    def train_model(x_train, y_train):
        """Fit a default-configured ``DecisionTreeClassifier`` and return it."""
        classifier = DecisionTreeClassifier()
        classifier.fit(x_train, y_train)
        return classifier

    @staticmethod
    def test_model(model, x_test):
        """Return the predictions of ``model`` for ``x_test``."""
        predictions = model.predict(x_test)
        return predictions

    def main(self, x_train, x_test, y_train, y_test):
        """Train on the training split and evaluate on the test split.

        The confusion matrix is handed to ``Helper.plot_save_cnf_matrix``
        together with the configured ``image_path``.

        :return: dict with the fitted ``model`` and a ``metrics`` dict holding
            ``accuracy``, ``classification_report`` and ``confusion_matrix``.
        """
        output_dir = self.config["image_path"]
        name = self.__class__.__name__

        fitted = self.train_model(x_train, y_train)
        self.log.info("{} Model performance on test data".format(name))
        predicted = self.test_model(fitted, x_test)

        accuracy, report, matrix = self.metrics.metrics(
            y_true=y_test, y_predicted=predicted)
        self.helper.plot_save_cnf_matrix(matrix,
                                         flag="test",
                                         model_name=name,
                                         image_path=output_dir)

        scores = {
            "accuracy": accuracy,
            "classification_report": report,
            "confusion_matrix": matrix,
        }
        return {"model": fitted, "metrics": scores}
class UpdateProfile:
    """Record a user's request details in the users-history collection.

    The stored fields are source, destination, time, date, number of seats
    requested, phone number, email and preferences.
    """

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.db_utils = DBUtils()

    def get_client(self):
        """Create a database client from the ``mongo`` section of the config."""
        mongo_cfg = self.config['mongo']
        return self.db_utils.get_client(
            address=mongo_cfg['address'],
            port=mongo_cfg['port'],
            username=mongo_cfg['username'],
            password=mongo_cfg['password'],
            auth_db=mongo_cfg['auth_db'],
            is_auth_enabled=mongo_cfg['is_auth_enabled'])

    @staticmethod
    def _build_document(source, destination, time, date, num_seats_req,
                        phone_num, email, preferences):
        """Return the profile fields as a plain dict (one document)."""
        return {
            'source': source,
            'destination': destination,
            'time': time,
            'date': date,
            'num_seats_req': num_seats_req,
            'phone_num': phone_num,
            'email': email,
            'preferences': preferences,
        }

    @staticmethod
    def insert_query(source, destination, time, date, num_seats_req,
                     phone_num, email, preferences):
        """Return the profile fields serialised as a JSON string.

        Kept for backward compatibility with existing callers; ``update``
        itself inserts the dict form, not this string.
        """
        return json.dumps(
            UpdateProfile._build_document(source=source,
                                          destination=destination,
                                          time=time,
                                          date=date,
                                          num_seats_req=num_seats_req,
                                          phone_num=phone_num,
                                          email=email,
                                          preferences=preferences))

    def update(self, source, destination, time, date, num_seats_req,
               phone_num, email, preferences):
        """Insert one history document for the given user.

        Failures raised by the insert are logged, not propagated.
        """
        client = self.get_client()
        users_database_name = self.config['mongo']['users_database']
        users_hist_collection_name = self.config['mongo'][
            'users_hist_collection_name']
        users_hist_collection = client[users_database_name][
            users_hist_collection_name]

        # Insert a mapping, not a JSON string: a string is not a valid
        # document, so the previous code always ended up in the except branch.
        document = self._build_document(source=source,
                                        destination=destination,
                                        time=time,
                                        date=date,
                                        num_seats_req=num_seats_req,
                                        phone_num=phone_num,
                                        email=email,
                                        preferences=preferences)
        try:
            # NOTE(review): assuming this is a PyMongo collection, `insert` is
            # the legacy API (removed in PyMongo 4). It is kept because the
            # installed driver version is not visible from this file.
            users_hist_collection.insert(document)
            self.log.info(
                "Updated profile for user with email : {}".format(email))
        except Exception as e:
            self.log.error("Error : {}".format(e))
class Metrics:
    """Compute and log classification metrics for a set of predictions."""

    def __init__(self):
        self.config = ConfigUtil.get_config_instance()
        self.log = LoggerUtil(self.__class__.__name__).get()

    def metrics(self, y_true, y_predicted):
        """Return ``(accuracy, classification_report, confusion_matrix)``.

        The accuracy score and the classification report are also written to
        the log at info level.
        """
        # All three scorers take the same pair of keyword arguments.
        pair = dict(y_true=y_true, y_pred=y_predicted)

        report = classification_report(**pair)
        accuracy = accuracy_score(**pair)
        self.log.info("Accuracy Score : {}".format(accuracy))
        self.log.info("Classification Report: \n{}".format(report))
        matrix = confusion_matrix(**pair)
        return accuracy, report, matrix
class UserRegistration:
    """Add a newly registered user to the configured users collection."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.db_utils = DBUtils()

    def get_client(self):
        """Create a database client from the ``mongo`` section of the config."""
        wanted = ('address', 'port', 'auth_db', 'is_auth_enabled', 'username',
                  'password')
        params = {key: self.config['mongo'][key] for key in wanted}
        return self.db_utils.get_client(**params)

    @staticmethod
    def insert_query(name, email, contact, password):
        """Return the user document; ``auth`` is always set to ``True``."""
        # NOTE(review): the password is stored exactly as received - confirm
        # that hashing happens before this point.
        return {
            'name': name,
            'email': email,
            'contact': contact,
            'password': password,
            'auth': True,
        }

    def add_user(self, **kwargs):
        """Insert one user built from ``name``, ``email``, ``contact`` and
        ``password`` keyword arguments.

        A missing keyword raises ``KeyError``; failures raised by the insert
        are logged, not propagated.
        """
        name, email, contact, password = (
            kwargs[field]
            for field in ('name', 'email', 'contact', 'password'))

        client = self.get_client()
        db_name = self.config['mongo']['users_database']
        collection_name = self.config['mongo']['add_users_collection']
        collection = client[db_name][collection_name]

        document = self.insert_query(name, email, contact, password)
        try:
            collection.insert(document)
            self.log.info("Added user with username : {}".format(email))
        except Exception as e:
            self.log.error("Error : {}".format(e))
class TestModel:
    """Run the trained models on a typed sentence or on the tagged data set."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.model = LinguisticModel()
        self.read_data = ReadData()
        self.metrics = Metrics()
        self.helper = Helper()
        self.logic = Logic()

    def check_if_trained(self):
        """Load the persisted vectorizer and MLP model from ``models_path``.

        :return: ``(vectorizer, model)``
        """
        models_path = self.config["models_path"]
        # NOTE(review): joblib.load unpickles its input; only point
        # ``models_path`` at files this project produced.
        model = joblib.load(models_path + "/" + "mlp.mdl")
        vectorizer = joblib.load(models_path + "/" + "vectorizer.mdl")
        return vectorizer, model

    def main(self, test=False):
        """Entry point.

        :param test: when true, read one sentence from stdin and log the
            rule-based and ML predictions for it; otherwise train the models
            and evaluate the selected one on the tagged data set.
        """
        if test:
            self._predict_interactive()
        else:
            self._evaluate_on_tagged_data()

    def _predict_interactive(self):
        """Classify a single sentence read from stdin with the saved models."""
        nlp = load("en_core_web_sm")
        vectorizer, model = self.check_if_trained()
        self.log.info("Please enter the sentence")
        sentence = str(input())
        tokens = self.read_data.transform_sentence(sentence)
        # NOTE(review): ``tokens`` is a flat list of strings, so each token is
        # handed to the vectorizer as its own document, whereas the evaluation
        # branch passes one token list per document - confirm whether
        # ``[tokens]`` was intended.
        features = vectorizer.transform(tokens)
        predictions = model.predict(features)
        ling_pred = self.logic.apply_rules(text_tokens=tokens, nlp=nlp)
        self.log.info("Given sentence : {}".format(sentence))
        self.log.info(
            "Prediction of Linguistic Model : {}".format(ling_pred))
        self.log.info("Prediction of ML Model : {}".format(
            any(predictions)))
        self.log.info("Final Prediction : {}".format(ling_pred
                                                     or any(predictions)))

    def _evaluate_on_tagged_data(self):
        """Train the models, then score the returned one on the tagged data."""
        model, vectorizer = self.model.main()
        tagged_data_df = self.read_data.prepare_tagged_data()
        features = vectorizer.transform(tagged_data_df["data"])
        labels = tagged_data_df["labels"]
        predictions = model.predict(features)
        acc_score, cr_report, cnf_matrix = self.metrics.metrics(
            y_true=labels, y_predicted=predictions)
        # The output directory comes from the config (as in the per-model
        # classes) rather than from a developer-specific absolute path.
        self.helper.plot_save_cnf_matrix(
            cnf_matrix=cnf_matrix,
            model_name="satwik",
            flag="test",
            image_path=self.config["image_path"])
class ReadData:
    """Load the e-mail corpus and the pre-tagged sentences as token data."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.process_data = ProcessData()
        self.helper = Helper()
        self.ps = PorterStemmer()
        # Base stop words plus a few corpus-specific tokens.
        self.stop_words = set.union(STOP_WORDS, {
            'ect', 'hou', 'com', 'recipient', 'na', 'ou', 'cn', 'enron',
            'zdnet'
        })

    def prepare_data(self, n_rows):
        """Read about ``n_rows`` rows of ``emails.csv`` in chunks and clean them.

        ``int(n_rows / chunksize)`` chunks are read, so a remainder smaller
        than one chunk is dropped.

        :return: the processed chunks concatenated into one frame.
        """
        data_path = self.config["data_path"]
        chunksize = self.config["chunksize"]
        n_partitions = self.config["n_partitions"]
        df_list = []
        chunk = read_csv(data_path + "/" + "emails.csv", chunksize=chunksize)
        for i in range(int(n_rows / chunksize)):
            self.log.info("Iter : {}".format(i))
            df = next(chunk)
            df = self.process_data.get_sub_message(df)
            # The return value is not used; presumably clean_df works on
            # ``df`` in place - TODO confirm against ProcessData.
            self.process_data.clean_df(df, n_partitions)
            df_list.append(df)
            self.log.info("Label Stats : {}".format(Counter(df["labels"])))
        return concat(df_list)

    def prepare_tagged_data(self):
        """Load ``actions.csv`` and tokenise every sentence in column 0.

        Each sentence goes through ``transform_sentence``, so stop words are
        filtered per token. The old inline filter compared the whole sentence
        against the stop-word set and so never removed anything.

        :return: frame with a ``data`` column of token lists and an all-True
            boolean ``labels`` column.
        """
        tagged_path = self.config["tagged_path"]
        tagged_data = read_csv(tagged_path + "/" + "actions.csv",
                               header=None)
        tokens = tagged_data[0].apply(self.transform_sentence)
        return DataFrame.from_dict({
            "data": tokens.values,
            "labels": ones(len(tagged_data), dtype=bool)
        })

    def transform_sentence(self, sentence):
        """Split on whitespace, lower-case each token and drop stop words."""
        normalised = (item.lower().strip() for item in sentence.split())
        return [token for token in normalised if token not in self.stop_words]
class CheckPreTaggedData:
    """Apply the rule-based logic to the pre-tagged sentences and log counts."""

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.logic = Logic()

    def main(self):
        """Tokenise ``actions.csv``, label each row with ``Logic.apply_rules``
        and log how often each label value occurs."""
        nlp = load("en_core_web_sm")
        extra_words = {
            'ect', 'hou', 'com', 'recipient', 'na', 'ou', 'cn', 'enron',
            'zdnet'
        }
        stop_words = set.union(STOP_WORDS, extra_words)

        def tokenize(sentence):
            # Lower-case and strip each whitespace-separated token, keeping
            # only those that are not stop words.
            kept = []
            for raw in sentence.split():
                word = raw.lower().strip()
                if word not in stop_words:
                    kept.append(word)
            return kept

        def label(tokens):
            return self.logic.apply_rules(tokens, nlp)

        csv_path = self.config["tagged_path"] + "/" + "actions.csv"
        tagged_df = read_csv(csv_path, header=None)
        tagged_df[0] = tagged_df[0].apply(tokenize)
        tagged_df["labels"] = tagged_df[0].apply(label)

        counts = tagged_df.labels.value_counts()
        self.log.info(
            "Values detected by Model in Pre-Tagged Sentences : {}".format(
                counts))
class LinguisticModel:
    """Train several classifiers on the e-mail corpus and pick the best one."""

    # Names under which models are persisted by ``save_models``.
    _STANDARD_MODELS = ("naive_bayes", "logistic_regression", "decision_tree",
                        "random_forest", "extra_tree", "adaboost", "mlp")

    def __init__(self):
        self.log = LoggerUtil(self.__class__.__name__).get()
        self.config = ConfigUtil.get_config_instance()
        self.read_data = ReadData()
        self.bayes = NaiveBayes()
        self.lr = LRModel()
        self.dt = DecisionTree()
        self.rf = RandomForest()
        self.et = ExtraTree()
        self.adaboost = Adaboost()
        self.mlp = MLP()

    @staticmethod
    def split_data(df, labels):
        """Return a shuffled, label-stratified train/test split.

        :return: ``(x_train, x_test, y_train, y_test)``
        """
        x_train, x_test, y_train, y_test = train_test_split(df,
                                                            labels,
                                                            stratify=labels,
                                                            shuffle=True)
        return x_train, x_test, y_train, y_test

    @staticmethod
    def custom_word_tokenizer(tokens):
        """Identity tokenizer: the documents are already token sequences."""
        return tokens

    def vectorize_data(self, data):
        """Fit a TF-IDF vectorizer on ``data`` and return it.

        The vectorizer uses the identity tokenizer, sub-linear TF scaling, no
        lower-casing and at most 2500 features.
        """
        vectorizer = TfidfVectorizer(tokenizer=self.custom_word_tokenizer,
                                     sublinear_tf=True,
                                     analyzer='word',
                                     lowercase=False,
                                     max_features=2500)
        vectorizer.fit(data)
        return vectorizer

    @staticmethod
    def get_tagged_dataset(df, n=1250):
        """Return up to ``n`` messages whose label is False, with labels.

        The label array has exactly as many entries as messages returned, so
        the two stay aligned even when fewer than ``n`` rows qualify.

        :return: ``(messages, labels)`` where ``labels`` is all False.
        """
        messages = df[~df["labels"]]["message"].head(n=n)
        return messages, zeros(len(messages), dtype=bool)

    def train_models(self, x_train_features, x_test_features, y_train,
                     y_test):
        """Train and evaluate every model on the same split.

        :return: dict mapping model name to whatever that model's ``main``
            returns.
        """
        trainers = {
            "naive_bayes": self.bayes,
            "logistic_regression": self.lr,
            "decision_tree": self.dt,
            "random_forest": self.rf,
            "extra_tree": self.et,
            "adaboost": self.adaboost,
            "mlp": self.mlp,
        }
        return {
            name: trainer.main(x_train=x_train_features,
                               y_train=y_train,
                               x_test=x_test_features,
                               y_test=y_test)
            for name, trainer in trainers.items()
        }

    def save_models(self, model_dict, models_path, vectorizer):
        """Persist the vectorizer and every standard model under
        ``models_path``; unknown model names are logged and skipped."""
        joblib.dump(vectorizer, models_path + "/" + "vectorizer.mdl")
        for model_name, stat_dict in model_dict.items():
            if model_name in self._STANDARD_MODELS:
                joblib.dump(stat_dict["model"],
                            models_path + "/" + model_name + ".mdl")
            else:
                self.log.error("Non-Standard Model referenced")

    def get_best_model(self, model_dict):
        """Return the model with the highest ``metrics["accuracy"]``.

        Entries without a ``metrics`` key are ignored; on ties the first
        entry wins.

        :raises ValueError: if no entry carries metrics.
        """
        best_model_name = None
        best_acc = float("-inf")
        for model_name, model_stat in model_dict.items():
            metrics = model_stat.get("metrics")
            if metrics is None:
                continue
            accuracy = metrics["accuracy"]
            if accuracy > best_acc:
                # Track the running maximum; without this update the last
                # model with a non-zero accuracy was always chosen.
                best_model_name, best_acc = model_name, accuracy
        if best_model_name is None:
            raise ValueError("model_dict contains no model with metrics")
        self.log.info(
            "Best model determined is : {}".format(best_model_name))
        return model_dict[best_model_name]["model"]

    def main(self):
        """Prepare the data, train and save all models.

        :return: ``(best_model, vectorizer)``
        """
        models_path = self.config["models_path"]
        untagged_data_df = self.read_data.prepare_data(n_rows=3000)
        untagged_data, untagged_labels = untagged_data_df[
            "message"], untagged_data_df["labels"]
        # NOTE(review): the vectorizer is fitted on the full data set before
        # the split, so test-set vocabulary/IDF statistics reach the features
        # - confirm this is intended.
        vectorizer = self.vectorize_data(untagged_data)
        x_train, x_test, y_train, y_test = self.split_data(
            untagged_data, untagged_labels)
        x_train_features = vectorizer.transform(x_train)
        x_test_features = vectorizer.transform(x_test)
        model_dict = self.train_models(x_train_features, x_test_features,
                                       y_train, y_test)
        self.save_models(model_dict=model_dict,
                         models_path=models_path,
                         vectorizer=vectorizer)
        best_model = self.get_best_model(model_dict)
        return best_model, vectorizer