import timeit

import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix

## Project-local modules; the exact import paths are assumed from usage below.
import settings
import util
import ml_models
from es import Es


def __generate_feature_matrix(self):
    feature_matrix = pd.DataFrame()
    es = Es(settings.hosts, settings.index_name, settings.index_type)
    doc_count = es.count()

    ## Populate the matrix: one column per spamword, one row per document
    print("Total number of features: {}".format(len(self.spamwords)))
    for i in range(len(self.spamwords)):
        spamword = self.spamwords[i]
        result_id_list, result_score_list = es.get_documents_for_query(spamword)
        if len(result_id_list) == 0:
            print("No documents are present for spamword {}".format(spamword))
        ## Scores default to 0 for documents the query did not match
        result_list = [0] * doc_count
        for j in range(len(result_id_list)):
            result_list[int(result_id_list[j]) - 1] = result_score_list[j]
        feature_matrix[spamword] = result_list
        ## Progress report
        if i % 10 == 0:
            print("Feature matrix for {} words done".format(i))
    print(feature_matrix.index.values)

    ## Get the train and test split
    id_list, filename_list, label_list, split_list = es.get_all_documents()
    label_list = [1 if x == "spam" else 0 for x in label_list]

    ## Divide into train and test sets
    train_row_ids = []
    test_row_ids = []
    train_index_list = []
    train_labels = []
    test_index_list = []
    test_labels = []
    for i in range(len(split_list)):
        if split_list[i] == 'train':
            train_row_ids.append(int(id_list[i]) - 1)
            train_index_list.append(filename_list[i])
            train_labels.append(label_list[i])
        else:
            test_row_ids.append(int(id_list[i]) - 1)
            test_index_list.append(filename_list[i])
            test_labels.append(label_list[i])
    print("Number of documents in train: {}".format(len(train_row_ids)))
    print("Number of documents in test: {}".format(len(test_row_ids)))

    ## Slice the feature matrix into train and test parts
    train_feature_matrix = feature_matrix.loc[train_row_ids, :]
    train_feature_matrix.index = train_index_list
    test_feature_matrix = feature_matrix.loc[test_row_ids, :]
    test_feature_matrix.index = test_index_list
    print("Shape of train feature matrix: {}".format(train_feature_matrix.shape))
    print("Shape of test feature matrix: {}".format(test_feature_matrix.shape))
    return (train_feature_matrix, train_labels, test_feature_matrix,
            test_labels, train_row_ids, test_row_ids)
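## --- Illustrative sketch, not part of the original repo ---
## Es is a project-local wrapper whose internals are not shown here. A
## plausible shape for its get_documents_for_query helper, assuming the
## elasticsearch-py client (the function name, hosts handling, and the
## hard-coded size are hypothetical):
from elasticsearch import Elasticsearch

def sketch_get_documents_for_query(hosts, index_name, index_type, query_text):
    ## Match the query text against the "body" field and return parallel
    ## lists of matching document ids and relevance scores, as consumed by
    ## __generate_feature_matrix above.
    client = Elasticsearch(hosts)
    response = client.search(
        index=index_name,
        doc_type=index_type,  ## doc types apply to pre-7.x Elasticsearch
        body={"query": {"match": {"body": query_text}}},
        size=10000)
    hits = response["hits"]["hits"]
    ids = [hit["_id"] for hit in hits]
    scores = [hit["_score"] for hit in hits]
    return (ids, scores)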
class Spam_Classifier_Unigrams():

    def __init__(self):
        print("In the constructor")
        self.es = Es(settings.hosts, settings.index_name, settings.index_type)
        ## Maps each unigram to its (1-based) position in the vocabulary
        self.unigrams_dict = {}

    def __load_unigram(self, create_unigrams_dict):
        ## Build the vocabulary from ES and cache it, or load the cached copy
        if create_unigrams_dict:
            self.unigrams_dict = self.es.get_all_vocabulary()
            util.dump_pickle_file(settings.unigram_filename, self.unigrams_dict)
        else:
            self.unigrams_dict = util.load_pickle_file(settings.unigram_filename)

    def __generate_feature_matrix(self, label_dict):
        start_calculation = timeit.default_timer()
        ## Sparse term-frequency matrix: one row per document, one column per unigram
        feature_sparse_matrix = dok_matrix(
            (len(label_dict), len(self.unigrams_dict)), dtype=np.float32)
        count = 0
        ## Loop over all the documents
        for doc_id in label_dict.keys():
            if count % 1000 == 0:
                print("Generating features for {} documents done".format(count))
            doc_term_vector = self.es.term_vectors(doc_id)
            if ("body" in doc_term_vector["term_vectors"]
                    and "terms" in doc_term_vector["term_vectors"]["body"]):
                tokens = doc_term_vector["term_vectors"]["body"]["terms"].keys()
                for token in tokens:
                    ## Map the token to its column in the vocabulary
                    token_number = self.unigrams_dict[token]
                    tf = doc_term_vector["term_vectors"]["body"]["terms"][token]["term_freq"]
                    feature_sparse_matrix[count, token_number - 1] = tf
            count = count + 1
        stop_calculation = timeit.default_timer()
        print("Time taken to generate feature matrix: " +
              str(stop_calculation - start_calculation))
        return feature_sparse_matrix

    def __get_feature_matrix(self):
        start_calculation = timeit.default_timer()
        ## Label dicts map document id to a 0/1 label
        label_train_dict = {}
        label_test_dict = {}
        ## Get all documents and split them into train and test
        id_list, filename_list, label_list, split_list = self.es.get_all_documents()
        label_list = [1 if x == "spam" else 0 for x in label_list]
        for i in range(len(split_list)):
            if split_list[i] == 'train':
                label_train_dict[id_list[i]] = label_list[i]
            else:
                label_test_dict[id_list[i]] = label_list[i]

        print("Generating feature matrix for train documents")
        feature_sparse_train_matrix = self.__generate_feature_matrix(label_train_dict)
        print("Generating feature matrix for test documents")
        feature_sparse_test_matrix = self.__generate_feature_matrix(label_test_dict)

        ## Convert to CSR for efficient arithmetic and row slicing
        train_csr = feature_sparse_train_matrix.tocsr()
        train_labels = list(label_train_dict.values())
        test_csr = feature_sparse_test_matrix.tocsr()
        test_labels = list(label_test_dict.values())
        stop_calculation = timeit.default_timer()
        print("Time taken to generate feature matrix: " +
              str(stop_calculation - start_calculation))
        return (train_csr, train_labels, test_csr, test_labels)

    def __classification(self, train_df, train_labels, test_df, test_labels):
        logistic_regression_accuracy = ml_models.logistic_regression(
            train_df, train_labels, test_df, test_labels)
        print("Logistic Regression Accuracy {}".format(logistic_regression_accuracy))
        decision_tree_accuracy = ml_models.decision_tree(
            train_df, train_labels, test_df, test_labels)
        print("Decision Tree Accuracy {}".format(decision_tree_accuracy))
        ## Report the better of the two models
        accuracy = max(logistic_regression_accuracy, decision_tree_accuracy)
        return accuracy

    def run(self, create_unigrams_dict=False):
        start_calculation = timeit.default_timer()
        ## Load the unigram vocabulary
        self.__load_unigram(create_unigrams_dict)
        ## Generate the feature matrices
        train_csr, train_labels, test_csr, test_labels = self.__get_feature_matrix()
        ## Run the classification algorithms
        accuracy = self.__classification(train_csr, train_labels, test_csr,
                                         test_labels)
        print("Accuracy is {}".format(accuracy))
        stop_calculation = timeit.default_timer()
        print("Time taken to run spam classifier unigrams: " +
              str(stop_calculation - start_calculation))
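## --- Illustrative sketch, not part of the original repo ---
## ml_models is project-local and its internals are not shown above. A
## plausible shape for ml_models.logistic_regression, assuming it wraps
## scikit-learn (the helper name below is hypothetical):
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def sketch_logistic_regression(train_x, train_labels, test_x, test_labels):
    ## Fit on the train split and return test-set accuracy, matching how
    ## __classification consumes the return value. CSR matrices are accepted
    ## directly by scikit-learn estimators.
    model = LogisticRegression()
    model.fit(train_x, train_labels)
    predictions = model.predict(test_x)
    return accuracy_score(test_labels, predictions)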
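## --- Illustrative usage; the entry point below is an assumption, not part
## of the original repo ---
if __name__ == "__main__":
    ## Pass create_unigrams_dict=True on the first run to build and cache the
    ## vocabulary; later runs can load the pickled copy from
    ## settings.unigram_filename.
    classifier = Spam_Classifier_Unigrams()
    classifier.run(create_unigrams_dict=True)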