import ast
import os
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
# or tensorflow.keras.wrappers.scikit_learn, depending on the Keras distribution in use
from keras.wrappers.scikit_learn import KerasClassifier

# readJson_config / writeJson_config are project-local JSON helpers and are
# assumed to be importable from this package.


def featureEngineeringTraining():
    # ----------------- READ PREPROCESSING FILE -----------------------
    DATA_FOLDER = "ml_core/data/training/"
    VECT_FOLDER = "ml_core/vector/training/"
    FEATURES_FOLDER = "ml_core/vector/training/features/"
    FileName = "Preprocessed_Dataset_Training.csv"

    TWEET_DATA = pd.read_csv(DATA_FOLDER + FileName,
                             usecols=["tweet_tokens_stemmed"])
    TWEET_DATA.columns = ["tweet"]

    # join the stored list of tokens back into a single document string
    def join_text_list(texts):
        texts = ast.literal_eval(texts)
        return ' '.join(texts)

    TWEET_DATA["tweet_join"] = TWEET_DATA["tweet"].apply(join_text_list)

    # ------------------------- READ CONFIG ---------------------------
    ses_max_feature = readJson_config('ml_core/', 'configuration.json',
                                      'max_features')
    max_features = int(ses_max_feature[0]) if ses_max_feature is not None else 1000

    # ------------------------- MAIN CALC ----------------------------
    # ngram_range=(1, 3) would use unigrams, bigrams, and trigrams
    cvect = CountVectorizer(max_features=max_features, ngram_range=(1, 1))
    counts = cvect.fit_transform(TWEET_DATA["tweet_join"])
    # L1-normalize each row so raw counts become relative term frequencies
    normalized_counts = normalize(counts, norm='l1', axis=1)

    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1, 1),
                            smooth_idf=False)
    tfidf.fit(TWEET_DATA["tweet_join"])  # fit only to obtain the idf_ vector
    # custom TF-IDF: normalized term frequency scaled per column by IDF
    tfidf_sparse = normalized_counts.multiply(tfidf.idf_)

    feature_name = {}
    feature_name['feature'] = tfidf.get_feature_names()

    # ------------------------ SAVE -----------------------------------
    tfidf_sparse = sparse.csr_matrix(tfidf_sparse)
    sparse.save_npz(VECT_FOLDER + "tfidf_sparse_training.npz", tfidf_sparse)
    writeJson_config(FEATURES_FOLDER, "tfidf_feature_training.json",
                     feature_name, append=False)
    return 'success'
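
# A minimal, self-contained sketch of the weighting scheme used above:
# L1-normalized term counts multiplied element-wise by the IDF vector.
# The demo function name and the toy corpus are invented for illustration.
def _demo_custom_tfidf_weighting():
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    from sklearn.preprocessing import normalize

    corpus = ["banjir di jakarta", "jakarta macet", "banjir banjir macet"]

    cvect = CountVectorizer(ngram_range=(1, 1))
    counts = cvect.fit_transform(corpus)
    tf = normalize(counts, norm='l1', axis=1)  # each row now sums to 1

    tfidf = TfidfVectorizer(ngram_range=(1, 1), smooth_idf=False)
    tfidf.fit(corpus)  # fit only to obtain the idf_ vector

    # both vectorizers sort the vocabulary identically, so columns align;
    # multiply() broadcasts the 1-D idf_ vector across every row
    weighted = tf.multiply(tfidf.idf_)
    # get_feature_names_out() on scikit-learn >= 1.0
    print(tfidf.get_feature_names())
    print(weighted.toarray().round(3))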

def classificationTraining(cnn_model, tfidf_mat_selection, tags):
    MODEL_FOLDER = "ml_core/model/"

    # hold out 25% of the data for testing
    tfidf_mat_train, tfidf_mat_test, tags_train, tags_test = \
        train_test_split(tfidf_mat_selection, tags,
                         test_size=0.25, random_state=42)

    def check_model(model, x, y):
        return model.fit(x, y, verbose=1, validation_split=0.2)

    # ----------------------------- START TRAINING MODEL ------------------------------
    estimator = KerasClassifier(build_fn=cnn_model, epochs=25, batch_size=6)
    history = check_model(estimator, tfidf_mat_train, tags_train.ravel())

    # ------------------------------ TEST REPORT --------------------------------
    tags_pred = estimator.predict(tfidf_mat_test)

    def print_cm(y_true, y_pred, labels_order):
        # confusion_matrix puts true labels on rows, predictions on columns
        return pd.DataFrame(
            confusion_matrix(y_true, y_pred, labels=labels_order),
            index=['target : 1', 'target : 0'],
            columns=['pred : 1', 'pred : 0'])

    # ground truth first, predictions second
    cm_model = print_cm(tags_test, tags_pred, [1, 0])
    report_model = classification_report(tags_test, tags_pred, output_dict=True)
    print(cm_model)
    print(report_model)

    # ------------------------------ SAVE MODEL & HISTORY ------------------------------
    estimator.model.save(MODEL_FOLDER + "cnn_model_training.h5")
    pickle.dump(estimator.classes_,
                open(MODEL_FOLDER + 'cnn_class_training.pkl', 'wb'))

    def formatStr(floats):
        return ['{:.2f}'.format(x) for x in floats]

    # history keys follow the metric names compiled into cnn_model
    json_hist = {}
    for key in ('acc', 'val_acc', 'prec', 'val_prec',
                'rec', 'val_rec', 'loss', 'val_loss'):
        json_hist[key] = formatStr(history.history[key])
    writeJson_config(MODEL_FOLDER + "history/", "cnn_history_model.json",
                     json_hist, append=False)

    json_report = {}
    json_report['confusion_matrix'] = cm_model.values.tolist()
    json_report['report'] = report_model
    writeJson_config(MODEL_FOLDER + "report/", "cnn_report_model.json",
                     json_report, append=False)
    return 'success'
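
# A small sketch (invented labels) of why the argument order above matters:
# confusion_matrix(y_true, y_pred) puts *true* labels on rows and predictions
# on columns, so the 'target'/'pred' DataFrame labels are only correct when
# the ground truth (tags_test) is passed first.
def _demo_confusion_matrix_orientation():
    from sklearn.metrics import confusion_matrix

    y_true = [1, 1, 0, 0, 0]
    y_pred = [1, 0, 0, 0, 1]
    # rows: true 1, true 0; columns: pred 1, pred 0 -> [[1, 1], [1, 2]]
    print(confusion_matrix(y_true, y_pred, labels=[1, 0]))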

def featureSelection():
    np.set_printoptions(suppress=True)

    # ----------------- READ PREPROCESSING FILE -----------------------
    VECT_SEL_FOLDER = "ml_core/vector_selection/"
    VECT_FOLDER = "ml_core/vector/"
    VECT_TEMPLATE = "ml_core/template/tfidf_sparse_template.npz"
    TEMPLATE_FOLDER = "ml_core/template/"
    FEATURE_CONFIG = "feature_template.json"

    FileName = []
    for filename in os.listdir(VECT_FOLDER):
        path = os.path.join(VECT_FOLDER, filename)
        if not os.path.isdir(path):
            strDatetime = filename.replace("tfidf_sparse_", "").replace(".npz", "")
            FileDatetime = datetime.strptime(strDatetime, "%d%m%Y_%H%M%S")
            FileName.append([filename, FileDatetime])

    # ----------------------- LOAD SPARSE MATRIX -------------------------
    # newest vector file first
    FileName = sorted(FileName, key=lambda t: t[1], reverse=True)
    tfidf_mat = sparse.load_npz(VECT_FOLDER + FileName[0][0]).toarray()

    json_name = FileName[0][0].replace(".npz", ".json").replace("sparse", "feature")
    features = readJson_config(VECT_FOLDER + "features/", json_name, 'feature')[0]

    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")

    # ------------------------------- RUN -------------------------------
    # load the template row once and give every document its own copy
    template_row = sparse.load_npz(VECT_TEMPLATE).toarray()[0]
    tfidf_mat_template = [template_row.copy() for _ in range(len(tfidf_mat))]
    # take the first match, consistent with the other readJson_config call sites
    features_template = readJson_config(TEMPLATE_FOLDER, FEATURE_CONFIG, 'feature')[0]
    print(features_template)

    # feature membership does not depend on the document, so compute it once
    selected_idx = [1 if feature in features_template else 0
                    for feature in features]
    for i in range(len(tfidf_mat)):
        for idx, feature in enumerate(features):
            if selected_idx[idx]:
                idx_template = features_template.index(feature)
                tfidf_mat_template[i][idx_template] = tfidf_mat[i][idx]

    # -------------------------------- SAVE -----------------------------------
    tfidf_sparse_template = sparse.csr_matrix(tfidf_mat_template)
    sparse.save_npz(VECT_SEL_FOLDER + "tfidf_selection_sparse_" + dt_string + ".npz",
                    tfidf_sparse_template)
    writeJson_config(VECT_SEL_FOLDER + "features/",
                     "tfidf_sparse_" + dt_string + ".json",
                     features_template, append=False)

    # ----------------------------- LOAD DATA VIEW ----------------------------
    tableRecords = []
    for i in range(len(tfidf_mat)):
        for term, weight, selected in zip(features, tfidf_mat[i], selected_idx):
            if weight != 0.0:
                tableRecords.append(['Document_' + str(i), term,
                                     round(weight, 3), selected])
    return tableRecords
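
# Sketch of the projection step above, with hypothetical vocabularies:
# values are copied into the template's column order, template terms that are
# absent from the document stay 0.0, and terms missing from the template are
# dropped. A dict gives O(1) index lookups instead of repeated list.index().
def _demo_project_onto_template():
    import numpy as np

    features = ["banjir", "macet", "hujan"]           # current vocabulary
    features_template = ["hujan", "banjir", "panas"]  # training-time vocabulary
    row = np.array([0.4, 0.1, 0.5])

    template_idx = {f: i for i, f in enumerate(features_template)}
    projected = np.zeros(len(features_template))
    for i, f in enumerate(features):
        if f in template_idx:
            projected[template_idx[f]] = row[i]
    print(projected)  # [0.5 0.4 0. ]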

def featureEngineering():
    # ----------------- READ PREPROCESSING FILE -----------------------
    DATA_FOLDER = "ml_core/data/"
    VECT_FOLDER = "ml_core/vector/"
    FEATURES_FOLDER = "ml_core/vector/features/"

    FileName = []
    for filename in os.listdir(DATA_FOLDER):
        path = os.path.join(DATA_FOLDER, filename)
        if not os.path.isdir(path):
            strDatetime = filename.replace("Preprocessed_Dataset_", "").replace(".csv", "")
            FileDatetime = datetime.strptime(strDatetime, "%d%m%Y_%H%M%S")
            FileName.append([filename, FileDatetime])

    # newest preprocessed dataset first
    FileName = sorted(FileName, key=lambda t: t[1], reverse=True)
    TWEET_DATA = pd.read_csv(DATA_FOLDER + FileName[0][0],
                             usecols=["tweet_tokens_stemmed"])
    TWEET_DATA.columns = ["tweet"]

    # join the stored list of tokens back into a single document string
    def join_text_list(texts):
        texts = ast.literal_eval(texts)
        return ' '.join(texts)

    TWEET_DATA["tweet_join"] = TWEET_DATA["tweet"].apply(join_text_list)

    # ------------------------- READ CONFIG ---------------------------
    ses_max_feature = readJson_config('ml_core/', 'configuration.json',
                                      'max_features')
    max_features = int(ses_max_feature[0]) if ses_max_feature is not None else 1000

    # ------------------------- MAIN CALC ----------------------------
    # ngram_range=(1, 3) would use unigrams, bigrams, and trigrams
    cvect = CountVectorizer(max_features=max_features, ngram_range=(1, 1))
    counts = cvect.fit_transform(TWEET_DATA["tweet_join"])
    normalized_counts = normalize(counts, norm='l1', axis=1)

    tfidf = TfidfVectorizer(max_features=max_features, ngram_range=(1, 1),
                            smooth_idf=False)
    tfidf.fit(TWEET_DATA["tweet_join"])  # fit only to obtain the idf_ vector
    tfidf_sparse = normalized_counts.multiply(tfidf.idf_)

    feature_name = {}
    feature_name['feature'] = tfidf.get_feature_names()
    tfidf_mat = tfidf_sparse.toarray()

    # ------------------------ SAVE -----------------------------------
    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")
    tfidf_sparse = sparse.csr_matrix(tfidf_sparse)
    sparse.save_npz(VECT_FOLDER + "tfidf_sparse_" + dt_string + ".npz", tfidf_sparse)
    writeJson_config(FEATURES_FOLDER, "tfidf_feature_" + dt_string + ".json",
                     feature_name, append=False)

    # ------------------------- DATA VIEW --------------------------------
    TableRecords = []
    terms = tfidf.get_feature_names()
    TF = normalized_counts.toarray()
    IDF = tfidf.idf_
    TFIDF = tfidf_mat
    for i in range(len(TF)):
        for term, tf, idf, weight in zip(terms, TF[i], IDF, TFIDF[i]):
            if tf != 0.0:
                TableRecords.append(['Document_' + str(i), term,
                                     round(tf, 3), round(idf, 3), round(weight, 3)])
    return TableRecords
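
# Sketch of the newest-file selection used above, with invented file names:
# the timestamp embedded in each name is parsed back into a datetime and the
# list is sorted descending, so index 0 is always the most recent dataset.
def _demo_pick_latest_file():
    from datetime import datetime

    names = ["Preprocessed_Dataset_01012021_120000.csv",
             "Preprocessed_Dataset_02012021_080000.csv"]
    stamped = [(n, datetime.strptime(
        n.replace("Preprocessed_Dataset_", "").replace(".csv", ""),
        "%d%m%Y_%H%M%S")) for n in names]
    stamped.sort(key=lambda t: t[1], reverse=True)
    print(stamped[0][0])  # Preprocessed_Dataset_02012021_080000.csv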

def featureSelectionTraining(training=False):
    # `training` is currently unused; kept for interface compatibility
    threshold = 0.01
    np.set_printoptions(suppress=True)

    # ----------------- READ PREPROCESSING FILE -----------------------
    VECT_SEL_FOLDER = "ml_core/vector_selection/training/"
    VECT_FOLDER = "ml_core/vector/training/"
    VECT_TEMPLATE = "ml_core/template/tfidf_sparse_template.npz"
    TEMPLATE_FOLDER = "ml_core/template/"
    FEATURE_CONFIG = "feature_template.json"
    LABEL_PATH = "ml_core/data/training/Preprocessed_Dataset_Training.csv"

    TWEET_DATA = pd.read_csv(LABEL_PATH, usecols=["label"])
    tags = TWEET_DATA.label

    # ----------------------- LOAD SPARSE MATRIX -------------------------
    FileName = "tfidf_sparse_training.npz"
    tfidf_mat = sparse.load_npz(VECT_FOLDER + FileName).toarray()
    json_feature = "tfidf_feature_training.json"
    features = readJson_config(VECT_FOLDER + "features/", json_feature, 'feature')[0]

    # ---------------------------- TRAINING -----------------------------
    # score every feature against the labels, scale scores to [0, 1],
    # and drop columns whose normalized score falls below the threshold
    mi = mutual_info_classif(tfidf_mat, tags)
    norm_mi = mi / np.max(mi)
    column_idx = [i for i, mi_item in enumerate(norm_mi) if mi_item < threshold]
    tfidf_mat_selection = np.delete(tfidf_mat, column_idx, 1)

    # template data: the surviving features, in order, plus a zeroed row
    selected_idx = [j for j in range(len(norm_mi)) if j not in column_idx]
    selected_features = [features[idx] for idx in selected_idx]
    tfidf_mat_template = [0.0] * len(selected_features)
    features_template = selected_features

    # -------------------------------- SAVE -----------------------------------
    # save the template (zero row + feature list) for later projections
    tfidf_sparse_template = sparse.csr_matrix(tfidf_mat_template)
    sparse.save_npz(VECT_TEMPLATE, tfidf_sparse_template)
    feature_dict = {}
    feature_dict['feature'] = features_template
    writeJson_config(TEMPLATE_FOLDER, FEATURE_CONFIG, feature_dict, append=False)

    # save the selected training matrix and its feature list
    tfidf_sparse = sparse.csr_matrix(tfidf_mat_selection)
    sparse.save_npz(VECT_SEL_FOLDER + "tfidf_selection_sparse_training.npz",
                    tfidf_sparse)
    writeJson_config(VECT_SEL_FOLDER + "features/", "tfidf_feature_training.json",
                     features_template, append=False)
    return 'success'
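
# Minimal sketch (toy data, invented values) of the mutual-information filter
# above: score each column against the labels, scale to [0, 1], and drop
# columns whose normalized score falls below the threshold.
def _demo_mi_feature_filter():
    import numpy as np
    from sklearn.feature_selection import mutual_info_classif

    X = np.array([[1.0, 0.0, 0.3],
                  [0.9, 0.1, 0.4],
                  [0.8, 0.0, 0.35],
                  [0.0, 1.0, 0.3],
                  [0.1, 0.9, 0.4],
                  [0.0, 0.8, 0.35]])
    y = np.array([1, 1, 1, 0, 0, 0])

    mi = mutual_info_classif(X, y, random_state=42)
    norm_mi = mi / np.max(mi)
    drop = [i for i, m in enumerate(norm_mi) if m < 0.01]
    # column 2 carries no class signal, so it is typically the one dropped
    print(np.delete(X, drop, axis=1))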