return False def fold_data(data, k=2): kf = KFold(n_splits=k, shuffle=True, random_state=2) for train, test in kf.split(data): return train, test def import_data(filename): print(f"Import {filename}") importer = DataImporter(filename) return importer.get_data() storage = Storage() particle = storage.load('./pickle/pso-1.pckl') pos = particle.position.astype(bool) pos = [map_bool(x) for x in pos] data = storage.load('./pickle/default-1541653057.8427656.pckl') train_idx, test_idx = fold_data(data) selected_tfidf = TFIDF(data.iloc[train_idx]['Review']) features = np.array(list(selected_tfidf.termIndex.keys())) features = features[pos] new_data = import_data('../data/Avg_55,26.xlsx') new_data['Review'] = preprocess_data(new_data, features) storage.save(new_data, f"pickle/preprocessed-{time.time()}.pckl") # kf = KFold(n_splits=10, shuffle=True, random_state=2)
return train, test def preprocess_data(data): print(f"Preprocess data...") preprocessor = Preprocessor() result = [] for i, review in enumerate(data['Review']): result.append(" ".join(preprocessor.preprocess(review))) print(f"Review {i + 1} preprocessed") return result # data = import_data('../data/Avg_55,26.xlsx') # data['Review'] = preprocess_data(data) storage = Storage() # storage.save(data, f"pickle/default-{time.time()}.pckl") data = storage.load("pickle/default-1541653057.8427656.pckl") train, test = fold_data(data) train_data = data.iloc[train] test_data = data.iloc[test] tfidf = TFIDF(train_data['Review']) num_attrs = len(tfidf.termIndex) # clf = C45(tfidf, data) # clf.train() # score = clf.score(tfidf, test_data) # print(score) # 0.3630573248407643 pso = PSO(num_attrs, 20, 20, 0.7, 0.5, 0.99)
import sys, os

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent. Splitting on os.sep (instead of a
# hard-coded backslash) makes the directory check work on every platform.
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
data = s.load("pickle/default-1541653057.8427656.pckl")
tfidf = TFIDF(data["Review"])

# Label lookup; every label currently maps to itself.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
# Plot colour for each label.
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

# Project the TF-IDF weights onto their first two principal components.
pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
x_std = np.std(data2D[:, 0])
y_std = np.std(data2D[:, 1])
import sys, os

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent. Splitting on os.sep (instead of a
# hard-coded backslash) makes the directory check work on every platform.
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

import numpy as np, random, math, pandas as pd
from libs.TFIDF import TFIDF
from entities.Storage import Storage

storage = Storage()
data = storage.load('./pickle/default-1541653057.8427656.pckl')
tfidf = TFIDF(data['Review'])

# Print the length of the first weight row, then dump all weights to Excel.
print(len(tfidf.weights[0]))
df = pd.DataFrame(tfidf.weights)
df.to_excel('tfidf.xlsx')
import sys, os

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent. Splitting on os.sep (instead of a
# hard-coded backslash) makes the directory check work on every platform.
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from libs.DataImporter import DataImporter
from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
data = s.load("pickle/default-1541653057.8427656.pckl")
tfidf = TFIDF(data["Review"])

# Label lookup; every label currently maps to itself.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
# Plot colour for each label.
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

# Project the TF-IDF weights onto their first two principal components.
pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
x_std = np.std(data2D[:, 0])
import sys, os

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent. Splitting on os.sep (instead of a
# hard-coded backslash) makes the directory check work on every platform.
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

import numpy as np, random, math
from libs.TFIDF import TFIDF
from libs.C45 import C45
from entities.Storage import Storage
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

storage = Storage()
particle = storage.load('./pickle/pso-3.pckl')
data = storage.load('./pickle/default-1541653057.8427656.pckl')


def map_bool(x):
    """Return True when ``x`` equals 1, otherwise False (always a plain bool)."""
    return bool(x == 1)


def fold_data(data, k=2):
    """Return the (train, test) index arrays of the first shuffled K-fold split.

    The splitter is seeded with ``random_state=2`` so the same split is
    produced on every run.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=2)
    # Only the first of the k splits is used.
    for train, test in kf.split(data):
        return train, test


train_idx, test_idx = fold_data(data)

# Turn the stored particle position into a plain list of Python booleans.
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
from entities.Storage import Storage
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
# NOTE(review): `scipy.interp` is, as far as I know, a deprecated alias of
# `numpy.interp` in newer SciPy releases — confirm against the pinned version.
from scipy import interp
from itertools import cycle
import matplotlib.pyplot as plt
import numpy as np

s = Storage()

# Text-vectorisation pipeline and a one-vs-rest gradient boosting classifier
# with a fixed seed.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
gradboost = OneVsRestClassifier(GradientBoostingClassifier(random_state=7))
labels = ["Berdampak positif", "Berdampak negatif", "Netral"]
le = LabelEncoder()

# Containers that start empty here; how they are filled is not visible in
# this part of the script.
all_predict_probas = np.array([])
all_tests = np.array([])
fpr = dict()
tpr = dict()
roc_auc = dict()

# Files are numbered from 1, hence `i + 1`.
for i in range(10):
    train = s.load(f"data/folds/train{i + 1}.pckl")
    test = s.load(f"data/folds/test{i + 1}.pckl")
    # Both transformers are re-fitted on the current training fold.
    train_vect = count_vect.fit_transform(train["Review"])
    train_tfidf = tfidf_transformer.fit_transform(train_vect)
import sys, os

# Make the project root importable whether the script is started from the
# "revisions" directory or from its parent. Splitting on os.sep (instead of a
# hard-coded backslash) makes the directory check work on every platform.
cwd = os.getcwd().split(os.sep)
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from entities.Storage import Storage

s = Storage()
for i in range(10):
    tree = s.load(f"../data/models/tree{i + 1}.pckl")
    tree.show_tree()
    # The loop stops after its first pass, so only tree1 is loaded and shown.
    break
def __init__(self, UI):
    """Create the controller's helpers and reset the fold count.

    ``UI`` is accepted but not used inside this constructor.
    """
    # Fold count starts at zero.
    self.k = 0

    # Helper objects held for the lifetime of the instance.
    self.preprocessor = Preprocessor()
    self.storage = Storage()
    self.threadpool = QThreadPool()
class MainControl():
    """Controller that drives the review-classification workflow for the UI.

    It imports reviews from Excel, preprocesses them, splits them into K folds,
    trains and scores one C45 tree per fold, and runs PSO on the stored trees.
    Intermediate artefacts are written and read through ``self.storage`` under
    the ``data/`` directory.
    """

    def __init__(self, UI):
        """Create the helper objects; ``UI`` is accepted but not used here."""
        self.preprocessor = Preprocessor()
        self.k = 0  # number of folds; stays 0 until fold_data() succeeds
        self.storage = Storage()
        self.threadpool = QThreadPool()

    def classifyReview(self, review, tree):
        """Load the model file ``tree`` from ./data/models and predict one review."""
        clf = self.storage.load(f"./data/models/{tree}")
        return clf.predict(clf.vectors, [review])

    def import_excel(self, UI):
        """Ask the user for an Excel file and return its data (see openFileDialog)."""
        return self.openFileDialog(UI)

    def openFileDialog(self, UI):
        """Show a file picker and return the imported data.

        Returns ``None`` when no file is chosen or when the import fails; in
        the failure case a warning box is shown and the status bar is updated.
        """
        try:
            options = QFileDialog.Options()
            options |= QFileDialog.DontUseNativeDialog
            fileName, _ = QFileDialog.getOpenFileName(
                UI,
                "Select Excel File",
                "",
                "Excel Files(*.xls *.xlsx)",
                options=options)
            if fileName:
                importer = DataImporter(fileName)
                return importer.get_data()
        except Exception:
            # `Exception` instead of a bare `except` so that KeyboardInterrupt
            # and SystemExit are no longer swallowed by the warning dialog.
            UI.msg = QMessageBox()
            UI.msg.setIcon(QMessageBox.Warning)
            UI.msg.setWindowTitle("Warning")
            UI.msg.setText("File tidak memiliki kolom Review dan Label")
            UI.msg.setStandardButtons(QMessageBox.Ok)
            UI.msg.show()
            UI.statusBar().showMessage("Import failed")
        return None

    def preprocess_data(self, UI, data):
        """Preprocess every review in ``data`` while updating the table and log.

        The row being processed is highlighted, the previous row is reset to
        white, and the time per review is appended to ``UI.logOutput``.

        Returns the list of preprocessed reviews, each joined with spaces.
        """
        totalTime = 0
        resultReview = []
        for i, (review, _label) in enumerate(zip(data["Review"], data["Label"])):
            if i > 0:
                UI.tableWidget.item(i - 1, 0).setBackground(QColor(255, 255, 255))
            UI.tableWidget.item(i, 0).setBackground(QColor(255, 128, 128))

            startTime = time.time()
            preprocessedReview = " ".join(self.preprocessor.preprocess(review))
            endTime = time.time()

            resultReview.append(preprocessedReview)
            UI.logOutput.append(
                f"Review {i + 1} preprocessed in {round(endTime - startTime, 2)}s"
            )
            totalTime += (endTime - startTime)
            # NOTE(review): for i == 0 this asks the table for row -1 —
            # confirm that scrolling to the previous row is intended.
            UI.tableWidget.scrollToItem(UI.tableWidget.item(i - 1, 0),
                                        QAbstractItemView.PositionAtCenter)
            # Let the UI repaint between reviews.
            QApplication.processEvents()

        dlen = len(data)
        if dlen > 0:
            # Only touch the last row when there is one; with empty data the
            # lookup of row -1 would otherwise be dereferenced and crash.
            lastItem = UI.tableWidget.item(dlen - 1, 0)
            lastItem.setBackground(QColor(255, 255, 255))
            UI.tableWidget.scrollToItem(lastItem,
                                        QAbstractItemView.PositionAtCenter)
        UI.logOutput.append(
            f"{dlen} review(s) preprocessed in {round(totalTime, 2)}s")
        return resultReview

    def save_data(self, data):
        """Store ``data`` as the preprocessed data set."""
        self.storage.save(data, "data/preprocessed/preprocessed.pckl")

    def fold_data(self, k, UI=None):
        """Split the stored preprocessed data into ``k`` shuffled folds.

        Writes ``train{n}.pckl`` / ``test{n}.pckl`` (n starting at 1) under
        data/folds. ``self.k`` and the thread-pool size are updated only after
        every fold has been written, so a failing split leaves the previously
        recorded fold count in place.
        """
        self.data = self.storage.load("data/preprocessed/preprocessed.pckl")
        kf = KFold(n_splits=k, shuffle=True, random_state=2)
        for i, (train, test) in enumerate(kf.split(self.data)):
            self.storage.save(self.data.iloc[train],
                              f"data/folds/train{i + 1}.pckl")
            self.storage.save(self.data.iloc[test],
                              f"data/folds/test{i + 1}.pckl")
        self.k = k
        self.threadpool.setMaxThreadCount(self.k)
        if UI is not None:
            UI.logOutput.append(f"Data folded by {k}")

    def mltrain_fn(self, params=None):
        """Train one C45 tree on a training fold.

        ``params`` is a dict with keys ``'i'`` (zero-based fold index),
        ``'remove_zero_tfidf'`` and ``'UI'``.

        Returns ``(i, clf, tfidf, UI-or-None)``.
        """
        if params is None:
            # Built per call so no dict is shared between invocations.
            params = {'i': None, 'remove_zero_tfidf': False, 'UI': None}
        train = self.storage.load(f"data/folds/train{params['i'] + 1}.pckl")
        tfidf = TFIDF(train["Review"])
        if params['remove_zero_tfidf']:
            tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.4)
        clf = C45(tfidf, train)
        clf.train()
        return params["i"], clf, tfidf, params['UI'] or None

    def mltrain_result(self, res):
        """Record the result tuple of mltrain_fn and save the trained tree."""
        index, clf, tfidf, ui = res
        self.attrs[index] = tfidf.count_vect.get_feature_names()
        self.clfs[index] = clf
        self.tfidfs[index] = tfidf
        self.storage.save(clf, f"data/models/tree{index + 1}.pckl")
        if ui is not None:
            ui.logOutput.append(f"Tree {index + 1} trained")

    def train_model(self, UI=None):
        """Train one tree per fold on the thread pool.

        Returns ``False`` when no folds exist yet (``self.k <= 0``), showing a
        warning box when a UI is available; otherwise returns ``self.attrs``.
        """
        if self.k <= 0:
            # The guard applies with or without a UI; only the dialog is
            # optional.
            if UI is not None:
                UI.msg = QMessageBox()
                UI.msg.setIcon(QMessageBox.Warning)
                UI.msg.setWindowTitle("Warning")
                UI.msg.setText(
                    "Anda harus membagi data menggunakan k-fold terlebih dahulu")
                UI.msg.setStandardButtons(QMessageBox.Ok)
                UI.msg.show()
                UI.statusBar().showMessage("Train and test failed")
            return False

        # One placeholder slot per fold, filled in by mltrain_result().
        self.attrs = [0] * self.k
        self.clfs = [0] * self.k
        self.tfidfs = [0] * self.k

        el = QEventLoop()
        for i in range(self.k):
            if UI is not None:
                UI.logOutput.append(f"Train tree {i + 1}")
            params = {'i': i, 'remove_zero_tfidf': True, 'UI': UI}
            worker = Worker(self.mltrain_fn, params)
            worker.signals.result.connect(self.mltrain_result)
            self.threadpool.start(worker)
        # NOTE(review): the wait sits after the loop so that all workers are
        # queued before blocking — confirm this matches the intended layout.
        self.threadpool.waitForDone()
        # Process pending events so the workers' result callbacks run
        # before returning.
        el.processEvents()
        el.exit()
        if UI is not None:
            UI.logOutput.append("Training completed")
        return self.attrs

    def test_model(self):
        """Score every trained tree on its test fold and return the scores.

        Each score is also stored on the classifier, which is then re-saved.
        """
        scores = []
        for i in range(self.k):
            test = self.storage.load(f"data/folds/test{i + 1}.pckl")
            score = self.clfs[i].score(self.tfidfs[i], test)
            self.clfs[i].set_score(score)
            self.storage.save(self.clfs[i], f"data/models/tree{i + 1}.pckl")
            scores.append(score)
        return scores

    def optimize_model(self, popSize, numIteration, c1, c2, target):
        """Run PSO once per fold and return the list of best particles.

        The PSO goal passed for each fold is the stored tree's score plus
        ``target``. Every best particle is saved under data/particles.
        """
        results = []
        for i in range(self.k):
            train = self.storage.load(f"data/folds/train{i + 1}.pckl")
            test = self.storage.load(f"data/folds/test{i + 1}.pckl")
            clf = self.storage.load(f"data/models/tree{i + 1}.pckl")
            particleSize = len(clf.termsInfo)
            pso = PSO(particleSize, popSize, numIteration, c1, c2,
                      clf.get_score() + target)
            bestParticle = pso.exec(train, test)
            results.append(bestParticle)
            self.storage.save(bestParticle,
                              f"data/particles/particle{i + 1}.pckl")
        return results

    def get_data(self, kth, dstype):
        """Load fold ``kth``: the training part for "Training Data", else the test part."""
        t = "train" if dstype == "Training Data" else "test"
        return self.storage.load(f"data/folds/{t}{kth}.pckl")

    def load_data(self, path):
        """Load ``path`` if it exists; otherwise show an error box and return None."""
        if os.path.exists(path):
            return self.storage.load(path)
        msg = QMessageBox()
        msg.setIcon(QMessageBox.Warning)
        msg.setWindowTitle("Error")
        msg.setText("Data yang dimuat tidak ada")
        msg.setStandardButtons(QMessageBox.Ok)
        msg.exec_()
        return None
for train, test in kf.split(data): return train, test def preprocess_data(data, selected_attr): print(f"Preprocess data...") preprocessor = Preprocessor() result = [] for i, review in enumerate(data['Review']): result.append(" ".join( preprocessor.selected_preprocess(review, selected_attr))) print(f"Review {i + 1} preprocessed") return result storage = Storage() particle = storage.load('./pickle/pso-3.pckl') data = storage.load('./pickle/default-1541653057.8427656.pckl') train_idx, test_idx = fold_data(data) pos = particle.position.astype(bool) pos = [map_bool(x) for x in pos] selected_tfidf = TFIDF(data.iloc[train_idx]['Review']) features = np.array(list(selected_tfidf.termIndex.keys())) features = features[pos] data = import_data('../data/Avg_55,26.xlsx') data['Review'] = preprocess_data(data, features) storage.save(data, f"pickle/selected-3.pckl") # data = storage.load('./pickle/selected-1542024722.200629.pckl') for review, label in zip(data["Review"], data["Label"]):
from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
data = s.load("../pckl2/preprocessed/preprocessed.pckl")
tfidf = TFIDF(data["Review"])

# Label lookup; every label currently maps to itself.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
# Plot colour for each label.
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

# Project the TF-IDF weights onto their first two principal components.
pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)

# Standard deviation of each projected component.
x_std = np.std(data2D[:, 0])
y_std = np.std(data2D[:, 1])

# Axis labels and title are user-facing text in Indonesian.
plt.xlabel("Komponen 1")
plt.ylabel("Komponen 2")
plt.title("Distribusi Titik Data Ulasan Grafik 2D")