Exemplo n.º 1
0

# Load a particle saved by an earlier PSO run and turn its position vector
# into a per-term selection mask.
storage = Storage()
particle = storage.load('./pickle/pso-1.pckl')
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
# Rebuild the TF-IDF vocabulary from the training split of the stored data
# set. The mask is applied to these terms, so this presumably has to be the
# same split/vocabulary the particle was optimised on — TODO confirm.
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
# Index the term array with the mask (boolean selection, assuming map_bool
# yields plain bools — TODO confirm).
features = features[pos]

# Re-import the raw spreadsheet and preprocess its reviews together with the
# selected terms, replacing the 'Review' column with the result.
new_data = import_data('../data/Avg_55,26.xlsx')
new_data['Review'] = preprocess_data(new_data, features)

# Persist under a timestamped name so repeated runs do not overwrite each other.
storage.save(new_data, f"pickle/preprocessed-{time.time()}.pckl")

# kf = KFold(n_splits=10, shuffle=True, random_state=2)
# for i, (train, test) in enumerate(kf.split(data)):
# 	print("Train optimized")
# 	tfidf = TFIDF(data.iloc[train]["Review"])
# 	tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
# 	tfidf.termIndex = {key:val for i, (key, val) in enumerate(tfidf.termIndex.items()) if key in features}
# 	clf = C45(tfidf, data.iloc[train])
# 	clf.train()
# 	result = clf.score(tfidf, data.iloc[test])

# 	print("Train unoptimized")
# 	tfidf = TFIDF(data.iloc[train]["Review"])
# 	tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
# 	clf_unoptimized = C45(tfidf, data.iloc[train])
Exemplo n.º 2
0
def preprocess_data(data):
    """Preprocess every entry of ``data['Review']``.

    Each review is passed to ``Preprocessor.preprocess`` and the items it
    returns are joined with single spaces. Progress is printed after every
    review, numbered from 1.

    Args:
        data: Object indexable with ``'Review'`` that yields the raw reviews.

    Returns:
        list: One space-joined string per review, in input order.
    """
    print("Preprocess data...")
    cleaner = Preprocessor()
    cleaned = []
    for count, text in enumerate(data['Review'], start=1):
        tokens = cleaner.preprocess(text)
        cleaned.append(" ".join(tokens))
        print(f"Review {count} preprocessed")
    return cleaned


# One-off import + preprocessing step, kept commented out; the script instead
# reloads a previously saved pickle further down.
# data = import_data('../data/Avg_55,26.xlsx')
# data['Review'] = preprocess_data(data)
storage = Storage()
# storage.save(data, f"pickle/default-{time.time()}.pckl")
data = storage.load("pickle/default-1541653057.8427656.pckl")
# fold_data returns positional indices for the train and test rows
# (they are fed to .iloc below).
train, test = fold_data(data)
train_data = data.iloc[train]
test_data = data.iloc[test]

# The number of terms in the training vocabulary is passed to PSO as its
# first argument.
tfidf = TFIDF(train_data['Review'])
num_attrs = len(tfidf.termIndex)
# Baseline (plain C4.5) run, disabled; the trailing number is the score
# recorded by the author.
# clf = C45(tfidf, data)
# clf.train()

# score = clf.score(tfidf, test_data)
# print(score) # 0.3630573248407643

# Positional arguments are presumably (particle size, population size,
# iterations, c1, c2, target score) — TODO confirm against PSO.__init__.
pso = PSO(num_attrs, 20, 20, 0.7, 0.5, 0.99)
result = pso.exec(train_data, test_data)
# Fixed file name: every run overwrites the previous result.
storage.save(result, f"pickle/pso-1.pckl")
Exemplo n.º 3
0
class MainControl():
    """Controller connecting the GUI to the text-classification pipeline.

    It covers Excel import, review preprocessing, k-fold splitting, C4.5
    training/testing and PSO optimisation. Intermediate artefacts are
    exchanged between steps as pickles under ``data/`` via ``self.storage``.
    """

    def __init__(self, UI):
        """Create the helper objects used by the other methods.

        Args:
            UI: Main window object. Kept for interface compatibility; the
                constructor itself does not use it.
        """
        self.preprocessor = Preprocessor()
        self.k = 0  # number of folds; stays 0 until fold_data() is called
        self.storage = Storage()
        self.threadpool = QThreadPool()

    @staticmethod
    def _warning_box(title, text):
        """Build (but do not show) a warning QMessageBox with an OK button."""
        box = QMessageBox()
        box.setIcon(QMessageBox.Warning)
        box.setWindowTitle(title)
        box.setText(text)
        box.setStandardButtons(QMessageBox.Ok)
        return box

    def classifyReview(self, review, tree):
        """Classify a single review with a stored model.

        Args:
            review: The review to classify.
            tree: File name of the model inside ``./data/models/``.

        Returns:
            Whatever the loaded model's ``predict`` returns for ``[review]``.
        """
        clf = self.storage.load(f"./data/models/{tree}")
        return clf.predict(clf.vectors, [review])

    def import_excel(self, UI):
        """Ask the user for an Excel file and return its data (or None)."""
        return self.openFileDialog(UI)

    def openFileDialog(self, UI):
        """Show a file dialog and load the chosen Excel file.

        Args:
            UI: Parent widget for the dialog; also receives the warning box
                and status-bar message on failure.

        Returns:
            The result of ``DataImporter(fileName).get_data()``, or ``None``
            when the dialog is cancelled or loading fails.
        """
        try:
            options = QFileDialog.Options()
            options |= QFileDialog.DontUseNativeDialog
            fileName, _ = QFileDialog.getOpenFileName(
                UI,
                "Select Excel File",
                "",
                "Excel Files(*.xls *.xlsx)",
                options=options)
            if fileName:
                importer = DataImporter(fileName)
                return importer.get_data()
        except Exception:
            # Catch Exception rather than using a bare except so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            # The box is stored on UI so it is not garbage-collected while
            # shown non-modally.
            UI.msg = self._warning_box(
                "Warning", "File tidak memiliki kolom Review dan Label")
            UI.msg.show()
            UI.statusBar().showMessage("Import failed")
        return None

    def preprocess_data(self, UI, data):
        """Preprocess every review while reporting progress in the UI.

        The row being processed is highlighted in ``UI.tableWidget`` and the
        time taken per review is appended to ``UI.logOutput``.

        Args:
            UI: Window exposing ``tableWidget`` and ``logOutput``.
            data: Table with ``"Review"`` and ``"Label"`` columns; ``len``
                must give the number of rows.

        Returns:
            list: One space-joined preprocessed string per review.
        """
        totalTime = 0
        resultReview = []
        for i, (review, _label) in enumerate(zip(data["Review"],
                                                 data["Label"])):
            if i > 0:
                # Remove the highlight from the previously processed row.
                UI.tableWidget.item(i - 1,
                                    0).setBackground(QColor(255, 255, 255))
            UI.tableWidget.item(i, 0).setBackground(QColor(255, 128, 128))
            startTime = time.time()
            preprocessedReview = " ".join(self.preprocessor.preprocess(review))
            endTime = time.time()
            resultReview.append(preprocessedReview)
            UI.logOutput.append(
                f"Review {i + 1} preprocessed in {round(endTime - startTime, 2)}s"
            )
            totalTime += (endTime - startTime)
            UI.tableWidget.scrollToItem(UI.tableWidget.item(i - 1, 0),
                                        QAbstractItemView.PositionAtCenter)
            # Keep the GUI responsive during the long-running loop.
            QApplication.processEvents()
        dlen = len(data)
        if dlen > 0:
            # Only reset the last row when there is one: with no rows,
            # item(-1, 0) is not a usable item and calling setBackground on
            # it would raise.
            UI.tableWidget.item(dlen - 1,
                                0).setBackground(QColor(255, 255, 255))
            UI.tableWidget.scrollToItem(UI.tableWidget.item(dlen - 1, 0),
                                        QAbstractItemView.PositionAtCenter)
        UI.logOutput.append(
            f"{dlen} review(s) preprocessed in {round(totalTime, 2)}s")
        return resultReview

    def save_data(self, data):
        """Persist the preprocessed data set to its fixed pickle path."""
        self.storage.save(data, "data/preprocessed/preprocessed.pckl")

    def fold_data(self, k, UI=None):
        """Split the preprocessed data into ``k`` folds and save each split.

        Fold ``i`` (1-based) is written to ``data/folds/train{i}.pckl`` and
        ``data/folds/test{i}.pckl``. The thread pool is resized to ``k`` so
        every fold can be trained in its own worker.

        Args:
            k: Number of folds.
            UI: Optional window; when given, a line is appended to its log.
        """
        self.k = k
        self.threadpool.setMaxThreadCount(self.k)
        self.data = self.storage.load("data/preprocessed/preprocessed.pckl")
        # Fixed random_state keeps the shuffled split reproducible.
        kf = KFold(n_splits=self.k, shuffle=True, random_state=2)
        for i, (train, test) in enumerate(kf.split(self.data)):
            self.storage.save(self.data.iloc[train],
                              f"data/folds/train{i + 1}.pckl")
            self.storage.save(self.data.iloc[test],
                              f"data/folds/test{i + 1}.pckl")
        if UI is not None:
            UI.logOutput.append(f"Data folded by {k}")

    def mltrain_fn(self, params=None):
        """Train one C4.5 tree on a stored training fold (worker body).

        Args:
            params: Dict with keys ``'i'`` (0-based fold index),
                ``'remove_zero_tfidf'`` (bool) and ``'UI'`` (window or
                None). When omitted, the defaults ``i=None``,
                ``remove_zero_tfidf=False``, ``UI=None`` are used.

        Returns:
            tuple: ``(i, classifier, tfidf, UI or None)``.
        """
        if params is None:
            # Build the defaults per call instead of sharing one mutable
            # dict between all calls.
            params = {'i': None, 'remove_zero_tfidf': False, 'UI': None}
        train = self.storage.load(f"data/folds/train{params['i'] + 1}.pckl")
        tfidf = TFIDF(train["Review"])
        if params['remove_zero_tfidf']:
            tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.4)
        clf = C45(tfidf, train)
        clf.train()
        return params["i"], clf, tfidf, params['UI'] or None

    def mltrain_result(self, res):
        """Store the outcome of one ``mltrain_fn`` call.

        Args:
            res: The ``(i, classifier, tfidf, UI)`` tuple from
                ``mltrain_fn``.
        """
        fold, clf, tfidf, ui = res[0], res[1], res[2], res[3]
        # NOTE(review): if count_vect is a scikit-learn vectorizer,
        # get_feature_names() no longer exists in recent releases — confirm
        # the pinned version.
        self.attrs[fold] = tfidf.count_vect.get_feature_names()
        self.clfs[fold] = clf
        self.tfidfs[fold] = tfidf
        self.storage.save(clf, f"data/models/tree{fold + 1}.pckl")
        if ui is not None:
            ui.logOutput.append(f"Tree {fold + 1} trained")

    def train_model(self, UI=None):
        """Train one tree per fold on the thread pool.

        Args:
            UI: Optional window used for log output and warnings.

        Returns:
            ``False`` when no folds exist and a UI was given (a warning is
            shown); otherwise the list of feature names per fold.
        """
        # NOTE(review): when k <= 0 and UI is None this guard is skipped and
        # an empty list is returned further down.
        if self.k <= 0 and UI is not None:
            UI.msg = self._warning_box(
                "Warning",
                "Anda harus membagi data menggunakan k-fold terlebih dahulu")
            UI.msg.show()
            UI.statusBar().showMessage("Train and test failed")
            return False

        # Placeholder slots, filled per fold index by mltrain_result().
        self.attrs = [0] * self.k
        self.clfs = [0] * self.k
        self.tfidfs = [0] * self.k
        el = QEventLoop()
        for i in range(self.k):
            if UI is not None:
                UI.logOutput.append(f"Train tree {i + 1}")
            params = {'i': i, 'remove_zero_tfidf': True, 'UI': UI}
            worker = Worker(self.mltrain_fn, params)
            worker.signals.result.connect(self.mltrain_result)
            self.threadpool.start(worker)

        self.threadpool.waitForDone()
        # Presumably flushes the queued result signals so mltrain_result()
        # has run for every fold before returning — TODO confirm.
        el.processEvents()
        el.exit()
        if UI is not None:
            UI.logOutput.append("Training completed")
        return self.attrs

    def test_model(self):
        """Score every trained tree on its test fold.

        Each score is stored on the classifier via ``set_score`` and the
        classifier is re-saved.

        Returns:
            list: One score per fold, in fold order.
        """
        scores = []
        for i in range(self.k):
            test = self.storage.load(f"data/folds/test{i + 1}.pckl")
            score = self.clfs[i].score(self.tfidfs[i], test)
            self.clfs[i].set_score(score)
            self.storage.save(self.clfs[i], f"data/models/tree{i + 1}.pckl")
            scores.append(score)
        return scores

    def optimize_model(self, popSize, numIteration, c1, c2, target):
        """Run PSO for every fold and save the best particle of each.

        Args:
            popSize: Passed to ``PSO`` as its second argument.
            numIteration: Passed to ``PSO`` as its third argument.
            c1: Passed to ``PSO`` as its fourth argument.
            c2: Passed to ``PSO`` as its fifth argument.
            target: Added to the stored score of the fold's tree to form the
                last ``PSO`` argument.

        Returns:
            list: The value returned by ``PSO.exec`` for each fold.
        """
        results = []
        for i in range(self.k):
            train = self.storage.load(f"data/folds/train{i + 1}.pckl")
            test = self.storage.load(f"data/folds/test{i + 1}.pckl")
            clf = self.storage.load(f"data/models/tree{i + 1}.pckl")
            particleSize = len(clf.termsInfo)
            pso = PSO(particleSize, popSize, numIteration, c1, c2,
                      clf.get_score() + target)
            bestParticle = pso.exec(train, test)
            results.append(bestParticle)
            self.storage.save(bestParticle,
                              f"data/particles/particle{i + 1}.pckl")
        return results

    def get_data(self, kth, dstype):
        """Load one stored fold.

        Args:
            kth: 1-based fold number used in the file name.
            dstype: ``"Training Data"`` selects the training split; any
                other value selects the test split.
        """
        t = "train" if dstype == "Training Data" else "test"
        return self.storage.load(f"data/folds/{t}{kth}.pckl")

    def load_data(self, path):
        """Load a pickle, or show a modal warning when it does not exist.

        Returns:
            The loaded object, or ``None`` if ``path`` does not exist.
        """
        if os.path.exists(path):
            return self.storage.load(path)
        msg = self._warning_box("Error", "Data yang dimuat tidak ada")
        msg.exec_()
        return None
Exemplo n.º 4
0
# Compare C4.5 trained on the PSO-selected terms against plain C4.5 over a
# reproducible 10-fold split. `data`, `features`, `storage`, `c45` and
# `pso_c45` are expected to exist already; they are not defined in this part.
kf = KFold(n_splits=10, shuffle=True, random_state=2)
for i, (train, test) in enumerate(kf.split(data)):
	print("Train optimized")
	tfidf = TFIDF(data.iloc[train]["Review"])
	tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
	# Restrict the vocabulary to the selected terms. The `i` inside the
	# comprehension is local to it and unused.
	tfidf.termIndex = {key:val for i, (key, val) in enumerate(tfidf.termIndex.items()) if key in features}
	clf = C45(tfidf, data.iloc[train])
	clf.train()
	result = clf.score(tfidf, data.iloc[test])

	print("Train unoptimized")
	# A fresh TFIDF is built because the one above had its termIndex replaced.
	tfidf = TFIDF(data.iloc[train]["Review"])
	tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.5)
	clf_unoptimized = C45(tfidf, data.iloc[train])
	clf_unoptimized.train()
	result_unoptimized = clf_unoptimized.score(tfidf, data.iloc[test])
	print(result_unoptimized, result)
	c45.append(result_unoptimized)
	pso_c45.append(result)

# NOTE(review): these save only the scores of the LAST fold, not the
# per-fold lists `pso_c45` / `c45` — confirm this is intended.
storage.save(result, './pickle/res-optimized-3.pckl')
storage.save(result_unoptimized, './pickle/res.pckl')
# Ten evenly spaced x positions from 0 to 10 inclusive (not the integer fold
# numbers); the plot calls below require c45 / pso_c45 to hold ten values.
x_axis = np.linspace(0, 10, 10)
plt.plot(x_axis, c45, label="C4.5")
plt.scatter(x_axis, c45)
plt.plot(x_axis, pso_c45, label="PSO - C4.5")
plt.scatter(x_axis, pso_c45)
plt.grid()
plt.legend()
plt.show()
Exemplo n.º 5
0

storage = Storage()

# Load a particle saved by an earlier PSO run and the stored data set, then
# derive the selected terms from the training-split vocabulary.
particle = storage.load('./pickle/pso-3.pckl')
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
# Index the term array with the mask (boolean selection, assuming map_bool
# yields plain bools — TODO confirm).
features = features[pos]

# From here on `data` is rebound to the freshly imported spreadsheet; the
# pickled data set loaded above is no longer referenced.
data = import_data('../data/Avg_55,26.xlsx')
data['Review'] = preprocess_data(data, features)
# Fixed file name: every run overwrites the previous result.
storage.save(data, f"pickle/selected-3.pckl")
# data = storage.load('./pickle/selected-1542024722.200629.pckl')
# Dump every (label, review) pair for manual inspection.
for review, label in zip(data["Review"], data["Label"]):
    print(label, review)
tfidf = TFIDF(data["Review"])
# NOTE(review): despite its name this mapping is an identity map (each label
# maps to itself), so `translated_labels` equals the original labels —
# confirm whether real translations were intended.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
# Colour name associated with each label.
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
# Raises KeyError for any label that is not one of the three keys above.
translated_labels = [english_labels[label] for label in data["Label"]]