Example #1
def map_bool(x):
    if x == 1:
        return True
    return False


def fold_data(data, k=2):
    kf = KFold(n_splits=k, shuffle=True, random_state=2)
    for train, test in kf.split(data):
        return train, test


def import_data(filename):
    print(f"Import {filename}")
    importer = DataImporter(filename)
    return importer.get_data()


storage = Storage()
particle = storage.load('./pickle/pso-1.pckl')
# particle.position is a 0/1 vector; convert it to a boolean mask for feature selection
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

new_data = import_data('../data/Avg_55,26.xlsx')
new_data['Review'] = preprocess_data(new_data, features)

storage.save(new_data, f"pickle/preprocessed-{time.time()}.pckl")

# kf = KFold(n_splits=10, shuffle=True, random_state=2)
Example #2
def fold_data(data, k=2):
    kf = KFold(n_splits=k, shuffle=True, random_state=2)
    for train, test in kf.split(data):
        return train, test


def preprocess_data(data):
    print(f"Preprocess data...")
    preprocessor = Preprocessor()
    result = []
    for i, review in enumerate(data['Review']):
        result.append(" ".join(preprocessor.preprocess(review)))
        print(f"Review {i + 1} preprocessed")
    return result


# data = import_data('../data/Avg_55,26.xlsx')
# data['Review'] = preprocess_data(data)
storage = Storage()
# storage.save(data, f"pickle/default-{time.time()}.pckl")
data = storage.load("pickle/default-1541653057.8427656.pckl")
train, test = fold_data(data)
train_data = data.iloc[train]
test_data = data.iloc[test]

tfidf = TFIDF(train_data['Review'])
num_attrs = len(tfidf.termIndex)
# clf = C45(tfidf, data)
# clf.train()

# score = clf.score(tfidf, test_data)
# print(score) # 0.3630573248407643

pso = PSO(num_attrs, 20, 20, 0.7, 0.5, 0.99)
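
The excerpt stops right after the PSO object is constructed. A minimal sketch of how it is presumably driven, based on optimize_model in Example #10, where pso.exec(train, test) returns the best particle; the output filename is only an assumption.

# Sketch (assumption): run the optimizer on the folded data and pickle the result,
# mirroring optimize_model in Example #10; the filename below is hypothetical.
best_particle = pso.exec(train_data, test_data)
storage.save(best_particle, "pickle/pso-1.pckl")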
Example #3
import sys, os
cwd = os.getcwd().split("\\")
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
data = s.load("pickle/default-1541653057.8427656.pckl")
tfidf = TFIDF(data["Review"])
# Label glosses: "Berdampak positif" = positive impact, "Berdampak negatif" =
# negative impact, "Netral" = neutral. The mapping is an identity so the original
# Indonesian labels are kept unchanged.
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
x_std = np.std(data2D[:, 0])
y_std = np.std(data2D[:, 1])
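
The listing is cut before anything is plotted. A minimal sketch of a plausible continuation, assuming x_std and y_std are meant to bound the axes; the scatter call and labels mirror Example #12.

# Sketch (assumption): scatter the 2D projection coloured by label and use the
# standard deviations computed above to bound the axes.
plt.scatter(data2D[:, 0], data2D[:, 1], c=colors, s=10)
plt.xlim(-3 * x_std, 3 * x_std)
plt.ylim(-3 * y_std, 3 * y_std)
plt.xlabel("Komponen 1")   # "Component 1"
plt.ylabel("Komponen 2")   # "Component 2"
plt.show()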
Example #4
import sys, os
cwd = os.getcwd().split("\\")
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

import numpy as np, random, math, pandas as pd
from libs.TFIDF import TFIDF
from entities.Storage import Storage

storage = Storage()
data = storage.load('./pickle/default-1541653057.8427656.pckl')
tfidf = TFIDF(data['Review'])
print(len(tfidf.weights[0]))

df = pd.DataFrame(tfidf.weights)
df.to_excel('tfidf.xlsx')
Example #5
import sys, os
cwd = os.getcwd().split("\\")
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from libs.DataImporter import DataImporter
from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
data = s.load("pickle/default-1541653057.8427656.pckl")
tfidf = TFIDF(data["Review"])
english_labels = {
	"Berdampak positif": "Berdampak positif",
	"Berdampak negatif": "Berdampak negatif",
	"Netral": "Netral"	
}
groups = {
	"Berdampak positif": "green",
	"Berdampak negatif": "red",
	"Netral": "blue"	
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
x_std = np.std(data2D[:, 0])
Example #6
import sys, os
cwd = os.getcwd().split("\\")
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

import numpy as np, random, math
from libs.TFIDF import TFIDF
from libs.C45 import C45
from entities.Storage import Storage
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

storage = Storage()
particle = storage.load('./pickle/pso-3.pckl')
data = storage.load('./pickle/default-1541653057.8427656.pckl')

def map_bool(x):
	if x == 1:
		return True
	return False

def fold_data(data, k = 2):
	kf = KFold(n_splits=k, shuffle=True, random_state=2)
	for train, test in kf.split(data):
		return train, test

train_idx, test_idx = fold_data(data)

pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
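
This excerpt stops right after building the TF-IDF model on the training split; Examples #1 and #11 continue the same pattern by masking the vocabulary with the particle's boolean position, reproduced here as the likely next step.

# Continuation as in Examples #1 and #11: keep only the terms the particle selected.
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]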
Example #7
from entities.Storage import Storage
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize, LabelEncoder
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report
from scipy import interp  # deprecated alias of numpy.interp, removed in recent SciPy releases
from itertools import cycle
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
gradboost = OneVsRestClassifier(GradientBoostingClassifier(random_state=7))
labels = ["Berdampak positif", "Berdampak negatif", "Netral"]
le = LabelEncoder()

all_predict_probas = np.array([])
all_tests = np.array([])
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(10):
	train = s.load(f"data/folds/train{i + 1}.pckl")
	test = s.load(f"data/folds/test{i + 1}.pckl")

	train_vect = count_vect.fit_transform(train["Review"])
	train_tfidf = tfidf_transformer.fit_transform(train_vect)
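
	# Sketch (assumption): the excerpt is cut here. A typical continuation fits the
	# one-vs-rest model and accumulates per-fold probabilities for the ROC curves
	# prepared above; names such as test_tfidf, y_train and y_test are hypothetical.
	test_vect = count_vect.transform(test["Review"])
	test_tfidf = tfidf_transformer.transform(test_vect)

	y_train = label_binarize(train["Label"], classes=labels)
	y_test = label_binarize(test["Label"], classes=labels)

	gradboost.fit(train_tfidf, y_train)
	probas = gradboost.predict_proba(test_tfidf)

	all_predict_probas = probas if all_predict_probas.size == 0 else np.vstack((all_predict_probas, probas))
	all_tests = y_test if all_tests.size == 0 else np.vstack((all_tests, y_test))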
Example #8
import sys, os
cwd = os.getcwd().split("\\")
sys.path.append(".." if cwd[-1] == "revisions" else "revisions/..")

from entities.Storage import Storage

s = Storage()

for i in range(10):
	tree = s.load(f"../data/models/tree{i + 1}.pckl")
	tree.show_tree()
	break
Example #9
def __init__(self, UI):
    self.preprocessor = Preprocessor()
    self.k = 0
    self.storage = Storage()
    self.threadpool = QThreadPool()
Example #10
class MainControl():
    def __init__(self, UI):
        self.preprocessor = Preprocessor()
        self.k = 0
        self.storage = Storage()
        self.threadpool = QThreadPool()

    def classifyReview(self, review, tree):
        clf = self.storage.load(f"./data/models/{tree}")
        return clf.predict(clf.vectors, [review])

    def import_excel(self, UI):
        return self.openFileDialog(UI)

    def openFileDialog(self, UI):
        try:
            options = QFileDialog.Options()
            options |= QFileDialog.DontUseNativeDialog
            fileName, _ = QFileDialog.getOpenFileName(
                UI,
                "Select Excel File",
                "",
                "Excel Files(*.xls *.xlsx)",
                options=options)
            if (fileName):
                importer = DataImporter(fileName)
                return importer.get_data()
        except Exception:
            UI.msg = QMessageBox()
            UI.msg.setIcon(QMessageBox.Warning)
            UI.msg.setWindowTitle("Warning")
            UI.msg.setText("File tidak memiliki kolom Review dan Label")
            UI.msg.setStandardButtons(QMessageBox.Ok)
            UI.msg.show()
            UI.statusBar().showMessage("Import failed")
        return None

    def preprocess_data(self, UI, data):
        totalTime = 0
        resultReview = []
        for i, (review, label) in enumerate(zip(data["Review"],
                                                data["Label"])):
            if i > 0:
                UI.tableWidget.item(i - 1,
                                    0).setBackground(QColor(255, 255, 255))
            UI.tableWidget.item(i, 0).setBackground(QColor(255, 128, 128))
            startTime = time.time()
            preprocessedReview = " ".join(self.preprocessor.preprocess(review))
            endTime = time.time()
            resultReview.append(preprocessedReview)
            UI.logOutput.append(
                f"Review {i + 1} preprocessed in {round(endTime - startTime, 2)}s"
            )
            totalTime += (endTime - startTime)
            UI.tableWidget.scrollToItem(UI.tableWidget.item(i - 1, 0),
                                        QAbstractItemView.PositionAtCenter)
            QApplication.processEvents()
        dlen = len(data)
        UI.tableWidget.item(dlen - 1, 0).setBackground(QColor(255, 255, 255))
        UI.tableWidget.scrollToItem(UI.tableWidget.item(dlen - 1, 0),
                                    QAbstractItemView.PositionAtCenter)
        UI.logOutput.append(
            f"{dlen} review(s) preprocessed in {round(totalTime, 2)}s")
        return resultReview

    def save_data(self, data):
        self.storage.save(data, "data/preprocessed/preprocessed.pckl")

    def fold_data(self, k, UI=None):
        self.k = k
        self.threadpool.setMaxThreadCount(self.k)
        self.data = self.storage.load("data/preprocessed/preprocessed.pckl")
        kf = KFold(n_splits=self.k, shuffle=True, random_state=2)
        for i, (train, test) in enumerate(kf.split(self.data)):
            self.storage.save(self.data.iloc[train],
                              f"data/folds/train{i + 1}.pckl")
            self.storage.save(self.data.iloc[test],
                              f"data/folds/test{i + 1}.pckl")
        if UI is not None:
            UI.logOutput.append(f"Data folded by {k}")

    # Note: params is a mutable default argument; it is safe here only because it is never mutated.
    def mltrain_fn(self,
                   params={
                       'i': None,
                       'remove_zero_tfidf': False,
                       'UI': None
                   }):
        train = self.storage.load(f"data/folds/train{params['i'] + 1}.pckl")
        tfidf = TFIDF(train["Review"])
        if params['remove_zero_tfidf']:
            tfidf.weights = tfidf.remove_zero_tfidf(tfidf.weights, 0.4)
        clf = C45(tfidf, train)
        clf.train()
        return params["i"], clf, tfidf, params['UI'] or None

    def mltrain_result(self, res):
        self.attrs[res[0]] = res[2].count_vect.get_feature_names()
        self.clfs[res[0]] = res[1]
        self.tfidfs[res[0]] = res[2]
        self.storage.save(res[1], f"data/models/tree{res[0] + 1}.pckl")
        if res[3] is not None:
            res[3].logOutput.append(f"Tree {res[0] + 1} trained")

    def train_model(self, UI=None):
        if self.k <= 0 and UI is not None:
            UI.msg = QMessageBox()
            UI.msg.setIcon(QMessageBox.Warning)
            UI.msg.setWindowTitle("Warning")
            # Message: "You must split the data with k-fold first"
            UI.msg.setText(
                "Anda harus membagi data menggunakan k-fold terlebih dahulu")
            UI.msg.setStandardButtons(QMessageBox.Ok)
            UI.msg.show()
            UI.statusBar().showMessage("Train and test failed")
            return False

        self.attrs = [0 for _ in range(self.k)]
        self.clfs = [0 for _ in range(self.k)]
        self.tfidfs = [0 for _ in range(self.k)]
        el = QEventLoop()
        for i in range(self.k):
            if UI is not None:
                UI.logOutput.append(f"Train tree {i + 1}")
            params = {'i': i, 'remove_zero_tfidf': True, 'UI': UI}
            worker = Worker(self.mltrain_fn, params)
            worker.signals.result.connect(self.mltrain_result)
            self.threadpool.start(worker)

        self.threadpool.waitForDone()
        el.processEvents()
        el.exit()
        if UI is not None:
            UI.logOutput.append("Training completed")
        return self.attrs

    def test_model(self):
        scores = [0 for i in range(self.k)]
        for i in range(self.k):
            test = self.storage.load(f"data/folds/test{i + 1}.pckl")
            score = self.clfs[i].score(self.tfidfs[i], test)
            self.clfs[i].set_score(score)
            # self.clfs[i].scores = self.clfs[i].score(self.tfidfs[i], test)
            self.storage.save(self.clfs[i], f"data/models/tree{i + 1}.pckl")
            # scores[i] = self.clfs[i].scores
            scores[i] = score
        return scores

    def optimize_model(self, popSize, numIteration, c1, c2, target):
        results = []
        for i in range(self.k):
            train = self.storage.load(f"data/folds/train{i + 1}.pckl")
            test = self.storage.load(f"data/folds/test{i + 1}.pckl")
            clf = self.storage.load(f"data/models/tree{i + 1}.pckl")
            particleSize = len(clf.termsInfo)
            pso = PSO(particleSize, popSize, numIteration, c1, c2,
                      clf.get_score() + target)
            bestParticle = pso.exec(train, test)
            results.append(bestParticle)
            self.storage.save(bestParticle,
                              f"data/particles/particle{i + 1}.pckl")
        return results

    def get_data(self, kth, dstype):
        t = "train" if dstype == "Training Data" else "test"
        return self.storage.load(f"data/folds/{t}{kth}.pckl")

    def load_data(self, path):
        if os.path.exists(path):
            return self.storage.load(path)
        msg = QMessageBox()
        msg.setIcon(QMessageBox.Warning)
        msg.setWindowTitle("Error")
        msg.setText("Data yang dimuat tidak ada")
        msg.setStandardButtons(QMessageBox.Ok)
        msg.exec_()
        return None
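
A minimal usage sketch for this controller, assuming it can be driven with UI=None (the UI accesses in the methods used below are all guarded); the PSO arguments mirror Example #2 and the 0.05 target delta is hypothetical.

# Sketch (assumption): headless driving of MainControl. A Qt application object must
# already exist, since train_model spins a QEventLoop, and
# data/preprocessed/preprocessed.pckl must have been written by save_data.
control = MainControl(None)
control.fold_data(10)                                # writes data/folds/train*.pckl and test*.pckl
control.train_model()                                # one C4.5 tree per fold, saved to data/models/
scores = control.test_model()                        # per-fold scores
particles = control.optimize_model(20, 20, 0.7, 0.5, 0.05)
print(scores)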
Example #11
def fold_data(data, k=2):
    kf = KFold(n_splits=k, shuffle=True, random_state=2)
    for train, test in kf.split(data):
        return train, test


def preprocess_data(data, selected_attr):
    print(f"Preprocess data...")
    preprocessor = Preprocessor()
    result = []
    for i, review in enumerate(data['Review']):
        result.append(" ".join(
            preprocessor.selected_preprocess(review, selected_attr)))
        print(f"Review {i + 1} preprocessed")
    return result


storage = Storage()

particle = storage.load('./pickle/pso-3.pckl')
data = storage.load('./pickle/default-1541653057.8427656.pckl')
train_idx, test_idx = fold_data(data)
pos = particle.position.astype(bool)
pos = [map_bool(x) for x in pos]
selected_tfidf = TFIDF(data.iloc[train_idx]['Review'])
features = np.array(list(selected_tfidf.termIndex.keys()))
features = features[pos]

data = import_data('../data/Avg_55,26.xlsx')
data['Review'] = preprocess_data(data, features)
storage.save(data, f"pickle/selected-3.pckl")
# data = storage.load('./pickle/selected-1542024722.200629.pckl')
for review, label in zip(data["Review"], data["Label"]):
Example #12
from sklearn.decomposition import PCA
from entities.Storage import Storage
from libs.TFIDF import TFIDF
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np

s = Storage()
data = s.load("../pckl2/preprocessed/preprocessed.pckl")
tfidf = TFIDF(data["Review"])
english_labels = {
    "Berdampak positif": "Berdampak positif",
    "Berdampak negatif": "Berdampak negatif",
    "Netral": "Netral"
}
groups = {
    "Berdampak positif": "green",
    "Berdampak negatif": "red",
    "Netral": "blue"
}
translated_labels = [english_labels[label] for label in data["Label"]]
colors = np.array([groups[x] for x in translated_labels])

pca = PCA(n_components=2).fit(tfidf.weights)
data2D = pca.transform(tfidf.weights)
x_std = np.std(data2D[:, 0])
y_std = np.std(data2D[:, 1])

plt.xlabel("Komponen 1")
plt.ylabel("Komponen 2")
plt.title("Distribusi Titik Data Ulasan Grafik 2D")
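
The final excerpt ends just before the plot is drawn; a minimal completion using the variables already computed above.

# Sketch: draw the 2D projection coloured by label and display it.
plt.scatter(data2D[:, 0], data2D[:, 1], c=colors, s=10)
plt.show()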