Example #1
import os

import nltk
import pandas as pd

# NLPHelper, Utility, and FeatureExtractor are project-local helper classes.


class CheckDataset(object):
    def __init__(self):
        self.nlp = NLPHelper()
        self.ut = Utility()
        self.fex = FeatureExtractor()

    def checkData(self):
        path = "scenario2_fullidn_pickle/"
        filelist = os.listdir(path)
        data = pd.DataFrame()

        for idx, file in enumerate(filelist):

            # open the pickle file containing the NER, coref, and POS data of a news text
            pkl_dict = self.ut.loadPickle(os.path.join(path, file))
            # print(pkl_dict['ner'])
            # entities = self.fex.extractBefEntity(pkl_dict['ner'])
            filename = pkl_dict['filename']

            df = self.countTermFrequency(pkl_dict['text'])
            df['filename'] = filename
            data = pd.concat([data, df])  # DataFrame.append was removed in pandas 2.x
            # df['entities'] = entities
        self.ut.convertToExcel("TF.xlsx", data, 'Sheet1')

    def countTermFrequency(self, text):
        # tokenize with NLTK; word_tokenize needs the 'punkt' models (nltk.download('punkt'))
        words = nltk.word_tokenize(text)
        fdist = nltk.FreqDist(words)

        df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
        df.columns = ['term', 'frequency']
        # for word, frequency in fdist.most_common(50):
        #     print(u'{}:{}'.format(word, frequency))
        return df
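
For context, countTermFrequency is a thin wrapper around NLTK's FreqDist. A minimal standalone sketch of the same idea follows; the sample sentence is illustrative only, not from the project's data:

import nltk
import pandas as pd

nltk.download('punkt', quiet=True)  # tokenizer models required by word_tokenize

text = "the cat sat on the mat and the dog sat down too"
words = nltk.word_tokenize(text)
fdist = nltk.FreqDist(words)  # Counter-like mapping: term -> frequency

df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
df.columns = ['term', 'frequency']
print(df.sort_values('frequency', ascending=False).head())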
Example #2
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix)

# Utility is a project-local helper class.


class ModelTrainer(object):
    def __init__(self):
        self.ut = Utility()

    def train(self, dataset, drop_element):
        # classifier algorithm; n_estimators = number of trees, random_state = any
        # fixed number, set deliberately so repeated runs give the same result
        clf = RandomForestClassifier(n_estimators=3, random_state=2)  # worth tuning

        # extract the features needed; drop the entity columns
        dataset = dataset.drop(['entity', 'id_text', drop_element], axis=1)
        # convert column types to numeric
        dataset = self.ut.convertToNumeric(dataset)
        # dataset = self.ut.oneHotEncoding(dataset)

        # determine which columns are features and which is the label
        X = dataset.iloc[:, :-1]  # every row, all columns except the last (features)
        y = dataset.iloc[:, -1]  # every row, the last column only (label)

        # get training score using cross validation
        result = self.nFoldCrossValidation(X, y, clf, nfold=10)

        if drop_element == 'who':

            # train and save the model to a pickle file
            # (after cross validation, clf holds the fit from the last fold)
            # scenario 1
            # joblib.dump(clf,'model/scen1_train_where_halfidn.pkl')
            # joblib.dump(clf,'model/HO_scen1_train_where_halfidn.pkl')
            # joblib.dump(clf,'model/3_scen1_train_where_halfidn.pkl')
            # joblib.dump(clf,'model/HO2_scen1_train_where_halfidn.pkl')
            # # scenario 2
            # joblib.dump(clf,'model/scen2_train_where_fullidn.pkl')
            # joblib.dump(clf,'model/HO_scen2_train_where_fullidn.pkl')
            # joblib.dump(clf,'model/3_scen2_train_where_fullidn.pkl')
            # joblib.dump(clf,'model/HO2_scen2_train_where_fullidn.pkl')
            # # scenario 3
            # joblib.dump(clf,'model/scen3_train_where_default.pkl')
            # joblib.dump(clf,'model/HO_scen3_train_where_default.pkl')
            joblib.dump(clf, 'model/3_scen3_train_where_default.pkl')
            # joblib.dump(clf,'model/HO2_scen3_train_where_default.pkl')
            # testing
            # joblib.dump(clf,'model/s2_testing_where.pkl')
            print("Model for WHERE has been saved")

            # scenario 1
            # self.ut.convertToExcel("./result/scenario1_halfidn_WHERE_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO_scenario1_halfidn_WHERE_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario1_halfidn_WHERE_10fold.xlsx",result,"Sheet1")
            # scenario 2
            # self.ut.convertToExcel("./result/3_scenario2_fullidn_WHERE_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario2_fullidn_WHERE_10fold.xlsx",result,"Sheet1")
            # scenario 3
            self.ut.convertToExcel(
                "./result/3_scenario3_default_WHERE_10fold.xlsx", result,
                "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario3_default_WHERE_10fold.xlsx",result,"Sheet1")

            # scenario testing
            # self.ut.convertToExcel("./result/s2_testing_WHERE_10fold.xlsx",result,"Sheet1")
            print(
                "Cross Validation for WHERE model has been saved to excel file!"
            )

        elif drop_element == 'where':
            # train and save the model to a pickle file
            # (after cross validation, clf holds the fit from the last fold)
            # scenario 1
            # joblib.dump(clf,'model/scen1_train_who_halfidn.pkl')
            # joblib.dump(clf,'model/HO_scen1_train_who_halfidn.pkl')
            # joblib.dump(clf,'model/3_scen1_train_who_halfidn.pkl')
            # joblib.dump(clf,'model/HO2_scen1_train_who_halfidn.pkl')
            # # scenario 2
            # joblib.dump(clf,'model/scen2_train_who_fullidn.pkl')
            # joblib.dump(clf,'model/HO_scen2_train_who_fullidn.pkl')
            # joblib.dump(clf,'model/3_scen2_train_who_fullidn.pkl')
            # joblib.dump(clf,'model/HO2_scen2_train_who_fullidn.pkl')
            # # scenario 3
            # joblib.dump(clf,'model/scen3_train_who_default.pkl')
            # joblib.dump(clf,'model/HO_scen3_train_who_default.pkl')
            joblib.dump(clf, 'model/3_scen3_train_who_default.pkl')
            # joblib.dump(clf,'model/HO2_scen3_train_who_default.pkl')
            # testing
            # joblib.dump(clf,'model/s2_testing_who.pkl')
            print("Model for WHO has been saved")

            # scenario 1
            # self.ut.convertToExcel("./result/scenario1_halfidn_WHO_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO_scenario1_halfidn_WHO_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario1_halfidn_WHO_10fold.xlsx",result,"Sheet1")
            # scenario 2
            # self.ut.convertToExcel("./result/3_scenario2_fullidn_WHO_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario2_fullidn_WHO_10fold.xlsx",result,"Sheet1")
            # scenario 3
            self.ut.convertToExcel(
                "./result/3_scenario3_default_WHO_10fold.xlsx", result,
                "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario3_default_WHO_10fold.xlsx",result,"Sheet1")

            # scenario testing
            # self.ut.convertToExcel("./result/s2_testing_WHO_10fold.xlsx",result,"Sheet1")
            print(
                "Cross Validation for WHO model has been saved to excel file!")

    # classic hold-out evaluation (single train/test split)
    def getEvaluationScore(self, X_test, y_test, model):
        y_pred = model.predict(X_test)
        print(y_pred)
        # experiment with the data so the scores stop coming out as 0; maybe undersampling or oversampling?
        print("Accuracy: ", round(accuracy_score(y_test, y_pred) * 100, 4))
        print("Precision: ", round(precision_score(y_test, y_pred) * 100, 4))
        print("Recall: ", round(recall_score(y_test, y_pred) * 100, 4))
        print("F-measure: ", round(f1_score(y_test, y_pred) * 100, 4))
        print("Confusion matrix:")
        print(confusion_matrix(y_test, y_pred))

    # cross validation
    def nFoldCrossValidation(self, X, y, model, nfold):
        # count the number of classes (currently unused)
        class_count = y.groupby(y).count().shape[0]
        # shuffle randomizes the data up front; a fixed random_state keeps repeated runs identical
        k_fold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=7)
        # initialize the score lists
        precision_list = []
        recall_list = []
        fscore_list = []
        train_score = []
        test_score = []

        # fold counter, starting from 1
        fold_count = 1
        print("Confusion matrix of " + model.__class__.__name__ + ":\n")

        # k_fold.split yields two arrays of row indices per fold: one for training, one for testing
        for train_indices, test_indices in k_fold.split(X, y):
            # separate the training and testing data (iloc locates rows by index)
            X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
            y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

            # fit trains the model on this fold
            model.fit(X_train, y_train)

            # predict on the training data
            predictions = model.predict(X_train)
            # training accuracy: compare the predictions against the true labels
            # (y_train), rounded to 4 decimal places
            train_score.append(round(accuracy_score(y_train, predictions), 4))

            # predict on the testing data
            predictions = model.predict(X_test)
            # testing accuracy: compare the predictions against the true labels (y_test)
            test_score.append(round(accuracy_score(y_test, predictions), 4))

            # precision, recall, and f-score compare the true labels (y_test) with the predictions
            precision_list.append(round(precision_score(y_test, predictions), 4))
            recall_list.append(round(recall_score(y_test, predictions), 4))
            fscore_list.append(round(f1_score(y_test, predictions), 4))

            # show the fold number and its confusion matrix
            print("Fold " + str(fold_count) + ":")
            # sklearn layout: [[TN, FP], [FN, TP]], i.e. bottom-right is true positives,
            # bottom-left false negatives, top-left true negatives, top-right false positives
            print(confusion_matrix(y_test, predictions))
            print()
            fold_count += 1

        # average the accuracy, precision, recall, and f-score over all folds
        acc_train = round(sum(train_score) / len(train_score), 4)
        acc_test = round(sum(test_score) / len(test_score), 4)
        precision = round(sum(precision_list) / len(precision_list), 4)
        recall = round(sum(recall_list) / len(recall_list), 4)
        f_score = round(sum(fscore_list) / len(fscore_list), 4)

        print("Evaluation using " + model.__class__.__name__ + ":\n")

        # collect the per-fold accuracy, precision, recall, and f-score into a DataFrame
        fold_index = [str(i + 1) for i in range(nfold)]  # fold labels
        fold_data = [
            fold_index, train_score, test_score, precision_list, recall_list,
            fscore_list
        ]
        fold_column = [
            'fold', 'acc_train', 'acc_test', 'precision', 'recall', 'f_score'
        ]  # column names
        df_fold = pd.DataFrame(np.column_stack(fold_data), columns=fold_column)
        df_fold = df_fold.set_index('fold')  # use the fold number as the index

        # print the results
        print(df_fold)
        print("=" * 50 + "\n")

        print('Total data classified:', len(X))
        # compare train vs. test accuracy to spot overfitting; if they are close, the model is probably sound
        print('Accuracy on Train:', acc_train)
        print('Accuracy on Test:', acc_test)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F-Score:', f_score)

        return df_fold
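
The manual loop in nFoldCrossValidation mostly reproduces what scikit-learn's cross_validate does out of the box. A minimal sketch of the equivalent call, using synthetic make_classification data purely for illustration (per-fold confusion matrices would still require the manual loop):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate

X, y = make_classification(n_samples=200, n_features=8, random_state=7)
clf = RandomForestClassifier(n_estimators=3, random_state=2)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

scores = cross_validate(clf, X, y, cv=cv,
                        scoring=('accuracy', 'precision', 'recall', 'f1'),
                        return_train_score=True)

df = pd.DataFrame(scores).round(4)
df.index = range(1, len(df) + 1)  # fold numbers, starting from 1
print(df[['train_accuracy', 'test_accuracy',
          'test_precision', 'test_recall', 'test_f1']])
print(df.mean().round(4))  # per-metric averages across folds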
Example #3
import os

import joblib
import pandas as pd

# Utility, FiveWExtractor, FeatureExtractor, NLPHelper, and ModelTrainer are
# project-local classes. Note: the DataFrame.append calls below require
# pandas < 2.0 (removed in 2.x; use pd.concat instead).


class EvaluateNews(object):
    def __init__(self):
        self.ut = Utility()
        self.fwe = FiveWExtractor()
        self.fex = FeatureExtractor()
        self.nlp = NLPHelper()
        self.tr = ModelTrainer()

    def evaluateGoldenDatasetNews(self, file_range=None):
        # example: file_range = (0, 10)
        # extract the 5W from each news text and save the results to Excel
        path = "./datasets/"
        filelist = os.listdir(path)
        data = pd.DataFrame()

        if file_range:
            filelist = filelist[file_range[0]:file_range[1]]

        for idx, file in enumerate(filelist):
            print(file)
            # open the JSON file containing the news text and title
            file_temp = self.ut.loadJSON(os.path.join(path, file))
            # extract the 5W from the JSON file
            try:
                temp = self.fwe.extract5w(file_temp["text"],
                                          file_temp["title"])
                temp["file"] = file
                data = data.append(temp, ignore_index=True)
            except Exception as e:
                print("5W extraction failed for " + file + ": " + str(e))

        self.ut.convertToExcel("idnhalf_goldendata_evaluate_089.xlsx", data,
                               'Sheet1')

        print("Evaluating golden data is done!")

    def extract5wLocalNewsForEval(self, filename):

        data = self.ut.loadCSV(filename, ',', "ISO-8859-1")

        data['extracted'] = data.apply(
            lambda x: self.fwe.extract5wLocalNews(x['text'], x['title']),
            axis=1)
        temp = pd.DataFrame()
        temp['title'] = data['extracted'].apply(lambda x: x['title'])
        temp['text'] = data['extracted'].apply(lambda x: x['text'])
        temp['who'] = data['extracted'].apply(lambda x: x['who'])
        temp['where'] = data['extracted'].apply(lambda x: x['where'])
        temp['what'] = data['extracted'].apply(lambda x: x['what'])
        temp['when'] = data['extracted'].apply(lambda x: x['when'])
        temp['why'] = data['extracted'].apply(lambda x: x['why'])

        # scenario 1
        # self.ut.convertToExcel("3_scen1_halfidn_evallocalnews.xlsx",temp,'Sheet1')
        # self.ut.convertToExcel("HO_scen1_halfidn_evallocalnews.xlsx",temp,'Sheet1')
        # scenario 2
        # self.ut.convertToExcel("3_scen2_fullidn_evallocalnews.xlsx",temp,'Sheet1')
        # self.ut.convertToExcel("HO_scen2_fullidn_evallocalnews.xlsx",temp,'Sheet1')
        # scenario 3
        self.ut.convertToExcel("3_scen3_default_evallocalnews.xlsx", temp,
                               'Sheet1')
        # self.ut.convertToExcel("HO_scen3_default_evallocalnews.xlsx",temp,'Sheet1')

        print("Evaluating local news is done!")

    def extractFeatureFromLocalNews(self, filename):
        data = self.ut.loadCSV(filename, ',', "ISO-8859-1")

        data['ner'] = data['text'].apply(lambda x: self.nlp.getNER(x))
        data['coref'] = data['text'].apply(lambda x: self.nlp.getCoref(x))

        feature = pd.DataFrame()
        for i in range(data.shape[0]):
            feature = feature.append(
                self.fex.extractFeaturesDirectFromText(data.iloc[i]),
                ignore_index=True)

        # scenario 1
        # self.ut.convertToExcel("scen1_halfidn_localfeature.xlsx",feature,'Sheet1')
        # scenario 2
        # self.ut.convertToExcel("scen2_fullidn_localfeature.xlsx",feature,'Sheet1')
        # scenario 3
        # self.ut.convertToExcel("scen3_default_localfeature.xlsx",feature,'Sheet1')

    def evaluateLocalWhoWhere(self, drop_element):

        # # scenario 1
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/3_scen1_train_where_halfidn.pkl')
        # model_who = joblib.load('model/3_scen1_train_who_halfidn.pkl')

        # scenario 2
        # dataset = pd.read_excel('scen2_fullidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/3_scen2_train_where_fullidn.pkl')
        # model_who = joblib.load('model/3_scen2_train_who_fullidn.pkl')

        # # scenario 3
        dataset = pd.read_excel('scen3_default_localfeature.xlsx',
                                sheet_name='Sheet1')
        model_where = joblib.load('model/3_scen3_train_where_default.pkl')
        model_who = joblib.load('model/3_scen3_train_who_default.pkl')

        # scenario HO -------------------------------
        # # # scenario 1
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen1_train_where_halfidn.pkl')
        # model_who = joblib.load('model/HO2_scen1_train_who_halfidn.pkl')

        # scenario 2
        # dataset = pd.read_excel('scen2_fullidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen2_train_where_fullidn.pkl')
        # model_who = joblib.load('model/HO2_scen2_train_who_fullidn.pkl')

        # # # scenario 3
        # dataset = pd.read_excel('scen3_default_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen3_train_where_default.pkl')
        # model_who = joblib.load('model/HO2_scen3_train_who_default.pkl')

        # scenario test
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/s2_testing_where.pkl')
        # model_who = joblib.load('model/s2_testing_who.pkl')

        if drop_element == 'who':
            self.evaluateModelLocal(dataset, 'who', model_where)
            print("Evaluation for WHERE's local classifier is done!")
        elif drop_element == 'where':
            self.evaluateModelLocal(dataset, 'where', model_who)
            print("Evaluation for WHO's local classifier is done!")

    def evaluateModelLocal(self, dataset, drop_element, model):
        dataset = self.ut.convertToNumeric(dataset)

        dataset = dataset.drop(['entity', drop_element], axis=1)
        # !!! FOR ONE HOT ENCODING !!!
        # dataset = self.ut.oneHotEncoding(dataset)

        # determine which columns are features and which is the label
        X = dataset.iloc[:, :-1]  # every row, all columns except the last (features)
        y = dataset.iloc[:, -1]  # every row, the last column only (label)

        # print the evaluation scores (getEvaluationScore returns nothing)
        # result = self.nFoldCrossValidation(X, y, clf, nfold=10)
        self.tr.getEvaluationScore(X, y, model)
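
The load-and-evaluate pattern in evaluateLocalWhoWhere depends on joblib round-tripping a fitted classifier. A self-contained sketch of that persistence pattern, with a toy dataset and a throwaway file name standing in for the project's model paths:

import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=100, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# fit once, persist to disk
clf = RandomForestClassifier(n_estimators=3, random_state=2).fit(X_train, y_train)
joblib.dump(clf, 'demo_model.pkl')

# reload later, e.g. in a separate evaluation script
model = joblib.load('demo_model.pkl')
print('Accuracy:', round(accuracy_score(y_test, model.predict(X_test)) * 100, 4))

One caveat: the feature columns fed to the reloaded model must match, in count and order, those it was trained on, which is why train and evaluateModelLocal drop the same columns before splitting X and y.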