def test_classifier(dataset, classifier):
    """evaluates a trained classifier on a test set and prints the results

    The data preparation is selected by the type of the classifier. For
    Keras models the second value returned by ``evaluate`` is printed as a
    percentage. For every classifier a classification report and a
    crosstab of true vs. predicted labels are printed.

    parameter:
    dataset: test data, handed to the matching dl.prepare_data_for_* helper
    classifier: trained model; needs ``predict`` (Keras models also
                ``evaluate`` and ``metrics_names``)
    """
    print("-----TESTING CLASSIFIER-----")
    # Sequential has to be checked BEFORE the generic Model: a Sequential
    # network is itself a Model instance, so with the Model check first the
    # Sequential branch could never run and feed-forward networks were
    # handed RNN-shaped input.
    if isinstance(classifier, keras.models.Sequential):
        x_test, y_test = dl.prepare_data_for_NN(dataset)
        print("-----TEST SET SIZE: " + str(len(x_test)) + "-----")
        scores = classifier.evaluate(x_test, y_test)
        print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1] * 100))
    elif isinstance(classifier, keras.engine.training.Model):
        x_test, y_test = dl.prepare_data_for_RNN(dataset)
        print("-----TEST SET SIZE: " + str(x_test["sentence1"].shape) + "-----")
        scores = classifier.evaluate(x_test, y_test)
        print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1] * 100))
    # Public import path of the same class; the private
    # ``sklearn.ensemble.forest`` module is not part of the public API.
    elif isinstance(classifier, sklearn.ensemble.RandomForestClassifier):
        x_test, y_test = dl.prepare_data_for_RF(dataset)
        print("-----TEST SET SIZE: " + str(len(x_test)) + "-----")
    else:
        # Anything else is treated like the dummy/ZeroR baseline.
        x_test, y_test = dl.prepare_data_for_ZeroR(dataset)
        print("-----TEST SET SIZE: " + str(len(x_test)) + "-----")

    prediction = classifier.predict(x_test)
    number_of_classes = y_test.shape[1]
    # Reduce the per-class outputs to the winning class index and re-encode
    # it one-hot so it has the same layout as y_test.
    position = np.argmax(prediction, axis=-1)
    y_pred = np.identity(number_of_classes)[position]
    target_names = ['nonrelated', 'related']
    print(classification_report(y_test, y_pred, target_names=target_names))

    # Convert the one-hot rows to class indices for the crosstab.
    y_test = [np.where(r == 1)[0][0] for r in y_test]
    y_pred = [np.where(r == 1)[0][0] for r in y_pred]
    y_true = pd.Series(y_test)
    y_pred = pd.Series(y_pred)
    print(pd.crosstab(y_true, y_pred, rownames=['True'],
                      colnames=['Predicted'], margins=True))
def train_NN_classifier(dataset, epochs, singlePrint=False):
    """trains a feed-forward network with one hidden layer

    parameter:
    dataset: training data, handed to dl.prepare_data_for_NN
    epochs: number of training epochs
    singlePrint: if set, training runs with verbose=0 and only the "acc"
                 history is printed; otherwise the history is passed to
                 plot_Training

    return values: the trained Sequential model
    """
    print("-----TRAIN CLASSIFIER-----")
    features, labels = dl.prepare_data_for_NN(dataset)
    class_count = labels.shape[1]

    network = Sequential()
    network.add(Dense(500, input_dim=len(features[0]), activation='sigmoid'))
    network.add(Dense(class_count, activation='softmax'))
    network.compile(loss='binary_crossentropy', optimizer='adam',
                    metrics=['accuracy'])

    # Settings shared by both output modes.
    fit_options = dict(validation_split=0.2, epochs=epochs, batch_size=150)
    if not singlePrint:
        training_log = network.fit(features, labels, **fit_options)
        plot_Training(training_log)
    else:
        training_log = network.fit(features, labels, verbose=0, **fit_options)
        print(training_log.history["acc"])

    print("-----TRAINING COMPLETE-----")
    return network
def baseline_results(dataset):
    """prints the results of a random-guess baseline

    For every row of the prepared test data a label (0 or 1) is drawn at
    random; the draws are compared with the true labels in a classification
    report and a crosstab of true vs. predicted labels.

    parameter:
    dataset: data handed to dl.prepare_data_for_ZeroR
    """
    x_test, y_test = dl.prepare_data_for_ZeroR(dataset)
    number_of_classes = y_test.shape[1]

    # One random class index per sample.
    position = np.random.choice(2, x_test.shape[0])
    # One-hot encode with the width of the true labels. Sizing the encoding
    # by the number of *distinct drawn* labels breaks whenever all draws
    # happen to be the same class (identity(1)[1] is out of bounds).
    y_pred = np.identity(number_of_classes)[position]

    target_names = ['nonrelated', 'related']
    print(classification_report(y_test, y_pred, target_names=target_names))

    # Convert the one-hot rows to class indices for the crosstab.
    y_test = [np.where(r == 1)[0][0] for r in y_test]
    y_pred = [np.where(r == 1)[0][0] for r in y_pred]
    y_true = pd.Series(y_test)
    y_pred = pd.Series(y_pred)
    print(pd.crosstab(y_true, y_pred, rownames=['True'],
                      colnames=['Predicted'], margins=True))
def train_Dummy_classifier(train):
    """fits a DummyClassifier (strategy "stratified", random_state 0)

    parameter:
    train: training data, handed to dl.prepare_data_for_ZeroR

    return values: the fitted DummyClassifier
    """
    features, labels = dl.prepare_data_for_ZeroR(train)
    dummy = DummyClassifier(random_state=0, strategy="stratified")
    dummy.fit(features, labels)
    return dummy
def balance_data(self, dataset, balancing=0.5):
    """balances the passed data via dl.balance_dataset

    parameter:
    dataset: pandas dataframe containing the data
    balancing: percentage of the balancing -> 0.5 = equal 50-50 balancing

    return values: the balanced dataset
    """
    return dl.balance_dataset(dataset, balancing)
def split_data(self, dataset, splitting=0.1):
    """splits the data into a training set and a test set

    Thin wrapper around dl.supervised_split.

    parameter:
    dataset: pandas dataframe containing the training data
    splitting: percentage split of the data, 0.1 = 10% testing data

    return values: trainset, testset
    """
    trainset, testset = dl.supervised_split(dataset, splitting)
    return trainset, testset
def train_RF_classifier(dataset):
    """trains a random forest with 200 estimators

    parameter:
    dataset: training data, handed to dl.prepare_data_for_RF

    return values: the fitted RandomForestClassifier
    """
    print("-----TRAIN CLASSIFIER-----")
    features, labels = dl.prepare_data_for_RF(dataset)
    forest = RandomForestClassifier(n_estimators=200)
    forest.fit(features, labels)
    print("-----TRAINING COMPLETE-----")
    return forest
def load_Data(self, dataDirectory=None, store=None):
    """load the data from XML-file and automatically extracts the features

    parameter:
    dataDirectory: directory of the XML-file
    store: directory to the h5.file for storing the loaded data and
           features; nothing is written when it is None

    return values: the dataset returned by self.generate_Features
    """
    data = dl.loadData(dataDirectory)
    dataset = self.generate_Features(data)
    if store is not None:
        # The context manager closes the HDF5 file even if a write fails;
        # previously the store was left open after writing.
        with pd.HDFStore(store) as dataStore:
            dataStore['dataset'] = data
            dataStore['feature'] = dataset
    return dataset
def train_RNN_classifier(dataset, epochs, singlePrint=False):
    """trains the two-sentence LSTM network

    Each of the two sentence inputs runs through its own LSTM(16); both
    results are concatenated with the "sharedFeatures" input and fed into a
    sigmoid Dense(500) layer followed by a softmax output layer.

    parameter:
    dataset: training data, handed to dl.prepare_data_for_RNN
    epochs: number of training epochs
    singlePrint: if set, training runs with verbose=0 and batch size 150
                 and only the "acc" history is printed; otherwise batch
                 size 500 is used and the history is passed to
                 plot_Training

    return values: the trained Model
    """
    print("-----TRAIN CLASSIFIER-----!")
    inputs, labels = dl.prepare_data_for_RNN(dataset)
    class_count = labels.shape[1]
    print("Number of classes:%s" % class_count)

    # Per-sample shapes (leading sample axis dropped).
    sentence_shape = inputs["sentence1"].shape[1:]
    shared_shape = inputs["sharedFeatures"].shape[1:]

    first_sentence = Input(sentence_shape, name="sentence1")
    second_sentence = Input(sentence_shape, name="sentence2")
    shared = Input(shared_shape, name="sharedFeatures")

    first_encoded = LSTM(16, return_sequences=False)(first_sentence)
    second_encoded = LSTM(16, return_sequences=False)(second_sentence)
    merged = concatenate([first_encoded, second_encoded, shared], axis=-1)
    hidden = Dense(500, activation='sigmoid')(merged)
    output = Dense(class_count, activation='softmax')(hidden)

    network = Model(inputs=[first_sentence, second_sentence, shared],
                    outputs=[output])
    network.compile(loss='binary_crossentropy', optimizer='adam',
                    metrics=['accuracy'])

    if not singlePrint:
        training_log = network.fit(inputs, labels, validation_split=0.2,
                                   epochs=epochs, batch_size=500)
        plot_Training(training_log)
    else:
        training_log = network.fit(inputs, labels, validation_split=0.2,
                                   epochs=epochs, batch_size=150, verbose=0)
        print(training_log.history["acc"])

    print("-----TRAINING COMPLETE-----")
    return network
def show_dataset_statistics(self):
    """print the statistics of the datasets

    For every corpus the heading is printed first, then
    dl.loadStatistics is called on the corpus directory.
    """
    corpora = (
        ("AraucariaDB", "resources/corpora/araucaria"),
        ("microtext", "resources/corpora/microtext"),
        ("rrd", "resources/corpora/rrd"),
        ("schemes", "resources/corpora/schemes"),
        ("STAB", "resources/corpora/studentEssays"),
        ("IBM", "resources/corpora/ibm"),
        ("ArguE", "resources/corpora/arguE"),
    )
    for heading, corpus_directory in corpora:
        print(heading)
        dl.loadStatistics(corpus_directory)
class main:
    """Script-style class: its body runs once when the class is defined.

    It builds the student-essay ("se") feature store and its train/test
    split if they are missing, trains the RNN classifier on the training
    split and evaluates it on the test split.
    """
    # Directory of the file executing this class body (with trailing slash).
    current_dir = os.path.dirname(inspect.stack()[0][1]) + '/'

    # Feature stores of the complete corpora.
    aif = current_dir + "resources/datasets/aif.h5"
    se = current_dir + "resources/datasets/se.h5"
    ibm = current_dir + "resources/datasets/ibm.h5"
    argu = current_dir + "resources/datasets/arguE.h5"

    # Train/test stores per corpus; only the "se" pair is used below.
    aifTrain = current_dir + "resources/datasets/training/aifTrain.h5"
    aifTest = current_dir + "resources/datasets/testing/aifTest.h5"
    seTrain = current_dir + "resources/datasets/training/seTrain.h5"
    seTest = current_dir + "resources/datasets/testing/seTest.h5"
    ibmTrain = current_dir + "resources/datasets/training/ibmTrain.h5"
    ibmTest = current_dir + "resources/datasets/testing/ibmTest.h5"
    argueTrain = current_dir + "resources/datasets/training/argueTrain.h5"
    argueTest = current_dir + "resources/datasets/testing/argueTest.h5"

    arguE = ArguE()

    ####### Build resources if not existing #######
    # NOTE(review): the split below is assumed to belong to the outer check
    # (rebuilt whenever seTrain is missing) and only the feature extraction
    # to the inner one - confirm against the intended nesting.
    if not os.path.exists(seTrain):
        if not os.path.exists(se):
            # No feature store yet: load the raw corpus, extract the
            # features and write them to the "se" store.
            se_data = dl.loadData((current_dir + 'resources/datasets/brat-project/'))
            AFE = af.AdvancedFeatureExtractor()
            se_data = AFE.extractFeatures(se_data)
            store = pd.HDFStore(se,'w')
            store["feature"] = se_data
            store.close()
            print("SE generated")
        # Split the stored features and persist both parts.
        se_data = arguE.load_Data_From_Store(se)
        se_train, se_test = arguE.split_data(se_data)
        store = pd.HDFStore(seTrain,'w')
        store["feature"] = se_train
        store.close()
        store = pd.HDFStore(seTest,'w')
        store["feature"] = se_test
        store.close()
        print("Train-test generated")
    #######################################################################

    ####### Training #######
    print("################## TRAINING:")
    # The stored training split is loaded; its labels are changed and the
    # data is balanced here, before training.
    trainSet = arguE.load_Data_From_Store(seTrain)
    trainSet = arguE.change_labels(trainSet)
    trainSet = arguE.balance_data(trainSet)
    # Alternative classifiers, currently disabled:
    #OneR = arguE.train_Dummy_classifier(trainSet, current_dir + "resources/classifierModels/se_or.pkl")
    #RF = arguE.train_RF_classifier(trainSet, current_dir+ "resources/classifierModels/all_rf.pkl")
    RNN = arguE.train_RNN_classifier(trainSet, epochs=25,
                                     saveModel=current_dir + "resources/classifierModels/se_rnn.h5")

    ####### Testing #######
    print("################## TESTING:")
    # The test split gets the same label change as the training data but is
    # not balanced.
    testSet = arguE.load_Data_From_Store(seTest)
    testSet = arguE.change_labels(testSet)
    # Evaluation of the disabled classifiers above:
    #arguE.test_classifier(testSet, OneR)
    #arguE.test_classifier(testSet, RF)
    arguE.test_classifier(testSet, RNN)