def setUp(self):
    """Create the shared fixtures used by the preprocessing tests."""
    logger = DummyLogger('test_preprocessing.log')
    db_handler = SqliteDbHandler(logger, '.', 'training_db')
    self.connection = db_handler.create_db_connection()
    # Measurement-indicator columns the pipeline is expected to drop.
    self.unnecessary_columns = [
        'TSH_measured',
        'T3_measured',
        'TT4_measured',
        'T4U_measured',
        'FTI_measured',
        'TBG_measured',
        'TBG',
        'TSH',
    ]
    self.preprocessing = Preprocessor(logger, self.unnecessary_columns)
class TestPreprocessor(unittest.TestCase):
    """Tests for the Preprocessor applied to the sample input frame."""

    def setUp(self):
        """Build the logger, DB connection and Preprocessor under test."""
        logger = DummyLogger('test_preprocessing.log')
        handler = SqliteDbHandler(logger, '.', 'training_db')
        self.connection = handler.create_db_connection()
        # Measurement-indicator columns the pipeline is expected to drop.
        self.unnecessary_columns = [
            'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
            'FTI_measured', 'TBG_measured', 'TBG', 'TSH',
        ]
        self.preprocessing = Preprocessor(logger, self.unnecessary_columns)

    def test_preprocessing(self):
        """Smoke-test transform() on the checked-in sample input file.

        NOTE(review): this makes no assertions — it only verifies that
        transform() runs without raising; consider asserting on the result.
        """
        print(os.getcwd())
        frame = pd.read_csv('./tests/InputFile.csv')
        print(frame.shape)
        self.preprocessing.transform(frame)
def get_preprocessor(self):
    """Build a Preprocessor wired to this object's logger.

    Returns:
        Preprocessor: configured with the measurement-indicator columns
        that must be dropped from the raw frame.
    """
    # Columns removed before training/prediction.
    columns_to_drop = [
        'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured', 'TBG', 'TSH',
    ]
    return Preprocessor(self._logger, columns_to_drop)
if __name__ == "__main__":
    # Corpus file locations (relative to this script).
    path_to_gold = '../data/isear/isear-val.csv'
    path_to_pred = '../data/isear/isear-val-prediction.csv'
    path_to_train = '../data/isear/isear-train.csv'
    path_to_test = '../data/isear/isear-test.csv'

    corpus = Corpus(path_to_train, path_to_test, path_to_gold, path_to_pred)

    # Gold labels for both splits, taken straight from the corpus.
    y_true_test = [emo.label for emo in corpus.test_data]
    y_true_train = [emo.label for emo in corpus.train_data]

    p_train = Preprocessor(corpus.train_data, y_true_train)
    p_test = Preprocessor(corpus.test_data, y_true_test)

    # Naive Bayes statistics gathered from the training split.
    counts_per_label = p_train.count_labels()
    wordcount_per_labels, wordsum_per_labels = p_train.count_words_in_labels()
    nb = NaiveBayes(counts_per_label, len(p_train.sentences),
                    wordcount_per_labels, wordsum_per_labels)

    # NOTE(review): `rows` is read but never used below — confirm intent.
    with open(path_to_test, 'r') as test_text:
        rows = test_text.readlines()

    y_pred = nb.fit(p_test.get_tokenized_sentences())
    possible_labels = list(nb.num_labels.keys())
    # evaluator
def prepare_data(self, data, pp=True):
    """Keep only the main channels of each recording and split into parts.

    Args:
        data: iterable of raw recordings.
        pp: forwarded to get_part_from_a — presumably toggles
            preprocessing; TODO confirm against that method.
    """
    # Temporal/parietal/occipital channels retained for analysis.
    channels = ['T7', 'T8', 'P7', 'P8', 'O1', 'O2']
    reduced = [Preprocessor.leave_main(recording, channels) for recording in data]
    return self.get_part_from_a(reduced, split=SPLIT, pp=pp)
path_to_test = '../data/isear/isear-test.csv'

c = ec.Corpus(path_to_train, path_to_test, path_to_gold, path_to_pred)

# data for training
train_data = c.train_data
train_text = [emo.text for emo in train_data]
train_labels = [emo.label for emo in train_data]

# data for testing
test_data = c.test_data
test_text = [emo.text for emo in test_data]
test_labels = [emo.label for emo in test_data]

# pre-processing
processor = Preprocessor(train_text, train_labels)
train_instances = processor.extract_features()
# BUG FIX: the original passed `test_data` (the raw Emotion objects) as the
# second argument; the labels list is required, mirroring the training-side
# call above and matching what the Evaluator consumes below.
processor = Preprocessor(test_text, test_labels)
test_instances = processor.extract_features()

# classifier
classifier = MultiLabelPerceptron(train_instances=train_instances,
                                  max_iters=10, theta=0.0)
classifier.train()
scores = classifier.inference(test_instances)

# evaluator
evaluator = Evaluator(c.labels, test_labels, scores)
# Corpus and lexicon file locations.
path_to_gold = '../data/isear/isear-val.csv'
path_to_pred = '../data/isear/isear-val-prediction.csv'
path_to_train = '../data/isear/isear-train.csv'
path_to_test = '../data/isear/isear-test.csv'
filepath = '../data/NRC/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

corpus = Corpus(path_to_train, path_to_test, path_to_gold, path_to_pred)

# Gold labels for the test split.
y_true = [emo.label for emo in corpus.test_data]

# Preprocessing step
p = Preprocessor(corpus.test_data, y_true)
tokenized_instances = p.get_clean_tokenized_sentences()

# Lexicon-based emotion prediction.
emo_d = EmotionDict(filepath)
y_pred = emo_d.emotion(tokenized_instances)

# filtering() aligns the two label lists — presumably dropping instances
# the lexicon could not label; confirm against its definition.
y_pred_new, y_test_new = filtering(y_true, y_pred)

# Words missing from the lexicon, sorted by count (debug aid only).
sorted_x = sorted(emo_d.not_found_words.items(), key=operator.itemgetter(1))
# print(sorted_x)

# Evaluator
evaluator = evl.Evaluator(emo_d.emotions, y_test_new, y_pred_new)
f_score, precision, recall, macro, micro = evaluator.evaluate()
from preprocessing.preprocessing import Preprocessor
from settings import PATHS


def apply_preprocessing(sent, p):
    """Run preprocessor *p* over one sentence and return the cleaned text."""
    p.set_text(sent)
    return p.get_text()


if __name__ == "__main__":
    with open(PATHS['TEXT_PATH'], 'r', encoding='utf-8') as f:
        data = f.read().splitlines()

    # A single Preprocessor instance is reused across every line.
    p = Preprocessor()
    data_prep_encoder = [apply_preprocessing(x, p) for x in data]