예제 #1
0
    def setUp(self):
        """Create a fresh DB connection and Preprocessor before each test."""
        log = DummyLogger('test_preprocessing.log')
        db_handler = SqliteDbHandler(log, '.', 'training_db')
        self.connection = db_handler.create_db_connection()

        # Columns the preprocessor is configured to drop.
        self.unnecessary_columns = [
            'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
            'FTI_measured', 'TBG_measured', 'TBG', 'TSH'
        ]
        self.preprocessing = Preprocessor(log, self.unnecessary_columns)
예제 #2
0
class TestPreprocessor(unittest.TestCase):
    """Tests for the Preprocessor transform pipeline."""

    def setUp(self):
        """Create a fresh DB connection and Preprocessor before each test."""
        log = DummyLogger('test_preprocessing.log')
        handler = SqliteDbHandler(log, '.', 'training_db')
        self.connection = handler.create_db_connection()

        # Columns the preprocessor is configured to drop.
        self.unnecessary_columns = [
            'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
            'FTI_measured', 'TBG_measured', 'TBG', 'TSH'
        ]
        self.preprocessing = Preprocessor(log, self.unnecessary_columns)

    def test_preprocessing(self):
        """Smoke-test transform() against the sample input file."""
        print(os.getcwd())
        frame = pd.read_csv('./tests/InputFile.csv')
        print(frame.shape)
        self.preprocessing.transform(frame)

    def get_preprocessor(self):
        """Return a Preprocessor configured with the standard drop columns."""
        drop_columns = [
            'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
            'FTI_measured', 'TBG_measured', 'TBG', 'TSH'
        ]
        return Preprocessor(self._logger, drop_columns)
예제 #4
0
if __name__ == "__main__":

    # Paths to the ISEAR corpus splits: validation gold/prediction files
    # plus the train/test CSVs.
    path_to_gold = '../data/isear/isear-val.csv'
    path_to_pred = '../data/isear/isear-val-prediction.csv'

    path_to_train = '../data/isear/isear-train.csv'
    path_to_test = '../data/isear/isear-test.csv'

    corpus = Corpus(path_to_train, path_to_test, path_to_gold, path_to_pred)

    # load from corpus
    # Gold emotion labels for the test and train splits.
    y_true_test = [emo.label for emo in corpus.test_data]
    y_true_train = [emo.label for emo in corpus.train_data]

    # Separate preprocessors for the train and test splits.
    p_train = Preprocessor(corpus.train_data, y_true_train)
    p_test = Preprocessor(corpus.test_data, y_true_test)

    # Per-label counts and per-label word statistics feed the Naive Bayes
    # priors and likelihoods.
    counts_per_label = p_train.count_labels()
    wordcount_per_labels, wordsum_per_labels = p_train.count_words_in_labels()

    nb = NaiveBayes(counts_per_label, len(p_train.sentences), wordcount_per_labels, wordsum_per_labels)

    # NOTE(review): `rows` is read here but never used anywhere in this
    # block — confirm whether a later section needs it, otherwise the read
    # can be dropped.
    with open(path_to_test, 'r') as test_text:
        rows = test_text.readlines()

    # NOTE(review): despite the name, fit() appears to be used here to
    # produce predictions for the test sentences — verify against the
    # NaiveBayes implementation.
    y_pred = nb.fit(p_test.get_tokenized_sentences())

    possible_labels = list(nb.num_labels.keys())

    # evaluator
예제 #5
0
	def prepare_data(self, data, pp=True):
		"""Keep only the main EEG channels of each sample, then split.

		Every element of *data* is filtered down to the six channels of
		interest before being handed to get_part_from_a().
		"""
		channels = ['T7', 'T8', 'P7', 'P8', 'O1', 'O2']
		filtered = [Preprocessor.leave_main(sample, channels) for sample in data]
		return self.get_part_from_a(filtered, split=SPLIT, pp=pp)
예제 #6
0
    path_to_test = '../data/isear/isear-test.csv'

    c = ec.Corpus(path_to_train, path_to_test, path_to_gold, path_to_pred)

    # data for training
    train_data = c.train_data
    train_text = [emo.text for emo in train_data]
    train_labels = [emo.label for emo in train_data]

    # data for testing
    test_data = c.test_data
    test_text = [emo.text for emo in test_data]
    test_labels = [emo.label for emo in test_data]

    # pre-processing: extract features for the train split
    processor = Preprocessor(train_text, train_labels)
    train_instances = processor.extract_features()

    # BUG FIX: was Preprocessor(test_text, test_data) — the second argument
    # must be the label list (mirroring the train side above), not the raw
    # Emotion objects.
    processor = Preprocessor(test_text, test_labels)
    test_instances = processor.extract_features()

    # classifier
    classifier = MultiLabelPerceptron(train_instances=train_instances,
                                      max_iters=10,
                                      theta=0.0)
    classifier.train()

    scores = classifier.inference(test_instances)

    # evaluator
    evaluator = Evaluator(c.labels, test_labels, scores)
예제 #7
0
    # Paths to the ISEAR validation gold/prediction files and the
    # train/test splits.
    path_to_gold = '../data/isear/isear-val.csv'
    path_to_pred = '../data/isear/isear-val-prediction.csv'

    path_to_train = '../data/isear/isear-train.csv'
    path_to_test = '../data/isear/isear-test.csv'

    # NRC word-level emotion lexicon.
    filepath = '../data/NRC/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'

    corpus = Corpus(path_to_train, path_to_test, path_to_gold, path_to_pred)

    # load from corpus
    # Gold emotion labels for the test split.
    y_true = [emo.label for emo in corpus.test_data]

    # Preprocessing step
    p = Preprocessor(corpus.test_data, y_true)

    tokenized_instances = p.get_clean_tokenized_sentences()

    #EmotionDict
    # Lexicon-based prediction: map tokenized sentences to emotions.
    emo_d = EmotionDict(filepath)
    y_pred = emo_d.emotion(tokenized_instances)
    # NOTE(review): filtering() takes (y_true, y_pred) but is unpacked as
    # (y_pred_new, y_test_new) — verify the return order against
    # filtering()'s definition.
    y_pred_new, y_test_new = filtering(y_true, y_pred)

    # Words absent from the lexicon, sorted by count (ascending); only
    # consumed by the commented-out print below.
    sorted_x = sorted(emo_d.not_found_words.items(),
                      key=operator.itemgetter(1))
    #print(sorted_x)

    # Evaluator
    evaluator = evl.Evaluator(emo_d.emotions, y_test_new, y_pred_new)
    f_score, precision, recall, macro, micro = evaluator.evaluate()
예제 #8
0
from preprocessing.preprocessing import Preprocessor
from settings import PATHS


def apply_preprocessing(sent, p):
    """Run *sent* through preprocessor *p* and return the processed text."""
    p.set_text(sent)
    processed = p.get_text()
    return processed


if __name__ == "__main__":

    # Load the raw corpus, one entry per line.
    with open(PATHS['TEXT_PATH'], 'r', encoding='utf-8') as f:
        data = f.read().splitlines()

    # Run every line through a single shared Preprocessor instance.
    p = Preprocessor()
    data_prep_encoder = [apply_preprocessing(x, p) for x in data]