def test_url(self):
    """Check the URL-removal step on a single item.

    Builds a template pipeline with the single-item loader and one
    ``normalize_text_remove_urls`` step, then asserts that the URL is gone
    from the processed ``"data"`` while the leading text (including its
    trailing space) is kept.
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    steps.append(templates.normalize_text_remove_urls())

    raw_text = "remove url http://www.google.com"
    result = DataPreprocess(pipeline_cfg).process_item(raw_text)

    self.assertEqual("remove url ", result["data"])
def test_whitespace(self):
    """Check the whitespace-removal step on a single item.

    The pipeline contains only ``normalize_text_remove_whitespace``; the
    processed ``"data"`` is expected to have no leading or trailing space.
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    steps.append(templates.normalize_text_remove_whitespace())

    padded_text = " remove whitespace "
    result = DataPreprocess(pipeline_cfg).process_item(padded_text)

    self.assertEqual("remove whitespace", result["data"])
def test_punctunation(self):
    """Check the punctuation-removal step against ``TEST_DATA``.

    The pipeline contains only ``normalize_text_remove_punctuation``.
    ``TEST_DATA`` is defined outside this method; the expected literal below
    is what the processed ``"data"`` must equal for it.
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    steps.append(templates.normalize_text_remove_punctuation())

    result = DataPreprocess(pipeline_cfg).process_item(TEST_DATA)

    self.assertEqual("This isn t a TEST sentences ", result["data"])
def test_lower(self):
    """Check the lowercase step against ``TEST_DATA``.

    The pipeline contains only ``normalize_text_lowercase``; the processed
    ``"data"`` must equal ``TEST_DATA.lower()``.
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    steps.append(templates.normalize_text_lowercase())

    result = DataPreprocess(pipeline_cfg).process_item(TEST_DATA)

    self.assertEqual(TEST_DATA.lower(), result["data"])
def test_stopwords(self):
    """Check stopword removal applied after lowercasing.

    Steps are registered in order: ``normalize_text_lowercase`` first, then
    ``normalize_text_remove_stopwords``. The expected literal is the
    processed ``"data"`` for ``TEST_DATA`` (defined outside this method).
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    # Order matters: lowercase is appended before stopword removal.
    steps.append(templates.normalize_text_lowercase())
    steps.append(templates.normalize_text_remove_stopwords())

    result = DataPreprocess(pipeline_cfg).process_item(TEST_DATA)

    self.assertEqual("isn't test sentences!", result["data"])
def test_lemmatizer(self):
    """Check the lemmatizer step applied after lowercasing.

    Steps are registered in order: ``normalize_text_lowercase`` first, then
    ``normalize_text_lemmatizer``. For the question below, the processed
    ``"data"`` is expected to be lowercased with "cities" reduced to "city".
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    # Order matters: lowercase is appended before the lemmatizer.
    steps.append(templates.normalize_text_lowercase())
    steps.append(templates.normalize_text_lemmatizer())

    question = "How many cities are there?"
    result = DataPreprocess(pipeline_cfg).process_item(question)

    self.assertEqual("how many city are there?", result["data"])
def test_contractions(self):
    """Check contraction expansion applied after lowercasing.

    Steps are registered in order: ``normalize_text_lowercase`` first, then
    ``normalize_text_expand_contractions``. The expected literal is the
    processed ``"data"`` for ``TEST_DATA`` (defined outside this method).
    """
    pipeline_cfg = templates.pipeline()
    pipeline_cfg["data_loader"] = templates.data_loader_single_item_loader()
    steps = pipeline_cfg["steps"]
    # Order matters: lowercase is appended before contraction expansion.
    steps.append(templates.normalize_text_lowercase())
    steps.append(templates.normalize_text_expand_contractions())

    result = DataPreprocess(pipeline_cfg).process_item(TEST_DATA)

    self.assertEqual("this is not a test sentences!", result["data"])
def test_single_item_pipeline(self):
    """Check a hand-written config dict instead of one built from templates.

    The config declares a ``single_item`` data loader and one
    ``normalize_text`` step of type ``lowercase``; the processed ``"data"``
    must equal ``TEST_DATA.lower()``.
    """
    lowercase_step = {
        "name": "normalize_text",
        "type": "lowercase",
        "log_level": "INFO",
    }
    pipeline_cfg = {
        "data_loader": {"type": "single_item"},
        "steps": [lowercase_step],
    }

    result = DataPreprocess(pipeline_cfg).process_item(TEST_DATA)

    self.assertEqual(TEST_DATA.lower(), result["data"])