def test_tfidf_with_all_datasets(self): should_succeed = ["movies"] should_fail = ["irisArr", "irisDf", "digits", "housing", "creditG", "drugRev"] for name in should_succeed: dataset = getattr(self, f"_{name}") TfidfVectorizer.validate_schema(**dataset) for name in should_fail: dataset = getattr(self, f"_{name}") with self.assertRaises(ValueError): TfidfVectorizer.validate_schema(**dataset)
def test_more_hyperparam_values(self):
    with EnableSchemaValidation():
        # max_df given as a float must lie in [0, 1], so 2.5 is rejected.
        with self.assertRaises(jsonschema.ValidationError):
            _ = TfidfVectorizer(
                max_df=2.5, min_df=2, max_features=1000, stop_words="english"
            )
        # A custom stop_words list together with analyzer="char" is also rejected.
        with self.assertRaises(jsonschema.ValidationError):
            _ = TfidfVectorizer(
                max_df=2,
                min_df=2,
                max_features=1000,
                stop_words=["I", "we", "not", "this", "that"],
                analyzer="char",
            )
def test_non_null_tokenizer(self):
    # A dummy callable tokenizer; passing any non-null tokenizer together
    # with analyzer="char" should be rejected.
    def my_tokenizer():
        return "abc"

    with EnableSchemaValidation():
        with self.assertRaises(jsonschema.ValidationError):
            _ = TfidfVectorizer(
                max_df=2,
                min_df=2,
                max_features=1000,
                stop_words="english",
                tokenizer=my_tokenizer,
                analyzer="char",
            )
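# ---------------------------------------------------------------------------
# A minimal sketch of the fixture setup the tests above assume: each
# self._<name> attribute is a dict whose "X"/"y" entries are splatted into
# validate_schema(**dataset). The loaders used here (load_iris,
# fetch_20newsgroups) and the class name are illustrative stand-ins only,
# not the suite's actual dataset loaders.
# ---------------------------------------------------------------------------
import unittest

from sklearn.datasets import fetch_20newsgroups, load_iris


class TestDatasetSchemasSketch(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Numeric tabular data: TF-IDF schema validation should reject this.
        iris = load_iris()
        cls._irisArr = {"X": iris.data, "y": iris.target}
        # Raw text documents: stands in for the movies fixture that should pass.
        news = fetch_20newsgroups(subset="train")
        cls._movies = {"X": news.data, "y": news.target}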