def test_limit_features(self):
    """Compare local and Spark count vectorizers under feature limits.

    For every parameter set (combinations of ``min_df``, ``max_df`` and
    ``max_features``) a ``CountVectorizer`` and a ``SparkCountVectorizer``
    are built from the same keyword arguments.  The test then checks that:

    * both learn the same ``vocabulary_``,
    * the dense output of ``fit_transform`` is identical, and
    * a follow-up ``transform`` on the distributed input reproduces the
      local ``fit_transform`` output.
    """
    docs, docs_rdd = self.make_text_rdd()

    param_grid = [
        {'min_df': .5},
        {'min_df': 2, 'max_df': .9},
        {'min_df': 1, 'max_df': .6},
        {'min_df': 2, 'max_features': 3},
    ]

    for kwargs in param_grid:
        reference = CountVectorizer(**kwargs)
        distributed = SparkCountVectorizer(**kwargs)

        # Fit both vectorizers; the local dense matrix is the expected value.
        expected = reference.fit_transform(docs).toarray()
        fitted = distributed.fit_transform(docs_rdd).toarray()

        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(expected, fitted)

        # Re-applying the already fitted vectorizer must give the same counts.
        transformed = distributed.transform(docs_rdd).toarray()
        assert_array_equal(expected, transformed)
def test_limit_features(self):
    """Compare local and Spark count vectorizers under feature limits.

    Each parameter set (combinations of ``min_df``, ``max_df`` and
    ``max_features``) is used to build both a ``CountVectorizer`` and a
    ``SparkCountVectorizer``.  The distributed output is collected and
    stacked with ``sp.vstack`` before being compared, as a dense array,
    with the local result.  Both ``fit_transform`` and a subsequent
    ``transform`` are checked, along with the learned ``vocabulary_``.
    """
    docs, docs_rdd = self.generate_text_dataset()

    param_grid = [
        {'min_df': .5},
        {'min_df': 2, 'max_df': .9},
        {'min_df': 1, 'max_df': .6},
        {'min_df': 2, 'max_features': 3},
    ]

    for kwargs in param_grid:
        reference = CountVectorizer(**kwargs)
        distributed = SparkCountVectorizer(**kwargs)

        expected = reference.fit_transform(docs)

        # Gather the distributed pieces and stack them into one matrix.
        fitted_blocks = distributed.fit_transform(docs_rdd).collect()
        fitted = sp.vstack(fitted_blocks)

        assert_equal(reference.vocabulary_, distributed.vocabulary_)
        assert_array_equal(expected.toarray(), fitted.toarray())

        # Re-applying the already fitted vectorizer must give the same counts.
        transformed_blocks = distributed.transform(docs_rdd).collect()
        transformed = sp.vstack(transformed_blocks)
        assert_array_equal(expected.toarray(), transformed.toarray())
def test_limit_features(self):
    """Check vocabulary-limiting options on the Spark count vectorizer.

    Runs the same keyword arguments (``min_df``, ``max_df``,
    ``max_features`` in several combinations) through ``CountVectorizer``
    and ``SparkCountVectorizer`` and asserts that the learned
    ``vocabulary_`` and the dense count matrices agree, both for
    ``fit_transform`` and for a later ``transform`` call.
    """
    texts, texts_rdd = self.make_text_rdd()

    limit_settings = [
        {'min_df': .5},
        {'min_df': 2, 'max_df': .9},
        {'min_df': 1, 'max_df': .6},
        {'min_df': 2, 'max_features': 3},
    ]

    for settings in limit_settings:
        local_vec = CountVectorizer(**settings)
        spark_vec = SparkCountVectorizer(**settings)

        # The local dense output is the reference for both comparisons below.
        local_counts = local_vec.fit_transform(texts).toarray()
        spark_counts = spark_vec.fit_transform(texts_rdd).toarray()
        assert_equal(local_vec.vocabulary_, spark_vec.vocabulary_)
        assert_array_equal(local_counts, spark_counts)

        # transform() after fitting should match the fit_transform() result.
        spark_counts = spark_vec.transform(texts_rdd).toarray()
        assert_array_equal(local_counts, spark_counts)