def test_same_output(self):
    """Hashing an RDD must reproduce the local scikit-learn result.

    Transforms the same texts with a plain ``HashingVectorizer`` and with
    the Spark-backed one, stacks the distributed partition results back
    into a single sparse matrix, and compares the dense forms.
    """
    X, X_rdd = self.generate_text_dataset()
    expected = HashingVectorizer().transform(X)
    # Each collected element is a sparse partition block; stack them back
    # into one matrix before comparing.
    blocks = SparkHashingVectorizer().transform(X_rdd).collect()
    obtained = sp.vstack(blocks)
    assert_array_equal(expected.toarray(), obtained.toarray())
def test_same_output(self):
    """Distributed vectorizer output equals the local scikit-learn output."""
    X, X_rdd = self.make_text_rdd()
    vect_local = HashingVectorizer()
    vect_dist = SparkHashingVectorizer()
    # Compare the dense representations element-wise.
    dense_local = vect_local.transform(X).toarray()
    dense_dist = vect_dist.transform(X_rdd).toarray()
    assert_array_equal(dense_local, dense_dist)
def test_dummy_analyzer(self):
    """Identity analyzer on pre-tokenized input: distributed == local.

    Fix: on Python 3 ``map`` returns a one-shot iterator, so the original
    ``X = map(splitter, X)`` was exhausted by the first ``transform`` call,
    leaving the later ``fit_transform`` an empty input (and the RDD lambda
    produced unconsumed map objects per document). Materialize both with
    ``list`` — mirroring the sibling test that already does this — so the
    data can be iterated twice.
    """
    X, X_rdd = self.generate_text_dataset()

    def splitter(x):
        return x.split()

    # list(...) so the token sequences survive repeated iteration.
    X = list(map(splitter, X))
    X_rdd = X_rdd.map(lambda x: list(map(splitter, x)))

    # analyzer=lambda x: x -> the vectorizer consumes the tokens as-is.
    local = HashingVectorizer(analyzer=lambda x: x)
    dist = SparkHashingVectorizer(analyzer=lambda x: x)

    result_local = local.transform(X)
    result_dist = sp.vstack(dist.transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())

    # fit_transform re-reads X — this is the path the iterator bug broke.
    result_local = local.fit_transform(X)
    result_dist = sp.vstack(dist.fit_transform(X_rdd).collect())
    assert_array_equal(result_local.toarray(), result_dist.toarray())
def test_dummy_analyzer(self):
    """Pre-split tokens through an identity analyzer: dist matches local.

    Checks both ``transform`` and ``fit_transform`` so the input is
    iterated twice, which is why the token lists are materialized.
    """
    X, X_rdd = self.make_text_rdd()

    def splitter(x):
        return x.split()

    tokenized = list(map(splitter, X))
    tokenized_rdd = X_rdd.map(lambda docs: list(map(splitter, docs)))
    X, X_rdd = tokenized, tokenized_rdd

    # The identity analyzer makes the vectorizer hash the tokens directly.
    identity = lambda x: x
    local = HashingVectorizer(analyzer=identity)
    dist = SparkHashingVectorizer(analyzer=identity)

    for op in ("transform", "fit_transform"):
        dense_local = getattr(local, op)(X).toarray()
        dense_dist = getattr(dist, op)(X_rdd).toarray()
        assert_array_equal(dense_local, dense_dist)
# Train a distributed text-classification pipeline on a Spark cluster.
# NOTE(review): `list = []` shadows the builtin `list` and is never used in
# this visible chunk — if no later code appends to it, rename or delete it.
list = []
# Hold out 25% of the data for evaluation; fixed seed for reproducibility.
data_train, data_test, target_train, target_test = cross_validation.train_test_split(
    data, target, test_size=0.25, random_state=43)
# train data toRDD: parallelize features and labels, then pair them into a
# DictRDD keyed by 'X' / 'y' so the pipeline can address each column.
train_x = sc.parallelize(data_train)
train_y = sc.parallelize(target_train)
train_x = ArrayRDD(train_x)
train_y = ArrayRDD(train_y)
Z = DictRDD((train_x, train_y),
            columns=('X', 'y'),
            dtype=[np.ndarray, np.ndarray])
# pipeline: hashing term counts -> tf-idf weighting -> multinomial NB.
dist_pipeline = SparkPipeline((
    ('vect', SparkHashingVectorizer(non_negative=True)),  # hashingTF for NB
    ('tfidf', SparkTfidfTransformer()),  # IDF
    ('clf', SparkMultinomialNB(alpha=0.05))  # NB
))
# fit — classes passed explicitly so partial fits over partitions agree;
# presumably a binary 0/1 problem (TODO confirm against `target`).
dist_pipeline.fit(Z, clf__classes=np.array([0, 1]))
# test data to RDD: same DictRDD layout as the training set.
test_x = ArrayRDD(sc.parallelize(data_test))
test_y = ArrayRDD(sc.parallelize(target_test))
test_Z = DictRDD((test_x, test_y),
                 columns=('X', 'y'),
                 dtype=[np.ndarray, np.ndarray])
# predict test data