def test_inverse_tansform(self):
    """Inverse-transform parity between a sklearn and a PHOTON pipeline.

    First compares a plain StandardScaler->PCA pipeline, then checks that a
    pipeline containing a Stack can map feature importances back to the
    original input dimensionality.
    """
    # NOTE(review): method name has a typo ("tansform"); kept unchanged to
    # preserve the public test name — rename in a dedicated change.

    # --- plain pipeline: scaler followed by PCA ---
    reference_pipe = SKPipeline([("SS", self.sk_ss), ("PCA", self.sk_pca)])
    reference_pipe.fit(self.X, self.y)
    reference_inverse = reference_pipe.inverse_transform(
        reference_pipe.transform(self.X))

    pipe = PhotonPipeline([("SS", self.p_ss), ("PCA", self.p_pca)])
    pipe.fit(self.X, self.y)
    transformed, _, _ = pipe.transform(self.X)
    recovered, _, _ = pipe.inverse_transform(transformed)

    self.assertTrue(np.array_equal(reference_inverse, recovered))

    # --- pipeline containing a Stack ---
    pca_stack = Stack("stack", [self.p_pca])
    stacked_pipe = PhotonPipeline([
        ("stack", pca_stack),
        ("StandardScaler", PipelineElement("StandardScaler")),
        ("LinearSVC", PipelineElement("LinearSVC")),
    ])
    stacked_pipe.fit(self.X, self.y)
    importances = stacked_pipe.feature_importances_
    recovered_features, _, _ = stacked_pipe.inverse_transform(importances)
    # inverse-transforming importances must restore the input feature count
    self.assertEqual(recovered_features.shape[1], self.X.shape[1])
def sklearn_custom_transformer_model(sklearn_knn_model):
    """Wrap a FunctionTransformer -> KNN sklearn pipeline in a ModelWithData.

    The inference data is the first two feature columns of the iris dataset.
    """

    def shift_by_one(vec):
        # stateless element-wise preprocessing step
        return vec + 1

    steps = [
        ("custom_transformer",
         SKFunctionTransformer(shift_by_one, validate=True)),
        ("knn", sklearn_knn_model.model),
    ]
    iris_first_two_features = datasets.load_iris().data[:, :2]
    return ModelWithData(SKPipeline(steps),
                         inference_data=iris_first_two_features)
def setUp(self):
    """Build matching PHOTON and sklearn branch pipelines for the tests.

    Creates scaler/PCA/tree elements plus a transformer-only branch and an
    estimator branch, each paired with an equivalent plain sklearn pipeline.
    """
    # Fix: `return_X_y` must be passed by keyword — the positional form was
    # deprecated in scikit-learn 0.23 and removed in 1.1 (TypeError there).
    self.X, self.y = load_breast_cancer(return_X_y=True)

    self.scaler = PipelineElement("StandardScaler", {'with_mean': True})
    self.pca = PipelineElement('PCA', {'n_components': [1, 2]},
                               test_disabled=True, random_state=3)
    self.tree = PipelineElement('DecisionTreeClassifier',
                                {'min_samples_split': [2, 3, 4]},
                                random_state=3)

    # NOTE(review): both branches intentionally share the name 'MyBranch';
    # they are used in separate comparisons — confirm no name collision.
    self.transformer_branch = Branch('MyBranch', [self.scaler, self.pca])
    self.transformer_branch_sklearn = SKPipeline([
        ("SS", StandardScaler()),
        ("PCA", PCA(random_state=3)),
    ])

    self.estimator_branch = Branch('MyBranch',
                                   [self.scaler, self.pca, self.tree])
    self.estimator_branch_sklearn = SKPipeline([
        ("SS", StandardScaler()),
        ("PCA", PCA(random_state=3)),
        ("Tree", DecisionTreeClassifier(random_state=3)),
    ])
def test_predict_proba(self):
    """predict_proba of the PHOTON pipeline must equal the sklearn reference."""
    reference = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_dt)])
    reference.fit(self.X, self.y)
    expected = reference.predict_proba(self.X)

    pipe = PhotonPipeline([("SS", self.p_ss), ("SVC", self.p_dt)])
    pipe.fit(self.X, self.y)
    actual = pipe.predict_proba(self.X)

    self.assertTrue(np.array_equal(expected, actual))
def test_predict_with_training_flag(self):
    """A y-transformer inside the pipeline must see transformed X and shift y.

    The sklearn reference is fitted on manually shifted labels; the PHOTON
    pipeline shifts them via a dummy element during fit. Predictions and the
    transformer's observed input must match the reference.
    """
    # reference: fit on labels shifted by one
    reference = SKPipeline([("SS", self.sk_ss), ("SVC", self.sk_svc)])
    reference.fit(self.X, self.y + 1)
    expected_pred = reference.predict(self.X)

    # PHOTON pipeline edits the labels through the dummy element
    pipe = PhotonPipeline([("SS", self.p_ss),
                           ("YT", self.dummy_photon_element),
                           ("SVC", self.p_svm)])
    pipe.fit(self.X, self.y)
    actual_pred = pipe.predict(self.X)

    # the y-transformer must have received the standardized features
    standardized = self.sk_ss.transform(self.X)
    seen_by_transformer = self.dummy_photon_element.base_element.X
    self.assertTrue(np.array_equal(standardized, seen_by_transformer))
    self.assertTrue(np.array_equal(expected_pred, actual_pred))
def test_regular_use(self):
    """Fit/transform/predict parity with element references and with sklearn."""
    pipe = PhotonPipeline([("PCA", self.p_pca), ("SVC", self.p_svm)])
    pipe.fit(self.X, self.y)
    transformed, _, _ = pipe.transform(self.X)
    predicted = pipe.predict(self.X)

    # elements are held by reference, so they were fitted in place above
    ref_transformed, _, _ = self.p_pca.transform(self.X)
    ref_predicted = self.p_svm.predict(ref_transformed)
    self.assertTrue(np.array_equal(transformed, ref_transformed))
    self.assertTrue(np.array_equal(predicted, ref_predicted))

    # cross-check against a plain sklearn pipeline
    reference = SKPipeline([('PCA', self.sk_pca), ("SVC", self.sk_svc)])
    reference.fit(self.X, self.y)
    self.assertTrue(np.array_equal(predicted, reference.predict(self.X)))
def explain_article_lime_task_impl(view_cache_id, ace_id, pipeline_id, article_number):
    """Explain one article's prediction with LIME and store the result.

    Loads the ACE and Pipeline documents, predicts the given article with
    the stored sklearn pipeline, builds a LIME text explanation (TF-IDF
    pipelines) or a LIME feature explanation (other NLP tools), and writes
    the rendered HTML into the CachedView identified by ``view_cache_id``.

    Args:
        view_cache_id: id of the CachedView document to fill.
        ace_id: id of the ACE document holding data source and labels.
        pipeline_id: id of the Pipeline document holding the sklearn pipeline.
        article_number: index (str or int) into the data source's articles.
    """
    ace = ACE.objects.get({'_id': ObjectId(ace_id)})
    pipeline = Pipeline.objects.get({'_id': ObjectId(pipeline_id)})
    article_number = int(article_number)
    article = ace.data_source.articles[article_number]
    sk_pipeline = pipeline.sk_pipeline.get()
    prediction = sk_pipeline.predict([article.raw_text])[0]
    # do not modify pipeline.sk_pipeline — work on a deep copy
    skp = deepcopy(sk_pipeline)
    # drop the final estimator; the remaining steps are pure transforms
    model = skp.steps.pop()[1]
    used_classes = model.classes_
    # map class indices to their human-readable label names
    used_class_names = [ace.data_source.labels[x] for x in used_classes]
    lime_text_html = ''
    lime_features_html = ''
    anchor_html = ''
    # TODO: do not send article raw text, I suspect the bug report for
    # stop-word appearance is due to raw_text although we are sending
    # through the pipeline predict_proba
    if pipeline.nlp_tool.name == 'TF-IDF':
        # the pipeline should be linear, but it contains a FeatureUnion
        # (with a Pipeline), so let's flatten it
        steps = []  # will contain the flattened steps
        for step in sk_pipeline.steps:
            if isinstance(step[1], FeatureUnion):
                # step[1] FeatureUnion, should contain a single Pipeline
                steps.extend(step[1].transformer_list[0][1].steps)
            else:
                steps.append(step)
        # find tfidf vectorizer step number
        # NOTE(review): if no TfidfVectorizer is present, tfidf_step_index
        # silently keeps the last loop value — confirm this cannot happen
        # for 'TF-IDF' pipelines.
        for tfidf_step_index, step in enumerate(steps):
            if isinstance(step[1], TfidfVectorizer):
                break
        # split into the steps before the vectorizer and the rest
        preprocess_pipeline = SKPipeline(steps[:tfidf_step_index])
        rest_pipeline = SKPipeline(steps[tfidf_step_index:])
        # give lime text before tfidf_step, the function should be the
        # rest of the pipeline.
        lime_text_html = get_lime_text_explanation(
            preprocess_pipeline.transform([article.raw_text])[0],
            prediction,
            used_class_names,
            rest_pipeline.predict_proba).as_html()
        # anchor_html = get_anchor_text_explanation(
        #     skp,
        #     article.raw_text,
        #     sk_pipeline.predict,
        #     used_class_names
        # ).as_html()
    else:
        lime_features_html = get_lime_feature_explanation(
            article,
            prediction,
            skp,
            model.predict_proba,
            pipeline.data_source.articles,
            used_class_names).as_html()
    # persist the explanation HTML on the cached view and mark it done
    cache = CachedView.objects.get({'_id': ObjectId(view_cache_id)})
    cache.task.set_success()
    cache.data = dict(
        pipeline=pipeline,
        article=article,
        prediction=prediction,
        exp1_html=lime_text_html or lime_features_html,
        exp2_html=anchor_html,
        article_number=article_number,
    )
    cache.save()
def dict_vectorize(transformer_name, transformer):
    """Chain *transformer* with a dense DictVectorizer in an sklearn pipeline."""
    steps = [
        (transformer_name, transformer),
        ('DictVectorizer', DictVectorizer(sparse=False)),
    ]
    return SKPipeline(steps)