def test_nearest_neighbors(metric):
    # create a feature matrix where the 1st and 3rd items point in the same
    # direction and are close to each other, so both cosine and euclidean
    # distance treat them as neighbors; similarly, the 2nd and 4th vectors
    # point in the opposite direction and are far from the other two, so they
    # are neighbors of each other under both metrics
    features = np.array(
        [[1.0, 1, 1], [-0.1, -0.1, -0.1], [1, 0.9, 0.9], [-0.1, -0.1, -0.2]]
    )
    ds = stream.DataStream(features, context=["a", "b", "c", "d"])
    op = ops.neighbors.NearestNeighborsOperation(n_neighbors=2, metric=metric)
    nbors_ds = ds.apply(op)
    nbors = list(nbors_ds)

    # the exact distances do not matter as long as the items we expect to be
    # similar are returned as neighbors
    assert nbors[0] == [
        {"context": "a", "distance": ANY, "item_idx": 0},
        {"context": "c", "distance": ANY, "item_idx": 2},
    ]
    assert nbors[1] == [
        {"context": "b", "distance": ANY, "item_idx": 1},
        {"context": "d", "distance": ANY, "item_idx": 3},
    ]
    assert nbors[2] == [
        {"context": "c", "distance": ANY, "item_idx": 2},
        {"context": "a", "distance": ANY, "item_idx": 0},
    ]
    assert nbors[3] == [
        {"context": "d", "distance": ANY, "item_idx": 3},
        {"context": "b", "distance": ANY, "item_idx": 1},
    ]
def topic_model(
    texts: Union[stream.DataStream, Iterable],
    num_topics: int = 30,
    max_words_per_topic: int = 10,
    vectorize_op: Optional[ops.base.ScikitBasedOperation] = None,
    cleaning_ops: Optional[List[base.Operation]] = None,
    topic_modeling_op: Optional[ops.topic.TopicModelingOperation] = None,
):
    texts = texts if isinstance(texts, stream.DataStream) else stream.DataStream(texts)
    if cleaning_ops is None:
        cleaning_ops = []
    if vectorize_op and not isinstance(vectorize_op, ops.base.ScikitBasedOperation):
        raise ValueError(
            "vectorize_op should be of type ops.base.ScikitBasedOperation"
            f" but got {type(vectorize_op)}"
        )
    elif vectorize_op is None:
        vectorize_op = ops.text.encode.tfidf(max_features=15000, max_df=0.98, min_df=2)
    if not topic_modeling_op:
        topic_modeling_op = ops.topic.lda(n_topics=num_topics)

    topics_ds = texts.apply(*cleaning_ops, vectorize_op, topic_modeling_op)
    feature_names = vectorize_op.model.get_feature_names()
    return topic_modeling_op.map_topics(
        topics_ds,
        feature_names=feature_names,
        max_words_per_topic=max_words_per_topic,
    )
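# A minimal usage sketch for `topic_model` (the import path for `stream`, `ops`,
# and this helper is assumed, and the example texts are made up):
#
#   texts = ["the team won the final match", "parliament passed the new bill"]
#   topic_words_ds = topic_model(texts, num_topics=2, max_words_per_topic=3)
#   for top_words in topic_words_ds:
#       print(top_words)  # top words of the topic assigned to each text
#
# With no ops passed in, this builds the default pipeline:
# no cleaning -> tfidf vectorization -> LDA -> topic ids mapped to their top words.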
def run(self, ds: stream.DataStream) -> stream.DataStream:
    # build an undirected graph where each incoming pair of indices becomes an
    # edge, then emit the connected components as groups
    G = nx.Graph()
    for pair in ds:
        idx1, idx2 = pair[0], pair[1]
        G.add_edge(idx1, idx2)
    groups = list(nx.connected_components(G))
    return stream.DataStream(groups, applied_ops=ds.applied_ops + [self])
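# Illustrative sketch (not part of the operation): with the index pairs
# [(0, 1), (1, 2), (5, 6)], the graph has two connected components, so the
# resulting stream yields the groups {0, 1, 2} and {5, 6}.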
def run(self, ds: stream.DataStream) -> stream.DataStream:
    raw_topics_scores_ds = super().run(ds)
    topics_with_ctx = self._get_topic_per_item(raw_topics_scores_ds)
    topics, ctxs = more_itertools.unzip(topics_with_ctx)
    return stream.DataStream(
        items=topics, applied_ops=ds.applied_ops + [self], context=ctxs
    )
def test_returns_a_stream_with_doc_vectors():
    ds = stream.DataStream(["this is", "another", "sentence"])
    vector_ds = ds.apply(ops.text.embedding.doc_embedding())
    vectors = list(vector_ds)
    context = list(vector_ds.context)

    assert len(vectors) == ds.total_items
    assert len(context) == len(vectors)
    assert all(isinstance(v, np.ndarray) for v in vectors)
def test_flatten():
    input_ds = stream.DataStream(items=[[1, 2], [3, 4, 5]], context=["a", "b"])

    ds = input_ds.apply(ops.stream.flatten(distribute_context=True))
    assert list(ds) == [1, 2, 3, 4, 5]
    assert list(ds.context) == ["a", "a", "b", "b", "b"]

    ds = input_ds.apply(ops.stream.flatten(distribute_context=False))
    assert list(ds) == [1, 2, 3, 4, 5]
    assert list(ds.context) == ["a_0", "a_1", "b_0", "b_1", "b_2"]
def run(self, ds: stream.DataStream):
    if self.mode == "upper":
        fn = str.upper
    elif self.mode == "capitalize":
        fn = str.capitalize
    else:
        fn = str.lower
    items = map(fn, ds)
    return stream.DataStream(
        applied_ops=ds.applied_ops + [self], items=items, context=ds.context
    )
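# Illustrative example of the mapping above (not part of the operation): with
# mode="capitalize", an item "hello WORLD" becomes "Hello world", because
# str.capitalize upper-cases the first character and lower-cases the rest,
# whereas mode="upper" would yield "HELLO WORLD".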
def test_similar_pairs(metric):
    features = np.array(
        [[1.0, 1, 1], [-0.1, -0.1, -0.1], [1, 0.9, 0.9], [-0.1, -0.1, -0.2]]
    )
    ds = stream.DataStream(features, context=["a", "b", "c", "d"])
    op = ops.neighbors.SimilarPairOperation(n_neighbors=2, metric=metric)
    pairs_ds = ds.apply(op)
    pairs = list(pairs_ds)

    assert sorted(pairs) == sorted([("a", "c", ANY), ("b", "d", ANY)])
def test_sentence_extraction():
    texts = ["this is a text. with two sentences.", "this is with single sentence"]
    out_ds = stream.DataStream(items=texts).apply(ops.text.extract.sentences())

    expected_items = [
        ["this is a text.", "with two sentences."],
        ["this is with single sentence"],
    ]
    expected_context = [0, 1]
    actual_items, actual_context = list(out_ds.items), list(out_ds.context)

    assert actual_items == expected_items
    assert actual_context == expected_context
def run(self, ds: stream.DataStream, fit_params: dict = {}) -> stream.DataStream:
    if self.should_train:
        if ds.is_countable:
            train_ds = ds
            pred_ds = ds
        else:
            # a generator-backed stream can only be consumed once, so split it
            # into two independent iterators: one for fitting, one for prediction
            train_items, pred_items = itertools.tee(ds, 2)
            train_context, pred_context = itertools.tee(ds.context, 2)
            train_ds = stream.DataStream(train_items, context=train_context)
            pred_ds = stream.DataStream(pred_items, context=pred_context)
        self._fit(train_ds, fit_params)
    else:
        pred_ds = ds

    predictions = self._predict(pred_ds)
    return stream.DataStream(
        items=predictions, applied_ops=ds.applied_ops + [self], context=ds.context
    )
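# Sketch of why itertools.tee is used above (standard-library behaviour, shown
# with a plain iterator rather than a DataStream):
#
#   import itertools
#   gen = iter([1, 2, 3])
#   a, b = itertools.tee(gen, 2)
#   list(a)  # [1, 2, 3]
#   list(b)  # [1, 2, 3] -- both copies see every item even though the
#            # underlying iterator can only be consumed once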
def run(self, ds: stream.DataStream) -> stream.DataStream:
    docs_ds = self.get_docs_stream(ds)
    docs = zip(docs_ds, docs_ds.context)
    # each match result is a tuple of ((doc, matches), context)
    match_results = self.matcher.pipe(docs, return_matches=True, as_tuples=True)
    # map_except drops any result for which _filter_tokens raises EmptyTextError,
    # so documents with no remaining tokens disappear from the output stream
    new_docs_with_context = more_itertools.map_except(
        self._filter_tokens, match_results, EmptyTextError
    )
    new_docs, context = more_itertools.unzip(new_docs_with_context)
    return stream.DataStream(
        new_docs, applied_ops=ds.applied_ops + [self], context=context
    )
def test_topic_modeling(topic_op):
    ds = stream.DataStream(["this is about playstation", "government governs"])
    topics_ds = ds.apply(ops.text.encode.count(name="vec"), topic_op)
    vec_op = topics_ds.applied_ops.find_by_name("vec")
    feature_names = vec_op.model.get_feature_names()
    topic_word_stream = topic_op.map_topics(
        topics_ds=topics_ds,
        feature_names=feature_names,
        max_words_per_topic=2,
    )
    assert list(topic_word_stream) == [[ANY, ANY], [ANY, ANY]]
def test_sklearn_classifiers_work_with_multiclass_label(
    classifier, feature_vectors, multiclass_labels
):
    vec_ds = stream.DataStream(feature_vectors, context=multiclass_labels)
    fit_params = {"y": multiclass_labels}
    if classifier.supports_batch_training:
        fit_params["classes"] = list(set(multiclass_labels))
    pred_ds = vec_ds.apply(classifier, op_kwargs={"cls": {"fit_params": fit_params}})
    preds = list(pred_ds)

    assert len(preds) == len(feature_vectors)
    assert all(isinstance(p, ops.classify.ClassificationResult) for p in preds)
    assert all(isinstance(p.label, str) for p in preds)
def test_sklearn_classifiers_work_with_multilabel_label(
    classifier, feature_vectors, multilabel_labels
):
    vec_ds = stream.DataStream(feature_vectors, context=multilabel_labels)
    fit_params = {"y": multilabel_labels}
    if classifier.supports_batch_training or not classifier.exclusive_classes:
        fit_params["classes"] = list(
            set(itertools.chain.from_iterable(multilabel_labels))
        )
    pred_ds = vec_ds.apply(classifier, op_kwargs={"cls": {"fit_params": fit_params}})
    preds = list(pred_ds)

    assert len(preds) == len(feature_vectors)
    assert all(isinstance(p, ops.classify.ClassificationResult) for p in preds)
    assert all(isinstance(p.label, (list, tuple)) for p in preds)
def map_topics(
    self,
    topics_ds: stream.DataStream,
    feature_names: Iterable[str],
    max_words_per_topic: int = 5,
) -> stream.DataStream:
    # for every topic, pick the `max_words_per_topic` highest-weighted features;
    # argsort sorts ascending, so take indices from the end in reverse order
    words_of_topics = []
    for topic_vec in self.model.components_:
        words_of_topic = []
        for feature_id in topic_vec.argsort()[-1 : -max_words_per_topic - 1 : -1]:
            words_of_topic.append(feature_names[feature_id])
        words_of_topics.append(words_of_topic)

    # replace each item's topic id with the top words of that topic
    mapped_topics = (words_of_topics[topic_id] for topic_id in topics_ds)
    return stream.DataStream(
        mapped_topics, applied_ops=topics_ds.applied_ops, context=topics_ds.context
    )
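# Illustrative sketch of the top-word selection (made-up numbers): for a topic
# vector [0.1, 0.7, 0.2, 0.5] and max_words_per_topic=2, argsort() returns
# [0, 2, 3, 1] and the slice [-1:-3:-1] picks [1, 3], i.e. the indices of the
# two highest-weighted features in descending order of weight.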
def test_calls_appropriate_underlying_methods_for_training_and_prediction(
    model_class, should_train
):
    model = MagicMock(spec=model_class)
    ds = stream.DataStream([[0, 1, 2], [3, 4, 5]])
    op = cluster.ClusterOperation(model=model)
    op.should_train = should_train
    assert op.should_train == should_train

    test_return_value = [[0, 1], [3, 4]]
    if model_class in cluster.ALGORITHMS_NOT_SUPPORTING_NEW_INFERENCE:
        model.labels_ = test_return_value
        output = op.run(ds)
        model.fit.assert_called_once()
        assert list(output.items) == test_return_value
    elif model_class in cluster.ALGORITHMS_SUPPORTING_NEW_INFERENCE:
        model.predict.return_value = test_return_value
        output = op.run(ds)
        if should_train:
            if op.supports_batch_training:
                model.partial_fit.assert_called_once()
            else:
                model.fit.assert_called_once()
            model.predict.assert_called_once()
        else:
            model.fit.assert_not_called()
            model.predict.assert_called_once()
        assert list(output.items) == test_return_value
    else:
        raise Exception("model class not expected")
def test_token_extraction():
    ds = stream.DataStream(items=["this is a sentence", "woo hoo", ""])
    out_ds = ds.apply(ops.text.extract.tokens())
    expected = [["this", "is", "a", "sentence"], ["woo", "hoo"], []]
    assert list(out_ds.items) == expected
def run(self, ds: stream.DataStream) -> stream.DataStream:
    dist_indices_ds = super().run(ds)
    pairs = self._get_pairs(dist_indices_ds)
    # the original per-item contexts do not make sense for pairs anymore
    return stream.DataStream(pairs, applied_ops=ds.applied_ops + [self])