Example #1
def test_nearest_neighbors(metric):
    # create a feature matrix where the 1st and 3rd rows point in the same
    # direction and lie close together, so they are nearest neighbors under
    # both cosine and euclidean distance; the 2nd and 4th rows point in the
    # opposite direction and are far from the other two, so they pair up
    # with each other under both metrics as well
    features = np.array(
        [[1.0, 1, 1], [-0.1, -0.1, -0.1], [1, 0.9, 0.9], [-0.1, -0.1, -0.2]]
    )
    ds = stream.DataStream(features, context=["a", "b", "c", "d"])
    op = ops.neighbors.NearestNeighborsOperation(n_neighbors=2, metric=metric)
    nbors_ds = ds.apply(op)
    nbors = list(nbors_ds)

    # the exact distances do not matter as long as the items we expect to
    # match are returned as neighbors
    assert nbors[0] == [
        {"context": "a", "distance": ANY, "item_idx": 0},
        {"context": "c", "distance": ANY, "item_idx": 2},
    ]

    assert nbors[1] == [
        {"context": "b", "distance": ANY, "item_idx": 1},
        {"context": "d", "distance": ANY, "item_idx": 3},
    ]

    assert nbors[2] == [
        {"context": "c", "distance": ANY, "item_idx": 2},
        {"context": "a", "distance": ANY, "item_idx": 0},
    ]

    assert nbors[3] == [
        {"context": "d", "distance": ANY, "item_idx": 3},
        {"context": "b", "distance": ANY, "item_idx": 1},
    ]
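
As a sanity check on the fixture geometry, cosine similarity can be computed directly with numpy; this check is illustrative only and not part of the library under test:

import numpy as np

features = np.array(
    [[1.0, 1, 1], [-0.1, -0.1, -0.1], [1, 0.9, 0.9], [-0.1, -0.1, -0.2]]
)

def cosine(u, v):
    # cosine similarity: 1.0 for parallel vectors, -1.0 for opposite ones
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine(features[0], features[2]))  # ~0.999: rows 0 and 2 pair up
print(cosine(features[1], features[3]))  # ~0.943: rows 1 and 3 pair up
print(cosine(features[0], features[1]))  # -1.0: exactly opposite directions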
Example #2
def topic_model(
    texts: Union[stream.DataStream, Iterable],
    num_topics: int = 30,
    max_words_per_topic: int = 10,
    vectorize_op: Optional[ops.base.ScikitBasedOperation] = None,
    cleaning_ops: Optional[List[base.Operation]] = None,
    topic_modeling_op: Optional[ops.topic.TopicModelingOperation] = None,
):
    if not isinstance(texts, stream.DataStream):
        texts = stream.DataStream(texts)
    if cleaning_ops is None:
        cleaning_ops = []

    if vectorize_op and not isinstance(vectorize_op,
                                       ops.base.ScikitBasedOperation):
        raise ValueError(
            "vectorize_op should be of type ops.base.ScikitBasedOperation"
            f" but got {type(vectorize_op)}")
    elif vectorize_op is None:
        vectorize_op = ops.text.encode.tfidf(max_features=15000,
                                             max_df=0.98,
                                             min_df=2)

    if not topic_modeling_op:
        topic_modeling_op = ops.topic.lda(n_topics=num_topics)

    topics_ds = texts.apply(*cleaning_ops, vectorize_op, topic_modeling_op)
    feature_names = vectorize_op.model.get_feature_names()

    return topic_modeling_op.map_topics(
        topics_ds,
        feature_names=feature_names,
        max_words_per_topic=max_words_per_topic)
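
A minimal invocation might look like the sketch below; the texts are made up, and the import path for topic_model is an assumption. Note that with real data you would want a much larger corpus, since the default vectorizer drops words appearing in fewer than two documents (min_df=2):

# hypothetical usage; the import path is an assumption
texts = [
    "the court ruled on the appeal",
    "the team won the championship final",
]
topic_words = topic_model(texts, num_topics=2, max_words_per_topic=5)
for words in topic_words:
    print(words)  # up to 5 top words of each text's dominant topic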
Example #3
    def run(self, ds: stream.DataStream) -> stream.DataStream:
        # build a graph with one edge per similar pair; connected components
        # then merge overlapping pairs into groups transitively
        G = nx.Graph()
        for pair in ds:
            idx1, idx2 = pair[0], pair[1]
            G.add_edge(idx1, idx2)
        groups = list(nx.connected_components(G))
        return stream.DataStream(groups, applied_ops=ds.applied_ops + [self])
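
The grouping logic is plain networkx: each pair contributes an edge, and connected components merge overlapping pairs transitively. A self-contained sketch of the same idea, with made-up pair indices:

import networkx as nx

pairs = [(0, 2), (1, 3), (2, 5)]  # hypothetical indices of similar items

G = nx.Graph()
for idx1, idx2 in pairs:
    G.add_edge(idx1, idx2)

# 0~2 and 2~5 collapse into one group via transitivity
print(list(nx.connected_components(G)))  # [{0, 2, 5}, {1, 3}]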
Example #4
    def run(self, ds: stream.DataStream) -> stream.DataStream:
        raw_topics_scores_ds = super().run(ds)
        topics_with_ctx = self._get_topic_per_item(raw_topics_scores_ds)
        topics, ctxs = more_itertools.unzip(topics_with_ctx)
        return stream.DataStream(items=topics,
                                 applied_ops=ds.applied_ops + [self],
                                 context=ctxs)
Example #5
def test_returns_a_stream_with_doc_vectors():
    ds = stream.DataStream(["this is", "another", "sentence"])

    vector_ds = ds.apply(ops.text.embedding.doc_embedding())
    vectors = list(vector_ds)
    context = list(vector_ds.context)
    assert len(vectors) == ds.total_items
    assert len(context) == len(vectors)
    assert all(isinstance(v, np.ndarray) for v in vectors)
Example #6
def test_flatten():
    input_ds = stream.DataStream(items=[[1, 2], [3, 4, 5]], context=["a", "b"])
    ds = input_ds.apply(ops.stream.flatten(distribute_context=True))

    assert list(ds) == [1, 2, 3, 4, 5]
    assert list(ds.context) == ["a", "a", "b", "b", "b"]

    ds = input_ds.apply(ops.stream.flatten(distribute_context=False))
    assert list(ds) == [1, 2, 3, 4, 5]
    assert list(ds.context) == ["a_0", "a_1", "b_0", "b_1", "b_2"]
Example #7
    def run(self, ds: stream.DataStream):
        if self.mode == "upper":
            fn = str.upper
        elif self.mode == "capitalize":
            fn = str.capitalize
        else:
            fn = str.lower
        items = map(fn, ds)
        return stream.DataStream(applied_ops=ds.applied_ops + [self],
                                 items=items,
                                 context=ds.context)
Example #8
def test_similar_pairs(metric):
    features = np.array(
        [[1.0, 1, 1], [-0.1, -0.1, -0.1], [1, 0.9, 0.9], [-0.1, -0.1, -0.2]]
    )
    ds = stream.DataStream(features, context=["a", "b", "c", "d"])
    op = ops.neighbors.SimilarPairOperation(n_neighbors=2, metric=metric)
    pairs_ds = ds.apply(op)
    pairs = list(pairs_ds)

    assert sorted(pairs) == sorted([("a", "c", ANY), ("b", "d", ANY)])
Example #9
def test_sentence_extraction():
    texts = ["this is a text. with two sentences.", "this is with single sentence"]
    out_ds = stream.DataStream(items=texts).apply(ops.text.extract.sentences())
    expected_items = [
        ["this is a text.", "with two sentences."],
        ["this is with single sentence"],
    ]
    expected_context = [0, 1]

    actual_items, actual_context = list(out_ds.items), list(out_ds.context)
    assert actual_items == expected_items
    assert actual_context == expected_context
Example #10
    def run(self,
            ds: stream.DataStream,
            fit_params: Optional[dict] = None) -> stream.DataStream:
        # avoid a shared mutable default argument
        fit_params = fit_params or {}
        if self.should_train:
            if ds.is_countable:
                train_ds = ds
                pred_ds = ds
            else:
                # an uncountable stream may be a single-pass iterator, so make
                # two independent copies: one for fitting, one for prediction
                train_items, pred_items = itertools.tee(ds, 2)
                train_context, pred_context = itertools.tee(ds.context, 2)
                train_ds = stream.DataStream(train_items,
                                             context=train_context)
                pred_ds = stream.DataStream(pred_items, context=pred_context)
            self._fit(train_ds, fit_params)
        else:
            pred_ds = ds

        predictions = self._predict(pred_ds)
        return stream.DataStream(items=predictions,
                                 applied_ops=ds.applied_ops + [self],
                                 context=ds.context)
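
The itertools.tee branch exists because an uncountable DataStream may wrap a single-pass generator that training would exhaust before prediction could run. A stdlib-only sketch of the trick:

import itertools

items = (x * x for x in range(4))  # a single-pass generator

train_items, pred_items = itertools.tee(items, 2)
print(list(train_items))  # [0, 1, 4, 9] -- consumed by the "training" pass
print(list(pred_items))   # [0, 1, 4, 9] -- still available for prediction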
Example #11
    def run(self, ds: stream.DataStream) -> stream.DataStream:
        docs_ds = self.get_docs_stream(ds)
        docs = zip(docs_ds, docs_ds.context)
        # each match result is a tuple of ((doc, matches), context)
        match_results = self.matcher.pipe(docs,
                                          return_matches=True,
                                          as_tuples=True)
        new_docs_with_context = more_itertools.map_except(
            self._filter_tokens, match_results, EmptyTextError)
        new_docs, context = more_itertools.unzip(new_docs_with_context)
        return stream.DataStream(new_docs,
                                 applied_ops=ds.applied_ops + [self],
                                 context=context)
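
more_itertools.map_except applies a function and silently drops any item that raises one of the listed exceptions, which is how documents emptied by filtering fall out of the stream. A standalone sketch, where EmptyTextError and clean are stand-ins for the library's own error type and filter:

import more_itertools

class EmptyTextError(Exception):
    pass  # stand-in for the library's exception type

def clean(pair):
    text, ctx = pair
    if not text.strip():
        raise EmptyTextError(ctx)  # this item gets skipped entirely
    return text.upper(), ctx

pairs = [("hello", 0), ("   ", 1), ("world", 2)]
kept = more_itertools.map_except(clean, pairs, EmptyTextError)
texts, ctxs = more_itertools.unzip(kept)
print(list(texts), list(ctxs))  # ['HELLO', 'WORLD'] [0, 2]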
Example #12
def test_topic_modeling(topic_op):
    ds = stream.DataStream(["this is about playstation", "government governs"])
    topics_ds = ds.apply(ops.text.encode.count(name="vec"), topic_op)
    vec_op = topics_ds.applied_ops.find_by_name("vec")
    feature_names = vec_op.model.get_feature_names()

    topic_word_stream = topic_op.map_topics(
        topics_ds=topics_ds,
        feature_names=feature_names,
        max_words_per_topic=2,
    )

    assert list(topic_word_stream) == [[ANY, ANY], [ANY, ANY]]
Example #13
def test_sklearn_classifiers_work_with_multiclass_label(
    classifier, feature_vectors, multiclass_labels
):
    vec_ds = stream.DataStream(feature_vectors, context=multiclass_labels)
    fit_params = {"y": multiclass_labels}
    if classifier.supports_batch_training:
        fit_params["classes"] = list(set(multiclass_labels))

    pred_ds = vec_ds.apply(classifier, op_kwargs={"cls": {"fit_params": fit_params}})

    preds = list(pred_ds)
    assert len(preds) == len(feature_vectors)
    assert all(isinstance(p, ops.classify.ClassificationResult) for p in preds)
    assert all(isinstance(p.label, str) for p in preds)
Example #14
def test_sklearn_classifiers_work_with_multilabel_label(
    classifier, feature_vectors, multilabel_labels
):
    vec_ds = stream.DataStream(feature_vectors, context=multilabel_labels)
    fit_params = {"y": multilabel_labels}
    if classifier.supports_batch_training or not classifier.exclusive_classes:
        fit_params["classes"] = list(
            set(itertools.chain.from_iterable(multilabel_labels))
        )

    pred_ds = vec_ds.apply(classifier, op_kwargs={"cls": {"fit_params": fit_params}})

    preds = list(pred_ds)
    assert len(preds) == len(feature_vectors)
    assert all(isinstance(p, ops.classify.ClassificationResult) for p in preds)
    assert all(isinstance(p.label, (list, tuple)) for p in preds)
Example #15
    def map_topics(
        self,
        topics_ds: stream.DataStream,
        feature_names: Iterable[str],
        max_words_per_topic: int = 5,
    ) -> stream.DataStream:
        words_of_topics = []
        for topic_vec in self.model.components_:
            words_of_topic = []
            # walk argsort backwards to collect the top weights in
            # descending order
            top_ids = topic_vec.argsort()[-1:-max_words_per_topic - 1:-1]
            for feature_id in top_ids:
                words_of_topic.append(feature_names[feature_id])
            words_of_topics.append(words_of_topic)

        mapped_topics = (words_of_topics[topic_id] for topic_id in topics_ds)
        return stream.DataStream(mapped_topics,
                                 applied_ops=topics_ds.applied_ops,
                                 context=topics_ds.context)
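
The reversed argsort slice is the standard top-k idiom: argsort is ascending, so walking it backwards from the end yields the indices of the largest weights first. A quick numpy illustration with made-up weights:

import numpy as np

topic_vec = np.array([0.1, 0.9, 0.3, 0.7])
max_words_per_topic = 2
top_ids = topic_vec.argsort()[-1:-max_words_per_topic - 1:-1]
print(top_ids)  # [1 3]: indices of the two largest weights, descending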
Example #16
def test_calls_appropriate_underlying_methods_for_training_and_prediction(
        model_class, should_train):
    model = MagicMock(spec=model_class)
    ds = stream.DataStream([[0, 1, 2], [3, 4, 5]])

    op = cluster.ClusterOperation(model=model)
    op.should_train = should_train

    assert op.should_train == should_train

    test_return_value = [[0, 1], [3, 4]]

    if model_class in cluster.ALGORITHMS_NOT_SUPPORTING_NEW_INFERENCE:
        model.labels_ = test_return_value

        output = op.run(ds)

        model.fit.assert_called_once()
        assert list(output.items) == test_return_value

    elif model_class in cluster.ALGORITHMS_SUPPORTING_NEW_INFERENCE:
        model.predict.return_value = test_return_value

        output = op.run(ds)

        if should_train:
            if op.supports_batch_training:
                model.partial_fit.assert_called_once()
            else:
                model.fit.assert_called_once()
            model.predict.assert_called_once()
        else:
            model.fit.assert_not_called()
            model.predict.assert_called_once()

        assert list(output.items) == test_return_value
    else:
        raise AssertionError(f"unexpected model class: {model_class}")
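
The spec= argument is what keeps these mocks honest: a MagicMock built with spec=model_class only exposes attributes that exist on the real class, so a typo'd method name raises instead of silently recording a call. A minimal demonstration, using KMeans purely as an example class:

from unittest.mock import MagicMock
from sklearn.cluster import KMeans

model = MagicMock(spec=KMeans)
model.fit([[0], [1]])           # allowed: KMeans defines fit
model.fit.assert_called_once()
try:
    model.fit_predictt          # typo: not an attribute of KMeans
except AttributeError as err:
    print(err)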
Example #17
def test_token_extraction():
    ds = stream.DataStream(items=["this is a sentence", "woo hoo", ""])
    out_ds = ds.apply(ops.text.extract.tokens())

    expected = [["this", "is", "a", "sentence"], ["woo", "hoo"], []]
    assert list(out_ds.items) == expected
Example #18
    def run(self, ds: stream.DataStream) -> stream.DataStream:
        dist_indices_ds = super().run(ds)
        pairs = self._get_pairs(dist_indices_ds)

        # the pairs no longer correspond one-to-one with the input items,
        # so the original contexts are dropped
        return stream.DataStream(pairs, applied_ops=ds.applied_ops + [self])