def test_get_docs_returns_spacy_docs():
    op = SpacyBasedOperation(nlp=nlp)
    text_docs = DataStream(["this is doc 1", "this is doc 2"])
    spacy_docs = DataStream([nlp.make_doc(d) for d in text_docs])
    assert all(isinstance(d, Doc) for d in op.get_docs_stream(text_docs))
    assert all(isinstance(d, Doc) for d in op.get_docs_stream(spacy_docs))
def test_remove_numbers():
    texts = ["One is 1", "Hey, my number is 23458"]
    expected = ["One is", "Hey, my number is"]
    ds = DataStream(texts)
    actual = list(map(str, ds.apply(remove_numbers())))
    assert actual == expected
def test_accepts_stream_of_texts():
    patterns = [[{"LOWER": "this"}]]
    op = TokenFilterOperation(patterns)
    ds = DataStream(["this is a string data stream"])
    output = list(ds.apply(op))
    assert isinstance(output[0], str)
def test_does_not_train_while_training_is_disabled():
    ds = DataStream(["this is text1", "this is text2"])
    op = ScikitBasedOperation(
        model=MagicMock(spec_set=TfidfVectorizer), predict_fn_name="transform"
    )
    op.should_train = False
    ds.apply(op)
    op.model.fit.assert_not_called()
def test_remove_short_words():
    texts = ["this is a first text", "what was that"]
    expected = ["this first text", "what that"]
    ds = DataStream(texts)
    actual = list(map(str, ds.apply(remove_short_words(length=4))))
    assert actual == expected
def test_accepts_stream_of_spacy_docs():
    nlp = spacy.load("en_core_web_sm")
    patterns = [[{"LOWER": "this"}]]
    op = TokenFilterOperation(patterns)
    ds = DataStream(nlp.pipe(["this is a spacy doc data stream"]))
    output = list(ds.apply(op))
    assert isinstance(output[0], str)
def test_remove_emails():
    texts = [
        "please contact us at: [email protected]",
        "send email @ [email protected]",
    ]
    expected = ["please contact us at:", "send email @"]
    ds = DataStream(texts)
    actual = list(map(str, ds.apply(remove_emails())))
    assert actual == expected
def test_remove_links():
    texts = [
        "visit us at www.example.com/testing",
        "our website is http://example.com/",
    ]
    expected = ["visit us at", "our website is"]
    ds = DataStream(texts)
    actual = list(map(str, ds.apply(remove_links())))
    assert actual == expected
def test_vectorizes_correctly(op, input, input_count, is_input_generator):
    # pytest's parametrize does not seem to support generators directly,
    # so convert the input to a generator here when the flag is set
    if is_input_generator:
        input = (x for x in input)
    ds = DataStream(input)
    features_ds = ds.apply(op)
    assert len(op.model.vocabulary_) > 0
    features = list(features_ds)
    assert len(features) == input_count
    assert features[0].shape[-1] == len(op.model.vocabulary_)
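# Hedged illustration of the parametrization the test above implies: a
# generator declared directly inside pytest.mark.parametrize would be
# exhausted after the first run, so inputs are declared as lists together
# with an is_input_generator flag and wrapped inside the test. The
# parameter values and function name below are assumptions for
# illustration only, not the project's actual parametrization.
import pytest

@pytest.mark.parametrize(
    "input, input_count, is_input_generator",
    [
        (["doc one", "doc two"], 2, False),
        (["doc one", "doc two"], 2, True),
    ],
)
def example_parametrized_inputs(input, input_count, is_input_generator):
    if is_input_generator:
        input = (x for x in input)
    assert sum(1 for _ in input) == input_count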
def test_remove_stopwords():
    stopwords = ["this", "that", "an", "a"]
    texts = [
        "That is a nice car",
        "Python is a type of a snake",
        "This test should pass",
    ]
    expected = ["is nice car", "Python is type of snake", "test should pass"]
    ds = DataStream(texts)
    actual = list(map(str, ds.apply(remove_stopwords(stopwords))))
    assert actual == expected
def test_if_every_token_is_removed_then_item_is_discarded():
    texts = ["this will be deleted", "this will not be deleted"]
    context = ["a", "b"]
    ds = DataStream(items=texts, context=context)
    op = remove_stopwords(["this", "will", "be", "deleted"])
    output_ds = ds.apply(op)
    actual_texts = list(output_ds.items)
    actual_context = list(output_ds.context)
    # only one text should remain in the stream,
    # and its context should be "b"
    assert len(actual_texts) == 1
    assert len(actual_context) == 1
    assert actual_context[0] == "b"
def get_docs_stream(self, ds: DataStream) -> DataStream:
    """Returns a DataStream of spacy Docs.

    If the data stream already contains spacy Docs, they are returned
    as-is; otherwise the nlp object is used to create spacy Docs.

    Parameters
    ----------
    ds : DataStream
        input data stream

    Returns
    -------
    out : DataStream
        a data stream containing an iterable of spacy's `Doc` objects
    """
    if ds.item_type != Doc:
        docs_with_context = self.nlp.pipe(
            zip(ds, ds.context),
            as_tuples=True,
            n_process=config.ALLOCATED_PROCESSOR_FOR_SPACY,
        )
        new_docs, context = more_itertools.unzip(docs_with_context)
        return DataStream(
            items=new_docs, applied_ops=ds.applied_ops, context=context
        )
    else:
        return ds
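# Hedged usage sketch for get_docs_stream, assuming `nlp` is a loaded spacy
# pipeline and that SpacyBasedOperation/DataStream come from the surrounding
# codebase as used in the tests above: text streams are converted to Docs
# via nlp.pipe, while streams that already hold Docs pass through untouched.
def example_get_docs_stream_usage(nlp):
    op = SpacyBasedOperation(nlp=nlp)
    text_ds = DataStream(["first text", "second text"])
    docs_ds = op.get_docs_stream(text_ds)  # texts -> spacy Docs
    same_ds = op.get_docs_stream(docs_ds)  # already Docs: returned as-is
    return list(same_ds)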
def run(self, ds: DataStream) -> DataStream:
    docs_ds = self.get_docs_stream(ds)
    processed_docs = map(self.process_doc, docs_ds, docs_ds.context)
    # drop items for which process_doc returned None
    processed_docs = (x for x in processed_docs if x is not None)
    items, context = more_itertools.unzip(processed_docs)
    return DataStream(
        items=items, applied_ops=ds.applied_ops + [self], context=context
    )
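# Hedged sketch of the process_doc hook that run maps over the stream. Its
# signature, (doc, context) -> (item, context) or None, is inferred from run
# above, where None results are filtered out before unzipping; returning
# None is therefore how a subclass drops an item and its context entirely
# (the behavior test_if_every_token_is_removed_then_item_is_discarded checks).
def process_doc_sketch(doc, context):
    # keep only non-stopword tokens; drop the item if nothing survives
    kept = [token.text for token in doc if not token.is_stop]
    if not kept:
        return None
    return " ".join(kept), context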
def run(self, ds, fit_params: dict = {}, predict_params: dict = {}):
    if not self.can_predict_on_new:
        self.should_train = True
    if self.should_train:
        if ds.is_countable:
            train_ds = ds
            pred_ds = ds
        else:
            # a one-shot stream can only be consumed once, so duplicate it:
            # one copy for fitting the model, one for predicting
            train_items, pred_items = itertools.tee(ds, 2)
            train_context, pred_context = itertools.tee(ds.context, 2)
            train_ds = DataStream(train_items, context=train_context)
            pred_ds = DataStream(pred_items, context=pred_context)
        self._fit(train_ds, fit_params)
    else:
        pred_ds = ds
    preds, context = more_itertools.unzip(self._predict(pred_ds, predict_params))
    preds = itertools.chain.from_iterable(preds)
    context = itertools.chain.from_iterable(context)
    return DataStream(
        items=preds, context=context, applied_ops=ds.applied_ops + [self]
    )
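# Hedged end-to-end sketch for ScikitBasedOperation.run: when the input is a
# one-shot generator (not countable), run duplicates the stream with
# itertools.tee so the model is fit on one copy and predicts on the other.
# The constructor arguments mirror the mocked test above; availability of
# sklearn and the example function name are assumptions.
def example_scikit_run_usage():
    from sklearn.feature_extraction.text import TfidfVectorizer

    op = ScikitBasedOperation(model=TfidfVectorizer(), predict_fn_name="transform")
    texts = (t for t in ["this is text1", "this is text2"])  # single-pass input
    features = list(DataStream(texts).apply(op))
    return features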
@pytest.fixture
def ds():
    return DataStream(items=["Aa", "bbB"])
def test_lemmatizes_correctly_for_stream_of_texts(texts, lemmatized, nlp):
    ds = DataStream(texts)
    op = lemmatize(nlp=nlp)
    assert list(ds.apply(op)) == lemmatized
def test_token_filter_keeps_matching_tokens(input, patterns, expected):
    op = TokenFilterOperation(patterns, keep_matching_tokens=True)
    ds = DataStream(input)
    actual = list(map(str, ds.apply(op)))
    assert actual == expected
def test_lemmatizes_correctly_for_stream_of_spacy_docs(texts, lemmatized, nlp):
    op = lemmatize(nlp=nlp)
    docs = nlp.pipe(texts)
    assert list(DataStream(docs).apply(op)) == lemmatized
def test_correctly_changes_cases(ds: DataStream, mode: str, expected: list):
    op = CaseChangeOperation(mode=mode)
    assert list(ds.apply(op)) == expected
def test_filter_pos():
    ds = DataStream(items=["that is an apple"])
    assert list(ds.apply(pos_filter("NOUN")).items) == ["that is an"]