def test_lf_applier_pandas_spacy_preprocessor_memoized(self) -> None:
    """Dask applier with a memoizing SpacyPreprocessor yields the expected label matrix."""
    nlp_pre = SpacyPreprocessor(text_field="text", doc_field="doc")
    nlp_pre.memoize = True

    @labeling_function(pre=[nlp_pre])
    def first_is_name(x: DataPoint) -> int:
        # Label 0 when the leading token is a proper noun; abstain otherwise.
        if x.doc[0].pos_ == "PROPN":
            return 0
        return -1

    @labeling_function(pre=[nlp_pre])
    def has_verb(x: DataPoint) -> int:
        # Label 0 when at least one token is tagged as a verb; abstain otherwise.
        if sum(t.pos_ == "VERB" for t in x.doc) > 0:
            return 0
        return -1

    # Split the text data across two Dask partitions to exercise distributed apply.
    ddf = dd.from_pandas(pd.DataFrame(dict(text=TEXT_DATA)), npartitions=2)
    applier = DaskLFApplier([first_is_name, has_verb])
    L = applier.apply(ddf)
    np.testing.assert_equal(L, L_TEXT_EXPECTED)
def test_spacy_preprocessor(self) -> None:
    """SpacyPreprocessor attaches a parsed doc whose tokens carry text and POS tags."""
    point = SimpleNamespace(text="Jane plays soccer.")
    processed = SpacyPreprocessor("text", "doc")(point)
    assert processed is not None
    doc = processed.doc
    # "Jane plays soccer ." parses into four tokens.
    self.assertEqual(len(doc), 4)
    first_token = doc[0]
    self.assertEqual(first_token.text, "Jane")
    self.assertEqual(first_token.pos_, "PROPN")
def test_lf_applier_pandas_spacy_preprocessor(self) -> None:
    """Pandas applier with a SpacyPreprocessor yields the expected label matrix."""
    nlp_pre = SpacyPreprocessor(text_field="text", doc_field="doc")

    @labeling_function(pre=[nlp_pre])
    def first_is_name(x: DataPoint) -> int:
        # Label 0 when the leading token is a proper noun; abstain otherwise.
        if x.doc[0].pos_ == "PROPN":
            return 0
        return -1

    @labeling_function(pre=[nlp_pre])
    def has_verb(x: DataPoint) -> int:
        # Label 0 when at least one token is tagged as a verb; abstain otherwise.
        if sum(t.pos_ == "VERB" for t in x.doc) > 0:
            return 0
        return -1

    frame = pd.DataFrame(dict(text=TEXT_DATA))
    applier = PandasLFApplier([first_is_name, has_verb])
    L = applier.apply(frame, progress_bar=False)
    np.testing.assert_equal(L, L_TEXT_EXPECTED)
def _create_preprocessor(
    cls, parameters: SpacyPreprocessorParameters
) -> SpacyPreprocessor:
    """Build a SpacyPreprocessor from the fields of the given parameter record."""
    kwargs = parameters._asdict()
    return SpacyPreprocessor(**kwargs)
# For more info, see the [`SpacyPreprocessor` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/preprocess/snorkel.preprocess.nlp.SpacyPreprocessor.html#snorkel.preprocess.nlp.SpacyPreprocessor).
#
#
# If you prefer to use a different NLP tool, you can also wrap that as a `Preprocessor` and use it in the same way.
# For more info, see the [`preprocessor` documentation](https://snorkel.readthedocs.io/en/master/packages/_autosummary/preprocess/snorkel.preprocess.preprocessor.html#snorkel.preprocess.preprocessor).

# %% [markdown] {"tags": ["md-exclude"]}
# If the spaCy English model wasn't already installed, the next cell may raise an exception.
# If this happens, restart the kernel and re-execute the cells up to this point.

# %%
from snorkel.preprocess.nlp import SpacyPreprocessor

# The SpacyPreprocessor parses the text in text_field and
# stores the new enriched representation in doc_field
spacy = SpacyPreprocessor(text_field="text", doc_field="doc", memoize=True)


# %%
@labeling_function(pre=[spacy])
def has_person(x):
    """Ham comments mention specific people and are short."""
    # Generator expression: any() short-circuits on the first PERSON entity
    # instead of materializing a full list of booleans first.
    if len(x.doc) < 20 and any(ent.label_ == "PERSON" for ent in x.doc.ents):
        return HAM
    else:
        return ABSTAIN


# %% [markdown]
# Because spaCy is such a common preprocessor for NLP applications, we also provide a
# [prebuilt `labeling_function`-like decorator that uses spaCy](https://snorkel.readthedocs.io/en/master/packages/_autosummary/labeling/snorkel.labeling.lf.nlp.nlp_labeling_function.html#snorkel.labeling.lf.nlp.nlp_labeling_function).
def _create_preprocessor(
    cls, parameters: SpacyPreprocessorParameters
) -> SpacyPreprocessor:
    """Build a SpacyPreprocessor from the parameter record and register it for Spark."""
    pre = SpacyPreprocessor(**parameters._asdict())
    # Side effect: adapts the preprocessor for use inside Spark workers.
    make_spark_preprocessor(pre)
    return pre
for raw_line in inF:
    allDev.append(json.loads(raw_line))
df_dev = pd.DataFrame(allDev)
# Train set = every tweet whose id is not already in the dev set.
df_train = df_tweets.loc[~df_tweets['id'].isin(df_dev['id'])]

# label mappings
BE = 0
NL = 1
ABSTAIN = -1

# spacy preprocessor for dutch
spacy_preproc = SpacyPreprocessor(
    'text',
    'doc',
    language='nl_core_news_sm',
    memoize=True,
    disable=['tagger', 'parser'],
)


## RULES
@labeling_function()
def country_code(x):
    # country_code based on tweet location
    # precise but low coverage
    label_by_code = {'BE': BE, 'NL': NL}
    return label_by_code.get(x.country_code, ABSTAIN)