class CRFTagger(ITagger):
    """Tagger driven by the model, tag table and extractor loaded from ``crf``.

    Features for every position are gathered over the offsets -2..2 and
    flattened into one ``'{position}:{name}'`` keyed dict per position.
    """

    def __init__(self, cache_size: Optional[int] = 15000):
        """Load the model resources and build the feature window generator.

        :param cache_size: ``size`` handed to ``Cache`` when wrapping the
            extractor; ``None`` leaves the extractor unwrapped.
        """
        self._tagger = crf.load_tagger()
        self._tags = crf.load_tags()

        extractor = crf.load_extractor()
        if cache_size is not None:
            extractor = Cache(extractor, size=cache_size)

        # The same extractor instance serves every offset of the window.
        window_extractors = dict.fromkeys(range(-2, 3), extractor)
        self._generator = FeatureWindowGenerator(window_extractors)

    def _get_features(self, text: Text):
        """Yield one flat feature dict per position of ``text``."""
        every_position = range(len(text))
        for window in self._generator.generate(text, every_position):
            flat = {}
            for position, features in window:
                for name, value in features:
                    flat[f'{position}:{name}'] = value
            yield flat

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for each requested index with a truthy label.

        The whole ``text`` is labelled; only positions listed in ``indices``
        are reported, and positions whose label is falsy are skipped.
        """
        labels = self._tagger.tag(self._get_features(text))
        for index in indices:
            label = labels[index]
            if not label:
                continue
            yield index, self._tags[label]
class LinearTagger(ITagger):
    """Tagger that scores feature windows with coefficients loaded from ``linear``.

    Each kept position gets the tag whose score
    ``matrix * coefficients + intercept`` is the largest along axis 1.
    """

    def __init__(self, cache_size: Optional[int] = 15000):
        """Load model parameters and build the window generator and vectorizer.

        :param cache_size: ``size`` handed to ``Cache`` when wrapping the
            extractor; ``None`` leaves the extractor unwrapped.
        """
        self._coefficients = linear.load_coefficients()
        self._intercept = linear.load_intercept()
        self._tags = linear.load_tags()

        vocabulary = linear.load_vocabulary()
        extractor = linear.load_extractor()
        if cache_size is not None:
            extractor = Cache(extractor, size=cache_size)

        # The vocabulary keys double as the window offsets; all of them
        # share the same extractor instance.
        window_extractors = dict.fromkeys(vocabulary.keys(), extractor)
        self._generator = FeatureWindowGenerator(window_extractors)
        self._vectorizer = SparseWindowVectorizer(vocabulary)

    def tag(self, text: Text, indices: Indices) -> Iterator[Tagged]:
        """Yield ``(index, tag)`` for every requested index that passes the filter.

        Words are normalized first; indices whose normalized word fails
        ``is_cyrillic`` are dropped and never yielded.
        """
        normalized = list(map(normalize, text))

        kept = []
        for index in indices:
            if is_cyrillic(normalized[index]):
                kept.append(index)

        windows = self._generator.generate(normalized, kept)
        matrix = self._vectorizer.transform(windows)
        scores = matrix * self._coefficients + self._intercept
        labels = scores.argmax(axis=1)

        for index, label in zip(kept, labels):
            yield index, self._tags[label]
def _assert_windows_equal(
    expected: Iterable[FeatureWindow],
    generator: FeatureWindowGenerator,
    text: Text,
    indices: Indices = None,
):
    """Assert that ``generator`` produces the ``expected`` windows for ``text``.

    :param expected: the windows the generator is supposed to produce.
    :param generator: object whose ``generate(text, indices)`` is under test.
    :param text: the input passed to ``generator.generate``.
    :param indices: positions to generate windows for; ``None`` (the default)
        means every position of ``text``.
    :raises AssertionError: if the number of windows differs or any pair of
        corresponding windows is not equal.
    """
    # Test against None, not truthiness: an explicitly empty ``indices`` must
    # be passed through as-is instead of silently expanding to the whole text.
    if indices is None:
        indices = range(len(text))
    actual = generator.generate(text, indices)

    # ``_unroll`` is applied to both sides so they are compared in the same
    # form; its result must support ``len()`` for the check below.
    expected = _unroll(expected)
    actual = _unroll(actual)

    # Length is checked first because ``zip`` would stop at the shorter side
    # and hide missing or extra windows.
    assert len(expected) == len(actual)
    for expected_window, actual_window in zip(expected, actual):
        assert expected_window == actual_window
def _assert_windows_equal(
    expected: Iterable[FeatureWindow],
    generator: FeatureWindowGenerator,
    text: Text,
    indices: Indices = None,
):
    """Assert that ``generator`` produces the ``expected`` windows for ``text``.

    :param expected: the windows the generator is supposed to produce.
    :param generator: object whose ``generate(text, indices)`` is under test.
    :param text: the input passed to ``generator.generate``.
    :param indices: positions to generate windows for; ``None`` (the default)
        means every position of ``text``.
    :raises AssertionError: if the number of windows differs or any pair of
        corresponding windows is not equal.
    """
    # Test against None, not truthiness: an explicitly empty ``indices`` must
    # be passed through as-is instead of silently expanding to the whole text.
    if indices is None:
        indices = range(len(text))
    actual = generator.generate(text, indices)

    def unroll(windows: Iterable[FeatureWindow]):
        """Materialize windows (and each features iterable) into nested lists."""
        return [
            [(position, list(features)) for position, features in window]
            for window in windows
        ]

    expected = unroll(expected)
    actual = unroll(actual)

    # Length is checked first because ``zip`` would stop at the shorter side
    # and hide missing or extra windows.
    assert len(expected) == len(actual)
    for expected_window, actual_window in zip(expected, actual):
        assert expected_window == actual_window