def test_highlight(self):
    """Check that a row flagged in the highlight column is rendered inside a span.

    The second word (``word_id == 1``) is flagged, and the test expects the
    viewer to wrap exactly that word in a background-colour ``<span>``.
    """
    source = 'Мама мыла раму'
    expected_markup = (
        'Мама <span style="background-color:#ffdddd;">мыла</span> раму'
    )

    frame = Separator.separate_string(source)
    # Boolean flag column: True only for the word with id 1.
    frame['highlight'] = frame['word_id'] == 1

    viewer = DfViewer(as_html_object=False, highlight_column='highlight')
    self.assertEqual(expected_markup, viewer.convert(frame))
def test_separation(self):
    """Check word offsets and lengths for a string with mixed punctuation.

    The sample contains guillemets, a hyphenated word, an em dash, an
    ellipsis character and a ``!..`` sequence; the test pins the offset
    and length reported for each of the 15 resulting rows.
    """
    sample = '«Какой-нибудь» текст — с знаками… И еще словами!.. Вот так.'
    expected_offsets = [0, 1, 13, 17, 23, 26, 28, 35, 37, 39, 43, 50, 54, 58, 61]
    expected_lengths = [1, 12, 1, 5, 1, 1, 7, 1, 1, 3, 7, 3, 3, 3, 1]

    frame = Separator.separate_string(sample)

    self.assertListEqual(list(frame.word_offset), expected_offsets)
    self.assertListEqual(list(frame.word_length), expected_lengths)
@classmethod
def setUpClass(cls) -> None:
    """Build the featurizers and their outputs once for the whole test case.

    ``unittest`` calls ``setUpClass()`` on the class without passing any
    argument, so the hook has to be a classmethod; otherwise the call
    fails with a ``TypeError`` before any test runs.

    Stores on the class:
        analyzer: a ``SlovnetFeaturizer`` instance.
        context_featurizer: a ``SlovnetContextFeaturizer`` instance.
        result: output of ``analyzer.featurize`` on the separated text.
        context_result: output of ``context_featurizer.featurize`` applied
            to ``result``.
    """
    super(SlovnetFeaturizersTestCase, cls).setUpClass()
    cls.analyzer = SlovnetFeaturizer()
    cls.context_featurizer = SlovnetContextFeaturizer()

    # ``text`` is defined outside this method (not visible in this chunk).
    df = Separator.separate_string(text)
    # Shift every id column by 100 -- presumably so the featurizers are
    # exercised with ids that do not start at zero; confirm with the tests
    # that read ``cls.result``.
    for c in ['word_id', 'sentence_id', 'paragraph_id']:
        df[c] += 100

    cls.result = cls.analyzer.featurize(df)
    cls.context_result = cls.context_featurizer.featurize(cls.result)
def test_pymorphy(self):
    """Check the index, normal forms and column set produced by PyMorphyFeaturizer.

    Word ids are shifted by 100 before featurizing, and the test expects
    the result to be indexed by those shifted ids (100..110).
    """
    expected_index = [100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110]
    expected_normal_forms = [
        'он', 'подойти', 'к', 'дверь', '.',
        'за', 'она', 'никто', 'не', 'быть', '.',
    ]
    expected_columns = [
        'normal_form', 'alternatives', 'score', 'delta_score', 'POS',
        'animacy', 'gender', 'number', 'case', 'aspect', 'transitivity',
        'person', 'tense', 'mood', 'voice', 'involvement',
    ]

    # ``text`` is defined outside this method (not visible in this chunk).
    frame = Separator.separate_string(text)
    frame.word_id += 100
    featurized = PyMorphyFeaturizer().featurize(frame)

    self.assertListEqual(expected_index, list(featurized.index))
    self.assertListEqual(expected_normal_forms, list(featurized.normal_form))
    self.assertListEqual(expected_columns, list(featurized.columns))
def test_usage_of_provided_pymorphy_column(self):
    """Check how a supplied ``pymorphy`` frame affects ``repetition_status``.

    Three runs over the same two-word input:
      1. no ``pymorphy`` frame in the bundle -- every row is expected to be
         flagged;
      2. a ``pymorphy`` frame with the same normal form for both words --
         not every row is expected to be flagged;
      3. a ``pymorphy`` frame with two different normal forms and a
         different algorithm configuration -- not every row is expected
         to be flagged.
    """
    base = Separator.separate_string("окно открыто")
    base['check_requested'] = True

    # Scenario 1: bundle without a pymorphy frame.
    plain = base.copy()
    algorithm = RepetitionsAlgorithm(50, False, True, False)
    algorithm.run_on_bundle(DataBundle(src=plain))
    self.assertTrue(plain.repetition_status.all())

    # Scenario 2: both words share one provided normal form.
    same_form = base.copy()
    morph = same_form[['word_id']].copy()
    morph['normal_form'] = 'окно'
    morph = morph.set_index('word_id')
    algorithm = RepetitionsAlgorithm(50, False, True, False)
    algorithm.run_on_bundle(DataBundle(src=same_form, pymorphy=morph))
    self.assertFalse(same_form.repetition_status.all())

    # Scenario 3: distinct provided normal forms, different flags.
    distinct_forms = base.copy()
    morph = distinct_forms[['word_id']].copy()
    morph['normal_form'] = ['двуединый', 'единообразие']
    morph = morph.set_index('word_id')
    algorithm = RepetitionsAlgorithm(50, False, False, True)
    algorithm.run_on_bundle(DataBundle(src=distinct_forms, pymorphy=morph))
    self.assertFalse(distinct_forms.repetition_status.all())
def test_viewer(self):
    """Check that the viewer turns a separated frame back into the original string."""
    original = 'Мама мыла раму'
    frame = Separator.separate_string(original)

    viewer = DfViewer(as_html_object=False)
    rendered = viewer.convert(frame)

    self.assertEqual(original, rendered)
def test_separation_columns(self):
    """Check that the separated frame has exactly the columns in ``Separator.COLUMNS``."""
    sample = '«Какой-нибудь» текст — с знаками… И еще словами!.. Вот так.'
    frame = Separator.separate_string(sample)

    actual_columns = list(frame.columns)
    self.assertListEqual(Separator.COLUMNS, actual_columns)
def test_separator_types(self):
    """Check the ``word_type`` assigned to each row of a mixed-script input.

    The input mixes Cyrillic words, a word containing the character
    ``chr(8242)``, Latin words, punctuation and an apostrophe form.
    """
    sample = 'Слово сло' + chr(8242) + 'во! Qwe - йцу ' + "it's"
    expected_types = [
        'ru', 'ru', 'punct', 'unk', 'punct', 'ru', 'unk', 'unk', 'unk',
    ]

    frame = Separator.separate_string(sample)

    self.assertListEqual(expected_types, list(frame.word_type))
def test_separation_string_with_nl(self):
    """Check that text after a newline gets the next ``paragraph_id``."""
    frame = Separator.separate_string('Строка\nВторая строка')

    paragraph_ids = list(frame.paragraph_id)
    self.assertListEqual([0, 1, 1], paragraph_ids)