Example #1
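Builds a TextMap from a string that starts with 20 spaces of padding and checks that slicing the whole map with slice(0, len(tm)) is lossless: the sliced map keeps the same length, the same span for the first token, and the same text, and character positions that fall into the leading whitespace still resolve to token index 0 via token_index_by_char on both the original and the sliced map.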
  def test_slice_start_from_space(self):
    offff = 20
    txt = ' ' * offff + '''основании Устава, с одной стороны, и Фонд «Благо»'''
    tm = TextMap(txt)
    print(tm.map[0])
    print(tm.tokens[11])
    print(tm.map[11])
    print(f'[{tm.text}]')

    print(len(tm))
    tm_sliced = tm.slice(slice(0, len(tm)))
    print('span-0')
    print(tm.map[0])
    print(tm_sliced.map[0])

    self.assertEqual(len(tm), len(tm_sliced))
    self.assertEqual(tm.map[0], tm_sliced.map[0])

    for c in range(len(tm.tokens[0])):
      print(c)
      self.assertEqual(0, tm.token_index_by_char(c))
      self.assertEqual(0, tm_sliced.token_index_by_char(c))

    self.assertEqual(tm.text, tm_sliced.text)

    self.assertEqual(0, tm.token_index_by_char(0))
Example #2
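Takes a single-token slice of a tokenized string and checks that char_range and token_indices_by_char_range work in the sliced map's local coordinates: the character range of token 0 maps back to token indices 0 and 1, and a character range falling inside the token still starts at token 0.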
  def test_token_indices_by_char_range_sliced(self):
    text = 'm йe qwert'

    __tm = TextMap(text)  # tokenization
    tm = __tm.slice(slice(1, 2))

    self.assertEqual('йe', tm.tokens[0])
    self.assertEqual('йe', tm.text)

    char_range = tm.char_range([0, 1])
    ti = tm.token_indices_by_char_range(char_range)
    self.assertEqual(0, ti[0])
    self.assertEqual(1, ti[1])

    ti = tm.token_indices_by_char_range([1, 2])
    self.assertEqual(0, ti[0])
Example #3
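Starts from an empty TextMap, appends two maps with +=, and checks that the concatenated text, a text_range over token indices [1, 3], and a slice(1, 3) of the combined map all yield the same joined fragment.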
  def test_concat_then_slice(self):
    text1 = 'этилен мама'
    text2 = 'этилен папа'

    tm0 = TextMap('')
    tm1 = TextMap(text1)
    tm2 = TextMap(text2)

    tm0 += tm1
    tm0 += tm2

    print(tm1.tokens)
    self.assertEqual(text1 + text2, tm0.text)
    self.assertEqual('мамаэтилен', tm0.text_range([1, 3]))

    tm3 = tm0.slice(slice(1, 3))
    self.assertEqual('мамаэтилен', tm3.text)
Example #4
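Slices a map whose text mixes Cyrillic and Kannada script, then slices the result again, and checks that indexing, text, token_index_by_char and text_range keep working on the nested slice; text_range([0, 2]) still returns just 'мама' even though the final slice holds a single token.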
  def test_slice(self):
    text = 'этилен мама   ಶ್ರೀರಾಮ'
    tm = TextMap(text)
    tm2: TextMap = tm.slice(slice(1, 2))

    self.assertEqual(tm2[0], 'мама')
    self.assertEqual(tm2.text, 'мама')

    tm3 = tm2.slice(slice(0, 1))
    self.assertEqual(tm3[0], 'мама')

    self.assertEqual(0, tm.token_index_by_char(1))
    self.assertEqual(0, tm2.token_index_by_char(1))
    self.assertEqual(0, tm3.token_index_by_char(1))

    self.assertEqual('мама', tm3.text)
    self.assertEqual('мама', tm3.text_range([0, 1]))
    self.assertEqual('мама', tm3.text_range([0, 2]))
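
All of the snippets above are methods of a unittest.TestCase subclass, so they need a small amount of scaffolding to run on their own. A minimal sketch is shown below; the import path for TextMap is an assumption and should point at whichever module defines it in your project.

import unittest

# Assumed import path; adjust to the module that actually defines TextMap.
from text_tools import TextMap


class TextMapExamples(unittest.TestCase):
  # Paste any of the example methods here, e.g. a trimmed-down test_slice:

  def test_slice(self):
    text = 'этилен мама   ಶ್ರೀರಾಮ'
    tm = TextMap(text)
    tm2 = tm.slice(slice(1, 2))
    self.assertEqual('мама', tm2.text)


if __name__ == '__main__':
  unittest.main()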