Example #1
  def test_split_span_add_delimiters(self):
    text = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    tm = TextMap(text)

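    # split_spans yields one token-index span per '\n'-separated line;
    # with add_delimiter=True each span also covers the trailing delimiter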
    spans = [s for s in tm.split_spans('\n', add_delimiter=True)]
    for k in spans:
      print(tm.text_range(k))

    self.assertEqual('1 2 3\n', tm.text_range(spans[0]))
Example #2
  def test_map_text_range(self):
    text = """1.2. мама   молилась ಶ್ರೀರಾಮ\n\nРама -- Вишну, А Вишну 
    ел... черешню? (черешня по 10 руб. 20 коп.) '' """

    tm = TextMap(text)
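    # text_range maps the half-open token span [0, 3) back to the original
    # character range, preserving the inner whitespace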
    t = tm.text_range([0, 3])
    self.assertEqual(t, '1.2. мама   молилась')
Example #3
  def test_sentence_at_index_return_delimiters(self):

    tm = TextMap('стороны Заключили\n  договор  ПРЕДМЕТ \nДОГОВОРА')
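    # sentence_at_index(i) returns the token-span bounds of the
    # '\n'-terminated sentence containing token i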
    for i in range(len(tm)):
      print(i, tm[i])

    bounds = tm.sentence_at_index(0)
    print(bounds)
    print(tm.text_range(bounds))
    for i in range(0, 3):
      bounds = tm.sentence_at_index(i)
      self.assertEqual('стороны Заключили\n', tm.text_range(bounds), str(i))

    for i in range(3, 5):
      bounds = tm.sentence_at_index(i)
      self.assertEqual('договор  ПРЕДМЕТ \n', tm.text_range(bounds))

    for i in range(6, 7):
      bounds = tm.sentence_at_index(i)
      self.assertEqual('ДОГОВОРА', tm.text_range(bounds))
Example #4
  def test_find_value_sign_c(self):

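    # 'data' is a fixture defined outside this snippet: tuples of
    # (expected sign, price, currency, _, text)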
    for (sign_expected, price, currency, _, text) in data:
      tm = TextMap(text)
      sign, span = find_value_sign(tm)
      if sign_expected:
        self.assertEqual(sign_expected, sign, text)
      quote = ''
      if span:
        quote = tm.text_range(span)
      print(f'{sign},\t {span},\t {quote}')
Example #5
  def test_token_indices_by_char_range(self):
    text = 'мама'
    span = [0, 4]
    expected = text[span[0]:span[1]]
    print(expected)

    tm = TextMap(text)  # tokenization
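    # inverse of text_range: map a character range to the token span covering it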
    ti = tm.token_indices_by_char_range(span)
    self.assertEqual(0, ti[0])
    self.assertEqual(1, ti[1])

    self.assertEqual(expected, tm.text_range(ti))
Example #6
  def test_finditer(self):
    from analyser.transaction_values import _re_greather_then
    text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(text)
    matches = tm.finditer(_re_greather_then)

    # finditer yields one token-index span per regex match
    results = [t for t in matches]
    results = results[0]  # take the first match

    self.assertEqual('превышающую', tm.text_range(results))
    self.assertEqual(4, results[0])
    self.assertEqual(5, results[1])
Example #7
def nn_get_tag_value(tagname: str,
                     textmap: TextMap,
                     semantic_map: DataFrame,
                     threshold=0.3) -> SemanticTag or None:
    att = semantic_map[tagname].values
    slices = find_top_spans(att, threshold=threshold,
                            limit=1)  # TODO: estimate per-tag thresholds
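    # find_top_spans returns slice objects over contiguous runs of tokens
    # whose score exceeds the threshold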

    if len(slices) > 0:
        span = slices[0].start, slices[0].stop
        value = textmap.text_range(span)
        tag = SemanticTag(tagname, value, span)
        tag.confidence = float(att[slices[0]].mean())
        return tag
    return None
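A minimal usage sketch (hypothetical names: 'subject' is an assumed score
column of semantic_map; textmap and semantic_map come from the surrounding
pipeline):

    # hypothetical column name; threshold as in the default above
    tag = nn_get_tag_value('subject', textmap, semantic_map, threshold=0.3)
    if tag is not None:
        print(tag.confidence)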
Example #8
  def test_concat_then_slice(self):
    text1 = 'этилен мама'
    text2 = 'этилен папа'

    tm0 = TextMap('')
    tm1 = TextMap(text1)
    tm2 = TextMap(text2)

    tm0 += tm1
    tm0 += tm2

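    # '+=' appends the maps without inserting a separator, so 'мама' (token 1)
    # and the second 'этилен' (token 2) are adjacent in the joined text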
    print(tm1.tokens)
    self.assertEqual(text1 + text2, tm0.text)
    self.assertEqual('мамаэтилен', tm0.text_range([1, 3]))

    tm3 = tm0.slice(slice(1, 3))
    self.assertEqual('мамаэтилен', tm3.text)
Example #9
  def test_find_value_sign_b(self):
    text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(text)
    # find_value_sign returns the detected sign and the token span of its quote
    sign, span = find_value_sign(tm)
    quote = tm.text_range(span)
    self.assertEqual('менее', quote)
Example #10
def line_features(tokens_map: TextMap, line_span: (int, int), line_number: int,
                  prev_features):
    tokens: Tokens = tokens_map.tokens_by_range(line_span)
    # TODO: add previous and next lines features
    txt: str = tokens_map.text_range(line_span)

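    # parse a leading heading number such as '1.2.': 'numbers' holds its
    # components, 'span' the token span of the prefix, 's' flags Roman numerals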
    numbers, span, k, s = get_tokenized_line_number(tokens, 0)
    if not numbers:
        numbers = []
        number_minor = -2
        number_major = -2
    else:
        number_minor = numbers[-1]
        number_major = numbers[0]

    header_id = ' '.join(tokens[span[1]:])
    all_upper = header_id.upper() == header_id  # case check must precede lowercasing

    header_id = header_id.lower()

    features = {
        'line_number': line_number,
        # 'popular': _onehot(header_id in popular_headers),
        # 'cr_count': txt.count('\r'),
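        # NB: str.find returns the match position, or -1 when absent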
        'has_contract': _onehot(txt.lower().find('договор')),
        'has_article': _onehot(txt.lower().find('статья')),
        'all_uppercase': _onehot(all_upper),
        'len_tokens': len(tokens),
        'len_chars': len(txt),
        'number_level': len(numbers),
        'number_minor': number_minor,
        'number_major': number_major,
        'number_roman': _onehot(s),
        'endswith_dot': _onehot(txt.rstrip().endswith('.')),
        'endswith_comma': _onehot(txt.rstrip().endswith(',')),
        'endswith_underscore': _onehot(txt.rstrip().endswith('_')),

        # counts
        'dots': header_id.count('.'),
        'tabs': txt.count('\t'),
        'spaces_inside': txt.strip().count(' '),
        'spaces_all': txt.count(' '),
        'commas': header_id.count(','),
        'brackets': _count_strange_symbols(txt, '(){}[]'),
        'dashes': header_id.count('-'),
        'colons': header_id.count(':'),
        'semicolons': header_id.count(';'),
        'strange_symbols': _count_strange_symbols(header_id, '[$@+]?^&'),
        'capitals': _count_capitals(txt),
        'digits': _count_digits(header_id),
        'quotes': _count_strange_symbols(txt, '«»"\'"'),
        'underscores': _count_strange_symbols(txt, '_')
    }

    # if prev_features is None:
    #   # features['prev-number_level'] = 0
    #   features['prev-len_chars']=-1
    # else:
    #   # features['prev-number_level'] = prev_features['number_level']
    #   features['prev-len_chars'] = prev_features['len_chars']

    return features
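A minimal sketch of driving line_features over a document line by line,
reusing split_spans from Example #1; threading prev_features through the
loop is an assumption based on the signature:

    tm = TextMap('1. ПРЕДМЕТ ДОГОВОРА\nстороны заключили договор')
    prev = None
    rows = []
    for n, line_span in enumerate(tm.split_spans('\n', add_delimiter=True)):
        prev = line_features(tm, line_span, n, prev)
        rows.append(prev)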