def test_split_span_add_delimiters(self):
    """split_spans(add_delimiter=True) keeps the trailing delimiter inside each span."""
    sample = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    tm = TextMap(sample)
    spans = list(tm.split_spans('\n', add_delimiter=True))
    for span in spans:
        print(tm.text_range(span))
    self.assertEqual('1 2 3\n', tm.text_range(spans[0]))
def test_map_text_range(self):
    """text_range over token indices [0, 3) yields the matching source substring."""
    sample = """1.2. мама молилась ಶ್ರೀರಾಮ\n\nРама -- Вишну, А Вишну ел... черешню? (черешня по 10 руб. 20 коп.) '' """
    tm = TextMap(sample)
    fragment = tm.text_range([0, 3])
    self.assertEqual(fragment, '1.2. мама молилась')
def test_sentence_at_index_return_delimiters(self):
    """sentence_at_index maps any token index to its sentence bounds, delimiter included."""
    tm = TextMap('стороны Заключили\n договор ПРЕДМЕТ \nДОГОВОРА')

    for idx in range(len(tm)):
        print(idx, tm[idx])

    first_bounds = tm.sentence_at_index(0)
    print(first_bounds)
    print(tm.text_range(first_bounds))

    # tokens 0..2 belong to the first sentence
    for idx in range(0, 3):
        self.assertEqual('стороны Заключили\n', tm.text_range(tm.sentence_at_index(idx)), str(idx))
    # tokens 3..4 belong to the second sentence
    for idx in range(3, 5):
        self.assertEqual('договор ПРЕДМЕТ \n', tm.text_range(tm.sentence_at_index(idx)))
    # token 6 belongs to the last (undelimited) sentence
    # NOTE(review): index 5 is never asserted in the original test — gap preserved
    for idx in range(6, 7):
        self.assertEqual('ДОГОВОРА', tm.text_range(tm.sentence_at_index(idx)))
def test_find_value_sign_c(self):
    """find_value_sign returns the expected sign for each sample row of module-level `data`."""
    for (sign_expected, price, currency, _, text) in data:
        tm = TextMap(text)
        sign, span = find_value_sign(tm)
        if sign_expected:
            self.assertEqual(sign_expected, sign, text)
        quote = tm.text_range(span) if span else ''
        print(f'{sign},\t {span},\t {quote}')
def test_token_indices_by_char_range(self):
    """A char span covering the whole single-token text maps to token indices [0, 1)."""
    source = 'мама'
    char_span = [0, 4]
    expected = source[char_span[0]:char_span[1]]
    print(expected)

    tm = TextMap(source)  # tokenization
    token_span = tm.token_indices_by_char_range(char_span)
    self.assertEqual(0, token_span[0])
    self.assertEqual(1, token_span[1])
    self.assertEqual(expected, tm.text_range(token_span))
def test_finditer(self):
    """finditer yields token spans for regex matches; first match is 'превышающую' at tokens [4, 5)."""
    from analyser.transaction_values import _re_greather_then
    text = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(text)
    # fix: the original bound the generator to `iter`, shadowing the builtin,
    # and reused `results` for both the list and its first element
    matches = list(tm.finditer(_re_greather_then))
    first = matches[0]
    self.assertEqual('превышающую', tm.text_range(first))
    self.assertEqual(4, first[0])
    self.assertEqual(5, first[1])
def nn_get_tag_value(tagname: str, textmap: TextMap, semantic_map: DataFrame,
                     threshold=0.3) -> 'SemanticTag | None':
  """Extract the highest-scoring span for `tagname` from a per-token attention map.

  :param tagname: column of `semantic_map` holding per-token scores for the tag
  :param textmap: token map used to recover the quoted text for the span
  :param semantic_map: DataFrame of per-token attention values, one column per tag
  :param threshold: minimum score for a token to be included in a span
  :return: a SemanticTag with `confidence` set to the span's mean score,
           or None when no span exceeds the threshold
  """
  # fix: the original annotation `SemanticTag or None` evaluates at definition
  # time to just `SemanticTag`, which is misleading; a string annotation keeps
  # the intended "optional" meaning without importing typing.
  att = semantic_map[tagname].values
  slices = find_top_spans(att, threshold=threshold, limit=1)  # TODO: estimate per-tag thresholds

  if len(slices) == 0:
    return None

  span = slices[0].start, slices[0].stop
  value = textmap.text_range(span)
  tag = SemanticTag(tagname, value, span)
  tag.confidence = float(att[slices[0]].mean())
  return tag
def test_concat_then_slice(self):
    """In-place concatenation of TextMaps, then slicing the combined map by token range."""
    first = 'этилен мама'
    second = 'этилен папа'

    left = TextMap(first)
    right = TextMap(second)
    combined = TextMap('')
    combined += left
    combined += right

    print(left.tokens)
    self.assertEqual(first + second, combined.text)
    self.assertEqual('мамаэтилен', combined.text_range([1, 3]))

    sliced = combined.slice(slice(1, 3))
    self.assertEqual('мамаэтилен', sliced.text)
def test_find_value_sign_b(self):
    """find_value_sign locates the comparison word 'менее' in the sample sentence."""
    sample = """стоимость, равную или превышающую 2000000 ( два миллиона ) долларов сша, но менее"""
    tm = TextMap(sample)
    sign, span = find_value_sign(tm)
    self.assertEqual('менее', tm.text_range(span))
def line_features(tokens_map: TextMap, line_span: (int, int), line_number: int, prev_features):
  """Build the per-line feature dict used for header/line classification.

  :param tokens_map: token map of the whole document
  :param line_span: token range [start, stop) of this line
  :param line_number: ordinal number of the line in the document
  :param prev_features: features of the previous line (currently unused)
  :return: dict of numeric features for this line
  """
  tokens: Tokens = tokens_map.tokens_by_range(line_span)
  # TODO: add previous and next lines features
  txt: str = tokens_map.text_range(line_span)

  numbers, span, k, s = get_tokenized_line_number(tokens, 0)
  if numbers:
    number_minor, number_major = numbers[-1], numbers[0]
  else:
    numbers = []
    number_minor = number_major = -2

  # everything after the leading section number, lowercased
  header_id = ' '.join(tokens[span[1]:]).lower()
  # NOTE(review): `header_id` is already lowercased here, so this is true only
  # when it contains no cased characters at all — confirm this is intended
  all_upper = header_id.upper() == header_id

  lowered = txt.lower()
  rstripped = txt.rstrip()

  return {
    'line_number': line_number,

    'has_contract': _onehot(lowered.find('договор')),
    'has_article': _onehot(lowered.find('статья')),
    'all_uppercase': _onehot(all_upper),
    'len_tokens': len(tokens),
    'len_chars': len(txt),
    'number_level': len(numbers),
    'number_minor': number_minor,
    'number_major': number_major,
    'number_roman': _onehot(s),
    'endswith_dot': _onehot(rstripped.endswith('.')),
    'endswith_comma': _onehot(rstripped.endswith(',')),
    'endswith_underscore': _onehot(rstripped.endswith('_')),

    # counts
    'dots': header_id.count('.'),
    'tabs': txt.count('\t'),
    'spaces_inside': txt.strip().count(' '),
    'spaces_all': txt.count(' '),
    'commas': header_id.count(','),
    'brackets': _count_strange_symbols(txt, '(){}[]'),
    'dashes': header_id.count('-'),
    'colons': header_id.count(':'),
    'semicolons': header_id.count(';'),
    'strange_symbols': _count_strange_symbols(header_id, '[$@+]?^&'),
    'capitals': _count_capitals(txt),
    'digits': _count_digits(header_id),
    'quotes': _count_strange_symbols(txt, '«»"\'"'),
    'underscores': _count_strange_symbols(txt, '_'),
  }