def test_map_tokens_in_range(self):
    """tokens_by_range([0, 2]) returns the first two tokens of a mixed-script text."""
    sample = '1.2. мама ಶ್ರೀರಾಮ'
    text_map = TextMap(sample)

    selected = text_map.tokens_by_range([0, 2])

    # Exactly two tokens are expected; the third word lies outside the range.
    self.assertEqual(len(selected), 2)
    for position, expected_token in enumerate(('1.2.', 'мама')):
        self.assertEqual(selected[position], expected_token)
def line_features(tokens_map: TextMap, line_span: (int, int), line_number: int, prev_features):
    """Build the feature dictionary describing a single line of a document.

    The line is read both as raw text (``tokens_map.text_range``) and as
    tokens (``tokens_map.tokens_by_range``). ``get_tokenized_line_number`` is
    asked for the numbering at the start of the line; the tokens from
    ``span[1]`` onwards are joined with single spaces and lower-cased to form
    the header text, on which several of the counters are computed.

    Args:
        tokens_map: text map that the line belongs to.
        line_span: range of the line; passed unchanged to
            ``tokens_by_range`` and ``text_range``.
        line_number: stored as-is under the ``'line_number'`` key.
        prev_features: not used yet (see the TODO in the body); presumably
            the feature dict of the previous line.

    Returns:
        A dict mapping feature names to numeric values.
    """
    # TODO: add previous and next lines features (``prev_features`` is not
    # used yet; candidates were 'prev-number_level' and 'prev-len_chars',
    # plus a 'popular' header flag and a '\r' counter).
    tokens: Tokens = tokens_map.tokens_by_range(line_span)
    txt: str = tokens_map.text_range(line_span)

    # The third returned value is not needed here; the fourth one feeds the
    # 'number_roman' feature.
    numbers, span, _, roman = get_tokenized_line_number(tokens, 0)
    if numbers:
        number_major = numbers[0]
        number_minor = numbers[-1]
    else:
        # No numbering found: normalise to an empty list and use -2 as the
        # "no number" marker for both levels.
        numbers = []
        number_major = number_minor = -2

    header_raw = ' '.join(tokens[span[1]:])
    # The upper-case check has to look at the header before it is
    # lower-cased; on lower-cased text it can only succeed when the header
    # contains no cased letters at all.
    all_upper = header_raw.upper() == header_raw
    header_id = header_raw.lower()

    txt_lower = txt.lower()
    txt_tail = txt.rstrip()

    # NOTE(review): 'has_contract', 'has_article' and 'all_uppercase' used to
    # be computed differently (raw ``str.find`` result / check on lower-cased
    # text). Anything fitted on the old values should be re-checked.
    features = {
        'line_number': line_number,

        # ``str.find`` returns -1 when the word is absent and 0 when the line
        # starts with it, so a membership test is used for the presence flag.
        'has_contract': _onehot('договор' in txt_lower),
        'has_article': _onehot('статья' in txt_lower),
        'all_uppercase': _onehot(all_upper),

        'len_tokens': len(tokens),
        'len_chars': len(txt),

        'number_level': len(numbers),
        'number_minor': number_minor,
        'number_major': number_major,
        'number_roman': _onehot(roman),

        'endswith_dot': _onehot(txt_tail.endswith('.')),
        'endswith_comma': _onehot(txt_tail.endswith(',')),
        'endswith_underscore': _onehot(txt_tail.endswith('_')),

        # counts
        'dots': header_id.count('.'),
        'tabs': txt.count('\t'),
        'spaces_inside': txt.strip().count(' '),
        'spaces_all': txt.count(' '),
        'commas': header_id.count(','),
        'brackets': _count_strange_symbols(txt, '(){}[]'),
        'dashes': header_id.count('-'),
        'colons': header_id.count(':'),
        'semicolons': header_id.count(';'),
        'strange_symbols': _count_strange_symbols(header_id, '[$@+]?^&'),
        'capitals': _count_capitals(txt),
        'digits': _count_digits(header_id),
        'quotes': _count_strange_symbols(txt, '«»"\'"'),
        'underscores': _count_strange_symbols(txt, '_'),
    }

    return features