def test_split_span_add_delimiters(self):
    """Split a multi-script text on newlines, keeping the delimiter.

    The first span returned by ``split_spans`` with ``add_delimiter=True``
    must map back to the first line together with its trailing newline.
    """
    source = '1 2 3\nмама\nಶ್ರೀರಾಮ'
    text_map = TextMap(source)
    pieces = list(text_map.split_spans('\n', add_delimiter=True))

    # Dump the text of every span to stdout to ease debugging of failures.
    for piece in pieces:
        print(text_map.text_range(piece))

    self.assertEqual('1 2 3\n', text_map.text_range(pieces[0]))
def doc_features(tokens_map: TextMap):
    """Compute a feature matrix with one row per span of ``tokens_map``.

    The map is split on ``PARAGRAPH_DELIMITER`` (with ``add_delimiter=True``).
    For each resulting span ``line_features`` is called with the map, the
    span, the span's zero-based index and the features computed for the
    previous span (``None`` for the first one).

    Returns a tuple ``(features, spans)``: ``features`` is a numpy array built
    from a ``pandas.DataFrame.from_records`` of the per-span feature records,
    and ``spans`` is the list of spans in iteration order.
    """
    spans = []
    records = []
    previous = None

    # Single pass: each span's features depend on the previous span's result.
    for index, span in enumerate(
            tokens_map.split_spans(PARAGRAPH_DELIMITER, add_delimiter=True)):
        spans.append(span)
        current = line_features(tokens_map, span, index, previous)
        records.append(current)
        previous = current

    frame = pd.DataFrame.from_records(records)
    return np.array(frame), spans