# Assumed import path; adjust to wherever WordTokenizer lives in this project.
from sciwing.tokenizers.word_tokenizer import WordTokenizer


def get_tokenized_data(get_parsect_data):
    """Tokenize the first 100 parsect lines and return (instances, labels)."""
    parsect_json = get_parsect_data
    parsect_lines = parsect_json["parse_sect"]
    parsect_lines = parsect_lines[:100]
    tokenizer = WordTokenizer()

    lines = []
    labels = []
    for line_json in parsect_lines:
        # Each line carries the raw text and its section label.
        text = line_json["text"]
        label = line_json["label"]
        lines.append(text)
        labels.append(label)

    # Word-tokenize all the collected lines in a single batch call.
    instances = tokenizer.tokenize_batch(lines)
    return instances, labels
def test_len_sample_batch(self):
    """tokenize_batch should return one token list per input sentence."""
    sample_sentences = ["I like big apple.", "We process text"]
    tokenizer = WordTokenizer()
    tokenized = tokenizer.tokenize_batch(sample_sentences)
    assert len(tokenized) == 2