# Import path assumes sciwing's word tokenizer module; adjust if your package layout differs.
from sciwing.tokenizers.word_tokenizer import WordTokenizer


def get_tokenized_data(get_parsect_data):
    # `get_parsect_data` is expected to provide the parsect JSON (here used
    # as a pytest fixture); keep only the first 100 line records for speed.
    parsect_json = get_parsect_data
    parsect_lines = parsect_json["parse_sect"]
    parsect_lines = parsect_lines[:100]
    tokenizer = WordTokenizer()

    lines = []
    labels = []

    # Collect the raw text and its label from every line record.
    for line_json in parsect_lines:
        text = line_json["text"]
        label = line_json["label"]
        lines.append(text)
        labels.append(label)

    # Tokenize all lines at once; one list of word tokens per line is returned.
    instances = tokenizer.tokenize_batch(lines)

    return instances, labels
Example #2
def test_len_sample_batch(self):
    # tokenize_batch should return one token list per input sentence.
    sample_sentences = ["I like big apple.", "We process text"]
    tokenizer = WordTokenizer()
    tokenized = tokenizer.tokenize_batch(sample_sentences)
    assert len(tokenized) == 2
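For quick experimentation outside a test suite, a minimal standalone sketch of the same call is shown below; the import path assumes sciwing's word tokenizer module and the exact tokens depend on the underlying tokenizer, so treat both as assumptions.

# A minimal usage sketch, assuming sciwing's WordTokenizer.
from sciwing.tokenizers.word_tokenizer import WordTokenizer  # assumed import path

tokenizer = WordTokenizer()
tokens = tokenizer.tokenize_batch(["I like big apple.", "We process text"])
# One list of word tokens per input sentence, e.g. tokens[0] is something
# like ["I", "like", "big", "apple", "."] with the default tokenizer.
print(len(tokens))  # 2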