Exemplo n.º 1
0
def samples_generator_sorted(path, max_text_legth=10000):
    """Yield ``Sentence`` objects for the rows of a CSV file, longest text first.

    The whole file is read into memory, rows are ordered by the length of
    their fourth column (descending), and one ``Sentence`` is produced per
    row. A text longer than ``max_text_legth`` is passed to
    ``split_long_text`` and one ``Sentence`` is produced per returned
    fragment, each carrying the identifiers of the originating row.

    Args:
        path: Path of the CSV file. Columns 0-3 are read as id, text id,
            sequence and text.
        max_text_legth: Longest text (in characters) that is turned into a
            single ``Sentence``. The misspelled name is kept so existing
            keyword callers continue to work.

    Yields:
        ``Sentence`` objects with ``id``, ``text_id``, ``sequence``, ``ner``
        (a fresh empty list) and ``length`` attributes set.

    Note:
        Blank lines are skipped, and an input without any data rows yields
        nothing instead of raising ``IndexError``. Rows that are non-empty
        but have fewer than four columns still raise ``IndexError``.
    """
    with open(path, newline='') as f:
        # csv.reader yields [] for blank lines; drop them so the column
        # lookups below cannot fail on an empty row.
        rows = [row for row in csv.reader(f) if row]

    if not rows:
        # Nothing to report or yield; rows[0] below would raise otherwise.
        return

    # list.sort is stable, so rows with equal text length keep file order.
    rows.sort(key=lambda row: len(row[3]), reverse=True)
    print('Longest text', len(rows[0][3]))

    for row in rows:
        sample_id = row[0]
        print(sample_id)
        text_id = row[1]
        sequence = row[2]
        text = row[3]

        # Treat a short text as a single fragment so both cases share the
        # same Sentence-building code.
        if len(text) > max_text_legth:
            fragments = split_long_text(text, max_text_legth)
        else:
            fragments = [text]

        for fragment in fragments:
            s = Sentence(fragment, use_tokenizer='toki')
            s.id = sample_id
            s.text_id = text_id
            s.sequence = sequence
            s.ner = []
            s.length = len(fragment)

            yield s
Exemplo n.º 2
0
def samples_generator(path):
    """Yield one ``Sentence`` per data row of a CSV file, in file order.

    Rows are streamed from the file as the generator is consumed, so the
    file stays open until iteration finishes or the generator is closed.

    Args:
        path: Path of the CSV file. Columns 0-3 are read as id, text id,
            sequence and text.

    Yields:
        ``Sentence`` objects with ``id``, ``text_id``, ``sequence`` and
        ``ner`` (a fresh empty list) attributes set.

    Note:
        Blank lines are skipped instead of raising ``IndexError``. Rows
        that are non-empty but have fewer than four columns still raise
        ``IndexError``.
    """
    with open(path, newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for a blank line; there is nothing
                # to build a sample from.
                continue

            sample_id = row[0]
            print(sample_id)
            text_id = row[1]
            sequence = row[2]
            text = row[3]

            s = Sentence(text, use_tokenizer='toki')
            s.id = sample_id
            s.text_id = text_id
            s.sequence = sequence
            s.ner = []

            yield s