Пример #1
0
def create_new_corpus(data_dict, corpus_vol, **kwargs):
    """Build a synthetic corpus by concatenating randomly chosen texts.

    Every generated document is assembled from ``sem_nums`` texts, one drawn
    at random from each of ``sem_nums`` distinct intents (keys of
    ``data_dict``).  Each text whose intent is not ``'noise'`` is recorded as
    a ``Span`` covering its character range in the concatenated text, with
    the intent as the span's entity.  ``'noise'`` texts contribute to the
    text and to the label but get no span.

    Args:
        data_dict: Mapping from intent name (``str``) to an iterable of
            texts (``str``) for that intent.
        corpus_vol: Number of documents to generate.
        **kwargs: Must contain ``sem_nums``, the number of distinct intents
            combined into each document.

    Returns:
        A ``Corpus`` with the generated documents, or ``None`` when
        ``corpus_vol`` is falsy or ``sem_nums`` exceeds the number of
        available intents.

    Raises:
        KeyError: If ``sem_nums`` is not passed as a keyword argument.
    """
    sem_nums = kwargs['sem_nums']
    # Materialise the intents once; the draw below needs a sequence.
    intent_pool = list(data_dict)
    if not corpus_vol or sem_nums > len(intent_pool):
        return None

    # A non-positive ``sem_nums`` historically produced empty documents;
    # clamp it so ``random.sample`` does not reject a negative size.
    sample_size = max(sem_nums, 0)

    new_corpus = Corpus([])
    for _ in range(corpus_vol):
        # One draw without replacement replaces the former rejection loop.
        # The result is kept as a set so that the text and the label are
        # both built from the same set-iteration order, as before.
        chosen = set(random.sample(intent_pool, sample_size))

        spanset = SpanSet()
        sentences = []
        offset = 0
        for intent in chosen:
            txt = random.choice(list(data_dict[intent]))
            sentences.append(txt)
            end = offset + len(txt)
            if intent != 'noise':
                spanset.append(Span(start=offset, end=end, entity=intent))
            # Noise still advances the offset so later spans stay aligned.
            offset = end

        new_corpus.append(Document(text=''.join(sentences),
                                   label='|'.join(chosen),
                                   span_set=spanset))

    return new_corpus
Пример #2
0
def test_contains__(datadir, tmpdir):
    """Check ``in`` / ``not in`` membership tests on a ``Corpus``."""
    corpus = Corpus()
    for seq in (seq_one, seq_two):
        corpus.append(seq)

    # A document that was appended must be reported as a member.
    assert seq_one in corpus

    # A fresh document built from an empty string was never appended.
    absent_doc = Document("")
    assert absent_doc not in corpus
Пример #3
0
def test_write_to_file(datadir, tmpdir):
    """Write a two-document corpus to disk and compare it to the gold file."""
    corpus = Corpus()

    corpus.append(seq_one)
    corpus.append(seq_two)

    result_file = tmpdir / "output.conllx"
    corpus.write_to_file(result_file)

    gold_file = datadir / "output.conllx"

    # shallow=False forces a byte-by-byte comparison; the default shallow
    # mode may report equality from matching os.stat() signatures alone.
    assert filecmp.cmp(result_file, gold_file, shallow=False)
Пример #4
0
def test_getitem__(datadir, tmpdir):
    """Check indexing a ``Corpus`` by one index and by a list of indices."""
    corpus = Corpus()
    for seq in (seq_one, seq_two):
        corpus.append(seq)

    # An integer index yields the document stored at that position.
    first = corpus[0]
    assert first == seq_one

    # Indexing with a list covering every position should give back
    # something that compares equal to the corpus itself.
    selected = corpus[[0, 1]]
    assert selected == corpus