Example #1
def process_one_line(line, logger=sys.stderr):
    obj = bson.loads(line)

    text = obj['text']
    intent = obj['intent']
    doc_id = obj['id']
    domain = obj['domain']

    seq = Document(text, label=intent, id=doc_id)
    seq.domain = domain

    for entity in obj['entities']:
        start = int(entity['start'])  # original indices start at 0
        end = int(entity['end'])
        entity_type = entity['entity']

        try:
            span = Span(start, end, entity_type)  # may raise OffsetSpanCheckError
        except OffsetSpanCheckError as e:
            logger.write("{}\tspan init failed: {}\n".format(doc_id, e))
            raise CheckFailedError

        # fill the span's surface value, which is not stored on the raw object
        # span.fill_text(corpus_item['text'])

        seq.span_set.append(span)

    encoding = offset_to_biluo(seq)  # may raise AssertionError

    sentence = SentenceX(word_lines=text,
                         attribute_lines=[encoding],
                         id=seq.id)
    sentence.meta = {'domain': domain, 'label': intent}

    return seq, sentence
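A minimal driver sketch for the function above (assumptions on my part: the input is a line-delimited file whose records carry the text/intent/id/domain/entities fields; process_file and the file name are hypothetical):

def process_file(path, logger=sys.stderr):
    seqs, sentences = [], []
    with open(path) as fd:
        for line in fd:
            try:
                seq, sentence = process_one_line(line, logger=logger)
            except CheckFailedError:
                continue  # skip records whose spans fail validation
            seqs.append(seq)
            sentences.append(sentence)
    return seqs, sentences

# seqs, sentences = process_file("corpus.jsonl")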
Example #2
    def _copy_structure_as_doc(self) -> Document:
        doc = Document(copy.deepcopy(self.text))

        attrs_to_copy = ["domain", "intent", "function", "sub_function"]

        for attr in attrs_to_copy:
            setattr(doc, attr, getattr(self, attr))

        doc.span_set = copy.deepcopy(self.span_set)
        doc.span_set.bind(doc)

        return doc
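Note on the deep copy above: the span set is copied and then re-bound, so the new Document's spans reference the copy rather than the original; later span edits (as in the bind test of Example #8) cannot leak between the two documents.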
Example #3
def two_add_link(map_data, file1, file2, link, domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    link_list = read_raw_data(link)
    len_all = max(len(list1), len(list2))
    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    doc_list = []
    dict_list = read_map(map_data)
    intent = (dict_list[path1[:-4]] + ": " + path1[:-4] + "||" +
              dict_list[path2[:-4]] + ": " + path2[:-4])
    # number of samples: the larger of the two list sizes
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        l3 = choice(link_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        l = l1 + l3 + l2
        doc1 = Document(l)
        doc1.domain = domain
        doc1.intent = intent
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + l3),
                 end=len(l1 + l3 + l2end),
                 entity=path2[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        ll = l2 + l3 + l1
        doc2 = Document(ll)
        doc2.domain = domain
        doc2.intent = intent
        span_list2 = [
            Span(start=0, end=len(l2end), entity=path2[:-4]),
            Span(start=len(l2 + l3),
                 end=len(l2 + l3 + l1end),
                 entity=path1[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + '-' + path2[:-4] + '-' + 'link' + ".conllx"
    corpus.write_to_file(res_path)
Example #4
    def to_offset(self, sequence, text, **kwargs):
        seq = Document(text)

        plain_offset_list = self.decode_to_offset(sequence)

        for offset in plain_offset_list:
            seq.span_set.append(Span(offset[0], offset[1], offset[2]))

        seq.span_set.bind(seq)

        seq.label = kwargs.pop('label', None)
        seq.id = kwargs.pop('id', None)
        seq.extra_attr = kwargs

        return seq
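A hedged round-trip sketch for this method, using the BILUOSequenceEncoderDecoder that Example #6 instantiates (the import path is my assumption; adjust it to wherever the class lives in your tree):

from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder

decoder = BILUOSequenceEncoderDecoder()
tags = ["B-PERSON", "I-PERSON", "L-PERSON", "O"]
doc = decoder.to_offset(tags, "王小明在", label=None, id="demo")
print(doc.span_set)  # expect a single PERSON span covering offsets [0, 3)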
Example #5
    def inference_process(cls, model, input_data, input_ids):
        output = []
        for data, data_id in zip(input_data, input_ids):
            request = Request([data])
            response = model.inference(request)
            tmp_result = response['data'][0].sequence
            if not isinstance(tmp_result, Document):
                tmp_result = Document(tmp_result.text, tmp_result.span_set)
            tmp_result.label = response['cls'][0][0]
            tmp_result.id = data_id
            output.append(tmp_result)
        corpus_inference = Corpus(output)
        return corpus_inference
Example #6
def conllz_to_offset(sentence_data: Sentence,
                     raise_exception=False,
                     attr_index=0) -> Tuple[Document, bool]:
    decoder = BILUOSequenceEncoderDecoder()

    input_text = sentence_data.word_lines
    tags_seq = sentence_data.get_attribute_by_index(attr_index)

    failed = False
    meta = copy.deepcopy(sentence_data.meta)

    try:
        seq = decoder.to_offset(tags_seq,
                                input_text,
                                label=meta.pop('label', None),
                                id=sentence_data.id,
                                **meta)
    except TagSetDecodeError as e:
        if not raise_exception:
            # an invalid tag sequence raised TagSetDecodeError;
            # return an empty result instead
            seq = Document(input_text)
            failed = True
        else:
            raise

    return seq, failed
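A short usage sketch, building a Sentence like the one Example #1 emits and decoding it back (SentenceX's constructor signature is taken from Example #1; treat its availability here as an assumption):

sentence = SentenceX(word_lines=list("王小明在"),
                     attribute_lines=[["B-PERSON", "I-PERSON", "L-PERSON", "O"]],
                     id="demo")
sentence.meta = {"domain": "default", "label": None}

doc, failed = conllz_to_offset(sentence)
assert not failed
print(doc.span_set)  # one PERSON span at offsets [0, 3)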
Example #7
def create_new_corpus(data_dict, corpus_vol, **kwargs):
    new_corpus = Corpus([])
    sem_nums = kwargs['sem_nums']
    intents = data_dict.keys()
    if not corpus_vol:
        return None
    elif sem_nums > len(intents):
        return None
    else:
        for i in range(corpus_vol):
            intent_sam = set()
            while len(intent_sam) < sem_nums:
                intent_sam.add(random.choice(list(intents)))
            spanset = SpanSet()
            sentences = []
            start_position = 0
            for intent in list(intent_sam):
                if intent == 'noise':
                    txt = random.choice(list(data_dict[intent]))
                    sentences.append(txt)
                    start_position += len(txt)
                else:
                    txt = random.choice(list(data_dict[intent]))
                    sentences.append(txt)
                    spanset.append(
                        Span(start=start_position,
                             end=start_position + len(txt),
                             entity=intent))
                    start_position += len(txt)
            doc = Document(text=''.join(sentences),
                           label='|'.join(intent_sam),
                           span_set=spanset)
            new_corpus.append(doc)

    return new_corpus
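A hypothetical call (my reading of the code above: data_dict maps an intent name to an iterable of sentence strings, a 'noise' key contributes text without a span, and sem_nums arrives through **kwargs):

data = {
    "music": ["来一首歌。"],
    "weather": ["今天天气怎么样?"],
    "noise": ["嗯", "那个"],
}
corpus = create_new_corpus(data, corpus_vol=10, sem_nums=2)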
Example #8
def test_bind():
    doc = Document("abc")
    doc.span_set.append(Span(start=0, end=1, entity="a"))

    result = doc.convert_to_md()
    expected = "[a](a) b c"

    assert result == expected

    span = doc.span_set[0]
    span.bind(doc)

    span.value = ["a", "a", "a"]
    result = doc.convert_to_md()
    expected = "[a a a](a) b c"

    assert result == expected
Example #9
def one_add_noise(map_data, file1, noise, domain, pos=''):
    list1 = read_raw_data(file1)
    noise_list = read_raw_data(noise)
    len_all = max(len(list1), len(noise_list))

    dict_list = read_map(map_data)
    path1 = os.path.basename(file1)
    doc_list = []
    # number of samples: the larger of the two list sizes
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(noise_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)
        if pos == 'before':
            l = l2 + l1
            span_list1 = [
                Span(start=len(l2), end=len(l1end + l2), entity=path1[:-4]),
            ]
        elif pos == 'after':
            l = l1 + l2
            span_list1 = [
                Span(start=0, end=len(l1end), entity=path1[:-4]),
            ]
        else:
            l = l1
            span_list1 = [
                Span(start=0, end=len(l1end), entity=path1[:-4]),
            ]

        doc1 = Document(l)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]

        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + '-' + 'noise' + '_' + pos + ".conllx"
    corpus.write_to_file(res_path)
Example #10
def test__contains__(datadir, tmpdir):
    corpus = Corpus()

    corpus.append(seq_one)
    corpus.append(seq_two)

    assert seq_one in corpus

    other_doc = Document("")

    assert other_doc not in corpus
Example #11
def one_to_conllx(map_data, file1, domain):
    list1 = read_raw_data(file1)
    path1 = os.path.basename(file1)
    dict_list = read_map(map_data)
    doc_list = []
    # one document per input line
    for i in list1:
        doc1 = Document(i)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        iend = line_end_remove(i)
        span_list1 = [
            Span(start=0, end=len(iend), entity=path1[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + ".conllx"
    corpus.write_to_file(res_path)
Example #12
def test_difference(datadir):
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = corpus_one.difference(corpus_two)
    expected = Corpus([
        Document(
            "王小明在台北新竹的清华大学读书。",
            span_set=SpanSet(
                [Span(0, 3, "PERSON"),
                 Span(4, 8, "GPE"),
                 Span(9, 13, "ORG")]),
            id="3",
        )
    ])

    assert result == expected
Example #13
def read_data(data_path, output_file):
    data_path = Path(data_path)

    doc_list = []

    for data_file in data_path.glob("*.json"):
        with jsonlines.open(str(data_file)) as reader:
            for obj in reader:
                text = list(obj["content"])
                doc = Document(text)
                doc.sub_function = obj["childFunction"]
                doc.domain = obj["domain"]
                doc.function = obj["function"]
                doc.intent = obj["intent"]

                span_list = []
                for entity in obj["marked"]:
                    record = entity["record"]
                    if not record:
                        continue

                    start = int(record[0])
                    end = int(record[-1]) + 1
                    entity_type = entity["titleIndex"]

                    span = Span(start, end, entity_type)

                    span_list.append(span)

                entities = SpanSet(span_list)

                doc.entities = entities

                doc_list.append(doc)

    corpus = Corpus(doc_list)
    corpus.write_to_file(output_file)
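One illustrative input record, with its shape inferred from the reads above (all field values here are hypothetical):

{"content": "打开空调",
 "domain": "home",
 "function": "control",
 "childFunction": "switch",
 "intent": "turn_on",
 "marked": [{"record": [2, 3], "titleIndex": "device"}]}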
Example #14
def test_express_pattern(datadir):
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")

    express_pattern = ExpressPattern(corpus)
    result = express_pattern.compute()

    expected = {
        ("<PERSON>", "在", "<GPE>", "的", "<ORG>", "读", "书", "。"): [
            Document(
                text=list("王小明在北京的清华大学读书。"),
                span_set=SpanSet(
                    [
                        Span(0, 3, "PERSON", value=None, normal_value=None),
                        Span(4, 6, "GPE", value=None, normal_value=None),
                        Span(7, 11, "ORG", value=None, normal_value=None),
                    ]
                ),
                id="1",
                label=None,
                extra_attr={},
            ),
            Document(
                text=list("王小明在台北新竹的清华大学读书。"),
                span_set=SpanSet(
                    [
                        Span(0, 3, "PERSON", value=None, normal_value=None),
                        Span(4, 8, "GPE", value=None, normal_value=None),
                        Span(9, 13, "ORG", value=None, normal_value=None),
                    ]
                ),
                id="3",
                label=None,
                extra_attr={},
            ),
        ],
        ("来", "一", "首", "<歌手名>", "的", "歌", "。"): [
            Document(
                text=["来", "一", "首", "蓝", "泽", "雨", "的", "歌", "。"],
                span_set=SpanSet([Span(3, 6, "歌手名", value=None, normal_value=None)]),
                id="2",
                label=None,
                extra_attr={},
            )
        ],
    }

    assert result == expected
Example #15
from random import choice

from tokenizer_tools.tagset.offset.corpus import Corpus
from tokenizer_tools.tagset.offset.document import Document
from tokenizer_tools.tagset.offset.span import Span
from tokenizer_tools.tagset.offset.span_set import SpanSet

corpus = Corpus.read_from_file("./data/all_data.conllx")

list1 = list(corpus)

len_all = len(list1)
doc_list = []
for i in range(0, len_all):
    l1 = choice(list1)
    len1 = len(l1.text)
    span_list = []
    for span in l1.span_set:
        span_list.append(span)

    l2 = choice(list1)
    for span in l2.span_set:
        span_ll = Span(start=len1 + span.start,
                       end=len1 + span.end,
                       entity=span.entity)
        span_list.append(span_ll)

    text = "".join(l1.text) + "".join(l2.text)
    doc1 = Document(text)
    doc1.entities = SpanSet(span_list)
    doc1.domain = l1.domain
    doc_list.append(doc1)

doc_list = list(set(doc_list))
corpus = Corpus(doc_list)
corpus.write_to_file('./data/data_all.conllx')
Example #16
import filecmp

from tokenizer_tools.tagset.offset.document import Document
from tokenizer_tools.tagset.offset.span import Span
from tokenizer_tools.tagset.offset.corpus import Corpus
from tokenizer_tools.tagset.offset.document_compare_ways import DocumentCompareWays
from tokenizer_tools.tagset.offset.span_set import SpanSet

seq = Document("王小明在北京的清华大学读书。", id="1")
seq.span_set.append(Span(0, 3, "PERSON", "王小明"))
seq.span_set.append(Span(4, 6, "GPE", "北京"))
seq.span_set.append(Span(7, 11, "ORG", "清华大学"))
seq_one = seq

seq = Document("来一首蓝泽雨的歌。", id="2")
seq.span_set.append(Span(3, 6, "歌手名", "蓝泽雨"))
seq_two = seq


def test_read_from_file(datadir):
    corpus = Corpus.read_from_file(datadir / "output.conllx")

    assert len(corpus) == 2
    assert corpus[0] == seq_one
    assert corpus[1] == seq_two


def test_write_to_file(datadir, tmpdir):
    corpus = Corpus()

    corpus.append(seq_one)
Example #17
def offset_to_biluo(sequence: Document) -> List[str]:
    """
    Convert Sequence object to BILUO string

    :param sequence: Sequence example
    :return: string of BILUO encoding
    """
    encoding = ['O'] * len(sequence.text)

    for span in sequence.span_set:
        encoder = BILUOEncoderDecoder(span.entity)
        entity_text = sequence.text[span.start:span.end]
        entity_encoding = encoder.encode(entity_text)

        encoding[span.start:span.end] = entity_encoding

    return encoding


if __name__ == "__main__":
    seq = Document("王小明在北京的清华大学读书。")
    seq.span_set.append(Span(0, 3, 'PERSON', '王小明'))
    seq.span_set.append(Span(4, 6, 'GPE', '北京'))
    seq.span_set.append(Span(7, 11, 'ORG', '清华大学'))

    check_result = seq.check_span_set()
    print(check_result)

    encoding = offset_to_biluo(seq)
    print(encoding)
Example #18
def two_add_before_link_after(map_data, file1, file2, before, link, after,
                              domain):
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    before_list = read_raw_data(before)
    link_list = read_raw_data(link)
    after_list = read_raw_data(after)
    len_all = min(len(list1), len(list2))

    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    doc_list = []
    dict_list = read_map(map_data)
    intent = (dict_list[path1[:-4]] + ": " + path1[:-4] + "||" +
              dict_list[path2[:-4]] + ": " + path2[:-4])
    # number of samples: the smaller of the two list sizes
    for i in range(0, len_all):
        l1 = choice(list1)
        l2 = choice(list2)
        # note: these deliberately rebind the path parameters to sampled lines
        link = choice(link_list)
        before = choice(before_list)
        before2 = choice(before_list)
        after = choice(after_list)

        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        # file1 + file2
        text1 = l1 + l2

        doc1 = Document(text1)
        doc1.domain = domain
        doc1.intent = intent
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1), end=len(l1 + l2end), entity=path2[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        # file1 + link + file2
        text2 = l1 + link + l2

        doc2 = Document(text2)
        doc2.domain = domain
        doc2.intent = intent
        span_list2 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + link),
                 end=len(l1 + link + l2end),
                 entity=path2[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

        # before + file1 + file2
        text3 = before + l1 + l2

        doc3 = Document(text3)
        doc3.domain = domain
        doc3.intent = intent
        span_list3 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(before + l1),
                 end=len(before + l1 + l2end),
                 entity=path2[:-4]),
        ]
        doc3.entities = SpanSet(span_list3)
        doc_list.append(doc3)

        # before + file1 + file2 + after
        text4 = before + l1 + l2 + after

        doc4 = Document(text4)
        doc4.domain = domain
        doc4.intent = intent
        span_list4 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(before + l1),
                 end=len(before + l1 + l2end),
                 entity=path2[:-4]),
        ]
        doc4.entities = SpanSet(span_list4)
        doc_list.append(doc4)

        # before + file1 + link + file2
        text5 = before + l1 + link + l2

        doc5 = Document(text5)
        doc5.domain = domain
        doc5.intent = intent
        span_list5 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(before + l1 + link),
                 end=len(before + l1 + link + l2end),
                 entity=path2[:-4]),
        ]
        doc5.entities = SpanSet(span_list5)
        doc_list.append(doc5)

        # before + file1 + link + file2 + after
        text6 = before + l1 + link + l2 + after

        doc6 = Document(text6)
        doc6.domain = domain
        doc6.intent = intent
        span_list6 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(before + l1 + link),
                 end=len(before + l1 + link + l2end),
                 entity=path2[:-4]),
        ]
        doc6.entities = SpanSet(span_list6)
        doc_list.append(doc6)

        # file1 + file2 + after
        text7 = l1 + l2 + after

        doc7 = Document(text7)
        doc7.domain = domain
        doc7.intent = intent
        span_list7 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1), end=len(l1 + l2end), entity=path2[:-4]),
        ]
        doc7.entities = SpanSet(span_list7)
        doc_list.append(doc7)

        # file1 + link + file2 + after
        text8 = l1 + link + l2 + after

        doc8 = Document(text8)
        doc8.domain = domain
        doc8.intent = intent
        span_list8 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + link),
                 end=len(l1 + link + l2end),
                 entity=path2[:-4]),
        ]
        doc8.entities = SpanSet(span_list8)
        doc_list.append(doc8)

        # file1 + before + file2
        text9 = l1 + before + l2

        doc9 = Document(text9)
        doc9.domain = domain
        doc9.intent = intent
        span_list9 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + before),
                 end=len(l1 + before + l2end),
                 entity=path2[:-4]),
        ]
        doc9.entities = SpanSet(span_list9)
        doc_list.append(doc9)

        # file1 + before + file2 + after
        text10 = l1 + before + l2 + after

        doc10 = Document(text10)
        doc10.domain = domain
        doc10.intent = intent
        span_list10 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + before),
                 end=len(l1 + before + l2end),
                 entity=path2[:-4]),
        ]
        doc10.entities = SpanSet(span_list10)
        doc_list.append(doc10)

        # file1 + link + before + file2
        text11 = l1 + link + before + l2

        doc11 = Document(text11)
        doc11.domain = domain
        doc11.intent = intent
        span_list11 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + link + before),
                 end=len(l1 + link + before + l2end),
                 entity=path2[:-4]),
        ]
        doc11.entities = SpanSet(span_list11)
        doc_list.append(doc11)

        # file1 + link + before + file2 + after
        text12 = l1 + link + before + l2 + after

        doc12 = Document(text12)
        doc12.domain = domain
        doc12.intent = intent
        span_list12 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
            Span(start=len(l1 + link + before),
                 end=len(l1 + link + before + l2end),
                 entity=path2[:-4]),
        ]
        doc12.entities = SpanSet(span_list12)
        doc_list.append(doc12)

        # before + file1 + before2 + file2
        text13 = before + l1 + before2 + l2

        doc13 = Document(text13)
        doc13.domain = domain
        doc13.intent = intent
        span_list13 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(l1 + before + before2),
                 end=len(l1 + before + before2 + l2end),
                 entity=path2[:-4]),
        ]
        doc13.entities = SpanSet(span_list13)
        doc_list.append(doc13)

        # before + file1 + before2 + file2 + after
        text14 = before + l1 + before2 + l2 + after

        doc14 = Document(text14)
        doc14.domain = domain
        doc14.intent = intent
        span_list14 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(l1 + before + before2),
                 end=len(l1 + before + before2 + l2end),
                 entity=path2[:-4]),
        ]
        doc14.entities = SpanSet(span_list14)
        doc_list.append(doc14)

        # before + file1 + link + before2 + file2
        text15 = before + l1 + link + before2 + l2

        doc15 = Document(text15)
        doc15.domain = domain
        doc15.intent = intent
        span_list15 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(l1 + before + link + before2),
                 end=len(l1 + before + link + before2 + l2end),
                 entity=path2[:-4]),
        ]
        doc15.entities = SpanSet(span_list15)
        doc_list.append(doc15)

        # before + file1 + link + before2 + file2 + after
        text16 = before + l1 + link + before2 + l2 + after

        doc16 = Document(text16)
        doc16.domain = domain
        doc16.intent = intent
        span_list16 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
            Span(start=len(l1 + before + link + before2),
                 end=len(l1 + before + link + before2 + l2end),
                 entity=path2[:-4]),
        ]
        doc16.entities = SpanSet(span_list16)
        doc_list.append(doc16)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + '-' + path2[:-4] + ".conllx"
    corpus.write_to_file(res_path)
Example #19
def one_before_after_add_noise(map_data, file1, before_noise, after_noise,
                               domain):
    list1 = read_raw_data(file1)
    before_list = read_raw_data(before_noise)
    after_list = read_raw_data(after_noise)
    len_all = max(len(list1), len(before_list), len(after_list))

    dict_list = read_map(map_data)
    path1 = os.path.basename(file1)
    doc_list = []
    # number of samples: the largest of the three list sizes
    for i in range(0, len_all):
        l1 = choice(list1)
        before = choice(before_list)
        after = choice(after_list)

        l1end = line_end_remove(l1)

        # file1
        doc1 = Document(l1)
        doc1.domain = domain
        doc1.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list1 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
        ]
        doc1.entities = SpanSet(span_list1)
        doc_list.append(doc1)

        # before + file1
        text2 = before + l1
        doc2 = Document(text2)
        doc2.domain = domain
        doc2.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list2 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
        ]
        doc2.entities = SpanSet(span_list2)
        doc_list.append(doc2)

        # before + file1 + after
        text3 = before + l1 + after
        doc3 = Document(text3)
        doc3.domain = domain
        doc3.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list3 = [
            Span(start=len(before), end=len(before + l1end),
                 entity=path1[:-4]),
        ]
        doc3.entities = SpanSet(span_list3)
        doc_list.append(doc3)

        # file1 + after
        text4 = l1 + after
        doc4 = Document(text4)
        doc4.domain = domain
        doc4.intent = dict_list[path1[:-4]] + ": " + path1[:-4]
        span_list4 = [
            Span(start=0, end=len(l1end), entity=path1[:-4]),
        ]
        doc4.entities = SpanSet(span_list4)
        doc_list.append(doc4)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + path1[:-4] + ".conllx"
    corpus.write_to_file(res_path)