Пример #1
0
    def _inference(self, model, input_data: list):
        '''
            Run inference on the input data with the given model, i.e. use the
            model to tag entities in the input data, then save the results to
            the configured output path so they can be reviewed manually.
        :param model: the model used for inference
        :param input_data: list of corpus samples to annotate
        :return: None
        '''
        output = []
        # batch size of 1: each inference request carries a single sample
        batch_size = 1
        batches = MtModelInference_Deliverable.generate_batch_input(
            input_data, batch_size)
        for batch in batches:
            request = Request(batch)
            response = model.inference(request)
            # tagged sequence for the (single) sample in this batch
            tmp_result = response['data'][0].sequence
            # attach the classification label predicted for this sample
            tmp_result.label = response['cls'][0][0]
            output.append(tmp_result)

        predict_result = Corpus(output)
        predict_result.write_to_file(
            os.path.join(self.config['output_filepath'],
                         'inference_out.conllx'))

        print(
            '*** inference has been done, please check the result through the path below:'
        )
        print('==>{}'.format(self.config['output_filepath']))

        return
Пример #2
0
def main(model_dir,
         gold_corpus_file,
         predicted_corpus_file,
         install_dependencies=True):
    """Run the model over a gold corpus and write the predictions to a file.

    Each gold document is sent to the model one at a time; failed inferences
    are tagged via ``extra_attr`` and written after the successful ones.
    Prints the failed/successful counts at the end.
    """
    gold_corpus = Corpus.read_from_file(gold_corpus_file)
    dm_model = dm.load(model_dir, install_dependencies=install_dependencies)

    ok_docs = []
    failed_docs = []

    for gold_doc in gold_corpus:
        response = dm_model.inference(dm.make_request(query=[gold_doc.text]))
        result = response.data[0]

        doc = result.sequence
        doc.id = gold_doc.id

        if result.is_failed:
            # keep failure details on the document for later inspection
            doc.extra_attr["is_failed"] = True
            doc.extra_attr["exec_msg"] = result.exec_msg
            failed_docs.append(doc)
        else:
            ok_docs.append(doc)

    Corpus(ok_docs + failed_docs).write_to_file(predicted_corpus_file)

    print(len(failed_docs), len(ok_docs))
Пример #3
0
def two_add_link(map_data, file1, file2, link, domain):
    """Generate combined two-intent documents joined by a connector line.

    For every iteration, a random line from each input file is glued with a
    random connector in both orders (a+link+b and b+link+a), annotated with
    entity spans over each segment, de-duplicated and written to
    ``./data/<a>-<b>-link.conllx``.
    """
    lines_a = read_raw_data(file1)
    lines_b = read_raw_data(file2)
    connectors = read_raw_data(link)
    total = max(len(lines_a), len(lines_b))
    name_a = os.path.basename(file1)[:-4]
    name_b = os.path.basename(file2)[:-4]
    intent_map = read_map(map_data)
    # both generated orders share the same combined intent string
    intent = (intent_map[name_a] + ": " + name_a + "||" +
              intent_map[name_b] + ": " + name_b)

    docs = []
    for _ in range(total):
        a = choice(lines_a)
        b = choice(lines_b)
        conn = choice(connectors)
        a_stripped = line_end_remove(a)
        b_stripped = line_end_remove(b)

        forward = Document(a + conn + b)
        forward.domain = domain
        forward.intent = intent
        forward.entities = SpanSet([
            Span(start=0, end=len(a_stripped), entity=name_a),
            Span(start=len(a + conn),
                 end=len(a + conn + b_stripped),
                 entity=name_b),
        ])
        docs.append(forward)

        backward = Document(b + conn + a)
        backward.domain = domain
        backward.intent = intent
        backward.entities = SpanSet([
            Span(start=0, end=len(b_stripped), entity=name_b),
            Span(start=len(b + conn),
                 end=len(b + conn + a_stripped),
                 entity=name_a),
        ])
        docs.append(backward)

    corpus = Corpus(list(set(docs)))
    res_path = "./data/" + name_a + '-' + name_b + '-' + 'link' + ".conllx"
    corpus.write_to_file(res_path)
Пример #4
0
def test_write_to_file(datadir, tmpdir):
    """Corpus.write_to_file output must match the gold file byte-for-byte."""
    corpus = Corpus()
    for seq in (seq_one, seq_two):
        corpus.append(seq)

    result_file = tmpdir / "output.conllx"
    corpus.write_to_file(result_file)

    assert filecmp.cmp(result_file, datadir / "output.conllx")
Пример #5
0
def group_by_domain(input_file, output_dir):
    """Split a corpus into one ``<domain>.conllx`` file per document domain."""
    out_root = Path(output_dir)

    docs_by_domain = collections.defaultdict(list)
    for doc in Corpus.read_from_file(input_file):
        docs_by_domain[doc.domain].append(doc)

    for domain, docs in docs_by_domain.items():
        Corpus(docs).write_to_file(out_root / "{}.conllx".format(domain))
Пример #6
0
def one_add_noise(map_data, file1, noise, domain, pos=''):
    """Generate annotated documents from file1, optionally padded with noise.

    :param map_data: path of the intent-name mapping file
    :param file1: raw-line file whose lines carry the entity of interest
    :param noise: raw-line file of noise text
    :param domain: domain label assigned to every generated document
    :param pos: where the noise goes: 'before', 'after', or '' (no noise)
    """
    lines = read_raw_data(file1)
    noise_lines = read_raw_data(noise)
    total = max(len(lines), len(noise_lines))

    intent_map = read_map(map_data)
    name = os.path.basename(file1)[:-4]
    docs = []
    for _ in range(total):
        l1 = choice(lines)
        l2 = choice(noise_lines)
        l1end = line_end_remove(l1)
        # (the original also computed line_end_remove(l2) but never used it)

        # The entity span always covers the stripped l1 segment; only its
        # offset depends on whether noise is prepended.
        if pos == 'before':
            text = l2 + l1
            offset = len(l2)
        elif pos == 'after':
            text = l1 + l2
            offset = 0
        else:
            text = l1
            offset = 0

        doc = Document(text)
        doc.domain = domain
        doc.intent = intent_map[name] + ": " + name
        doc.entities = SpanSet([
            Span(start=offset, end=offset + len(l1end), entity=name),
        ])
        docs.append(doc)

    docs = list(set(docs))
    corpus = Corpus(docs)
    res_path = "./data/" + name + '-' + 'noise' + '_' + pos + ".conllx"
    corpus.write_to_file(res_path)
Пример #7
0
def one_to_conllx(map_data, file1, domain):
    """Turn every raw line of file1 into a fully-annotated Document and
    write the resulting corpus to ``./data/<name>.conllx``."""
    raw_lines = read_raw_data(file1)
    name = os.path.basename(file1)[:-4]
    intent_map = read_map(map_data)

    docs = []
    for raw_line in raw_lines:
        stripped = line_end_remove(raw_line)
        doc = Document(raw_line)
        doc.domain = domain
        doc.intent = intent_map[name] + ": " + name
        # the single entity span covers the whole line minus its line ending
        doc.entities = SpanSet([Span(start=0, end=len(stripped), entity=name)])
        docs.append(doc)

    Corpus(docs).write_to_file("./data/" + name + ".conllx")
Пример #8
0
    def _inference(self, model, input_data: list):
        """Tag the input samples with the model (progress shown via tqdm) and
        write the results to ``inference_out.conllx`` in the configured
        output directory for manual review."""
        results = []
        # one sample per request
        batches = MtModelInference_Deliverable.generate_batch_input(
            input_data, 1)
        for batch in tqdm.tqdm(batches):
            response = model.inference(Request(batch))
            sequence = response['data'][0].sequence
            sequence.label = response['cls'][0][0]
            results.append(sequence)

        Corpus(results).write_to_file(
            os.path.join(self.config['output_filepath'],
                         'inference_out.conllx'))

        print(
            '*** inference has been done, please check the result through the path below:'
        )
        print('==>{}'.format(self.config['output_filepath']))
Пример #9
0
def read_data(data_path, output_file):
    """Read every ``*.json`` jsonlines file under data_path and convert each
    record into an annotated Document; write the corpus to output_file.

    A record's ``marked`` entries with an empty ``record`` index list are
    skipped; otherwise the span runs from the first to one past the last
    listed index.
    """
    docs = []

    for data_file in Path(data_path).glob("*.json"):
        with jsonlines.open(str(data_file)) as reader:
            for record in reader:
                doc = Document(list(record["content"]))
                doc.sub_function = record["childFunction"]
                doc.domain = record["domain"]
                doc.function = record["function"]
                doc.intent = record["intent"]

                spans = []
                for marked in record["marked"]:
                    positions = marked["record"]
                    if not positions:
                        continue
                    spans.append(
                        Span(int(positions[0]),
                             int(positions[-1]) + 1,
                             marked["titleIndex"]))

                doc.entities = SpanSet(spans)
                docs.append(doc)

    Corpus(docs).write_to_file(output_file)
Пример #10
0
# Augmentation script: repeatedly concatenate two randomly chosen documents
# (shifting the second document's spans by the first document's length),
# de-duplicate, and write everything to ./data/data_all.conllx.
list1 = list(corpus)

len_all = len(list1)
doc_list = []
for _ in range(len_all):
    l1 = choice(list1)
    l2 = choice(list1)
    offset = len(l1.text)

    span_list = list(l1.span_set)
    for span in l2.span_set:
        # shift the second document's spans past the first document
        span_list.append(Span(start=offset + span.start,
                              end=offset + span.end,
                              entity=span.entity))

    doc1 = Document("".join(l1.text) + "".join(l2.text))
    doc1.entities = SpanSet(span_list)
    # domain of the first document wins for the combined text
    doc1.domain = l1.domain
    doc_list.append(doc1)

doc_list = list(set(doc_list))
corpus = Corpus(doc_list)
corpus.write_to_file('./data/data_all.conllx')
Пример #11
0
def two_add_before_link_after(map_data, file1, file2, before, link, after,
                              domain):
    """Generate two-intent documents from 16 concatenation patterns.

    For each of ``min(len(file1 lines), len(file2 lines))`` iterations, one
    random line is drawn from each input and combined as
    ``prefix + l1 + middle + l2 + suffix`` for all 16 pattern combinations of
    the original implementation. Each document carries two entity spans over
    the line-ending-stripped l1/l2 segments, a combined two-part intent, and
    the given domain. Results are de-duplicated and written to
    ``./data/<name1>-<name2>.conllx``.

    :param map_data: path of the intent-name mapping file
    :param file1: raw-line file for the first intent
    :param file2: raw-line file for the second intent
    :param before: file of prefix/noise lines
    :param link: file of connector lines
    :param after: file of suffix lines
    :param domain: domain label assigned to every generated document
    """
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    before_list = read_raw_data(before)
    link_list = read_raw_data(link)
    after_list = read_raw_data(after)
    len_all = min(len(list1), len(list2))

    name1 = os.path.basename(file1)[:-4]
    name2 = os.path.basename(file2)[:-4]
    dict_list = read_map(map_data)
    # every generated document shares the same combined intent string
    intent = (dict_list[name1] + ": " + name1 + "||" +
              dict_list[name2] + ": " + name2)

    def build_doc(prefix, l1, mid, l2, suffix, l1end, l2end):
        # One combined document: prefix + l1 + mid + l2 + suffix, with entity
        # spans over the stripped l1 and l2 segments. Span positions only
        # depend on segment lengths, so len(prefix + l1 + mid) matches the
        # original's equivalent expressions regardless of operand order.
        doc = Document(prefix + l1 + mid + l2 + suffix)
        doc.domain = domain
        doc.intent = intent
        doc.entities = SpanSet([
            Span(start=len(prefix), end=len(prefix + l1end), entity=name1),
            Span(start=len(prefix + l1 + mid),
                 end=len(prefix + l1 + mid + l2end),
                 entity=name2),
        ])
        return doc

    doc_list = []
    for _ in range(len_all):
        # same draw order as the original (matters for seeded randomness);
        # NOTE: the original rebound the `link`/`before`/`after` parameters
        # inside the loop — renamed here to avoid shadowing.
        l1 = choice(list1)
        l2 = choice(list2)
        link_line = choice(link_list)
        before_line = choice(before_list)
        before2_line = choice(before_list)
        after_line = choice(after_list)

        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)
        # (the original also stripped link/before/after but never used them)

        # (prefix, middle, suffix) for each of the original's 16 documents.
        combos = [
            ("", "", ""),                                   # f1 + f2
            ("", link_line, ""),                            # f1 + link + f2
            (before_line, "", ""),                          # before + f1 + f2
            (before_line, "", after_line),                  # before + f1 + f2 + after
            (before_line, link_line, ""),                   # before + f1 + link + f2
            (before_line, link_line, after_line),           # before + f1 + link + f2 + after
            ("", "", after_line),                           # f1 + f2 + after
            ("", link_line, after_line),                    # f1 + link + f2 + after
            ("", before_line, ""),                          # f1 + before + f2
            ("", before_line, after_line),                  # f1 + before + f2 + after
            ("", link_line + before_line, ""),              # f1 + link + before + f2
            ("", link_line + before_line, after_line),      # f1 + link + before + f2 + after
            (before_line, before2_line, ""),                # before + f1 + before2 + f2
            (before_line, before2_line, after_line),        # before + f1 + before2 + f2 + after
            (before_line, link_line + before2_line, ""),    # before + f1 + link + before2 + f2
            (before_line, link_line + before2_line, after_line),
        ]
        for prefix, mid, suffix in combos:
            doc_list.append(
                build_doc(prefix, l1, mid, l2, suffix, l1end, l2end))

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + name1 + '-' + name2 + ".conllx"
    corpus.write_to_file(res_path)
Пример #12
0
def one_before_after_add_noise(map_data, file1, before_noise, after_noise,
                               domain):
    """Generate single-intent documents in four noise layouts.

    For each iteration a random line is drawn from file1 and from the two
    noise files, then emitted in four layouts: bare, prefixed, wrapped
    (prefix and suffix), and suffixed. The entity span always covers the
    line-ending-stripped file1 segment. Results are de-duplicated and
    written to ``./data/<name>.conllx``.

    :param map_data: path of the intent-name mapping file
    :param file1: raw-line file whose lines carry the entity of interest
    :param before_noise: file of prefix noise lines
    :param after_noise: file of suffix noise lines
    :param domain: domain label assigned to every generated document
    """
    list1 = read_raw_data(file1)
    before_list = read_raw_data(before_noise)
    after_list = read_raw_data(after_noise)
    len_all = max(len(list1), len(before_list), len(after_list))

    dict_list = read_map(map_data)
    name = os.path.basename(file1)[:-4]
    intent = dict_list[name] + ": " + name

    doc_list = []
    for _ in range(len_all):
        l1 = choice(list1)
        before = choice(before_list)
        after = choice(after_list)
        l1end = line_end_remove(l1)
        # (the original also stripped before/after but never used the results)

        # four layouts, matching the original's emission order
        layouts = (("", ""), (before, ""), (before, after), ("", after))
        for prefix, suffix in layouts:
            doc = Document(prefix + l1 + suffix)
            doc.domain = domain
            doc.intent = intent
            doc.entities = SpanSet([
                Span(start=len(prefix), end=len(prefix + l1end), entity=name),
            ])
            doc_list.append(doc)

    doc_list = list(set(doc_list))
    corpus = Corpus(doc_list)
    res_path = "./data/" + name + ".conllx"
    corpus.write_to_file(res_path)
Пример #13
0
from tokenizer_tools.tagset.offset.corpus import Corpus
from tokenizer_tools.conllz.writer import write_conllx


# Filter the corpus down to a single domain and write it to its own file.
corpus = Corpus.read_from_file("./data/all_data.conllx")

ll = [doc for doc in corpus if doc.domain == "车身控制"]

cor_ll = Corpus(ll)
cor_ll.write_to_file('./data/车身控制.conllx')
Пример #14
0
from tokenizer_tools.tagset.offset.corpus import Corpus, Document

corpus = Corpus.read_from_file("./data/all_data.conllx")

# Extract the documents whose entity set contains the target entity type.
extract_list = []

for doc in corpus:
    # BUGFIX: the original appended a copy of the document once per matching
    # entity, producing duplicates whenever a document contained the target
    # entity more than once; each matching document is now emitted exactly once.
    if any(span.entity == '百分比值' for span in doc.entities):
        extract_doc = Document(doc.text)
        extract_doc.entities = doc.entities
        extract_doc.id = doc.id
        extract_doc.domain = doc.domain
        extract_doc.intent = doc.intent
        extract_list.append(extract_doc)

extract_document = Corpus(extract_list)
res_file = 'data/百分比值.conllx'
extract_document.write_to_file(res_file)