def main(model_dir, gold_corpus_file, predicted_corpus_file, install_dependencies=True):
    """Run a trained model over a gold corpus and write its predictions to file.

    Documents whose inference failed are flagged through ``extra_attr`` and
    appended after the successful ones; the failed/succeeded counts are
    printed at the end as a quick sanity check.
    """
    gold_corpus = Corpus.read_from_file(gold_corpus_file)
    dm_model = dm.load(model_dir, install_dependencies=install_dependencies)

    ok_docs = []
    failed_docs = []
    for gold_doc in gold_corpus:
        response = dm_model.inference(dm.make_request(query=[gold_doc.text]))
        result = response.data[0]
        predicted = result.sequence
        predicted.id = gold_doc.id
        if result.is_failed:
            # Keep the failure reason on the document for later inspection.
            predicted.extra_attr["is_failed"] = True
            predicted.extra_attr["exec_msg"] = result.exec_msg
            failed_docs.append(predicted)
        else:
            ok_docs.append(predicted)

    Corpus(ok_docs + failed_docs).write_to_file(predicted_corpus_file)
    print(len(failed_docs), len(ok_docs))
def main(
    corpus_file: str = None,
    datadir: str = None,
    test_size: Union[int, float] = 0.1,
    train_corpus: str = None,
    test_corpus: str = None,
):
    """Prepare train/test corpora plus an entity inventory inside *datadir*.

    Two modes: split a single *corpus_file* into train/test, or accept a
    pre-split *train_corpus* / *test_corpus* pair and copy the files as-is.
    In both modes the distinct entity labels are written to ``entity.txt``.
    """
    result_dir = Path(datadir)

    if corpus_file:
        # Single input corpus: perform the split ourselves.
        corpus = Corpus.read_from_file(corpus_file)
        train, test = corpus.train_test_split(test_size=test_size)
        train.write_to_file(result_dir / "train.conllx")
        test.write_to_file(result_dir / "test.conllx")
    else:
        # Pre-split input: merge only for statistics, copy files verbatim.
        train = Corpus.read_from_file(train_corpus)
        test = Corpus.read_from_file(test_corpus)
        corpus = Corpus(list(train) + list(test))
        shutil.copy(train_corpus, datadir)
        shutil.copy(test_corpus, datadir)

    # Every entity label seen anywhere in the merged data, one per line.
    entities = {span.entity for doc in corpus for span in doc.span_set}
    with open(result_dir / "entity.txt", "wt") as fd:
        fd.write("\n".join(entities))
def _inference(self, model, input_data: list):
    """Run entity-recognition inference over *input_data* and persist results.

    The annotated sequences are written to
    ``<output_filepath>/inference_out.conllx`` so they can be reviewed
    manually afterwards.

    :param model: model object exposing an ``inference(request)`` API
    :param input_data: corpus data to annotate
    :return: None (results are written to disk)
    """
    batch_size = 1
    batches = MtModelInference_Deliverable.generate_batch_input(
        input_data, batch_size)

    annotated = []
    for batch in batches:
        response = model.inference(Request(batch))
        sequence = response['data'][0].sequence
        # Attach the classification label predicted for this batch.
        sequence.label = response['cls'][0][0]
        annotated.append(sequence)

    Corpus(annotated).write_to_file(
        os.path.join(self.config['output_filepath'], 'inference_out.conllx'))
    print(
        '*** inference has been done, please check the result through the path below:'
    )
    print('==>{}'.format(self.config['output_filepath']))
def create_new_corpus(data_dict, corpus_vol, **kwargs):
    """Synthesize a corpus by concatenating random samples from distinct intents.

    Each generated document joins the texts of ``sem_nums`` distinct intents;
    every non-noise segment is annotated with a ``Span`` whose entity is the
    intent name, and the document label joins all chosen intents with ``|``.

    :param data_dict: mapping of intent name -> collection of sample texts
    :param corpus_vol: number of documents to generate
    :param kwargs: must contain ``sem_nums`` — distinct intents per document
    :return: the generated ``Corpus``, or ``None`` when *corpus_vol* is falsy
        or *sem_nums* exceeds the number of available intents (None is kept
        for backward compatibility with existing callers)
    """
    sem_nums = kwargs['sem_nums']
    # Hoist list conversions out of the generation loop: the original rebuilt
    # list(intents) / list(data_dict[intent]) on every random draw.
    samples_by_intent = {k: list(v) for k, v in data_dict.items()}
    intents = list(samples_by_intent)

    if not corpus_vol:
        return None
    if sem_nums > len(intents):
        return None

    new_corpus = Corpus([])
    for _ in range(corpus_vol):
        # Uniform sampling without replacement — equivalent distribution to
        # the original add-to-set-until-full rejection loop, but O(sem_nums).
        intent_sam = random.sample(intents, sem_nums)
        spanset = SpanSet()
        sentences = []
        start_position = 0
        for intent in intent_sam:
            txt = random.choice(samples_by_intent[intent])
            sentences.append(txt)
            if intent != 'noise':
                # Only real intents get an entity span; noise stays plain text.
                spanset.append(
                    Span(start=start_position,
                         end=start_position + len(txt),
                         entity=intent))
            start_position += len(txt)
        doc = Document(text=''.join(sentences),
                       label='|'.join(intent_sam),
                       span_set=spanset)
        new_corpus.append(doc)
    return new_corpus
def test_union(datadir):
    """Union of the two fixture corpora must contain 4 documents."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    assert len(first.union(second)) == 4
def test_symmetric_difference(datadir):
    """Symmetric difference of the fixture corpora must contain 2 documents."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    assert len(first.symmetric_difference(second)) == 2
def test_set_document_compare_way(datadir):
    """Corpora differing only in annotations become equal under TEXT_ONLY."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")

    # Full comparison (default) sees the annotation differences.
    assert first != second

    # Text-only comparison ignores them.
    first.set_document_compare_way(DocumentCompareWays.TEXT_ONLY)
    second.set_document_compare_way(DocumentCompareWays.TEXT_ONLY)
    assert first == second
def test_contains__(datadir, tmpdir):
    """Membership test must find appended docs and reject unknown ones."""
    corpus = Corpus()
    corpus.append(seq_one)
    corpus.append(seq_two)
    assert seq_one in corpus

    # An empty document was never added, so it must not be contained.
    stray_doc = Document("")
    assert stray_doc not in corpus
def test_corpus_diff(datadir):
    """CorpusDiff must render the differing document pair as markdown."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")

    diff_result = CorpusDiff(first, second).compare()
    rendered = diff_result.render_to_md()

    expected = """# 3 - <D: None, F: None, S: None, I: None> [王 小 明](PERSON) 在 [台 北 新 竹](GPE) 的 [清 华 大 学](ORG) 读 书 。 - <D: None, F: None, S: None, I: None> [王 小 明](PERSON) 在 [台 北 新 竹](CITY) 的 [清 华 大 学](ORG) 读 书 。"""
    assert rendered == expected
def two_add_link(map_data, file1, file2, link, domain):
    """Generate linked two-intent documents (A+link+B and B+link+A) and save them.

    For each iteration a random line is drawn from each input file plus a
    random linking phrase; both concatenation orders are emitted as Documents
    with entity spans marking the two intent segments.
    """
    list1 = read_raw_data(file1)
    list2 = read_raw_data(file2)
    link_list = read_raw_data(link)
    dict_list = read_map(map_data)

    path1 = os.path.basename(file1)
    path2 = os.path.basename(file2)
    stem1 = path1[:-4]  # file name without its 4-character extension
    stem2 = path2[:-4]
    # Both directions carry the same composite intent label, so build it once.
    composite_intent = dict_list[stem1] + ": " + stem1 + "||" + dict_list[
        stem2] + ": " + stem2

    doc_list = []
    # Iterate as many times as the larger input file has lines.
    for _ in range(0, max(len(list1), len(list2))):
        l1 = choice(list1)
        l2 = choice(list2)
        l3 = choice(link_list)
        l1end = line_end_remove(l1)
        l2end = line_end_remove(l2)

        # Direction: file1 segment + link + file2 segment.
        doc1 = Document(l1 + l3 + l2)
        doc1.domain = domain
        doc1.intent = composite_intent
        doc1.entities = SpanSet([
            Span(start=0, end=len(l1end), entity=stem1),
            Span(start=len(l1 + l3), end=len(l1 + l3 + l2end), entity=stem2),
        ])
        doc_list.append(doc1)

        # Direction: file2 segment + link + file1 segment.
        doc2 = Document(l2 + l3 + l1)
        doc2.domain = domain
        doc2.intent = composite_intent
        doc2.entities = SpanSet([
            Span(start=0, end=len(l2end), entity=stem2),
            Span(start=len(l2 + l3), end=len(l2 + l3 + l1end), entity=stem1),
        ])
        doc_list.append(doc2)

    # Deduplicate before writing.
    corpus = Corpus(list(set(doc_list)))
    res_path = "./data/" + stem1 + '-' + stem2 + '-' + 'link' + ".conllx"
    corpus.write_to_file(res_path)
def main(gold: str, pred: str) -> dict:
    """Compute entity-level evaluation metrics between gold and predicted corpora.

    :param gold: path to the gold-annotation corpus file
    :param pred: path to the predicted-annotation corpus file
    :return: dict of metric name -> value taken from ``CorpusMetric``
    """
    cm = CorpusMetric.create_from_corpus(
        Corpus.read_from_file(gold), Corpus.read_from_file(pred))
    metric_names = [
        "entity_f1_score",
        "entity_accuracy_score",
        "entity_precision_score",
        "entity_recall_score",
        "entity_classification_report",
        "doc_entity_correctness",
    ]
    return {name: getattr(cm, name) for name in metric_names}
def test_intersection(datadir):
    """Intersection must narrow down as more corpora are intersected."""
    corpus = Corpus.read_from_file(datadir / "self.conllx")
    other_corpus = Corpus.read_from_file(datadir / "other.conllx")

    # Two-way intersection keeps the 2 shared documents.
    result = corpus.intersection(other_corpus)
    assert isinstance(result, Corpus)
    assert len(result) == 2

    # Adding a third corpus narrows it to 1 shared document.
    second_corpus = Corpus.read_from_file(datadir / "second_other.conllx")
    result = corpus.intersection(other_corpus, second_corpus)
    assert isinstance(result, Corpus)
    assert len(result) == 1
def group_by_domain(input_file, output_dir):
    """Split a corpus into one ``<domain>.conllx`` file per document domain."""
    out_path = Path(output_dir)
    buckets = collections.defaultdict(list)
    for doc in Corpus.read_from_file(input_file):
        buckets[doc.domain].append(doc)
    for domain, doc_list in buckets.items():
        Corpus(doc_list).write_to_file(out_path / "{}.conllx".format(domain))
def test_getitem__(datadir, tmpdir):
    """Indexing must support both single positions and position lists."""
    corpus = Corpus()
    corpus.append(seq_one)
    corpus.append(seq_two)

    # Single-element access returns the document itself.
    assert corpus[0] == seq_one

    # List-of-indices access returns an equal corpus.
    assert corpus[[0, 1]] == corpus
def read_raw_data(filepath):
    """Map each document label to the set of distinct texts carrying that label.

    :param filepath: corpus file readable by ``Corpus.read_from_file``
    :return: defaultdict of label -> set of joined text strings
    """
    label_texts = defaultdict(set)
    for doc in Corpus.read_from_file(filepath):
        label_texts[doc.label].add(''.join(doc.text))
    return label_texts
def __call__(self):
    """Evaluate the model on the configured test corpus and persist the result."""
    test_file = os.path.join(self.config['data_filepath'],
                             self.config['data_filename'])
    corpus_test = Corpus.read_from_file(test_file)
    scores, differ_corpus_tuples = self._evaluation(corpus_test)
    self.save_result(scores, differ_corpus_tuples)
    print('Evaluation has been done.')
def test_express_pattern(datadir):
    """ExpressPattern must group documents under their slotted templates."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    computed = ExpressPattern(corpus).compute()

    # The template keys replace entity mentions with <TYPE> placeholders.
    expected_keys = ["<PERSON> 在 <GPE> 的 <ORG> 读 书 。", "来 一 首 <歌手名> 的 歌 。"]
    actual_keys = [str(k) for k in computed.keys()]
    for actual, wanted in zip(actual_keys, expected_keys):
        assert wanted in actual

    # Each key maps to the concrete annotated sentences it covers.
    expected_value = [
        [
            "[王 小 明](PERSON) 在 [北 京](GPE) 的 [清 华 大 学](ORG) 读 书 。",
            "[王 小 明](PERSON) 在 [台 北 新 竹](GPE) 的 [清 华 大 学](ORG) 读 书 。",
        ],
        ["来 一 首 [蓝 泽 雨](歌手名) 的 歌 。"],
    ]
    for i, group in enumerate(computed.values()):
        for j, element in enumerate(group):
            assert expected_value[i][j] in str(element)
def to_conllx(file_prefix):
    """Convert a raw data file into CoNLL-X format, skipping malformed lines.

    Reads ``./data/raw/<file_prefix>`` line by line, converts each non-empty
    line via ``process_one_line`` and writes the resulting sequences to
    ``./data/domain/<base_name>.conllx``.  Lines failing validation raise
    ``CheckFailedError`` and are skipped; ``process_one_line`` reports them
    through the logger file ``./data/error/<base_name>.error``.
    """
    base_name, _ = os.path.splitext(file_prefix)
    log_file = './data/error/{}.error'.format(base_name)
    with open('./data/raw/{}'.format(file_prefix)) as fd, open(log_file,
                                                               'wt') as logger:
        seq_list = []
        for raw_line in fd:
            line = raw_line.strip()
            if not line:
                continue
            try:
                # Only the sequence is needed; the sentence text fed a
                # now-removed debug dump.
                seq, _sentence = process_one_line(line, logger)
            except CheckFailedError:
                # Already reported via the logger; just skip the line.
                continue
            else:
                seq_list.append(seq)
    output_file = './data/domain/{}.conllx'.format(base_name)
    Corpus(seq_list).write_to_file(output_file)
def render(doc_pattern, dictionary: Dict[str, List[str]], max_instances: int = 200):
    """Render document patterns into concrete documents using a slot dictionary.

    For each pattern the Cartesian product of candidate values for its
    placeholders is built; when that product exceeds *max_instances*
    combinations, a uniform random subset of that size is used so a single
    pattern cannot dominate the generated corpus.

    :param doc_pattern: iterable of patterns exposing ``get_placeholders()``
        and ``render(mapping)``
    :param dictionary: placeholder entity name -> candidate surface strings
    :param max_instances: cap on rendered instances per pattern; default 200
        matches the previously hard-coded limit, so existing callers are
        unaffected
    :return: ``Corpus`` of all rendered documents
    """
    doc_list = []
    for pattern in doc_pattern:
        placeholder_names = [p.entity for p in pattern.get_placeholders()]
        candidates = {name: dictionary[name] for name in placeholder_names}
        combos = list(itertools.product(*candidates.values()))
        # Downsample oversized products to keep corpus growth bounded.
        if len(combos) > max_instances:
            combos = sample(combos, max_instances)
        for combo in combos:
            instance_mapping = dict(zip(candidates.keys(), combo))
            doc_list.append(pattern.render(instance_mapping))
    return Corpus(doc_list)
def test_eq__(datadir):
    """Statistics computed from the fixture corpus must equal a hand-built one."""
    corpus = Corpus.read_from_file(datadir / "data.conllx")
    computed = CorpusStatistics.create_from_corpus(corpus)

    # Hand-built counterpart with every counter spelled out explicitly.
    expected = CorpusStatistics(
        domain=Counter({"domain_one": 2, "domain_two": 2}),
        function=Counter({"function_one": 2, "function_two": 2}),
        sub_function=Counter({"sub_function_one": 2, "sub_function_two": 2}),
        intent=Counter({"intent_one": 2, "intent_two": 2}),
        entity_types={
            "PERSON": Counter({("王", "小", "明"): 2}),
            "GPE": Counter({("北", "京"): 2}),
            "ORG": Counter({("清", "华", "大", "学"): 2}),
            "歌手名": Counter({("蓝", "泽", "雨"): 2}),
        },
        entity_values={
            ("王", "小", "明"): Counter({"PERSON": 2}),
            ("北", "京"): Counter({"GPE": 2}),
            ("清", "华", "大", "学"): Counter({"ORG": 2}),
            ("蓝", "泽", "雨"): Counter({"歌手名": 2}),
        },
    )
    assert computed == expected
def extract_part_seq(data, all_seq, part_seq):
    """Write all sequence patterns, then extract those whose text contains '关'.

    :param data: input corpus file
    :param all_seq: output path for the full pattern set
    :param part_seq: output path for the '关'-containing subset
    :return: the ``CorpusPattern`` holding the extracted subset
    """
    corpus = Corpus.read_from_file(data)
    doc_pattern = corpus.generate_pattern()
    doc_pattern.write_to_file(all_seq)

    selected = []
    for doc in doc_pattern:
        for ch in doc.text:
            if ch == '关':
                # NOTE(review): one copy is appended per matching character;
                # the set() below is what deduplicates them — presumably
                # DocumentPattern hashes by content. Confirm before changing.
                copied = DocumentPattern(doc.text)
                copied.entities = doc.entities
                copied.domain = doc.domain
                copied.id = doc.id
                selected.append(copied)

    result = CorpusPattern(set(selected))
    print(result)
    result.write_to_file(part_seq)
    return result
def test_render(datadir):
    """Rendering a corpus pattern must produce every slot combination."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    corpus_pattern = CorpusPattern.create_from_corpus(corpus)

    dictionary = {
        "PERSON": ["小王", "小李"],
        "GPE": ["北京"],
        "ORG": ["师范大学", "专科学校"],
        "歌手名": ["周杰伦", "孙燕姿"]
    }
    generated_corpus = corpus_pattern.render(dictionary)

    # All cross-product renderings, sorted for a stable comparison order.
    expected = sorted([
        "[小 王](PERSON) 在 [北 京](GPE) 的 [师 范 大 学](ORG) 读 书 。",
        "[小 王](PERSON) 在 [北 京](GPE) 的 [专 科 学 校](ORG) 读 书 。",
        "[小 李](PERSON) 在 [北 京](GPE) 的 [师 范 大 学](ORG) 读 书 。",
        "[小 李](PERSON) 在 [北 京](GPE) 的 [专 科 学 校](ORG) 读 书 。",
        "来 一 首 [周 杰 伦](歌手名) 的 歌 。", "来 一 首 [孙 燕 姿](歌手名) 的 歌 。"
    ])
    result = sorted([str(doc) for doc in generated_corpus])
    for wanted, actual in zip(expected, result):
        assert wanted in actual
def test_attr_access(datadir):
    """Documents must expose domain/function/intent/sub_function attributes."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    first_doc = corpus[0]

    assert first_doc.domain == "domain"
    assert first_doc.function == "function"
    assert first_doc.intent == "intent"
    assert first_doc.sub_function == "sub_function"
def test_fuzzy_search(datadir):
    """Fuzzy search must rank the matching fixture document first."""
    corpus = Corpus.read_from_file(datadir / "output.conllx")
    hits = corpus.fuzzy_search("北京 读书", limit=1)
    # Each hit is a (document, score)-style tuple; check the top document.
    assert hits[0][0] == seq_one
def test_difference(datadir):
    """Difference must keep only the document unique to corpus_one."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")

    result = first.difference(second)

    unique_doc = Document(
        "王小明在台北新竹的清华大学读书。",
        span_set=SpanSet(
            [Span(0, 3, "PERSON"), Span(4, 8, "GPE"), Span(9, 13, "ORG")]),
        id="3",
    )
    assert result == Corpus([unique_doc])
def test_remove_duplicate(datadir):
    """Deduplication must halve the 4-document fixture to 2 unique docs."""
    corpus = Corpus.read_from_file(datadir / "duplicate.conllx")
    assert len(corpus) == 4

    deduped = corpus.remove_duplicate()
    assert isinstance(deduped, Corpus)
    assert len(deduped) == 2
def generate_constraint_to_file(input_file: str,
                                output_file: str,
                                output_attr: str = "label"):
    """Generate a constraint mapping from a corpus and dump it as JSON.

    :param input_file: corpus file to read
    :param output_file: destination JSON path
    :param output_attr: document attribute used as the constraint target
    """
    corpus = Corpus.read_from_file(input_file)
    domain_mapping = generate_constraint(corpus, output_attr)
    with open(output_file, "wt") as fd:
        # Human-readable, CJK-friendly output.
        json.dump(domain_mapping, fd, indent=4, ensure_ascii=False)
def test_collect_domain(datadir):
    """Domain statistics must count 2 documents per fixture domain."""
    corpus = Corpus.read_from_file(datadir / "data.conllx")
    stats = CorpusStatistics.create_from_corpus(corpus)
    assert stats.domain == Counter({"domain_one": 2, "domain_two": 2})
def test_collect_intent(datadir):
    """Intent statistics must count 2 documents per fixture intent."""
    corpus = Corpus.read_from_file(datadir / "data.conllx")
    stats = CorpusStatistics.create_from_corpus(corpus)
    assert stats.intent == Counter({'intent_one': 2, 'intent_two': 2})
def test_collect_sub_function(datadir):
    """Sub-function statistics must count 2 documents per fixture value."""
    corpus = Corpus.read_from_file(datadir / "data.conllx")
    stats = CorpusStatistics.create_from_corpus(corpus)
    assert stats.sub_function == Counter({
        'sub_function_one': 2,
        'sub_function_two': 2
    })