def main(
    corpus_file: Union[str, None] = None,
    datadir: Union[str, None] = None,
    test_size: Union[int, float] = 0.1,
    train_corpus: Union[str, None] = None,
    test_corpus: Union[str, None] = None,
):
    """Prepare train/test corpora under *datadir* and dump the entity set.

    Either pass ``corpus_file`` (split here into train/test and written as
    ``train.conllx``/``test.conllx``) or pass pre-split
    ``train_corpus``/``test_corpus`` files (copied verbatim into *datadir*).

    Args:
        corpus_file: single corpus to split; takes priority if truthy.
        datadir: output directory; must not be None (``Path(None)`` raises).
        test_size: forwarded to ``Corpus.train_test_split``.
        train_corpus: pre-split training corpus path (fallback branch).
        test_corpus: pre-split test corpus path (fallback branch).
    """
    result_dir = Path(datadir)
    if corpus_file:
        corpus = Corpus.read_from_file(corpus_file)
        train, test = corpus.train_test_split(test_size=test_size)
        train.write_to_file(result_dir / "train.conllx")
        test.write_to_file(result_dir / "test.conllx")
    else:
        train = Corpus.read_from_file(train_corpus)
        test = Corpus.read_from_file(test_corpus)
        corpus = Corpus(list(train) + list(test))
        shutil.copy(train_corpus, datadir)
        shutil.copy(test_corpus, datadir)
    # Sort for a deterministic entity.txt: set iteration order varies per run,
    # which made the output file unstable across invocations.
    entities = sorted({span.entity for doc in corpus for span in doc.span_set})
    with open(result_dir / "entity.txt", "wt") as fd:
        fd.write("\n".join(entities))
def test_union(datadir):
    """Union of the two fixture corpora contains four distinct documents."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    merged = first.union(second)
    assert len(merged) == 4
def test_symmetric_difference(datadir):
    """Exactly two documents belong to one fixture corpus but not both."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    only_in_one = first.symmetric_difference(second)
    assert len(only_in_one) == 2
def test_set_document_compare_way(datadir):
    """Switching both corpora to text-only comparison makes them equal."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    # Under the default comparison the differing annotations matter.
    assert first != second
    for corpus in (first, second):
        corpus.set_document_compare_way(DocumentCompareWays.TEXT_ONLY)
    assert first == second
def test_corpus_diff(datadir):
    # Rendering the diff of the two corpora as markdown lists both versions
    # of the one differing document (GPE vs CITY annotation).
    corpus_one = Corpus.read_from_file(datadir / "corpus_one.conllx")
    corpus_two = Corpus.read_from_file(datadir / "corpus_two.conllx")
    corpus_diff = CorpusDiff(corpus_one, corpus_two)
    corpus_diff_result = corpus_diff.compare()
    result = corpus_diff_result.render_to_md()
    # NOTE(review): the newlines inside this expected literal were lost in the
    # collapsed source layout; reconstructed at the markdown bullet
    # boundaries ("# 3" heading, then one "- " line per version) — verify.
    expected = """# 3
- <D: None, F: None, S: None, I: None> [王 小 明](PERSON) 在 [台 北 新 竹](GPE) 的 [清 华 大 学](ORG) 读 书 。
- <D: None, F: None, S: None, I: None> [王 小 明](PERSON) 在 [台 北 新 竹](CITY) 的 [清 华 大 学](ORG) 读 书 。"""
    assert result == expected
def test_intersection(datadir):
    """Intersection narrows as more corpora participate."""
    base = Corpus.read_from_file(datadir / "self.conllx")
    other = Corpus.read_from_file(datadir / "other.conllx")
    pairwise = base.intersection(other)
    assert isinstance(pairwise, Corpus)
    assert len(pairwise) == 2
    second_other = Corpus.read_from_file(datadir / "second_other.conllx")
    three_way = base.intersection(other, second_other)
    assert isinstance(three_way, Corpus)
    assert len(three_way) == 1
def main(gold: str, pred: str) -> dict:
    """Score *pred* against *gold* and return entity-level metrics.

    Returns:
        dict mapping metric name to the corresponding ``CorpusMetric``
        attribute value.
    """
    gold_corpus = Corpus.read_from_file(gold)
    pred_corpus = Corpus.read_from_file(pred)
    metric = CorpusMetric.create_from_corpus(gold_corpus, pred_corpus)
    metric_names = (
        "entity_f1_score",
        "entity_accuracy_score",
        "entity_precision_score",
        "entity_recall_score",
        "entity_classification_report",
        "doc_entity_correctness",
    )
    return {name: getattr(metric, name) for name in metric_names}
def test_express_pattern(datadir):
    """Each extracted pattern maps to the documents it came from."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    express_pattern = ExpressPattern(corpus)
    result = express_pattern.compute()
    result_keys = [str(i) for i in result.keys()]
    expected_keys = ["<PERSON> 在 <GPE> 的 <ORG> 读 书 。", "来 一 首 <歌手名> 的 歌 。"]
    # Guard against vacuous passes: zip() truncates silently, so an empty
    # or short result would previously have skipped every assertion below.
    assert len(result_keys) == len(expected_keys)
    for r, e in zip(result_keys, expected_keys):
        assert e in r
    result_value = result.values()
    expected_value = [
        [
            "[王 小 明](PERSON) 在 [北 京](GPE) 的 [清 华 大 学](ORG) 读 书 。",
            "[王 小 明](PERSON) 在 [台 北 新 竹](GPE) 的 [清 华 大 学](ORG) 读 书 。",
        ],
        ["来 一 首 [蓝 泽 雨](歌手名) 的 歌 。"],
    ]
    for i, value in enumerate(result_value):
        # Each document list must be fully accounted for, not just a prefix.
        assert len(value) == len(expected_value[i])
        for j, element in enumerate(value):
            assert expected_value[i][j] in str(element)
def test_eq__(datadir):
    """Statistics computed from the corpus equal a hand-written instance."""
    corpus = Corpus.read_from_file(datadir / "data.conllx")
    computed = CorpusStatistics.create_from_corpus(corpus)
    person = ("王", "小", "明")
    gpe = ("北", "京")
    org = ("清", "华", "大", "学")
    singer = ("蓝", "泽", "雨")
    handmade = CorpusStatistics(
        domain=Counter({"domain_one": 2, "domain_two": 2}),
        function=Counter({"function_one": 2, "function_two": 2}),
        sub_function=Counter({"sub_function_one": 2, "sub_function_two": 2}),
        intent=Counter({"intent_one": 2, "intent_two": 2}),
        entity_types={
            "PERSON": Counter({person: 2}),
            "GPE": Counter({gpe: 2}),
            "ORG": Counter({org: 2}),
            "歌手名": Counter({singer: 2}),
        },
        entity_values={
            person: Counter({"PERSON": 2}),
            gpe: Counter({"GPE": 2}),
            org: Counter({"ORG": 2}),
            singer: Counter({"歌手名": 2}),
        },
    )
    assert computed == handmade
def read_raw_data(filepath):
    """Map each label in *filepath* to the set of texts carrying it."""
    label_to_texts = defaultdict(set)
    for doc in Corpus.read_from_file(filepath):
        label_to_texts[doc.label].add(''.join(doc.text))
    return label_to_texts
def main(model_dir, gold_corpus_file, predicted_corpus_file, install_dependencies=True):
    """Run the DM model over every gold document and save the predictions.

    Failed inferences are tagged via ``extra_attr`` and written after the
    successful documents; counts of both are printed at the end.
    """
    gold_corpus = Corpus.read_from_file(gold_corpus_file)
    dm_model = dm.load(model_dir, install_dependencies=install_dependencies)
    succeeded = []
    failed = []
    for gold_doc in gold_corpus:
        request = dm.make_request(query=[gold_doc.text])
        response = dm_model.inference(request)
        result = response.data[0]
        doc = result.sequence
        # Carry the gold id over so predictions stay aligned with the gold.
        doc.id = gold_doc.id
        if result.is_failed:
            doc.extra_attr["is_failed"] = True
            doc.extra_attr["exec_msg"] = result.exec_msg
            failed.append(doc)
        else:
            succeeded.append(doc)
    Corpus(succeeded + failed).write_to_file(predicted_corpus_file)
    print(len(failed), len(succeeded))
def test_render(datadir):
    """Rendering the patterns with a dictionary yields every combination."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    corpus_pattern = CorpusPattern.create_from_corpus(corpus)
    dictionary = {
        "PERSON": ["小王", "小李"],
        "GPE": ["北京"],
        "ORG": ["师范大学", "专科学校"],
        "歌手名": ["周杰伦", "孙燕姿"]
    }
    generated_corpus = corpus_pattern.render(dictionary)
    expected = sorted([
        "[小 王](PERSON) 在 [北 京](GPE) 的 [师 范 大 学](ORG) 读 书 。",
        "[小 王](PERSON) 在 [北 京](GPE) 的 [专 科 学 校](ORG) 读 书 。",
        "[小 李](PERSON) 在 [北 京](GPE) 的 [师 范 大 学](ORG) 读 书 。",
        "[小 李](PERSON) 在 [北 京](GPE) 的 [专 科 学 校](ORG) 读 书 。",
        "来 一 首 [周 杰 伦](歌手名) 的 歌 。",
        "来 一 首 [孙 燕 姿](歌手名) 的 歌 。"
    ])
    result = sorted([str(i) for i in generated_corpus])
    # Guard against vacuous passes: zip() truncates silently, so an empty
    # render result would previously have passed the loop below.
    assert len(result) == len(expected)
    for e, r in zip(expected, result):
        assert e in r
def extract_part_seq(data, all_seq, part_seq):
    """Extract sequence patterns from *data*.

    Writes every pattern to *all_seq*; writes the subset whose pattern text
    contains '关' to *part_seq* and returns that subset.

    Args:
        data: input corpus file path.
        all_seq: output path for all generated patterns.
        part_seq: output path for the filtered patterns.

    Returns:
        CorpusPattern holding the filtered, deduplicated patterns.
    """
    corpus = Corpus.read_from_file(data)
    all_patterns = corpus.generate_pattern()
    all_patterns.write_to_file(all_seq)
    selected = []
    for doc in all_patterns:
        # Membership test instead of the original per-character loop, which
        # appended one copy of the document per '关' occurrence and relied on
        # the later set() (and correct hashing) to remove the duplicates.
        if '关' in doc.text:
            sub_doc = DocumentPattern(doc.text)
            sub_doc.entities = doc.entities
            sub_doc.domain = doc.domain
            sub_doc.id = doc.id
            selected.append(sub_doc)
    part_pattern = CorpusPattern(set(selected))
    print(part_pattern)
    part_pattern.write_to_file(part_seq)
    return part_pattern
def __call__(self):
    """Evaluate the configured test corpus and persist the results."""
    test_path = os.path.join(self.config['data_filepath'],
                             self.config['data_filename'])
    corpus_test = Corpus.read_from_file(test_path)
    scores, differ_corpus_tuples = self._evaluation(corpus_test)
    self.save_result(scores, differ_corpus_tuples)
    print('Evaluation has been done.')
def test_fuzzy_search(datadir):
    """The top fuzzy match for the query is the expected sequence."""
    corpus = Corpus.read_from_file(datadir / "output.conllx")
    matches = corpus.fuzzy_search("北京 读书", limit=1)
    top_match = matches[0][0]
    assert top_match == seq_one
def test_attr_access(datadir):
    """Document attributes expose the values stored in the fixture file."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    first_doc = corpus[0]
    # In the fixture each attribute's value equals its own name.
    for attr in ("domain", "function", "intent", "sub_function"):
        assert getattr(first_doc, attr) == attr
def test_difference(datadir):
    """Only the re-annotated document remains after subtracting corpus two."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    spans = [Span(0, 3, "PERSON"), Span(4, 8, "GPE"), Span(9, 13, "ORG")]
    leftover = Document(
        "王小明在台北新竹的清华大学读书。",
        span_set=SpanSet(spans),
        id="3",
    )
    assert first.difference(second) == Corpus([leftover])
def test_remove_duplicate(datadir):
    """Deduplication halves the four-document fixture corpus."""
    corpus = Corpus.read_from_file(datadir / "duplicate.conllx")
    assert len(corpus) == 4
    deduplicated = corpus.remove_duplicate()
    assert isinstance(deduplicated, Corpus)
    assert len(deduplicated) == 2
def generate_constraint_to_file(input_file: str, output_file: str,
                                output_attr: str = "label"):
    """Derive the constraint mapping from *input_file* and dump it to
    *output_file* as indented, non-ASCII-preserving JSON."""
    corpus = Corpus.read_from_file(input_file)
    mapping = generate_constraint(corpus, output_attr)
    with open(output_file, "wt") as fd:
        json.dump(mapping, fd, indent=4, ensure_ascii=False)
def test_set_document_compare_function_and_set_document_hash_function(datadir):
    """Injecting text-only compare/hash methods makes the corpora equal."""
    first = Corpus.read_from_file(datadir / "corpus_one.conllx")
    second = Corpus.read_from_file(datadir / "corpus_two.conllx")
    # Under the default comparison the annotation difference matters.
    assert first != second

    def compare_by_text(self, other):
        # Equality considers the text only, ignoring annotations.
        return self.text == other.text

    def hash_by_text(self):
        # Hash must be consistent with the text-only equality above.
        return hash(frozenset(self.text))

    for corpus in (first, second):
        corpus.set_document_compare_method(compare_by_text)
        corpus.set_document_hash_method(hash_by_text)
    assert first == second
def test_collect_intent(datadir):
    """Intent labels are tallied across the whole corpus."""
    stats = CorpusStatistics.create_from_corpus(
        Corpus.read_from_file(datadir / "data.conllx"))
    assert stats.intent == Counter({'intent_one': 2, 'intent_two': 2})
def test_collect_sub_function(datadir):
    """Sub-function labels are tallied across the whole corpus."""
    stats = CorpusStatistics.create_from_corpus(
        Corpus.read_from_file(datadir / "data.conllx"))
    expected = Counter({'sub_function_one': 2, 'sub_function_two': 2})
    assert stats.sub_function == expected
def test_collect_domain(datadir):
    """Domain labels are tallied across the whole corpus."""
    stats = CorpusStatistics.create_from_corpus(
        Corpus.read_from_file(datadir / "data.conllx"))
    assert stats.domain == Counter({"domain_one": 2, "domain_two": 2})
def test_create_from_corpus(datadir):
    """Pattern extraction produces exactly the two expected templates."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    result = CorpusPattern.create_from_corpus(corpus)
    result_str_list = sorted([str(i) for i in result])
    expected_str_list = sorted(
        ["<PERSON> 在 <GPE> 的 <ORG> 读 书 。", "来 一 首 <歌手名> 的 歌 。"])
    # Guard against vacuous passes: zip() truncates silently, so an empty
    # pattern result would previously have skipped the loop entirely.
    assert len(result_str_list) == len(expected_str_list)
    for r, e in zip(result_str_list, expected_str_list):
        assert e in r
def test_as_string(datadir):
    """str() of a document is its metadata header plus the annotated text."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    first_doc = corpus[0]
    header = "<D: domain, F: function, S: sub_function, I: intent>"
    body = "[王 小 明](PERSON) 在 [北 京](GPE) 的 [清 华 大 学](ORG) 读 书 。"
    assert str(first_doc) == header + " " + body
def group_by_domain(input_file, output_dir):
    """Split *input_file* into one ``<domain>.conllx`` file per domain."""
    out_path = Path(output_dir)
    docs_by_domain = collections.defaultdict(list)
    for doc in Corpus.read_from_file(input_file):
        docs_by_domain[doc.domain].append(doc)
    for domain, docs in docs_by_domain.items():
        target = out_path / "{}.conllx".format(domain)
        Corpus(docs).write_to_file(target)
def test_attr_change(datadir, tmpdir):
    """Attribute edits survive a write/read round trip."""
    corpus = Corpus.read_from_file(datadir / "corpus.conllx")
    doc = corpus[0]
    changes = {
        "domain": "DOMAIN",
        "function": "FUNCTION",
        "intent": "INTENT",
        "sub_function": "SUB_FUNCTION",
    }
    for attr, value in changes.items():
        setattr(doc, attr, value)
    # Round trip through the file format.
    output_file = tmpdir / "data.conllx"
    corpus.write_to_file(output_file)
    reloaded_doc = Corpus.read_from_file(output_file)[0]
    for attr, value in changes.items():
        assert getattr(reloaded_doc, attr) == value
def test_collect_entity_values(datadir):
    """Each entity surface form maps to a counter of its entity types."""
    stats = CorpusStatistics.create_from_corpus(
        Corpus.read_from_file(datadir / "data.conllx"))
    expected = {
        ("王", "小", "明"): Counter({"PERSON": 2}),
        ("北", "京"): Counter({"GPE": 2}),
        ("清", "华", "大", "学"): Counter({"ORG": 2}),
        ("蓝", "泽", "雨"): Counter({"歌手名": 2}),
    }
    assert stats.entity_values == expected
def test_create_from_corpus(datadir):
    """Statistics built from the corpus compare equal to an all-None one.

    NOTE(review): equality against all-None fields presumably means unset
    fields are ignored by ``CorpusStatistics.__eq__`` — confirm.
    """
    stats = CorpusStatistics.create_from_corpus(
        Corpus.read_from_file(datadir / "data.conllx"))
    blank = CorpusStatistics(
        domain=None,
        function=None,
        sub_function=None,
        intent=None,
        entity_types=None,
        entity_values=None,
    )
    assert stats == blank
def get_expend_res(self):
    """Expand corpus patterns with the merged, deduplicated entity lists.

    Writes the raw sequence patterns to ``sequence_expend`` and the rendered
    expansion to ``data_expend_result`` (paths taken from ``self.config``).
    """
    corpus = Corpus.read_from_file(self.config['data_corpus'])
    # Build the two candidate lists, merge them, then drop duplicates.
    result_raw, result_new = Data_Expend(self.config['configure']).get_list()
    merged = Data_Expend.hebing(result_raw, result_new)
    unique = Data_Expend.quchong(merged)
    # Generate the sequence patterns and persist them before rendering.
    pattern = corpus.generate_pattern()
    pattern.write_to_file(self.config['sequence_expend'])
    expanded = pattern.render(unique)
    expanded.write_to_file(self.config['data_expend_result'])