def statistic_entity_for_sequence_labeling(input_file_path, ner_type="flat"):
    """Count entity occurrences per tag in a sequence-labeling dataset.

    Args:
        input_file_path: path to the annotated file. For "flat" NER the
            file is in CoNLL/BMES format, one token and label per line:
                王 B-PERSON
                小 M-PERSON
                明 E-PERSON
        ner_type: one of ["flat", "nest"]; "nest" is not implemented yet.
    Returns:
        dict mapping entity tag (e.g. "PERSON") -> number of occurrences.
    Raises:
        ValueError: if ner_type is neither "flat" nor "nest".
    """
    entity_statistic_dict = {}

    if ner_type == "flat":
        annotate_sentences = load_conll(input_file_path)
        # Each item is a (tokens, labels) pair; unpack in the loop header.
        for word_items, label_items in annotate_sentences:
            sentence, tags = bmes_decode([[c, l] for c, l in zip(word_items, label_items)])
            # tags: [{'term': '上海', 'tag': 'GPE', 'begin': 0, 'end': 2}, {'term': '浦东', 'tag': 'GPE', 'begin': 2, 'end': 4}]
            for entity_item in tags:
                # dict.get avoids the `key in d.keys()` double-lookup branch.
                entity_statistic_dict[entity_item.tag] = entity_statistic_dict.get(entity_item.tag, 0) + 1
    elif ner_type == "nest":
        # TODO: nested-entity counting is not implemented yet.
        pass
    else:
        raise ValueError("Please notice your entity type do not exists !!")

    print("check the number of entities ")
    print("=*=" * 20)
    # sum() accepts the values view directly — no intermediate list needed.
    sum_of_entity = sum(entity_statistic_dict.values())
    print("Total number of entity is : {}".format(sum_of_entity))
    for entity_k, entity_v in entity_statistic_dict.items():
        print("{} -> {}".format(entity_k, entity_v))

    return entity_statistic_dict
# ---- Example 2 ----
def generate_query_ner_dataset(source_file_path,
                               dump_file_path,
                               entity_sign="nested",
                               dataset_name=None,
                               query_sign="default"):
    """Convert a NER dataset into MRC-style (query-based) examples and dump JSON.

    Args:
        source_file_path: e.g. /data/genia/train.word.json | /data/msra/train.char.bmes
        dump_file_path: e.g. /data/genia-mrc/train.mrc.json | /data/msra-mrc/train.mrc.json
        entity_sign: one of ["nested", "flat"]
        dataset_name: one in ["en_ontonotes5", "en_conll03", ...]
        query_sign: default is "default"
    Raises:
        ValueError: if entity_sign is neither "nested" nor "flat".
        KeyError: if dataset_name / query_sign are not in queries_for_dataset.
    """
    # Look up the natural-language queries and label list for this dataset.
    entity_queries = queries_for_dataset[dataset_name][query_sign]
    label_lst = queries_for_dataset[dataset_name]["labels"]

    if entity_sign == "nested":
        # Nested datasets are stored as JSON; read with an explicit encoding
        # so platform defaults cannot mangle non-ASCII characters.
        with open(source_file_path, "r", encoding="utf-8") as f:
            source_data = json.load(f)
    elif entity_sign == "flat":
        # Flat datasets are stored in CoNLL/BMES format.
        source_data = load_conll(source_file_path)
    else:
        raise ValueError("ENTITY_SIGN can only be NESTED or FLAT.")

    target_data = transform_examples_to_qa_features(entity_queries,
                                                    label_lst,
                                                    source_data,
                                                    entity_sign=entity_sign)

    # ensure_ascii=False writes non-ASCII (e.g. Chinese) characters verbatim,
    # so the output file must be opened with an explicit UTF-8 encoding.
    with open(dump_file_path, "w", encoding="utf-8") as f:
        json.dump(target_data, f, sort_keys=True, ensure_ascii=False, indent=2)