def feature_mapping(sentences, features):
    """Build a vocabulary and id mappings for each extra feature column.

    :param sentences: list of sentences; each sentence is a list of tuples
        shaped like ``(word, feature_1, ..., tag)``.
    :param features: string of comma-separated column indexes, e.g. ``"1,2"``.
        Column ``0`` is skipped, and blank entries (an empty string or a
        trailing comma) are ignored.
    :return: ``(dico, feature_to_id, id_to_feature)``, three OrderedDicts
        keyed by the column-index string exactly as it appears in
        ``features``. The values are whatever ``create_dico`` and
        ``create_mapping`` produce for that column.
    :raises ValueError: if an entry is not blank and is not an integer.
    """
    dico = OrderedDict()
    feature_to_id = OrderedDict()
    id_to_feature = OrderedDict()

    for feature_i in features.split(","):
        column = feature_i.strip()
        # Blank entries come from "" or a stray comma such as "1,2,";
        # they used to reach int("") and crash.
        if not column or column == "0":
            continue
        col_idx = int(column)

        cur_feature = [[t[col_idx] for t in s] for s in sentences]
        cur_dico = create_dico(cur_feature)
        print("%sth feature found %i unique features" % (feature_i, len(cur_dico)))
        # NOTE(review): the huge count presumably ranks "<UNK>" first in a
        # frequency-ordered mapping -- confirm against create_mapping.
        cur_dico["<UNK>"] = 10000000
        cur_feature_to_id, cur_id_to_feature = create_mapping(cur_dico)

        # Keys keep the token as written so existing lookups still match.
        dico[feature_i] = cur_dico
        feature_to_id[feature_i] = cur_feature_to_id
        id_to_feature[feature_i] = cur_id_to_feature

    return dico, feature_to_id, id_to_feature
def tag_mapping(sentences):
    """Build the tag vocabulary and the tag <-> id lookup tables.

    The tag is read from the last field of every token tuple.

    :param sentences: list of sentences, each a list of token tuples.
    :return: ``(dico, tag_to_id, id_to_tag)`` as produced by ``create_dico``
        and ``create_mapping``.
    """
    # NOTE(review): a later definition with this same name in this file
    # rebinds ``tag_mapping``, so this version is shadowed at import time.
    tag_rows = []
    for sentence in sentences:
        tag_rows.append([token[-1] for token in sentence])

    dico = create_dico(tag_rows)
    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def tag_mapping(sentences):
    """Build tag and intent lookup tables and dump them under ``configs/``.

    For every token tuple the tag is read from index 1 and the intent from
    the last field. Four text files are written, one ``key:value`` pair per
    line: ``tag_to_id.txt``, ``id_to_tag.txt``, ``indent_to_id.txt`` and
    ``id_to_indent.txt``. The ``indent`` spelling is kept on purpose because
    other code may read these exact paths.

    :param sentences: list of sentences, each a list of token tuples.
    :return: ``(tag_to_id, id_to_tag, intent_to_id, id_to_intent)``.
    :raises OSError: if a file under ``configs`` cannot be opened for writing.
    """
    tags = [[char[1] for char in s] for s in sentences]
    intents = [[char[-1] for char in s] for s in sentences]

    # Compute every mapping before touching the disk, so a failure here
    # cannot leave truncated, empty files behind.
    tag_to_id, id_to_tag = create_mapping(create_dico(tags))
    intent_to_id, id_to_intent = create_mapping(create_dico(intents))

    outputs = (
        ('tag_to_id.txt', tag_to_id),
        ('id_to_tag.txt', id_to_tag),
        ('indent_to_id.txt', intent_to_id),
        ('id_to_indent.txt', id_to_intent),
    )
    for filename, mapping in outputs:
        # ``with`` closes (and therefore flushes) each file, even on error.
        with open(os.path.join('configs', filename), 'w', encoding='utf8') as out:
            out.writelines(
                str(k) + ":" + str(v) + "\n" for k, v in mapping.items()
            )

    return tag_to_id, id_to_tag, intent_to_id, id_to_intent
def char_mapping(sentences, lower):
    """Build the character vocabulary and the char <-> id lookup tables.

    The character is the first field of every token tuple. ``"<PAD>"`` and
    ``"<UNK>"`` are added to the vocabulary with very large counts.

    :param sentences: list of sentences, each a list of token tuples.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)`` as produced by ``create_dico``
        and ``create_mapping``.
    """
    if lower:
        chars = [[token[0].lower() for token in sent] for sent in sentences]
    else:
        chars = [[token[0] for token in sent] for sent in sentences]

    dico = create_dico(chars)
    # NOTE(review): the oversized counts presumably place the special tokens
    # first in a frequency-ordered mapping -- confirm against create_mapping.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    total_chars = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total_chars))
    return dico, char_to_id, id_to_char