import codecs
import os
import re
from collections import OrderedDict

# create_dico and create_mapping are helpers defined elsewhere in this module:
# create_dico builds a frequency dictionary from a list of item lists, and
# create_mapping turns a frequency dictionary into item<->id mappings.


def feature_mapping(sentences, features):
    """
    Create mappings between features and indices.
    :param sentences: list of list of tuple,
        [[(word11, feature11, ..., tag11), ...],
         [(word21, feature21, ..., tag21), ...], ...]
    :param features: string, comma-separated column indices of the features
        to map (column 0, the word itself, is skipped)
    :return: dico: OrderedDict of per-column frequency dictionaries
             feature_to_id: OrderedDict of per-column {feature: id} mappings
             id_to_feature: OrderedDict of per-column {id: feature} mappings
    """
    dico = OrderedDict()
    feature_to_id = OrderedDict()
    id_to_feature = OrderedDict()
    features_list = features.split(",")
    for feature_i in features_list:
        # Column 0 holds the word itself, not an extra feature
        if feature_i == "0":
            continue
        cur_feature = [[t[int(feature_i)] for t in s] for s in sentences]
        cur_dico = create_dico(cur_feature)
        print("Feature column %s: %i unique values found" % (feature_i, len(cur_dico)))
        cur_dico["<UNK>"] = 10000000
        cur_feature_to_id, cur_id_to_feature = create_mapping(cur_dico)
        dico[feature_i] = cur_dico
        feature_to_id[feature_i] = cur_feature_to_id
        id_to_feature[feature_i] = cur_id_to_feature
    return dico, feature_to_id, id_to_feature
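# Illustrative usage sketch (not part of the original module): builds feature
# mappings from a toy corpus where column 1 holds a part-of-speech feature.
# The toy data and the "1" column spec are assumptions for demonstration only,
# and create_dico/create_mapping are assumed to be in scope.
def _demo_feature_mapping():
    sample_sentences = [
        [("He", "PRP", "O"), ("runs", "VBZ", "O")],
        [("She", "PRP", "O"), ("sleeps", "VBZ", "O")],
    ]
    dico, feature_to_id, id_to_feature = feature_mapping(sample_sentences, "1")
    # If create_mapping assigns ids by decreasing frequency, the inflated
    # <UNK> count should place it first, e.g. {'<UNK>': 0, 'PRP': 1, 'VBZ': 2}
    print(feature_to_id["1"])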
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[char[-1] for char in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
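# Illustrative usage sketch (assumed toy data): each token tuple ends with its
# tag, so tag_mapping reads the last element of every tuple.
def _demo_tag_mapping():
    sample_sentences = [
        [("北", "B-LOC"), ("京", "I-LOC"), ("很", "O"), ("大", "O")],
    ]
    dico, tag_to_id, id_to_tag = tag_mapping(sample_sentences)
    # With ids assigned by decreasing frequency, 'O' (count 2) likely gets id 0
    print(tag_to_id)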
def tag_intent_mapping(sentences):
    """
    Create dictionaries and mappings for both slot tags and intent labels,
    sorted by frequency, and write the mappings to files under configs/.
    Each token tuple is expected to carry the slot tag in column 1 and the
    intent label in the last column.
    """
    tags = []
    intents = []
    for s in sentences:
        ts = []
        ints = []
        for char in s:
            ts.append(char[1])     # slot tag
            ints.append(char[-1])  # intent label
        tags.append(ts)
        intents.append(ints)
    dico_tags = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico_tags)
    dico_intents = create_dico(intents)
    intent_to_id, id_to_intent = create_mapping(dico_intents)
    # Persist the mappings; context managers ensure the files are closed
    with open(os.path.join('configs', 'tag_to_id.txt'), 'w', encoding='utf8') as f:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open(os.path.join('configs', 'id_to_tag.txt'), 'w', encoding='utf8') as f:
        for k, v in id_to_tag.items():
            f.write(str(k) + ":" + str(v) + "\n")
    with open(os.path.join('configs', 'intent_to_id.txt'), 'w', encoding='utf8') as f:
        for k, v in intent_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open(os.path.join('configs', 'id_to_intent.txt'), 'w', encoding='utf8') as f:
        for k, v in id_to_intent.items():
            f.write(str(k) + ":" + str(v) + "\n")
    return tag_to_id, id_to_tag, intent_to_id, id_to_intent
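# Illustrative usage sketch (assumed toy data): token tuples carry
# (char, slot_tag, intent). configs/ must exist because the function writes
# its mapping files there, so the demo creates it first.
def _demo_tag_intent_mapping():
    os.makedirs('configs', exist_ok=True)
    sample_sentences = [
        [("放", "O", "music.play"), ("一", "O", "music.play"),
         ("首", "O", "music.play"), ("歌", "O", "music.play")],
    ]
    tag_to_id, id_to_tag, intent_to_id, id_to_intent = \
        tag_intent_mapping(sample_sentences)
    print(tag_to_id, intent_to_id)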
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of characters, sorted by frequency.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique chars (%i in total)" % (
        len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
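# Illustrative usage sketch (assumed toy data): the inflated counts for <PAD>
# and <UNK> should place them at the front of the mapping if create_mapping
# assigns ids by decreasing frequency.
def _demo_char_mapping():
    sample_sentences = [
        [("北", "B-LOC"), ("京", "I-LOC")],
        [("北", "B-LOC"), ("方", "I-LOC")],
    ]
    dico, char_to_id, id_to_char = char_mapping(sample_sentences, lower=True)
    # Expected to start with {'<PAD>': 0, '<UNK>': 1, '北': 2, ...}
    print(char_to_id)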
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise we only add the words given by `chars`
    (typically the words in the development and test sets).
    :param dictionary: frequency dictionary of characters from the training set
    :param ext_emb_path: path to the pretrained embedding file
    :param chars: set of characters from the test set, or None
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file: the first whitespace-separated
    # token of each non-empty line is the word
    with codecs.open(ext_emb_path, 'r', 'utf-8') as f:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in f
            if len(line.strip()) > 0
        )

    # We either add every word in the pretrained file, or only the words in
    # `chars` to which we can assign a pretrained embedding (matching the
    # word itself, its lowercase form, or its lowercase form with digits
    # normalized to 0)
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if any(x in pretrained for x in [
                char,
                char.lower(),
                re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
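# Illustrative usage sketch (assumed toy data): writes a tiny embedding file
# to a temporary location, then augments a training dictionary with the test
# characters that have pretrained vectors.
def _demo_augment_with_pretrained():
    import tempfile
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                     encoding='utf-8') as tmp:
        tmp.write("京 0.1 0.2 0.3\n方 0.4 0.5 0.6\n")
        emb_path = tmp.name
    train_dico = {"北": 2, "京": 1}
    test_chars = {"京", "方", "南"}
    dico, word_to_id, id_to_word = augment_with_pretrained(
        train_dico, emb_path, test_chars)
    # '方' is added with count 0 (it has an embedding); '南' is skipped
    print(sorted(dico))
    os.remove(emb_path)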