def char_mapping(sentences, lower):
    """Build the character dictionary and its two lookup tables.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            tokens whose element 0 is the character string.
        lower: When truthy, every character is lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary returned by
        ``create_dico`` (plus the two special entries) and the pair of
        mappings returned by ``create_mapping``.
    """
    chars = []
    for sentence in sentences:
        row = []
        for token in sentence:
            symbol = token[0]
            row.append(symbol.lower() if lower else symbol)
        chars.append(row)

    dico = create_dico(chars)
    # Oversized pseudo-counts for the special tokens; presumably they keep
    # <PAD> and <UNK> ahead of real characters when create_mapping orders
    # entries by count -- confirm against create_mapping.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """Build the tag dictionary and the tag <-> id lookup tables.

    Args:
        sentences: Iterable of sentences; each sentence is an iterable of
            tokens whose last element is the tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)``: the dictionary returned by
        ``create_dico`` and the two mappings returned by ``create_mapping``.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])

    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def tag_mapping(sentences):
    """Collect the tag of every token and derive the tag lookup tables.

    Args:
        sentences: Iterable of sentences; each token's last element is
            taken as its tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)`` as produced by ``create_dico`` and
        ``create_mapping``.
    """
    def last_field(token):
        return token[-1]

    tags = [list(map(last_field, sentence)) for sentence in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    unique_count = len(dico)
    print("Found %i unique named entity tags" % unique_count)
    return dico, tag_to_id, id_to_tag
def mark_mapping(sentences):
    """Build the mark dictionary and the mark <-> id lookup tables.

    Args:
        sentences: Iterable of sentences; element 1 of each token is taken
            as its mark.

    Returns:
        ``(dico, mark_to_id, id_to_mark)``: the dictionary from
        ``create_dico`` with an added ``<UNK>`` entry, and the two mappings
        returned by ``create_mapping``.
    """
    marks = []
    for sentence in sentences:
        marks.append([token[1] for token in sentence])

    dico = create_dico(marks)
    # Large pseudo-count for the unknown-mark entry (presumably so it is
    # ranked first by create_mapping -- confirm).
    dico['<UNK>'] = 10000000
    mark_to_id, id_to_mark = create_mapping(dico)
    return dico, mark_to_id, id_to_mark
def tag_mapping(sentences):
    """Build the tag dictionary.

    Args:
        sentences: Iterable of sentences; element 1 of each token is taken
            as its tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)`` as produced by
        ``data_utils.create_dico`` and ``data_utils.create_mapping``.
    """
    tag_list = []
    for sentence in sentences:
        row = []
        for token in sentence:
            row.append(token[1])
        tag_list.append(row)

    dico = data_utils.create_dico(tag_list)
    tag_to_id, id_to_tag = data_utils.create_mapping(dico)
    return dico, tag_to_id, id_to_tag
def pos_mapping(sentences):
    """Build the part-of-speech dictionary and its lookup tables.

    Args:
        sentences: Iterable of sentences; element 1 of each token is taken
            as its part-of-speech label.

    Returns:
        ``(dico, pos_to_id, id_to_pos)``: the dictionary from
        ``create_dico`` with an added ``"UNK"`` entry, and the two mappings
        returned by ``create_mapping``.
    """
    def pos_of(token):
        return token[1]

    pos = [[pos_of(token) for token in sentence] for sentence in sentences]
    dico = create_dico(pos)
    # NOTE(review): the key is "UNK" without angle brackets -- confirm that
    # callers look up "UNK" rather than "<UNK>".
    dico["UNK"] = 100000000
    pos_to_id, id_to_pos = create_mapping(dico)

    unique_count = len(dico)
    print("Found %i unique part of speech tags" % unique_count)
    return dico, pos_to_id, id_to_pos
def pos_mapping(sentences):
    """Build the part-of-speech dictionary and its lookup tables.

    Args:
        sentences: Iterable of sentences; the second-to-last element of
            each token is taken as its part-of-speech label.

    Returns:
        ``(dico_pos, pos_to_id, id_to_pos)``: the dictionary from
        ``create_dico`` with an added ``<UNK>`` entry, and the two mappings
        returned by ``create_mapping``.
    """
    poss = []
    for sentence in sentences:
        labels = []
        for token in sentence:
            labels.append(token[-2])
        poss.append(labels)

    dico_pos = create_dico(poss)
    # Large pseudo-count for the unknown-label entry (presumably so it is
    # ranked first by create_mapping -- confirm).
    dico_pos['<UNK>'] = 10000000
    pos_to_id, id_to_pos = create_mapping(dico_pos)
    print("Found %i unique named entity poss" % len(dico_pos))
    return dico_pos, pos_to_id, id_to_pos
def char_mapping(sentences, lower):
    """Build the character dictionary and the char <-> id lookup tables.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary from
        ``create_dico`` with an added ``<UNK>`` entry, and the two mappings
        returned by ``create_mapping``.
    """
    if lower:
        chars = [[token[0].lower() for token in sentence]
                 for sentence in sentences]
    else:
        chars = [[token[0] for token in sentence] for sentence in sentences]

    dico = create_dico(chars)
    # Large pseudo-count for the unknown-character entry (presumably so it
    # is ranked first by create_mapping -- confirm).
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: Decides whether every character is converted to lower case.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the per-character count
        dictionary (plus ``<PAD>`` and ``<UNK>``) and the two mappings
        returned by ``create_mapping``.
    """
    def normalise(symbol):
        return symbol.lower() if lower else symbol

    # One inner list per sentence, one entry per token.
    chars = [[normalise(token[0]) for token in sentence]
             for sentence in sentences]

    # Count how often each character occurs.
    dico = create_dico(chars)
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    char_to_id, id_to_char = create_mapping(dico)

    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """Build a tag-frequency dictionary and the two-way tag <-> id mappings.

    Args:
        sentences: Iterable of sentences; the last element of each token is
            taken as its tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)`` as produced by ``create_dico`` and
        ``create_mapping``.
    """
    tags = []
    for sentence in sentences:
        per_sentence = []
        for token in sentence:
            per_sentence.append(token[-1])
        tags.append(per_sentence)

    # Dictionary keyed by tag, built from how often each tag occurs.
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def eojeol_mapping(sentences):
    """Build the eojeol dictionary (noted by the author as probably unused).

    Args:
        sentences: Iterable of sentences; the second-to-last element of
            each sentence is an iterable of words.

    Returns:
        ``(dico, eojeol_to_id, id_to_eojeol)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    eojeol = []
    for sentence in sentences:
        wrapped = []
        for word in sentence[-2]:
            # Each word is wrapped in its own single-element list.
            wrapped.append([word])
        eojeol.append(wrapped)

    dico = create_dico(eojeol)
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    eojeol_to_id, id_to_eojeol = create_mapping(dico)
    print("Found %i unique words" % (len(dico)))
    return dico, eojeol_to_id, id_to_eojeol
def word_mapping(sentences):
    """Build the word dictionary.

    Args:
        sentences: Iterable of sentences; element 0 of each token is taken
            as the word.

    Returns:
        ``(dico, word_to_id, id_to_word)``: the dictionary from
        ``data_utils.create_dico`` with ``<PAD>`` and ``<UNK>`` added, and
        the two mappings returned by ``data_utils.create_mapping``.
    """
    word_list = []
    for sentence in sentences:
        word_list.append([token[0] for token in sentence])

    dico = data_utils.create_dico(word_list)
    # Oversized pseudo-counts for the special tokens (presumably to rank
    # them first in the mapping -- confirm against create_mapping).
    for special, pseudo_count in (('<PAD>', 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    word_to_id, id_to_word = data_utils.create_mapping(dico)
    return dico, word_to_id, id_to_word
def tag_mapping(sentences):
    """Build the tag dictionary, including ``[SEP]`` and ``[CLS]``, and its mappings.

    Args:
        sentences: Iterable of sentences; the last element of each token is
            taken as its tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)``: the dictionary from
        ``create_dico`` with ``[SEP]`` and ``[CLS]`` entries added, and the
        two mappings returned by ``create_mapping``.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])

    dico = create_dico(tags)
    # Each size is read immediately before its own insertion, so the
    # second value already reflects the '[SEP]' entry added just above.
    sep_value = len(dico) + 1
    dico['[SEP]'] = sep_value
    cls_value = len(dico) + 2
    dico['[CLS]'] = cls_value

    tag_to_id, id_to_tag = create_mapping(dico)
    return dico, tag_to_id, id_to_tag
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the character/frequency
        dictionary (plus ``<PAD>`` and ``<UNK>``) and the two mappings
        returned by ``create_mapping``.
    """
    chars = []
    for sentence in sentences:
        row = []
        for token in sentence:
            symbol = token[0]
            row.append(symbol.lower() if lower else symbol)
        chars.append(row)

    # Each character together with how often it occurs.
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001  # padding token
    dico['<UNK>'] = 10000000  # out-of-vocabulary token
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    if lower:
        chars = [[token[0].lower() for token in sentence]
                 for sentence in sentences]
    else:
        chars = [[token[0] for token in sentence] for sentence in sentences]

    dico = create_dico(chars)
    # Oversized pseudo-counts for the special tokens (presumably to rank
    # them first in the mapping -- confirm against create_mapping).
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    unique_count = len(dico)
    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (unique_count, total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: Whether to ignore case (characters are lower-cased when
            truthy).

    Returns:
        ``(dico, char_to_id, id_to_char)``: the frequency dictionary (plus
        ``<PAD>`` and ``<UNK>``) and the two mappings returned by
        ``create_mapping``.
    """
    def normalise(symbol):
        return symbol.lower() if lower else symbol

    # Pull the character out of every token of every sentence.
    chars = [[normalise(token[0]) for token in sentence]
             for sentence in sentences]

    # create_dico builds the frequency dictionary.
    dico = create_dico(chars)
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def affix_mapping_with_pos(sentences, type, size, frequency):
    """Build the affix (prefix or suffix) dictionary and its lookup tables.

    The third-from-last element of each sentence is read as a list of
    eojeol strings. Each eojeol is split on ``'|'``; the part before the
    first ``'/'`` of every piece is concatenated into a surface word, and
    a fixed-width affix is derived from that word.

    Args:
        sentences: Sentence records in list form.
        type: ``'prefix'`` or ``'suffix'``. For any other value the affix
            is only the ``'^'`` padding (short words), the word itself
            (exact length) or the empty string (long words).
        size: Width of the affix in characters. Shorter words are padded
            with ``'^'`` (in front for a prefix, behind for a suffix).
        frequency: Threshold forwarded to ``only_frequent_affix``.

    Returns:
        ``(aff_dico, aff_to_id, id_to_aff)``: the filtered affix
        dictionary with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    def build_affix(surface):
        length = len(surface)
        if length == size:
            return surface
        if length < size:
            padding = "^" * (size - length)
            if type == 'prefix':
                return padding + surface
            if type == 'suffix':
                return surface + padding
            return padding
        # Word is longer than `size`: `cut` is negative, so the slices
        # below keep the first / last `size` characters, e.g. with size 3
        # "apple" gives "app" (prefix) or "ple" (suffix).
        cut = size - length
        if type == 'prefix':
            return surface[:cut]
        if type == 'suffix':
            return surface[-cut:]
        return ""

    affixes = []
    for sentence in sentences:
        per_sentence = []
        for eojeol in sentence[-3]:
            surface = "".join(
                piece.split('/')[0] for piece in eojeol.split('|')
            )
            per_sentence.append(build_affix(surface))
        affixes.append(per_sentence)

    # Dictionary over every affix, then only the frequent ones.
    whole_aff_dico = create_dico(affixes)
    aff_dico = only_frequent_affix(whole_aff_dico, frequency)
    for special, pseudo_count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        aff_dico[special] = pseudo_count
    aff_to_id, id_to_aff = create_mapping(aff_dico)
    print("Found %i unique %s" % (len(aff_dico), type))
    return aff_dico, aff_to_id, id_to_aff
def pumsa_mapping(sentences):
    """Build the pumsa dictionary from sentence fields 5 through 8.

    Args:
        sentences: Sequence of sentence records; elements 5, 6, 7 and 8 of
            each record are iterables of pumsa labels.

    Returns:
        ``(dico, pumsa_to_id, id_to_pumsa)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    collected = []
    # All sentences for field 5 come first, then field 6, and so on.
    for field in (5, 6, 7, 8):
        for sentence in sentences:
            # Each label is wrapped in its own single-element list.
            collected.append([[word] for word in sentence[field]])

    dico = create_dico(collected)
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    pumsa_to_id, id_to_pumsa = create_mapping(dico)
    print("Found %i unique pumsa" % (len(pumsa_to_id)))
    return dico, pumsa_to_id, id_to_pumsa
def word_mapping(sentences):
    """Build the word dictionary from sentence fields 1 through 4.

    Args:
        sentences: Sequence of sentence records; elements 1, 2, 3 and 4 of
            each record are iterables of words.

    Returns:
        ``(dico, word_to_id, id_to_word)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    # Field-major order: every sentence's field 1, then field 2, and so
    # on. Each word is wrapped in its own single-element list.
    all_words = [
        [[word] for word in sentence[field]]
        for field in (1, 2, 3, 4)
        for sentence in sentences
    ]

    dico = create_dico(all_words)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)

    unique_count = len(dico)
    print("Found %i unique words" % (unique_count))
    return dico, word_to_id, id_to_word
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the per-character count
        dictionary (plus ``<PAD>`` and ``<UNK>``) and the two mappings
        returned by ``create_mapping``.
    """
    chars = []
    for sentence in sentences:
        if lower:
            row = [token[0].lower() for token in sentence]
        else:
            row = [token[0] for token in sentence]
        chars.append(row)

    # Occurrence count per character.
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # Item->id and id->item dictionaries.
    char_to_id, id_to_char = create_mapping(dico)

    # Report the number of distinct entries and the total character count.
    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Build the syllable (character) dictionary.

    Args:
        sentences: Iterable of sentence records; element 1 of each record
            is an iterable of words.
        lower: When truthy, each word is lower-cased before being split
            into characters.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    chars = []
    for sentence in sentences:
        per_word = []
        for word in sentence[1]:
            text = word.lower() if lower else word
            # One list of characters per word.
            per_word.append(list(text))
        chars.append(per_word)

    dico = create_dico(chars)
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique chars" % (len(dico)))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, upper-case characters are converted to lower
            case (the original author notes the expected value is True).

    Returns:
        ``(dico, char_to_id, id_to_char)``: the per-character count
        dictionary (plus ``<PAD>`` and ``<UNK>``) and the two translation
        dictionaries returned by ``create_mapping``.
    """
    def normalise(symbol):
        return symbol.lower() if lower else symbol

    # Every character of the corpus, grouped by sentence.
    chars = []
    for sentence in sentences:
        chars.append([normalise(token[0]) for token in sentence])

    # Number of occurrences of each character.
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # Per the original notes: the more frequent a character, the smaller
    # its id.
    char_to_id, id_to_char = create_mapping(dico)

    unique_count = len(dico)
    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (unique_count, total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Build a dictionary and the mapping of each entry, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, each entry is converted to lower case.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary from
        ``create_dico`` with the special tokens added, and the forward and
        reverse dictionaries returned by ``create_mapping``.
    """
    if lower:
        chars = [[token[0].lower() for token in sentence]
                 for sentence in sentences]
    else:
        chars = [[token[0] for token in sentence] for sentence in sentences]

    dico = create_dico(chars)
    # Special tokens.
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """Create a dictionary and a mapping of tags, sorted by frequency.

    Args:
        sentences: Iterable of sentences; the last element of each token is
            taken as its tag (e.g. ``'O'``, ``'B-SLOC'``, ``'E-TYPE'``).

    Returns:
        ``(dico, tag_to_id, id_to_tag)``: the tag -> count dictionary from
        ``create_dico`` and the two mappings returned by
        ``create_mapping``. In the sample recorded by the original author
        the most frequent tag received id 0.
    """
    tags = []
    for sentence in sentences:
        per_sentence = []
        for token in sentence:
            per_sentence.append(token[-1])
        tags.append(per_sentence)

    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    unique_count = len(dico)
    print("Found %i unique named entity tags" % unique_count)
    return dico, tag_to_id, id_to_tag
def char_mapping(sentences, lower):
    """Build a frequency dictionary and the two-way char <-> id mappings.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the character -> frequency
        dictionary (plus ``<PAD>`` and ``<UNK>``) and the two mappings
        returned by ``create_mapping``.
    """
    def normalise(symbol):
        return symbol.lower() if lower else symbol

    chars = [[normalise(token[0]) for token in sentence]
             for sentence in sentences]

    # Dictionary of character -> frequency.
    dico = create_dico(chars)
    # <PAD> gets the largest count so that, per the original author's
    # note, it ends up with index 0 in the mapping (sequences are padded
    # with 0); <UNK> stands for unknown characters.
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    char_to_id, id_to_char = create_mapping(dico)

    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is a
            word, which is broken into its individual characters.
        lower: When truthy, every character is lower-cased.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    # One flat list of characters per sentence, covering all of its words.
    chars = [
        [(c.lower() if lower else c) for token in sentence for c in token[0]]
        for sentence in sentences
    ]

    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    unique_count = len(dico)
    total = sum(len(row) for row in chars)
    print("Found %i unique chars (%i in total)" % (unique_count, total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Build a dictionary from the characters of the sentences.

    Args:
        sentences: Iterable of sentences; each sentence is itself an
            iterable of character strings (``[[w1, w2, ...], ...]``).
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary, the
        char -> id mapping and the id -> char mapping.
    """
    chars = []
    for sentence in sentences:
        if lower:
            row = [symbol.lower() for symbol in sentence]
        else:
            row = list(sentence)
        chars.append(row)

    dico = create_dico(chars)
    # <PAD> is noted as currently unused; its large value is meant to give
    # it id 0. <UNK> stands for out-of-vocabulary entries.
    dico[u"<PAD>"] = 10000001
    dico[u"<UNK>"] = 10000000
    char_to_id, id_to_char = create_mapping(dico)

    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the dictionary from
        ``create_dico`` with ``<PAD>`` and ``<UNK>`` added, and the two
        mappings returned by ``create_mapping``.
    """
    chars = []
    for sentence in sentences:
        row = []
        for token in sentence:
            symbol = token[0]
            row.append(symbol.lower() if lower else symbol)
        chars.append(row)

    dico = create_dico(chars)
    # Oversized pseudo-counts for the padding and unknown tokens; the
    # sample output recorded by the original author shows '<PAD>' paired
    # with 0 and '<UNK>' paired with 1.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """Create the tag dictionary and mappings, and dump both mappings to disk.

    Args:
        sentences: Iterable of sentences; the last element of each token is
            taken as its tag (e.g. ``'O'``, ``'B-DRU'``, ``'E-SYM'``).

    Returns:
        ``(dico, tag_to_id, id_to_tag)``: the tag -> count dictionary from
        ``create_dico`` and the two mappings returned by
        ``create_mapping``.

    Side effects:
        Overwrites ``tag_to_id.txt`` and ``id_to_tag.txt`` in the current
        working directory (UTF-8), one ``key:value`` pair per line, e.g.
        ``O:0``.

    Raises:
        OSError: If either output file cannot be opened or written.
    """
    tags = [[token[-1] for token in sentence] for sentence in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    # Context managers make sure each handle is flushed and closed before
    # the function returns, even if a write raises.
    with open('tag_to_id.txt', 'w', encoding='utf8') as f:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open('id_to_tag.txt', 'w', encoding='utf8') as f1:
        for k, v in id_to_tag.items():
            f1.write(str(k) + ":" + str(v) + "\n")

    return dico, tag_to_id, id_to_tag
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the character -> count
        dictionary (plus ``<PAD>`` and ``<UNK>``) and the two mappings
        returned by ``create_mapping``.
    """
    if lower:
        chars = [[token[0].lower() for token in sentence]
                 for sentence in sentences]
    else:
        chars = [[token[0] for token in sentence] for sentence in sentences]

    dico = create_dico(chars)
    # In the sample recorded by the original author, id 0 maps to '<PAD>'
    # and id 1 to '<UNK>'.
    for special, pseudo_count in (("<PAD>", 10000001), ('<UNK>', 10000000)):
        dico[special] = pseudo_count
    char_to_id, id_to_char = create_mapping(dico)

    unique_count = len(dico)
    total = sum(len(row) for row in chars)
    print("Found %i unique words (%i in total)" % (unique_count, total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """Create a dictionary and a mapping of characters, sorted by frequency.

    Args:
        sentences: Iterable of sentences; element 0 of each token is the
            character string.
        lower: When truthy, characters are lower-cased before counting.

    Returns:
        ``(dico, char_to_id, id_to_char)``: the character -> frequency
        dictionary with ``'<PAD>': 10000001`` and ``'<UNK>': 10000000``
        added, and the two mappings returned by ``create_mapping``.
    """
    def normalise(symbol):
        return symbol.lower() if lower else symbol

    chars = []
    for sentence in sentences:
        chars.append([normalise(token[0]) for token in sentence])

    # Characters and the frequency with which each one occurs.
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # In the sample recorded by the original author, '<PAD>' maps to 0 and
    # '<UNK>' to 1, followed by the real characters.
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """Create the tag dictionary and mappings, and write both mappings to disk.

    Args:
        sentences: Iterable of sentences; the last element of each token is
            taken as its tag.

    Returns:
        ``(dico, tag_to_id, id_to_tag)``: the dictionary returned by
        ``create_dico`` and the two mappings returned by
        ``create_mapping``.

    Side effects:
        Overwrites ``tag_to_id.txt`` and ``id_to_tag.txt`` in the current
        working directory (UTF-8), one ``key:value`` pair per line.

    Raises:
        OSError: If either output file cannot be opened or written.
    """
    tags = [[token[-1] for token in sentence] for sentence in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    # Both handles are closed (and their buffers flushed) when the block
    # exits, including when a write raises.
    with open('tag_to_id.txt', 'w', encoding='utf8') as f, \
            open('id_to_tag.txt', 'w', encoding='utf8') as f1:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
        for k, v in id_to_tag.items():
            f1.write(str(k) + ":" + str(v) + "\n")

    return dico, tag_to_id, id_to_tag