def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `chars` is None, every word with a pretrained embedding is added to
    the dictionary; otherwise only the words in `chars` (typically the
    words in the development and test sets) that can be matched to a
    pretrained entry are added.

    :param dictionary: word -> frequency dict, updated in place
    :param ext_emb_path: path to the pretrained embedding text file
    :param chars: iterable of candidate words, or None to add everything
    :return: (dictionary, word_to_id, id_to_word)
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load the vocabulary (first token per line) of the embedding file.
    # BUGFIX: the original filtered on `len(ext_emb_path) > 0`, which is
    # always true, so a blank line raised IndexError on split()[0]; skip
    # blank lines instead, and close the handle via a context manager.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    # Either add every pretrained word, or only candidates whose raw,
    # lowercased, or digit-normalized form has a pretrained embedding.
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if any(x in pretrained for x in [
                char,
                char.lower(),
                re.sub(r'\d', '0', char.lower())  # raw string: no invalid-escape warning
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def tag_mapping(sentences):
    """
    Build a frequency dictionary over NER tags together with the
    tag<->id mappings, ordered by frequency.
    """
    tag_sequences = [[token[-1] for token in sentence] for sentence in sentences]
    dico = create_dico(tag_sequences)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def word_mapping(sentences):
    """
    Build the vocabulary dictionary and the word_to_id / id_to_word mappings.

    :param sentences: list of sentences, each a list of [word, ...] tokens
    :return: (word frequency dict, word_to_id, id_to_word)
    """
    tokens = [[token[0] for token in sentence] for sentence in sentences]
    dico = data_utils.create_dico(tokens)
    # Give <PAD>/<UNK> huge pseudo-counts so the frequency-sorted mapping
    # assigns them the first ids.
    dico['<PAD>'] = 10000001
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = data_utils.create_mapping(dico)
    return dico, word_to_id, id_to_word
def word_mapping(sentences):
    """
    Build the word vocabulary.

    :param sentences: corpus data (list of sentences of [word, ...] tokens)
    :return: (word count dict, word->id mapping, id->word mapping)
    """
    tokens = [[item[0] for item in sentence] for sentence in sentences]
    word_count = data_utils.create_item_count(tokens)
    # Small trick: give the padding and unknown symbols artificially huge
    # counts so the frequency-sorted dictionary places them first.
    word_count["<PAD>"] = 10000001
    word_count["<UNK>"] = 10000000
    word_to_id, id_to_word = data_utils.create_mapping(word_count)
    return word_count, word_to_id, id_to_word
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of characters, sorted by frequency.

    :param sentences: list of sentences of [char, tag] tokens
    :param lower: lowercase characters before counting when True
    """
    if lower:
        chars = [[token[0].lower() for token in s] for s in sentences]
    else:
        chars = [[token[0] for token in s] for s in sentences]
    dico = create_dico(chars)
    # Oversized counts push <PAD>/<UNK> to the smallest ids after sorting.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(len(sentence_chars) for sentence_chars in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    :param sentences: list of sentences of [char, tag] tokens
    :param lower: lowercase characters before counting when True
    """
    chars = []
    for sentence in sentences:
        row = []
        for token in sentence:
            ch = token[0]
            row.append(ch.lower() if lower else ch)
        chars.append(row)
    dico = create_dico(chars)
    # Huge pseudo-frequencies pin the special symbols to ids 0 and 1.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency, with
    extra [SEP]/[CLS] entries added for BERT-style inputs.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])
    dico = create_dico(tags)
    # Pseudo-counts for the special tokens; their exact values only affect
    # where they land in the frequency-sorted mapping.
    dico['[SEP]'] = len(dico) + 1
    dico['[CLS]'] = len(dico) + 2
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def affix_mapping_with_pos(sentences, type, size, frequency):
    """
    Build an affix (prefix/suffix) dictionary of fixed length `size`.

    Affixes are currently extracted from the morpheme field of each
    sentence record (index -3), not from the raw word field (index -2);
    the original author noted the choice was still undecided.

    :param sentences: list-form sentence records; sentence[-3] appears to
        hold eojeol strings of the form "morph/POS|morph/POS|..."
        -- TODO confirm against the data loader
    :param type: 'prefix' or 'suffix' (NOTE: shadows the builtin `type`)
    :param size: n-gram length of the affix
    :param frequency: minimum-frequency threshold (e.g. appears >= 50 times)
    :return: (affix frequency dict, aff_to_id, id_to_aff)
    """
    affixes = []
    for sentence in sentences:
        affix = []
        # Word-based extraction (disabled):
        # for word in sentence[-2]:
        # Morpheme-based extraction:
        for eojeol in sentence[-3]:
            # Concatenate the surface form of every morpheme in the eojeol.
            word = ""
            for w in eojeol.split('|'):
                word += w.split('/')[0]
            aff_tmp = ""
            if len(word) < size:
                # Pad short words with '^' up to `size` characters.
                for i in range(size - len(word)):
                    aff_tmp += "^"
                if type == 'prefix':
                    aff_tmp = aff_tmp + word   # e.g. size 3: "ab" -> "^ab"
                elif type == 'suffix':
                    aff_tmp = word + aff_tmp   # e.g. size 3: "ab" -> "ab^"
            elif len(word) == size:
                aff_tmp = word
            else:
                # idx is negative here: len(word) - size chars are dropped.
                idx = size - len(word)
                if type == 'prefix':
                    aff_tmp = word[:idx]       # size 3: "apple" -> "app"
                elif type == 'suffix':
                    aff_tmp = word[-idx:]      # size 3: "apple" -> "ple"
            affix.append(aff_tmp)
        affixes.append(affix)
    whole_aff_dico = create_dico(affixes)  # counts over ALL affixes
    aff_dico = only_frequent_affix(whole_aff_dico, frequency)  # frequent subset
    aff_dico["<PAD>"] = 10000001
    aff_dico["<UNK>"] = 10000000
    aff_to_id, id_to_aff = create_mapping(aff_dico)
    print("Found %i unique %s" % (len(aff_dico), type))
    return aff_dico, aff_to_id, id_to_aff
def word_mapping(sentences):
    """
    Build the word dictionary from the four word columns (indices 1-4)
    of every sentence record.
    """
    columns = []
    for idx in (1, 2, 3, 4):
        columns += [[[word] for word in s[idx]] for s in sentences]
    dico = create_dico(columns)
    # Oversized counts pin the special symbols to the first ids.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    print("Found %i unique words" % (len(dico)))
    return dico, word_to_id, id_to_word
def pumsa_mapping(sentences):
    """
    Build the POS-tag (pumsa) dictionary from the four tag columns
    (indices 5-8) of every sentence record.
    """
    columns = []
    for idx in (5, 6, 7, 8):
        columns += [[[tag] for tag in s[idx]] for s in sentences]
    dico = create_dico(columns)
    # Oversized counts pin the special symbols to the first ids.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    pumsa_to_id, id_to_pumsa = create_mapping(dico)
    print("Found %i unique pumsa" % (len(pumsa_to_id)))
    return dico, pumsa_to_id, id_to_pumsa
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of characters, sorted by frequency.

    :param sentences: list of sentences of [char, tag] tokens
    :param lower: when True, characters are lowercased before counting
    """
    normalize = (lambda c: c.lower()) if lower else (lambda c: c)
    chars = [[normalize(token[0]) for token in s] for s in sentences]
    dico = create_dico(chars)  # character -> frequency
    # Huge pseudo-counts place the special symbols at the lowest ids.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Build the syllable (character) dictionary and the char<->id mappings.

    :param sentences: sentence records whose index 1 holds a list of words
    :param lower: lowercase each word before splitting into characters
    """
    if lower:
        processed = [[list(word.lower()) for word in s[1]] for s in sentences]
    else:
        processed = [[list(word) for word in s[1]] for s in sentences]
    dico = create_dico(processed)
    # Oversized counts pin the special symbols to the first ids.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique chars" % (len(dico)))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    # Optionally lowercase every character before counting.
    chars = [[tok[0].lower() if lower else tok[0] for tok in sent]
             for sent in sentences]
    # Frequency dictionary over all characters in the corpus.
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # Mirror-image lookup tables, ordered by descending frequency.
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(len(x) for x in chars)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    When `lower` is set, every character is lowercased before the
    frequency count, so the vocabulary is case-insensitive.
    """
    extract = (lambda tok: tok[0].lower()) if lower else (lambda tok: tok[0])
    chars = [[extract(tok) for tok in sentence] for sentence in sentences]
    dico = create_dico(chars)  # char -> corpus frequency
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # Char/id lookup tables: higher frequency means a smaller id.
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Build a frequency dictionary over characters plus forward/backward
    char<->id mappings, ordered by frequency.

    :param sentences: list of sentences of [char, tag] tokens
    :param lower: lowercase characters before counting when True
    :return: (frequency dict, char_to_id, id_to_char)
    """
    chars = []
    for s in sentences:
        chars.append([x[0].lower() if lower else x[0] for x in s])
    dico = create_dico(chars)
    # Special symbols get huge pseudo-frequencies so they sort first.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.

    Each token's tag is the last element of its annotation list
    (BIOES-style labels such as 'B-SLOC', 'I-SLOC', 'O').
    """
    tag_sequences = []
    for sentence in sentences:
        tag_sequences.append([token[-1] for token in sentence])
    dico = create_dico(tag_sequences)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `chars` is None, every word with a pretrained embedding is added;
    otherwise only the words in `chars` (typically from the dev/test sets)
    that match some pretrained entry are added, with frequency 0.

    Args:
        dictionary: word -> frequency dict, updated in place
        ext_emb_path: path of the pretrained embedding file
        chars: candidate words, or None
    Return:
        dictionary, word_to_id, id_to_word
    """
    assert os.path.isfile(ext_emb_path)

    # Load every vocabulary token of the pretrained embedding file.
    # BUGFIX: the original guard `len(ext_emb_path) > 0` was always true,
    # so blank lines raised IndexError on split()[0]; skip them explicitly
    # and close the file with a context manager.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    if chars is None:
        # Merge the whole pretrained vocabulary into the dictionary.
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        # Add a candidate when its raw, lowercased, or digit-normalized
        # form appears in the pretrained vocabulary; frequency 0 marks it
        # as unseen in training.  The embedding vectors themselves are
        # not loaded here.
        for char in chars:
            if any(
                    x in pretrained
                    for x in [char, char.lower(), re.sub(r'\d', '0', char.lower())]
            ) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise we only add the words given in `chars`
    (typically the words in the development and test sets).

    :return: (dictionary, word_to_id, id_to_word)
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # BUGFIX: the original filter `len(ext_emb_path) > 0` was always true,
    # so a blank line crashed on split()[0]; skip blank lines instead and
    # close the file deterministically.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            # Match the raw form, the lowercase form, or the lowercase
            # form with digits normalized to '0' (raw regex string).
            if any(
                    x in pretrained
                    for x in [char, char.lower(), re.sub(r'\d', '0', char.lower())]
            ) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def char_mapping(sentences, lower):
    """
    Build a frequency dictionary from the dataset, then the bidirectional
    char<->id mappings.

    :param sentences: list of sentences of [char, tag] tokens
    :param lower: lowercase characters before counting when True
    :return: (frequency dict, char_to_id, id_to_char)
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)  # word -> frequency
    # A huge count guarantees <PAD> maps to index 0, matching the value
    # used when padding sequences; <UNK> covers out-of-vocabulary chars.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    unique, total = len(dico), sum(len(x) for x in chars)
    print("Found %i unique words (%i in total)" % (unique, total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of characters, sorted by frequency.

    Unlike the word-level variants, this flattens every word of every
    sentence into its individual characters before counting.
    """
    words = [[x[0] for x in s] for s in sentences]
    chars = [
        [c.lower() if lower else c for word in sentence for c in word]
        for sentence in words
    ]
    dico = create_dico(chars)
    # Oversized counts pin the special symbols to the first ids.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique chars (%i in total)" % (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `chars` is None, every word with a pretrained embedding is added;
    otherwise only the candidates in `chars` (typically dev/test words)
    that can be matched to a pretrained entry are added, with frequency 0.
    """
    assert os.path.isfile(ext_emb_path)  # e.g. data/vec.txt

    # First token of each line is the word; `set` removes duplicates.
    # BUGFIX: the original guard `len(ext_emb_path) > 0` was always true,
    # so blank lines raised IndexError on split()[0]; skip them and close
    # the file via a context manager.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    if chars is None:
        # No candidate list: merge the whole pretrained vocabulary in.
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            # Accept the raw char, its lowercase form, or its lowercase
            # form with digits replaced by '0' (raw regex string).
            if any(x in pretrained for x in [
                char,
                char.lower(),
                re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    # Rebuild the mappings over the augmented dictionary.
    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def char_mapping(sentences, lower):
    """
    Build a character dictionary from raw character sentences.

    :param sentences: list of sentences, each a plain list of characters
        (e.g. [[w1, w2, ...], [w1, w2, ...], ...])
    :param lower: lowercase characters before counting when True
    :return: (frequency dict, char_to_id, id_to_char)
    """
    chars = []
    for sentence in sentences:
        chars.append([char.lower() if lower else char for char in sentence])
    dico = create_dico(chars)
    dico[u"<PAD>"] = 10000001  # huge count so <PAD> receives id 0
    dico[u"<UNK>"] = 10000000  # out-of-vocabulary placeholder
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    Each token is a [char, tag] pair; only the (optionally lowercased)
    character is counted.  <PAD> and <UNK> receive oversized counts so
    the frequency-sorted mapping assigns them ids 0 and 1.
    """
    chars = [[tok[0].lower() if lower else tok[0] for tok in sent]
             for sent in sentences]
    dico = create_dico(chars)  # frequency count per character
    dico["<PAD>"] = 10000001   # padding symbol
    dico['<UNK>'] = 10000000   # unknown-character symbol
    # char_to_id: char -> id; id_to_char: id -> char; descending frequency.
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `chars` is None every pretrained word is added; otherwise only the
    candidates in `chars` that match a pretrained entry are added.

    :return: (dictionary, word_to_id, id_to_word)
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # BUGFIX: skip blank lines (split()[0] raised IndexError on them) and
    # close the file via a context manager.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            # Raw string avoids the invalid '\d' escape warning.
            if any(
                    x in pretrained
                    for x in [char, char.lower(), re.sub(r'\d', '0', char.lower())]
            ) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency, and
    dump both mappings to 'tag_to_id.txt' / 'id_to_tag.txt'.

    :param sentences: list of sentences of [char, ..., tag] tokens
    :return: (tag frequency dict, tag_to_id, id_to_tag)
    """
    # A token's tag is its last annotation (e.g. 'B-DRU', 'E-SYM', 'O').
    tags = [[token[-1] for token in sentence] for sentence in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    # BUGFIX: the original opened both files and never closed them;
    # context managers guarantee the handles are flushed and released.
    with open('tag_to_id.txt', 'w', encoding='utf8') as f:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open('id_to_tag.txt', 'w', encoding='utf8') as f1:
        for k, v in id_to_tag.items():
            f1.write(str(k) + ":" + str(v) + "\n")
    return dico, tag_to_id, id_to_tag
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    processed = []
    for sentence in sentences:
        processed.append([tok[0].lower() if lower else tok[0] for tok in sentence])
    dico = create_dico(processed)  # char -> frequency
    # Huge counts place the special symbols at the smallest ids.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (len(dico), sum(len(x) for x in processed)))
    return dico, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path):
    """
    Augment the dictionary with every word that has a pretrained embedding.

    :param dictionary: char -> frequency dict, updated in place
    :param ext_emb_path: path of the pretrained embedding file
    :return: (dictionary, char_to_id, id_to_char)
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # BUGFIX: the original filter `len(ext_emb_path) > 0` was always true,
    # so blank lines crashed on split()[0]; skip them and close the file.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    # Words only seen in the embedding file get frequency 0.
    for char in pretrained:
        if char not in dictionary:
            dictionary[char] = 0

    char_to_id, id_to_char = create_mapping(dictionary)
    return dictionary, char_to_id, id_to_char
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `chars` is None, we add every word that has a pretrained embedding;
    otherwise we only add the words given in `chars` (typically dev/test
    words).  `ext_emb_path` points at the vec.txt embedding file; only the
    vocabulary is read here, not the vectors themselves.
    """
    assert os.path.isfile(ext_emb_path)

    # BUGFIX: the original filter `len(ext_emb_path) > 0` was always true,
    # so blank lines crashed on split()[0]; skip them and close the file.
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        pretrained = set(
            line.rstrip().split()[0].strip()
            for line in emb_file
            if line.strip()
        )

    # New entries get frequency 0; after create_mapping the frequent
    # symbols keep the smallest ids (e.g. {'<PAD>': 0, '<UNK>': 1, ...}).
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            # Raw, lowercased, or digit-normalized form may match.
            if any(
                    x in pretrained
                    for x in [char, char.lower(), re.sub(r'\d', '0', char.lower())]
            ) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency, and
    write both mappings to 'tag_to_id.txt' and 'id_to_tag.txt'.

    :param sentences: list of sentences of [char, ..., tag] tokens
    :return: (tag frequency dict, tag_to_id, id_to_tag)
    """
    tags = [[token[-1] for token in sentence] for sentence in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    # BUGFIX: the original never closed either file handle; use context
    # managers so the output is flushed and the descriptors released.
    with open('tag_to_id.txt', 'w', encoding='utf8') as f:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open('id_to_tag.txt', 'w', encoding='utf8') as f1:
        for k, v in id_to_tag.items():
            f1.write(str(k) + ":" + str(v) + "\n")
    return dico, tag_to_id, id_to_tag
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    The input tokens are [char, tag] pairs; only the character (optionally
    lowercased) is counted.  After the frequency sort, <PAD> ends up with
    id 0 and <UNK> with id 1 thanks to their oversized pseudo-counts.
    """
    chars = [[token[0].lower() if lower else token[0] for token in sent]
             for sent in sentences]
    # char -> frequency over the whole corpus
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
parser.add_argument("--batch_size", default=128, type=int, help="Batch size") parser.add_argument("--steps_per_checkpoint", default=10, type=int, help="Save model checkpoint every this iteration") parser.add_argument("--summary_dir", default='summary/', type=str, help="mode name") args = parser.parse_args() source_data = load_data(args.source_file) target_data = load_data(args.target_file) source_idx_to_word, source_word_to_idx = create_mapping(source_data) target_idx_to_word, target_word_to_idx = create_mapping(target_data) source_data_idx, target_data_idx = sentences_to_ids(source_data, target_data, source_word_to_idx, target_word_to_idx) source_data_train, source_data_dev, target_data_train, target_data_dev\ = train_test_split(source_data_idx, target_data_idx, test_size=0.3, random_state=1, shuffle=False, stratify=None) with tf.Session() as sess: model = Seq2SeqModel(rnn_size=args.rnn_size, num_layers=args.num_layers, enc_embedding_size=args.embedding_size, learning_rate=args.learning_rate, dec_embedding_size=args.embedding_size,
def tag_mapping(sentences):
    """
    Create a tag frequency dictionary and the tag<->id mappings,
    sorted by frequency.
    """
    tags = []
    for sentence in sentences:
        tags.append([token[-1] for token in sentence])
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag