예제 #1
0
def char_mapping(sentences, lower):
    """
    Build the character frequency dictionary and the char/id mappings.

    :param sentences: iterable of sentences; each sentence is a sequence of
        items whose first element (``item[0]``) is the character string.
    :param lower: when truthy, every character is lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)`` -- the dictionary returned by
        ``create_dico`` (with ``<PAD>`` and ``<UNK>`` added) and the two
        mappings returned by ``create_mapping``.
    """
    chars = []
    for sentence in sentences:
        if lower:
            chars.append([item[0].lower() for item in sentence])
        else:
            chars.append([item[0] for item in sentence])
    dico = create_dico(chars)
    # Very large pseudo-counts for the special tokens; presumably this makes
    # create_mapping rank them first -- confirm against that helper.
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #2
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag/id mappings.

    :param sentences: iterable of sentences; the last element of each item
        (``item[-1]``) is taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)`` as produced by ``create_dico``
        and ``create_mapping``.
    """
    tags = []
    for sentence in sentences:
        tags.append([item[-1] for item in sentence])
    dico = create_dico(tags)
    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
예제 #3
0
def tag_mapping(sentences):
    """
    Collect the tag of every item and derive the tag/id mappings.

    :param sentences: iterable of sentences; ``item[-1]`` of each item is
        used as the tag.
    :return: ``(dico, tag_to_id, id_to_tag)``.
    """
    tags = []
    for sentence in sentences:
        sentence_tags = []
        for item in sentence:
            sentence_tags.append(item[-1])
        tags.append(sentence_tags)
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique named entity tags" % unique_count)
    return dico, tag_to_id, id_to_tag
예제 #4
0
def mark_mapping(sentences):
    """
    Build the mark dictionary and the mark/id mappings.

    :param sentences: iterable of sentences; the second element of each item
        (``item[1]``) is taken as its mark.
    :return: ``(dico, mark_to_id, id_to_mark)``; ``dico`` additionally holds
        an ``<UNK>`` entry with a very large pseudo-count.
    """
    marks = []
    for sentence in sentences:
        marks.append([item[1] for item in sentence])
    dico = create_dico(marks)
    unknown_count = 10000000
    dico['<UNK>'] = unknown_count
    mark_to_id, id_to_mark = create_mapping(dico)
    return dico, mark_to_id, id_to_mark
예제 #5
0
def tag_mapping(sentences):
    """
    Build the tag dictionary.

    :param sentences: iterable of sentences; the second element of each item
        (``item[1]``) is taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)`` as produced by
        ``data_utils.create_dico`` and ``data_utils.create_mapping``.
    """
    tag_list = []
    for sentence in sentences:
        tag_list.append([item[1] for item in sentence])
    dico = data_utils.create_dico(tag_list)
    mappings = data_utils.create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    return dico, tag_to_id, id_to_tag
예제 #6
0
def pos_mapping(sentences):
    """
    Build the part-of-speech dictionary and the pos/id mappings.

    :param sentences: iterable of sentences; the second element of each item
        (``item[1]``) is taken as its part-of-speech tag.
    :return: ``(dico, pos_to_id, id_to_pos)``.
    """
    pos = []
    for sentence in sentences:
        pos.append([item[1] for item in sentence])
    dico = create_dico(pos)
    # NOTE: the unknown entry is keyed by the bare string "UNK" (no angle
    # brackets) in this function.
    unknown_count = 100000000
    dico["UNK"] = unknown_count
    pos_to_id, id_to_pos = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique part of speech tags" % unique_count)
    return dico, pos_to_id, id_to_pos
예제 #7
0
def pos_mapping(sentences):
    """
    Build the part-of-speech dictionary and the pos/id mappings.

    :param sentences: iterable of sentences; the second-to-last element of
        each item (``item[-2]``) is taken as its part-of-speech tag.
    :return: ``(dico_pos, pos_to_id, id_to_pos)``; ``dico_pos`` additionally
        holds an ``<UNK>`` entry with a very large pseudo-count.
    """
    poss = []
    for sentence in sentences:
        sentence_pos = []
        for item in sentence:
            sentence_pos.append(item[-2])
        poss.append(sentence_pos)
    dico_pos = create_dico(poss)
    dico_pos['<UNK>'] = 10000000
    mappings = create_mapping(dico_pos)
    pos_to_id, id_to_pos = mappings
    print("Found %i unique named entity poss" % (len(dico_pos),))
    return dico_pos, pos_to_id, id_to_pos
예제 #8
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``; only ``<UNK>`` (no padding
        token) is added to ``dico`` here.
    """
    chars = []
    for sentence in sentences:
        row = []
        for item in sentence:
            char = item[0]
            if lower:
                char = char.lower()
            row.append(char)
        chars.append(row)
    dico = create_dico(chars)
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: decides whether every character is converted to lower case.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    def pick(item):
        # Character of one item, lower-cased on request.
        char = item[0]
        return char.lower() if lower else char

    # One list of characters per sentence.
    chars = [[pick(item) for item in sentence] for sentence in sentences]
    # Per the original notes, create_dico counts how often each character
    # occurs (character -> count).
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    char_to_id, id_to_char = create_mapping(dico)
    total = 0
    for row in chars:
        total += len(row)
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #10
0
def tag_mapping(sentences):
    """
    Build a dictionary from the tags of the data set, then derive the
    two-way mapping between tags and index ids.

    :param sentences: iterable of sentences; ``item[-1]`` of each item is
        taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)``.
    """
    tags = []
    for sentence in sentences:
        tags.append([item[-1] for item in sentence])
    # Per the original notes, the dictionary is built from tag frequencies.
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique named entity tags" % unique_count)
    return dico, tag_to_id, id_to_tag
예제 #11
0
def eojeol_mapping(sentences):
    """
    Build the eojeol dictionary (the original author notes it is probably
    unused).

    :param sentences: iterable of sentence records; ``record[-2]`` is
        iterated and each of its elements is wrapped in a one-element list
        before being handed to ``create_dico``.
    :return: ``(dico, eojeol_to_id, id_to_eojeol)``.
    """
    eojeol = []
    for record in sentences:
        wrapped = []
        for word in record[-2]:
            wrapped.append([word])
        eojeol.append(wrapped)
    dico = create_dico(eojeol)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    eojeol_to_id, id_to_eojeol = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique words" % unique_count)
    return dico, eojeol_to_id, id_to_eojeol
예제 #12
0
def word_mapping(sentences):
    """
    Build the word dictionary.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        word.
    :return: ``(dico, word_to_id, id_to_word)``; ``dico`` additionally holds
        ``<PAD>`` and ``<UNK>`` entries with very large pseudo-counts.
    """
    word_list = []
    for sentence in sentences:
        word_list.append([item[0] for item in sentence])
    dico = data_utils.create_dico(word_list)
    for token, count in (('<PAD>', 10000001), ('<UNK>', 10000000)):
        dico[token] = count
    mappings = data_utils.create_mapping(dico)
    word_to_id, id_to_word = mappings
    return dico, word_to_id, id_to_word
예제 #13
0
def tag_mapping(sentences):
    """
    Build the tag dictionary, extended with ``[SEP]`` and ``[CLS]``, and the
    tag/id mappings.

    :param sentences: iterable of sentences; ``item[-1]`` of each item is
        taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)``.
    """
    tags = []
    for sentence in sentences:
        tags.append([item[-1] for item in sentence])

    dico = create_dico(tags)
    # The values are derived from the current dictionary size. len(dico) is
    # evaluated again for [CLS], i.e. after [SEP] has been stored.
    dico['[SEP]'] = 1 + len(dico)
    dico['[CLS]'] = 2 + len(dico)

    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    return dico, tag_to_id, id_to_tag
예제 #14
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    chars = []
    for sentence in sentences:
        if lower:
            chars.append([item[0].lower() for item in sentence])
        else:
            chars.append([item[0] for item in sentence])
    # Per the original notes: dictionary of every character and its frequency.
    dico = create_dico(chars)
    special_tokens = (
        ("<PAD>", 10000001),  # padding token
        ("<UNK>", 10000000),  # out-of-vocabulary token
    )
    for token, count in special_tokens:
        dico[token] = count
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
예제 #15
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    chars = []
    for sentence in sentences:
        row = []
        for item in sentence:
            char = item[0]
            if lower:
                char = char.lower()
            row.append(char)
        chars.append(row)
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    char_to_id, id_to_char = create_mapping(dico)
    unique_count = len(dico)
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (unique_count, total))
    return dico, char_to_id, id_to_char
예제 #16
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: whether to ignore case (characters are lower-cased when
        truthy).
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    def pick(item):
        # Character of one item, lower-cased on request.
        char = item[0]
        return char.lower() if lower else char

    # Characters of every sentence.
    chars = [[pick(item) for item in sentence] for sentence in sentences]
    # Per the original notes, create_dico builds the frequency dictionary.
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings
    return dico, char_to_id, id_to_char
예제 #17
0
def affix_mapping_with_pos(sentences, type, size, frequency):
    """
    Build the affix dictionary and the affix/id mappings.

    Every eojeol string in ``sentence[-3]`` is split on ``|``; the part
    before the first ``/`` of each piece is kept and the parts are joined
    into one word. A fixed-width affix of ``size`` characters is then taken
    from that word, padding with ``^`` when the word is shorter than
    ``size``.

    NOTE(review): the original notes call the ``sentence[-3]`` path the
    morpheme-based variant, while the original docstring said the word-based
    one is in use -- confirm which is intended.

    :param sentences: list of sentence records
    :param type: ``'prefix'`` or ``'suffix'``
    :param size: n-gram size of the affix
    :param frequency: frequency threshold handed to ``only_frequent_affix``
        (e.g. "appears 50 times or more", per the original notes)
    :return: ``(aff_dico, aff_to_id, id_to_aff)``
    """
    affixes = []
    for sentence in sentences:
        sentence_affixes = []
        for eojeol in sentence[-3]:
            word = "".join(piece.split('/')[0] for piece in eojeol.split('|'))

            # > 0: word is too short, == 0: exact fit, < 0: word is too long.
            missing = size - len(word)
            if missing > 0:
                padding = "^" * missing
                if type == 'prefix':
                    aff = padding + word  # pad on the left
                elif type == 'suffix':
                    aff = word + padding  # pad on the right
                else:
                    aff = padding  # unknown type: padding only
            elif missing == 0:
                aff = word
            elif type == 'prefix':
                # Negative stop drops the surplus tail: size 3, apple => app
                aff = word[:missing]
            elif type == 'suffix':
                # Skips the surplus head: size 3, apple => ple
                aff = word[-missing:]
            else:
                aff = ""  # unknown type: empty affix
            sentence_affixes.append(aff)
        affixes.append(sentence_affixes)

    # Dictionary over all affixes, then only the frequent ones.
    whole_aff_dico = create_dico(affixes)
    aff_dico = only_frequent_affix(whole_aff_dico, frequency)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        aff_dico[token] = count

    aff_to_id, id_to_aff = create_mapping(aff_dico)
    unique_count = len(aff_dico)
    print("Found %i unique %s" % (unique_count, type))
    return aff_dico, aff_to_id, id_to_aff
예제 #18
0
def pumsa_mapping(sentences):
    """
    Build the pumsa dictionary and the pumsa/id mappings.

    :param sentences: re-iterable collection of sentence records; fields 5,
        6, 7 and 8 of every record are iterated and each element is wrapped
        in a one-element list before counting.
    :return: ``(dico, pumsa_to_id, id_to_pumsa)``.
    """
    wrapped = []
    # Field by field, so all records of field 5 come first, then field 6, ...
    for field in (5, 6, 7, 8):
        for record in sentences:
            wrapped.append([[word] for word in record[field]])
    dico = create_dico(wrapped)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    pumsa_to_id, id_to_pumsa = create_mapping(dico)
    unique_count = len(pumsa_to_id)
    print("Found %i unique pumsa" % unique_count)
    return dico, pumsa_to_id, id_to_pumsa
예제 #19
0
def word_mapping(sentences):
    """
    Build the word dictionary and the word/id mappings.

    :param sentences: re-iterable collection of sentence records; fields 1,
        2, 3 and 4 of every record are iterated and each element is wrapped
        in a one-element list before counting.
    :return: ``(dico, word_to_id, id_to_word)``.
    """
    wrapped = []
    # Field by field, so all records of field 1 come first, then field 2, ...
    for field in (1, 2, 3, 4):
        for record in sentences:
            wrapped.append([[word] for word in record[field]])
    dico = create_dico(wrapped)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    word_to_id, id_to_word = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique words" % unique_count)
    return dico, word_to_id, id_to_word
예제 #20
0
파일: loader.py 프로젝트: Uzw103/NER-model
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    chars = []
    for sentence in sentences:
        if lower:
            chars.append([item[0].lower() for item in sentence])
        else:
            chars.append([item[0] for item in sentence])
    # Per the original notes, create_dico (from data_utils.py) yields a
    # dictionary holding the occurrence count of every character.
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    # Per the original notes, create_mapping yields both mapping directions,
    # ordered by descending count.
    char_to_id, id_to_char = create_mapping(dico)
    # Report the number of distinct characters and of characters overall.
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #21
0
def char_mapping(sentences, lower):
    """
    Build the syllable (character) dictionary and the char/id mappings.

    :param sentences: iterable of sentence records; ``record[1]`` is iterated
        as the words of the sentence and every word is split into its
        characters.
    :param lower: when truthy, each word is lower-cased before splitting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    chars = []
    for record in sentences:
        per_word = []
        for word in record[1]:
            per_word.append(list(word.lower()) if lower else list(word))
        chars.append(per_word)
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    char_to_id, id_to_char = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique chars" % unique_count)
    return dico, char_to_id, id_to_char
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased (``str.lower``
        converts all upper-case characters to lower case). The original
        notes state the usual value is True.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    # chars holds every character of the corpus, one list per sentence.
    chars = []
    for sentence in sentences:
        row = []
        for item in sentence:
            char = item[0]
            if lower:
                char = char.lower()
            row.append(char)
        chars.append(row)
    # Per the original notes: occurrence count of every corpus character.
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    # Per the original notes: the more frequent a character, the smaller
    # its id.
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #23
0
파일: loader.py 프로젝트: aiedward/Cner_v1
def char_mapping(sentences, lower):
    """
    Build the dictionary and the mapping of every token.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    def pick(item):
        # Character of one item, lower-cased on request.
        char = item[0]
        return char.lower() if lower else char

    chars = [[pick(item) for item in sentence] for sentence in sentences]
    dico = create_dico(chars)
    # Special tokens.
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    # Forward and reverse dictionaries.
    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings
    return dico, char_to_id, id_to_char
예제 #24
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag/id mappings.

    :param sentences: iterable of sentences; ``item[-1]`` of each item is
        taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)``.
    """
    # Sample recorded by the original author, one list per sentence:
    #   [['O', 'O', 'B-SLOC', 'I-SLOC', ..., 'E-SLOC', 'O', 'B-TYPE', 'E-TYPE']]
    tags = []
    for sentence in sentences:
        tags.append([item[-1] for item in sentence])
    # Sample dico (tag -> count): {'I-SLOC': 19315, 'O': 15082, 'I-ELOC': 5125, ...}
    dico = create_dico(tags)
    # Sample tag_to_id: {'I-SLOC': 0, 'O': 1, 'I-ELOC': 2, 'I-STIME': 3, ...}
    # Sample id_to_tag: {0: 'I-SLOC', 1: 'O', 2: 'I-ELOC', 3: 'I-STIME', ...}
    tag_to_id, id_to_tag = create_mapping(dico)
    unique_count = len(dico)
    print("Found %i unique named entity tags" % unique_count)
    return dico, tag_to_id, id_to_tag
예제 #25
0
def char_mapping(sentences, lower):
    """
    Build a dictionary from the data set, then derive the two-way mapping
    between characters and index ids.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    chars = []
    for sentence in sentences:
        if lower:
            chars.append([item[0].lower() for item in sentence])
        else:
            chars.append([item[0] for item in sentence])
    # Per the original notes: key/value pairs are word -> frequency.
    dico = create_dico(chars)
    # Per the original notes: the padding count is maximised so that <PAD>
    # ends up with index 0, because sequences are padded with 0.
    dico["<PAD>"] = 10000001
    # Pseudo-count of the unknown token.
    dico['<UNK>'] = 10000000
    # Derive both mapping directions from the dictionary.
    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #26
0
파일: loader.py 프로젝트: yyht/Chinese-NER
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is a
        word that is split into its characters.
    :param lower: when truthy, every character is lower-cased.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    # First gather the words of every sentence ...
    words = []
    for sentence in sentences:
        words.append([item[0] for item in sentence])
    # ... then flatten each sentence's words into one list of characters.
    chars = [
        [c.lower() if lower else c for word in sentence_words for c in word]
        for sentence_words in words
    ]

    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(map(len, chars))
    print("Found %i unique chars (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #27
0
def char_mapping(sentences, lower):
    """
    Build a dictionary from the characters.

    :param sentences: iterable of sentences; here every sentence is iterated
        directly as its characters, e.g. ``[[w1, w2, ...], [w1, w2, ...], []]``.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: dictionary, char-to-id mapping, id-to-char mapping
    """
    chars = []
    for sentence in sentences:
        if lower:
            chars.append([char.lower() for char in sentence])
        else:
            chars.append(list(sentence))
    dico = create_dico(chars)

    # Per the original notes: <PAD> is not used yet, and the large number is
    # meant to guarantee it the id 0; <UNK> stands for unseen characters.
    for token, count in ((u"<PAD>", 10000001), (u"<UNK>", 10000000)):
        dico[token] = count

    char_to_id, id_to_char = create_mapping(dico)
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))

    return dico, char_to_id, id_to_char
예제 #28
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    # NOTE: the original comments in this function were mis-encoded and
    # unreadable; the notes below only describe what the code itself shows.
    chars = []
    for sentence in sentences:
        row = []
        for item in sentence:
            char = item[0]
            if lower:
                char = char.lower()
            row.append(char)
        chars.append(row)
    dico = create_dico(chars)
    # Special tokens with very large pseudo-counts; the sample output kept in
    # the original comments lists <PAD> with id 0 and <UNK> with id 1.
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings
    return dico, char_to_id, id_to_char
예제 #29
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag/id mappings, and dump both mappings
    to text files in the current working directory.

    Side effects: (re)writes ``tag_to_id.txt`` and ``id_to_tag.txt`` (UTF-8),
    one ``key:value`` pair per line. The files are opened only after the
    mappings have been computed and are closed deterministically, so a
    failure while building the mappings no longer leaves truncated, open
    files behind.

    :param sentences: iterable of sentences; ``item[-1]`` of each item is
        taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)``.
    """
    # Sample recorded by the original author, one list per sentence:
    #   [['O', 'O', 'B-DRU', 'E-DRU', 'O', ..., 'B-SYM', 'E-SYM', 'O', ...]]
    tags = [[item[-1] for item in sentence] for sentence in sentences]
    # Sample dico: {'O': 44, 'B-DRU': 2, 'E-DRU': 2, 'B-SYM': 3, 'E-SYM': 3,
    #               'I-SYM': 2, 'B-REG': 1, 'I-REG': 1, 'E-REG': 1}
    dico = create_dico(tags)
    # Sample tag_to_id: {'O': 0, 'B-SYM': 1, 'E-SYM': 2, 'B-DRU': 3, ...}
    # Sample id_to_tag: {0: 'O', 1: 'B-SYM', 2: 'E-SYM', 3: 'B-DRU', ...}
    tag_to_id, id_to_tag = create_mapping(dico)

    # Persist both mappings; `with` guarantees flush and close.
    with open('tag_to_id.txt', 'w', encoding='utf8') as tag_file:
        for tag, idx in tag_to_id.items():
            tag_file.write(tag + ":" + str(idx) + "\n")
    with open('id_to_tag.txt', 'w', encoding='utf8') as id_file:
        for idx, tag in id_to_tag.items():
            id_file.write(str(idx) + ":" + str(tag) + "\n")
    return dico, tag_to_id, id_to_tag
예제 #30
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting
        (the original notes record a run with ``lower`` set to False).
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    def pick(item):
        # Character of one item, lower-cased on request.
        char = item[0]
        return char.lower() if lower else char

    # Sample recorded by the original author: chars[0] is the list of the
    # single characters of the first sentence.
    chars = [[pick(item) for item in sentence] for sentence in sentences]
    # Sample dico (character -> count): {'仓': 16, '背': 5, '视': 348, ...}
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    # Sample id_to_char: {0: '<PAD>', 1: '<UNK>', 2: '1', 3: ':', ...}
    # Sample char_to_id: {'俊': 402, '是': 428, '仪': 642, ...}
    char_to_id, id_to_char = create_mapping(dico)
    total = sum(map(len, chars))
    print("Found %i unique words (%i in total)" % (len(dico), total))
    return dico, char_to_id, id_to_char
예제 #31
0
def char_mapping(sentences, lower):
    """
    Build the character dictionary and the char/id mappings.

    :param sentences: iterable of sentences; ``item[0]`` of each item is the
        character string.
    :param lower: when truthy, characters are lower-cased before counting.
    :return: ``(dico, char_to_id, id_to_char)``.
    """
    # chars takes the first field of the innermost unit of `sentences`; per
    # the original notes this is the input text after digits were replaced,
    # e.g. ['入', '院', '情', '况', ':', '女', ',', '0', '0', '岁', ...].
    chars = []
    for sentence in sentences:
        if lower:
            chars.append([item[0].lower() for item in sentence])
        else:
            chars.append([item[0] for item in sentence])
    # Characters and their frequencies.
    dico = create_dico(chars)
    for token, count in (("<PAD>", 10000001), ("<UNK>", 10000000)):
        dico[token] = count
    # Sample dico at this point, as recorded by the original author:
    #   {'入': 4, '院': 4, '情': 2, ..., '<PAD>': 10000001, '<UNK>': 10000000}
    # Sample char_to_id: {'<PAD>': 0, '<UNK>': 1, '0': 2, '入': 3, '院': 4, ...}
    # Sample id_to_char: {0: '<PAD>', 1: '<UNK>', 2: '0', 3: '入', 4: '院', ...}
    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings

    return dico, char_to_id, id_to_char
예제 #32
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag/id mappings, and dump both mappings
    to text files in the current working directory.

    Side effects: (re)writes ``tag_to_id.txt`` and ``id_to_tag.txt`` (UTF-8),
    one ``key:value`` pair per line. The files are opened only after the
    mappings have been computed and are closed deterministically, so a
    failure while building the mappings no longer leaves truncated, open
    files behind.

    :param sentences: iterable of sentences; ``item[-1]`` of each item is
        taken as its tag.
    :return: ``(dico, tag_to_id, id_to_tag)``.
    """
    tags = [[item[-1] for item in sentence] for sentence in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)

    # Persist both mappings; `with` guarantees flush and close.
    with open('tag_to_id.txt', 'w', encoding='utf8') as tag_file:
        for tag, idx in tag_to_id.items():
            tag_file.write(tag + ":" + str(idx) + "\n")
    with open('id_to_tag.txt', 'w', encoding='utf8') as id_file:
        for idx, tag in id_to_tag.items():
            id_file.write(str(idx) + ":" + str(tag) + "\n")
    return dico, tag_to_id, id_to_tag