Example #1
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise, we only add the words that are given by
    `chars` (typically the words in the development and test sets).
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip blank lines
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if any(x in pretrained for x in [
                char,
                char.lower(),
                re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
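
All of these snippets lean on a project-local create_mapping helper. A minimal sketch consistent with how the examples use it (the real helper lives in each repo's data_utils/loader module, so treat this as an assumption): sort items by count, descending, break ties alphabetically, and hand out ids in that order.

def create_mapping(dico):
    """Create item<->id mappings from a {item: count} dict, giving the
    smallest ids to the most frequent items."""
    sorted_items = sorted(dico.items(), key=lambda x: (-x[1], x[0]))
    id_to_item = {i: v[0] for i, v in enumerate(sorted_items)}
    item_to_id = {v: k for k, v in id_to_item.items()}
    return item_to_id, id_to_item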
Example #2
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[char[-1] for char in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
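
create_dico is the other shared helper. A minimal sketch, assuming it simply counts how often each item occurs across a list of lists:

def create_dico(item_list):
    """Count the occurrences of each item in a list of lists of items."""
    dico = {}
    for items in item_list:
        for item in items:
            dico[item] = dico.get(item, 0) + 1
    return dico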
Example #3
def word_mapping(sentences):
    """
    Build the mapping dictionaries word_to_id and id_to_word.
    :param sentences:
    :return:
    """
    word_list = [[x[0] for x in s] for s in sentences]
    dico = data_utils.create_dico(word_list)
    dico['<PAD>'] = 10000001
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = data_utils.create_mapping(dico)
    return dico, word_to_id, id_to_word
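
A toy run of the <PAD>/<UNK> trick above: the oversized counts force those two tokens to ids 0 and 1 after the frequency sort. This assumes the create_dico/create_mapping sketches earlier in the listing are available under the data_utils names the function expects.

sentences = [[('今', 'O'), ('天', 'O')], [('天', 'O')]]
dico, word_to_id, id_to_word = word_mapping(sentences)
print(word_to_id)  # {'<PAD>': 0, '<UNK>': 1, '天': 2, '今': 3}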
Example #4
def word_mapping(sentences):
    """
    Build the dictionary.
    :param sentences: corpus data
    :return: the word frequency dict, the word->id mapping, and the id->word mapping
    """
    word_list = [[x[0] for x in s] for s in sentences]
    word_count = data_utils.create_item_count(word_list)
    # A small trick: give the padding and unknown tokens huge counts so that
    # they sort to the front of the dictionary.
    word_count["<PAD>"] = 10000001
    word_count["<UNK>"] = 10000000
    word_to_id, id_to_word = data_utils.create_mapping(word_count)
    return word_count, word_to_id, id_to_word
Example #5
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in chars)
    ))
    return dico, char_to_id, id_to_char
Example #6
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000

    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
Example #7
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[char[-1] for char in s] for s in sentences]

    dico = create_dico(tags)
    dico['[SEP]'] = len(dico) + 1
    dico['[CLS]'] = len(dico) + 2

    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Example #8
def affix_mapping_with_pos(sentences, type, size, frequency):
    """
    Build the affix dictionary.
    It is undecided whether to build it from morphemes or from the original
    words; currently it is built from the original words.
    :param sentences: sentence information as a list
    :param type: 'prefix' or 'suffix'
    :param size: n-gram size of the affix
    :param frequency: frequency threshold, e.g. appears at least 50 times
    :return:
    """
    affixes = []
    for sentence in sentences:
        affix = []
        # To build affixes from the raw words instead:
        # for word in sentence[-2]:

        # Build affixes from the morphemes:
        for eojeol in sentence[-3]:
            word = ""
            for w in eojeol.split('|'):
                word += w.split('/')[0]

            aff_tmp = ""
            if len(word) < size:
                for i in range(size - len(word)):
                    aff_tmp += "^"
                if type == 'prefix':
                    aff_tmp = aff_tmp + word  # size 3: '안' -> '^^안'
                elif type == 'suffix':
                    aff_tmp = word + aff_tmp  # size 3: '다' -> '다^^'
            elif len(word) == size:
                aff_tmp = word
            else:
                idx = size - len(word)
                if type == 'prefix':
                    aff_tmp = word[:idx]  # size 3: apple -> app
                elif type == 'suffix':
                    aff_tmp = word[-idx:]  # size 3: apple -> ple
            affix.append(aff_tmp)
        affixes.append(affix)

    whole_aff_dico = create_dico(affixes)  # dictionary of all affixes
    aff_dico = only_frequent_affix(whole_aff_dico,
                                   frequency)  # keep only the frequent ones
    aff_dico["<PAD>"] = 10000001
    aff_dico["<UNK>"] = 10000000

    aff_to_id, id_to_aff = create_mapping(aff_dico)
    print("Found %i unique %s" % (len(aff_dico), type))
    return aff_dico, aff_to_id, id_to_aff
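
only_frequent_affix is not shown anywhere in this listing; a plausible minimal sketch, assuming it does nothing more than filter the affix counts by the threshold:

def only_frequent_affix(whole_aff_dico, frequency):
    """Keep only the affixes whose count reaches the threshold."""
    return {aff: cnt for aff, cnt in whole_aff_dico.items()
            if cnt >= frequency}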
Example #9
def word_mapping(sentences):
    """
    Build the word dictionary.
    """
    words1 = [[[word] for word in s[1]] for s in sentences]
    words2 = [[[word] for word in s[2]] for s in sentences]
    words3 = [[[word] for word in s[3]] for s in sentences]
    words4 = [[[word] for word in s[4]] for s in sentences]
    dico = create_dico(words1 + words2 + words3 + words4)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    word_to_id, id_to_word = create_mapping(dico)
    print("Found %i unique words" % (len(dico)))
    return dico, word_to_id, id_to_word
Example #10
def pumsa_mapping(sentences):
    """
    Build the pumsa (POS) dictionary.
    """
    pumsas1 = [[[word] for word in s[5]] for s in sentences]
    pumsas2 = [[[word] for word in s[6]] for s in sentences]
    pumsas3 = [[[word] for word in s[7]] for s in sentences]
    pumsas4 = [[[word] for word in s[8]] for s in sentences]
    dico = create_dico(pumsas1 + pumsas2 + pumsas3 + pumsas4)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    pumsa_to_id, id_to_pumsa = create_mapping(dico)
    print("Found %i unique pumsa" % (len(pumsa_to_id)))
    return dico, pumsa_to_id, id_to_pumsa
Example #11
def char_mapping(sentences, lower):  # lower: whether to lowercase the characters
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s]
             for s in sentences]  # collect the characters of each sentence
    dico = create_dico(chars)  # create_dico builds a character-frequency dict
    dico["<PAD>"] = 10000001  # padding token; the huge count pins its id to 0
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    #print("Found %i unique words (%i in total)" % (
    #    len(dico), sum(len(x) for x in chars)
    #))
    return dico, char_to_id, id_to_char
Example #12
def char_mapping(sentences, lower):
    """
    Build the syllable (character) dictionary.
    """
    if lower:
        chars = [[[char for char in word.lower()] for word in s[1]]
                 for s in sentences]
    else:
        chars = [[[char for char in word] for word in s[1]] for s in sentences]
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique chars" % (len(dico)))
    return dico, char_to_id, id_to_char
Example #13
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s]
             for s in sentences]  # lowercase each character if requested
    # create_dico (in data_utils.py) builds a dict mapping each character
    # to its number of occurrences
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # build both the char->id and id->char mappings, ordered by count, descending
    char_to_id, id_to_char = create_mapping(dico)
    # report how many distinct characters there are, and how many in total
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
Example #14
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    `lower` defaults to True here.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    # str.lower() converts every uppercase character to lowercase;
    # chars stores each character in the corpus.
    dico = create_dico(chars)  # dico counts each character's occurrences in the corpus
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    # The two dicts above translate between characters and ids after the
    # frequency count; the higher the frequency, the smaller the id.
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
Example #15
def char_mapping(sentences, lower):
    """
    Create a dictionary and a word mapping, sorted by frequency.
    :param sentences:
    :param lower:
    :return:
    """
    # lowercase each word if requested
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    # define the special tokens
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # build the forward and reverse dictionaries
    char_to_id, id_to_char = create_mapping(dico)
    return dico, char_to_id, id_to_char
Example #16
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[char[-1] for char in s] for s in sentences]
    # print("tags:{}".format(tags))
    # "[['O', 'O', 'B-SLOC', 'I-SLOC', 'I-SLOC', 'I-SLOC', 'I-SLOC', 'I-SLOC', 'I-SLOC', 'I-SLOC', 'E-SLOC', 'O', 'B-TYPE', 'E-TYPE']]"
    dico = create_dico(tags)
    # print("dico:{}".format(dico))
    # "dico:{'E-YEAR': 1460, 'I-SPEED': 1639, 'B-SLOC': 3025, 'S-ELOC': 1, 'E-DAY': 1936, 'O': 15082, 'B-YEAR': 1460, 'B-DAY': 1936, 'I-STIME': 4285, 'I-DAY': 1367, 'I-MONTH': 478, 'E-PART': 1936, 'I-ROAD': 901, 'E-STIME': 1934, 'B-STIME': 1934, 'B-ROAD': 556, 'B-TYPE': 3027, 'I-SLOC': 19315, 'I-ETIME': 3113, 'E-ROAD': 556, 'I-YEAR': 1434, 'B-ETIME': 1414, 'E-MONTH': 1935, 'E-TYPE': 3027, 'I-ELOC': 5125, 'E-SLOC': 3025, 'B-PART': 1936, 'S-SLOC': 1, 'B-MONTH': 1935, 'E-SPEED': 1278, 'B-SPEED': 1278, 'B-ELOC': 784, 'E-ELOC': 784, 'E-ETIME': 1414}"
    tag_to_id, id_to_tag = create_mapping(dico)
    # print("tag_to_id:{}".format(tag_to_id))
    # "tag_to_id:{'I-SPEED': 17, 'E-ROAD': 30, 'B-SLOC': 7, 'E-SLOC': 8, 'B-ETIME': 21, 'I-DAY': 23, 'E-SPEED': 25, 'B-ELOC': 27, 'B-YEAR': 18, 'I-ETIME': 4, 'I-SLOC': 0, 'E-ETIME': 22, 'B-TYPE': 5, 'I-ELOC': 2, 'E-DAY': 11, 'E-MONTH': 14, 'B-PART': 10, 'B-SPEED': 24, 'E-TYPE': 6, 'E-ELOC': 28, 'O': 1, 'I-ROAD': 26, 'B-ROAD': 29, 'S-SLOC': 33, 'B-STIME': 15, 'E-PART': 12, 'E-YEAR': 19, 'S-ELOC': 32, 'B-MONTH': 13, 'B-DAY': 9, 'I-YEAR': 20, 'I-MONTH': 31, 'E-STIME': 16, 'I-STIME': 3}"
    # print("id_to_tag:{}".format(id_to_tag))
    # "id_to_tag:{0: 'I-SLOC', 1: 'O', 2: 'I-ELOC', 3: 'I-STIME', 4: 'I-ETIME', 5: 'B-TYPE', 6: 'E-TYPE', 7: 'B-SLOC', 8: 'E-SLOC', 9: 'B-DAY', 10: 'B-PART', 11: 'E-DAY', 12: 'E-PART', 13: 'B-MONTH', 14: 'E-MONTH', 15: 'B-STIME', 16: 'E-STIME', 17: 'I-SPEED', 18: 'B-YEAR', 19: 'E-YEAR', 20: 'I-YEAR', 21: 'B-ETIME', 22: 'E-ETIME', 23: 'I-DAY', 24: 'B-SPEED', 25: 'E-SPEED', 26: 'I-ROAD', 27: 'B-ELOC', 28: 'E-ELOC', 29: 'B-ROAD', 30: 'E-ROAD', 31: 'I-MONTH', 32: 'S-ELOC', 33: 'S-SLOC'} "
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
Example #17
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise, we only add the words that are given by
    `chars` (typically the words in the development and test sets).

    Args:
      dictionary: the dictionary to augment
      ext_emb_path: path to the pretrained embedding file
      chars: the character set to consider
    Return
      dictionary, word_to_id, id_to_word
    """
    #print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    # (i.e. collect every token in the pretrained embedding file)
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip blank lines
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        # For each character in chars: if it exists in pretrained but not in
        # dictionary, add it to dictionary with a frequency of 0.
        # Note that the pretrained vectors themselves are not used here.
        for char in chars:
            if any(x in pretrained for x in [
                    char,
                    char.lower(),
                    re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
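
A toy usage sketch for augment_with_pretrained (hypothetical path and file contents, and assuming os, codecs, and re are imported as the snippet requires): only the dev/test characters that actually have a pretrained vector are added, and they enter the dictionary with a count of 0.

import codecs
import os
import tempfile

# write a tiny fake embedding file: one token per line, followed by its vector
emb_path = os.path.join(tempfile.gettempdir(), 'toy_vec.txt')
with codecs.open(emb_path, 'w', 'utf-8') as f:
    f.write('北 0.1 0.2\n京 0.3 0.4\n')

dictionary = {'我': 5}
dico, word_to_id, id_to_word = augment_with_pretrained(
    dictionary, emb_path, chars=['北', '南'])
print(dico)  # {'我': 5, '北': 0} -- '南' has no pretrained vector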
Example #18
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise, we only add the words that are given by
    `chars` (typically the words in the development and test sets).
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)
    # Load pretrained embeddings from file
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip blank lines
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if any(x in pretrained for x in [
                    char,
                    char.lower(),
                    re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
Example #19
def char_mapping(sentences, lower):
    """
    Build a dictionary from the dataset's character frequencies, then derive
    the two-way mappings between characters and integer ids.
    :param sentences:
    :param lower:
    :return:
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    # build the frequency dictionary
    dico = create_dico(chars)  # keys are characters, values are frequencies
    # The huge count for the padding token <PAD> guarantees that its index in
    # the final mapping is 0, because sequences are padded with 0s.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000  # the unknown-character token
    # derive the two mappings from the dictionary
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    # print("char_to_id:", char_to_id)
    return dico, char_to_id, id_to_char
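
The comment about <PAD> above is the key detail: padded positions in a batch are filled with 0, so id 0 has to mean "padding". A minimal illustration with toy id sequences:

ids = [[5, 3, 9], [7, 2]]
max_len = max(len(s) for s in ids)
padded = [s + [0] * (max_len - len(s)) for s in ids]
print(padded)  # [[5, 3, 9], [7, 2, 0]] -- the trailing 0 is <PAD>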
Example #20
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of characters, sorted by frequency.
    """
    words = [[x[0] for x in s] for s in sentences]
    chars = []
    for s in words:
        char = []
        for word in s:
            for c in word:
                char.append(c.lower() if lower else c)
        chars.append(char)

    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique chars (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
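
A toy run of this character-level variant, again assuming the create_dico/create_mapping sketches near the top of the listing:

sentences = [[['Apple', 'B-ORG'], ['Inc', 'I-ORG']]]
dico, char_to_id, id_to_char = char_mapping(sentences, lower=True)
# chars flattens to [['a', 'p', 'p', 'l', 'e', 'i', 'n', 'c']]; 'p' (count 2)
# takes the first id after <PAD> and <UNK>
print(char_to_id['p'])  # 2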
Example #21
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise, we only add the words that are given by
    `chars` (typically the words in the development and test sets).
    """
    #print('Loading pretrained embeddings from %s...' % ext_emb_path)
    # check that the embedding file exists,
    # e.g. 'e:\\objectTest\\NERuselocal\\data\\vec.txt'
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file; the set keeps one entry per token
    pretrained = set([
        line.rstrip().split()[0].strip()  # first field of each line, stripped
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip blank lines
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if chars is None:
        # no training characters supplied: add every pretrained token that is
        # not already in the training dictionary
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        # otherwise add a character only if some variant of it (raw form,
        # lowercased, or with digits replaced by 0) has a pretrained vector
        for char in chars:
            if any(x in pretrained for x in [
                    char,
                    char.lower(),
                    re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)  # rebuild the mappings
    return dictionary, word_to_id, id_to_word
Example #22
def char_mapping(sentences, lower):
    """
    Build a dictionary from the characters.
    :param sentences:
    :param lower:
    :return: the dictionary, the char->id mapping, and the id->char mapping
    """

    # only the current tokens: [[w1, w2, ...], [w1, w2, ...], ...]
    chars = [[char.lower() if lower else char for char in sentence]
             for sentence in sentences]
    dico = create_dico(chars)

    dico[u"<PAD>"] = 10000001  # 目前未用到 定义一个大的数保证其对应id为0
    dico[u"<UNK>"] = 10000000  # 未登录词

    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))

    return dico, char_to_id, id_to_char
Example #23
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    # Build a frequency-sorted dictionary; each s is a sentence and each x in
    # it looks like ['字', 'O'], so x[0] is the character (lowercased if `lower`).
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)  # count character frequencies
    # Padding: every sentence is padded to a common length, and the huge count
    # pins <PAD> to id 0.
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000  # unknown characters also get an id
    char_to_id, id_to_char = create_mapping(dico)  # two-way char/id mappings
    #print("Found %i unique words (%i in total)" % (
    #    len(dico), sum(len(x) for x in chars)
    #))
    # char_to_id: {'<PAD>': 0, '<UNK>': 1, '0': 2, ...}  (by frequency, descending)
    # id_to_char: {0: '<PAD>', 1: '<UNK>', 2: '0', ...}
    return dico, char_to_id, id_to_char
Example #24
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # guard against blank lines
    ])
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if any(x in pretrained for x in [
                    char,
                    char.lower(),
                    re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
Example #25
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    # Maps the tags to ids.
    # Input: the list of sentences, where each token carries its tag last.
    # Output: the tag frequency dict and the two tag/id mappings, which are
    # also written to tag_to_id.txt and id_to_tag.txt.

    f = open('tag_to_id.txt', 'w', encoding='utf8')
    f1 = open('id_to_tag.txt', 'w', encoding='utf8')
    tags = []
    for s in sentences:
        ts = []
        for char in s:
            tag = char[-1]
            ts.append(tag)
        tags.append(ts)
    # tags [['O', 'O', 'B-DRU', 'E-DRU', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SYM',
    #       'E-SYM', 'B-SYM', 'I-SYM', 'I-SYM', 'E-SYM', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
    #       'O', 'O', 'O', 'O', 'O', 'B-DRU', 'E-DRU', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
    #       'B-REG', 'I-REG', 'E-REG', 'B-SYM', 'E-SYM', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
    #       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]
    # tags1 = [[char[-1] for char in s] for s in sentences]
    dico = create_dico(tags)
    # dico {'O': 44, 'B-DRU': 2, 'E-DRU': 2, 'B-SYM': 3, 'E-SYM': 3, 'I-SYM': 2, 'B-REG': 1,
    #       'I-REG': 1, 'E-REG': 1}
    tag_to_id, id_to_tag = create_mapping(dico)
    # tag_to_id:
    # {'O': 0, 'B-SYM': 1, 'E-SYM': 2, 'B-DRU': 3, 'E-DRU': 4, 'I-SYM': 5, 'B-REG': 6, 'E-REG': 7,
    #  'I-REG': 8}
    # id_to_tag
    # {0: 'O', 1: 'B-SYM', 2: 'E-SYM', 3: 'B-DRU', 4: 'E-DRU', 5: 'I-SYM', 6: 'B-REG', 7: 'E-REG',
    # 8: 'I-REG'}
    # print("Found %i unique named entity tags" % len(dico))
    # write the mappings to disk
    for k, v in tag_to_id.items():
        f.write(k + ":" + str(v) + "\n")
    for k, v in id_to_tag.items():
        f1.write(str(k) + ":" + str(v) + "\n")
    f.close()
    f1.close()
    return dico, tag_to_id, id_to_tag
Example #26
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    # print("lower:{}".format(lower))
    # lower: False
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    # print("in char_mapping chars:{}".format(chars))
    # print("chars[0]:{}".format(chars[0]))
    # "chars[0]:['我', '要', '看', '乌', '鲁', '木', '齐', '市', '第', '四', '十', '九', '中', '学', '东', '门', '去', '乌', '鲁', '木', '齐', '推', '拿', '职', '业', '学', '校', '南', '门', '沿', '西', '虹', '东', '路', '的', '监', '控']"
    dico = create_dico(chars)
    # print("dico:{}".format(dico))
    # dico: {'仓': 16, '背': 5, '视': 348, '煨': 1, '代': 25, '欢': 2, '配': 2, '核': 5, '还': 3, '结': 4, '工': 124 }
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    # id_to_char: {0: '<PAD>', 1: '<UNK>', 2: '1', 3: ':', 4: '2', 5: '门', 6: '的', 7: '0', 8: '月'
    # char_to_id: {'俊': 402, '是': 428, '仪': 642, '哥': 728, '童': 366, '3': 12, '界': 450, '税': 876}
    # print("char_to_id:{}".format(char_to_id))
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
Example #27
def augment_with_pretrained(dictionary, ext_emb_path):
    """
    Augment the dictionary with words that have a pretrained embedding.
    Every word that has a pretrained embedding is added to the dictionary.
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip blank lines
    ])

    for char in pretrained:
        if char not in dictionary:
            dictionary[char] = 0

    char_to_id, id_to_char = create_mapping(dictionary)
    return dictionary, char_to_id, id_to_char
Example #28
def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise, we only add the words that are given by
    `chars` (typically the words in the development and test sets).
    """
    "ext_emb_path是vec.txt文件用于将字符转化为预训练的词向量"
    # print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip blank lines
    ])
    # pretrained holds the tokens that have a pretrained vector
    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    # chars is the character list passed in.
    # If chars is None, every pretrained token missing from the dictionary is
    # added with a count of 0; the resulting mapping looks like
    # {'<PAD>': 0, '<UNK>': 1, '0': 2, ',': 3, ':': 4, '。': 5, ...}
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if any(x in pretrained for x in [
                    char,
                    char.lower(),
                    re.sub(r'\d', '0', char.lower())
            ]) and char not in dictionary:
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
Example #29
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """

    f = open('tag_to_id.txt', 'w', encoding='utf8')
    f1 = open('id_to_tag.txt', 'w', encoding='utf8')
    tags = []
    for s in sentences:
        ts = []
        for char in s:
            tag = char[-1]
            ts.append(tag)
        tags.append(ts)

    #tags1 = [[char[-1] for char in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    #print("Found %i unique named entity tags" % len(dico))
    for k, v in tag_to_id.items():
        f.write(k + ":" + str(v) + "\n")
    for k, v in id_to_tag.items():
        f1.write(str(k) + ":" + str(v) + "\n")
    f.close()
    f1.close()
    return dico, tag_to_id, id_to_tag
Example #30
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """

    # chars takes the first element of each innermost unit of sentences; it is
    # essentially the original input text after digits have been replaced, e.g.
    # ['入', '院', '情', '况', ':', '女', ',', '0', '0', '岁', ',', '以', '突', '发', '言', '语', '不',
    # '清', '0', '天', ',', '加', '重', '0', '天', '入', '院', '。', '入', '院', '情', '况', ':', '患',
    # '者', '以', '腰', '痛', '伴', '双', '下', '肢', '疼', '痛', '半', '年', ',', '加', '重', '0', '0',
    #  '余', '天', '为', '主', '诉', '入', '院', '。']
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    # count each character's frequency here
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    # The distinct characters, with counts like {'无': 1754, '长': 68, '期': 179, ...},
    # are passed to create_mapping to build the two dictionaries:
    # char_to_id
    # {'<PAD>': 0, '<UNK>': 1, '0': 2, '入': 3, '院': 4, ',': 5, '天': 6, '。': 7, '以': 8, '况': 9, '加': 10,
    #   '情': 11, '痛': 12, '重': 13, ':': 14, '下': 15, '不': 16, '为': 17, '主': 18, '伴': 19, '余': 20, '半': 21,
    #   '双': 22, '发': 23, '女': 24, '岁': 25, '年': 26, '患': 27, '清': 28, '疼': 29, '突': 30, '者': 31, '肢': 32,
    #   '腰': 33, '言': 34, '诉': 35, '语': 36}
    #  id_to_char
    # {0: '<PAD>', 1: '<UNK>', 2: '0', 3: '入', 4: '院', 5: ',', 6: '天', 7: '。', 8: '以', 9: '况', 10: '加',
    #  11: '情', 12: '痛', 13: '重', 14: ':', 15: '下', 16: '不', 17: '为', 18: '主', 19: '伴', 20: '余', 21: '半',
    #  22: '双', 23: '发', 24: '女', 25: '岁', 26: '年', 27: '患', 28: '清', 29: '疼', 30: '突', 31: '者', 32: '肢',
    #  33: '腰', 34: '言', 35: '诉', 36: '语'}
    char_to_id, id_to_char = create_mapping(dico)
    # print("Found %i unique words (%i in total)" % (
    #    len(dico), sum(len(x) for x in chars)
    # ))
    #    at this point dico is {'入': 4, '院': 4, '情': 2, '况': 2, ':': 2, '女': 1, ',': 4, '0': 6, '岁': 1, '以': 2, '突': 1,
    #    '发': 1, '言': 1, '语': 1, '不': 1, '清': 1, '天': 3, '加': 2, '重': 2, '。': 2, '患': 1, '者': 1, '腰': 1,
    #    '痛': 2, '伴': 1, '双': 1, '下': 1, '肢': 1, '疼': 1, '半': 1, '年': 1, '余': 1, '为': 1, '主': 1, '诉': 1,
    #    '<PAD>': 10000001, '<UNK>': 10000000}

    return dico, char_to_id, id_to_char
Example #31
parser.add_argument("--batch_size", default=128, type=int, help="Batch size")
parser.add_argument("--steps_per_checkpoint",
                    default=10,
                    type=int,
                    help="Save model checkpoint every this iteration")
parser.add_argument("--summary_dir",
                    default='summary/',
                    type=str,
                    help="mode name")

args = parser.parse_args()

source_data = load_data(args.source_file)
target_data = load_data(args.target_file)

source_idx_to_word, source_word_to_idx = create_mapping(source_data)
target_idx_to_word, target_word_to_idx = create_mapping(target_data)

source_data_idx, target_data_idx = sentences_to_ids(source_data, target_data,
                                                    source_word_to_idx,
                                                    target_word_to_idx)
source_data_train, source_data_dev, target_data_train, target_data_dev\
    = train_test_split(source_data_idx, target_data_idx, test_size=0.3, random_state=1,
                       shuffle=False, stratify=None)

with tf.Session() as sess:
    model = Seq2SeqModel(rnn_size=args.rnn_size,
                         num_layers=args.num_layers,
                         enc_embedding_size=args.embedding_size,
                         learning_rate=args.learning_rate,
                         dec_embedding_size=args.embedding_size,
Example #32
def tag_mapping(sentences):
    tags = [[char[-1] for char in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
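
A toy end-to-end run of this minimal tag_mapping, assuming the helper sketches above:

sentences = [[['我', 'O'], ['北', 'B-LOC'], ['京', 'I-LOC']],
             [['他', 'O'], ['去', 'O']]]
dico, tag_to_id, id_to_tag = tag_mapping(sentences)
# 'O' appears three times, so it sorts first; the singleton tags break
# ties alphabetically
print(tag_to_id)  # {'O': 0, 'B-LOC': 1, 'I-LOC': 2}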