Example #1
File: loader.py  Project: memoiry/NLU
import codecs
import os
import re

def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise we only add the words that are given by
    `chars` (typically the words in the development and test sets).
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file (the first token of each line is the word)
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip empty lines
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if chars is None:
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        for char in chars:
            if char not in dictionary and any(
                x in pretrained
                for x in [char, char.lower(), re.sub(r'\d', '0', char.lower())]
            ):
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
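
A minimal usage sketch (not from the repository): the embedding file name and the toy vocabulary below are made up, and create_mapping is assumed to be the helper defined elsewhere in loader.py.

train_dico = {'the': 12, 'cat': 3}            # word -> count from the training set
dev_test_words = ['dog', 'cats', '2020']      # vocabulary seen in dev/test
dico, word_to_id, id_to_word = augment_with_pretrained(
    train_dico,
    'vectors.txt',                            # hypothetical pretrained file, "word v1 v2 ..." per line
    dev_test_words,
)
# dev/test words that have a pretrained vector now appear in word_to_id with count 0.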
Example #2
from collections import OrderedDict

def feature_mapping(sentences, features):
    """
    创建特征和index映射关系
    :param sentences: list of list of tuple, [[(words11, features11, ..., tag11), ...], [(word21, feature21, ..., tag21), ...], ...]
    :param features: string, 特征的列index, 以逗号分隔
    :return: 
        feature_to_id: dict
        id_to_feature: dict
    """
    dico = OrderedDict()
    feature_to_id = OrderedDict()
    id_to_feature = OrderedDict()

    features_list = features.split(",")
    for feature_i in features_list:
        if feature_i == "0":
            continue
        cur_feature = [[t[int(feature_i)] for t in s] for s in sentences]
        cur_dico = create_dico(cur_feature)
        print("%sth feature found %i unique features" %
              (feature_i, len(cur_dico)))
        cur_dico["<UNK>"] = 10000000
        cur_feature_to_id, cur_id_to_feature = create_mapping(cur_dico)

        dico[feature_i] = cur_dico
        feature_to_id[feature_i] = cur_feature_to_id
        id_to_feature[feature_i] = cur_id_to_feature

    return dico, feature_to_id, id_to_feature
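
A small usage sketch (illustrative only): the toy sentences and the column layout are assumptions, and create_dico / create_mapping are assumed to be helpers defined in the same module.

# Column 0 is the word, column 1 a hypothetical extra feature, the last column the tag.
sentences = [
    [('John', 'NNP', 'B-PER'), ('runs', 'VBZ', 'O')],
    [('Mary', 'NNP', 'B-PER'), ('sleeps', 'VBZ', 'O')],
]
dico, feature_to_id, id_to_feature = feature_mapping(sentences, features="1")
# feature_to_id["1"] maps '<UNK>', 'NNP' and 'VBZ' to integer ids ordered by frequency.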
Example #3
File: loader.py  Project: memoiry/NLU
def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """
    tags = [[char[-1] for char in s] for s in sentences]
    dico = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico)
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
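
For illustration, a sketch of calling tag_mapping on a two-sentence toy corpus; the tag is read from the last element of each token tuple, and create_dico / create_mapping are the module's own helpers.

sentences = [
    [('张', 'B-PER'), ('三', 'I-PER'), ('说', 'O')],
    [('北', 'B-LOC'), ('京', 'I-LOC')],
]
dico, tag_to_id, id_to_tag = tag_mapping(sentences)
# Prints "Found 5 unique named entity tags"; ids are assigned by tag frequency.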
Example #4
import os

def tag_mapping(sentences):
    """
    Create a dictionary and a mapping of tags, sorted by frequency.
    """

    # Collect the slot tag (column 1) and the intent label (last column) for every token
    tags = []
    intents = []
    for s in sentences:
        ts = []
        ints = []
        for char in s:
            tag = char[1]
            intent = char[-1]
            ts.append(tag)
            ints.append(intent)
        tags.append(ts)
        intents.append(ints)

    dico_tags = create_dico(tags)
    tag_to_id, id_to_tag = create_mapping(dico_tags)

    dico_intents = create_dico(intents)
    intent_to_id, id_to_intent = create_mapping(dico_intents)

    # Persist the mappings so they can be reloaded later (e.g. at inference time)
    with open(os.path.join('configs', 'tag_to_id.txt'), 'w', encoding='utf8') as f:
        for k, v in tag_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open(os.path.join('configs', 'id_to_tag.txt'), 'w', encoding='utf8') as f:
        for k, v in id_to_tag.items():
            f.write(str(k) + ":" + str(v) + "\n")

    with open(os.path.join('configs', 'indent_to_id.txt'), 'w', encoding='utf8') as f:
        for k, v in intent_to_id.items():
            f.write(k + ":" + str(v) + "\n")
    with open(os.path.join('configs', 'id_to_indent.txt'), 'w', encoding='utf8') as f:
        for k, v in id_to_intent.items():
            f.write(str(k) + ":" + str(v) + "\n")

    return tag_to_id, id_to_tag, intent_to_id, id_to_intent
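
A hedged sketch of calling this joint tag/intent variant: each token tuple is assumed to be (char, slot_tag, ..., intent), create_dico / create_mapping come from the same module, and configs/ must exist before the call because the function writes its mapping files there.

import os

os.makedirs('configs', exist_ok=True)   # the function writes into configs/
sentences = [
    [('放', 'O', 'PlayMusic'), ('首', 'O', 'PlayMusic'), ('歌', 'O', 'PlayMusic')],
]
tag_to_id, id_to_tag, intent_to_id, id_to_intent = tag_mapping(sentences)
# configs/tag_to_id.txt, configs/id_to_tag.txt, etc. now hold the persisted mappings.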
Example #5
File: loader.py  Project: memoiry/NLU
def char_mapping(sentences, lower):
    """
    Create a dictionary and a mapping of words, sorted by frequency.
    """
    chars = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(chars)
    dico["<PAD>"] = 10000001
    dico['<UNK>'] = 10000000
    char_to_id, id_to_char = create_mapping(dico)
    print("Found %i unique words (%i in total)" %
          (len(dico), sum(len(x) for x in chars)))
    return dico, char_to_id, id_to_char
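
A minimal sketch of char_mapping on a toy corpus, showing the effect of lower=True (the token tuples and tags below are illustrative):

sentences = [[('Hello', 'O'), ('World', 'O')], [('hello', 'O'), ('world', 'O')]]
dico, char_to_id, id_to_char = char_mapping(sentences, lower=True)
# With lower=True, 'Hello'/'hello' collapse to one entry; <PAD> and <UNK> receive
# the largest counts, so they occupy the first positions in the mapping.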
Example #6
import codecs
import os
import re

def augment_with_pretrained(dictionary, ext_emb_path, chars):
    """
    Augment the dictionary with words that have a pretrained embedding.
    If `chars` is None, we add every word that has a pretrained embedding
    to the dictionary; otherwise we only add the words that are given by
    `chars` (typically the words in the development and test sets).

    Grow the training-set character dictionary: if `chars` is None, add every
    pretrained character; otherwise add only the characters that occur both in
    the pretrained embeddings and in `chars`.

    :param dictionary: character dictionary built from the training set
    :param ext_emb_path: path to the pretrained embedding file
    :param chars: set of characters from the test set
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Load pretrained embeddings from file
    # Build the set of all characters that appear in the pretrained embeddings
    pretrained = set([
        line.rstrip().split()[0].strip()
        for line in codecs.open(ext_emb_path, 'r', 'utf-8')
        if len(line.rstrip()) > 0  # skip empty lines
    ])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    # If no test-set characters are given
    if chars is None:
        # add every pretrained character to the training dictionary
        for char in pretrained:
            if char not in dictionary:
                dictionary[char] = 0
    else:
        # add only test-set characters that also have a pretrained embedding
        for char in chars:
            if char not in dictionary and any(
                x in pretrained
                for x in [char, char.lower(), re.sub(r'\d', '0', char.lower())]
            ):
                dictionary[char] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word