Пример #1
0
def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `words` is None, every word found in the pretrained embedding file is
    added to the dictionary; otherwise only the words given by `words`
    (typically the words in the development and test sets) are considered.
    A candidate from `words` is added when the word itself, its lowercased
    form, or its lowercased form with every digit replaced by '0' appears
    in the pretrained file.

    Args:
        dictionary: mapping of word -> count. It is modified in place; new
            words are inserted with a count of 0.
        ext_emb_path: path of the embedding file, read as UTF-8. The first
            whitespace-separated token of each line is taken as the word.
        words: iterable of candidate words, or None to add every pretrained
            word.

    Returns:
        A tuple `(dictionary, word_to_id, id_to_word)`, where the two
        mappings are whatever `create_mapping(dictionary)` returns.

    Raises:
        AssertionError: if `ext_emb_path` is not an existing file.
    """
    print('Loading pre-trained embeddings from {}...'.format(ext_emb_path))
    assert os.path.isfile(ext_emb_path), '{} not found'.format(ext_emb_path)

    # Collect the vocabulary of the pretrained file. The context manager
    # guarantees the handle is closed, and blank / whitespace-only lines
    # are skipped instead of raising IndexError on `split()[0]`.
    pretrained = set()
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.split()
            if fields:
                pretrained.add(fields[0])

    # We either add every word in the pretrained file,
    # or only words given in the `words` list to which
    # we can assign a pretrained embedding
    if words is None:
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        for word in words:
            # Cheap membership test first: known words need no lookup.
            if word in dictionary:
                continue
            lowered = word.lower()
            if (word in pretrained
                    or lowered in pretrained
                    or re.sub(r'\d', '0', lowered) in pretrained):
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
Пример #2
0
def tag_mapping(sentences, idx=-1):
    """
    Build the tag dictionary and the tag <-> id mappings.

    The tag of each word is read from position `idx` of the word entry
    (the last field by default). The dictionary comes from `create_dico`
    and the two mappings from `create_mapping`; the ordering of ids is
    decided by `create_mapping` (presumably by frequency -- confirm there).

    Returns a tuple `(dico, tag_to_id, id_to_tag)`.
    """
    tag_rows = []
    for sentence in sentences:
        # One list of tags per sentence, in word order.
        tag_rows.append([entry[idx] for entry in sentence])

    dico = create_dico(tag_rows)
    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings

    n_tags = len(dico)
    print("Found {} unique named entity tags".format(n_tags))
    return dico, tag_to_id, id_to_tag
Пример #3
0
def pos_mapping(sentences):
    """
    Build the POS tag dictionary and the POS tag <-> id mappings.

    The POS tag is read from field 2 of every word entry. An '<UNK>' entry
    with a very large count is added to the dictionary before the mappings
    are created by `create_mapping`.

    Returns a tuple `(dico, tag_to_id, id_to_tag)`.
    """
    pos_rows = []
    for sentence in sentences:
        # One list of POS tags per sentence, in word order.
        pos_rows.append([entry[2] for entry in sentence])

    dico = create_dico(pos_rows)
    # Reserve an entry for unknown tags; the count is an arbitrary large
    # constant (presumably so it ranks first -- confirm in create_mapping).
    dico['<UNK>'] = 10000000

    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings

    n_tags = len(dico)
    print("Found {} unique POS tags".format(n_tags))
    return dico, tag_to_id, id_to_tag
Пример #4
0
def char_mapping(sentences):
    """
    Build the character dictionary and the character <-> id mappings.

    For each sentence, field 1 of every word entry is concatenated into a
    single string (no separator); these strings are handed to
    `create_dico`. An '<UNK>' entry with a very large count is added
    before the mappings are created by `create_mapping`.

    Returns a tuple `(dico, char_to_id, id_to_char)`.
    """
    joined_sentences = []
    for sentence in sentences:
        pieces = [entry[1] for entry in sentence]
        joined_sentences.append("".join(pieces))

    dico = create_dico(joined_sentences)
    # Reserve an entry for unknown characters; the count is an arbitrary
    # large constant (presumably so it ranks first -- confirm in
    # create_mapping).
    dico['<UNK>'] = 10000000

    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings

    n_chars = len(dico)
    print("Found {} unique characters".format(n_chars))
    return dico, char_to_id, id_to_char
Пример #5
0
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id mappings.

    The word is read from field 1 of every word entry and lowercased when
    `lower` is truthy. '<UNK>' and '<PADDING>' entries with large fixed
    counts are added to the dictionary before the mappings are created by
    `create_mapping`.

    Returns a tuple `(dico, word_to_id, id_to_word)`.
    """
    if lower:
        def surface(entry):
            return entry[1].lower()
    else:
        def surface(entry):
            return entry[1]

    word_rows = []
    for sentence in sentences:
        word_rows.append([surface(entry) for entry in sentence])

    dico = create_dico(word_rows)
    # Special entries; the counts are arbitrary large constants (presumably
    # chosen to control their rank -- confirm in create_mapping).
    dico['<UNK>'] = 10000000
    dico['<PADDING>'] = 500000

    mappings = create_mapping(dico)
    word_to_id, id_to_word = mappings

    n_unique = len(dico)
    n_total = sum(len(row) for row in word_rows)
    print('Found {} unique words ({} in total)'.format(n_unique, n_total))
    return dico, word_to_id, id_to_word
Пример #6
0
def segment_mapping(sentences, only_i_o_tag=True):
    """
    Build the segmentation tag dictionary and the tag <-> id mappings.

    The segmentation tag is read from field 4 of every word entry. When
    `only_i_o_tag` is truthy, every occurrence of 'B-' in a tag is replaced
    by 'I-' before counting. The dictionary comes from `create_dico` and
    the mappings from `create_mapping`.

    Returns a tuple `(dico, tag_to_id, id_to_tag)`.
    """
    if only_i_o_tag:
        def segment_tag(entry):
            return entry[4].replace('B-', 'I-')
    else:
        def segment_tag(entry):
            return entry[4]

    tag_rows = []
    for sentence in sentences:
        tag_rows.append([segment_tag(entry) for entry in sentence])

    dico = create_dico(tag_rows)
    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings

    n_tags = len(dico)
    print("Found {} unique segmentation tags".format(n_tags))
    return dico, tag_to_id, id_to_tag