예제 #1
0
def char_mapping(sentences):
    """
    Build the character dictionary and the char <-> id lookup tables.

    For every sentence, the first field (``token[0]``) of each entry is
    concatenated into one string. The resulting list of strings is passed
    to ``create_dico``, a ``'<PAD>'`` entry is added to the result, and
    ``create_mapping`` derives the two lookup tables from it.

    Returns the tuple ``(dico, char_to_id, id_to_char)``.
    """
    joined_sentences = []
    for sentence in sentences:
        joined_sentences.append("".join(token[0] for token in sentence))

    dico = create_dico(joined_sentences)
    # Very large count for the padding symbol -- presumably so that it is
    # ranked first by create_mapping (helper not visible here; confirm).
    dico['<PAD>'] = 10000000

    mappings = create_mapping(dico)
    char_to_id, id_to_char = mappings
    print("Found %i unique characters" % len(dico))
    return dico, char_to_id, id_to_char
예제 #2
0
def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    Every word in `words` (typically the words of the development and test
    sets) that is not yet in `dictionary` is added with a frequency of 0,
    provided that the word itself, its lowercased form, or its lowercased
    form with every digit replaced by '0' appears in the embedding file.

    Parameters:
        dictionary: word -> frequency dict; it is modified in place.
        ext_emb_path: path to a UTF-8 text file whose lines start with the
            word, followed by whitespace-separated fields.
        words: iterable of candidate words. `None` is treated as "no
            candidates": adding every pretrained word was disabled because
            it caused CUDA runtime errors.

    Returns:
        The tuple `(dictionary, word_to_id, id_to_word)`, where the two
        mappings come from `create_mapping(dictionary)`.

    Raises:
        AssertionError: if `ext_emb_path` is not an existing file.
    """
    print('Loading pretrained embeddings from %s...' % ext_emb_path)
    assert os.path.isfile(ext_emb_path)

    # Read the file once, close it deterministically, and skip blank lines
    # instead of failing on them.
    pretrained = set()
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.split()
            if fields:
                pretrained.add(fields[0])

    if words is None:
        words = ()

    for word in words:
        if word in dictionary:
            continue
        lowered = word.lower()
        candidates = (word, lowered, re.sub(r'\d', '0', lowered))
        if any(form in pretrained for form in candidates):
            # Known to the embeddings but unseen in training: frequency 0.
            dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
예제 #3
0
def tag_mapping(sentences):
    """
    Build the tag dictionary and the tag <-> id lookup tables.

    The last field (``token[-1]``) of every entry is taken as its tag. The
    per-sentence tag lists are passed to ``create_dico``, then entries for
    ``model.START_TAG`` (value -1) and ``model.STOP_TAG`` (value -2) are
    added before ``create_mapping`` derives the lookup tables.

    Returns the tuple ``(dico, tag_to_id, id_to_tag)``.
    """
    tags_per_sentence = []
    for sentence in sentences:
        tags_per_sentence.append([token[-1] for token in sentence])

    dico = create_dico(tags_per_sentence)
    # Negative values for the two boundary tags -- presumably so they rank
    # after every real tag in create_mapping (helper not visible; confirm).
    dico[model.START_TAG] = -1
    dico[model.STOP_TAG] = -2

    mappings = create_mapping(dico)
    tag_to_id, id_to_tag = mappings
    print("Found %i unique named entity tags" % len(dico))
    return dico, tag_to_id, id_to_tag
예제 #4
0
def word_mapping(sentences, lower, min_freq=3):
    """
    Create a dictionary and a mapping of words, sorted by frequency.

    Parameters:
        sentences: iterable of sentences; the first field (``x[0]``) of
            every entry is used as the word.
        lower: when true, each word is lowercased before counting.
        min_freq: entries whose value in the dictionary is below this
            threshold are dropped. Defaults to 3, the previously
            hard-coded cut-off.

    Returns:
        The tuple ``(dico, word_to_id, id_to_word)``, where ``dico`` is the
        pruned dictionary and the mappings come from ``create_mapping``.
    """
    words = [[x[0].lower() if lower else x[0] for x in s] for s in sentences]
    dico = create_dico(words)

    # The special tokens get very large values, so they pass the threshold
    # below for any realistic min_freq.
    dico['<PAD>'] = 10000001
    dico['<UNK>'] = 10000000

    # Prune entries below the threshold.
    dico = {k: v for k, v in dico.items() if v >= min_freq}
    word_to_id, id_to_word = create_mapping(dico)

    # The first number is the pruned dictionary size (special tokens
    # included); the second is the corpus token count before pruning.
    print("Found %i unique words (%i in total)" % (
        len(dico), sum(len(x) for x in words)
    ))

    return dico, word_to_id, id_to_word