def augment_with_pretrained(dictionary, ext_emb_path, words):
    """
    Augment the dictionary with words that have a pretrained embedding.

    If `words` is None, every word found in the pretrained embedding file is
    added to the dictionary. Otherwise only the entries of `words` (typically
    the words of the development and test sets) are added, and only when the
    word itself, its lowercased form, or its lowercased form with every digit
    replaced by '0' appears in the pretrained file.

    Args:
        dictionary: mapping from word to count. It is modified in place;
            newly added words get a count of 0, existing entries are
            left untouched.
        ext_emb_path: path to a UTF-8 text file whose lines start with the
            word as the first whitespace-separated field.
        words: iterable of candidate words, or None to add everything.

    Returns:
        A tuple `(dictionary, word_to_id, id_to_word)` where the two mappings
        are whatever `create_mapping(dictionary)` returns.

    Raises:
        AssertionError: if `ext_emb_path` is not an existing file.
    """
    print('Loading pre-trained embeddings from {}...'.format(ext_emb_path))
    assert os.path.isfile(ext_emb_path), '{} not found'.format(ext_emb_path)

    # Collect the first field of every non-blank line. The file is opened in
    # a `with` block so the handle is closed even if decoding fails midway.
    pretrained = set()
    with codecs.open(ext_emb_path, 'r', 'utf-8') as emb_file:
        for line in emb_file:
            fields = line.split()
            # Blank / whitespace-only lines carry no word; skipping them
            # avoids an IndexError on `fields[0]`.
            if fields:
                pretrained.add(fields[0])

    if words is None:
        # Add every pretrained word that is not already known.
        for word in pretrained:
            if word not in dictionary:
                dictionary[word] = 0
    else:
        # Only add the requested words for which some normalised form has
        # a pretrained embedding.
        for word in words:
            if word in dictionary:
                continue
            lowered = word.lower()
            candidates = (word, lowered, re.sub(r'\d', '0', lowered))
            if any(form in pretrained for form in candidates):
                dictionary[word] = 0

    word_to_id, id_to_word = create_mapping(dictionary)
    return dictionary, word_to_id, id_to_word
def tag_mapping(sentences, idx=-1):
    """
    Build the tag dictionary and the tag <-> id mappings.

    Each token of each sentence is indexed with `idx` (the last field by
    default) to obtain its tag. The per-sentence tag lists are handed to
    `create_dico`, and the resulting dictionary to `create_mapping`.

    Returns the tuple `(dico, tag_to_id, id_to_tag)`.
    """
    tag_rows = []
    for sentence in sentences:
        row = []
        for token in sentence:
            row.append(token[idx])
        tag_rows.append(row)

    tag_counts = create_dico(tag_rows)
    mappings = create_mapping(tag_counts)
    tag_to_id, id_to_tag = mappings

    message = "Found {} unique named entity tags"
    print(message.format(len(tag_counts)))
    return tag_counts, tag_to_id, id_to_tag
def pos_mapping(sentences):
    """
    Build the POS-tag dictionary and the POS-tag <-> id mappings.

    The POS tag is read from field 2 of every token. An extra '<UNK>' entry
    with a count of 10000000 is inserted before the mappings are created
    (presumably so it ranks first in a frequency ordering; depends on
    `create_mapping`).

    Returns the tuple `(dico, tag_to_id, id_to_tag)`.
    """
    pos_rows = []
    for sentence in sentences:
        pos_rows.append([token[2] for token in sentence])

    pos_counts = create_dico(pos_rows)
    pos_counts['<UNK>'] = 10000000

    mappings = create_mapping(pos_counts)
    tag_to_id, id_to_tag = mappings

    message = "Found {} unique POS tags"
    print(message.format(len(pos_counts)))
    return pos_counts, tag_to_id, id_to_tag
def char_mapping(sentences):
    """
    Build the character dictionary and the character <-> id mappings.

    For every sentence, field 1 of each token is concatenated into a single
    string; those strings are passed to `create_dico`. An extra '<UNK>'
    entry with a count of 10000000 is inserted before the mappings are
    created.

    Returns the tuple `(dico, char_to_id, id_to_char)`.
    """
    joined_sentences = []
    for sentence in sentences:
        pieces = [token[1] for token in sentence]
        joined_sentences.append("".join(pieces))

    char_counts = create_dico(joined_sentences)
    char_counts['<UNK>'] = 10000000

    mappings = create_mapping(char_counts)
    char_to_id, id_to_char = mappings

    message = "Found {} unique characters"
    print(message.format(len(char_counts)))
    return char_counts, char_to_id, id_to_char
def word_mapping(sentences, lower):
    """
    Build the word dictionary and the word <-> id mappings.

    The word is read from field 1 of every token and lowercased when
    `lower` is truthy. Two special entries are inserted before the mappings
    are created: '<UNK>' with a count of 10000000 and '<PADDING>' with a
    count of 500000.

    Returns the tuple `(dico, word_to_id, id_to_word)`.
    """
    word_rows = []
    for sentence in sentences:
        row = []
        for token in sentence:
            form = token[1]
            if lower:
                form = form.lower()
            row.append(form)
        word_rows.append(row)

    word_counts = create_dico(word_rows)
    word_counts['<UNK>'] = 10000000
    word_counts['<PADDING>'] = 500000

    mappings = create_mapping(word_counts)
    word_to_id, id_to_word = mappings

    # The reported unique count includes the two special entries added
    # above; the total is the number of tokens over all sentences.
    unique_count = len(word_counts)
    token_total = sum(len(row) for row in word_rows)
    print('Found {} unique words ({} in total)'.format(
        unique_count, token_total))
    return word_counts, word_to_id, id_to_word
def segment_mapping(sentences, only_i_o_tag=True):
    """
    Build the NER segmentation-tag dictionary and the tag <-> id mappings.

    The segmentation tag is read from field 4 of every token. When
    `only_i_o_tag` is truthy, every occurrence of 'B-' inside a tag is
    replaced by 'I-' before counting; otherwise tags are used unchanged.

    Returns the tuple `(dico, tag_to_id, id_to_tag)`.
    """
    segment_rows = []
    for sentence in sentences:
        row = []
        for token in sentence:
            label = token[4]
            if only_i_o_tag:
                label = label.replace('B-', 'I-')
            row.append(label)
        segment_rows.append(row)

    segment_counts = create_dico(segment_rows)
    mappings = create_mapping(segment_counts)
    tag_to_id, id_to_tag = mappings

    message = "Found {} unique segmentation tags"
    print(message.format(len(segment_counts)))
    return segment_counts, tag_to_id, id_to_tag