def read_conll(path):
    """
    Read a CoNLL-style file into parallel lists of word and POS-tag sentences.

    Every non-blank line is split on whitespace; the field at index 1 is taken as the word and the
    field at index 4 as the POS tag. A blank line terminates the current sentence.

    NOTE(review): this file defines ``read_conll`` a second time (with an optional
    ``label_alphabet`` argument). If both definitions live in the same module the later one wins;
    confirm whether this version is still needed.

    :param path: Path of the file to read.
    :return: A tuple ``(word_sentences, pos_sentences, word_alphabet, pos_alphabet)``. The first two
        are lists of sentences (lists of strings); the last two are the alphabets filled with
        every word and POS tag that was read.
    """
    word_sentences = []
    pos_sentences = []
    words = []
    poses = []

    word_alphabet = Alphabet('word', (padding_symbol,))
    pos_alphabet = Alphabet('pos', (padding_symbol,))

    with open(path) as f:
        for line in f:
            if line.strip() == "":
                # Sentence boundary: store the finished sentence and start a new one.
                word_sentences.append(words)
                pos_sentences.append(poses)
                words = []
                poses = []
                continue

            parts = line.split()
            word = parts[1]
            pos = parts[4]
            words.append(word)
            poses.append(pos)
            word_alphabet.add(word)
            pos_alphabet.add(pos)

    # A file that does not end with a blank line still has a pending sentence; without this
    # flush the last sentence would be silently dropped.
    if words:
        word_sentences.append(words)
        pos_sentences.append(poses)

    return word_sentences, pos_sentences, word_alphabet, pos_alphabet
def read_conll(path, label_alphabet=None):
    """
    Load sentences and their tags from a CoNLL-style file.

    Each non-blank line is whitespace-split; field 1 is the token and field 4 is its tag. A blank
    line closes the sentence being collected. A sentence still pending at end of file is kept.

    :param path: Path of the file to read.
    :param label_alphabet: Optional alphabet to which the tags are added. When ``None`` a new
        'label' alphabet is created.
    :return: A tuple ``(word_sentences, pos_sentences, word_alphabet, label_alphabet)``.
    """
    all_tokens = []
    all_tags = []
    current_tokens = []
    current_tags = []

    word_alphabet = Alphabet('word', (padding_symbol, ))
    if label_alphabet is None:
        label_alphabet = Alphabet('label', (padding_symbol, ))

    with open(path) as conll_file:
        for raw_line in conll_file:
            if not raw_line.strip():
                # Blank line: the sentence collected so far is complete.
                all_tokens.append(current_tokens)
                all_tags.append(current_tags)
                current_tokens, current_tags = [], []
                continue

            fields = raw_line.split()
            token, tag = fields[1], fields[4]
            current_tokens.append(token)
            current_tags.append(tag)
            word_alphabet.add(token)
            label_alphabet.add(tag)

    # Keep the trailing sentence when the file does not end with a blank line.
    if current_tokens:
        all_tokens.append(current_tokens)
        all_tags.append(current_tags)

    return all_tokens, all_tags, word_alphabet, label_alphabet
class Lookup:
    """Embedding lookup tables built from a pre-trained word vector model."""

    def __init__(self, config):
        """
        Load the pre-trained word vectors selected by the configuration.

        :param config: Configuration object. ``config.word_vector`` selects the vector type (only
            'word2vec' is handled here) and ``config.word_vector_path`` is the file to load.
        """
        # Always define the attribute so an unsupported configuration is visible right here
        # instead of surfacing later as a missing attribute.
        self.model = None

        if config.word_vector == 'word2vec':
            logger.info("Loading word2vec from disk ...")
            self.model = Word2Vec.load_word2vec_format(config.word_vector_path, binary=True)
            logger.info("Loading done...")
        else:
            logger.warning("Unsupported word vector type %r, no embedding model was loaded.",
                           config.word_vector)

        self.full_alphabet = Alphabet("full_lookup")

    def _embedding_for(self, w):
        """Return the model's vector for ``w``, or a ``uniform_embedding`` row if the model lacks it."""
        if w in self.model:
            return self.model[w]
        return uniform_embedding([1, self.model.vector_size])

    def initail_lookup(self, alphabet):
        """
        Initialize the lookup tables of the word vectors.

        Two tables are built and stored on the instance (nothing is returned):

        * ``self.table``: row 0 is a ``uniform_embedding`` row for the default index, followed by
          one row per word of ``alphabet`` that is not yet in ``self.full_alphabet``.
        * ``self.full_table``: the rows of ``self.table`` followed by one row for every word of the
          model vocabulary that is not in ``alphabet``.

        Every word that receives a row is also added to ``self.full_alphabet``.

        :param alphabet: The alphabet that stores the words.
        :raises ValueError: If ``Alphabet.default_index`` is not 0.
        """
        if Alphabet.default_index != 0:
            raise ValueError("Default index is not the first one, you must change the implementation here.")

        # Row 0 is reserved for the default index.
        embeddings = [uniform_embedding([1, self.model.vector_size])]

        # Add words from the given alphabet to the embedding list, and to the full alphabet.
        for w, _ in alphabet.iteritems():
            if not self.full_alphabet.has_instance(w):
                embeddings.append(self._embedding_for(w))
                self.full_alphabet.add(w)

        # Store embeddings that appear in training data.
        self.table = np.vstack(embeddings)

        # Keep extending the same list so the full table starts with the rows of self.table.
        for w in self.model.vocab.keys():
            if not alphabet.has_instance(w):
                embedding = self.model[w]
                self.full_alphabet.add(w)
                embeddings.append(embedding)

        # Store embeddings of the full vocabulary.
        self.full_table = np.vstack(embeddings)

        logger.info("The training only embedding table contains %d embeddings, each with a dimension of size %d.",
                    self.table.shape[0], self.table.shape[1])
        logger.info("The full embedding table contains %d embeddings, each with a dimension of size %d.",
                    self.full_table.shape[0], self.full_table.shape[1])

    # Correctly spelled alias; the original (misspelled) name is kept for existing callers.
    initial_lookup = initail_lookup

    def load_additional_embeddings(self, original_alphabet, new_alphabet):
        """
        Create an additional lookup table for words that are not in the original alphabet.

        :param original_alphabet: The alphabet whose words are already covered.
        :param new_alphabet: The alphabet that may contain additional words.
        :return: An array with one row per word of ``new_alphabet`` that is missing from
            ``original_alphabet`` (pre-trained vector when the model has one, otherwise a
            ``uniform_embedding`` row), or ``None`` when there is no such word.
        """
        embeddings = [
            self._embedding_for(w)
            for w, _ in new_alphabet.iteritems()
            if not original_alphabet.has_instance(w)
        ]

        if not embeddings:
            return None
        return np.vstack(embeddings)