import codecs
import random

import dynet as dy

from bilstm import BiLSTM  # local BiLSTM wrapper; exact module path assumed


class FeatureExtractor(object):
    def __init__(self, model, wordsCount, rels, langs, words, ch, nnvecs, options):
        """ Options handling """
        self.model = model
        if langs:
            self.langs = {lang: ind + 1 for ind, lang in enumerate(langs)} # +1 for padding vector
        else:
            self.langs = None
        self.nnvecs = nnvecs
        self.multiling = options.multiling #and options.use_lembed
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding, model, wordsCount)
        self.disable_bilstm = options.disable_bilstm
        self.disable_second_bilstm = options.disable_second_bilstm

        """sharing"""
        self.shareBiLSTM = options.shareBiLSTM
        self.shareWordLookup = options.shareWordLookup
        self.shareCharLookup = options.shareCharLookup
        self.shareCharBiLSTM = options.shareCharBiLSTM
        self.word_lembed = options.lembed_word
        self.char_lembed = options.lembed_char

        """dims"""
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.lang_emb_size = options.lang_emb_size
        lstm_input_size = (self.word_emb_size
                           + (self.edim if self.external_embedding is not None else 0)
                           + (self.lang_emb_size if self.word_lembed else 0)
                           + 2 * self.char_lstm_output_size)

        """UTILS"""
        self.wordsCount = wordsCount
        self.irels = rels
        if self.multiling and not self.shareWordLookup:
            w2i = {}
            for lang in self.langs:
                w2i[lang] = {w: i for i, w in enumerate(words[lang])}
            self.vocab = {}
            for lang in self.langs:
                self.vocab[lang] = {word: ind + 2 for word, ind in w2i[lang].iteritems()}
        else:
            w2i = {w: i for i, w in enumerate(words)}
            self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()} # +2 for MLP padding vector and OOV vector
        if not self.multiling or self.shareCharLookup:
            self.chars = {char: ind + 1 for ind, char in enumerate(ch)} # +1 for OOV vector
        else:
            self.chars = {}
            for lang in self.langs:
                self.chars[lang] = {char: ind + 1 for ind, char in enumerate(ch[lang])}
        self.rels = {word: ind for ind, word in enumerate(rels)}

        """BILSTMS"""
        #word
        if not self.multiling or self.shareBiLSTM:
            if not self.disable_bilstm:
                self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                      dropout_rate=0.33)
                if not self.disable_second_bilstm:
                    self.bilstm2 = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size, model,
                                          dropout_rate=0.33)
            else:
                self.lstm_output_size = int(lstm_input_size * 0.5)
        else:
            self.bilstm1 = {}
            self.bilstm2 = {}
            for lang in self.langs:
                self.bilstm1[lang] = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                            dropout_rate=0.33)
                self.bilstm2[lang] = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size, model,
                                            dropout_rate=0.33)

        #char
        if self.char_lembed:
            char_in_dims = self.char_emb_size + self.lang_emb_size
        else:
            char_in_dims = self.char_emb_size
        if not self.multiling or self.shareCharBiLSTM:
            self.char_bilstm = BiLSTM(char_in_dims, self.char_lstm_output_size, self.model,
                                      dropout_rate=0.33)
        else:
            self.char_bilstms = {}
            for lang in self.langs:
                self.char_bilstms[lang] = BiLSTM(char_in_dims, self.char_lstm_output_size,
                                                 self.model, dropout_rate=0.33)

        """LOOKUPS"""
        if not self.multiling or self.shareCharLookup:
            self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        else:
            self.clookups = {}
            for lang in self.langs:
                self.clookups[lang] = self.model.add_lookup_parameters((len(ch[lang]) + 1, self.char_emb_size))
        if not self.multiling or self.shareWordLookup:
            self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        else:
            self.wlookups = {}
            for lang in self.langs:
                self.wlookups[lang] = self.model.add_lookup_parameters((len(words[lang]) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))

        """Padding"""
        self.word2lstm = model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = model.add_parameters((self.char_lstm_output_size * 2))

    def get_char_vec(self, word, train, lang=None, langvec=None):
        if word.form == "*root*":
            word.chVec = self.chPadding.expr() # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in word.form:
                if lang:
                    cvec = self.clookups[lang][self.chars[lang].get(char, 0)]
                else:
                    cvec = self.clookup[self.chars.get(char, 0)]
                if langvec:
                    char_vecs.append(dy.concatenate([langvec, cvec]))
                else:
                    char_vecs.append(cvec)
            if lang:
                word.chVec = self.char_bilstms[lang].get_sequence_vector(char_vecs, train)
            else:
                word.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def Init(self):
        #TODO: This function makes me cry
        #I'm not sure how necessary it is to get different padding vecs
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None
        if not self.multiling or self.shareWordLookup:
            paddingWordVec = self.wlookup[1]
            self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                      dy.concatenate(filter(None, [paddingWordVec,
                                                                   evec,
                                                                   self.chPadding.expr(),
                                                                   paddingLangVec if self.word_lembed else None])) +
                                      self.word2lstmbias.expr())
            self.empty = (self.paddingVec if self.nnvecs == 1
                          else dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)]))
        else:
            paddingWordVecs = {}
            self.paddingVecs = {}
            self.emptyVecs = {}
            for lang in self.langs:
                paddingWordVecs[lang] = self.wlookups[lang][1]
                self.paddingVecs[lang] = dy.tanh(self.word2lstm.expr() *
                                                 dy.concatenate(filter(None, [paddingWordVecs[lang],
                                                                              evec,
                                                                              self.chPadding.expr(),
                                                                              paddingLangVec if self.word_lembed else None])) +
                                                 self.word2lstmbias.expr())
                self.emptyVecs[lang] = (self.paddingVecs[lang] if self.nnvecs == 1
                                        else dy.concatenate([self.paddingVecs[lang] for _ in xrange(self.nnvecs)]))

    def getWordEmbeddings(self, sentence, train, get_vectors=False):
        lang = sentence[0].language_id
        for root in sentence:
            #word
            if not self.multiling or self.shareWordLookup:
                wordcount = float(self.wordsCount.get(root.norm, 0))
            else:
                wordcount = float(self.wordsCount[lang].get(root.norm, 0))
            noDropFlag = not train or (random.random() < (wordcount / (0.25 + wordcount)))
            if not self.multiling or self.shareWordLookup:
                root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            else:
                root.wordvec = self.wlookups[lang][int(self.vocab[lang].get(root.norm, 0)) if noDropFlag else 0]
            if self.multiling and self.word_lembed:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            #char
            if not self.multiling or self.shareCharBiLSTM:
                if self.char_lembed:
                    langvec = self.langslookup[self.langs[lang]]
                    self.get_char_vec(root, train, langvec=langvec)
                else:
                    self.get_char_vec(root, train)
            elif self.char_lembed:
                # the per-language char BiLSTMs were built with
                # char_in_dims = char_emb_size + lang_emb_size, so the
                # language vector must be appended here as well
                langvec = self.langslookup[self.langs[lang]]
                self.get_char_vec(root, train, lang=lang, langvec=langvec)
            else:
                self.get_char_vec(root, train, lang=lang)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None
            root.vec = dy.concatenate(filter(None, [root.wordvec, root.evec, root.chVec, root.langvec]))

        if not self.multiling or self.shareBiLSTM:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)
        else:
            self.bilstm1[lang].set_token_vecs(sentence, train)
            self.bilstm2[lang].set_token_vecs(sentence, train)

        if get_vectors:
            data_vec = list()
            for i, token in enumerate(sentence):
                if token.form != '*root*':
                    wordvec = token.wordvec.value()
                    if self.external_embedding is not None:
                        wordvec += token.evec.value()
                    data_tuple = (i + 1, token.form, token.cpos, token.feats,
                                  token.chVec.value(), wordvec, token.vec.value())
                    data_vec.append(data_tuple)
            return data_vec

    def get_external_embeddings(self, external_embedding_file, model, wordsCount):
        # NOTE: this is modified to load fastText embeddings!
        self.external_embedding = {}
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')
        # read first line --- number of tokens and embedding dimension
        self.edim = int(external_embedding_fp.readline().split()[1])
        num_tokens = 0
        for line in external_embedding_fp:
            line = line.strip().split()
            if len(line) != self.edim + 1:
                continue
            if line[0] in wordsCount:
                self.external_embedding[line[0]] = [float(f) for f in line[1:]]
                num_tokens += 1
        external_embedding_fp.close()
        # self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)] #???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2
        print '-' * 100
        print 'Load external embedding. Vector dimensions:', self.edim, ', number of tokens:', num_tokens
        print '-' * 100
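
# ---------------------------------------------------------------------------
# Usage sketch for the class above. This is an illustrative, minimal example
# only: the option values, toy vocabularies, and the helper name
# `_demo_build_extractor` are assumptions, not part of the original parser;
# only the FeatureExtractor signature itself is taken from the code above.
# ---------------------------------------------------------------------------
def _demo_build_extractor():
    from argparse import Namespace

    options = Namespace(
        multiling=True,              # per-language resources below
        external_embedding=None,     # skip pretrained vectors in this sketch
        disable_bilstm=False,
        disable_second_bilstm=False,
        shareBiLSTM=True,            # one sentence-level BiLSTM pair...
        shareWordLookup=False,       # ...but separate word lookups per language
        shareCharLookup=True,
        shareCharBiLSTM=True,
        lembed_word=True,
        lembed_char=False,
        word_emb_size=100,
        char_emb_size=24,
        lstm_output_size=125,
        char_lstm_output_size=100,
        lang_emb_size=12)

    model = dy.ParameterCollection()  # dy.Model() on older DyNet versions
    langs = ['en', 'sv']
    # with shareWordLookup=False, word lists and counts are keyed by language;
    # with shareCharLookup=True, a single character inventory is shared
    words = {'en': ['the', 'cat'], 'sv': ['en', 'katt']}
    wordsCount = {'en': {'the': 2, 'cat': 1}, 'sv': {'en': 2, 'katt': 1}}
    ch = list('acehknt')
    rels = ['nsubj', 'obj', 'root']

    return FeatureExtractor(model, wordsCount, rels, langs, words, ch,
                            nnvecs=2, options=options)
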
class FeatureExtractor(object):
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2 for word, ind in w2i.iteritems()} # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1 for ind, char in enumerate(ch)} # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs
        if langs:
            self.langs = {lang: ind + 1 for ind, lang in enumerate(langs)} # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)
        lstm_input_size = (self.word_emb_size
                           + (self.edim if self.external_embedding is not None else 0)
                           + (self.lang_emb_size if self.multiling else 0)
                           + 2 * self.char_lstm_output_size)

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, self.model,
                                  dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size, self.lstm_output_size, self.model,
                                  dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)
        self.char_bilstm = BiLSTM(self.char_emb_size, self.char_lstm_output_size, self.model,
                                  dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))

        #used in the PaddingVec
        self.word2lstm = self.model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters((self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters((self.char_lstm_output_size * 2))

    def Init(self):
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None
        self.paddingVec = dy.tanh(self.word2lstm.expr() *
                                  dy.concatenate(filter(None, [paddingWordVec,
                                                               evec,
                                                               self.chPadding.expr(),
                                                               paddingLangVec])) +
                                  self.word2lstmbias.expr())
        self.empty = (self.paddingVec if self.nnvecs == 1
                      else dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)]))

    def getWordEmbeddings(self, sentence, train):
        for root in sentence:
            wordcount = float(self.wordsCount.get(root.norm, 0))
            noDropFlag = not train or (random.random() < (wordcount / (0.25 + wordcount)))
            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            self.get_char_vector(root, train)
            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None
            if self.multiling:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None
            root.vec = dy.concatenate(filter(None, [root.wordvec, root.evec, root.chVec, root.langvec]))

        if not self.disableBilstm:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train):
        if root.form == "*root*": # no point running a character analysis over this placeholder token
            root.chVec = self.chPadding.expr() # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.form:
                char_vecs.append(self.clookup[self.chars.get(char, 0)])
            root.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def get_external_embeddings(self, external_embedding_file):
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')
        external_embedding_fp.readline()
        self.external_embedding = {}
        for line in external_embedding_fp:
            line = line.strip().split()
            self.external_embedding[line[0]] = [float(f) for f in line[1:]]
        external_embedding_fp.close()
        self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)] #???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = self.model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2
        print 'Load external embedding. Vector dimensions', self.edim
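
# ---------------------------------------------------------------------------
# Both get_external_embeddings variants above expect word2vec/fastText-style
# text files: a "<num_tokens> <dim>" header line followed by one
# whitespace-separated "<word> <v1> ... <v_dim>" row per token. The helper
# below is an illustrative sketch for producing such a file in tests; the
# default file name, the toy vectors, and the helper name are assumptions.
# ---------------------------------------------------------------------------
def _demo_write_embedding_file(path='toy.vec', dim=3):
    vecs = {u'cat': [0.1, 0.2, 0.3], u'katt': [0.4, 0.5, 0.6]}
    fp = codecs.open(path, 'w', encoding='utf-8')
    fp.write(u'%d %d\n' % (len(vecs), dim))  # header: token count and dimension
    for word, vec in vecs.iteritems():
        fp.write(word + u' ' + u' '.join(u'%s' % v for v in vec) + u'\n')
    fp.close()
    return path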