Example #1
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs=1):

        self.word_counts, words, chars, pos, cpos, rels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        # Load ELMo if the option is set
        if options.elmo is not None:
            from elmo import ELMo
            self.elmo = ELMo(options.elmo, options.elmo_gamma,
                             options.elmo_learn_gamma)
            self.elmo.init_weights(model)
        else:
            self.elmo = None

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        self.irels = rels
        self.rels = {rel: ind for ind, rel in enumerate(rels)}

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        # This part got ugly - TODO: refactor
        if not options.predict:
            self.external_embedding = defaultdict(lambda: {})

            if options.ext_word_emb_file and options.word_emb_size > 0:
                # Load pre-trained word embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_word_emb_file,
                        lang=lang,
                        words=self.words.viewkeys())
                    self.external_embedding["words"].update(embeddings)

            if options.ext_char_emb_file and options.char_emb_size > 0:
                # Load pre-trained character embeddings
                for lang in langs:
                    embeddings = utils.get_external_embeddings(
                        options,
                        emb_file=options.ext_char_emb_file,
                        lang=lang,
                        words=self.chars,
                        chars=True)
                    self.external_embedding["chars"].update(embeddings)

            if options.ext_emb_dir:
                # For every language, load the data for the word and character
                # embeddings from a directory.
                for lang in langs:
                    if options.word_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.words.viewkeys())
                        self.external_embedding["words"].update(embeddings)

                    if options.char_emb_size > 0:
                        embeddings = utils.get_external_embeddings(
                            options,
                            emb_dir=options.ext_emb_dir,
                            lang=lang,
                            words=self.chars,
                            chars=True)
                        self.external_embedding["chars"].update(embeddings)

            self.init_lookups(options)

        elmo_emb_size = self.elmo.emb_dim if self.elmo else 0
        self.lstm_input_size = (
            options.word_emb_size + elmo_emb_size + options.pos_emb_size +
            options.tbank_emb_size + 2 *
            (options.char_lstm_output_size if options.char_emb_size > 0 else 0)
        )
        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[
            1] if options.word_emb_size > 0 else None
        paddingElmoVec = dy.zeros(self.elmo.emb_dim) if self.elmo else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr(
        ) if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[
            0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *\
            dy.concatenate(filter(None,[paddingWordVec,
                                        paddingElmoVec,
                                        paddingPosVec,
                                        paddingCharVec,
                                        paddingTbankVec])) + self.word2lstmbias.expr())

        self.empty = self.paddingVec if self.nnvecs == 1 else\
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):

        if self.elmo:
            # Get full text of sentence - excluding root, which is loaded differently
            # for transition and graph-based parsers.
            if options.graph_based:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[1:]])
            else:
                sentence_text = " ".join(
                    [entry.form for entry in sentence[:-1]])

            elmo_sentence_representation = \
                self.elmo.get_sentence_representation(sentence_text)

        for i, root in enumerate(sentence):
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]
            if self.elmo:
                if i < len(sentence) - 1:
                    # Don't look up the 'root' word
                    root.vecs["elmo"] = elmo_sentence_representation[i]
                else:
                    # TODO
                    root.vecs["elmo"] = dy.zeros(self.elmo.emb_dim)

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["elmo"], root.vecs["pos"],
                    root.vecs["char"], root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):

        if root.char_rep == "*root*":  # no point running a character analysis over this placeholder token
            return self.charPadding.expr(
            )  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(
                        dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):

        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"]
                       [word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"]
                       [char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
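
A minimal construction sketch for the class above. The option values, vocabulary contents and variable names are illustrative, and it assumes DyNet plus this repo's BiLSTM, utils and elmo modules are importable; it is not the repo's own training entry point.

import dynet as dy
from argparse import Namespace

model = dy.ParameterCollection()

# Only the option fields read by __init__ and Init are set here.
options = Namespace(
    elmo=None, predict=True,          # skip ELMo and external-embedding loading
    word_emb_size=100, pos_emb_size=20, char_emb_size=24, tbank_emb_size=12,
    char_lstm_output_size=100, lstm_output_size=125, no_bilstms=2,
    ext_word_emb_file=None, ext_char_emb_file=None, ext_emb_dir=None)

# vocab is the 8-tuple unpacked at the top of __init__:
# (word_counts, words, chars, pos, cpos, rels, treebanks, langs)
vocab = ({"the": 5}, ["the"], ["t", "h", "e"], ["DT"], ["DT"], ["det"],
         ["en_ewt"], ["en"])

extractor = FeatureExtractor(model, options, vocab, nnvecs=2)

dy.renew_cg()
extractor.Init(options)  # builds the padding/empty vectors for this graph
# extractor.getWordEmbeddings(sentence, train, options) then expects ConllEntry-style
# tokens carrying .norm, .cpos, .char_rep, .treebank_id and .proxy_tbank.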
Example #2
class FeatureExtractor(object):
    def __init__(self, model, wordsCount, rels, langs, words, ch, nnvecs, options):
        """
        Options handling
        """
        self.model = model
        if langs:
            self.langs = {lang: ind+1 for ind, lang in enumerate(langs)} # +1 for padding vector
        else:
            self.langs = None
        self.nnvecs = nnvecs
        self.multiling = options.multiling #and options.use_lembed
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding, model, wordsCount)
        self.disable_bilstm = options.disable_bilstm
        self.disable_second_bilstm = options.disable_second_bilstm

        """sharing"""
        self.shareBiLSTM = options.shareBiLSTM
        self.shareWordLookup = options.shareWordLookup
        self.shareCharLookup = options.shareCharLookup
        self.shareCharBiLSTM = options.shareCharBiLSTM
        self.word_lembed = options.lembed_word
        self.char_lembed = options.lembed_char

        """dims"""
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.lang_emb_size = options.lang_emb_size

        lstm_input_size = self.word_emb_size + (self.edim if self.external_embedding is\
                          not None else 0) + (self.lang_emb_size if self.word_lembed else 0)\
                          + 2 * self.char_lstm_output_size

        """UTILS"""
        self.wordsCount = wordsCount
        self.irels = rels

        if self.multiling and not self.shareWordLookup:
            w2i = {}
            for lang in self.langs:
                w2i[lang] = {w: i for i, w in enumerate(words[lang])}
            self.vocab = {}
            for lang in self.langs:
                self.vocab[lang] = {word: ind+2 for word, ind in w2i[lang].iteritems()}

        else:
            w2i = {w: i for i, w in enumerate(words)}
            self.vocab = {word: ind+2 for word, ind in w2i.iteritems()} # +2 for MLP padding vector and OOV vector

        if not self.multiling or self.shareCharLookup:
            self.chars = {char: ind+1 for ind, char in enumerate(ch)} # +1 for OOV vector
        else:
            self.chars = {}
            for lang in self.langs:
                self.chars[lang] = {char: ind+1 for ind, char in enumerate(ch[lang])}
        self.rels = {word: ind for ind, word in enumerate(rels)}

        """BILSTMS"""
        #word
        if not self.multiling or self.shareBiLSTM:
            if not self.disable_bilstm:
                self.bilstm1 = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                      dropout_rate=0.33)
                if not self.disable_second_bilstm:
                    self.bilstm2 = BiLSTM(2* self.lstm_output_size, self.lstm_output_size, model,
                                          dropout_rate=0.33)
            else:
                self.lstm_output_size = int(lstm_input_size * 0.5)
        else:
            self.bilstm1= {}
            self.bilstm2= {}
            for lang in self.langs:
                self.bilstm1[lang] = BiLSTM(lstm_input_size, self.lstm_output_size, model,
                                      dropout_rate=0.33)
                self.bilstm2[lang] = BiLSTM(2* self.lstm_output_size, self.lstm_output_size, model,
                                            dropout_rate=0.33)

        #char
        if self.char_lembed:
            char_in_dims = self.char_emb_size + self.lang_emb_size
        else:
            char_in_dims = self.char_emb_size

        if not self.multiling or self.shareCharBiLSTM:
            self.char_bilstm = BiLSTM(char_in_dims,self.char_lstm_output_size,self.model,dropout_rate=0.33)
        else:
            self.char_bilstms = {}
            for lang in self.langs:
                self.char_bilstms[lang] = BiLSTM(char_in_dims,self.char_lstm_output_size,self.model,dropout_rate=0.33)

        """LOOKUPS"""
        if not self.multiling or self.shareCharLookup:
            self.clookup = self.model.add_lookup_parameters((len(ch) + 1, self.char_emb_size))
        else:
            self.clookups = {}
            for lang in self.langs:
                self.clookups[lang] = self.model.add_lookup_parameters((len(ch[lang]) + 1, self.char_emb_size))

        if not self.multiling or self.shareWordLookup:
            self.wlookup = self.model.add_lookup_parameters((len(words) + 2, self.word_emb_size))
        else:
            self.wlookups = {}
            for lang in self.langs:
                self.wlookups[lang] = self.model.add_lookup_parameters((len(words[lang]) + 2, self.word_emb_size))

        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = model.add_lookup_parameters((len(langs) + 1, self.lang_emb_size))


        """Padding"""
        self.word2lstm = model.add_parameters((self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = model.add_parameters((self.lstm_output_size *2))
        self.chPadding = model.add_parameters((self.char_lstm_output_size *2))

    def get_char_vec(self,word,dropout,lang=None,langvec=None):
        if word.form == "*root*":
            word.chVec = self.chPadding.expr()  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in word.form:
                if lang:
                    cvec = self.clookups[lang][self.chars[lang].get(char,0)]
                else:
                    cvec = self.clookup[self.chars.get(char,0)]
                if langvec:
                    char_vecs.append(dy.concatenate([langvec,cvec]))
                else:
                    char_vecs.append(cvec)
            if lang:
                word.chVec = self.char_bilstms[lang].get_sequence_vector(char_vecs,dropout)
            else:
                word.chVec = self.char_bilstm.get_sequence_vector(char_vecs,dropout)

    def Init(self):
        #TODO: This function makes me cry
        #I'm not sure how necessary it is to get different padding vecs
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingLangVec = self.langslookup[0] if self.multiling and self.lang_emb_size > 0 else None
        if not self.multiling or self.shareWordLookup:
            paddingWordVec = self.wlookup[1]
            self.paddingVec = dy.tanh(self.word2lstm.expr() * dy.concatenate(filter(None,
                                                                              [paddingWordVec,
                                                                               evec,
                                                                               self.chPadding.expr(),
                                                                               paddingLangVec if self.word_lembed else None]))
                                                                              + self.word2lstmbias.expr() )
            self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])
        else:
            paddingWordVecs = {}
            self.paddingVecs = {}
            self.emptyVecs = {}
            for lang in self.langs:
                paddingWordVecs[lang] = self.wlookups[lang][1]
                self.paddingVecs[lang] = dy.tanh(self.word2lstm.expr() * dy.concatenate(filter(None,
                                                                                        [paddingWordVecs[lang],
                                                                                         evec,
                                                                                         self.chPadding.expr(),
                                                                                         paddingLangVec if self.word_lembed else None]))
                                                                                          + self.word2lstmbias.expr() )
                self.emptyVecs[lang] = self.paddingVecs[lang] if self.nnvecs == 1 else dy.concatenate([self.paddingVecs[lang] for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train, get_vectors=False):

        lang = sentence[0].language_id

        for root in sentence:
            #word
            if not self.multiling or self.shareWordLookup:
                wordcount = float(self.wordsCount.get(root.norm, 0))
            else:
                wordcount = float(self.wordsCount[lang].get(root.norm, 0))

            noDropFlag =  not train or (random.random() < (wordcount/(0.25+wordcount)))
            if not self.multiling or self.shareWordLookup:
                root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)) if noDropFlag else 0]
            else:
                root.wordvec = self.wlookups[lang][int(self.vocab[lang].get(root.norm, 0)) if noDropFlag else 0]

            if self.multiling and self.word_lembed:
                root.langvec = self.langslookup[self.langs[root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            #char
            if not self.multiling or self.shareCharBiLSTM:
                if self.char_lembed:
                    langvec = self.langslookup[self.langs[lang]]
                    self.get_char_vec(root, train, langvec=langvec)
                else:
                    self.get_char_vec(root,train)

            else:
                self.get_char_vec(root,train, lang=lang)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            root.vec = dy.concatenate(filter(None, [root.wordvec,
                                                    root.evec,
                                                    root.chVec,
                                                    root.langvec]))

        if not self.multiling or self.shareBiLSTM:
            self.bilstm1.set_token_vecs(sentence,train)
            self.bilstm2.set_token_vecs(sentence,train)
        else:
            self.bilstm1[lang].set_token_vecs(sentence,train)
            self.bilstm2[lang].set_token_vecs(sentence,train)

        if get_vectors:
            data_vec = list()
            for i, token in enumerate(sentence):
                if token.form != '*root*':
                    wordvec = token.wordvec.value()
                    if self.external_embedding is not None:
                        wordvec += token.evec.value()
                    data_tuple = (i+1, token.form, token.cpos, token.feats, token.chVec.value(), wordvec, token.vec.value())
                    data_vec.append(data_tuple)
            return data_vec 


    def get_external_embeddings(self, external_embedding_file, model, wordsCount):
        
        # NOTE: this is modified to load fastText embeddings!
        self.external_embedding = {}
        external_embedding_fp = codecs.open(external_embedding_file, 'r', encoding='utf-8')

        # read first line --- number of tokens and embedding dimension
        self.edim = int(external_embedding_fp.readline().split()[1])
        num_tokens = 0

        for line in external_embedding_fp:
            line = line.strip().split()
            if len(line) != self.edim + 1: 
                continue
            else:
                if line[0] in wordsCount:
                    self.external_embedding[line[0]] = [float(f) for f in line[1:]]
                    num_tokens += 1


        external_embedding_fp.close()
        # self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)] #???
        self.extrnd = {word: i + 3 for i, word in enumerate(self.external_embedding)}
        self.elookup = model.add_lookup_parameters((len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print '-' * 100
        print 'Load external embedding. Vector dimensions:', self.edim, ', number of tokens:', num_tokens
        print '-' * 100
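
get_external_embeddings above reads a fastText-style text file: a header line whose second field is the embedding dimension, then one whitespace-separated vector per token. A small sketch of a file that satisfies it (filename and contents are made up):

import codecs

# Header is "num_tokens dim"; each following line is "token v1 ... v_dim".
with codecs.open("embeddings.vec", "w", encoding="utf-8") as f:
    f.write("2 4\n")
    f.write("the 0.10 -0.25 0.33 0.07\n")
    f.write("cat 0.48 0.12 -0.91 0.25\n")

# The loader skips any line that does not have dim + 1 fields and keeps only
# tokens present in wordsCount, so with wordsCount = {"the": 5} exactly one
# vector would be stored:
# extractor.get_external_embeddings("embeddings.vec", model, {"the": 5})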
Example #3
class FeatureExtractor(object):
    def __init__(self, model, options, vocab, nnvecs):

        self.word_counts, words, chars, pos, cpos, self.irels, treebanks, langs = vocab

        self.model = model
        self.nnvecs = nnvecs

        extra_words = 2  # MLP padding vector and OOV vector
        self.words = {word: ind for ind, word in enumerate(words, extra_words)}
        self.word_lookup = self.model.add_lookup_parameters(
            (len(self.words) + extra_words, options.word_emb_size))

        extra_pos = 2  # MLP padding vector and OOV vector
        self.pos = {pos: ind for ind, pos in enumerate(cpos, extra_pos)}
        self.pos_lookup = self.model.add_lookup_parameters(
            (len(cpos) + extra_pos, options.pos_emb_size))

        extra_chars = 1  # OOV vector
        self.chars = {char: ind for ind, char in enumerate(chars, extra_chars)}
        self.char_lookup = self.model.add_lookup_parameters(
            (len(chars) + extra_chars, options.char_emb_size))

        extra_treebanks = 1  # Padding vector
        self.treebanks = {
            treebank: ind
            for ind, treebank in enumerate(treebanks, extra_treebanks)
        }
        self.treebank_lookup = self.model.add_lookup_parameters(
            (len(treebanks) + extra_treebanks, options.tbank_emb_size))

        # initialise word vectors with external embeddings where they exist
        if (options.ext_emb_dir
                or options.ext_emb_file) and not options.predict:
            self.external_embedding = defaultdict(lambda: {})
            for lang in langs:
                if options.word_emb_size > 0:
                    self.external_embedding["words"].update(
                        utils.get_external_embeddings(options, lang,
                                                      self.words.viewkeys()))
                if options.char_emb_size > 0:
                    self.external_embedding["chars"].update(
                        utils.get_external_embeddings(options,
                                                      lang,
                                                      self.chars,
                                                      chars=True))
            self.init_lookups(options)

        self.lstm_input_size = options.word_emb_size + options.pos_emb_size + options.tbank_emb_size +\
            2* (options.char_lstm_output_size if options.char_emb_size > 0 else 0)

        print "Word-level LSTM input size: " + str(self.lstm_input_size)

        self.bilstms = []
        if options.no_bilstms > 0:
            self.bilstms.append(
                BiLSTM(self.lstm_input_size,
                       options.lstm_output_size,
                       self.model,
                       dropout_rate=0.33))
            for i in range(1, options.no_bilstms):
                self.bilstms.append(
                    BiLSTM(2 * options.lstm_output_size,
                           options.lstm_output_size,
                           self.model,
                           dropout_rate=0.33))
            #used in the PaddingVec
            self.word2lstm = self.model.add_parameters(
                (options.lstm_output_size * 2, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (options.lstm_output_size * 2))
        else:
            self.word2lstm = self.model.add_parameters(
                (self.lstm_input_size, self.lstm_input_size))
            self.word2lstmbias = self.model.add_parameters(
                (self.lstm_input_size))

        self.char_bilstm = BiLSTM(options.char_emb_size,
                                  options.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.charPadding = self.model.add_parameters(
            (options.char_lstm_output_size * 2))

    def Init(self, options):
        paddingWordVec = self.word_lookup[
            1] if options.word_emb_size > 0 else None
        paddingPosVec = self.pos_lookup[1] if options.pos_emb_size > 0 else None
        paddingCharVec = self.charPadding.expr(
        ) if options.char_emb_size > 0 else None
        paddingTbankVec = self.treebank_lookup[
            0] if options.tbank_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() *\
            dy.concatenate(filter(None,[paddingWordVec,
                                        paddingPosVec,
                                        paddingCharVec,
                                        paddingTbankVec])) + self.word2lstmbias.expr())

        self.empty = self.paddingVec if self.nnvecs == 1 else\
            dy.concatenate([self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self,
                          sentence,
                          train,
                          options,
                          test_embeddings=defaultdict(lambda: {})):
        for root in sentence:
            root.vecs = defaultdict(
                lambda: None
            )  # all vecs are None by default (possibly a little risky?)
            if options.word_emb_size > 0:
                if train:
                    word_count = float(self.word_counts.get(root.norm, 0))
                    dropFlag = random.random() > word_count / (0.25 +
                                                               word_count)
                    root.vecs["word"] = self.word_lookup[
                        self.words.get(root.norm, 0) if not dropFlag else 0]
                else:  # need to check in test_embeddings at prediction time
                    if root.norm in self.words:
                        root.vecs["word"] = self.word_lookup[self.words[
                            root.norm]]
                    elif root.norm in test_embeddings["words"]:
                        root.vecs["word"] = dy.inputVector(
                            test_embeddings["words"][root.norm])
                    else:
                        root.vecs["word"] = self.word_lookup[0]
            if options.pos_emb_size > 0:
                root.vecs["pos"] = self.pos_lookup[self.pos.get(root.cpos, 0)]
            if options.char_emb_size > 0:
                root.vecs["char"] = self.get_char_vector(
                    root, train, test_embeddings["chars"])
            if options.tbank_emb_size > 0:
                if options.forced_tbank_emb:
                    treebank_id = options.forced_tbank_emb
                elif root.proxy_tbank:
                    treebank_id = root.proxy_tbank
                else:
                    treebank_id = root.treebank_id
                # this is a bit of a hack for models trained on an old version of the code
                # that used treebank name rather than id as the lookup
                if treebank_id not in self.treebanks and treebank_id in utils.reverse_iso_dict and \
                    utils.reverse_iso_dict[treebank_id] in self.treebanks:
                    treebank_id = utils.reverse_iso_dict[treebank_id]
                root.vecs["treebank"] = self.treebank_lookup[
                    self.treebanks[treebank_id]]

            root.vec = dy.concatenate(
                filter(None, [
                    root.vecs["word"], root.vecs["pos"], root.vecs["char"],
                    root.vecs["treebank"]
                ]))

        for bilstm in self.bilstms:
            bilstm.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train, test_embeddings_chars={}):

        if root.char_rep == "*root*":  # no point running a character analysis over this placeholder token
            return self.charPadding.expr(
            )  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.char_rep:
                if char in self.chars:
                    char_vecs.append(self.char_lookup[self.chars[char]])
                elif char in test_embeddings_chars:
                    char_vecs.append(
                        dy.inputVector(test_embeddings_chars[char]))
                else:
                    char_vecs.append(self.char_lookup[0])
            return self.char_bilstm.get_sequence_vector(char_vecs, train)

    def init_lookups(self, options):

        if self.external_embedding["words"]:
            print 'Initialising %i word vectors with external embeddings' % len(
                self.external_embedding["words"])
            for word in self.external_embedding["words"]:
                if len(self.external_embedding["words"]
                       [word]) != options.word_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified word embedding size of %s"
                        % (options.word_emb_size))
                self.word_lookup.init_row(
                    self.words[word], self.external_embedding["words"][word])
        elif options.word_emb_size > 0:
            print 'No word external embeddings found: all vectors initialised randomly'

        if self.external_embedding["chars"]:
            print 'Initialising %i char vectors with external embeddings' % len(
                self.external_embedding["chars"])
            for char in self.external_embedding["chars"]:
                if len(self.external_embedding["chars"]
                       [char]) != options.char_emb_size:
                    raise Exception(
                        "Size of external embedding does not match specified char embedding size of %s"
                        % (options.char_emb_size))
                self.char_lookup.init_row(
                    self.chars[char], self.external_embedding["chars"][char])
        elif options.char_emb_size > 0:
            print 'No character external embeddings found: all vectors initialised randomly'
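
The getWordEmbeddings methods in these examples all apply the same frequency-dependent word dropout at training time: the observed form is kept with probability count / (0.25 + count) and otherwise replaced by the OOV index 0. A standalone restatement of that rule (the helper name is ours, not the repo's):

import random

def keep_word(norm, word_counts, alpha=0.25):
    # Keep the gold form with probability count / (alpha + count), so frequent
    # words are almost always kept, rare words are often dropped to OOV, and
    # unseen words are always dropped.
    count = float(word_counts.get(norm, 0))
    return random.random() < count / (alpha + count)

# A word seen 3 times is kept ~92% of the time, a hapax 80% of the time,
# and an unseen word never.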
Example #4
class FeatureExtractor(object):
    def __init__(self, model, options, words, rels, langs, w2i, ch, nnvecs):
        self.model = model
        self.disableBilstm = options.disable_bilstm
        self.multiling = options.use_lembed and options.multiling
        self.lstm_output_size = options.lstm_output_size
        self.char_lstm_output_size = options.char_lstm_output_size
        self.word_emb_size = options.word_emb_size
        self.char_emb_size = options.char_emb_size
        self.lang_emb_size = options.lang_emb_size
        self.wordsCount = words
        self.vocab = {word: ind + 2
                      for word, ind in w2i.iteritems()
                      }  # +2 for MLP padding vector and OOV vector
        self.chars = {char: ind + 1
                      for ind, char in enumerate(ch)}  # +1 for OOV vector
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.nnvecs = nnvecs
        if langs:
            self.langs = {lang: ind + 1
                          for ind, lang in enumerate(langs)
                          }  # +1 for padding vector
        else:
            self.langs = None
        self.irels = rels
        self.external_embedding = None
        if options.external_embedding is not None:
            self.get_external_embeddings(options.external_embedding)

        lstm_input_size = self.word_emb_size + (self.edim if self.external_embedding is\
                                                not None else 0) + (self.lang_emb_size if
                                                                    self.multiling else 0) + 2 * self.char_lstm_output_size

        if not self.disableBilstm:
            self.bilstm1 = BiLSTM(lstm_input_size,
                                  self.lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
            self.bilstm2 = BiLSTM(2 * self.lstm_output_size,
                                  self.lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)
        else:
            self.lstm_output_size = int(lstm_input_size * 0.5)

        self.char_bilstm = BiLSTM(self.char_emb_size,
                                  self.char_lstm_output_size,
                                  self.model,
                                  dropout_rate=0.33)

        self.clookup = self.model.add_lookup_parameters(
            (len(ch) + 1, self.char_emb_size))
        self.wlookup = self.model.add_lookup_parameters(
            (len(words) + 2, self.word_emb_size))
        if self.multiling and self.lang_emb_size > 0:
            self.langslookup = self.model.add_lookup_parameters(
                (len(langs) + 1, self.lang_emb_size))

        #used in the PaddingVec
        self.word2lstm = self.model.add_parameters(
            (self.lstm_output_size * 2, lstm_input_size))
        self.word2lstmbias = self.model.add_parameters(
            (self.lstm_output_size * 2))
        self.chPadding = self.model.add_parameters(
            (self.char_lstm_output_size * 2))

    def Init(self):
        evec = self.elookup[1] if self.external_embedding is not None else None
        paddingWordVec = self.wlookup[1]
        paddingLangVec = self.langslookup[
            0] if self.multiling and self.lang_emb_size > 0 else None

        self.paddingVec = dy.tanh(self.word2lstm.expr() * dy.concatenate(
            filter(
                None,
                [paddingWordVec, evec,
                 self.chPadding.expr(), paddingLangVec])) +
                                  self.word2lstmbias.expr())
        self.empty = self.paddingVec if self.nnvecs == 1 else dy.concatenate(
            [self.paddingVec for _ in xrange(self.nnvecs)])

    def getWordEmbeddings(self, sentence, train):
        for root in sentence:
            wordcount = float(self.wordsCount.get(root.norm, 0))
            noDropFlag = not train or (random.random() <
                                       (wordcount / (0.25 + wordcount)))
            root.wordvec = self.wlookup[int(self.vocab.get(root.norm, 0)
                                            ) if noDropFlag else 0]
            self.get_char_vector(root, train)

            if self.external_embedding is not None:
                if not noDropFlag and random.random() < 0.5:
                    root.evec = self.elookup[0]
                elif root.form in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.form]]
                elif root.norm in self.external_embedding:
                    root.evec = self.elookup[self.extrnd[root.norm]]
                else:
                    root.evec = self.elookup[0]
            else:
                root.evec = None

            if self.multiling:
                root.langvec = self.langslookup[self.langs[
                    root.language_id]] if self.lang_emb_size > 0 else None
            else:
                root.langvec = None

            root.vec = dy.concatenate(
                filter(None,
                       [root.wordvec, root.evec, root.chVec, root.langvec]))
        if not self.disableBilstm:
            self.bilstm1.set_token_vecs(sentence, train)
            self.bilstm2.set_token_vecs(sentence, train)

    def get_char_vector(self, root, train):
        if root.form == "*root*":  # no point running a character analysis over this placeholder token
            root.chVec = self.chPadding.expr(
            )  # use the padding vector if it's the root token
        else:
            char_vecs = []
            for char in root.form:
                char_vecs.append(self.clookup[self.chars.get(char, 0)])
            root.chVec = self.char_bilstm.get_sequence_vector(char_vecs, train)

    def get_external_embeddings(self, external_embedding_file):
        external_embedding_fp = codecs.open(external_embedding_file,
                                            'r',
                                            encoding='utf-8')
        external_embedding_fp.readline()
        self.external_embedding = {}
        for line in external_embedding_fp:
            line = line.strip().split()
            self.external_embedding[line[0]] = [float(f) for f in line[1:]]

        external_embedding_fp.close()

        self.edim = len(self.external_embedding.values()[0])
        self.noextrn = [0.0 for _ in xrange(self.edim)]  #???
        self.extrnd = {
            word: i + 3
            for i, word in enumerate(self.external_embedding)
        }
        self.elookup = self.model.add_lookup_parameters(
            (len(self.external_embedding) + 3, self.edim))
        for word, i in self.extrnd.iteritems():
            self.elookup.init_row(i, self.external_embedding[word])
        self.extrnd['*PAD*'] = 1
        self.extrnd['*INITIAL*'] = 2

        print 'Load external embedding. Vector dimensions', self.edim
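
Unlike Example #1, this constructor takes the vocabulary pieces as separate arguments rather than a single tuple. A minimal construction sketch with illustrative values, assuming DyNet and this repo's BiLSTM class and no external embedding file:

import dynet as dy
from argparse import Namespace

model = dy.ParameterCollection()
options = Namespace(
    disable_bilstm=False, use_lembed=False, multiling=False,
    lstm_output_size=125, char_lstm_output_size=100,
    word_emb_size=100, char_emb_size=24, lang_emb_size=12,
    external_embedding=None)

words = {"the": 5}    # word -> training frequency (used for word dropout)
w2i = {"the": 0}      # word -> raw index (shifted by +2 internally)
ch = ["t", "h", "e"]
rels = ["det"]

extractor = FeatureExtractor(model, options, words, rels, langs=None,
                             w2i=w2i, ch=ch, nnvecs=2)
dy.renew_cg()
extractor.Init()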