def build_char_lang():
    """Build a character-level Lang vocabulary of punctuation, digits, and lowercase letters."""
    lang = Lang()
    lang.word2index = dict()
    lang.index2word = dict()
    lang.n_words = 0
    chars = "!\"$%&'()*+,-./0123456789:;<>?[]abcdefghijklmnopqrstuvwxyz"
    for c in chars:
        lang.addWord(c)
    return lang
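# Hypothetical usage sketch (not part of the original module): the returned Lang maps
# each supported character to an integer index, suitable for feeding a character-level
# encoder such as WordEncoderBiRNN.
#
#   char_lang = build_char_lang()
#   indices = [char_lang.word2index[c] for c in "parse-2.0"]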
def __init__(self, word_vectors, max_length, char_embed=False, seeder=int(time.time())):
    super(PreTrainedEmbeddingEncoderBiRNN, self).__init__(word_vectors.vector_size, max_length, seeder=seeder)
    self.model_type = 'pre_trained_embedding'
    # pre-trained word vector embedding
    self.word_vectors = word_vectors
    # all-zero vector used for out-of-vocabulary words
    # (zero initialization assumed; the original value is not defined before this point)
    self.empty_vector = Variable(torch.zeros(self.hidden_size)).view(1, 1, -1)
    # optional character-based encoder
    self.char_embed = char_embed
    if self.char_embed:
        # character-level vocabulary restricted to lowercase letters and digits
        lang = Lang()
        lang.word2index = dict()
        lang.index2word = dict()
        lang.n_words = 0
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789'
        for c in chars:
            lang.addWord(c)
        self.charbased_model = WordEncoderBiRNN(self.hidden_size // 2, params.CHAR_LENGTH, lang, seeder=seeder)
    # word vector for start of string
    sos = torch.ones(self.hidden_size)
    self.sos_vector = Variable(sos).view(1, 1, -1)
    # word vector for end of string
    eos = torch.ones(self.hidden_size) * -1
    self.eos_vector = Variable(eos).view(1, 1, -1)
    if params.USE_CUDA:
        self.cuda()
        self.empty_vector = self.empty_vector.cuda()
        self.sos_vector = self.sos_vector.cuda()
        self.eos_vector = self.eos_vector.cuda()
    # cache of token -> vector lookups; SOS/EOS vectors are pre-populated
    self.cache_dict = dict()
    self.cache_dict[params.SOS_TOKEN] = self.sos_vector
    self.cache_dict[params.EOS_TOKEN] = self.eos_vector
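# Hypothetical usage sketch (assumptions: `word_vectors` is any object exposing
# `vector_size` and per-word vector lookups, e.g. a gensim KeyedVectors model;
# the file name and max_length below are illustrative only).
#
#   from gensim.models import KeyedVectors
#   vectors = KeyedVectors.load_word2vec_format('embeddings.w2v.txt')
#   encoder = PreTrainedEmbeddingEncoderBiRNN(vectors, max_length=50, char_embed=True)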