import numpy as np
from keras.layers import Embedding


def pretrained_word_emb(vocab, emb_dim):
    """Return an Embedding layer initialised with pre-trained word vectors."""
    word2emb = vocab['word'].load_word2emb()
    # Build the weight matrix up front and pass it to the layer, rather than
    # reading weights from a layer that has not been built yet.
    W = np.random.uniform(-0.05, 0.05, (len(vocab['word']), emb_dim))
    for i, word in enumerate(word2emb.keys()):
        W[i] = word2emb[word]
    word_emb = Embedding(len(vocab['word']), emb_dim, weights=[W])
    return word_emb
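# A minimal usage sketch for pretrained_word_emb: it wires the returned Embedding
# into a small averaging classifier. The function name, seq_len and n_classes are
# illustrative assumptions, not taken from the source; only the (vocab, emb_dim)
# contract above is assumed.
from keras.layers import Input, Dense, Lambda
from keras.models import Model
from keras import backend as K


def tiny_classifier(vocab, emb_dim, seq_len, n_classes):
    words = Input(shape=(seq_len,), dtype='int32')
    embedded = pretrained_word_emb(vocab, emb_dim)(words)      # (batch, seq_len, emb_dim)
    averaged = Lambda(lambda x: K.mean(x, axis=1),
                      output_shape=(emb_dim,))(embedded)       # bag-of-words average
    probs = Dense(n_classes, activation='softmax')(averaged)
    model = Model(inputs=words, outputs=probs)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    return model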
def run_model(self):
    vocab_size = len(self.word2id)
    input_target = Input((1,))
    input_context = Input((1,))
    embedding = Embedding(vocab_size, self.vector_dim, input_length=1, name='embedding')
    target = embedding(input_target)
    target = Reshape((self.vector_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((self.vector_dim, 1))(context)
    # The dot product of the two word vectors serves as the similarity score.
    dot_product = dot([target, context], axes=1, normalize=False)
    dot_product = Reshape((1,))(dot_product)
    # output = Dense(1, activation='sigmoid')(dot_product)
    model = Model(inputs=[input_target, input_context], outputs=dot_product)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')

    target_arr = np.zeros((1,))
    context_arr = np.zeros((1,))
    label_arr = np.zeros((1,))
    word_target, word_context, labels = zip(*self.train_data)

    for cnt in range(self.epochs):
        idx = np.random.randint(0, len(labels) - 1)
        target_arr[0] = word_target[idx]
        context_arr[0] = word_context[idx]
        label_arr[0] = labels[idx]
        loss = model.train_on_batch([target_arr, context_arr], label_arr)
        if cnt % 100 == 0:
            print("Iteration {}, loss={}".format(cnt, loss))

    weights = embedding.get_weights()[0]
    words_embeddings = {w: weights[idx] for w, idx in self.word2id.items()}
    return words_embeddings
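# A hedged sketch of how the (target, context, label) triples consumed by run_model
# above might be prepared with Keras' skipgrams(). The function name and the choice
# of window size are illustrative assumptions; only the triple format is taken from
# the code above.
from keras.preprocessing.sequence import skipgrams, make_sampling_table


def build_train_data(token_ids, vocab_size, window_size=4):
    sampling_table = make_sampling_table(vocab_size)
    pairs, labels = skipgrams(token_ids, vocab_size,
                              window_size=window_size,
                              sampling_table=sampling_table)
    # run_model() expects an iterable of (target, context, label) triples.
    return [(t, c, l) for (t, c), l in zip(pairs, labels)]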
loss_plot = TensorBoard(log_dir=train_name + '_logs', write_graph=False, embeddings_freq=10)
earlystopping = EarlyStopping(monitor='loss', min_delta=0.0001, patience=1, verbose=1, mode='auto')

# How many times per epoch we will ask the batch generator to yield a batch
steps = no_train_pairs // batch_size

# Let's start training!
start = time.time()
history = keras_model.fit_generator(
    batch_generator(wordpairs, vocabulary, vocab_size, negative, batch_size),
    callbacks=[sim_cb, loss_plot, earlystopping],
    steps_per_epoch=steps,
    epochs=10,
    workers=cores,
    verbose=1)
end = time.time()
print('Training took:', int(end - start), 'seconds', file=sys.stderr)

# Saving the resulting vectors:
filename = train_name + '.vec.gz'
save_word2vec_format(filename, vocabulary, word_embedding_layer.get_weights()[0])

backend.clear_session()
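# save_word2vec_format is called above but not shown; a minimal sketch, assuming
# `vocabulary` maps each word to its embedding-matrix row index and that the plain
# gzipped word2vec text format ("word v1 v2 ... vN" per line) is wanted.
import gzip


def save_word2vec_format(path, vocabulary, weights):
    with gzip.open(path, 'wt', encoding='utf-8') as out:
        out.write('{} {}\n'.format(len(vocabulary), weights.shape[1]))
        for word, idx in vocabulary.items():
            vector = ' '.join('{:.6f}'.format(x) for x in weights[idx])
            out.write('{} {}\n'.format(word, vector))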
                                 concat_axis=-1)

# The dot products are outputted
model = Model(input=[word_index, context, negative_samples],
              output=[word_context_product, negative_context_product])

# binary crossentropy is applied on the output
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
print(model.summary())

# model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary),
#                     samples_per_epoch=G.train_words, nb_epoch=1)
model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary),
                    samples_per_epoch=10, nb_epoch=1)

# Save the trained embedding
S.save_embeddings("embedding.txt", shared_embedding_layer.get_weights()[0], vocabulary)

# input_context = np.random.randint(10, size=(1, context_size))
# input_word = np.random.randint(10, size=(1,))
# input_negative = np.random.randint(10, size=(1, G.negative))
# print "word, context, negative samples"
# print input_word.shape, input_word
# print input_context.shape, input_context
# print input_negative.shape, input_negative
# output_dot_product, output_negative_product = model.predict([input_word, input_context, input_negative])
# print "word cbow dot product"
# print output_dot_product.shape, output_dot_product
# print "cbow negative dot product"
# print output_negative_product.shape, output_negative_product
              + str(args.regularize)

# create a secondary validation model to run our similarity checks during training
similarity = dot([word_embedding, context_embedding], axes=1, normalize=True)
validation_model = Model(inputs=[word_index, context_index], outputs=[similarity])
sim_cb = helpers.SimilarityCallback(validation_model=validation_model)

loss_plot = TensorBoard(log_dir=train_name + '_logs', write_graph=False)
earlystopping = EarlyStopping(monitor='loss', min_delta=0.0001, patience=1, verbose=1, mode='auto')

# How many times per epoch we will ask the batch generator to yield a batch?
steps = no_train_pairs // batch_size

# Let's start training!
start = time.time()
history = keras_model.fit_generator(
    helpers.batch_generator(wordpairs, vocab_dict, vocab_size, negative, batch_size,
                            args.use_neighbors, neighbors_count),
    callbacks=[sim_cb, loss_plot, earlystopping],
    steps_per_epoch=steps,
    epochs=args.epochs,
    workers=cores,
    verbose=2)
end = time.time()
print('Training took:', int(end - start), 'seconds', file=sys.stderr)

# Saving the resulting vectors:
filename = train_name + '_' + run_name + '.vec.gz'
helpers.save_word2vec_format(filename, vocab_dict, word_embedding_layer.get_weights()[0])

backend.clear_session()
negative_words_embedding = shared_embedding_layer(negative_samples)

# Now the context words are averaged to get the CBOW vector
cbow = Lambda(lambda x: K.mean(x, axis=1),
              output_shape=(G.embedding_dimension,))(context_embeddings)

# The context is multiplied (dot product) with current word and negative sampled words
word_context_product = merge([word_embedding, cbow], mode='dot')
negative_context_product = merge([negative_words_embedding, cbow], mode='dot',
                                 concat_axis=-1)

# The dot products are outputted
model = Model(input=[word_index, context, negative_samples],
              output=[word_context_product, negative_context_product])

# binary crossentropy is applied on the output
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
print(model.summary())

# model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary),
#                     samples_per_epoch=G.train_words, nb_epoch=1)
model.fit_generator(V_gen.pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary),
                    samples_per_epoch=10000, nb_epoch=10)

# Save the trained embedding
S.save_embeddings("word2vec_50.txt", shared_embedding_layer.get_weights()[0], vocabulary)

# input_context = np.random.randint(10, size=(1, context_size))
# input_word = np.random.randint(10, size=(1,))
# input_negative = np.random.randint(10, size=(1, G.negative))
# print "word, context, negative samples"
# print input_word.shape, input_word
# print input_context.shape, input_context
# print input_negative.shape, input_negative
# output_dot_product, output_negative_product = model.predict([input_word, input_context, input_negative])
# print "word cbow dot product"
# print output_dot_product.shape, output_dot_product
# print "cbow negative dot product"
# print output_negative_product.shape, output_negative_product
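# V_gen.pretraining_batch_generator is referenced above but not shown; a minimal sketch,
# assuming sentences are lists of tokens, `vocabulary` maps words to ids with 0 reserved
# for padding, G.window_size / G.negative are attributes of the settings module, and the
# model's two outputs take labels of shape (1, 1) and (1, G.negative). All of these are
# assumptions for illustration, not the original implementation.
import numpy as np


def pretraining_batch_generator(sentences, vocabulary, reverse_vocabulary):
    vocab_size = len(vocabulary)
    while True:
        for sentence in sentences:
            ids = [vocabulary[w] for w in sentence if w in vocabulary]
            for pos, word in enumerate(ids):
                left = max(0, pos - G.window_size)
                context = ids[left:pos] + ids[pos + 1:pos + 1 + G.window_size]
                if not context:
                    continue
                # pad/trim the context window to a fixed length (0 = padding index)
                context = (context + [0] * (2 * G.window_size))[:2 * G.window_size]
                negatives = np.random.randint(1, vocab_size, size=G.negative)
                yield ([np.array([word]), np.array([context]), negatives.reshape(1, -1)],
                       [np.ones((1, 1)), np.zeros((1, G.negative))])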
class LSTMEncDec: __LOSS_FUNCS__ = ('mean_squared_error', 'categorical_crossentropy') __DECODER_BUILDS__ = None def __init__(self, word_vec, word_to_index, index_to_word, weight_file=None, enc_layer_output=(32, ), dec_layer_output=(32, ), learning_rate=0.001, sequence_len=200, output_len=2000, directory='.', out_type=0, decoder_type=0): """ :param out_type: 0: word vector output/similarity inference 1: for softmax word distribution output. :param decoder_type: 0: non-readout LSTM decoder 1: recurrentshop's readout decoder 2: seq2seq decoder. """ self.__DECODER_BUILDS__ = (self.__build_repeat_decoder__, self.__build_readout_decoder__, self.__build_seq2seq_decoder__) self.word_to_index = word_to_index self.index_to_word = index_to_word self.sequence_len = sequence_len self.output_len = output_len self.directory = directory self.enc_layer_output = enc_layer_output self.dec_layer_output = dec_layer_output self.decoder_type = decoder_type self.out_type = out_type try: loss = self.__LOSS_FUNCS__[out_type] except IndexError: raise ValueError('Invalid output type %s.' % self.out_type) self.encoder = Sequential(name='Encoder') self.decoder = Sequential(name='Decoder') self.embed = None self.batch_size = 0 input_layer, output_layer = self.config_model(word_vec) self.model = Model(inputs=[input_layer], outputs=[output_layer]) if weight_file is not None: self.model.load_weights(weight_file) self.compile(learning_rate, loss) def config_model(self, word_vec): """ Creates the encoder-decoder structure and returns the symbolic input and output """ train_embed = True # Configure input layer input_layer = Input(shape=(self.sequence_len, ), name='Input') # Embedding layer should be initialized with a word-vector array and # not be trained as the output relies on the same array self.embed = Embedding(input_dim=np.size(word_vec, 0), output_dim=np.size(word_vec, 1), weights=[word_vec], trainable=train_embed, mask_zero=True, name='Embed') # Configure encoder network with the given output sizes. # Embedding for encoder only since decoder receives the question vector. self.encoder.add(self.embed) for el in self.enc_layer_output[:-1]: self.encoder.add( LSTM(el, return_sequences=True, consume_less='mem')) self.encoder.add(LSTM(self.enc_layer_output[-1]) ) # Final LSTM layer only outputs the last vector # Encoder outputs the question vector as a tensor with each time-step output being the final question vector question_vec = self.encoder(input_layer) # Configure decoder network with the given output sizes. # Layer connecting to encoder output try: self.__DECODER_BUILDS__[self.out_type]() except IndexError: raise ValueError('Invalid decoder type %s.' 
% self.decoder_type) if self.out_type == 0: # Final layer outputting a sequence of word vectors self.decoder.add( TimeDistributed( Dense(np.size(word_vec, 1), activation='linear'))) else: # Final layer outputting a sequence of word distribution vectors self.decoder.add( TimeDistributed( Dense(len(self.index_to_word), activation='softmax'))) output_layer = self.decoder(question_vec) return input_layer, output_layer def __build_repeat_decoder__(self): # Repeat the final vector for answer input self.decoder.add( RepeatVector(self.sequence_len, input_shape=(self.enc_layer_output[-1], ))) self.decoder.add( LSTM(self.dec_layer_output[0], input_shape=(self.sequence_len, self.enc_layer_output[-1]), name='ConnectorLSTM', return_sequences=True, consume_less='mem')) for dl in self.dec_layer_output[1:]: self.decoder.add( LSTM(dl, return_sequences=True, consume_less='mem')) def __build_readout_decoder__(self): self.decoder.add( RepeatVector( self.sequence_len, input_shape=(self.enc_layer_output[-1], ))) # Repeat the final vector for answer input # Using recurrentshop's container with readout container = RecurrentContainer(readout=True, return_sequences=True, output_length=self.sequence_len) if len(self.dec_layer_output) > 1: container.add( LSTMCell(output_dim=self.dec_layer_output[0], input_dim=self.enc_layer_output[-1])) for dl in self.dec_layer_output[1:-1]: container.add(LSTMCell(output_dim=dl)) container.add(LSTMCell(output_dim=self.enc_layer_output[-1])) else: container.add( LSTMCell(input_dim=self.enc_layer_output[-1], output_dim=self.enc_layer_output[-1])) if self.enc_layer_output[-1] != self.dec_layer_output[-1]: print( 'WARNING: Overriding final decoder output to %s for readout compatibility' % self.enc_layer_output[-1]) self.decoder.add(container) def __build_seq2seq_decoder__(self): # Using recurrentshop's decoder container container = RecurrentContainer( return_sequences=True, readout='add', output_length=self.sequence_len, input_shape=(self.enc_layer_output[-1], ), decode=True) if len(self.dec_layer_output) > 1: container.add( LSTMCell(output_dim=self.dec_layer_output[0], input_dim=self.enc_layer_output[-1])) for dl in self.dec_layer_output[1:-1]: container.add(LSTMCell(output_dim=dl)) container.add(LSTMCell(output_dim=self.enc_layer_output[-1])) else: container.add( LSTMCell(input_dim=self.enc_layer_output[-1], output_dim=self.enc_layer_output[-1])) if self.enc_layer_output[-1] != self.dec_layer_output[-1]: print( 'WARNING: Overriding final decoder output to %s for readout compatibility' % self.enc_layer_output[-1]) self.decoder.add(container) def compile(self, learning_rate, loss): if self.out_type == 0: metrics = ['mean_absolute_error'] else: metrics = [] self.model.compile(optimizer=RMSprop(lr=learning_rate), loss=loss, metrics=metrics, sample_weight_mode='temporal') def train(self, Xtrain, ytrain, nb_epoch, Xval=None, yval=None, train_mask=None, val_mask=None, batch_size=10, queries=None): """ Uses a generator to decompress labels from integers to hot-coded vectors batch-by-batch to save memory. See utils.generate_batch(). 
""" self.batch_size = batch_size callback = EncDecCallback(self, queries, True) logger = CSVLogger(self.directory + '/epochs.csv') nb_class = len(self.index_to_word) total_len = np.size(ytrain, 0) if self.out_type == 0: generator = utils.generate_vector_batch else: generator = utils.generate_batch if Xval is None or yval is None: self.model.fit_generator(generator(Xtrain, ytrain, self.embed.get_weights()[0], train_mask, nb_class, total_len, batch_size), steps_per_epoch=total_len / batch_size, workers=1, epochs=nb_epoch, callbacks=[callback, logger], verbose=1, max_q_size=1) else: self.model.fit_generator( generator(Xtrain, ytrain, self.embed.get_weights()[0], train_mask, nb_class, total_len, batch_size), steps_per_epoch=total_len / batch_size, epochs=nb_epoch, callbacks=[callback, logger], verbose=1, max_q_size=1, workers=1, validation_steps=Xval.shape[0] / self.batch_size, validation_data=generator(Xval, yval, self.embed.get_weights()[0], val_mask, nb_class, Xval.shape[0], batch_size)) def generate_response(self, query): """ Pre-processes a raw query string and return a response string """ tokens = nltk.word_tokenize(query.lower())[:self.sequence_len] indices = [ self.word_to_index[w] if w in self.word_to_index else self.word_to_index[UNKNOWN_TOKEN] for w in tokens ] indices.extend([0] * (self.sequence_len - len(indices))) indices = np.asarray(indices, dtype=np.int32).reshape( (1, self.sequence_len)) output = self.model.predict(indices, batch_size=1, verbose=0) vectors = self.embed.get_weights()[0] response = [] if self.out_type == 0: for word_vec in output[0]: word = self.index_to_word[utils.nearest_vector_index( vectors, word_vec)] if word == MASK_TOKEN: continue elif word == SENTENCE_END_TOKEN: break response.append(word) else: out_idx = np.argmax(output, axis=2) # noinspection PyTypeChecker for idx in out_idx[0]: word = self.index_to_word[idx] if word == MASK_TOKEN: continue elif word == SENTENCE_END_TOKEN: response.append(word) break response.append(word) return ' '.join(response) def generate_candidates(self, query, top=3): """ Generates a list of top candidates for each word position given a raw string query. Only applies for softmax model. """ tokens = nltk.word_tokenize(query.lower())[:self.sequence_len] indices = [ self.word_to_index[w] if w in self.word_to_index else self.word_to_index[UNKNOWN_TOKEN] for w in tokens ] indices.extend([0] * (self.sequence_len - len(indices))) indices = np.asarray(indices, dtype=np.int32).reshape( (1, self.sequence_len)) output = self.model.predict(indices, batch_size=1, verbose=0) vectors = self.embed.get_weights()[0] response, candidates = [], [] if self.out_type == 0: for word_vec in output[0]: word = self.index_to_word[utils.nearest_vector_index( vectors, word_vec)] if word == MASK_TOKEN: continue elif word == lstm.tokens.SENTENCE_END_TOKEN: break response.append(word) else: out_idx = utils.k_largest_idx(output, top) # noinspection PyTypeChecker for ca in out_idx[0]: word = self.index_to_word[ca[0]] if word == MASK_TOKEN: continue elif word == SENTENCE_END_TOKEN: response.append(word) break response.append(word) candidates.append([self.index_to_word[c] for c in ca]) return ' '.join(response), candidates def log(self, string='', out=True): f = open(self.directory + '/log.txt', mode='at') if out: print(string) print(string, file=f) f.close()
def compile_model(inputs, repeat):
    (vocab_size1, max_sent1, sent_maxlen1, query_maxlen1) = inputs[0]
    (vocab_size2, max_sent2, sent_maxlen2, query_maxlen2) = inputs[1]
    mvocab_size = vocab_size1
    if mvocab_size < vocab_size2:
        mvocab_size = vocab_size2

    story_input1 = Input((max_sent1, sent_maxlen1))
    story_input2 = Input((max_sent2, sent_maxlen2))
    query_input1 = Input((sent_maxlen1,))
    query_input2 = Input((sent_maxlen2,))

    # H = Dense(EMBED_HIDDEN_SIZE)
    embedBlayer1 = Embedding(vocab_size1, EMBED_HIDDEN_SIZE,
                             input_length=sent_maxlen1, init=INIT_WEIGHT)
    embedBlayer2 = Embedding(vocab_size2, EMBED_HIDDEN_SIZE,
                             input_length=sent_maxlen2, init=INIT_WEIGHT)
    embeddingBs = embedBlayer1(query_input1), embedBlayer2(query_input2)

    # u = Lambda(lambda x: K.sum(x, axis=2), output_shape=lambda s: (s[0], s[1]))(embeddingB)
    u = [
        Lambda(lambda x: K.sum(x, axis=1),
               output_shape=(EMBED_HIDDEN_SIZE,))(embeddingB)
        for embeddingB in embeddingBs
    ]

    embedlayer1 = None
    embedlayer2 = None
    for hop in range(HOPS):
        if hop == 0:
            embeddingAs = [embedBlayer1(story_input1), embedBlayer2(story_input2)]
        else:
            embeddingAs = [embedlayer1(story_input1), embedlayer2(story_input2)]

        # ms = Lambda(lambda x: K.sum(x, axis=3), output_shape=lambda s: (s[0], s[1], s[2]))(embeddingA)
        mss = [
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent1, EMBED_HIDDEN_SIZE))(embeddingAs[0]),
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent2, EMBED_HIDDEN_SIZE))(embeddingAs[1])
        ]
        dotproducts = [
            merge([mss[0], u[-2]], mode=row_wise_dot, output_shape=(max_sent1,)),
            merge([mss[1], u[-1]], mode=row_wise_dot, output_shape=(max_sent2,))
        ]
        # dotproduct = merge([ms, u], mode=row_wise_cos, output_shape=(max_sent,))
        probs = [Activation('softmax')(dotproduct) for dotproduct in dotproducts]

        embedlayer1 = Embedding(vocab_size1, EMBED_HIDDEN_SIZE,
                                input_length=sent_maxlen1, init=INIT_WEIGHT)
        embedlayer2 = Embedding(vocab_size2, EMBED_HIDDEN_SIZE,
                                input_length=sent_maxlen2, init=INIT_WEIGHT)
        embeddingCs = [embedlayer1(story_input1), embedlayer2(story_input2)]
        cs = [
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent1, EMBED_HIDDEN_SIZE))(embeddingCs[0]),
            Lambda(lambda x: K.sum(x, axis=2),
                   output_shape=(max_sent2, EMBED_HIDDEN_SIZE))(embeddingCs[1])
        ]
        c_temps = [
            Lambda(lambda x: tf.transpose(x, [0, 2, 1]),
                   output_shape=(EMBED_HIDDEN_SIZE, max_sent1))(cs[0]),
            Lambda(lambda x: tf.transpose(x, [0, 2, 1]),
                   output_shape=(EMBED_HIDDEN_SIZE, max_sent2))(cs[1])
        ]
        os = [
            merge([c_temp, prob], mode=row_wise_dot, output_shape=(EMBED_HIDDEN_SIZE,))
            for c_temp, prob in zip(c_temps, probs)
        ]
        newus = [
            merge([u[-2], os[0]], mode='sum', output_shape=(EMBED_HIDDEN_SIZE,)),
            merge([u[-1], os[1]], mode='sum', output_shape=(EMBED_HIDDEN_SIZE,))
        ]
        # u.append(H(newu))
        u.append(newus[0])
        u.append(newus[1])

    # Applying w matrix
    # dl = Dense(vocab_size, input_dim=(EMBED_HIDDEN_SIZE,))(u[-1])
    # Using last C as W per adjacent weight tying
    func1 = lambda x: tf.matmul(x, tf.transpose(embedlayer1.get_weights()[0], [1, 0]))
    func2 = lambda x: tf.matmul(x, tf.transpose(embedlayer2.get_weights()[0], [1, 0]))
    dls = [Lambda(func1)(newus[0]), Lambda(func2)(newus[1])]
    # Softmax heads; note that the model below is wired to the raw `dls` outputs,
    # so these `preds` tensors are defined but not used.
    preds = [
        Dense(vocab_size1, activation='softmax')(dls[0]),
        Dense(vocab_size2, activation='softmax')(dls[1])
    ]

    model = Model(input=[story_input1, query_input1, story_input2, query_input2],
                  output=dls)
    # opt = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    opt = SGD(lr=0.0, momentum=0.0, decay=0.0, nesterov=False)
    model.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model, [LearningRateScheduler(step_decay)]
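# The custom merge mode `row_wise_dot` and the schedule `step_decay` are referenced
# above but not shown; minimal sketches follow. They assume row_wise_dot receives
# [matrix of shape (batch, n, d), vector of shape (batch, d)] and returns per-row dot
# products, and that step_decay halves the learning rate every few epochs (the
# constants below are illustrative, not taken from the source).
from keras import backend as K


def row_wise_dot(tensors):
    matrix, vector = tensors                           # (batch, n, d), (batch, d)
    return K.batch_dot(matrix, vector, axes=[2, 1])    # -> (batch, n)


def step_decay(epoch):
    initial_lr, drop, epochs_per_drop = 0.01, 0.5, 5.0
    return initial_lr * (drop ** (epoch // epochs_per_drop))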
class HN(): def __init__(self, src_list, save_dir, data_directory, data_name, glove, word_coverage, non_test, initial_learning_rate, learning_rate_decay, optimizer_kwargs, adjust_learning_rate, clip_batch_size, set_min_batch_size, char_rnn, REMAINDER, CHAR, MAX_WORD_LENGTH, BUCKET_DIVISION, WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout, rnn, CHARACTER_RNN_DIMENSION, conv_unit_size, mse, TRAIN, pretrain, mode, mem_test=False, test=False, bucket_coverage=None, max_n_epoch=55, cnn_window_size=7, memory=False, optimizer_name='rmsprop', std_batch_size=64, embedding_regularizer_coefficient=None, recurrent_activation='sigmoid', conv_dense_activation='tanh', entire_char_size=50, kernel_regularizer_coefficient=None, sec_period=30, memory_fraction=1, stack=[1, 1], RNN_DIMENSION=50, trainable_word_emb=True, year=None, flag_embedding_layer_mask_zero=True, rnn_implementation=2, tanh2_dropout=0, patience=3, complete_pretrain=False): save_filename = 'model.h5' if os.path.isfile(os.path.join(save_dir, save_filename)): print('===== will load weights ======') else: # create new path save_dir = find_new_dir(save_dir) # copy source codes save_src(save_dir, src_list) # open log file log_path = os.path.join(save_dir, 'log.txt') self.log, sys.stdout = open(log_path, 'a', 1), open(log_path, 'a', 1) self.log.write('\n\n\n\n\n') # adjust config if test: data_name = 'small' max_n_epoch = 3 BUCKET_DIVISION = 2 conv_unit_size = 32 if set_min_batch_size and K.backend() == 'theano': adjust_learning_rate = False import theano theano.config.optimizer = 'fast_run' if pretrain: print('pre-train is impossible now due to fast-mem_test') sys.exit() # parameters self.log_path, self.sec_period, self.std_batch_size = log_path, sec_period, std_batch_size self.adjust_learning_rate = adjust_learning_rate self.best_validation_save_path = os.path.join(save_dir, 'best-accuracy.txt') self.best_save_path = os.path.join(save_dir, 'best.h5') self.model_save_path = os.path.join(save_dir, save_filename) self.mem_test_flag = mem_test self.learning_rate_decay = learning_rate_decay self.custom_objects = {'AttentionLayer': AttentionLayer} self.word_embedding_dim = 200 self.rnn_implementation = rnn_implementation self.trainable_word_emb, self.flag_embedding_layer_mask_zero = trainable_word_emb, flag_embedding_layer_mask_zero self.CHAR = CHAR self.REMAINDER = REMAINDER self.rnn, self.conv_dense_activation = rnn, conv_dense_activation self.recurrent_activation = recurrent_activation self.initial_learning_rate = initial_learning_rate self.TRAIN = TRAIN self.stack = stack self.pretrain, self.complete_pretrain = pretrain, complete_pretrain self.patience, self.mse, self.max_n_epoch = patience, mse, max_n_epoch self.MULTI_RNN_DIMENSION = RNN_DIMENSION # int(RNN_DIMENSION / math.sqrt(N_LAYERS)) self.conv_unit_size = conv_unit_size self.embedding_regularizer_coefficient = embedding_regularizer_coefficient self.kernel_regularizer_coefficient = kernel_regularizer_coefficient self.char_rnn_flag, self.CHARACTER_RNN_DIMENSION = char_rnn, CHARACTER_RNN_DIMENSION self.WordEmb_dropout, self.WordRnn_dropout, self.SentenceRnn_dropout, self.tanh2_dropout = WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout, tanh2_dropout self.optimizer_name, self.optimizer_kwargs = optimizer_name, optimizer_kwargs self.save_dir = save_dir self.mode = mode # prepare dataset self.path = Path(directory_path=data_directory, name=data_name, glove=glove, year=year) MEMORY = self.memory_control(memory_fraction) if memory: MEMORY = memory self.log.write('MEMORY: 
{}\n'.format(MEMORY)) self.dataset = Dataset(log_path, self.path, set_min_batch_size=set_min_batch_size, bucket_coverage=bucket_coverage, clip_batch_size=clip_batch_size, non_test_flag=non_test, word_coverage=word_coverage, save_dir=save_dir, MAX_WORD_LENGTH=MAX_WORD_LENGTH, MAX_N_CHARACTER=entire_char_size) self.dataset.bucketize2(BUCKET_DIVISION, MEMORY) self.dataset.import_word_embedding(EMBEDDING_DIM=200) self.log.write( 'vocabulary_size={}, embedding_matrix_size, EMBEDDING_DIM = {}, {}\n' .format(len(self.dataset.word_count), len(self.dataset.word_embedding_matrix), self.word_embedding_dim)) self.dataset.std_batch_size = std_batch_size self.max_word_length = self.dataset.MAX_WORD_LENGTH # print configurations self.log.write('test={} mem_test={} memory_fraction = {}\n'.format( test, mem_test, memory_fraction)) self.log.write('data_name = {} {}\n'.format(data_name, year)) self.log.write( 'set_min_batch_size = {} bucket_coverage = {}\n'.format( set_min_batch_size, bucket_coverage)) self.log.write('TRAIN = {} mse = {}\n'.format(TRAIN, mse)) self.log.write('mode = {}'.format(self.mode)) self.log.write( 'glove = {}, non_test = {} word_coverage = {}\n'.format( glove, non_test, word_coverage)) self.log.write( 'entire_char_size={}, MAX_WORD_LENGTH = {}, BUCKET_DIVISION = {}\n' .format(entire_char_size, MAX_WORD_LENGTH, BUCKET_DIVISION)) self.log.write('save_dir = {}\n'.format(save_dir)) self.log.write('log_path = {}\n'.format(log_path)) self.log.write( 'optimizer_kwargs = {} initial_learning_rate = {}\n'.format( optimizer_kwargs, initial_learning_rate)) self.log.write( 'learning_rate_decay = {}\n'.format(learning_rate_decay)) self.log.write('backend: {}\n'.format(K.backend())) self.log.write('REMAINDER = {}\n'.format(REMAINDER)) self.log.write('CHAR = {}, char_rnn = {}\n'.format(CHAR, char_rnn)) self.log.write('embedding_regularizer_coefficient = {}\n'.format( embedding_regularizer_coefficient)) self.log.write('kernel_regularizer_coefficient = {}\n'.format( kernel_regularizer_coefficient)) self.log.write('rnn = {}, stack = {} conv_unit_size={}\n'.format( rnn, stack, conv_unit_size)) self.log.write('optimizer_name = {}\n'.format(optimizer_name)) self.log.write('cnn_window_size = {}\n'.format(cnn_window_size)) self.log.write( 'CHARACTER_RNN_DIMENSION = {}\n'.format(CHARACTER_RNN_DIMENSION)) self.log.write('std_batch_size = {}\n'.format(std_batch_size)) self.log.write( 'WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout = {}, {}, {}\n' .format(WordEmb_dropout, WordRnn_dropout, SentenceRnn_dropout)) self.log.write('pretrain = {} trainable_word_emb = {}\n'.format( pretrain, trainable_word_emb)) self.log.write( 'adjust_learning_rate = {}\n'.format(adjust_learning_rate)) self.log.write('tanh2_dropout = {}\n'.format(tanh2_dropout)) self.log.write('patience ={}\n'.format(patience)) self.log.write('clip_batch_size = {}\n'.format(clip_batch_size)) self.log.write( 'conv_dense_activation = {} recurrent_activation = {}\n'.format( conv_dense_activation, recurrent_activation)) self.make_layers() def run(self): if self.mem_test_flag is True: self.mem_test() self.run_without_mem_test(compiled=True) else: self.run_without_mem_test() print('=== finished ===') def mem_test(self): self.dataset.mem_test, self.mem_test_flag = True, True print('=== memory test ====') self.run_without_mem_test() self.dataset.mem_test, self.mem_test_flag = False, False def make_layers(self): self.word_embedding_layer = Embedding( len(self.dataset.word_embedding_matrix), self.word_embedding_dim, embeddings_initializer=Constant( 
self.dataset.word_embedding_matrix), trainable=self.trainable_word_emb, mask_zero=self.flag_embedding_layer_mask_zero, name='word_emb', embeddings_regularizer=l2(self.embedding_regularizer_coefficient)) self.word_rnn = self.stack_rnn(self.stack[0]) self.tanh1 = self.make_dense(DIMENSION_ATTENTION, activation='tanh') self.att1 = AttentionLayer(name='att1') self.rnn2 = self.stack_rnn(self.stack[1]) self.tanh2 = self.make_dense(DIMENSION_ATTENTION, activation='tanh') self.att2 = AttentionLayer(name='att2') self.logit = self.make_dense(self.dataset.n_classes, activation='softmax', name='logit') if self.optimizer_name == 'rmsprop': self.optimizer = optimizers.RMSprop(lr=self.initial_learning_rate, **self.optimizer_kwargs) elif self.optimizer_name == 'sgd': self.optimizer = optimizers.SGD(lr=self.initial_learning_rate, momentum=0.9, **self.optimizer_kwargs) elif self.optimizer_name == 'adam': self.optimizer = optimizers.Adam(lr=self.initial_learning_rate, **self.optimizer_kwargs) else: self.log.write('unknown optimizer name') sys.exit(1) if self.CHAR: self.dataset.make_character_embedding_index() self.character_embedding_layer = Embedding( len(self.dataset.char_embedding_index), len(self.dataset.char_embedding_index) - 2, # weights=[self.dataset.char_embedding_matrix], embeddings_initializer=Constant( self.dataset.char_embedding_matrix), mask_zero=self.char_rnn_flag, trainable=True, name='ch_emb', embeddings_regularizer=l2( self.embedding_regularizer_coefficient)) if not self.char_rnn_flag: # character cnn self.conv1 = self.make_conv(self.conv_unit_size, 5) self.conv2 = self.make_conv(self.conv_unit_size, 2) else: # character rnn self.char_rnn = self.make_rnn(self.CHARACTER_RNN_DIMENSION, False) # temp self.word_linear = self.make_dense(self.word_embedding_dim, activation='linear') self.char_linear = self.make_dense(self.word_embedding_dim, activation='linear') self.max_tanh = self.make_dense(self.word_embedding_dim, activation='tanh') # layers for merging words and characters self.conv_dense = self.make_dense( self.word_embedding_dim, activation=self.conv_dense_activation) self.max_relu = self.make_dense(self.word_embedding_dim, activation='relu') def memory_control(self, memory_fraction): memory = { 'citron': 11169, 'apple': 8108, 'cacao': 4041, 'lime': 3300, 'tangerine': 4036 } # 'lime':3050 # memory['citron'] = memory['cacao'] # memory['apple'] = memory['cacao'] # memory['tangerine'] = memory['cacao'] # memory['lime'] = memory['cacao'] memory['durian'] = memory['citron'] # 11169 memory['lemon'] = memory['apple'] host = socket.gethostname() if self.CHAR: MEMORY = int(10 * memory[host] * memory_fraction * 0.8) if False and memory[host] > 11000 and memory_fraction > 0.85: MEMORY = int(0.8 * MEMORY) else: MEMORY = int(10 * memory[host] * memory_fraction * 0.8) if K.backend() == 'tensorflow' and memory_fraction < 0.85: import tensorflow as tf from keras.backend.tensorflow_backend import set_session tf_config = tf.ConfigProto() tf_config.gpu_options.per_process_gpu_memory_fraction = memory_fraction set_session(tf.Session(config=tf_config)) return MEMORY def make_conv(self, unit_size, kernel_size): return Conv1D(unit_size, kernel_size, activation='relu', kernel_regularizer=l2( self.kernel_regularizer_coefficient)) def stack_rnn(self, stack): rnn = [] for _ in range(stack): rnn += [self.make_bi_rnn()] return rnn def make_bi_rnn(self): return Bidirectional(self.make_rnn(self.MULTI_RNN_DIMENSION, True)) def make_rnn(self, dimension, return_sequences=False): if self.rnn == 'gru': rnn = GRU elif 
self.rnn == 'lstm': rnn = LSTM return rnn(dimension, return_sequences=return_sequences, implementation=self.rnn_implementation, recurrent_activation=self.recurrent_activation, kernel_regularizer=l2(self.kernel_regularizer_coefficient), recurrent_regularizer=l2( self.kernel_regularizer_coefficient)) def make_dense(self, dim, activation, name=None): return Dense(dim, activation=activation, name=name, kernel_regularizer=l2( self.kernel_regularizer_coefficient)) def char_to_word_model( self, max_word_length): # char_i_input: (batch_size, word-length) char_i_input = Input(shape=(max_word_length, ), dtype='int32', name='word_ch_input') embedded_characters = self.character_embedding_layer(char_i_input) if not self.char_rnn_flag: conv_tensor = self.conv1(embedded_characters) conv_tensor = MaxPooling1D(3)(conv_tensor) conv_tensor = self.conv2(conv_tensor) conv_tensor = MaxPooling1D(2)(conv_tensor) # conv_out_shape = K.int_shape(conv_tensor) # output = Flatten()(conv_tensor) # flatt_out_shape = (conv_out_shape[0], conv_out_shape[1]*conv_out_shape[2]) # output = Lambda(lambda x: Flatten()(x), output_shape=flatt_out_shape)(conv_tensor) output = MyFlat()(conv_tensor) else: output = self.char_rnn(embedded_characters) print('flatten: ', output) model = Model(char_i_input, output) print('model.output_shape=', model.output_shape) return model def embedded_word_to_sentence(self, max_sentence_length, embed_dim): embedded_word = Input(shape=(max_sentence_length, embed_dim), dtype='float32', name='emb_word_in') masked = Masking()(embedded_word) masked = Dropout(self.WordEmb_dropout)(masked) for i in range(self.stack[0]): masked = self.word_rnn[i](masked) wordRnn = Dropout(self.WordRnn_dropout)(masked) word_tanh = self.tanh1(wordRnn) # word_tanh = Dropout(self.WordRnn_dropout)(word_tanh) attention = self.att1(word_tanh) sentenceEmb = Multiply()([wordRnn, attention]) sentenceEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]))(sentenceEmb) modelSentence = Model(embedded_word, sentenceEmb) # print('word_to_sentence model summary') # print(modelSentence.summary()) # modelSentAttention = Model(embedded_word, attention) return modelSentence def embed_word_document(self, wordsInputs): embedded = TimeDistributed(self.word_embedding_layer)(wordsInputs) return Masking()(embedded) def embed_char_document(self, char_input): # char_input: (batch_size, n_sentences, sentence_length, word_length) if False: model = self.char_to_word_model( self.max_word_length ) # char_i_input: (batch_size, word-length) return TimeDistributed(TimeDistributed(model))(char_input) else: embedded = TimeDistributed( TimeDistributed(self.character_embedding_layer))(char_input) conv_tensor = TimeDistributed(TimeDistributed( self.conv1))(embedded) conv_tensor = TimeDistributed(TimeDistributed( MaxPooling1D(3)))(conv_tensor) conv_tensor = TimeDistributed(TimeDistributed( self.conv2))(conv_tensor) conv_tensor = TimeDistributed(TimeDistributed( MaxPooling1D(2)))(conv_tensor) output = TimeDistributed(TimeDistributed(Flatten()))(conv_tensor) output = self.conv_dense(output) return output def embedded_word_to_document(self, max_sentence_length, embed_dim, embedded, sentence_remainder=None, word_remainder=None): # input:(batch_size, sentence_size, sentence_length, embed_dim) # assume input is masked # embedded = append_word_remainder(word_remainder, words) # sentence level if self.REMAINDER: # expanded_word_remainder = K.expand_dims(word_remainder, axis=-1) embedded = Concatenate()([word_remainder, embedded]) embed_dim += 1 
modelSentence = self.embedded_word_to_sentence(max_sentence_length, embed_dim) sentenceEmbbeding = TimeDistributed(modelSentence)(embedded) # sentenceAttention = TimeDistributed(modelSentAttention)(embedded) # document level sentenceEmbbeding = Masking()(sentenceEmbbeding) if self.REMAINDER: # expanded_sentence_remainder = K.expand_dims(sentence_remainder, axis=-1) sentenceEmbbeding = Concatenate()( [sentence_remainder, sentenceEmbbeding]) for i in range(self.stack[1]): sentenceEmbbeding = self.rnn2[i](sentenceEmbbeding) # sentenceEmbbeding = Dropout(self.SentenceRnn_dropout)(sentenceEmbbeding) sentence_tanh = self.tanh2(sentenceEmbbeding) sentence_tanh = Dropout(self.tanh2_dropout)(sentence_tanh) attentionSent = self.att2(sentence_tanh) documentEmb = Multiply()([sentenceEmbbeding, attentionSent]) documentEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]), name="sum_att2")(documentEmb) documentOut = self.logit(documentEmb) return documentOut def append_word_remainder(self, word_remainder, words): if self.REMAINDER: words = concatenate([word_remainder, words]) return words def remainder_input_tensor(self, max_n_sentences, max_length): if self.REMAINDER: # sentence and word remainder return Input(shape=(max_n_sentences, 1), dtype='float32'), Input(shape=(max_n_sentences, max_length, 1), dtype='float32') else: return None, None def word_model(self, batch_size, max_n_sentences, max_length): # embed input sentence_remainder, word_remainder = self.remainder_input_tensor( max_n_sentences, max_length) documentInputs = Input(shape=(max_n_sentences, max_length), dtype='int32', name='word_input') embedded = self.embed_word_document(documentInputs) # masked output = self.embedded_word_to_document(max_length, self.word_embedding_dim, embedded, sentence_remainder, word_remainder) # model creation if not self.REMAINDER: model = Model([documentInputs], output) else: model = Model([documentInputs, sentence_remainder, word_remainder], output) # modelAttentionEv = Model(inputs=[documentInputs], outputs=[output, sentenceAttention, attentionSent]) self.compile_model(model) # self.compile_model(modelAttentionEv) return model def combined_model(self, batch_size, max_n_sentences, max_sentence_length, max_word_length): sentence_remainder, word_remainder = self.remainder_input_tensor( max_n_sentences, max_sentence_length) exists = Input(shape=(max_n_sentences, max_sentence_length), dtype='float32', name='exists') known = Input(shape=(max_n_sentences, max_sentence_length), dtype='float32', name='known') # embed word word_input = Input(shape=(max_n_sentences, max_sentence_length), dtype='int32', name='doc-wd_in') embedded_word = self.embed_word_document(word_input) # masked # embed char char_input = Input(shape=(max_n_sentences, max_sentence_length, max_word_length), dtype='int32', name='doc_ch_in') embedded_char = self.embed_char_document(char_input) # combine (masked during merging) if False: # concat_word = Concatenate()([embedded_char, embedded_word]) # concat_word = self.conv_dense(concat_word) embedded_word = self.max_relu(embedded_word) embedded_char = self.max_relu(embedded_char) concat_word = Maximum()([embedded_word, embedded_char]) # concat_word = self.max_tanh(concat_word) elif self.mode == 'max': # embedded_word = self.word_linear(embedded_word) # embedded_char = self.char_linear(embedded_char) concat_word = Maximum()([embedded_word, embedded_char]) # concat_word = self.max_tanh(concat_word) elif self.mode == 'switch': if True: concat_word = Switch(self.word_embedding_dim, 4)( 
[exists, known, embedded_word, embedded_char]) else: concat_word = embedded_char def expand(x): y = K.expand_dims(exists) return K.repeat_elements(y, self.word_embedding_dim, axis=3) def calc_output_shape(in_shape): return in_shape + (self.word_embedding_dim, ) expanded_exists = Lambda(expand, output_shape=calc_output_shape)(exists) concat_word = Multiply()([concat_word, expanded_exists]) # to document output = self.embedded_word_to_document(max_sentence_length, self.word_embedding_dim, concat_word, sentence_remainder, word_remainder) if not self.REMAINDER: model = Model([exists, known, word_input, char_input], output) else: model = Model([ exists, known, word_input, char_input, sentence_remainder, word_remainder ], output) self.compile_model(model) return model def compile_model(self, model): metrics = ['accuracy'] if self.mse: metrics += [argmax_mse] model.compile(loss='categorical_crossentropy', optimizer=self.optimizer, metrics=metrics) def make_models(self): self.models = [] if self.CHAR: compile_models = self.bucket_combined_models else: compile_models = self.bucket_word_models compile_models() self.log.write(str(self.models[0].summary()) + '\n') self.make_mini_batch_fn() self.my_keras = My_keras( self.mini_batch_fn, self.log_path, self.dataset, self.CHAR, self.REMAINDER, self.sec_period, self.dataset.n_classes, self.std_batch_size, adjust_learning_rate=self.adjust_learning_rate, mse=self.mse) return self.models def make_mini_batch_fn(self): def mini_batch_fn(is_train, bucket_i, xs, ys): if is_train: return self.models[bucket_i].train_on_batch(xs, ys) else: return self.models[bucket_i].test_on_batch(xs, ys) self.mini_batch_fn = mini_batch_fn def bucket_word_models(self): for i in range(len(self.dataset.bucket_bounds)): batch_size = self.dataset.bucket_batch_size[i] max_n_sentences = self.dataset.bucket_bounds[i][0] max_sentence_length = self.dataset.bucket_bounds[i][1] self.models += [ self.word_model(batch_size, max_n_sentences, max_sentence_length) ] def bucket_combined_models(self): for i in range(len(self.dataset.bucket_bounds)): batch_size = self.dataset.bucket_batch_size[i] max_n_sentences = self.dataset.bucket_bounds[i][0] max_sentence_length = self.dataset.bucket_bounds[i][1] self.models += [ self.combined_model(batch_size, max_n_sentences, max_sentence_length, self.dataset.MAX_WORD_LENGTH) ] def set_learning_rate(self, models, learning_rate): K.set_value(self.optimizer.lr, learning_rate) def run_epochs(self, models, save_path, max_n_epoch, pretrain=False, decay=None, partial_train=False, best_validation_accuracy=0): self.my_keras.models = models best_val_acc, n_patient = best_validation_accuracy, 0 learning_rate = self.initial_learning_rate if not decay: decay = self.learning_rate_decay for i in range(max_n_epoch): self.log.write('learning_rate = {}\n'.format(learning_rate)) if self.TRAIN: self.log.write('==== train ==== (epoch {})\n'.format(i + 1)) train_acc, train_loss, train_rmse = self.my_keras.train_model( self.dataset.train, learning_rate, partial_train=partial_train) self.log.write('========== validation ==========\n') val_acc, val_loss, val_rmse = self.my_keras.test_model( self.dataset.validation, partial_train=partial_train) self.log.write('========== test ==========\n') test_acc, test_loss, test_rmse = self.my_keras.test_model( self.dataset.test, partial_train=partial_train) self.log.write('{} '.format(i + 1)) if self.TRAIN: self.log.write('train loss = {:.5f} train rmse = {}'.format( train_loss, train_rmse)) self.log.write(' val_loss = {:.5f} val_rmse={}'.format( 
val_loss, val_rmse)) self.log.write(' test_loss = {:.5f} test_rmse={}\n'.format( test_loss, test_rmse)) if self.TRAIN: self.log.write('train_acc = {:.5f} '.format(train_acc)) self.log.write('val_acc = {:.5f} test_acc = {:.5f}\n'.format( val_acc, test_acc)) with open('acc.txt', 'a') as f: f.write( 'val_loss = {:.5f}, val_acc = {:.5f}, test_loss = {:.5f}, test accuracy = {:.5f}\n' .format(val_loss, val_acc, test_loss, test_acc)) self.print_emb_matrix() # save the current if not self.mem_test_flag: models[0].save_weights(save_path) # save the best prev_best_val_acc = best_val_acc if not self.mem_test_flag and val_acc >= best_val_acc + 0.0005: best_val_acc = val_acc models[0].save_weights(self.best_save_path) self.save_best_validation(best_val_acc) self.log.write('====== saved ======\n') # increace patience if the improvement is not enough if val_acc >= prev_best_val_acc + 0.0005: n_patient = 0 else: n_patient += 1 print('n_patient = {}'.format(n_patient)) if n_patient >= self.patience: break # the break condition for pretrain if pretrain and val_acc - best_val_acc < 0.01: break # terminate when pretrain and small difference learning_rate /= decay self.set_learning_rate(models, learning_rate) if self.mem_test_flag: break def load(self, models, save_path): if os.path.exists(save_path): with CustomObjectScope(self.custom_objects): models[0].load_weights(save_path) with open(self.best_validation_save_path, 'r') as f: best_val_acc = float(f.read( )) # error if there is no float value in the file self.log.write( '====== {} loaded (best validation accuracy = {}) =====\n'. format(save_path, best_val_acc)) return best_val_acc else: self.log.write( '======== failed loading .... NEW {} =========\n'.format( save_path)) self.print_emb_matrix('before save') models[0].save_weights( save_path) # save initial restore point for mem_test self.print_emb_matrix('after save') self.save_best_validation(0) return 0 def run_without_mem_test(self, compiled=False): if not self.CHAR: # only word self.log.write('========== word-model ==========\n') if not compiled: self.make_models() best_validation_accuracy = self.load(self.models, self.model_save_path) self.run_epochs(self.models, self.model_save_path, self.max_n_epoch, best_validation_accuracy=best_validation_accuracy) else: self.log.write('=== combined model ======\n') concat_save_path = os.path.join(self.save_dir, 'concat.h5') if not self.mem_test and self.pretrain and not os.path.exists( concat_save_path): # pre-train self.word_embedding_layer.trainable = False pretrain_models = self.make_models() pretrain_save_path = os.path.join(self.save_dir, 'pretrain.h5') if not self.complete_pretrain or not os.path.exists( pretrain_save_path): self.log.write('=== pretrain ====\n') best_validation_accuracy = self.load( pretrain_models, self.best_save_path) self.run_epochs( pretrain_models, pretrain_save_path, max_n_epoch=2, decay=1, best_validation_accuracy=best_validation_accuracy) else: self.log.write('=== finish pre-train ====\n') # convert to full-model self.load(pretrain_models, self.best_save_path) self.word_embedding_layer.trainable = True concat_models = self.make_models() concat_models[0].save_weights(concat_save_path) self.clear() if not compiled: self.make_models() best_validation_accuracy = self.load(self.models, self.model_save_path) self.run_epochs(self.models, self.model_save_path, self.max_n_epoch, best_validation_accuracy=best_validation_accuracy) def save_best_validation(self, accuracy): with open(self.best_validation_save_path, 'w') as f: 
f.write(str(accuracy)) def clear(self): print('cleared') if os.environ['KERAS_BACKEND'] == 'tensorflow': K.clear_session() self.make_layers() def print_emb_matrix(self, in_str=None): with open(os.path.join(self.save_dir, 'weights.txt'), 'a') as f: f.write('=======================\n') if in_str: f.write(in_str + '\n') f.write('word_rnn = {}\n'.format(self.word_rnn[0].get_weights())) f.write('word = {}\n'.format( self.word_embedding_layer.get_weights())) if self.CHAR: f.write('char = {}\n'.format( self.character_embedding_layer.get_weights())) f.write('conv1 = {}\n'.format(self.conv1.get_weights())) f.write('conv2 = {}\n'.format(self.conv2.get_weights()))
class LSTMLangModel: def __init__(self, word_vec, word_to_index, index_to_word, weight_file=None, learning_rate=0.001, sequence_len=2000, directory='./models/LM_debug', dropout=0.0, outputs=(32, )): self.word_vec = word_vec self.word_to_index = word_to_index self.index_to_word = index_to_word self.sequence_len = sequence_len self.directory = directory self.outputs = outputs self.dropout = dropout self.model = Sequential() self.embed = Embedding(input_dim=np.size(word_vec, 0), output_dim=np.size(word_vec, 1), weights=[word_vec], trainable=True, mask_zero=True, name='Embed', input_shape=(self.sequence_len, )) self.model.add(self.embed) for lo in outputs: self.model.add( LSTM(lo, implementation=1, return_sequences=True, dropout=self.dropout)) self.model.add( TimeDistributed( Dense(len(self.index_to_word), activation='softmax'))) if weight_file is not None: self.model.load_weights(weight_file) self.model.compile(RMSprop(lr=learning_rate), 'categorical_crossentropy', sample_weight_mode='temporal', metrics=[]) def train(self, Xtrain, ytrain, nb_epoch, Xval=None, yval=None, train_mask=None, val_mask=None, batch_size=10): """ Uses a generator to decompress labels from integers to hot-coded vectors batch-by-batch to save memory. See utils.commons.generate_batch(). """ callback = LangModelCallback(self) logger = CSVLogger(self.directory + '/epochs.csv') nb_class = len(self.index_to_word) total_len = np.size(ytrain, 0) generator = commons.generate_batch if Xval is None or yval is None: self.model.fit_generator(generator(Xtrain, ytrain, self.embed.get_weights()[0], train_mask, nb_class, total_len, batch_size), steps_per_epoch=total_len / batch_size, nb_worker=1, epochs=nb_epoch, callbacks=[callback, logger], verbose=1, max_q_size=1) else: self.model.fit_generator( generator(Xtrain, ytrain, self.embed.get_weights()[0], train_mask, nb_class, total_len, batch_size), steps_per_epoch=total_len / batch_size, epochs=nb_epoch, callbacks=[callback, logger], verbose=1, max_q_size=1, workers=1, validation_steps=Xval.shape[0] / batch_size, validation_data=generator(Xval, yval, self.embed.get_weights()[0], val_mask, nb_class, Xval.shape[0], batch_size)) def predict(self, query_tokens, top=None): if top is None: top = np.size(self.word_vec, 0) out_pos = len(query_tokens) - 1 indices = [ self.word_to_index[w] if w in self.word_to_index else self.word_to_index[tokens.UNKNOWN_TOKEN] for w in query_tokens ] indices.extend([0] * (self.sequence_len - len(indices))) indices = np.asarray(indices, dtype=np.int32).reshape( (1, self.sequence_len)) output = self.model.predict(indices, batch_size=1, verbose=0) dist = np.asarray(output[0][out_pos]) index_rank = np.flip(np.argsort(dist)[-top:], 0) result = [] for idx in index_rank: result.append((self.index_to_word[idx], dist[idx])) return result def log(self, string='', out=True): f = open(self.directory + '/log.txt', mode='at') if out: print(string) print(string, file=f) f.close() def save(self): f1 = self.directory + '/weights.hdf5' f2 = self.directory + '/config.pkl' f3 = self.directory + '/dictionary.npz' self.model.save_weights(f1) config = { 'seq_len': self.sequence_len, 'word_vec_dim': np.shape(self.word_vec), 'outputs': self.outputs, 'dropout': self.dropout } pickle.dump(config, open(f2, 'wb'), pickle.HIGHEST_PROTOCOL) np.savez(f3, wit=self.word_to_index, itw=self.index_to_word, wv=self.word_vec) logging.info('\nSaved model to %s' % self.directory) @staticmethod def load(directory): f1 = directory + '/weights.hdf5' f2 = directory + '/config.pkl' f3 = directory + 
'/dictionary.npz' logging.info('Loading model from %s...' % directory) try: config = pickle.load(open(f2, 'rb')) npz_file = np.load(f3) word_to_index, index_to_word, word_vec = npz_file["wit"].reshape( 1)[0], npz_file["itw"], npz_file["wv"].reshape( config['word_vec_dim']) logging.info('Done.') return LSTMLangModel(word_vec, word_to_index, index_to_word, weight_file=f1, sequence_len=config.get('seq_len', 2000), directory=directory, outputs=config.get('outputs', (32, )), dropout=config.get('dropout', 0.0)) except FileNotFoundError: print('One or more model files cannot be found. Terminating...') sys.exit()
target = Reshape((vector_dim, 1))(target)
context = embedding(input_context)
context = Reshape((vector_dim, 1))(context)

dot_product = merge([target, context], mode='dot', dot_axes=1)
dot_product = Reshape((1,))(dot_product)
# A single-unit softmax always outputs 1.0; the binary cross-entropy loss below
# needs a sigmoid here.
output = Dense(1, activation='sigmoid')(dot_product)
# output = Dense(len(set(labels)), activation='sigmoid')(dot_product)

model = Model(input=[input_target, input_context], output=output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['acc'])
print(model.summary())

epochs = int(sys.argv[2])
model.fit_generator(generator(word_target, word_context, labels, 100),
                    steps_per_epoch=100, epochs=epochs)

save_embeddings("embedding.txt", embedding.get_weights()[0], ValueIdentifierDict)
model = load_embedding("embedding.txt", 100)
# tsne_plot(model)
exit(0)
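# The batch generator passed to fit_generator above is not shown; a minimal sketch,
# assuming it should yield ([targets, contexts], labels) batches drawn at random from
# the skip-gram pairs. The signature mirrors the call site; the sampling strategy is
# an assumption.
import numpy as np


def generator(word_target, word_context, labels, batch_size):
    word_target = np.asarray(word_target, dtype='int32')
    word_context = np.asarray(word_context, dtype='int32')
    labels = np.asarray(labels, dtype='float32')
    while True:
        idx = np.random.randint(0, len(labels), size=batch_size)
        yield [word_target[idx], word_context[idx]], labels[idx]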
def collect_data():
    filename = "C:\\Users\\rkrit\\Documents\\CS 584_Data Mining\\source_code.zip"
    vocabulary, code = read_data(filename)
    code = np.squeeze(np.asarray(code))
    print(len(vocabulary))
    # print(vocabulary[:7])
    vocabulary_size = 83123
    sum = 0
    code3 = code
    # data, count, dictionary, reverse_dictionary =
    vocab_size = 83123
    window_size = 100
    data, count, dictionary, reverse_dictionary, code_data = build_dataset(
        vocabulary, vocabulary_size, code3)
    print(code_data)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size, shuffle=False)
    vocab_size = 83123
    print(data)
    vocab_size = 83123
    window_size = 100
    print(data[:7])

    import numpy as np
    window_size = 5
    vector_dim = 300
    epochs = 2000

    valid_size = 16      # Random set of words to evaluate similarity on.
    valid_window = 100   # Only pick dev samples in the head of the distribution.
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)

    sampling_table = sequence.make_sampling_table(16)
    couples, labels = skipgrams(data, vocab_size, window_size=window_size)
    word_target, word_context = zip(*couples)
    word_target = np.array(word_target, dtype="int32")
    word_context = np.array(word_context, dtype="int32")
    print(couples[:10], labels[:10])

    # create some input variables
    input_target = Input((1,))
    input_context = Input((1,))
    embedding = Embedding(vocab_size, vector_dim, input_length=1, name='embedding')
    target = embedding(input_target)
    target = Reshape((vector_dim, 1))(target)
    context = embedding(input_context)
    context = Reshape((vector_dim, 1))(context)

    print(len(embedding.get_weights()[0]))
    sum1 = 0
    final = []
    param = embedding.get_weights()[0]
    indices = []
    prev = 0
    for i in vocabulary:
        # note: `prev` is set to len(i) rather than accumulated, so successive
        # slices are not consecutive ranges over `param`
        indices.append([prev, len(i) + prev])
        prev = len(i)
    sum = 0
    for ind in indices:
        final.append(param[ind[0]:ind[1]])
        sum = sum + len(param[ind[0]:ind[1]])
    print(len(indices))
    print(sum)
    print(sum1)
    print(len(final))
    print(len(param))
    print(len(vocabulary[124]))
    print(len(final[124]))

    for code1, code2 in code_data:
        print("The cosine similarity for ", code1, " is ",
              K.eval(cos_distance(np.array(param[code2]),
                                  np.array(param[code2 + 100:code2 + 200]))))

    # setup a cosine similarity operation which will be output in a secondary model
    model = Sequential()
    target = keras.layers.Input(shape=(300,))
    model.add(target)
    context = keras.layers.Input(shape=(300,))
    model.add(Dense(units=50, activation='sigmoid')(context))
    print(couples[:10], labels[:10])
    print(K.eval(cos_distance(x1, x2)))

    model = Sequential()
    model.add(Dense(units=50, activation="relu", input_shape=(300,)))
    model.add(Dense(units=50, activation="relu", input_shape=(300,)))
    model.add(Dense(units=10, activation="softmax"))
    model.compile(optimizer=SGD(0.001), loss="binary_crossentropy", metrics=["accuracy"])

    # setup a cosine similarity operation which will be output in a secondary model
    similarity = merge([target, context], mode='cos', dot_axes=0)
    # now perform the dot product operation to get a similarity measure
    dot_product = merge([target, context], mode='dot', dot_axes=1)
    dot_product = Reshape((1,))(dot_product)
    # add the sigmoid output layer
    output = Dense(1, activation='sigmoid')(dot_product)
    # create the primary training model
    model = Model(input=[input_target, input_context], output=output)
    model.compile(loss='binary_crossentropy', optimizer='rmsprop')
    # create a secondary validation model to run our similarity checks during training
    validation_model = Model(input=[input_target, input_context], output=similarity)
    # del vocabulary  # Hint to reduce memory.
    return data, count, dictionary, reverse_dictionary
model = Model([word_input, doc_input], dense)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.fit([X_train_data_arr, X_train_doc_idx_arr], label_train,
          validation_data=([X_test_data_arr, X_test_doc_idx_arr], [label_test]),
          epochs=1, batch_size=128)
model.evaluate([X_train_data_arr, X_train_doc_idx_arr], label_train,
               batch_size=128, verbose=1, sample_weight=None)

em = embedding_layer_word.get_weights()[0]
em_norm = LA.norm(em, axis=1)
# norm = np.sqrt(np.reduce_sum(np.square(em), 1, keep_dims=True))
em_n = em / em_norm.reshape((20000, 1))
similarity = np.matmul(em_n, np.transpose(em_n))

i = 559
i = 358
top_k = 10
nearest = (-similarity[i, :]).argsort()[1:top_k + 1]
log = 'Nearest to %s:' % index_word[i]
for k in range(top_k):
    close_word = index_word[nearest[k]]
    log = '%s %s,' % (log, close_word)
print(log)
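# A small helper equivalent to the inline nearest-neighbour lookup above, shown as a
# hedged sketch: it assumes the normalised embedding matrix `em_n` and the `index_word`
# mapping defined in the preceding code; the function name is illustrative.
def nearest_words(word_idx, em_n, index_word, top_k=10):
    sims = em_n @ em_n[word_idx]               # cosine similarity to every word
    order = (-sims).argsort()[1:top_k + 1]     # skip the word itself
    return [index_word[j] for j in order]

# e.g. print('Nearest to %s: %s' % (index_word[358], ', '.join(nearest_words(358, em_n, index_word))))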