def main():
    """Train the Visual QA baseline and checkpoint its weights.

    Loads the training data, keeps only the ``max_answers`` most frequent
    answers, shuffles, and holds out ``args.valid_ratio`` of the training set
    for validation. The model is then trained batch by batch with
    ``train_on_batch``; the held-out loss is recomputed ``val_per_epoch``
    times per epoch. Weights are written to ``args.experiment_root`` every
    ``args.model_save_interval`` epochs and once more after the last epoch.
    """
    args = parse_arguments()
    data_root = args.dataroot
    experiment_root = args.experiment_root

    # Seed Python, numpy and TensorFlow so that runs are repeatable.
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    tf.set_random_seed(RANDOM_SEED)

    # Load data needed for training and save all parameters/mappings to make
    # sure experiments are reproducible.
    questions_train_all, answers_train_all, images_train_all = load_train_data(
        data_root)

    # Since we are simplifying the problem of Visual QA to a classification
    # problem in this baseline, we want to limit the number of possible
    # answers, and have the model simply pick the most appropriate one.
    max_answers = 1000
    questions_train_all, answers_train_all, images_train_all = \
        select_frequent_answers(questions_train_all, answers_train_all,
                                images_train_all, max_answers)

    # Encode the remaining (top max_answers) answers and save the mapping.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train_all)
    nb_classes = len(list(labelencoder.classes_))
    with open(pjoin(experiment_root, 'labelencoder.pkl'), 'wb') as pfile:
        pickle.dump(labelencoder, pfile)

    # The initial shuffle ensures that the train-val split is randomized
    # depending on the random seed, and not fixed every time (which would be
    # very bad).
    print("Performing initial shuffle...")
    questions_train_all, answers_train_all, images_train_all = sklearn_shuffle(
        questions_train_all, answers_train_all, images_train_all)

    train_all_count = len(questions_train_all)
    valid_count = int(train_all_count * args.valid_ratio)
    train_count = train_all_count - valid_count
    print("We have {0} total Q-A pairs. Will use {1:.2f}% for validation, "
          "which is {2} data points. {3} data points will be used for "
          "actual training.".format(train_all_count, args.valid_ratio * 100.0,
                                    valid_count, train_count))

    questions_train = questions_train_all[:train_count]
    answers_train = answers_train_all[:train_count]
    images_train = images_train_all[:train_count]

    # Note again that this is NOT the official validation set, but just a
    # fraction (`args.valid_ratio`) of the training set. The full validation
    # set evaluation is performed separately.
    questions_valid = questions_train_all[train_count:]
    answers_valid = answers_train_all[train_count:]
    images_valid = images_train_all[train_count:]

    # Construct the model.
    final_model, lang_model, img_model = construct_model(
        args, data_root, experiment_root, nb_classes)
    model = final_model.model

    # Compute val error K times per epoch. Clamp to at least 1: for small
    # training sets the quotient truncates to 0, and `% 0` below would raise
    # ZeroDivisionError.
    val_per_epoch = 4
    eval_valid_every = max(
        1, int((train_count / args.batch_size) / val_per_epoch))

    # TODO(andrei): Tensorboard-friendly dumps (keras.callbacks.TensorBoard)
    # only work when using Keras's 'fit' method directly.

    # The training part starts here.
    print('Training started...')
    # Placeholder shown in the progress bar until the first validation pass.
    last_valid_loss = 10
    for epoch in range(args.num_epochs):
        epoch_start_ms = int(time.time() * 1000)

        # Shuffle the data points before going through them.
        questions_train, answers_train, images_train = sklearn_shuffle(
            questions_train, answers_train, images_train)
        progbar = generic_utils.Progbar(len(questions_train))
        batches = batchify(args.batch_size, questions_train, answers_train,
                           images_train)
        for batch_idx, (qu_batch, an_batch, im_batch) in enumerate(batches):
            # Converts the answers to their index (we're just doing
            # classification at this point).
            y_batch = get_answers_matrix(an_batch, labelencoder)
            x_q_batch = lang_model.process_input(qu_batch)

            # Train on language only, or on language and image both.
            if args.language_only:
                loss = model.train_on_batch(x_q_batch, y_batch)
            else:
                x_i_batch = img_model.process_input(im_batch)
                loss = model.train_on_batch([x_q_batch, x_i_batch], y_batch)

            if (batch_idx + 1) % eval_valid_every == 0:
                # It's time to validate on the held-out part of the training
                # dataset.
                batch_val_losses = []
                val_batches = batchify(args.batch_size, questions_valid,
                                       answers_valid, images_valid)
                for qu_val_batch, an_val_batch, im_val_batch in val_batches:
                    y_val_batch = get_answers_matrix(an_val_batch,
                                                     labelencoder)
                    x_q_val_batch = lang_model.process_input(qu_val_batch)
                    if args.language_only:
                        val_loss = model.test_on_batch(x_q_val_batch,
                                                       y_val_batch)
                    else:
                        val_loss = model.test_on_batch([
                            x_q_val_batch,
                            img_model.process_input(im_val_batch)
                        ], y_val_batch)
                    batch_val_losses.append(val_loss)

                # The validation loss is just the average of the individual
                # losses computed for each batch of the validation data.
                # Keep the previous value when the held-out split is empty;
                # np.mean([]) would yield nan.
                if batch_val_losses:
                    last_valid_loss = np.mean(batch_val_losses)

            # Important: the progress bar averages the values it is given,
            # so the reported validation loss will have a bit of lag.
            progbar.add(args.batch_size,
                        values=[("tra-loss", loss),
                                ("val-loss", last_valid_loss)])

        epoch_end_ms = int(time.time() * 1000)
        epoch_delta_s = (epoch_end_ms - epoch_start_ms) / 1000.0
        print("Epoch {0}/{1} took {2:.1f}s.".format(
            (epoch + 1), args.num_epochs, epoch_delta_s))
        print("Latest validation loss: {0:4f}".format(last_valid_loss))

        # Dump a checkpoint periodically.
        if (epoch + 1) % args.model_save_interval == 0:
            model_dump_fname = pjoin(experiment_root,
                                     'weights_{0}.hdf5'.format(epoch + 1))
            print('Saving model to file: {0}'.format(model_dump_fname))
            model.save_weights(model_dump_fname)

        # Compute overall accuracy periodically on OFFICIAL full validation
        # set (but not too often, as it can get quite slow).
        if (epoch + 1) % args.model_eval_full_valid_interval == 0:
            # TODO(andrei): Implement this in a neat way.
            pass

    # TODO(Bernhard): catch control+c and store last parameters...

    # Final checkpoint dump. Named after args.num_epochs rather than the loop
    # variable, which is unbound when no epoch ran.
    model.save_weights(
        pjoin(experiment_root,
              'weights_{0}_final.hdf5'.format(args.num_epochs)))
def main():
    """Train the language-only two-layer LSTM answer classifier.

    Reads the preprocessed questions/answers/images, keeps the
    ``max_answers`` most frequent answers, sorts the samples by question
    length, builds and compiles the Keras model, then trains with
    ``train_on_batch`` for ``-num_epochs`` epochs. The label encoder, the
    model JSON and periodic weight checkpoints are written under
    ``../models/``.
    """

    def _read_lines(path):
        """Return the UTF-8 decoded lines of ``path``, closing the file."""
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units', type=int, default=512)
    # NOTE: -num_lstm_layers only ends up in the output file name; the model
    # below always stacks two LSTM layers.
    parser.add_argument('-num_lstm_layers', type=int, default=2)
    parser.add_argument('-dropout', type=float, default=0.2)
    # NOTE: -activation is parsed but never read; the first LSTM hard-codes
    # 'tanh'.
    parser.add_argument('-activation', type=str, default='tanh')
    parser.add_argument('-num_epochs', type=int, default=100)
    parser.add_argument('-model_save_interval', type=int, default=5)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-word_vector', type=str, default='')
    args = parser.parse_args()

    questions_train = _read_lines(
        '../data/preprocessed/questions_train2014.txt')
    questions_lengths_train = _read_lines(
        '../data/preprocessed/questions_lengths_train2014.txt')
    answers_train = _read_lines('../data/preprocessed/answers_train2014.txt')
    images_train = _read_lines('../data/preprocessed/images_train2014.txt')

    # The lengths are not passed through selectFrequentAnswers, so remember
    # each question's length now and look it up again after filtering;
    # zipping the unfiltered lengths with the filtered questions would pair
    # up unrelated rows. Lengths are converted to int so that the sort below
    # is numeric ('10' sorts before '9' as a string).
    # Assumes the lengths file is line-aligned with the questions file and
    # that the filter returns question strings unchanged -- TODO confirm.
    length_by_question = dict(
        (question, int(length))
        for question, length in zip(questions_train, questions_lengths_train))

    max_answers = 1000
    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, max_answers)
    questions_lengths_train = [
        length_by_question[question] for question in questions_train]

    print('Loaded questions, sorting by length...')
    # NOTE(review): images_train is not reordered here; it is not used for
    # training in this script.
    questions_lengths_train, questions_train, answers_train = (
        list(column) for column in zip(*sorted(
            zip(questions_lengths_train, questions_train, answers_train))))

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    max_len = 30  # 25 is max for training, 27 is max for validation
    word_vec_dim = 300

    model = Sequential()
    model.add(LSTM(output_dim=args.num_hidden_units,
                   activation='tanh',
                   return_sequences=True,
                   input_shape=(max_len, word_vec_dim)))
    model.add(Dropout(args.dropout))
    model.add(LSTM(args.num_hidden_units, return_sequences=False))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_file_name = ('../models/lstm_language_only_num_hidden_units_'
                       + str(args.num_hidden_units)
                       + '_num_lstm_layers_' + str(args.num_lstm_layers)
                       + '_dropout_' + str(args.dropout))
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    print('Compiling model...')
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation done...')

    # Set up word vectors: default is Goldberg but GLOVE is preferred.
    if args.word_vector == 'glove':
        nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    else:
        nlp = English()
    print('loaded ' + args.word_vector + ' word2vec features...')

    # Training.
    print('Training started...')
    for k in xrange(args.num_epochs):
        progbar = generic_utils.Progbar(len(questions_train))
        # Pad the final short batch with the LAST (longest) sample so that
        # qu_batch[-1] is still the longest question of the batch.
        batches = zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            # Questions are sorted in ascending order of length, so the last
            # one in the batch sets the number of timesteps.
            timesteps = len(nlp(qu_batch[-1]))
            X_q_batch = get_questions_tensor_timeseries(
                qu_batch, nlp, timesteps)
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch(X_q_batch, Y_batch)
            # fix for the Keras v0.3 issue #9
            progbar.add(args.batch_size, values=[("train loss", loss[0])])

        if k % args.model_save_interval == 0:
            model.save_weights(
                model_file_name + '_epoch_{:02d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k + 1))
def main():
    """Train the language-only two-layer LSTM with fixed training settings.

    Same pipeline as the argparse-driven LSTM script, but the epoch count,
    checkpoint interval and batch size are constants inside the function.
    Reads the preprocessed data, keeps the ``max_answers`` most frequent
    answers, sorts samples by question length, trains with
    ``train_on_batch`` and writes the label encoder, model JSON and weight
    checkpoints under ``../models/``.
    """

    def _read_lines(path):
        """Return the UTF-8 decoded lines of ``path``, closing the file."""
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units', type=int, default=512)
    # NOTE: -num_lstm_layers only ends up in the output file name; the model
    # below always stacks two LSTM layers.
    parser.add_argument('-num_lstm_layers', type=int, default=2)
    parser.add_argument('-dropout', type=float, default=0.2)
    # NOTE: -activation is parsed but never read; 'tanh' is hard-coded.
    parser.add_argument('-activation', type=str, default='tanh')
    args = parser.parse_args()

    questions_train = _read_lines(
        '../data/preprocessed/questions_train2014.txt')
    questions_lengths_train = _read_lines(
        '../data/preprocessed/questions_lengths_train2014.txt')
    answers_train = _read_lines('../data/preprocessed/answers_train2014.txt')
    images_train = _read_lines('../data/preprocessed/images_train2014.txt')

    # The lengths are not passed through selectFrequentAnswers, so remember
    # each question's length now and look it up again after filtering;
    # otherwise the sort below pairs lengths with unrelated questions.
    # int() makes the sort numeric ('10' sorts before '9' as a string).
    # Assumes the lengths file is line-aligned with the questions file and
    # that the filter returns question strings unchanged -- TODO confirm.
    length_by_question = dict(
        (question, int(length))
        for question, length in zip(questions_train, questions_lengths_train))

    max_answers = 1000
    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, max_answers)
    questions_lengths_train = [
        length_by_question[question] for question in questions_train]

    print('Loaded questions, sorting by length...')
    # NOTE(review): images_train is not reordered here; it is not used for
    # training in this script.
    questions_lengths_train, questions_train, answers_train = (
        list(column) for column in zip(*sorted(
            zip(questions_lengths_train, questions_train, answers_train))))

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    max_len = 30  # 25 is max for training, 27 is max for validation
    word_vec_dim = 300

    model = Sequential()
    model.add(
        LSTM(output_dim=args.num_hidden_units,
             activation='tanh',
             return_sequences=True,
             input_shape=(max_len, word_vec_dim)))
    model.add(Dropout(args.dropout))
    model.add(LSTM(args.num_hidden_units, return_sequences=False))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_file_name = ('../models/lstm_language_only_num_hidden_units_'
                       + str(args.num_hidden_units)
                       + '_num_lstm_layers_' + str(args.num_lstm_layers)
                       + '_dropout_' + str(args.dropout))
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    print('Compiling model...')
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation done...')

    # Set up word vectors.
    nlp = English()
    print('loaded word2vec features...')

    # Training.
    print('Training started...')
    num_epochs = 100
    model_save_interval = 5
    batch_size = 128
    for k in xrange(num_epochs):
        progbar = generic_utils.Progbar(len(questions_train))
        # Pad the final short batch with the LAST (longest) sample so that
        # qu_batch[-1] is still the longest question of the batch.
        batches = zip(
            grouper(questions_train, batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, batch_size, fillvalue=answers_train[-1]),
            grouper(images_train, batch_size, fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            # Questions are sorted in ascending order of length, so the last
            # one in the batch sets the number of timesteps.
            timesteps = len(nlp(qu_batch[-1]))
            X_q_batch = get_questions_tensor_timeseries(
                qu_batch, nlp, timesteps)
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch(X_q_batch, Y_batch)
            progbar.add(batch_size, values=[("train loss", loss)])

        if k % model_save_interval == 0:
            model.save_weights(
                model_file_name + '_epoch_{:02d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k + 1))
def main():
    """Train the LSTM (question) + VGG-feature (image) answer classifier.

    Reads the preprocessed questions/answers/images, keeps the ``nb_classes``
    most frequent answers, sorts the samples by question length, builds a
    merged language/image model followed by an MLP, and trains it with
    ``train_on_batch``. The label encoder, model JSON and periodic weight
    checkpoints are written under ``../models/``.
    """

    def _read_lines(path):
        """Return the UTF-8 decoded lines of ``path``, closing the file."""
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units_mlp', type=int, default=1024)
    parser.add_argument('-num_hidden_units_lstm', type=int, default=512)
    parser.add_argument('-num_hidden_layers_mlp', type=int, default=3)
    parser.add_argument('-num_hidden_layers_lstm', type=int, default=1)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-activation_mlp', type=str, default='tanh')
    parser.add_argument('-num_epochs', type=int, default=100)
    parser.add_argument('-model_save_interval', type=int, default=5)
    parser.add_argument('-batch_size', type=int, default=128)
    # TODO Feature
    parser.add_argument('-resume_training', type=str)
    # TODO Feature
    # NOTE(review): type=bool turns any non-empty string (including 'False')
    # into True; fix the converter before wiring this flag up.
    parser.add_argument('-language_only', type=bool, default=False)
    args = parser.parse_args()

    word_vec_dim = 300
    img_dim = 4096
    max_len = 30
    nb_classes = 1000

    # Get the data.
    questions_train = _read_lines(
        '../data/preprocessed/questions_train2014.txt')
    questions_lengths_train = _read_lines(
        '../data/preprocessed/questions_lengths_train2014.txt')
    answers_train = _read_lines('../data/preprocessed/answers_train2014.txt')
    images_train = _read_lines('../data/preprocessed/images_train2014.txt')
    vgg_model_path = '../features/coco/vgg_feats.mat'

    # The lengths are not passed through selectFrequentAnswers, so remember
    # each question's length now and look it up again after filtering;
    # otherwise the sort below pairs lengths with unrelated questions.
    # int() makes the sort numeric ('10' sorts before '9' as a string).
    # Assumes the lengths file is line-aligned with the questions file and
    # that the filter returns question strings unchanged -- TODO confirm.
    length_by_question = dict(
        (question, int(length))
        for question, length in zip(questions_train, questions_lengths_train))

    max_answers = nb_classes
    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, max_answers)
    questions_lengths_train = [
        length_by_question[question] for question in questions_train]

    questions_lengths_train, questions_train, answers_train, images_train = (
        list(column) for column in zip(*sorted(
            zip(questions_lengths_train, questions_train, answers_train,
                images_train))))

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    image_model = Sequential()
    image_model.add(Reshape(input_shape=(img_dim, ), dims=(img_dim, )))

    language_model = Sequential()
    if args.num_hidden_layers_lstm == 1:
        language_model.add(
            LSTM(output_dim=args.num_hidden_units_lstm,
                 return_sequences=False,
                 input_shape=(max_len, word_vec_dim)))
    else:
        # First layer, (n - 2) middle layers, then a last layer that
        # collapses the sequence.
        language_model.add(
            LSTM(output_dim=args.num_hidden_units_lstm,
                 return_sequences=True,
                 input_shape=(max_len, word_vec_dim)))
        for i in xrange(args.num_hidden_layers_lstm - 2):
            language_model.add(
                LSTM(output_dim=args.num_hidden_units_lstm,
                     return_sequences=True))
        language_model.add(
            LSTM(output_dim=args.num_hidden_units_lstm,
                 return_sequences=False))

    model = Sequential()
    model.add(
        Merge([language_model, image_model], mode='concat', concat_axis=1))
    for i in xrange(args.num_hidden_layers_mlp):
        model.add(Dense(args.num_hidden_units_mlp, init='uniform'))
        model.add(Activation(args.activation_mlp))
        model.add(Dropout(args.dropout))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_file_name = (
        '../models/lstm_1_num_hidden_units_lstm_'
        + str(args.num_hidden_units_lstm)
        + '_num_hidden_units_mlp_' + str(args.num_hidden_units_mlp)
        + '_num_hidden_layers_mlp_' + str(args.num_hidden_layers_mlp)
        + '_num_hidden_layers_lstm_' + str(args.num_hidden_layers_lstm))
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation done')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print('loaded vgg features')
    with open('../features/coco_vgg_IDMap.txt') as id_file:
        image_ids = id_file.read().splitlines()
    # Each line holds two whitespace-separated fields; the second is stored
    # as an int under the first.
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print('loaded word2vec features...')

    # Training.
    print('Training started...')
    for k in xrange(args.num_epochs):
        progbar = generic_utils.Progbar(len(questions_train))
        batches = zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            # Questions are sorted in ascending order of length, so the last
            # one in the batch sets the number of timesteps.
            timesteps = len(nlp(qu_batch[-1]))
            X_q_batch = get_questions_tensor_timeseries(
                qu_batch, nlp, timesteps)
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch)
            progbar.add(args.batch_size, values=[("train loss", loss)])

        if k % args.model_save_interval == 0:
            model.save_weights(
                model_file_name + '_epoch_{:03d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
def main():
    """Train the MLP answer classifier on question (and image) features.

    Reads the preprocessed questions/answers/images, keeps the ``maxAnswers``
    most frequent answers, builds an MLP whose input is the question feature
    matrix (optionally stacked with VGG image features), and trains it with
    ``train_on_batch``, reshuffling the samples every epoch. The label
    encoder, model JSON and periodic weight checkpoints are written under
    ``../models/``.
    """

    def _read_lines(path):
        """Return the UTF-8 decoded lines of ``path``, closing the file."""
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    def _parse_bool(text):
        """Parse a command-line boolean.

        ``type=bool`` treats every non-empty string, including 'False', as
        True. Here the usual "false" spellings map to False and anything
        else keeps mapping to True.
        """
        return text.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units', type=int, default=1024)
    parser.add_argument('-num_hidden_layers', type=int, default=3)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-activation', type=str, default='tanh')
    parser.add_argument('-language_only', type=_parse_bool, default=False)
    parser.add_argument('-num_epochs', type=int, default=10)
    parser.add_argument('-model_save_interval', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=128)
    args = parser.parse_args()

    questions_train = _read_lines(
        '../data/preprocessed/questions_train2014.txt')
    answers_train = _read_lines(
        '../data/preprocessed/answers_train2014_modal.txt')
    images_train = _read_lines('../data/preprocessed/images_train2014.txt')
    vgg_model_path = '../features/coco/vgg_feats.mat'

    maxAnswers = 1000
    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, maxAnswers)

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print('loaded vgg features')
    with open('../features/coco_vgg_IDMap.txt') as id_file:
        image_ids = id_file.read().splitlines()
    # Each line holds two whitespace-separated fields; the second is stored
    # as an int under the first.
    id_map = {}
    for ids in image_ids:
        id_split = ids.split()
        id_map[id_split[0]] = int(id_split[1])

    nlp = English()
    print('loaded word2vec features...')

    img_dim = 4096
    word_vec_dim = 300

    # The first layer's input width depends on whether image features are
    # stacked next to the question features.
    if args.language_only:
        input_dim = word_vec_dim
    else:
        input_dim = img_dim + word_vec_dim

    model = Sequential()
    model.add(
        Dense(args.num_hidden_units, input_dim=input_dim, init='uniform'))
    model.add(Activation(args.activation))
    if args.dropout > 0:
        model.add(Dropout(args.dropout))
    for i in xrange(args.num_hidden_layers - 1):
        model.add(Dense(args.num_hidden_units, init='uniform'))
        model.add(Activation(args.activation))
        if args.dropout > 0:
            model.add(Dropout(args.dropout))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    if args.language_only:
        model_file_name = ('../models/mlp_language_only_num_hidden_units_'
                           + str(args.num_hidden_units)
                           + '_num_hidden_layers_'
                           + str(args.num_hidden_layers))
    else:
        model_file_name = ('../models/mlp_num_hidden_units_'
                           + str(args.num_hidden_units)
                           + '_num_hidden_layers_'
                           + str(args.num_hidden_layers))
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    print('Compiling model...')
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation done...')

    print('Training started...')
    for k in xrange(args.num_epochs):
        # Shuffle the data points before going through them; one shared
        # permutation keeps the three lists aligned.
        index_shuf = list(range(len(questions_train)))
        shuffle(index_shuf)
        questions_train = [questions_train[i] for i in index_shuf]
        answers_train = [answers_train[i] for i in index_shuf]
        images_train = [images_train[i] for i in index_shuf]

        progbar = generic_utils.Progbar(len(questions_train))
        batches = zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
            if args.language_only:
                X_batch = X_q_batch
            else:
                X_i_batch = get_images_matrix(im_batch, id_map, VGGfeatures)
                X_batch = np.hstack((X_q_batch, X_i_batch))
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch(X_batch, Y_batch)
            progbar.add(args.batch_size, values=[("train loss", loss)])

        if k % args.model_save_interval == 0:
            model.save_weights(
                model_file_name + '_epoch_{:02d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))
def main():
    """Train the MLP answer classifier with selectable word vectors.

    Reads the preprocessed questions/answers/images, keeps the ``maxAnswers``
    most frequent answers, builds an MLP whose input is the question feature
    matrix (optionally stacked with VGG image features), and trains it with
    ``train_on_batch``, reshuffling the samples every epoch.
    ``-word_vector glove`` loads the GloVe vectors through spaCy; any other
    value uses the default ``English()`` pipeline. The label encoder, model
    JSON and periodic weight checkpoints are written under ``../models/``.
    """

    def _read_lines(path):
        """Return the UTF-8 decoded lines of ``path``, closing the file."""
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    def _parse_bool(text):
        """Parse a command-line boolean.

        ``type=bool`` treats every non-empty string, including 'False', as
        True. Here the usual "false" spellings map to False and anything
        else keeps mapping to True.
        """
        return text.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')

    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units', type=int, default=1024)
    parser.add_argument('-num_hidden_layers', type=int, default=3)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-activation', type=str, default='tanh')
    parser.add_argument('-language_only', type=_parse_bool, default=False)
    parser.add_argument('-num_epochs', type=int, default=100)
    parser.add_argument('-model_save_interval', type=int, default=10)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-word_vector', type=str, default='')
    args = parser.parse_args()

    questions_train = _read_lines(
        '../data/preprocessed/questions_train2014.txt')
    answers_train = _read_lines(
        '../data/preprocessed/answers_train2014_modal.txt')
    images_train = _read_lines('../data/preprocessed/images_train2014.txt')
    vgg_model_path = '../features/coco/vgg_feats.mat'

    maxAnswers = 1000
    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, maxAnswers)

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print('loaded vgg features')
    with open('../features/coco_vgg_IDMap.txt') as id_file:
        image_ids = id_file.read().splitlines()
    # Each line holds two whitespace-separated fields; the second is stored
    # as an int under the first.
    id_map = {}
    for ids in image_ids:
        id_split = ids.split()
        id_map[id_split[0]] = int(id_split[1])

    # Choose the word vectors: default is Goldberg but GLOVE is preferred.
    if args.word_vector == 'glove':
        nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    else:
        nlp = English()
    print('loaded ' + args.word_vector + ' word2vec features...')

    img_dim = 4096
    word_vec_dim = 300

    # The first layer's input width depends on whether image features are
    # stacked next to the question features.
    if args.language_only:
        input_dim = word_vec_dim
    else:
        input_dim = img_dim + word_vec_dim

    model = Sequential()
    model.add(
        Dense(args.num_hidden_units, input_dim=input_dim, init='uniform'))
    model.add(Activation(args.activation))
    if args.dropout > 0:
        model.add(Dropout(args.dropout))
    for i in xrange(args.num_hidden_layers - 1):
        model.add(Dense(args.num_hidden_units, init='uniform'))
        model.add(Activation(args.activation))
        if args.dropout > 0:
            model.add(Dropout(args.dropout))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    if args.language_only:
        model_file_name = ('../models/mlp_language_only_num_hidden_units_'
                           + str(args.num_hidden_units)
                           + '_num_hidden_layers_'
                           + str(args.num_hidden_layers))
    else:
        model_file_name = ('../models/mlp_num_hidden_units_'
                           + str(args.num_hidden_units)
                           + '_num_hidden_layers_'
                           + str(args.num_hidden_layers))
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    print('Compiling model...')
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation done...')

    print('Training started...')
    for k in xrange(args.num_epochs):
        # Shuffle the data points before going through them; one shared
        # permutation keeps the three lists aligned.
        index_shuf = list(range(len(questions_train)))
        shuffle(index_shuf)
        questions_train = [questions_train[i] for i in index_shuf]
        answers_train = [answers_train[i] for i in index_shuf]
        images_train = [images_train[i] for i in index_shuf]

        progbar = generic_utils.Progbar(len(questions_train))
        batches = zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
            if args.language_only:
                X_batch = X_q_batch
            else:
                X_i_batch = get_images_matrix(im_batch, id_map, VGGfeatures)
                X_batch = np.hstack((X_q_batch, X_i_batch))
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch(X_batch, Y_batch)
            # fix for the Keras v0.3 issue #9
            progbar.add(args.batch_size, values=[("train loss", loss[0])])

        if k % args.model_save_interval == 0:
            model.save_weights(
                model_file_name + '_epoch_{:02d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))
def main():
    """Train the MLP answer classifier on the abstract-scenes data.

    Paths are resolved relative to the current working directory. Reads the
    questions/answers/images, keeps the ``maxAnswers`` most frequent answers,
    loads a VGG-16 model plus two precomputed image-feature tables, builds
    and compiles the MLP, and trains it with ``train_on_batch``, reshuffling
    the samples every epoch. The label encoder, model JSON and weight
    checkpoints are written under ``<cwd>/models/``.
    """

    def _read_lines(path):
        """Return the UTF-8 decoded lines of ``path``, closing the file."""
        with open(path, 'r') as handle:
            return handle.read().decode('utf8').splitlines()

    def _parse_bool(text):
        """Parse a command-line boolean.

        ``type=bool`` treats every non-empty string, including 'False', as
        True. Here the usual "false" spellings map to False and anything
        else keeps mapping to True.
        """
        return text.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n')

    cwd = os.getcwd()
    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units', type=int, default=1024)
    parser.add_argument('-num_hidden_layers', type=int, default=3)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-activation', type=str, default='tanh')
    # NOTE(review): only the model's input width honours -language_only; the
    # training loop below always stacks image features onto the questions.
    parser.add_argument('-language_only', type=_parse_bool, default=False)
    parser.add_argument('-num_epochs', type=int, default=2)
    parser.add_argument('-model_save_interval', type=int, default=10)
    parser.add_argument('-model_weights_path', type=str,
                        default=cwd + '/vgg/vgg16_weights.h5')
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument(
        '-questions_train', type=str,
        default=cwd + '/data/preprocessed/questions_train2015.txt')
    parser.add_argument(
        '-answers_train', type=str,
        default=cwd + '/data/preprocessed/answers_train2015_modal.txt')
    # Kept for command-line compatibility; only the disabled on-the-fly
    # feature-extraction path needs the image directory.
    parser.add_argument(
        '-im_dir', type=str,
        default=cwd + '/data/preprocessed/scene_img_abstract_v002_train2015/')
    args = parser.parse_args()

    questions_train = _read_lines(args.questions_train)
    answers_train = _read_lines(args.answers_train)
    images_train = _read_lines(
        cwd + '/data/preprocessed/images_train2015.txt')

    maxAnswers = 100
    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, maxAnswers)

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, cwd + '/models/labelencoder.pkl')

    # The VGG model is loaded and compiled here, but the training loop below
    # reads precomputed features and never calls it.
    vgg_model = vgg16.VGG_16(args.model_weights_path)
    sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
    vgg_model.compile(optimizer=sgd, loss='categorical_crossentropy')
    print('loaded vgg model...')

    nlp = English()
    print('loaded word2vec features...')

    img_dim = 4096
    word_vec_dim = 300

    if args.language_only:
        input_dim = word_vec_dim
    else:
        input_dim = img_dim + word_vec_dim

    model = Sequential()
    model.add(
        Dense(args.num_hidden_units, input_dim=input_dim, init='uniform'))
    model.add(Activation(args.activation))
    if args.dropout > 0:
        model.add(Dropout(args.dropout))
    for i in xrange(args.num_hidden_layers - 1):
        model.add(Dense(args.num_hidden_units, init='uniform'))
        model.add(Activation(args.activation))
        if args.dropout > 0:
            model.add(Dropout(args.dropout))
    model.add(Dense(nb_classes, init='uniform'))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_file_name = (cwd + '/models/mlp_num_hidden_units_'
                       + str(args.num_hidden_units)
                       + '_num_hidden_layers_' + str(args.num_hidden_layers))
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    # The model has to be compiled before train_on_batch can be called; use
    # the same loss/optimizer as the sibling MLP training scripts.
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    print('Training started...')

    # Precomputed image features; both tables are handed to
    # get_images_matrix for every batch.
    with open('abstract_image_precompute') as features_file:
        VGGfeatures = np.loadtxt(features_file)
    with open('abstract_image_precompute_reverse') as reverse_file:
        VGGfeatures_reverse = np.loadtxt(reverse_file)

    for k in xrange(args.num_epochs):
        # Shuffle the data points before going through them; one shared
        # permutation keeps the three lists aligned.
        index_shuf = list(range(len(questions_train)))
        shuffle(index_shuf)
        questions_train = [questions_train[i] for i in index_shuf]
        answers_train = [answers_train[i] for i in index_shuf]
        images_train = [images_train[i] for i in index_shuf]

        progbar = generic_utils.Progbar(len(questions_train))
        batches = zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            X_q_batch = get_questions_matrix_sum(qu_batch, nlp)
            print('getting image features...')
            X_i_batch = get_images_matrix(im_batch, VGGfeatures,
                                          VGGfeatures_reverse)
            X_batch = np.hstack((X_q_batch, X_i_batch))
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            print('running training on batch...')
            loss = model.train_on_batch(X_batch, Y_batch)
            progbar.add(args.batch_size, values=[("train loss", loss)])

        if k % args.model_save_interval == 0:
            model.save_weights(
                model_file_name + '_epoch_{:02d}.hdf5'.format(k))

    model.save_weights(model_file_name + '_epoch_{:02d}.hdf5'.format(k))
def _read_lines(path):
    """Return the lines of the UTF-8 text file at *path*, without line endings."""
    # Local import: the file-level import block is outside this block.
    import io
    with io.open(path, 'r', encoding='utf8') as handle:
        return handle.read().splitlines()


def _parse_bool(text):
    """Parse a command-line boolean ('true'/'false', '1'/'0', 'yes'/'no').

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value -- including "False" -- would be parsed as True.
    """
    lowered = text.strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % (text,))


def main():
    """Train the LSTM + image-feature VQA classifier.

    Steps, in order:
      1. Parse the command-line hyper-parameters.
      2. Load the preprocessed training questions, question lengths, answers
         and image ids, and restrict them to the most frequent answers.
      3. Sort the samples by question length, fit a label encoder on the
         answers and dump it to ``../models/labelencoder.pkl``.
      4. Build the model (stacked LSTM over word vectors, merged with the
         image feature vector, followed by an MLP and a softmax), write its
         JSON description next to the weights and compile it.
      5. Train batch by batch, saving weights every ``-model_save_interval``
         epochs and after the final epoch.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-num_hidden_units_mlp', type=int, default=1024)
    parser.add_argument('-num_hidden_units_lstm', type=int, default=512)
    parser.add_argument('-num_hidden_layers_mlp', type=int, default=3)
    parser.add_argument('-num_hidden_layers_lstm', type=int, default=1)
    parser.add_argument('-dropout', type=float, default=0.5)
    parser.add_argument('-activation_mlp', type=str, default='tanh')
    parser.add_argument('-num_epochs', type=int, default=100)
    parser.add_argument('-model_save_interval', type=int, default=5)
    parser.add_argument('-batch_size', type=int, default=128)
    parser.add_argument('-word_vector', type=str, default='')
    # TODO feature: the two options below are parsed but not used by this
    # script yet.
    parser.add_argument('-resume_training', type=str)
    parser.add_argument('-language_only', type=_parse_bool, default=False)
    args = parser.parse_args()

    word_vec_dim = 300
    img_dim = 4096
    max_len = 30
    nb_classes = 1000

    # Get the data.
    questions_train = _read_lines(
        '../data/preprocessed/questions_train2014.txt')
    questions_lengths_train = _read_lines(
        '../data/preprocessed/questions_lengths_train2014.txt')
    answers_train = _read_lines(
        '../data/preprocessed/answers_train2014_modal.txt')
    images_train = _read_lines(
        '../data/preprocessed/images_train2014.txt')
    vgg_model_path = '../features/coco/vgg_feats.mat'
    max_answers = nb_classes

    if len(questions_lengths_train) != len(questions_train):
        raise ValueError(
            'question and question-length files have different line counts '
            '(%d vs %d)' % (len(questions_train), len(questions_lengths_train)))

    # The lengths list is not passed through selectFrequentAnswers, so once
    # that call drops samples (presumably everything outside the top
    # `max_answers` answers -- its body is not visible here) the lengths would
    # no longer line up with the questions. Remember each question's length
    # by its text and look it up again afterwards. Lengths are converted to
    # int so the sort below is numeric rather than lexicographic
    # ('10' < '9' as strings).
    length_by_question = dict(
        zip(questions_train, (int(n) for n in questions_lengths_train)))

    questions_train, answers_train, images_train = selectFrequentAnswers(
        questions_train, answers_train, images_train, max_answers)
    questions_lengths_train = [length_by_question[q] for q in questions_train]

    # Sort all four parallel lists by ascending question length.
    questions_lengths_train, questions_train, answers_train, images_train = (
        list(t) for t in zip(*sorted(zip(
            questions_lengths_train, questions_train,
            answers_train, images_train))))

    # Encode the remaining answers.
    labelencoder = preprocessing.LabelEncoder()
    labelencoder.fit(answers_train)
    nb_classes = len(list(labelencoder.classes_))
    joblib.dump(labelencoder, '../models/labelencoder.pkl')

    image_model = Sequential()
    image_model.add(Reshape(input_shape=(img_dim,), dims=(img_dim,)))

    language_model = Sequential()
    if args.num_hidden_layers_lstm == 1:
        language_model.add(LSTM(output_dim=args.num_hidden_units_lstm,
                                return_sequences=False,
                                input_shape=(max_len, word_vec_dim)))
    else:
        # Every LSTM but the last must return the full sequence so the next
        # LSTM has a sequence to consume.
        language_model.add(LSTM(output_dim=args.num_hidden_units_lstm,
                                return_sequences=True,
                                input_shape=(max_len, word_vec_dim)))
        for _ in range(args.num_hidden_layers_lstm - 2):
            language_model.add(LSTM(output_dim=args.num_hidden_units_lstm,
                                    return_sequences=True))
        language_model.add(LSTM(output_dim=args.num_hidden_units_lstm,
                                return_sequences=False))

    model = Sequential()
    model.add(Merge([language_model, image_model],
                    mode='concat', concat_axis=1))
    for _ in range(args.num_hidden_layers_mlp):
        model.add(Dense(args.num_hidden_units_mlp, init='uniform'))
        model.add(Activation(args.activation_mlp))
        model.add(Dropout(args.dropout))
    model.add(Dense(nb_classes))
    model.add(Activation('softmax'))

    json_string = model.to_json()
    model_file_name = '../models/lstm_1_num_hidden_units_lstm_' + str(args.num_hidden_units_lstm) + \
        '_num_hidden_units_mlp_' + str(args.num_hidden_units_mlp) + '_num_hidden_layers_mlp_' + \
        str(args.num_hidden_layers_mlp) + '_num_hidden_layers_lstm_' + str(args.num_hidden_layers_lstm)
    with open(model_file_name + '.json', 'w') as json_file:
        json_file.write(json_string)

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    print('Compilation done')

    features_struct = scipy.io.loadmat(vgg_model_path)
    VGGfeatures = features_struct['feats']
    print('loaded vgg features')

    # Each line of the id map is "<image id> <index>"; the index is stored as
    # an int and handed to get_images_matrix together with VGGfeatures.
    with open('../features/coco_vgg_IDMap.txt') as id_map_file:
        image_ids = id_map_file.read().splitlines()
    img_map = {}
    for ids in image_ids:
        id_split = ids.split()
        img_map[id_split[0]] = int(id_split[1])

    # Code to choose the word vectors, default is Goldberg but GLOVE is
    # preferred.
    if args.word_vector == 'glove':
        nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    else:
        nlp = English()
    print('loaded ' + args.word_vector + ' word2vec features...')

    # Training.
    print('Training started...')
    last_epoch = args.num_epochs - 1
    for k in range(args.num_epochs):
        progbar = generic_utils.Progbar(len(questions_train))
        batches = zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1]))
        for qu_batch, an_batch, im_batch in batches:
            # Samples are sorted by ascending question length, so the last
            # question of a batch is its longest one; its token count is
            # used as the number of timesteps for the whole batch.
            timesteps = len(nlp(qu_batch[-1]))
            X_q_batch = get_questions_tensor_timeseries(qu_batch, nlp, timesteps)
            X_i_batch = get_images_matrix(im_batch, img_map, VGGfeatures)
            Y_batch = get_answers_matrix(an_batch, labelencoder)
            loss = model.train_on_batch([X_q_batch, X_i_batch], Y_batch)
            # fix for the Keras v0.3 issue #9
            progbar.add(args.batch_size, values=[("train loss", loss[0])])

        # Checkpoint every `model_save_interval` epochs and always after the
        # final epoch. Doing the final save here (rather than after the loop)
        # avoids reading `k` when the loop never ran (num_epochs == 0) and
        # avoids writing the same checkpoint twice.
        if k % args.model_save_interval == 0 or k == last_epoch:
            model.save_weights(model_file_name + '_epoch_{:03d}.hdf5'.format(k))
# Training loop followed by a loss plot and a final model dump.
# NOTE(review): this fragment relies on names defined outside the visible
# code (args, questions_train, answers_train, images_train, nlp, id_map,
# sherlock_features, labelencoder, model, training_loss) -- confirm they are
# all in scope where this runs.
for i in xrange(args.num_epochs):
    # A fresh progress bar per epoch, sized to the number of training
    # questions.
    progbar = generic_utils.Progbar(len(questions_train))
    # Walk the three parallel lists in lock-step, batch by batch.
    # Presumably `grouper` yields fixed-size chunks and pads the last one
    # with `fillvalue` (here: the last sample of each list) -- TODO confirm.
    for qu_batch, an_batch, im_batch in zip(
            grouper(questions_train, args.batch_size,
                    fillvalue=questions_train[-1]),
            grouper(answers_train, args.batch_size,
                    fillvalue=answers_train[-1]),
            grouper(images_train, args.batch_size,
                    fillvalue=images_train[-1])):
        # logging.debug("One batch done")
        x_q_batch = get_questions_matrix_sum(qu_batch, nlp)
        # logging.debug("length of qu_batch is %d", len(qu_batch))
        # logging.debug("Shape of x_q_batch is: %s", x_q_batch.shape)
        x_i_batch = get_images_matrix(im_batch, id_map, sherlock_features)
        # logging.debug("shape of x_i_batch is %s", x_i_batch.shape)
        # Question and image features are joined side by side into a single
        # input matrix for the model.
        x_batch = np.hstack((x_q_batch, x_i_batch))
        y_batch = get_answers_matrix(an_batch, labelencoder)
        loss = model.train_on_batch(x_batch, y_batch)
        # Keep every per-batch loss so the curve can be plotted below.
        training_loss.append(loss)
        # NOTE(review): the bar advances by a full batch_size per batch; if
        # the last batch is padded this can overshoot the bar's target --
        # confirm.
        progbar.add(args.batch_size, values=[("train_loss", loss)])
    # print "\n"
# if __name__ == '__main__':
#     main()

# Plot the per-batch training loss collected above and write it to loss.png.
plt.plot(training_loss)
plt.title("Training loss for the network")
plt.savefig('loss.png')
# Persist the trained model; the output file name is hard-coded.
model.save('trained_vqa_100epochs.hd5')