def main(config):
    """Build and save parse trees and entity feature vectors for every split.

    For each of the ``train``, ``val`` and ``test`` splits under ``./data/``:
    the stories are read from ``<split>/<split>_text.csv``, parsed with
    ``get_parse_tree`` and the trees are written to
    ``<split>/<split>_parsetree.npy``.  The features returned by
    ``entity_score.entity_feature`` are then copied into a
    ``(len(stories), 64)`` float array that is written to
    ``<split>/cohvec_<split>.npy``.

    Args:
        config: Accepted for interface compatibility; this function does not
            read it (the data directory is fixed to ``./data/``).
    """
    datadir = './data/'

    for split in ('train', 'val', 'test'):
        split_dir = datadir + split + '/'

        # Load the stories of this split.
        stories = utils_vist.getSent(split_dir + split + '_text.csv')

        # Parse them and persist the trees.
        parsetree = get_parse_tree(stories)
        print('obtained parse trees..')
        np.save(split_dir + split + '_parsetree.npy', parsetree)

        # Entity features for all stories of the split.
        entity_feat = entity_score.entity_feature(parsetree)
        print('obtained entity features..')

        # The features come back keyed by story index; lay them out as one
        # row per story.  Stories without an entry keep an all-zero row.
        cohvec = np.zeros((len(stories), 64), dtype=float)
        for story_idx, feat in entity_feat.items():
            cohvec[int(story_idx)] = feat

        np.save(split_dir + 'cohvec_' + split + '.npy', cohvec)
def test(config, num_words, embedding_matrix, test_data):
    """Run baseline image retrieval for the selected test stories.

    Sentence/image vectors are obtained from
    ``get_sent_img_feats_baseline``; the stories to evaluate are selected by
    the indices listed (one per line) in ``config['testsamples']``.  The
    selected sentence vectors are pickled to ``config['savepred']`` and the
    result dictionary is pickled to
    ``'results_baseline_' + config['date'] + '.pickle'``.

    Args:
        config: Mapping read for ``datadir``, ``testsamples``, ``savepred``
            and ``date``; also forwarded to ``get_sent_img_feats_baseline``.
        num_words: Forwarded to ``get_sent_img_feats_baseline``.
        embedding_matrix: Forwarded to ``get_sent_img_feats_baseline``.
        test_data: Forwarded to ``get_sent_img_feats_baseline``.

    Returns:
        dict with keys ``input_stories``, ``test_samples``,
        ``test_gt_imageids`` and ``test_pred_imageids``.
    """
    testdir = config['datadir'] + 'test/'
    testinsamplename = config['testsamples']
    predictions = config['savepred']

    test_sents = utils_vist.getSent(testdir + 'test_text.csv')

    # get sentence and image vectors from stage 1
    print('obtaining sent and image vectors from stage 1...')
    x_test, y_test, id_test = get_sent_img_feats_baseline(
        config, test_data, num_words, embedding_matrix)

    # Read the sample indices with a context manager so the file handle is
    # closed deterministically.
    with open(testinsamplename, 'r', encoding='utf-8') as fp:
        test_lines = [line.rstrip('\n') for line in fp]

    # get input and gt ready
    print('gettin input and coherence vectors ready..')
    sample_idx = [int(line) for line in test_lines]
    test_sent = np.array([x_test[idx] for idx in sample_idx])
    test_imgids = np.array([id_test[idx][:] for idx in sample_idx])
    test_stories = [test_sents[idx] for idx in sample_idx]

    # save predictions
    with open(predictions, 'wb') as fp:
        pickle.dump(test_sent, fp)

    # retrieving images for input stories
    finalpreds = utils_vist.retrieve_images(np.array(y_test),
                                            np.array(test_sent),
                                            np.array(id_test))

    # saving result dictionary
    results = {
        'input_stories': test_stories,
        'test_samples': test_lines,
        'test_gt_imageids': test_imgids,
        'test_pred_imageids': finalpreds,
    }
    with open('results_baseline_' + config['date'] + '.pickle', 'wb') as fp:
        pickle.dump(results, fp)

    return results
def main_func(datadir, process, model, isprune):
    """Extract image features for one split and optionally prune stories.

    Image ids are read from ``<datadir><process>/<process>_imageids.csv`` and
    passed, together with the image directory
    ``<datadir>raw/images/<process>/``, the model and a ``(224, 224)`` image
    size, to ``Feat_forArray``.

    When ``isprune`` is truthy, every index that is a key of the returned
    ``story_noimg`` mapping is deleted from both the image-id list and the
    stories read from ``<process>_stories.csv``; the pruned lists are then
    written to ``<process>_image.csv`` and ``<process>_text.csv``.

    Args:
        datadir: Data directory prefix (concatenated as-is, so it is expected
            to end with a path separator).
        process: Split name used in file and directory names.
        model: Forwarded to ``Feat_forArray``.
        isprune: Whether to drop the stories listed in ``story_noimg``.

    Returns:
        The ``(img_feats, story_noimg)`` pair returned by ``Feat_forArray``.
    """
    print('processing {0:s} data'.format(process))

    split_dir = datadir + process + '/'

    # Story -> image ids mapping for this split.
    imageids = utils_vist.getImgIds(split_dir + process + '_imageids.csv')

    # Directory holding the raw images of this split.
    imagepath = datadir + 'raw/images/' + process + '/'
    image_size = (224, 224)

    started = time.time()
    img_feats, story_noimg = Feat_forArray(imageids, imagepath, model,
                                           image_size)
    print('features extracted for {0:s} in {1:f} seconds'.format(
        process, time.time() - started))

    if not isprune:
        return img_feats, story_noimg

    # Delete from the highest index down so that earlier deletions do not
    # shift the positions of the entries still to be removed.
    drop_order = sorted(story_noimg.keys(), reverse=True)
    for idx in drop_order:
        del imageids[idx]

    # Apply the same pruning to the text stories.
    stories = utils_vist.getSent(split_dir + process + '_stories.csv')
    for idx in drop_order:
        del stories[idx]

    # Persist the pruned image ids and stories for later stages.
    utils_vist.write2csv(split_dir + process + "_image.csv", imageids)
    utils_vist.write2csv(split_dir + process + "_text.csv", stories)

    return img_feats, story_noimg
def loadData(config):
    """Return the vocabulary size, embedding matrix and the three data splits.

    The five cached pickles under ``config['datadir']`` (``num_words``,
    ``embedding_matrix``, ``train_data``, ``valid_data``, ``test_data``) are
    loaded when possible.  If loading any of them fails, everything is rebuilt
    from the image-feature JSON files, the image-id / story CSV files and the
    GloVe text file, and the five caches are rewritten.

    Args:
        config: Mapping read for ``datadir`` and, when rebuilding,
            ``glovetext`` and the ``stage1`` entries ``MAX_SEQUENCE_LENGTH``,
            ``img_fea_dim``, ``wd_embd_dim`` and ``MAX_NB_WORDS``.

    Returns:
        Tuple ``(num_words, embedding_matrix, train_data, valid_data,
        test_data)``.  When rebuilt, each ``*_data`` is a list
        ``[sentences, image_feats, ids]``.
    """
    datadir = config['datadir']

    try:
        num_words = _read_pickle(datadir + 'num_words.pickle')
        embedding_matrix = _read_pickle(datadir + 'embedding_matrix.pickle')
        train_data = _read_pickle(datadir + 'train_data.pickle')
        valid_data = _read_pickle(datadir + 'valid_data.pickle')
        test_data = _read_pickle(datadir + 'test_data.pickle')
    except Exception:
        # Any failure to read the caches (missing, truncated, incompatible)
        # triggers a rebuild.  ``Exception`` rather than a bare ``except`` so
        # that KeyboardInterrupt / SystemExit are not swallowed.
        print('processed file(s) do not exist! Re-extracting all data')
    else:
        print('loaded existing data files')
        return num_words, embedding_matrix, train_data, valid_data, test_data

    traindir = datadir + 'train/'
    testdir = datadir + 'test/'
    valdir = datadir + 'val/'
    glovetext = config['glovetext']
    MAX_SEQUENCE_LENGTH = config['stage1']['MAX_SEQUENCE_LENGTH']
    img_fea_dim = config['stage1']['img_fea_dim']
    EMBEDDING_DIM = config['stage1']['wd_embd_dim']
    MAX_NB_WORDS = config['stage1']['MAX_NB_WORDS']

    starttime = time.time()

    # load img feat files
    img_fea_train = _read_json(traindir + 'train_imgfeat.json')
    img_fea_valid = _read_json(valdir + 'val_imgfeat.json')
    img_fea_test = _read_json(testdir + 'test_imgfeat.json')

    # get img IDs
    train_imgID = utils_vist.getImgIds(traindir + 'train_image.csv')
    valid_imgID = utils_vist.getImgIds(valdir + 'val_image.csv')
    test_imgID = utils_vist.getImgIds(testdir + 'test_image.csv')

    # get stories
    train_sents = utils_vist.getSent(traindir + 'train_text.csv')
    valid_sents = utils_vist.getSent(valdir + 'val_text.csv')
    test_sents = utils_vist.getSent(testdir + 'test_text.csv')
    print('loaded all files in {} secs'.format(time.time() - starttime))

    # get word vectors from glove
    embeddings_index = _read_glove(glovetext)
    print('Indexed word vectors.')

    # get num of samples
    # NOTE(review): the factor 5 is presumably the number of sentence/image
    # pairs per story -- confirm against utils_vist.flatten_all.
    trainNum = len(train_imgID) * 5
    validNum = len(valid_imgID) * 5
    testNum = len(test_imgID) * 5

    # get image features and text sentences in a single list
    train_sents, train_img_feats, trainids = utils_vist.flatten_all(
        train_imgID, img_fea_train, train_sents)
    valid_sents, valid_img_feats, valids = utils_vist.flatten_all(
        valid_imgID, img_fea_valid, valid_sents)
    test_sents, test_img_feats, testids = utils_vist.flatten_all(
        test_imgID, img_fea_test, test_sents)

    # get all text in single list to process them together
    sents = train_sents + valid_sents + test_sents

    # tokenize and convert each sentence to sequences
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(sents)
    sequences = tokenizer.texts_to_sequences(sents)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data_sents = pad_sequences(sequences, MAX_SEQUENCE_LENGTH)

    # NOTE(review): if pad_sequences is the Keras utility, its default dtype
    # is int32, which would truncate float image features below -- confirm
    # this is intended.

    # get train data
    train_sents = data_sents[0:trainNum]
    train_img_feats = pad_sequences(train_img_feats, img_fea_dim)
    train_data = [train_sents, train_img_feats, trainids]

    # check some samples
    print(len(train_data[0]))
    train_imgs = train_data[1]
    print(np.shape(train_imgs[0]))

    # get val data
    valid_sents = data_sents[trainNum:(trainNum + validNum)]
    valid_img_feats = pad_sequences(valid_img_feats, img_fea_dim)
    valid_data = [valid_sents, valid_img_feats, valids]

    # get test data
    test_sents = data_sents[(trainNum + validNum):(trainNum + validNum +
                                                   testNum)]
    test_img_feats = pad_sequences(test_img_feats, img_fea_dim)
    test_data = [test_sents, test_img_feats, testids]

    print('Preparing embedding matrix.')
    # prepare embedding matrix
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            # Index falls outside the matrix; report and skip it.
            print('{}: {}'.format(word, i))
            continue
        if i == 0:
            print('{}: {}'.format(word, i))
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # Cache everything so the next call can take the fast path.
    _write_pickle(num_words, datadir + 'num_words.pickle')
    _write_pickle(embedding_matrix, datadir + 'embedding_matrix.pickle')
    _write_pickle(train_data, datadir + 'train_data.pickle')
    _write_pickle(valid_data, datadir + 'valid_data.pickle')
    _write_pickle(test_data, datadir + 'test_data.pickle')

    return num_words, embedding_matrix, train_data, valid_data, test_data


def _read_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file.

    Only meant for cache files this module wrote itself; unpickling
    untrusted data is unsafe.
    """
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _write_pickle(obj, path):
    """Pickle *obj* to *path* with the highest available protocol.

    The protocol is passed to ``pickle.dump``; passing it as the third
    argument of ``open`` would set the file's buffering instead.
    """
    with open(path, 'wb') as fh:
        pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)


def _read_json(path):
    """Parse and return the JSON document stored at *path*."""
    with open(path) as fh:
        return json.load(fh)


def _read_glove(path):
    """Return a ``{word: float32 vector}`` dict parsed from a GloVe text file.

    Each line is split on whitespace: the first token is the word and the
    remaining tokens are its coefficients.
    """
    embeddings_index = {}
    with open(path, 'r', encoding='utf-8') as fh:
        for line in fh:
            values = line.split()
            embeddings_index[values[0]] = np.asarray(values[1:],
                                                     dtype='float32')
    return embeddings_index
def test(config,
         modelname,
         test_data,
         num_words,
         embedding_matrix,
         modeltype='cnsi'):
    """Predict with a trained stage-2 model and retrieve images for stories.

    Sentence/image vectors are obtained from ``get_sent_img_feats_stage1``;
    the stories to evaluate are selected by the indices listed (one per line)
    in ``config['testsamples']``.  The Keras model stored at ``modelname`` is
    loaded and run on the selected sentence vectors.  For
    ``modeltype == 'cnsi'`` the vectors loaded from ``cohvec_test.npy`` are
    passed as a second model input.  The model output is pickled to
    ``config['savepred']`` and the result dictionary is pickled to
    ``'results' + config['date'] + '.pickle'``.

    Args:
        config: Mapping read for ``datadir``, ``testsamples``, ``savepred``
            and ``date``; also forwarded to ``get_sent_img_feats_stage1``.
        modelname: Path handed to ``keras.models.load_model``.
        test_data: Forwarded to ``get_sent_img_feats_stage1``.
        num_words: Forwarded to ``get_sent_img_feats_stage1``.
        embedding_matrix: Forwarded to ``get_sent_img_feats_stage1``.
        modeltype: ``'cnsi'`` feeds the coherence vectors as a second model
            input; any other value feeds the sentence vectors only.

    Returns:
        dict with keys ``input_stories``, ``test_samples``,
        ``test_gt_imageids`` and ``test_pred_imageids``.
    """
    use_coherence = modeltype == 'cnsi'

    testdir = config['datadir'] + 'test/'
    testinsamplename = config['testsamples']
    predictions = config['savepred']

    test_sents = utils_vist.getSent(testdir + 'test_text.csv')

    # get sentence and image vectors from stage 1
    print('obtaining sent and image vectors from stage 1...')
    x_test, y_test, id_test = get_sent_img_feats_stage1(
        config, test_data, num_words, embedding_matrix)

    # Read the sample indices with a context manager so the file handle is
    # closed deterministically.
    with open(testinsamplename, 'r', encoding='utf-8') as fp:
        test_lines = [line.rstrip('\n') for line in fp]

    if use_coherence:
        # Insert a length-1 axis at position 1; it is repeated below.
        coh_sent_test = np.expand_dims(np.load(testdir + 'cohvec_test.npy'),
                                       axis=1)

    # get input and gt ready
    print('gettin input and coherence vectors ready..')
    sample_idx = [int(line) for line in test_lines]
    test_sent = np.array([x_test[idx] for idx in sample_idx])
    test_imgids = np.array([id_test[idx][:] for idx in sample_idx])
    test_stories = [test_sents[idx] for idx in sample_idx]

    if use_coherence:
        # Explicit integer dtype keeps the index array valid even when no
        # samples are listed.
        coh_sent_test = coh_sent_test[np.array(sample_idx, dtype=int), :, :]
        # NOTE(review): the factor 5 presumably matches the number of
        # sentences per story -- confirm against the model's input shape.
        coh_sent_test = np.repeat(coh_sent_test, 5, axis=1)

    # load model and predict
    print('predicting using stage 2...')
    trained_model = keras.models.load_model(
        modelname, custom_objects={'orderEmb_loss': modelArch.orderEmb_loss})
    if use_coherence:
        out_fea = trained_model.predict([test_sent, coh_sent_test])
    else:
        out_fea = trained_model.predict(test_sent)

    # save predictions
    with open(predictions, 'wb') as fp:
        pickle.dump(out_fea, fp)

    # retrieving images for input stories
    finalpreds = utils_vist.retrieve_images(np.array(y_test), out_fea,
                                            np.array(id_test))

    # saving result dictionary
    results = {
        'input_stories': test_stories,
        'test_samples': test_lines,
        'test_gt_imageids': test_imgids,
        'test_pred_imageids': finalpreds,
    }
    with open('results' + config['date'] + '.pickle', 'wb') as fp:
        pickle.dump(results, fp)

    return results