예제 #1
0
def main(config):
    """Build and store parse trees and entity features for every data split.

    For each of the 'train', 'val' and 'test' splits under ``./data/`` this
    reads ``<split>_text.csv``, saves the parse trees to
    ``<split>_parsetree.npy`` and saves a ``(number of stories, 64)`` float
    array of entity features to ``cohvec_<split>.npy``.

    Args:
        config: Accepted for interface compatibility; not read by this
            function.
    """
    data_root = './data/'

    for split in ['train', 'val', 'test']:
        split_dir = data_root + split + '/'

        # Load the stories of this split and parse them.
        story_list = utils_vist.getSent(split_dir + split + '_text.csv')
        trees = get_parse_tree(story_list)
        print('obtained parse trees..')

        # Persist the parse trees next to the input file.
        np.save(split_dir + split + '_parsetree.npy', trees)

        # Entity features, keyed by something convertible to a row index.
        features = entity_score.entity_feature(trees)
        print('obtained entity features..')

        # Rows without an entry in `features` stay all-zero.
        coherence = np.zeros((len(story_list), 64), dtype=float)
        for key in features:
            coherence[int(key)] = features[key]

        # Persist the dense feature matrix.
        np.save(split_dir + 'cohvec_' + split + '.npy', coherence)
예제 #2
0
def test(config, num_words, embedding_matrix, test_data):
    """Run baseline image retrieval on the selected test samples.

    Sentence and image vectors are obtained from
    ``get_sent_img_feats_baseline``; the sentence vectors of the selected
    samples are pickled to ``config['savepred']`` and used directly as the
    query for ``utils_vist.retrieve_images``.

    Args:
        config: Mapping read for the keys 'datadir', 'testsamples',
            'savepred' and 'date'; also forwarded to
            ``get_sent_img_feats_baseline``.
        num_words: Forwarded to ``get_sent_img_feats_baseline``.
        embedding_matrix: Forwarded to ``get_sent_img_feats_baseline``.
        test_data: Forwarded to ``get_sent_img_feats_baseline``.

    Returns:
        dict with the keys 'input_stories', 'test_samples',
        'test_gt_imageids' and 'test_pred_imageids'. The same dict is also
        pickled to ``results_baseline_<config['date']>.pickle`` in the
        current working directory.

    Raises:
        ValueError: If a line of the samples file is not an integer.
    """
    # load all data
    results = dict()
    testdir = config['datadir'] + 'test/'
    testinsamplename = config['testsamples']
    predictions = config['savepred']
    test_sents = utils_vist.getSent(testdir + 'test_text.csv')

    # get sentence and image vectors from stage 1
    print('obtaining sent and image vectors from stage 1...')
    x_test, y_test, id_test = get_sent_img_feats_baseline(
        config, test_data, num_words, embedding_matrix)

    # One sample index per line; the context manager closes the file
    # deterministically instead of leaking the handle.
    with open(testinsamplename, 'r', encoding='utf-8') as sample_file:
        test_lines = [line.rstrip('\n') for line in sample_file]

    # get input and gt ready
    print('gettin input and coherence vectors ready..')
    test_sent = []
    test_imgids = []
    test_stories = []
    for ind in test_lines:
        ind = int(ind)
        test_sent.append(x_test[ind])
        test_imgids.append(id_test[ind][:])
        test_stories.append(test_sents[ind])
    test_sent = np.array(test_sent)
    test_imgids = np.array(test_imgids)

    # save predictions (for the baseline these are the input sentence vectors)
    with open(predictions, 'wb') as fp:
        pickle.dump(test_sent, fp)

    # retrieving images for input stories
    finalpreds = utils_vist.retrieve_images(np.array(y_test),
                                            np.array(test_sent),
                                            np.array(id_test))

    # saving result dictionary
    results['input_stories'] = test_stories
    results['test_samples'] = test_lines
    results['test_gt_imageids'] = test_imgids
    results['test_pred_imageids'] = finalpreds

    results_path = 'results_baseline_' + config['date'] + '.pickle'
    with open(results_path, 'wb') as fp:
        pickle.dump(results, fp)
    return results
예제 #3
0
def main_func(datadir, process, model, isprune):
    """Extract image features for one split and optionally prune stories.

    Args:
        datadir: Root data directory as a string prefix (it is concatenated
            directly, so it is expected to end with a path separator).
        process: Name of the split; used both as a sub-directory and as a
            file-name prefix.
        model: Forwarded to ``Feat_forArray``.
        isprune: When truthy, every story index listed in the second value
            returned by ``Feat_forArray`` is removed from the image-id list
            and from the stories, and both pruned lists are written to
            ``<process>_image.csv`` and ``<process>_text.csv``.

    Returns:
        The ``(img_feats, story_noimg)`` pair returned by ``Feat_forArray``.
    """
    print('processing {0:s} data'.format(process))

    # Common "<datadir><process>/<process>" prefix of all per-split CSV files.
    split_prefix = datadir + process + '/' + process

    # Story -> image ids table.
    imageids = utils_vist.getImgIds(split_prefix + '_imageids.csv')

    # Directory holding the raw images of this split.
    imagepath = datadir + 'raw/images/' + process + '/'
    image_size = (224, 224)

    # Extract features for each image of the split and time the extraction.
    t0 = time.time()
    img_feats, story_noimg = Feat_forArray(imageids, imagepath, model,
                                           image_size)
    print('features extracted for {0:s} in {1:f} seconds'.format(
        process,
        time.time() - t0))

    if not isprune:
        return img_feats, story_noimg

    # Delete from the highest index down so earlier deletions do not shift
    # the positions of entries that still have to be removed.
    drop_order = sorted(story_noimg.keys(), reverse=True)
    for idx in drop_order:
        del imageids[idx]

    # Apply the same pruning to the text stories.
    stories = utils_vist.getSent(split_prefix + '_stories.csv')
    for idx in drop_order:
        del stories[idx]

    # Persist the pruned image ids and stories as CSV for later stages.
    utils_vist.write2csv(split_prefix + "_image.csv", imageids)
    utils_vist.write2csv(split_prefix + "_text.csv", stories)

    return img_feats, story_noimg
예제 #4
0
def _read_pickle(path):
    """Return the single object pickled in the file at *path*."""
    # NOTE: pickle is only safe on trusted files; these are caches written by
    # loadData itself.
    with open(path, 'rb') as fp:
        return pickle.load(fp)


def _write_pickle(obj, path):
    """Pickle *obj* to *path* with the highest available protocol."""
    with open(path, 'wb') as fp:
        pickle.dump(obj, fp, pickle.HIGHEST_PROTOCOL)


def _read_json(path):
    """Return the parsed content of the JSON file at *path*."""
    with open(path) as fp:
        return json.load(fp)


def loadData(config):
    """Return vocabulary size, embedding matrix and the three data splits.

    The five results are cached as pickle files directly under
    ``config['datadir']``. When all of them can be read they are returned
    as-is; otherwise everything is rebuilt from the raw CSV/JSON/GloVe files
    and the caches are rewritten.

    Args:
        config: Mapping that provides 'datadir' (string prefix, concatenated
            directly with file names). For a rebuild it must also provide
            'glovetext' and a 'stage1' mapping with the keys
            'MAX_SEQUENCE_LENGTH', 'img_fea_dim', 'wd_embd_dim' and
            'MAX_NB_WORDS'.

    Returns:
        Tuple ``(num_words, embedding_matrix, train_data, valid_data,
        test_data)``. On a rebuild each ``*_data`` entry is a list
        ``[padded sentence sequences, padded image features, ids]``.
    """
    datadir = config['datadir']

    try:
        num_words = _read_pickle(datadir + 'num_words.pickle')
        embedding_matrix = _read_pickle(datadir + 'embedding_matrix.pickle')
        train_data = _read_pickle(datadir + 'train_data.pickle')
        valid_data = _read_pickle(datadir + 'valid_data.pickle')
        test_data = _read_pickle(datadir + 'test_data.pickle')
    except (OSError, EOFError, pickle.UnpicklingError, AttributeError,
            ImportError, IndexError):
        # Missing, truncated or otherwise unreadable cache: rebuild below.
        # KeyboardInterrupt/SystemExit are deliberately not caught here.
        print('processed file(s) do not exist! Re-extracting all data')
    else:
        print('loaded existing data files')
        return num_words, embedding_matrix, train_data, valid_data, test_data

    traindir = datadir + 'train/'
    testdir = datadir + 'test/'
    valdir = datadir + 'val/'
    glovetext = config['glovetext']

    MAX_SEQUENCE_LENGTH = config['stage1']['MAX_SEQUENCE_LENGTH']
    img_fea_dim = config['stage1']['img_fea_dim']
    EMBEDDING_DIM = config['stage1']['wd_embd_dim']
    MAX_NB_WORDS = config['stage1']['MAX_NB_WORDS']

    starttime = time.time()
    # load img feat files
    img_fea_train = _read_json(traindir + 'train_imgfeat.json')
    img_fea_valid = _read_json(valdir + 'val_imgfeat.json')
    img_fea_test = _read_json(testdir + 'test_imgfeat.json')

    # get img IDs
    train_imgID = utils_vist.getImgIds(traindir + 'train_image.csv')
    valid_imgID = utils_vist.getImgIds(valdir + 'val_image.csv')
    test_imgID = utils_vist.getImgIds(testdir + 'test_image.csv')

    # get stories
    train_sents = utils_vist.getSent(traindir + 'train_text.csv')
    valid_sents = utils_vist.getSent(valdir + 'val_text.csv')
    test_sents = utils_vist.getSent(testdir + 'test_text.csv')

    print('loaded all files in {} secs'.format(time.time() - starttime))

    # get word vectors from glove: one "<word> <coef> <coef> ..." per line
    embeddings_index = {}
    with open(glovetext, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            values = line.split()
            if not values:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            embeddings_index[values[0]] = np.asarray(values[1:],
                                                     dtype='float32')
    print('Indexed word vectors.')

    # get num of samples
    # NOTE(review): presumably every story contributes 5 sentences after
    # flattening -- confirm against utils_vist.flatten_all.
    trainNum = len(train_imgID) * 5
    validNum = len(valid_imgID) * 5
    testNum = len(test_imgID) * 5

    # get image features and text sentences in a single list
    train_sents, train_img_feats, trainids = utils_vist.flatten_all(
        train_imgID, img_fea_train, train_sents)

    valid_sents, valid_img_feats, valids = utils_vist.flatten_all(
        valid_imgID, img_fea_valid, valid_sents)

    test_sents, test_img_feats, testids = utils_vist.flatten_all(
        test_imgID, img_fea_test, test_sents)

    # get all text in single list to process them together, so that all
    # splits share one tokenizer vocabulary
    sents = train_sents + valid_sents + test_sents

    # tokenize and convert each sentence to sequences
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(sents)
    sequences = tokenizer.texts_to_sequences(sents)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data_sents = pad_sequences(sequences, MAX_SEQUENCE_LENGTH)

    # get train data
    train_sents = data_sents[0:trainNum]
    train_img_feats = pad_sequences(train_img_feats, img_fea_dim)
    train_data = [train_sents, train_img_feats, trainids]

    # check some samples
    print(len(train_data[0]))
    train_imgs = train_data[1]
    print(np.shape(train_imgs[0]))

    # get val data
    valid_sents = data_sents[trainNum:(trainNum + validNum)]
    valid_img_feats = pad_sequences(valid_img_feats, img_fea_dim)
    valid_data = [valid_sents, valid_img_feats, valids]

    # get test data
    test_sents = data_sents[(trainNum + validNum):(trainNum + validNum +
                                                   testNum)]
    test_img_feats = pad_sequences(test_img_feats, img_fea_dim)
    test_data = [test_sents, test_img_feats, testids]

    print('Preparing embedding matrix.')
    # prepare embedding matrix
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            # index falls outside the matrix; report the word and skip it
            print('{}: {}'.format(word, i))
            continue
        if i == 0:
            print('{}: {}'.format(word, i))

        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # Cache everything for the next call. The protocol belongs to
    # pickle.dump, not to open() (where it would be taken as `buffering`).
    _write_pickle(num_words, datadir + 'num_words.pickle')
    _write_pickle(embedding_matrix, datadir + 'embedding_matrix.pickle')
    _write_pickle(train_data, datadir + 'train_data.pickle')
    _write_pickle(valid_data, datadir + 'valid_data.pickle')
    _write_pickle(test_data, datadir + 'test_data.pickle')

    return num_words, embedding_matrix, train_data, valid_data, test_data
예제 #5
0
def test(config,
         modelname,
         test_data,
         num_words,
         embedding_matrix,
         modeltype='cnsi'):
    """Predict with a trained stage-2 model and retrieve images.

    Args:
        config: Mapping read for the keys 'datadir', 'testsamples',
            'savepred' and 'date'; also forwarded to
            ``get_sent_img_feats_stage1``.
        modelname: Path handed to ``keras.models.load_model``.
        test_data: Forwarded to ``get_sent_img_feats_stage1``.
        num_words: Forwarded to ``get_sent_img_feats_stage1``.
        embedding_matrix: Forwarded to ``get_sent_img_feats_stage1``.
        modeltype: When equal to 'cnsi', the vectors stored in
            ``<datadir>test/cohvec_test.npy`` are fed to the model as a
            second input; any other value predicts from the sentence
            vectors alone.

    Returns:
        dict with the keys 'input_stories', 'test_samples',
        'test_gt_imageids' and 'test_pred_imageids'. The same dict is also
        pickled to ``results<config['date']>.pickle`` in the current working
        directory, and the raw model output is pickled to
        ``config['savepred']``.

    Raises:
        ValueError: If a line of the samples file is not an integer.
    """
    results = dict()
    testdir = config['datadir'] + 'test/'
    testinsamplename = config['testsamples']
    predictions = config['savepred']
    test_sents = utils_vist.getSent(testdir + 'test_text.csv')

    # get sentence and image vectors from stage 1
    print('obtaining sent and image vectors from stage 1...')
    x_test, y_test, id_test = get_sent_img_feats_stage1(
        config, test_data, num_words, embedding_matrix)

    # One sample index per line; the context manager closes the file
    # deterministically instead of leaking the handle.
    with open(testinsamplename, 'r', encoding='utf-8') as sample_file:
        test_lines = [line.rstrip('\n') for line in sample_file]

    if modeltype == 'cnsi':
        # insert a length-1 axis at position 1; it is repeated 5x below
        coh_sent_test = np.expand_dims(np.load(testdir + 'cohvec_test.npy'),
                                       axis=1)

    # get input and gt ready
    print('gettin input and coherence vectors ready..')
    test_sent = []
    test_imgids = []
    test_stories = []
    for ind in test_lines:
        ind = int(ind)
        test_sent.append(x_test[ind])
        test_imgids.append(id_test[ind][:])
        test_stories.append(test_sents[ind])
    test_sent = np.array(test_sent)
    test_imgids = np.array(test_imgids)

    if modeltype == 'cnsi':
        # keep only the selected samples, then repeat each vector 5 times
        # along axis 1
        coh_sent_test = coh_sent_test[np.array(test_lines).astype(int), :, :]
        coh_sent_test = np.repeat(coh_sent_test, 5, axis=1)

    # load model and predict
    print('predicting using stage 2...')
    trained_model = keras.models.load_model(
        modelname, custom_objects={'orderEmb_loss': modelArch.orderEmb_loss})
    if modeltype == 'cnsi':
        out_fea = trained_model.predict([test_sent, coh_sent_test])
    else:
        out_fea = trained_model.predict(test_sent)

    # save predictions
    with open(predictions, 'wb') as fp:
        pickle.dump(out_fea, fp)

    # retrieving images for input stories
    finalpreds = utils_vist.retrieve_images(np.array(y_test), out_fea,
                                            np.array(id_test))

    # saving result dictionary
    results['input_stories'] = test_stories
    results['test_samples'] = test_lines
    results['test_gt_imageids'] = test_imgids
    results['test_pred_imageids'] = finalpreds

    results_path = 'results' + config['date'] + '.pickle'
    with open(results_path, 'wb') as fp:
        pickle.dump(results, fp)
    return results