Example #1
def create_gt_synset_cooccur(exp_const, dataloader):
    print('Creating cooccur ...')
    cooccur = {}
    for data in tqdm(dataloader):
        B = len(data['object_synsets'])

        for b in range(B):
            for wnid1 in set(data['attribute_synsets'][b]):
                for wnid2 in set(data['attribute_synsets'][b]):
                    if wnid1 not in cooccur:
                        cooccur[wnid1] = {}

                    if wnid2 not in cooccur[wnid1]:
                        cooccur[wnid1][wnid2] = 0

                    cooccur[wnid1][wnid2] += 1

    synset_cooccur_json = os.path.join(exp_const.exp_dir,
                                       'synset_cooccur.json')
    io.dump_json_object(cooccur, synset_cooccur_json)

    print('Checking symmetry and self constraint in synset cooccur ...')
    for wnid1, context in tqdm(cooccur.items()):
        for wnid2, count in context.items():
            sym_err_msg = f'Synset cooccurrence not symmetric ({wnid1} / {wnid2})'
            assert (cooccur[wnid2][wnid1] == count), sym_err_msg

    print('Constraints satisfied')
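Note: the io used throughout these examples is a project-specific utility module, not Python's built-in io. A minimal sketch of the helpers the snippets appear to rely on (names taken from the calls above; the implementations below are assumptions) could look like this:

# Hypothetical stand-ins for the project's io utilities; the real versions
# may differ (pretty-printing, compression, error handling, etc.).
import json
import os


def mkdir_if_not_exists(path, recursive=False):
    # Create a directory (optionally with parents); do nothing if it exists.
    if recursive:
        os.makedirs(path, exist_ok=True)
    elif not os.path.exists(path):
        os.mkdir(path)


def dump_json_object(obj, json_path):
    # Serialize a Python object to a JSON file.
    with open(json_path, 'w') as f:
        json.dump(obj, f, indent=4)


def load_json_object(json_path):
    # Load a JSON file back into a Python object.
    with open(json_path) as f:
        return json.load(f)


def read(path):
    # Return raw file bytes (callers decode() the result themselves).
    with open(path, 'rb') as f:
        return f.read()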
Example #2
def main(**kwargs):
    subset = kwargs['subset']
    const = FlickrConstants()

    io.mkdir_if_not_exists(const.flickr_paths['proc_dir'])

    image_ids = io.read(const.subset_ids[subset])
    image_ids = [idx.decode() for idx in image_ids.split()]

    # Write boxes to json
    boxes = {}
    for image_id in tqdm(image_ids):
        box_xml = os.path.join(const.flickr_paths['anno_dir'],
                               f'{image_id}.xml')
        boxes[image_id] = get_annotations(box_xml)

    io.dump_json_object(boxes, const.box_json[subset])

    # Write sentence annos to json
    sent = {}
    for image_id in tqdm(image_ids):
        sent_txt = os.path.join(const.flickr_paths['sent_dir'],
                                f'{image_id}.txt')
        sent[image_id] = get_sentence_data(sent_txt)

    io.dump_json_object(sent, const.sent_json[subset])
Example #3
def prepare_data(exp_const, data_const):
    io.mkdir_if_not_exists(exp_const.exp_dir)

    print('Writing constants to exp dir ...')
    data_const_json = os.path.join(exp_const.exp_dir, 'data_const.json')
    data_const.to_json(data_const_json)

    exp_const_json = os.path.join(exp_const.exp_dir, 'exp_const.json')
    exp_const.to_json(exp_const_json)

    print('Loading anno_list.json ...')
    anno_list = io.load_json_object(data_const.anno_list_json)

    print('Creating input json for faster rcnn ...')
    images_in_out = [None] * len(anno_list)
    for i, anno in enumerate(anno_list):
        global_id = anno['global_id']
        image_in_out = dict()
        image_in_out['in_path'] = os.path.join(data_const.images_dir,
                                               anno['image_path_postfix'])
        image_in_out['out_dir'] = os.path.join(data_const.proc_dir,
                                               'faster_rcnn_boxes')
        image_in_out['prefix'] = f'{global_id}_'
        images_in_out[i] = image_in_out

    images_in_out_json = os.path.join(exp_const.exp_dir,
                                      'faster_rcnn_im_in_out.json')
    io.dump_json_object(images_in_out, images_in_out_json)
Example #4
def main(exp_const, data_const):
    print(f'Creating directory {exp_const.exp_dir} ...')
    io.mkdir_if_not_exists(exp_const.exp_dir, recursive=True)

    print('Saving constants ...')
    save_constants({'exp': exp_const, 'data': data_const}, exp_const.exp_dir)

    print('Loading data ...')
    img_id_to_obj_id = io.load_json_object(
        data_const.image_id_to_object_id_json)
    object_annos = io.load_json_object(data_const.object_annos_json)

    cooccur = {}
    for img_id, obj_ids in tqdm(img_id_to_obj_id.items()):
        synset_list = create_synset_list(object_annos, obj_ids)
        for synset1 in synset_list:
            for synset2 in synset_list:
                if synset1 not in cooccur:
                    cooccur[synset1] = {}

                if synset2 not in cooccur[synset1]:
                    cooccur[synset1][synset2] = 0

                cooccur[synset1][synset2] += 1

    synset_cooccur_json = os.path.join(exp_const.exp_dir,
                                       'synset_cooccur.json')
    io.dump_json_object(cooccur, synset_cooccur_json)
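The nested "if key not in dict" bookkeeping used for the co-occurrence counts above can also be written with collections.defaultdict. A small self-contained sketch of the same symmetric counting pattern, using made-up synset lists in place of create_synset_list:

from collections import defaultdict

# cooccur[s1][s2] counts how often synsets s1 and s2 appear in the same image.
cooccur = defaultdict(lambda: defaultdict(int))

# Toy per-image synset lists; the real ones come from the object annotations.
synset_lists = [['dog.n.01', 'ball.n.01'], ['dog.n.01', 'person.n.01']]
for synset_list in synset_lists:
    for synset1 in synset_list:
        for synset2 in synset_list:
            cooccur[synset1][synset2] += 1

# Counting both (s1, s2) and (s2, s1) keeps the matrix symmetric by construction.
assert cooccur['dog.n.01']['ball.n.01'] == cooccur['ball.n.01']['dog.n.01']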
Example #5
def main():
    const = VisualGenomeConstants()
    io.mkdir_if_not_exists(const.proc_dir,recursive=True)
    
    print('Loading objects.json ...')
    objects = io.load_json_object(const.objects_json)
    
    print('Loading object_synsets.json ...')
    object_synsets = io.load_json_object(const.object_synsets_json)
    
    print('Creating image_id_to_object_id.json ...')
    image_id_to_object_id = get_image_id_to_object_id(objects)
    io.dump_json_object(
        image_id_to_object_id,
        os.path.join(const.proc_dir,'image_id_to_object_id.json'))

    print('Loading attributes.json ...')
    attributes = io.load_json_object(const.attributes_json)
    
    print('Loading attribute_synsets.json ...')
    attribute_synsets = io.load_json_object(const.attribute_synsets_json)

    print('Creating object_annos.json ...')
    object_annos = get_object_annos(objects,attributes,attribute_synsets)
    io.dump_json_object(
        object_annos,
        os.path.join(const.proc_dir,'object_annos.json'))
Example #6
def main(exp_const, data_const, model_const):
    print('Creating network ...')
    model = Model()
    model.const = model_const
    model.encoder = Encoder(model.const.encoder).cuda()
    encoder_path = os.path.join(exp_const.model_dir,
                                'encoder_' + str(model.const.model_num))
    model.encoder.load_state_dict(torch.load(encoder_path))

    print('Creating dataloader ...')
    dataset = VisualFeaturesDataset(data_const)
    dataloader = DataLoader(dataset,
                            batch_size=exp_const.batch_size,
                            shuffle=True)

    print('Get features ...')
    features = get_visual_features(model, dataloader, exp_const)

    print('Save features h5py ...')
    word_features_h5py = h5py.File(
        os.path.join(exp_const.exp_dir, 'word_features.h5py'), 'w')
    word_features_h5py.create_dataset('features',
                                      data=features,
                                      chunks=(1, features.shape[1]))
    word_features_h5py.create_dataset('mean', data=np.mean(features, axis=0))
    word_features_h5py.close()

    print('Save features word idx json ...')
    word_to_idx_json = os.path.join(exp_const.exp_dir, 'word_to_idx.json')
    io.dump_json_object(dataloader.dataset.word_to_idx, word_to_idx_json)
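To consume the files written above, a reader opens the same HDF5 datasets and the word-to-index JSON. A minimal read-back sketch (exp_dir and the word 'dog' are placeholders):

import json
import os

import h5py

exp_dir = '/path/to/exp_dir'  # whatever exp_const.exp_dir pointed to

with h5py.File(os.path.join(exp_dir, 'word_features.h5py'), 'r') as f:
    features = f['features'][()]  # shape: (num_words, feature_dim)
    mean_feature = f['mean'][()]  # shape: (feature_dim,)

with open(os.path.join(exp_dir, 'word_to_idx.json')) as f:
    word_to_idx = json.load(f)

# Look up the visual feature for one word, falling back to the mean feature.
word = 'dog'
feat = features[word_to_idx[word]] if word in word_to_idx else mean_feature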
Example #7
def main(exp_const, data_const):
    class_confmat = np.load(data_const.class_confmat_npy)
    visual_embed = np.load(data_const.visual_embed_npy)
    labels = np.load(data_const.labels_npy)

    glove_vecs = visual_embed[:, :data_const.glove_dim]
    visual_vecs = visual_embed[:, data_const.glove_dim:]

    visual_sim = np.matmul(visual_vecs, np.transpose(visual_vecs))
    glove_sim = np.matmul(glove_vecs, np.transpose(glove_vecs))

    corr_pvalue = {}
    filename = os.path.join(exp_const.vis_dir,
                            'class_vs_glove_visual_sim.html')
    x, y, _ = create_scatter_plot(class_confmat, glove_sim + visual_sim,
                                  labels, filename)
    corr_pvalue['glove+visual'] = pearsoncorr(x, y)

    filename = os.path.join(exp_const.vis_dir, 'class_vs_visual_sim.html')
    x, y, _ = create_scatter_plot(class_confmat, visual_sim, labels, filename)
    corr_pvalue['visual'] = pearsoncorr(x, y)

    filename = os.path.join(exp_const.vis_dir, 'class_vs_glove_sim.html')
    x, y, _ = create_scatter_plot(class_confmat, glove_sim, labels, filename)
    corr_pvalue['glove'] = pearsoncorr(x, y)

    corr_pvalue_json = os.path.join(exp_const.exp_dir, 'corr_pvalue.json')
    io.dump_json_object(corr_pvalue, corr_pvalue_json)
Example #8
def main():
    args = parser.parse_args()

    data_const = HicoConstants(exp_ver=args.exp_ver)
    print('Creating output dir ...')
    io.mkdir_if_not_exists(data_const.result_dir + '/map', recursive=True)

    # Load hoi_list
    hoi_list_json = os.path.join(data_const.proc_dir, 'hoi_list.json')
    hoi_list = io.load_json_object(hoi_list_json)

    # Load subset ids to eval on
    split_ids_json = os.path.join(data_const.proc_dir, 'split_ids.json')
    split_ids = io.load_json_object(split_ids_json)
    global_ids = split_ids[args.subset]
    global_ids_set = set(global_ids)

    # Create gt_dets
    print('Creating GT dets ...')
    gt_dets = load_gt_dets(data_const.proc_dir, global_ids_set)

    eval_inputs = []
    for hoi in hoi_list:
        eval_inputs.append((hoi['id'], global_ids, gt_dets,
                            data_const.result_dir + '/pred_hoi_dets.hdf5',
                            data_const.result_dir + '/map'))

    print(f'Starting a pool of {args.num_processes} workers ...')
    p = Pool(args.num_processes)

    print('Begin mAP computation ...')
    output = p.starmap(eval_hoi, eval_inputs)

    p.close()
    p.join()

    mAP = {
        'AP': {},
        'mAP': 0,
        'invalid': 0,
    }
    map_ = 0
    count = 0
    for ap, hoi_id in output:
        mAP['AP'][hoi_id] = ap
        if not np.isnan(ap):
            count += 1
            map_ += ap

    mAP['mAP'] = map_ / count
    mAP['invalid'] = len(output) - count

    mAP_json = os.path.join(data_const.result_dir + '/map', 'mAP.json')
    io.dump_json_object(mAP, mAP_json)

    print(f'APs have been saved to {data_const.result_dir}/map')
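The NaN-aware averaging at the end of this example (skip invalid APs, divide by the count of valid ones) is equivalent to numpy's nanmean. A tiny check with made-up AP values:

import numpy as np

aps = np.array([0.5, np.nan, 0.7])  # APs for three hypothetical HOI classes
manual = np.nansum(aps) / np.sum(~np.isnan(aps))
assert np.isclose(manual, np.nanmean(aps))  # both give 0.6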
Example #9
def main(**kwargs):
    print('Creating Caption Encoder (tokenizer) ...')
    cap_encoder = CapEncoder(CapEncoderConstants())

    nltk.download('punkt')

    data_const = FlickrDatasetConstants(kwargs['subset'])
    data_const.read_noun_token_ids = False
    dataset = FlickrDataset(data_const)
    noun_token_ids = [None] * len(dataset)
    noun_vocab = set()
    num_human_captions = 0
    num_noun_captions = 0
    for i, data in enumerate(tqdm(dataset)):
        image_id = data['image_id']
        cap_id = data['cap_id']
        caption = data['caption']
        token_ids, tokens = cap_encoder.tokenize(caption)

        nltk_tokens = nltk.word_tokenize(caption.lower())
        pos_tags = nltk.pos_tag(nltk_tokens)
        pos_tags = ignore_words_from_pos(pos_tags,
                                         ['is', 'has', 'have', 'had', 'be'])

        alignment = align_pos_tokens(pos_tags, tokens)
        noun_token_ids_, noun_words = get_noun_token_ids(pos_tags, alignment)
        noun_token_ids_ = group_token_ids(noun_token_ids_, tokens)
        if len(noun_token_ids_) > 0:
            num_noun_captions += 1

        noun_token_ids[i] = {
            'image_id': image_id,
            'cap_id': cap_id,
            'token_ids': noun_token_ids_,
            'words': list(noun_words)
        }

        noun_vocab.update(noun_words)

        for human_word in [
                'man', 'person', 'human', 'woman', 'boy', 'girl', 'men',
                'women', 'boys', 'girls', 'child', 'children'
        ]:
            if human_word in tokens:
                num_human_captions += 1
                break

    io.mkdir_if_not_exists(
        os.path.join(flickr_paths['proc_dir'], 'annotations'))
    io.dump_json_object(noun_token_ids, data_const.noun_tokens_json)
    io.dump_json_object(sorted(list(noun_vocab)), data_const.noun_vocab_json)
    print('Number of human captions:', num_human_captions)
    print('Number of noun captions:', num_noun_captions)
    print('Total number of captions:', len(dataset))
    print('Size of noun vocabulary:', len(noun_vocab))
Example #10
def main(exp_const, data_const, model_const):
    io.mkdir_if_not_exists(exp_const.vis_dir)

    print('Creating network ...')
    model = Model()
    model.const = model_const

    model.net = ResnetModel(model.const.net)
    if model.const.model_num is not None:
        model.net.load_state_dict(torch.load(model.const.net_path))
    model.net.cuda()

    if not exp_const.feedforward:
        model.AttributeEmbeddings = AttributeEmbeddings(
            model.const.AttributeEmbeddings)
        if model.const.model_num is not None:
            model.AttributeEmbeddings.load_state_dict(
                torch.load(model.const.AttributeEmbeddings_path))
        model.AttributeEmbeddings.cuda()

    model.img_mean = np.array([0.485, 0.456, 0.406])
    model.img_std = np.array([0.229, 0.224, 0.225])

    print('Creating dataloader ...')
    dataset = Cifar100Dataset(data_const)
    dataloader = DataLoader(dataset,
                            batch_size=exp_const.batch_size,
                            shuffle=True,
                            num_workers=exp_const.num_workers)

    eval_results = eval_model(model, dataloader, exp_const)

    confmat_npy = os.path.join(exp_const.exp_dir, 'confmat.npy')
    np.save(confmat_npy, eval_results['Conf Mat'])

    results = {
        'Avg Loss': eval_results['Avg Loss'],
        'Acc': eval_results['Acc']
    }

    print(results)
    results_json = os.path.join(exp_const.exp_dir, 'results.json')
    io.dump_json_object(results, results_json)

    embeddings_npy = os.path.join(exp_const.exp_dir, 'embeddings.npy')
    if exp_const.feedforward:
        np.save(embeddings_npy,
                model.net.resnet_layers.fc.weight.data.cpu().numpy())
    else:
        np.save(embeddings_npy,
                model.AttributeEmbeddings.embed.weight.data.cpu().numpy())

    labels_npy = os.path.join(exp_const.exp_dir, 'labels.npy')
    np.save(labels_npy, dataset.labels)
Example #11
def main():
    embeddings_h5py = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_embeddings_recon_loss_trained_on_google/' + \
        'concat_glove_and_visual/visual_word_vecs.h5py')
    word_to_idx_json = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_embeddings_recon_loss_trained_on_google/' + \
        'concat_glove_and_visual/visual_word_vecs_idx.json')
    semaleval_all_words_json = os.path.join(
        os.getcwd(), 'symlinks/data/semeval_2018_10/proc/all_words.json')

    print('Reading embeddings ...')
    embeddings = h5py.File(embeddings_h5py, 'r')['embeddings'][()]
    mean_embedding = np.mean(embeddings, 0)
    word_to_idx = io.load_json_object(word_to_idx_json)
    words_to_select = list(
        io.load_json_object(semaleval_all_words_json).keys())

    print('Selecting subset ...')
    subset_embeddings = np.zeros([len(words_to_select), embeddings.shape[1]])
    subset_word_to_idx = {}
    count = 0
    for i, word in enumerate(tqdm(words_to_select)):
        subset_word_to_idx[word] = i
        if word not in word_to_idx:
            count += 1
            subset_embeddings[i] = mean_embedding
        else:
            idx = word_to_idx[word]
            subset_embeddings[i] = embeddings[idx]
    print(f'Words not found in embeddings (assigned mean): {count}')

    print('Saving selected subset embeddings ...')
    subset_visual_word_vecs_h5py_filename = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_embeddings_recon_loss_trained_on_google/' + \
        'concat_glove_and_visual/subset_visual_word_vecs.h5py')
    subset_embeddings_h5py = h5py.File(subset_visual_word_vecs_h5py_filename,
                                       'w')
    subset_embeddings_h5py.create_dataset('embeddings',
                                          data=subset_embeddings,
                                          chunks=(1,
                                                  subset_embeddings.shape[1]))
    subset_embeddings_h5py.close()

    subset_word_to_idx_json = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/' + \
        'normalized_resnet_embeddings_recon_loss_trained_on_google/' + \
        'concat_glove_and_visual/subset_visual_word_vecs_idx.json')
    io.dump_json_object(subset_word_to_idx, subset_word_to_idx_json)
Example #12
def main():
    genome_const = VisualGenomeConstants()
    object_freqs = io.load_json_object(genome_const.object_freqs_json)
    attribute_freqs = io.load_json_object(genome_const.attribute_freqs_json)
    vocab = copy.deepcopy(object_freqs)
    for word,freq in attribute_freqs.items():
        if word not in vocab:
            vocab[word] = 0

        vocab[word] += freq

    io.dump_json_object(vocab, genome_const.all_word_freqs_json)
Example #13
def main():
    url = 'https://gist.githubusercontent.com/yrevar/6135f1bd8dcf2e0cc683/' + \
        'raw/d133d61a09d7e5a3b36b8c111a8dd5c4b5d560ee/' + \
        'imagenet1000_clsid_to_human.pkl'
    outdir = os.path.join(os.getcwd(), 'symlinks/data/imagenet/proc')
    io.mkdir_if_not_exists(outdir, recursive=True)
    labels_json = os.path.join(outdir, 'labels.json')
    labels_dict = pickle.load(urlrequest.urlopen(url))
    labels = []
    for i in range(len(labels_dict)):
        labels.append(labels_dict[i])
    io.dump_json_object(labels, labels_json)
Example #14
def main():
    semeval_const = SemEval201810Constants()
    word_freqs = io.load_json_object(semeval_const.word_freqs)
    feature_freqs = io.load_json_object(semeval_const.feature_freqs)
    vocab = copy.deepcopy(word_freqs)
    for word, freq in feature_freqs.items():
        if word not in vocab:
            vocab[word] = 0

        vocab[word] += freq

    io.dump_json_object(vocab, semeval_const.all_word_freqs)
Example #15
def main(exp_const, data_const):
    io.mkdir_if_not_exists(exp_const.exp_dir, recursive=True)
    save_constants({'exp': exp_const, 'data': data_const}, exp_const.exp_dir)

    print('Loading glove embeddings ...')
    glove_idx = io.load_json_object(data_const.glove_idx)
    glove_h5py = h5py.File(data_const.glove_h5py, 'r')
    glove_embeddings = glove_h5py['embeddings'][()]
    num_glove_words, glove_dim = glove_embeddings.shape
    print('-' * 80)
    print(f'number of glove words: {num_glove_words}')
    print(f'glove dim: {glove_dim}')
    print('-' * 80)

    print('Loading visual features ...')
    visual_features_idx = io.load_json_object(data_const.visual_features_idx)
    visual_features_h5py = h5py.File(data_const.visual_features_h5py, 'r')
    visual_features = visual_features_h5py['features'][()]
    num_visual_features, visual_features_dim = visual_features.shape
    print('-' * 80)
    print(f'number of visual features: {num_visual_features}')
    print(f'visual feature dim: {visual_features_dim}')
    print('-' * 80)

    print('Combining glove with visual features ...')
    visual_word_vecs_idx_json = os.path.join(exp_const.exp_dir,
                                             'visual_word_vecs_idx.json')
    io.dump_json_object(glove_idx, visual_word_vecs_idx_json)
    visual_word_vecs_h5py = h5py.File(
        os.path.join(exp_const.exp_dir, 'visual_word_vecs.h5py'), 'w')
    visual_word_vec_dim = glove_dim + visual_features_dim
    visual_word_vecs = np.zeros([num_glove_words, visual_word_vec_dim])
    mean_visual_feature = visual_features_h5py['mean'][()]
    for word in tqdm(glove_idx.keys()):
        glove_id = glove_idx[word]
        glove_vec = glove_embeddings[glove_id]
        if word in visual_features_idx:
            feature_id = visual_features_idx[word]
            feature = visual_features[feature_id]
        else:
            feature = mean_visual_feature
        visual_word_vec = np.concatenate(
            (glove_vec, (feature - mean_visual_feature)))
        # visual_word_vec = np.concatenate((
        #     normalize(glove_vec),
        #     normalize(feature)))
        visual_word_vecs[glove_id] = visual_word_vec

    visual_word_vecs_h5py.create_dataset('embeddings',
                                         data=visual_word_vecs,
                                         chunks=(1, visual_word_vec_dim))
    visual_word_vecs_h5py.close()
Example #16
def main():
    const = ImagenetConstants()

    print('Reading txt file ...')
    with open(const.is_a_txt, 'r', encoding='ISO-8859-1') as f:  #ISO-8859-1
        lines = f.readlines()

    print('Parsing is_a relationship ...')
    lines = [line.rstrip('\n') for line in lines]
    lines = [line.split(' ') for line in lines]
    wnid_to_parent = {c: p for p, c in lines}

    print('Saving wnid_to_parent.json')
    io.dump_json_object(wnid_to_parent, const.wnid_to_parent_json)
Example #17
    def convert(self):
        print('Creating anno list for vcoco...')
        corre = np.zeros((24, 80))
        anno_list, corre = self.create_anno_list(corre)
        io.dump_json_object(anno_list, self.const.anno_list_json)
        np.save(self.const.mat_npy, corre)

        print('Creating hoi list for vcoco...')
        hoi_list = self.create_hoi_list()
        io.dump_json_object(hoi_list, self.const.hoi_list_json)

        print('Creating object list for vcoco...')
        object_list = self.create_obj_list()
        io.dump_json_object(object_list, self.const.object_list_json)

        print('Creating verb list for vcoco...')
        verb_list = []
        for i, verb in enumerate(self.VCOCO_train.actions):
            verb_list_item = {
                'id':
                str(self.VCOCO_train.actions_to_id_map[verb] + 1).zfill(3),
                'name': verb
            }
            verb_list.append(verb_list_item)
        io.dump_json_object(verb_list, self.const.verb_list_json)
Example #18
def main():
    args = parser.parse_args()
    data_const = HicoConstants(exp_ver=args.exp_ver)
    out_dir = data_const.result_dir+'/map'

    bin_to_hoi_ids = io.load_json_object(data_const.bin_to_hoi_ids_json)
    
    mAP_json = os.path.join(out_dir,'mAP.json')
    APs = io.load_json_object(mAP_json)['AP']
    bin_map = {}
    bin_count = {}
    for bin_id,hoi_ids in bin_to_hoi_ids.items():
        bin_map[bin_id] = compute_mAP(APs,hoi_ids)

    non_rare_hoi_ids = []
    for ul in bin_to_hoi_ids.keys():
        if ul=='10':
            continue
        non_rare_hoi_ids += bin_to_hoi_ids[ul]

    sample_complexity_analysis = {
        'bin': bin_map,
        'full': compute_mAP(APs,APs.keys()),
        'rare': bin_map['10'],
        'non_rare': compute_mAP(APs,non_rare_hoi_ids)
    }

    sample_complexity_analysis_json = os.path.join(
        out_dir, 'sample_complexity_analysis.json')
    io.dump_json_object(
        sample_complexity_analysis,
        sample_complexity_analysis_json)


    bin_names = sorted([int(ul) for ul in bin_map.keys()])
    bin_names = [str(ul) for ul in bin_names]
    bin_headers = ['0'] + bin_names
    bin_headers = [
        bin_headers[i] + '-' + str(int(ul) - 1)
        for i, ul in enumerate(bin_headers[1:])
    ]
    headers = ['Full','Rare','Non-Rare'] + bin_headers

    sca = sample_complexity_analysis
    values = [sca['full'],sca['rare'],sca['non_rare']] + \
        [bin_map[name] for name in bin_names]
    values = [str(round(v*100,2)) for v in values]

    print('Space delimited values that can be copied to spreadsheet and split by space')
    print(' '.join(headers))
    print(' '.join(values))
Example #19
def main(**kwargs):
    data_const = CocoConstants()
    image_dir = data_const.image_subset_dir[kwargs['subset']]
    image_path_list = glob.glob(os.path.join(image_dir,'*.jpg'))
    det_input = []
    for image_path in tqdm(image_path_list):
        det_input.append({
            'path': image_path,
            'id': os.path.splitext(os.path.basename(image_path))[0]
        })

    io.dump_json_object(
        det_input,
        os.path.join(
            kwargs['out_dir'],
            'det_input_'+kwargs['subset']+'.json'))
Example #20
def main(exp_const, data_const, model_const):
    np.random.seed(exp_const.seed)
    torch.manual_seed(exp_const.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print('Creating network ...')
    model = Constants()
    model.const = model_const
    model.object_encoder = ObjectEncoder(model.const.object_encoder)
    model.cap_encoder = CapEncoder(model.const.cap_encoder)

    o_dim = model.object_encoder.const.object_feature_dim
    if exp_const.contextualize:
        o_dim = model.object_encoder.const.context_layer.hidden_size

    model.lang_sup_criterion = create_cap_info_nce_criterion(
        o_dim, model.object_encoder.const.object_feature_dim,
        model.cap_encoder.model.config.hidden_size,
        model.cap_encoder.model.config.hidden_size // 2,
        model.const.cap_info_nce_layers)
    if model.const.model_num != -1:
        loaded_object_encoder = torch.load(model.const.object_encoder_path)
        print('Loaded model number:', loaded_object_encoder['step'])
        model.object_encoder.load_state_dict(
            loaded_object_encoder['state_dict'])
        model.lang_sup_criterion.load_state_dict(
            torch.load(model.const.lang_sup_criterion_path)['state_dict'])
        if exp_const.random_lang is True:
            model.cap_encoder.load_state_dict(
                torch.load(model.const.cap_encoder_path)['state_dict'])

    model.object_encoder.cuda()
    model.cap_encoder.cuda()
    model.lang_sup_criterion.cuda()

    print('Creating dataloader ...')
    dataset = FlickrDataset(data_const)

    with torch.no_grad():
        results = eval_model(model, dataset, exp_const)

    filename = os.path.join(
        exp_const.exp_dir,
        f'results_{data_const.subset}_{model_const.model_num}.json')
    io.dump_json_object(results, filename)
Example #21
def main():
    const = SemEval201810Constants()
    io.mkdir_if_not_exists(const.proc_dir)

    subset_txt_file = {
        'train': const.train_txt,
        'val': const.val_txt,
        'test': const.test_txt,
        'truth': const.truth_txt
    }

    for subset, txt_file in subset_txt_file.items():
        print(f'Converting {subset}.txt file to json ...')
        data = read_txt(txt_file)
        print(f'Number of samples: {len(data)}')
        io.dump_json_object(data, os.path.join(const.proc_dir,
                                               f'{subset}.json'))
Example #22
def main(**kwargs):
    subset = kwargs['subset']
    const = FlickrConstants()

    image_ids = io.read(const.subset_ids[subset])
    image_ids = [idx.decode() for idx in image_ids.split()]

    image_dir = const.flickr_paths['image_dir']

    det_input = []
    for image_id in tqdm(image_ids):
        image_path = os.path.join(image_dir, f'{image_id}.jpg')
        det_input.append({'path': image_path, 'id': image_id})

    io.dump_json_object(
        det_input,
        os.path.join(kwargs['out_dir'],
                     'det_input_' + kwargs['subset'] + '.json'))
Example #23
def main(exp_const,data_const,model_const):
    print('Creating network ...')
    model = Model()
    model.const = model_const
    model.net = LogBilinear(model.const.net)
    if model.const.model_num is not None:
        model.net.load_state_dict(torch.load(model.const.net_path))
    
    embeddings = 0.5*(model.net.embed1.W.weight + model.net.embed2.W.weight)
    embeddings = embeddings.data.numpy()
    embeddings_json = os.path.join(exp_const.exp_dir,'visual_embeddings.npy')
    np.save(embeddings_json,embeddings)

    print('Saving word_to_idx.json ...')
    dataset = MultiSenseCooccurDataset(data_const)
    word_to_idx = dataset.word_to_idx
    word_to_idx_json = os.path.join(exp_const.exp_dir,'word_to_idx.json')
    io.dump_json_object(word_to_idx,word_to_idx_json)
Example #24
def main(embedPath, outdir, vocab_json, embed_type):
    io.mkdir_if_not_exists(outdir)

    vocab = io.load_json_object(vocab_json)

    with open(embedPath, 'r', encoding='latin') as fileId:
        # Split each line into the word and the rest of the vector string
        lines = [line.split(' ', 1) for line in tqdm(fileId.readlines())]

    # Vocabulary size comes from vocab_json; the embedding dim comes from
    # the header line of the embedding file
    vocab_size = len(vocab)
    dim = int(lines[0][1][:-1])
    print(vocab_size, dim)
    embed = np.zeros([vocab_size, dim])
    word_to_idx = {}
    for line in tqdm(lines[1:]):
        word = str(line[0])
        if word not in vocab:
            continue

        vec = line[1]  # space separated string of numbers with '\n' at the end
        if embed_type == 'word2vec_wiki' or embed_type == 'visual_word2vec_wiki':
            vec = vec[:-1]
            vec = vec.split(' ')
        else:
            vec = vec.split(' ')[:-1]  # get rid of the '\n'

        idx = vocab[word]
        word_to_idx[word] = idx
        embed[idx] = [float(s) for s in vec]

    embed_npy = os.path.join(outdir, 'visual_embeddings.npy')
    np.save(embed_npy, embed)

    word_to_idx_json = os.path.join(outdir, 'word_to_idx.json')
    io.dump_json_object(word_to_idx, word_to_idx_json)
Example #25
def create_gt_synset_cooccur(exp_const, dataloader):
    print('Creating cooccur ...')
    cooccur = {}
    for data in tqdm(dataloader):
        on_wnids = data['on_wnids']
        for b in range(len(on_wnids)):
            for wnid1 in set(on_wnids[b]):
                for wnid2 in set(on_wnids[b]):
                    if wnid1 not in cooccur:
                        cooccur[wnid1] = {}

                    if wnid2 not in cooccur[wnid1]:
                        cooccur[wnid1][wnid2] = 0

                    cooccur[wnid1][wnid2] += 1

    print('Creating offsets to synsets dict ...')
    offset_to_synset = {}
    for offset in tqdm(cooccur.keys()):
        offset_to_synset[offset] = wnid_offset_to_synset(offset)

    print('Replacing offset by synset in cooccur ...')
    synset_cooccur = {}
    for offset1 in tqdm(cooccur.keys()):
        synset1 = offset_to_synset[offset1]

        context = {}
        for offset2, count in cooccur[offset1].items():
            synset2 = offset_to_synset[offset2]
            context[synset2] = count

        synset_cooccur[synset1] = context

    synset_cooccur_json = os.path.join(exp_const.exp_dir,
                                       'synset_cooccur.json')
    io.dump_json_object(synset_cooccur, synset_cooccur_json)

    print('Checking symmetry and self constraint in synset cooccur ...')
    for wnid1, context in tqdm(synset_cooccur.items()):
        for wnid2, count in context.items():
            sym_err_msg = f'Synset cooccurrence not symmetric ({wnid1} / {wnid2})'
            assert (synset_cooccur[wnid2][wnid1] == count), sym_err_msg

    print('Constraints satisfied')
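wnid_offset_to_synset is not shown here. Assuming the wnids have the usual ImageNet form (a POS letter followed by an 8-digit WordNet offset, e.g. 'n02084071'), one plausible implementation using NLTK's WordNet reader would be:

from nltk.corpus import wordnet as wn


def wnid_offset_to_synset(wnid):
    # 'n02084071' -> POS 'n' and offset 2084071 -> 'dog.n.01'
    pos, offset = wnid[0], int(wnid[1:])
    # synset_from_pos_and_offset is available in recent NLTK releases.
    return wn.synset_from_pos_and_offset(pos, offset).name()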
Example #26
def main():
    const = VisualGenomeConstants()
    io.mkdir_if_not_exists(const.proc_dir, recursive=True)

    print('Loading object_annos.json ...')
    object_annos = io.load_json_object(const.object_annos_json)

    print('Computing object frequencies ...')
    object_freqs = compute_object_freqs(object_annos)
    print(f'Number of objects: {len(object_freqs)}')
    io.dump_json_object(object_freqs,
                        os.path.join(const.proc_dir, 'object_freqs.json'))

    print('Computing object synset frequencies ...')
    object_synset_freqs = compute_object_synset_freqs(object_annos)
    print(f'Number of object_synsets: {len(object_synset_freqs)}')
    io.dump_json_object(
        object_synset_freqs,
        os.path.join(const.proc_dir, 'object_synset_freqs.json'))
Example #27
def main(exp_const,data_const):
    print('Loading synset cooccurence ...')
    synset_cooccur = io.load_json_object(data_const.synset_cooccur_json)

    print('Checking symmetry and self constraint in synset cooccur ...')
    sym_err_msg = 'Synset cooccurrence not symmetric ...'
    for word1, context in tqdm(synset_cooccur.items()):
        for word2, count in context.items():
            assert (synset_cooccur[word2][word1] == count), sym_err_msg


    print('Mapping synsets to words ...')
    synset_to_words_dict = {}
    for synset in tqdm(synset_cooccur.keys()):
        synset_to_words_dict[synset] = synset_to_words(synset)

    print('Creating word cooccurrence ...')
    word_cooccur = {}
    for wnid1, context in tqdm(synset_cooccur.items()):
        words1 = synset_to_words_dict[wnid1]

        for wnid2, count in context.items():
            words2 = synset_to_words_dict[wnid2]
            
            for word1 in set(words1):
                for word2 in set(words2):
                    if word1 not in word_cooccur:
                        word_cooccur[word1] = {}

                    if word2 not in word_cooccur[word1]:
                        word_cooccur[word1][word2] = 0

                    word_cooccur[word1][word2] += count

    io.dump_json_object(word_cooccur,data_const.word_cooccur_json)

    print('Checking symmetry and self constraint in word cooccur...')
    for word1, context in tqdm(word_cooccur.items()):
        for word2, count in context.items():
            sym_err_msg = f'Word cooccurrence not symmetric ({word1} / {word2})'
            assert (word_cooccur[word2][word1] == count), sym_err_msg

    print('Constraints satisfied')
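synset_to_words is likewise defined elsewhere (Example #29 uses a two-argument variant that also filters stop words). A plausible one-argument version that maps a synset name such as 'dog.n.01' to its lemma words via NLTK (an assumption; the project's helper may normalize differently, e.g. splitting multi-word lemmas) could be:

from nltk.corpus import wordnet as wn


def synset_to_words(synset_name):
    # 'dog.n.01' -> ['dog', 'domestic_dog', 'canis_familiaris']
    return [lemma.lower() for lemma in wn.synset(synset_name).lemma_names()]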
Example #28
def main():
    const = SemEval201810Constants()

    subset_json = {
        'train': const.train_json,
        'val': const.val_json,
        'test': const.test_json
    }

    words = []
    for subset, json_file in subset_json.items():
        data = io.load_json_object(json_file)
        data = [row[:2] for row in data]
        words += data

    print('Computing word frequency ...')
    word_freqs = compute_word_freq(words)
    print(f'Number of words: {len(word_freqs)}')
    io.dump_json_object(word_freqs,
                        os.path.join(const.proc_dir, 'word_freqs.json'))
Example #29
def main(exp_const):
    io.mkdir_if_not_exists(exp_const.exp_dir, recursive=True)

    cooccur = {}
    nltk.download('wordnet')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    for synset in wn.all_synsets():
        words = synset_to_words(synset, stop_words)
        for word1 in words:
            for word2 in words:
                if word1 not in cooccur:
                    cooccur[word1] = {}

                if word2 not in cooccur[word1]:
                    cooccur[word1][word2] = 0

                cooccur[word1][word2] += 1

    cooccur_json = os.path.join(exp_const.exp_dir, 'word_cooccur.json')
    io.dump_json_object(cooccur, cooccur_json)
Example #30
def main():
    const = SemEval201810Constants()

    subset_json = {
        'train': const.train_json,
        'val': const.val_json,
        'test': const.test_json
    }

    features = []
    for subset, json_file in subset_json.items():
        data = io.load_json_object(json_file)
        data = [row[2] for row in data]
        features += data

    print('Computing feature frequency ...')
    feature_freqs = compute_feature_freq(features)
    print(f'Number of features: {len(feature_freqs)}')
    io.dump_json_object(
        feature_freqs,
        os.path.join(const.proc_dir,'feature_freqs.json'))