def create_gt_synset_cooccur(exp_const, dataloader):
    print('Creating cooccur ...')
    cooccur = {}
    for data in tqdm(dataloader):
        B = len(data['object_synsets'])
        for b in range(B):
            for wnid1 in set(data['attribute_synsets'][b]):
                for wnid2 in set(data['attribute_synsets'][b]):
                    if wnid1 not in cooccur:
                        cooccur[wnid1] = {}
                    if wnid2 not in cooccur[wnid1]:
                        cooccur[wnid1][wnid2] = 0
                    cooccur[wnid1][wnid2] += 1

    synset_cooccur_json = os.path.join(exp_const.exp_dir, 'synset_cooccur.json')
    io.dump_json_object(cooccur, synset_cooccur_json)

    print('Checking symmetry and self constraint in synset cooccur ...')
    for wnid1, context in tqdm(cooccur.items()):
        for wnid2, count in context.items():
            sym_err_msg = f'Word cooccurrence not symmetric ({wnid1} / {wnid2})'
            assert cooccur[wnid2][wnid1] == count, sym_err_msg
    print('Constraints satisfied')
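# Hedged sketch (not from the original repo): the structure built above is a nested
# dict {id1: {id2: count}} that is symmetric by construction because the inner loops
# visit every ordered pair. A minimal standalone version of the accumulation and the
# symmetry check, runnable on plain lists of ids:
from collections import defaultdict

def accumulate_cooccur_sketch(id_lists):
    cooccur = defaultdict(lambda: defaultdict(int))
    for ids in id_lists:
        for id1 in set(ids):
            for id2 in set(ids):
                cooccur[id1][id2] += 1
    return {k: dict(v) for k, v in cooccur.items()}

def assert_symmetric_sketch(cooccur):
    for id1, context in cooccur.items():
        for id2, count in context.items():
            assert cooccur[id2][id1] == count, f'not symmetric ({id1} / {id2})'

demo = accumulate_cooccur_sketch([['n01', 'n02'], ['n01', 'n02', 'n03']])
assert_symmetric_sketch(demo)   # passes; demo['n01']['n01'] == 2 (self counts included)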
def main(**kwargs):
    subset = kwargs['subset']
    const = FlickrConstants()
    io.mkdir_if_not_exists(const.flickr_paths['proc_dir'])

    image_ids = io.read(const.subset_ids[subset])
    image_ids = [idx.decode() for idx in image_ids.split()]

    # Write boxes to json
    boxes = {}
    for image_id in tqdm(image_ids):
        box_xml = os.path.join(const.flickr_paths['anno_dir'], f'{image_id}.xml')
        boxes[image_id] = get_annotations(box_xml)
    io.dump_json_object(boxes, const.box_json[subset])

    # Write sentence annos to json
    sent = {}
    for image_id in tqdm(image_ids):
        sent_txt = os.path.join(const.flickr_paths['sent_dir'], f'{image_id}.txt')
        sent[image_id] = get_sentence_data(sent_txt)
    io.dump_json_object(sent, const.sent_json[subset])
def prepare_data(exp_const, data_const):
    io.mkdir_if_not_exists(exp_const.exp_dir)

    print('Writing constants to exp dir ...')
    data_const_json = os.path.join(exp_const.exp_dir, 'data_const.json')
    data_const.to_json(data_const_json)
    exp_const_json = os.path.join(exp_const.exp_dir, 'exp_const.json')
    exp_const.to_json(exp_const_json)

    print('Loading anno_list.json ...')
    anno_list = io.load_json_object(data_const.anno_list_json)

    print('Creating input json for faster rcnn ...')
    images_in_out = [None] * len(anno_list)
    for i, anno in enumerate(anno_list):
        global_id = anno['global_id']
        image_in_out = dict()
        image_in_out['in_path'] = os.path.join(
            data_const.images_dir, anno['image_path_postfix'])
        image_in_out['out_dir'] = os.path.join(
            data_const.proc_dir, 'faster_rcnn_boxes')
        image_in_out['prefix'] = f'{global_id}_'
        images_in_out[i] = image_in_out

    images_in_out_json = os.path.join(
        exp_const.exp_dir, 'faster_rcnn_im_in_out.json')
    io.dump_json_object(images_in_out, images_in_out_json)
def main(exp_const, data_const):
    print(f'Creating directory {exp_const.exp_dir} ...')
    io.mkdir_if_not_exists(exp_const.exp_dir, recursive=True)

    print('Saving constants ...')
    save_constants({'exp': exp_const, 'data': data_const}, exp_const.exp_dir)

    print('Loading data ...')
    img_id_to_obj_id = io.load_json_object(data_const.image_id_to_object_id_json)
    object_annos = io.load_json_object(data_const.object_annos_json)

    cooccur = {}
    for img_id, obj_ids in tqdm(img_id_to_obj_id.items()):
        synset_list = create_synset_list(object_annos, obj_ids)
        for synset1 in synset_list:
            for synset2 in synset_list:
                if synset1 not in cooccur:
                    cooccur[synset1] = {}
                if synset2 not in cooccur[synset1]:
                    cooccur[synset1][synset2] = 0
                cooccur[synset1][synset2] += 1

    synset_cooccur_json = os.path.join(exp_const.exp_dir, 'synset_cooccur.json')
    io.dump_json_object(cooccur, synset_cooccur_json)
def main():
    const = VisualGenomeConstants()
    io.mkdir_if_not_exists(const.proc_dir, recursive=True)

    print('Loading objects.json ...')
    objects = io.load_json_object(const.objects_json)

    print('Loading object_synsets.json ...')
    object_synsets = io.load_json_object(const.object_synsets_json)

    print('Creating image_id_to_object_id.json ...')
    image_id_to_object_id = get_image_id_to_object_id(objects)
    io.dump_json_object(
        image_id_to_object_id,
        os.path.join(const.proc_dir, 'image_id_to_object_id.json'))

    print('Loading attributes.json ...')
    attributes = io.load_json_object(const.attributes_json)

    print('Loading attribute_synsets.json ...')
    attribute_synsets = io.load_json_object(const.attribute_synsets_json)

    print('Creating object_annos.json ...')
    object_annos = get_object_annos(objects, attributes, attribute_synsets)
    io.dump_json_object(
        object_annos,
        os.path.join(const.proc_dir, 'object_annos.json'))
def main(exp_const, data_const, model_const):
    print('Creating network ...')
    model = Model()
    model.const = model_const
    model.encoder = Encoder(model.const.encoder).cuda()
    encoder_path = os.path.join(
        exp_const.model_dir, 'encoder_' + str(model.const.model_num))
    model.encoder.load_state_dict(torch.load(encoder_path))

    print('Creating dataloader ...')
    dataset = VisualFeaturesDataset(data_const)
    dataloader = DataLoader(
        dataset, batch_size=exp_const.batch_size, shuffle=True)

    print('Get features ...')
    features = get_visual_features(model, dataloader, exp_const)

    print('Save features h5py ...')
    word_features_h5py = h5py.File(
        os.path.join(exp_const.exp_dir, 'word_features.h5py'), 'w')
    word_features_h5py.create_dataset(
        'features', data=features, chunks=(1, features.shape[1]))
    word_features_h5py.create_dataset('mean', data=np.mean(features, axis=0))
    word_features_h5py.close()

    print('Save features word idx json ...')
    word_to_idx_json = os.path.join(exp_const.exp_dir, 'word_to_idx.json')
    io.dump_json_object(dataloader.dataset.word_to_idx, word_to_idx_json)
def main(exp_const, data_const):
    class_confmat = np.load(data_const.class_confmat_npy)
    visual_embed = np.load(data_const.visual_embed_npy)
    labels = np.load(data_const.labels_npy)

    glove_vecs = visual_embed[:, :data_const.glove_dim]
    visual_vecs = visual_embed[:, data_const.glove_dim:]
    visual_sim = np.matmul(visual_vecs, np.transpose(visual_vecs))
    glove_sim = np.matmul(glove_vecs, np.transpose(glove_vecs))

    corr_pvalue = {}

    filename = os.path.join(exp_const.vis_dir, 'class_vs_glove_visual_sim.html')
    x, y, _ = create_scatter_plot(
        class_confmat, glove_sim + visual_sim, labels, filename)
    corr_pvalue['glove+visual'] = pearsoncorr(x, y)

    filename = os.path.join(exp_const.vis_dir, 'class_vs_visual_sim.html')
    x, y, _ = create_scatter_plot(class_confmat, visual_sim, labels, filename)
    corr_pvalue['visual'] = pearsoncorr(x, y)

    filename = os.path.join(exp_const.vis_dir, 'class_vs_glove_sim.html')
    x, y, _ = create_scatter_plot(class_confmat, glove_sim, labels, filename)
    corr_pvalue['glove'] = pearsoncorr(x, y)

    corr_pvalue_json = os.path.join(exp_const.exp_dir, 'corr_pvalue.json')
    io.dump_json_object(corr_pvalue, corr_pvalue_json)
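# Hedged guess at pearsoncorr (defined elsewhere in the repo; this is only a plausible
# stand-in): Pearson correlation plus p-value as provided by scipy. The repo helper
# may round the values or return them in a different container.
from scipy.stats import pearsonr

def pearsoncorr_sketch(x, y):
    corr, pvalue = pearsonr(x, y)
    return {'corr': corr, 'pvalue': pvalue}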
def main():
    args = parser.parse_args()
    data_const = HicoConstants(exp_ver=args.exp_ver)

    print('Creating output dir ...')
    io.mkdir_if_not_exists(data_const.result_dir + '/map', recursive=True)

    # Load hoi_list
    hoi_list_json = os.path.join(data_const.proc_dir, 'hoi_list.json')
    hoi_list = io.load_json_object(hoi_list_json)

    # Load subset ids to eval on
    split_ids_json = os.path.join(data_const.proc_dir, 'split_ids.json')
    split_ids = io.load_json_object(split_ids_json)
    global_ids = split_ids[args.subset]
    global_ids_set = set(global_ids)

    # Create gt_dets
    print('Creating GT dets ...')
    gt_dets = load_gt_dets(data_const.proc_dir, global_ids_set)

    eval_inputs = []
    for hoi in hoi_list:
        eval_inputs.append(
            (hoi['id'], global_ids, gt_dets,
             data_const.result_dir + '/pred_hoi_dets.hdf5',
             data_const.result_dir + '/map'))

    print(f'Starting a pool of {args.num_processes} workers ...')
    p = Pool(args.num_processes)

    print('Begin mAP computation ...')
    output = p.starmap(eval_hoi, eval_inputs)
    p.close()
    p.join()

    mAP = {
        'AP': {},
        'mAP': 0,
        'invalid': 0,
    }
    map_ = 0
    count = 0
    for ap, hoi_id in output:
        mAP['AP'][hoi_id] = ap
        if not np.isnan(ap):
            count += 1
            map_ += ap
    mAP['mAP'] = map_ / count
    mAP['invalid'] = len(output) - count

    mAP_json = os.path.join(data_const.result_dir + '/map', 'mAP.json')
    io.dump_json_object(mAP, mAP_json)
    print(f'APs have been saved to {data_const.result_dir}/map')
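# Hedged sketch (helper not in the original file): the aggregation above averages only
# the valid (non-NaN) APs and reports the remainder as 'invalid'. A standalone version
# of the same arithmetic:
import numpy as np

def aggregate_map_sketch(ap_by_hoi):
    valid = [ap for ap in ap_by_hoi.values() if not np.isnan(ap)]
    return {
        'AP': ap_by_hoi,
        'mAP': float(np.mean(valid)) if valid else 0.0,
        'invalid': len(ap_by_hoi) - len(valid),
    }

# aggregate_map_sketch({'001': 0.5, '002': float('nan'), '003': 0.7})
# -> {'AP': {...}, 'mAP': 0.6, 'invalid': 1}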
def main(**kwargs):
    print('Creating Caption Encoder (tokenizer) ...')
    cap_encoder = CapEncoder(CapEncoderConstants())

    nltk.download('punkt')

    data_const = FlickrDatasetConstants(kwargs['subset'])
    data_const.read_noun_token_ids = False
    dataset = FlickrDataset(data_const)

    noun_token_ids = [None] * len(dataset)
    noun_vocab = set()
    num_human_captions = 0
    num_noun_captions = 0
    for i, data in enumerate(tqdm(dataset)):
        image_id = data['image_id']
        cap_id = data['cap_id']
        caption = data['caption']
        token_ids, tokens = cap_encoder.tokenize(caption)

        nltk_tokens = nltk.word_tokenize(caption.lower())
        pos_tags = nltk.pos_tag(nltk_tokens)
        pos_tags = ignore_words_from_pos(
            pos_tags, ['is', 'has', 'have', 'had', 'be'])

        alignment = align_pos_tokens(pos_tags, tokens)
        noun_token_ids_, noun_words = get_noun_token_ids(pos_tags, alignment)
        noun_token_ids_ = group_token_ids(noun_token_ids_, tokens)
        if len(noun_token_ids_) > 0:
            num_noun_captions += 1

        noun_token_ids[i] = {
            'image_id': image_id,
            'cap_id': cap_id,
            'token_ids': noun_token_ids_,
            'words': list(noun_words)
        }
        noun_vocab.update(noun_words)

        for human_word in ['man', 'person', 'human', 'woman', 'boy', 'girl',
                           'men', 'women', 'boys', 'girls', 'child', 'children']:
            if human_word in tokens:
                num_human_captions += 1
                break

    io.mkdir_if_not_exists(
        os.path.join(flickr_paths['proc_dir'], 'annotations'))
    io.dump_json_object(noun_token_ids, data_const.noun_tokens_json)
    io.dump_json_object(sorted(list(noun_vocab)), data_const.noun_vocab_json)

    print('Number of human captions:', num_human_captions)
    print('Number of noun captions:', num_noun_captions)
    print('Total number of captions:', len(dataset))
    print('Size of noun vocabulary:', len(noun_vocab))
def main(exp_const, data_const, model_const):
    io.mkdir_if_not_exists(exp_const.vis_dir)

    print('Creating network ...')
    model = Model()
    model.const = model_const
    model.net = ResnetModel(model.const.net)
    if model.const.model_num is not None:
        model.net.load_state_dict(torch.load(model.const.net_path))
    model.net.cuda()
    if not exp_const.feedforward:
        model.AttributeEmbeddings = AttributeEmbeddings(
            model.const.AttributeEmbeddings)
        if model.const.model_num is not None:
            model.AttributeEmbeddings.load_state_dict(
                torch.load(model.const.AttributeEmbeddings_path))
        model.AttributeEmbeddings.cuda()
    model.img_mean = np.array([0.485, 0.456, 0.406])
    model.img_std = np.array([0.229, 0.224, 0.225])

    print('Creating dataloader ...')
    dataset = Cifar100Dataset(data_const)
    dataloader = DataLoader(
        dataset,
        batch_size=exp_const.batch_size,
        shuffle=True,
        num_workers=exp_const.num_workers)

    eval_results = eval_model(model, dataloader, exp_const)

    confmat_npy = os.path.join(exp_const.exp_dir, 'confmat.npy')
    np.save(confmat_npy, eval_results['Conf Mat'])

    results = {
        'Avg Loss': eval_results['Avg Loss'],
        'Acc': eval_results['Acc']
    }
    print(results)
    results_json = os.path.join(exp_const.exp_dir, 'results.json')
    io.dump_json_object(results, results_json)

    embeddings_npy = os.path.join(exp_const.exp_dir, 'embeddings.npy')
    if exp_const.feedforward:
        np.save(
            embeddings_npy,
            model.net.resnet_layers.fc.weight.data.cpu().numpy())
    else:
        np.save(
            embeddings_npy,
            model.AttributeEmbeddings.embed.weight.data.cpu().numpy())

    labels_npy = os.path.join(exp_const.exp_dir, 'labels.npy')
    np.save(labels_npy, dataset.labels)
def main():
    embeddings_h5py = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/'
        'normalized_resnet_embeddings_recon_loss_trained_on_google/'
        'concat_glove_and_visual/visual_word_vecs.h5py')
    word_to_idx_json = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/'
        'normalized_resnet_embeddings_recon_loss_trained_on_google/'
        'concat_glove_and_visual/visual_word_vecs_idx.json')
    semaleval_all_words_json = os.path.join(
        os.getcwd(),
        'symlinks/data/semeval_2018_10/proc/all_words.json')

    print('Reading embeddings ...')
    embeddings = h5py.File(embeddings_h5py, 'r')['embeddings'][()]
    mean_embedding = np.mean(embeddings, 0)
    word_to_idx = io.load_json_object(word_to_idx_json)
    words_to_select = list(io.load_json_object(semaleval_all_words_json).keys())

    print('Selecting subset ...')
    subset_embeddings = np.zeros([len(words_to_select), embeddings.shape[1]])
    subset_word_to_idx = {}
    count = 0
    for i, word in enumerate(tqdm(words_to_select)):
        subset_word_to_idx[word] = i
        if word not in word_to_idx:
            # Words without an embedding get the mean embedding
            count += 1
            subset_embeddings[i] = mean_embedding
        else:
            idx = word_to_idx[word]
            subset_embeddings[i] = embeddings[idx]
    print(f'Number of words assigned the mean embedding: {count}')

    print('Saving selected subset embeddings ...')
    subset_visual_word_vecs_h5py_filename = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/'
        'normalized_resnet_embeddings_recon_loss_trained_on_google/'
        'concat_glove_and_visual/subset_visual_word_vecs.h5py')
    subset_embeddings_h5py = h5py.File(subset_visual_word_vecs_h5py_filename, 'w')
    subset_embeddings_h5py.create_dataset(
        'embeddings',
        data=subset_embeddings,
        chunks=(1, subset_embeddings.shape[1]))
    subset_embeddings_h5py.close()

    subset_word_to_idx_json = os.path.join(
        os.getcwd(),
        'symlinks/exp/google_images/'
        'normalized_resnet_embeddings_recon_loss_trained_on_google/'
        'concat_glove_and_visual/subset_visual_word_vecs_idx.json')
    io.dump_json_object(subset_word_to_idx, subset_word_to_idx_json)
def main():
    genome_const = VisualGenomeConstants()
    object_freqs = io.load_json_object(genome_const.object_freqs_json)
    attribute_freqs = io.load_json_object(genome_const.attribute_freqs_json)

    vocab = copy.deepcopy(object_freqs)
    for word, freq in attribute_freqs.items():
        if word not in vocab:
            vocab[word] = 0
        vocab[word] += freq

    io.dump_json_object(vocab, genome_const.all_word_freqs_json)
def main():
    url = (
        'https://gist.githubusercontent.com/yrevar/6135f1bd8dcf2e0cc683/'
        'raw/d133d61a09d7e5a3b36b8c111a8dd5c4b5d560ee/'
        'imagenet1000_clsid_to_human.pkl')
    outdir = os.path.join(os.getcwd(), 'symlinks/data/imagenet/proc')
    io.mkdir_if_not_exists(outdir, recursive=True)
    labels_json = os.path.join(outdir, 'labels.json')

    labels_dict = pickle.load(urlrequest.urlopen(url))
    labels = []
    for i in range(len(labels_dict)):
        labels.append(labels_dict[i])

    io.dump_json_object(labels, labels_json)
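# Hedged toy (illustrative values, not the downloaded gist itself): the pickle maps an
# integer class index to a human-readable label, and the loop above flattens it into an
# index-ordered list before dumping it as JSON.
labels_dict_toy = {0: 'tench, Tinca tinca', 1: 'goldfish, Carassius auratus'}
labels_toy = [labels_dict_toy[i] for i in range(len(labels_dict_toy))]
assert labels_toy[0].startswith('tench')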
def main():
    semeval_const = SemEval201810Constants()
    word_freqs = io.load_json_object(semeval_const.word_freqs)
    feature_freqs = io.load_json_object(semeval_const.feature_freqs)

    vocab = copy.deepcopy(word_freqs)
    for word, freq in feature_freqs.items():
        if word not in vocab:
            vocab[word] = 0
        vocab[word] += freq

    io.dump_json_object(vocab, semeval_const.all_word_freqs)
def main(exp_const, data_const):
    io.mkdir_if_not_exists(exp_const.exp_dir, recursive=True)
    save_constants({'exp': exp_const, 'data': data_const}, exp_const.exp_dir)

    print('Loading glove embeddings ...')
    glove_idx = io.load_json_object(data_const.glove_idx)
    glove_h5py = h5py.File(data_const.glove_h5py, 'r')
    glove_embeddings = glove_h5py['embeddings'][()]
    num_glove_words, glove_dim = glove_embeddings.shape
    print('-' * 80)
    print(f'number of glove words: {num_glove_words}')
    print(f'glove dim: {glove_dim}')
    print('-' * 80)

    print('Loading visual features ...')
    visual_features_idx = io.load_json_object(data_const.visual_features_idx)
    visual_features_h5py = h5py.File(data_const.visual_features_h5py, 'r')
    visual_features = visual_features_h5py['features'][()]
    num_visual_features, visual_features_dim = visual_features.shape
    print('-' * 80)
    print(f'number of visual features: {num_visual_features}')
    print(f'visual feature dim: {visual_features_dim}')
    print('-' * 80)

    print('Combining glove with visual features ...')
    visual_word_vecs_idx_json = os.path.join(
        exp_const.exp_dir, 'visual_word_vecs_idx.json')
    io.dump_json_object(glove_idx, visual_word_vecs_idx_json)
    visual_word_vecs_h5py = h5py.File(
        os.path.join(exp_const.exp_dir, 'visual_word_vecs.h5py'), 'w')
    visual_word_vec_dim = glove_dim + visual_features_dim
    visual_word_vecs = np.zeros([num_glove_words, visual_word_vec_dim])
    mean_visual_feature = visual_features_h5py['mean'][()]
    for word in tqdm(glove_idx.keys()):
        glove_id = glove_idx[word]
        glove_vec = glove_embeddings[glove_id]
        if word in visual_features_idx:
            feature_id = visual_features_idx[word]
            feature = visual_features[feature_id]
        else:
            feature = mean_visual_feature
        visual_word_vec = np.concatenate(
            (glove_vec, feature - mean_visual_feature))
        # Alternative (unused):
        # visual_word_vec = np.concatenate((
        #     normalize(glove_vec),
        #     normalize(feature)))
        visual_word_vecs[glove_id] = visual_word_vec

    visual_word_vecs_h5py.create_dataset(
        'embeddings',
        data=visual_word_vecs,
        chunks=(1, visual_word_vec_dim))
    visual_word_vecs_h5py.close()
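# Hedged illustration (toy shapes, not the real GloVe/visual data): a combined vector
# is [glove_vec, feature - mean_feature], so its dimension is glove_dim + visual_dim,
# and a word without a visual feature ends up with an all-zero visual part.
import numpy as np

glove_vec_toy = np.ones(300)            # stand-in for a 300-d GloVe vector
feature_toy = np.full(2048, 0.5)        # stand-in for a 2048-d visual feature
mean_feature_toy = np.full(2048, 0.2)   # stand-in for the mean visual feature

with_visual = np.concatenate((glove_vec_toy, feature_toy - mean_feature_toy))
without_visual = np.concatenate((glove_vec_toy, mean_feature_toy - mean_feature_toy))
assert with_visual.shape == (2348,)
assert np.allclose(without_visual[300:], 0.0)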
def main():
    const = ImagenetConstants()

    print('Reading txt file ...')
    with open(const.is_a_txt, 'r', encoding='ISO-8859-1') as f:
        lines = f.readlines()

    print('Parsing is_a relationship ...')
    lines = [line.rstrip('\n') for line in lines]
    lines = [line.split(' ') for line in lines]
    wnid_to_parent = {c: p for p, c in lines}

    print('Saving wnid_to_parent.json')
    io.dump_json_object(wnid_to_parent, const.wnid_to_parent_json)
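# Hedged illustration (sample lines written in the ImageNet is_a format as I understand
# it): each line holds 'parent_wnid child_wnid', so the dict comprehension above maps
# child -> parent.
sample_lines = ['n00001740 n00001930', 'n00001740 n00002137']
pairs = [line.split(' ') for line in sample_lines]
child_to_parent = {c: p for p, c in pairs}
assert child_to_parent == {'n00001930': 'n00001740', 'n00002137': 'n00001740'}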
def convert(self):
    print('Creating anno list for vcoco...')
    corre = np.zeros((24, 80))
    anno_list, corre = self.create_anno_list(corre)
    io.dump_json_object(anno_list, self.const.anno_list_json)
    np.save(self.const.mat_npy, corre)

    print('Creating hoi list for vcoco...')
    hoi_list = self.create_hoi_list()
    io.dump_json_object(hoi_list, self.const.hoi_list_json)

    print('Creating object list for vcoco...')
    object_list = self.create_obj_list()
    io.dump_json_object(object_list, self.const.object_list_json)

    print('Creating verb list for vcoco...')
    verb_list = []
    for i, verb in enumerate(self.VCOCO_train.actions):
        verb_list_item = {
            'id': str(self.VCOCO_train.actions_to_id_map[verb] + 1).zfill(3),
            'name': verb
        }
        verb_list.append(verb_list_item)
    io.dump_json_object(verb_list, self.const.verb_list_json)
def main():
    args = parser.parse_args()
    data_const = HicoConstants(exp_ver=args.exp_ver)
    out_dir = data_const.result_dir + '/map'

    bin_to_hoi_ids = io.load_json_object(data_const.bin_to_hoi_ids_json)

    mAP_json = os.path.join(out_dir, 'mAP.json')
    APs = io.load_json_object(mAP_json)['AP']

    bin_map = {}
    for bin_id, hoi_ids in bin_to_hoi_ids.items():
        bin_map[bin_id] = compute_mAP(APs, hoi_ids)

    non_rare_hoi_ids = []
    for ul in bin_to_hoi_ids.keys():
        if ul == '10':
            continue
        non_rare_hoi_ids += bin_to_hoi_ids[ul]

    sample_complexity_analysis = {
        'bin': bin_map,
        'full': compute_mAP(APs, APs.keys()),
        'rare': bin_map['10'],
        'non_rare': compute_mAP(APs, non_rare_hoi_ids)
    }

    sample_complexity_analysis_json = os.path.join(
        out_dir, 'sample_complexity_analysis.json')
    io.dump_json_object(
        sample_complexity_analysis, sample_complexity_analysis_json)

    bin_names = sorted([int(ul) for ul in bin_map.keys()])
    bin_names = [str(ul) for ul in bin_names]
    bin_headers = ['0'] + bin_names
    bin_headers = [
        bin_headers[i] + '-' + str(int(ul) - 1)
        for i, ul in enumerate(bin_headers[1:])]
    headers = ['Full', 'Rare', 'Non-Rare'] + bin_headers

    sca = sample_complexity_analysis
    values = [sca['full'], sca['rare'], sca['non_rare']] + \
        [bin_map[name] for name in bin_names]
    values = [str(round(v * 100, 2)) for v in values]

    print('Space delimited values that can be copied to a spreadsheet and split by space')
    print(' '.join(headers))
    print(' '.join(values))
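# Hedged guess at compute_mAP (defined elsewhere in the repo; this is only a plausible
# stand-in): the mean of the APs for the given HOI ids, skipping NaN entries.
import numpy as np

def compute_mAP_sketch(APs, hoi_ids):
    aps = [APs[hoi_id] for hoi_id in hoi_ids if not np.isnan(APs[hoi_id])]
    return float(np.mean(aps)) if aps else 0.0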
def main(**kwargs):
    data_const = CocoConstants()
    image_dir = data_const.image_subset_dir[kwargs['subset']]
    image_path_list = glob.glob(os.path.join(image_dir, '*.jpg'))

    det_input = []
    for image_path in tqdm(image_path_list):
        det_input.append({
            'path': image_path,
            'id': os.path.splitext(os.path.basename(image_path))[0]
        })

    io.dump_json_object(
        det_input,
        os.path.join(
            kwargs['out_dir'], 'det_input_' + kwargs['subset'] + '.json'))
def main(exp_const, data_const, model_const):
    np.random.seed(exp_const.seed)
    torch.manual_seed(exp_const.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    print('Creating network ...')
    model = Constants()
    model.const = model_const
    model.object_encoder = ObjectEncoder(model.const.object_encoder)
    model.cap_encoder = CapEncoder(model.const.cap_encoder)

    o_dim = model.object_encoder.const.object_feature_dim
    if exp_const.contextualize:
        o_dim = model.object_encoder.const.context_layer.hidden_size

    model.lang_sup_criterion = create_cap_info_nce_criterion(
        o_dim,
        model.object_encoder.const.object_feature_dim,
        model.cap_encoder.model.config.hidden_size,
        model.cap_encoder.model.config.hidden_size // 2,
        model.const.cap_info_nce_layers)

    if model.const.model_num != -1:
        loaded_object_encoder = torch.load(model.const.object_encoder_path)
        print('Loaded model number:', loaded_object_encoder['step'])
        model.object_encoder.load_state_dict(
            loaded_object_encoder['state_dict'])
        model.lang_sup_criterion.load_state_dict(
            torch.load(model.const.lang_sup_criterion_path)['state_dict'])
        if exp_const.random_lang is True:
            model.cap_encoder.load_state_dict(
                torch.load(model.const.cap_encoder_path)['state_dict'])

    model.object_encoder.cuda()
    model.cap_encoder.cuda()
    model.lang_sup_criterion.cuda()

    print('Creating dataloader ...')
    dataset = FlickrDataset(data_const)

    with torch.no_grad():
        results = eval_model(model, dataset, exp_const)

    filename = os.path.join(
        exp_const.exp_dir,
        f'results_{data_const.subset}_{model_const.model_num}.json')
    io.dump_json_object(results, filename)
def main():
    const = SemEval201810Constants()
    io.mkdir_if_not_exists(const.proc_dir)

    subset_txt_file = {
        'train': const.train_txt,
        'val': const.val_txt,
        'test': const.test_txt,
        'truth': const.truth_txt
    }

    for subset, txt_file in subset_txt_file.items():
        print(f'Converting {subset}.txt file to json ...')
        data = read_txt(txt_file)
        print(f'Number of samples: {len(data)}')
        io.dump_json_object(data, os.path.join(const.proc_dir, f'{subset}.json'))
def main(**kwargs):
    subset = kwargs['subset']
    const = FlickrConstants()

    image_ids = io.read(const.subset_ids[subset])
    image_ids = [idx.decode() for idx in image_ids.split()]

    image_dir = const.flickr_paths['image_dir']
    det_input = []
    for image_id in tqdm(image_ids):
        image_path = os.path.join(image_dir, f'{image_id}.jpg')
        det_input.append({'path': image_path, 'id': image_id})

    io.dump_json_object(
        det_input,
        os.path.join(
            kwargs['out_dir'], 'det_input_' + kwargs['subset'] + '.json'))
def main(exp_const, data_const, model_const):
    print('Creating network ...')
    model = Model()
    model.const = model_const
    model.net = LogBilinear(model.const.net)
    if model.const.model_num is not None:
        model.net.load_state_dict(torch.load(model.const.net_path))

    # Average the two embedding matrices of the log-bilinear model
    embeddings = 0.5 * (model.net.embed1.W.weight + model.net.embed2.W.weight)
    embeddings = embeddings.data.numpy()
    embeddings_npy = os.path.join(exp_const.exp_dir, 'visual_embeddings.npy')
    np.save(embeddings_npy, embeddings)

    print('Saving word_to_idx.json ...')
    dataset = MultiSenseCooccurDataset(data_const)
    word_to_idx = dataset.word_to_idx
    word_to_idx_json = os.path.join(exp_const.exp_dir, 'word_to_idx.json')
    io.dump_json_object(word_to_idx, word_to_idx_json)
def main(embedPath, outdir, vocab_json, embed_type):
    io.mkdir_if_not_exists(outdir)
    vocab = io.load_json_object(vocab_json)

    with open(embedPath, 'r', encoding='latin') as fileId:
        # Split each line into the word and the remaining vector string
        lines = []
        for line in tqdm(fileId.readlines()):
            lines.append(line.split(' ', 1))

    vocab_size = len(vocab)
    dim = int(lines[0][1][:-1])
    print(vocab_size, dim)

    embed = np.zeros([vocab_size, dim])
    word_to_idx = {}
    for line in tqdm(lines[1:]):
        word = line[0]
        if word not in vocab:
            continue
        # Space-separated string of numbers with '\n' at the end
        vec = line[1]
        if embed_type in ('word2vec_wiki', 'visual_word2vec_wiki'):
            vec = vec[:-1]
            vec = vec.split(' ')
        else:
            vec = vec.split(' ')[:-1]  # get rid of the '\n'
        idx = vocab[word]
        word_to_idx[word] = idx
        embed[idx] = [float(s) for s in vec]

    embed_npy = os.path.join(outdir, 'visual_embeddings.npy')
    np.save(embed_npy, embed)

    word_to_idx_json = os.path.join(outdir, 'word_to_idx.json')
    io.dump_json_object(word_to_idx, word_to_idx_json)
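# Hedged illustration (toy file in word2vec text format, assuming that is what the
# 'word2vec_wiki' branch above expects): the first line is '<vocab_size> <dim>' and
# every other line is '<word> <v1> ... <vdim>\n', which is why the code reads the
# dimension from lines[0] and parses vectors from lines[1:]. The else-branch appears
# to assume a trailing space before the newline instead.
toy_file = '2 3\ncat 0.1 0.2 0.3\ndog 0.4 0.5 0.6\n'
toy_lines = [line.split(' ', 1) for line in toy_file.splitlines(keepends=True)]
toy_dim = int(toy_lines[0][1][:-1])                        # header gives dim == 3
toy_vec = [float(s) for s in toy_lines[1][1][:-1].split(' ')]  # 'cat' -> [0.1, 0.2, 0.3]
assert toy_dim == 3 and toy_vec == [0.1, 0.2, 0.3]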
def create_gt_synset_cooccur(exp_const, dataloader):
    print('Creating cooccur ...')
    cooccur = {}
    for data in tqdm(dataloader):
        on_wnids = data['on_wnids']
        for b in range(len(on_wnids)):
            for wnid1 in set(on_wnids[b]):
                for wnid2 in set(on_wnids[b]):
                    if wnid1 not in cooccur:
                        cooccur[wnid1] = {}
                    if wnid2 not in cooccur[wnid1]:
                        cooccur[wnid1][wnid2] = 0
                    cooccur[wnid1][wnid2] += 1

    print('Creating offsets to synsets dict ...')
    offset_to_synset = {}
    for offset in tqdm(cooccur.keys()):
        offset_to_synset[offset] = wnid_offset_to_synset(offset)

    print('Replacing offset by synset in cooccur ...')
    synset_cooccur = {}
    for offset1 in tqdm(cooccur.keys()):
        synset1 = offset_to_synset[offset1]
        context = {}
        for offset2, count in cooccur[offset1].items():
            synset2 = offset_to_synset[offset2]
            context[synset2] = count
        synset_cooccur[synset1] = context

    synset_cooccur_json = os.path.join(exp_const.exp_dir, 'synset_cooccur.json')
    io.dump_json_object(synset_cooccur, synset_cooccur_json)

    print('Checking symmetry and self constraint in synset cooccur ...')
    for wnid1, context in tqdm(synset_cooccur.items()):
        for wnid2, count in context.items():
            sym_err_msg = f'Word cooccurrence not symmetric ({wnid1} / {wnid2})'
            assert synset_cooccur[wnid2][wnid1] == count, sym_err_msg
    print('Constraints satisfied')
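# Hedged guess at wnid_offset_to_synset (defined elsewhere in the repo; this is only a
# plausible stand-in): a WordNet id such as 'n01440764' is a POS letter followed by a
# zero-padded offset, which recent NLTK versions can resolve directly. The repo's
# version may format the returned name differently.
from nltk.corpus import wordnet as wn

def wnid_offset_to_synset_sketch(wnid):
    pos, offset = wnid[0], int(wnid[1:])
    return wn.synset_from_pos_and_offset(pos, offset).name()

# wnid_offset_to_synset_sketch('n01440764') -> 'tench.n.01' (with WordNet 3.0)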
def main():
    const = VisualGenomeConstants()
    io.mkdir_if_not_exists(const.proc_dir, recursive=True)

    print('Loading object_annos.json ...')
    object_annos = io.load_json_object(const.object_annos_json)

    print('Computing object frequencies ...')
    object_freqs = compute_object_freqs(object_annos)
    print(f'Number of objects: {len(object_freqs)}')
    io.dump_json_object(
        object_freqs, os.path.join(const.proc_dir, 'object_freqs.json'))

    print('Computing object synset frequencies ...')
    object_synset_freqs = compute_object_synset_freqs(object_annos)
    print(f'Number of object_synsets: {len(object_synset_freqs)}')
    io.dump_json_object(
        object_synset_freqs,
        os.path.join(const.proc_dir, 'object_synset_freqs.json'))
def main(exp_const, data_const):
    print('Loading synset cooccurrence ...')
    synset_cooccur = io.load_json_object(data_const.synset_cooccur_json)

    print('Checking symmetry and self constraint in synset cooccur ...')
    for word1, context in tqdm(synset_cooccur.items()):
        for word2, count in context.items():
            sym_err_msg = f'Synset cooccurrence not symmetric ({word1} / {word2})'
            assert synset_cooccur[word2][word1] == count, sym_err_msg

    print('Mapping synsets to words ...')
    synset_to_words_dict = {}
    for synset in tqdm(synset_cooccur.keys()):
        synset_to_words_dict[synset] = synset_to_words(synset)

    print('Creating word cooccurrence ...')
    word_cooccur = {}
    for wnid1, context in tqdm(synset_cooccur.items()):
        words1 = synset_to_words_dict[wnid1]
        for wnid2, count in context.items():
            words2 = synset_to_words_dict[wnid2]
            for word1 in set(words1):
                for word2 in set(words2):
                    if word1 not in word_cooccur:
                        word_cooccur[word1] = {}
                    if word2 not in word_cooccur[word1]:
                        word_cooccur[word1][word2] = 0
                    word_cooccur[word1][word2] += count

    io.dump_json_object(word_cooccur, data_const.word_cooccur_json)

    print('Checking symmetry and self constraint in word cooccur ...')
    for word1, context in tqdm(word_cooccur.items()):
        for word2, count in context.items():
            sym_err_msg = f'Word cooccurrence not symmetric ({word1} / {word2})'
            assert word_cooccur[word2][word1] == count, sym_err_msg
    print('Constraints satisfied')
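# Hedged guess at synset_to_words (defined elsewhere in the repo; this is only a
# plausible stand-in): expand a synset name into its lemma words, lower-cased with
# underscores replaced by spaces. The repo's version may filter or normalize differently.
from nltk.corpus import wordnet as wn

def synset_to_words_sketch(synset_name):
    return [l.name().replace('_', ' ').lower()
            for l in wn.synset(synset_name).lemmas()]

# synset_to_words_sketch('dog.n.01') -> ['dog', 'domestic dog', 'canis familiaris']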
def main():
    const = SemEval201810Constants()
    subset_json = {
        'train': const.train_json,
        'val': const.val_json,
        'test': const.test_json
    }

    words = []
    for subset, json_file in subset_json.items():
        data = io.load_json_object(json_file)
        data = [row[:2] for row in data]
        words += data

    print('Computing word frequency ...')
    word_freqs = compute_word_freq(words)
    print(f'Number of words: {len(word_freqs)}')
    io.dump_json_object(
        word_freqs, os.path.join(const.proc_dir, 'word_freqs.json'))
def main(exp_const):
    io.mkdir_if_not_exists(exp_const.exp_dir, recursive=True)

    nltk.download('wordnet')
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    cooccur = {}
    for synset in wn.all_synsets():
        words = synset_to_words(synset, stop_words)
        for word1 in words:
            for word2 in words:
                if word1 not in cooccur:
                    cooccur[word1] = {}
                if word2 not in cooccur[word1]:
                    cooccur[word1][word2] = 0
                cooccur[word1][word2] += 1

    cooccur_json = os.path.join(exp_const.exp_dir, 'word_cooccur.json')
    io.dump_json_object(cooccur, cooccur_json)
def main():
    const = SemEval201810Constants()
    subset_json = {
        'train': const.train_json,
        'val': const.val_json,
        'test': const.test_json
    }

    features = []
    for subset, json_file in subset_json.items():
        data = io.load_json_object(json_file)
        data = [row[2] for row in data]
        features += data

    print('Computing feature frequency ...')
    feature_freqs = compute_feature_freq(features)
    print(f'Number of features: {len(feature_freqs)}')
    io.dump_json_object(
        feature_freqs, os.path.join(const.proc_dir, 'feature_freqs.json'))