def load_and_process_metadata(subset):
    tf.logging.info('Processing meta data of %s...' % subset)
    t = time()
    is_test = subset.startswith('test')
    year = 2015 if is_test else 2014
    subtype = '%s%d' % (subset, year)
    ann_root = '/usr/data/fl302/code/VQA-tensorflow/data/annotations'
    datatype = 'test2015' if is_test else subtype
    # tf.logging.info('Loading annotations and questions...')
    questions = load_json(
        os.path.join(ann_root, 'MultipleChoice_mscoco_%s_questions.json' % subtype))['questions']
    dataset = questions if is_test \
        else load_json(os.path.join(ann_root, 'mscoco_%s_annotations.json' % subtype))['annotations']
    meta = []
    for info, quest in zip(dataset, questions):
        ans = None if is_test else info['multiple_choice_answer']
        token_ans = None if is_test else ans
        quest_id = info['question_id']
        image_id = info['image_id']
        question = quest['question']
        mc_ans = quest['multiple_choices']
        meta.append(
            ImageMetadata(image_id, None, quest_id, question, ans, token_ans, mc_ans))
    tf.logging.info('Time %0.2f sec.' % (time() - t))
    return meta
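# The loader above fills one ImageMetadata record per question. Its definition
# is not part of this snippet; a minimal sketch, with field names inferred from
# the call site (they are an assumption, not the original definition), is:
from collections import namedtuple

ImageMetadata = namedtuple(
    'ImageMetadata',
    ['image_id', 'image', 'question_id', 'question',
     'answer', 'token_answer', 'choices'])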
def __init__(self, g_id, addr, port, global_cfg_path, drone_cfg_path=None):
    self._g_id = g_id
    self._addr = addr
    self._port = port
    self._connection = None
    self._ctrl_station = None
    # Messages received from Dronology
    self._dronology_in = communication.core.MessageQueue()
    # Handshake messages sent by ControlStation to Dronology
    self._dronology_handshake_out = communication.core.MessageQueue()
    # State messages sent by ControlStation to Dronology
    self._dronology_state_out = communication.core.MessageQueue()
    # New vehicle messages sent by <anyone> to ControlStation.
    self._new_vehicle_in = communication.core.MessageQueue()
    self._global_cfg = util.load_json(global_cfg_path)
    if 'ardupath' not in self._global_cfg:
        _LOG.error(
            'You must specify "ardupath" in the global config file at {}'
            .format(global_cfg_path))
        exit(1)
    self._drone_cfgs = []
    # If a JSON drone configuration path is provided, load the drone configurations.
    # These will be used to create or connect to virtual or physical drones.
    if drone_cfg_path is not None:
        self._drone_cfgs = util.load_json(drone_cfg_path)
    self._is_alive = True
def __init__(self):
    gen_funcs = [
        self._gen_doc_file, self._gen_hist_file, self._gen_conf_file
    ]
    cfiles = [TORG_DOC_FILE, TORG_HIST_FILE, TORG_CONF_FILE]
    # Check for main config directory structure
    if not os.path.isdir(TORG_CONF_PATH):
        os.mkdir(TORG_CONF_PATH)
        for i in range(len(cfiles)):
            gen_funcs[i](cfiles[i])
    # Check if all config files exist
    for i in range(len(cfiles)):
        if not os.path.isfile(cfiles[i]):
            gen_funcs[i](cfiles[i])
    # Read all config files
    conf = load_json(TORG_CONF_FILE)
    hist = load_json(TORG_HIST_FILE)
    doc = load_json(TORG_DOC_FILE)
    self._conf = Map.recursive_map(Map(conf))
    self._hist = Map.recursive_map(Map(hist))
    self._doc = Map.recursive_map(Map(doc))
    return None
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
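# Most snippets in this collection go through a small util module for JSON I/O.
# The real helpers are not shown, and some other snippets use project-specific
# variants with different signatures; a minimal sketch of the path-based
# load_json/save_json assumed by the crawler code above might look like this:
import json
import os


def load_json(path):
    # Read and parse a JSON file from disk.
    with open(path) as f:
        return json.load(f)


def save_json(path, data):
    # Serialize data to a JSON file, creating the parent directory if needed.
    parent = os.path.dirname(path)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(path, 'w') as f:
        json.dump(data, f)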
def svd_user_business(data_dir, k=50):
    print "Loading data and building user-business matrix..."
    users = util.load_json('./data/' + data_dir + '/user.json').keys()
    businesses = util.load_json('./data/' + data_dir + '/business.json').keys()
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    user_to_row = dict(zip(users, range(len(users))))
    business_to_column = dict(zip(businesses, range(len(businesses))))
    user_business_matrix = sparse.lil_matrix((len(users), len(businesses)), dtype=float)
    with open('./data/' + data_dir + '/graph.txt') as f:
        for line in f:
            u, b = line.split()
            user_business_matrix[user_to_row[u], business_to_column[b]] = 1
    user_business_matrix = sparse.csr_matrix(user_business_matrix)

    print "Computing singular value decomposition..."
    u, s, vt = sparse.linalg.svds(user_business_matrix, k=k)
    us = u * s

    print "Writing results..."
    for u in examples:
        for b in examples[u]:
            examples[u][b] = np.dot(us[user_to_row[u], :], vt[:, business_to_column[b]])
    util.write_json(examples, './data/' + data_dir + '/svd.json')
def intersect_datasets_on_ids(dataset1, dataset2):
    """
    Reduce dataset1 to include only those qa ids which occur in dataset2.
    This is useful e.g. to reduce a marked dataset based on a dataset with
    applied exact-match filters from refine_json_dataset.py
    """
    data1 = load_json(dataset1)
    data2 = load_json(dataset2)
    new_data = []
    # obtain ids from dataset2
    data2_ids = set()
    for datum in data2[DATA_KEY]:
        for qa in datum[DOC_KEY][QAS_KEY]:
            data2_ids.add(qa[ID_KEY])
    # reduce data1 based on ids from data2
    for datum in data1[DATA_KEY]:
        qas = []
        for qa in datum[DOC_KEY][QAS_KEY]:
            if qa[ID_KEY] in data2_ids:
                qas.append(qa)
            else:
                print("reduction")
        if qas:
            new_doc = document_instance(datum[DOC_KEY][CONTEXT_KEY],
                                        datum[DOC_KEY][TITLE_KEY], qas)
            new_data.append(datum_instance(new_doc, datum[SOURCE_KEY]))
    return dataset_instance(data1[VERSION_KEY], new_data)
def X_y_e(is_train, vectorizer):
    print "Loading data..."
    dataset = "train" if is_train else "test"
    examples = util.load_json('./data/' + dataset + '/examples.json')
    users = util.load_json('./data/' + dataset + '/user.json')
    businesses = util.load_json('./data/' + dataset + '/business.json')
    unsupervised_scores = {
        f: util.load_json('./data/' + dataset + '/' + f + '.json')
        for f in [
            'svd', 'weighted_random_walks', 'random_walks', 'b_adamic',
            'b_cn', 'b_jaccard', 'u_adamic', 'u_cn', 'u_jaccard'
        ]
    }

    print "Computing features..."
    feature_dicts, y, e = [], [], []
    for u in examples:
        for b in examples[u]:
            e.append((u, b))
            y.append(examples[u][b])
            feature_dicts.append(
                get_features(u, b, unsupervised_scores, users[u], businesses[b]))
    X = vectorizer.fit_transform(feature_dicts) if is_train \
        else vectorizer.transform(feature_dicts)
    return X, y, e
def _load_data(self):
    meta_file = os.path.join(_DATA_ROOT,
                             'data/%svqa_std_mscoco_%s.meta' % (self._version_suffix, self._subset))
    data_file = os.path.join(_DATA_ROOT,
                             'data3/%svqa_mc_w2v_coding_%s.data' % (self._version_suffix, self._subset))
    cand_file = os.path.join(_DATA_ROOT,
                             'data3/%svqa_mc_cands_%s.meta' % (self._version_suffix, self._subset))
    # load meta
    d = load_json(meta_file)
    self._images = d['images']
    self._quest_ids = np.array(d['quest_id'])
    vqa_image_ids = [find_image_id_from_fname(im_name) for im_name in self._images]
    self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)
    # load QA data
    d = load_hdf5(data_file)
    self._quests = d['quest_w2v'].astype(np.float32)
    self._answer = d['cands_w2v'].astype(np.float32)
    self._labels = d['labels']
    # load candidates
    self._cand_ans = load_json(cand_file)
    self._num = self._labels.size
    # double check question ids
    assert (np.all(self._quest_ids == d['quest_ids']))
    self._load_global_image_feature()
def run_random_walks(data_dir, weight_edges=False):
    print "Loading data and building transition matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json('./data/' + data_dir + '/review.json')
        end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory
    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags(
        [[1.0 / adjacency_matrix.getrow(i).sum() for i in range(adjacency_matrix.shape[0])]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print "Running random walks..."
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]
    util.write_json(examples, './data/' + data_dir +
                    ('/weighted_random_walks.json' if weight_edges else '/random_walks.json'))
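# run_random_walks above relies on a run_random_walk helper that is not shown.
# A plausible sketch: start from an indicator vector at the source node and
# apply the row-stochastic transition matrix for a fixed number of steps. The
# restart term and its probability are assumptions, not taken from this code.
from scipy import sparse


def run_random_walk(transition_matrix, source, n_steps, restart_prob=0.15):
    n = transition_matrix.shape[0]
    # 1 x n indicator row vector concentrated on the source node.
    start = sparse.csr_matrix(([1.0], ([0], [source])), shape=(1, n))
    p = start
    for _ in range(n_steps):
        # One walk step, mixed with a restart back to the source node.
        p = (1 - restart_prob) * p.dot(transition_matrix) + restart_prob * start
    return p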
def __init__(self, data_file_src, data_file_tgt1, data_file_tgt2, data_file_tgt3,
             tokenizer_enc, tokenizer_dec, is_train=True):
    self.src = []
    self.tgt = []
    self.src_mask = []
    self.original_target = []
    print('loading data from', data_file_src)
    with open(data_file_src) as f_src:
        lines_src = f_src.readlines()
    lines_tgt = []
    if data_file_tgt1 is not None and data_file_tgt2 is not None and data_file_tgt3 is not None:
        lines_tgt1 = []
        tgt_dialogs = load_json(data_file_tgt1)
        for d in tgt_dialogs['dialogs']:
            for turn in d['dialog']:
                target = turn['target']
                lines_tgt1.append(target)
        with open(data_file_tgt2) as f_tgt:
            lines_tgt2 = f_tgt.readlines()
        lines_tgt2 = [
            e.replace('<EOS>', '').replace(START_BELIEF_STATE, '').replace(END_OF_BELIEF, '').strip()
            for e in lines_tgt2
        ]
        lines_tgt3 = []
        tgt_dialogs = load_json(data_file_tgt3)
        for d in tgt_dialogs['dialogs']:
            for turn in d['dialog']:
                target = turn['answer'].strip()
                lines_tgt3.append(target)
        lines_tgt = [
            '<cls> ' + lines_tgt1[idx] + ' <sep1> ' + lines_tgt2[idx] +
            ' <sep2> ' + lines_tgt3[idx] + ' <end>'
            for idx in range(len(lines_tgt1))
        ]
    else:
        lines_tgt = ['<cls> <end>' for _ in range(len(lines_src))]
    for idx in range(len(lines_src)):
        src = tokenizer_enc(lines_src[idx], add_special_tokens=True)
        src_vec = src.input_ids
        src_mask = src.attention_mask
        self.src.append(src_vec)
        self.src_mask.append(src_mask)
        self.original_target.append(lines_tgt[idx])
        if len(lines_tgt) > 0:
            tgt_vec = tokenizer_dec.encode(lines_tgt[idx], add_special_tokens=True)
            self.tgt.append(tgt_vec)
def main(args):
    raw_count, char_file, out_dir = args[0], args[1], args[2]
    raw = util.load_json(raw_count)
    char = util.load_json(char_file)
    result = count(raw, char)
    # save result
    save_result(os.path.join(out_dir, OUT_CHAR_ASCII), result['ascii'])
    save_result(os.path.join(out_dir, OUT_CHAR_CHINESE), result['chinese'])
    save_result(os.path.join(out_dir, OUT_CHAR_OTHER), result['other'])
def __enter__(self):
    self.tokenizer = W2VTokenizer()
    self.vocab = Vocab()
    self.bins = load_json('./data/wenmf/bins.json', [])
    status = load_json('./data/wenmf/state.json', {
        'initial_done': False,
    })
    self.topic_frames = []
    self.initial_done = status['initial_done']
    return self
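# This context manager calls a two-argument load_json(path, default), so a
# missing bins/state file falls back to a default instead of raising. The
# project's own helper is not shown; a minimal sketch under that assumption:
import json
import os


def load_json(path, default=None):
    # Return the parsed file if it exists, otherwise the provided default.
    if not os.path.isfile(path):
        return default
    with open(path) as f:
        return json.load(f)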
def print_dataset_stats(data_dir):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    print "Num nodes:", G.GetNodes()
    print "Num edges:", G.GetEdges()
    n_users = len(util.load_json(data_dir + "user.json"))
    n_businesses = len(util.load_json(data_dir + "business.json"))
    n_edges = util.lines_in_file(data_dir + "new_edges.txt")
    print "({:} users) * ({:} businesses) = {:.3e} candidate edges".format(
        n_users, n_businesses, n_users * n_businesses)
    print "{:} edges, {:0.5f}% of candidate edges".format(
        n_edges, 100 * n_edges / float(n_users * n_businesses))
def _load_dataset():
    # load trainval
    subset = 'trainval'
    vqa_meta_file = '../iccv_vaq/data/vqa_std_mscoco_%s.meta' % subset
    vqa_meta_trainval = load_json(vqa_meta_file)
    # load dev
    subset = 'dev'
    vqa_meta_file = '../iccv_vaq/data/vqa_std_mscoco_%s.meta' % subset
    vqa_meta_dev = load_json(vqa_meta_file)
    return vqa_meta_trainval['quest_id'], vqa_meta_dev['quest_id']
def on_connect(code, msg):
    print "connect"
    print(yield gen.Task(riakclient.ping))
    print(yield gen.Task(riakclient.put, "test", "one", dict(hello="world")))
    args, kwargs = yield gen.Task(riakclient.get, "test", "one")
    print load_json(args[1][0][1])
    buckets = yield gen.Task(riakclient.get_buckets)
    print "buckets:", buckets
    keys = yield gen.Task(riakclient.get_keys, "test")
    print "keys:", keys
    print(yield gen.Task(riakclient.delete, "test", "one"))
    ioloop.IOLoop.instance().stop()
def load_config():
    def expand_paths(config):
        paths = ["history_file", "shortcuts_paths_file", "user_config_file"]
        for param in paths:
            config[param] = expanduser(config[param])
        return config

    ref_config = expand_paths(util.load_json(get_reference_config_path()))
    if os.path.exists(ref_config["user_config_file"]):
        usr_config = expand_paths(util.load_json(ref_config["user_config_file"]))
        config = util.patch_dict(ref_config, usr_config)
        config["user_config_file"] = ref_config["user_config_file"]
        return config
    return ref_config
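# load_config overlays the user config onto the reference config through
# util.patch_dict, which is not shown here. A minimal sketch, assuming a
# shallow overlay where the user's values win, is:
def patch_dict(base, patch):
    # Return a copy of base with any keys present in patch overridden.
    merged = dict(base)
    merged.update(patch)
    return merged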
def _eval(self, gt_fname=None, rec_fname=None, gt=None, rec=None):
    if gt_fname and rec_fname:
        gt_playlists = load_json(gt_fname)
        rec_playlists = load_json(rec_fname)
    elif gt and rec:
        gt_playlists = gt
        rec_playlists = rec
    else:
        # "The data is not valid."
        raise Exception("데이터가 올바르지 않습니다.")

    gt_dict = {g["id"]: g for g in gt_playlists}
    gt_ids = set([g["id"] for g in gt_playlists])
    rec_ids = set([r["id"] for r in rec_playlists])
    if gt_ids != rec_ids:
        # "The number of playlists in the result is not correct."
        raise Exception("결과의 플레이리스트 수가 올바르지 않습니다.")

    rec_song_counts = [len(p["songs"]) for p in rec_playlists]
    rec_tag_counts = [len(p["tags"]) for p in rec_playlists]
    if set(rec_song_counts) != set([100]):
        # "The number of recommended songs is not correct."
        raise Exception("추천 곡 결과의 개수가 맞지 않습니다.")
    if set(rec_tag_counts) != set([10]):
        # "The number of recommended tags is not correct."
        raise Exception("추천 태그 결과의 개수가 맞지 않습니다.")

    rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
    rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]
    if set(rec_unique_song_counts) != set([100]):
        # "Duplicate song recommendations within a playlist are not allowed."
        raise Exception("한 플레이리스트에 중복된 곡 추천은 허용되지 않습니다.")
    if set(rec_unique_tag_counts) != set([10]):
        # "Duplicate tag recommendations within a playlist are not allowed."
        raise Exception("한 플레이리스트에 중복된 태그 추천은 허용되지 않습니다.")

    music_ndcg = 0.0
    tag_ndcg = 0.0
    for rec in rec_playlists:
        gt = gt_dict[rec["id"]]
        music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
        tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])
    music_ndcg = music_ndcg / len(rec_playlists)
    tag_ndcg = tag_ndcg / len(rec_playlists)
    score = music_ndcg * 0.85 + tag_ndcg * 0.15
    return music_ndcg, tag_ndcg, score
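# _eval averages a per-playlist nDCG that comes from an _ndcg helper not shown
# here. A minimal sketch of binary-relevance nDCG; the log base and the
# ideal-DCG normalization follow the usual convention and are assumptions:
import math


def ndcg(gt, rec):
    relevant = set(gt)
    # Discounted gain for every recommended item that appears in the ground truth.
    dcg = sum(1.0 / math.log2(i + 2) for i, item in enumerate(rec)
              if item in relevant)
    # Best achievable DCG given the list lengths.
    ideal = sum(1.0 / math.log2(i + 2) for i in range(min(len(gt), len(rec))))
    return dcg / ideal if ideal > 0 else 0.0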
def _load_R(self):
    if is_precomputed(self.h, self.c, self.p_dir):
        return util.load_json(self.h, self.c, self.p_dir)
    else:
        R = precompute_c(self.c, self.h)
        util.save_json_c(self.h, self.c, R, self.p_dir)
        return R
def smap_actuate(uuid, reading, q_url, a_url):
    q_url = q_url or smap_query_url
    a_url = a_url or smap_actuation_url
    # query the actuation stream for 'Properties'
    try:
        r = "select * where uuid = '{}'".format(uuid)
        resp = requests.post(q_url, r)
        j = load_json(resp.text)
        properties = j[0].get('Properties', {})
    except Exception as e:
        print "Error: smap_actuate -- failed to extract 'Properties'"
        print e
        exit(1)
    # uuid of our stream
    # TODO: should we have a unique one for each thread?
    act_stream_uuid = "52edbddd-98e9-5cef-8cc9-9ddee810cd88"
    # construct our actuation request
    act = {'/actuate': {'uuid': act_stream_uuid,
                        'Readings': [[int(time.time()), reading]],
                        'Properties': properties,
                        'Metadata': {'override': uuid}}}
    print "sending smap actuation..."
    print requests.post(a_url, data=json.dumps(act))
def process():
    def _parse_image_id(image):
        return int(image.split('.')[0].split('_')[-1])

    model = AttentionModel()
    ans2top_ans = AnswerTokenToTopAnswer()
    task_data_dir = '/usr/data/fl302/code/utils/bs_data_maker'
    task_data_file = os.path.join(task_data_dir, 'task_data_for_verif.json')
    task_data = load_json(task_data_file)
    is_valid = []
    num = len(task_data)
    for i, info in enumerate(task_data):
        print('%d/%d' % (i, num))
        image = info['image']
        image_id = _parse_image_id(image)
        question = info['target']
        answer = info['answer']
        scores = model.inference(image_id, question)
        scores[:, -1] = -10.
        # pdb.set_trace()
        top_ans_id = ans2top_ans.direct_query(answer)
        if top_ans_id == 2000:
            raise Exception('Warning: answer oov')
        scores = scores.flatten()
        pred_top_ans_id = scores.argmax()
        is_valid.append(int(pred_top_ans_id == top_ans_id))
    n_valid = sum(is_valid)
    print('valid: %d/%d' % (n_valid, num))
    save_json(os.path.join(task_data_dir, 'task_data_verif_state.json'), is_valid)
def parse_results():
    ann_file = 'data/tmp.json'
    anno = load_json(ann_file)['annotation']
    num_confused = np.array([len(info['confused']) for info in anno], dtype=np.float32)
    import pdb
    pdb.set_trace()
def get_phi(is_train):
    data_dir = "train" if is_train else "test"
    print "Loading reviews..."
    reviews = util.load_json("./data/" + data_dir + "/review.json")

    print "Building graph..."
    G = nx.read_edgelist("./data/" + data_dir + "/graph.txt", nodetype=int)
    n = G.number_of_nodes()

    print "Building feature matrices..."
    phi = defaultdict(lambda: sparse.lil_matrix((n, n), dtype=float))
    for (u, v) in G.edges():
        if str(u) not in reviews:
            u, v = v, u
        features = get_features(reviews[str(u)][str(v)], is_train)
        for feature_name, value in features.iteritems():
            phi[feature_name][u, v] = value
            phi[feature_name][v, u] = value

    print "Converting..."
    for k, m in phi.items():
        phi[k] = sparse.csr_matrix(m)
    return phi
def test():
    phi = get_phi(False)
    examples = util.load_json("./data/test/examples.json")
    w = util.load_json("./data/supervised_random_walks_weights.json")

    print "Computing Q and initializing..."
    Q = get_Q(phi, w)
    ps = {}
    for u in examples:
        p = np.zeros(phi["bias"].shape[0])
        p[int(u)] = 1.0
        ps[int(u)] = sparse.csr_matrix(p)
    get_ps(Q, ps, max_iter=20, convergence_criteria=0, log=True, examples=examples)

    print "Writing..."
    util.write_json(examples, "./data/test/supervised_random_walks.json")
def _load_data(self):
    meta_file = os.path.join(
        _DATA_ROOT,
        'data/%svqa_std_mscoco_%s.meta' % (self._version_suffix, self._subset))
    data_file = os.path.join(
        _DATA_ROOT,
        'data/%svqa_std_mscoco_%s.data' % (self._version_suffix, self._subset))
    # load meta
    d = load_json(meta_file)
    self._images = d['images']
    self._quest_ids = np.array(d['quest_id'])
    vqa_image_ids = [
        find_image_id_from_fname(im_name) for im_name in self._images
    ]
    self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)
    # load QA data
    d = load_hdf5(data_file)
    self._quest = d['quest_arr'].astype(np.int32)
    self._quest_len = d['quest_len'].astype(np.int32)
    self._answer = d['answer'].astype(np.int32)
    self._check_valid_answers()
    # self._load_caption_feature()
    self._load_global_image_feature()
def __init__(self, result_file, subset='val'):
    self._subset = subset
    self.results = load_json(result_file)
    self.num = len(self.results)
    self._im_root = get_image_feature_root()
    self.prog_str = ''
    self.mc_ctx = MultiChoiceQuestionManger(subset='val')
def _load_mapping(self):
    d = load_json('vqa_val_quest_id2im_id.json')
    mapping = {}
    for k, v in d.iteritems():
        mapping[int(k)] = u'%d' % v
    self.quest_id2image_id = mapping
    self.dummy_quest_id = self.quest_id2image_id.keys()[0]
def extract_raw_refs_from_pdfs():
    metas = util.load_json(cfg.paths['papers-metadata'])
    refs = util.parallelize(extract_raw_refs_from_pdf, metas, N_THREADS)
    refs = {m['uid']: r for m, r in zip(metas, refs)}
    util.save_json(cfg.paths['raw-papers-refs'], refs)
    print('saved raw papers refs to "{}"'.format(cfg.paths['raw-papers-refs']))
def _load_data(self):
    meta_file = os.path.join(_DATA_ROOT,
                             'data/%svqa_std_mscoco_%s.meta' % (self._version_suffix, self._subset))
    data_file = os.path.join(_DATA_ROOT,
                             'data4/%svar_ivqa_%s_question_answers.data' % (self._version_suffix, self._subset))
    # load meta
    d = load_json(meta_file)
    images = d['images']
    quest_ids = np.array(d['quest_id'])
    vqa_image_ids = [find_image_id_from_fname(im_name) for im_name in images]
    quest_id2image_id = {qid: im_id for (qid, im_id) in zip(quest_ids, vqa_image_ids)}
    self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)
    # load QA data
    d = load_hdf5(data_file)
    self._quest_ids = d['ext_quest_ids'].astype(np.int32)
    self._quest = d['ext_quest_arr'].astype(np.int32)
    self._quest_len = d['ext_quest_len'].astype(np.int32)
    self._answer = d['ext_top_answer'].astype(np.int32)
    self._check_valid_answers()
    # sort images
    abs_quest_ids = self._quest_ids[:, 0]
    self._vqa_image_ids = np.array([quest_id2image_id[_id] for _id in abs_quest_ids],
                                   dtype=np.int32)
    # self._load_caption_feature()
    self._load_global_image_feature()
def _load_R(self, h, c):
    if is_precomputed(h, c, self.p_dir):
        return util.load_json(h, c, self.p_dir)
    else:
        R = precompute_c(c, h)
        util.save_json_c(h, c, R, self.p_dir)
        return R
def graph_it(input_file):
    json_file = util.load_json(input_file)
    co_dict = defaultdict(int)
    co_co_dict = {}
    master_id = json_file.get('GENERAL-INFORMATION').get('ID')
    articles = json_file.get('ARTICLES')
    for a in articles:
        for co in a.get('AUTHORS'):
            co_id = co.get('ID')
            if (co_id != master_id and co_id):
                co_dict[co_id] += 1
    co_dict = dict(co_dict)
    for key in co_dict.iterkeys():
        tmp_dict = defaultdict(int)
        for a in articles:
            pre_list = [x.get('ID') for x in a.get('AUTHORS')]
            if (key in pre_list):
                for co in a.get('AUTHORS'):
                    co_id = co.get('ID')
                    if (co_id != key and co_id):
                        tmp_dict[co_id] += 1
        tmp_dict = dict(tmp_dict)
        if (len(tmp_dict) > 0):
            co_co_dict[key] = tmp_dict
    # construct_graph(co_dict, co_co_dict)
    print calculate(co_dict, co_co_dict)
def eval_recall(results):
    over_complete = load_json(_ANNO_FILE)
    accs = []
    num_cands = 0
    for res in results:
        quest_id = str(res['question_id'])
        cands = res['answers']
        gts = over_complete[quest_id]
        cand_acc = []
        for cand in cands:
            cand = str(cand)
            cand = cand.strip()
            if cand in gts:
                _n = gts[cand]
            else:
                _n = 0
            cand_acc.append(min(1., float(_n) / 3))
        v = max(cand_acc)
        accs.append(v)
        num_cands += len(cands)
        # num_correct += (_n >= 3)
    num_tot = len(results)
    # mean_acc = num_correct / float(num_tot)
    mean_acc = np.array(accs).mean()
    print('Evaluated %d questions' % num_tot)
    print('Total number of candidates: %d (%0.2f/image)' %
          (num_cands, float(num_cands) / num_tot))
    print('Recall: %0.2f' % (100. * mean_acc))
def make_examples_simple(data_dir, n_users, negative_examples_per_user=10):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    new_edges = defaultdict(dict)
    with open(data_dir + 'new_edges.txt') as f:
        for line in f:
            u, b = map(int, line.split())
            new_edges[u][b] = 1
    businesses = map(int, util.load_json(data_dir + 'business.json').keys())

    examples = defaultdict(dict)
    users = random.sample([NI.GetId() for NI in G.Nodes()], n_users)
    for u in users:
        examples[u] = new_edges[u]
        for i in range(negative_examples_per_user):
            b = random.choice(businesses)
            examples[u][b] = 0

    p, n = 0, 0
    for u in examples:
        for b in examples[u]:
            p += examples[u][b]
            n += 1 - examples[u][b]
    print "Positive:", p
    print "Negative:", n
    print "Data skew:", p / float(p + n)
    print "Sampling rate:", negative_examples_per_user / float(len(businesses))

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples_simple.json')
def _load_data(self):
    meta_file = 'data/%svqa_std_mscoco_%s.meta' % (self._version_suffix, self._subset)
    data_file = 'data/%svqa_std_mscoco_%s.data' % (self._version_suffix, self._subset)
    self._images = load_json(meta_file)['images']
    d = load_hdf5(data_file)
    self._quest = d['quest_arr'].astype(np.int32)
    self._quest_len = d['quest_len'].astype(np.int32)
    self._answer = d['answer'].astype(np.int32)
    self._num = self._answer.size
    self._check_valid_answers()
    vqa_image_ids = [
        find_image_id_from_fname(im_name) for im_name in self._images
    ]
    self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)
    # load caption data
    self._load_caption_data()
    # load attributes
    if self._attr_type == 'res152':
        self._load_global_image_feature()
    else:
        self._load_attributes()
    # load answer sequences
    self._load_answer_sequence()
def __init__(self, subset='val', num_eval=None, need_im_feat=True,
             need_attr=False, use_ans_type=False, feat_type='res152'):
    anno_file = 'data/MultipleChoicesQuestionsKarpathy%sV2.0.json' % subset.title()
    self._subset = subset
    d = load_json(anno_file)
    self._id2type = d['candidate_types']
    self._annotations = d['annotation']
    if num_eval == 0:
        num_eval = len(self._annotations)
    self._num_to_eval = num_eval
    self._idx = 0
    self._need_attr = need_attr
    self._need_im_feat = need_im_feat
    self._quest_encoder = SentenceEncoder('question')
    self._answer_encoder = SentenceEncoder('answer')
    self._im_encoder = MCDataFetcher(subset='kp%s' % subset, feat_type=feat_type)
    self.num_samples = len(self._annotations)
    self._mc_ctx = MultiChoiceQuestionManger(subset='val')
    self._group_by_answer_type()
    self._use_ans_type = use_ans_type
def get_contexts(
        dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.json",
        output_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.txt",
        downcase=False):
    """
    Gets passage text with no concept annotations.
    """
    dataset = load_json(dataset_file)
    data = dataset[DATA_KEY]
    n_all = 0
    all_contexts = ""
    for datum in data:
        new_context = "\n" + datum[DOC_KEY][TITLE_KEY] + "\n" + datum[DOC_KEY][CONTEXT_KEY]
        all_contexts += remove_concept_marks(new_context)
        curr_queries = set()
        for qa in datum[DOC_KEY][QAS_KEY]:
            a = ""
            for ans in qa[ANS_KEY]:
                if ans[ORIG_KEY] == "dataset":
                    a = ans[TXT_KEY]
            assert a
            curr_queries.add(
                remove_concept_marks(qa[QUERY_KEY]).replace(PLACEHOLDER_KEY, a))
        all_contexts += "\n" + "\n".join(curr_queries)
        n_all += 1
    print(n_all)
    all_contexts = all_contexts.replace("\n\n", "\n")
    with open(output_file, "w") as fh:
        fh.write(all_contexts.lower() if downcase else all_contexts)
def load_info(self, load_path=None):
    if load_path is None:
        with pyglet.resource.file(self.resource_path('info.json'), 'r') as info_file:
            self.info = json.load(info_file)
    else:
        self.info = util.load_json(load_path)
def __init__(self, name, init_z=0, init_x=0, init_y=0, init_ox=0, init_oy=0,
             animations=[]):
    # SUPER CALL
    super(SpriteAnimationSet, self).__init__(name, init_z, init_x, init_y, 0, 0,
                                             init_ox, init_oy)
    if len(animations) <= 0:
        raise InvalidArgumentsExecption("No animation files set.")
    # Loads animations framedata and spritesheets.
    self.animations = {}
    spritesheet = {}
    for animation in animations:
        framedata = FrameData(util.load_json(animation[1]))
        if framedata.sprite_sheet not in spritesheet.keys():
            spritesheet[framedata.sprite_sheet] = SpriteSheetManager(
                framedata.sprite_sheet)
        self.animations[animation[0]] = SpriteAnimation(
            animation[0], init_z=1, init_x=0, init_y=0, frame_data=framedata,
            sprite_sheet=spritesheet[framedata.sprite_sheet])
    self.curr_animation = animations[0][0]
def main(example_file, graph_file, u_methods, u_outfiles, b_methods, b_outfiles):
    start = datetime.datetime.now()
    print "Loading examples..."
    examples = util.load_json(example_file)
    print "Loading graph..."
    G = snap.LoadEdgeList(snap.PUNGraph, graph_file, 0, 1)
    users(examples, G, u_methods, u_outfiles)
    business(examples, G, b_methods, b_outfiles)
def on_value_changed(self, text):
    try:
        node = self.get_node()
        node.parent().replace_node(node, load_json(text))
    except ValueError:
        self.model().invalid_json.emit()
def get(self, model, key, r=None, pr=None, vtag=None, callback=None):
    bucket_name = model.__name__
    (code, docs), _ = yield gen.Task(super(RiakAdapter, self).get,
                                     bucket_name, str(key), r, pr, vtag)
    if docs and model:
        docs = map(lambda x: model(load_json(x[1]), x[0]), docs)
        callback(ViewResult(docs))
    else:
        callback(docs)
def getIps(file_name):
    sl = util.load_json(file_name)
    public_list = []
    private_list = []
    for server in sl.values():
        pub_ip = server["addresses"]["public"][0]
        priv_ip = server["addresses"]["private"][0]
        public_list.append(pub_ip)
        private_list.append(priv_ip)
    return {"PUBLIC": public_list, "SERVICENET": private_list}
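# A usage sketch for getIps. The server layout below is hypothetical, chosen
# only to match the fields the function reads (addresses.public[0] and
# addresses.private[0] as plain strings); the real file format may differ.
import json
import tempfile

example_servers = {
    "web-1": {"addresses": {"public": ["203.0.113.10"], "private": ["10.0.0.10"]}},
    "web-2": {"addresses": {"public": ["203.0.113.11"], "private": ["10.0.0.11"]}},
}
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    json.dump(example_servers, tmp)

# Returns {"PUBLIC": [...both public IPs...], "SERVICENET": [...both private IPs...]}
# (ordering follows dict iteration order).
ips = getIps(tmp.name)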
def load(self, folder_name="autosave"):
    """Returns a scene if save existed and was loaded successfully"""
    base_path = os.path.join(self.save_path, folder_name)
    scn = self.scene_handler.scene
    if not os.path.exists(base_path):
        return None
    else:
        my_info = util.load_json(os.path.join(base_path, 'game'))
        self.game_variables = my_info['game_variables']
        return scene.Scene(my_info['first_scene'], self.scene_handler, self.ui,
                           load_path=os.path.join(base_path, my_info['first_scene']))
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
        util.save_json(save_path, data)
        cnt += 1
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
        util.save_json(save_path, data)
        cnt += 1
        print cnt, len(files), data['short']
def train():
    phi = get_phi(True)

    print "Loading examples..."
    Ds, Ls = {}, {}
    examples = util.load_json("./data/train/examples.json")
    us = list(examples.keys())
    random.seed(0)
    random.shuffle(us)
    for u in us:
        D, L = set(), set()
        for b in examples[u]:
            (D if examples[u][b] == 1 else L).add(int(b))
        if len(D) > MAX_POSITIVE_EDGES_PER_USER:
            D = random.sample(D, MAX_POSITIVE_EDGES_PER_USER)
        if len(L) > MAX_NEGATIVE_EDGES_PER_USER:
            L = random.sample(L, MAX_NEGATIVE_EDGES_PER_USER)
        if len(D) > 1 and len(L) > 10:
            Ds[int(u)] = list(D)
            Ls[int(u)] = list(L)
        if len(Ds) > NUM_TRAIN_USERS:
            break

    print "Setting initial conditions..."
    ps = {}
    for u in Ds:
        p = np.zeros(phi["bias"].shape[0])
        p[u] = 1.0
        ps[u] = sparse.csr_matrix(p)

    print "Training..."
    w = INITIAL_WEIGHTS
    best_loss = 100000
    for i in range(100):
        print "ITERATION " + str(i + 1) + ": base"
        base_loss, ps = run(phi, w, Ds, Ls, ps)
        if base_loss < best_loss:
            best_loss = base_loss
            util.write_json(w, "./data/supervised_random_walks_weights.json")
        # Estimate the partial derivative of the loss for each weight with a
        # finite difference, then take a gradient descent step.
        partials = {}
        for k in w:
            print "ITERATION " + str(i + 1) + ": " + k
            new_w = w.copy()
            new_w[k] += H
            new_loss, _ = run(phi, new_w, Ds, Ls, ps)
            partials[k] = (new_loss - base_loss) / H
            print partials[k] * LEARNING_RATE
        for (k, dwk) in partials.iteritems():
            w[k] -= LEARNING_RATE * dwk
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        cnt += 1
        print cnt, len(files), data['short']
        data['links'] = get_links(data['short'], html)
def load_headers():
    h = pickle.load(open("auth_headers.db", "r"))
    intended_keys = ["x-auth-token"]
    headers = {}
    headers["content-type"] = "application/xml"
    headers["accept"] = "application/xml"
    if os.path.isfile("extra_headers.json"):
        extra_headers = util.load_json("extra_headers.json")
        headers.update(extra_headers)
    for auth_key in intended_keys:
        headers[auth_key] = h[auth_key]
    printf("\n\nheaders=%s\n", headers)
    return headers
def get_authors():
    files = util.listdir(AUTHOR_FOLDER)
    util.mkdir(AUTHOR_CRALWED_FOLDER)
    for file_name in files:
        save_path = os.path.join(AUTHOR_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(AUTHOR_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        print data['short'], full_name
        data['links'] = get_links(data['short'], html)
        util.save_json(save_path, data)
def run_generator(args):
    import json, sys

    # Grab the spec's and schema's path.
    search_paths = {}
    for filename in [args.spec, args.schema]:
        path = os.path.abspath(os.path.dirname(os.path.expanduser(filename)))
        search_paths[path] = True

    spec = load_json(args.spec)
    schema = load_json(args.schema)
    writer = OutputWriter() if not args.dryrun else DryRunWritter()
    reader = FileReader()
    generator = Generator(spec, schema, writer=writer, reader=reader,
                          mode=args.target, paths=search_paths.keys())
    error_details = {}
    try:
        generator.generate(error_details=error_details)
        return
    except SchemaError, err:
        print 'Schema Error:', err.message
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        for sub in subs:
            html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = jour[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
def svd(data_dir, k=50):
    print "Loading data and building adjacency matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    adjacency_matrix = sparse.csr_matrix(nx.adjacency_matrix(G), dtype=float)

    print "Computing singular value decomposition..."
    u, s, vt = sparse.linalg.svds(adjacency_matrix, k=k)
    us = u * s

    print "Writing results..."
    for u in examples:
        for b in examples[u]:
            examples[u][b] = np.dot(us[u, :], vt[:, b])
    util.write_json(examples, './data/' + data_dir + '/svd.json')
def parse_file(filename):
    def fail():
        failure = (False, None)
        QtGui.QMessageBox.warning(
            self, 'invalid json file',
            'file was not list of json objects')
        return failure

    with open(filename, 'r') as f:
        try:
            json_object = load_json(f.read())
            if type(json_object) is not list:
                return fail()
            return (True, json_object)
        except ValueError:
            return fail()
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    for file_name in files:
        cnt += 1
        if cnt < 1970:
            continue
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        try:
            print cnt, len(files), data['short']
        except:
            pass
        data['links'] = get_links(data['short'], html)
def run_evaluation(examples, methods, precision_at=20):
    curve_args = []
    for i, method in enumerate(methods):
        predictions = util.load_json('./data/test/' + method + '.json')
        total_precision = 0
        all_ys, all_ps = [], []
        for u in predictions:
            ys, ps = zip(*[(examples[u][b], predictions[u][b]) for b in predictions[u]])
            all_ys += ys
            all_ps += ps
            n = min(precision_at, len(ys))
            top_ys = zip(*sorted(zip(ys, ps), key=itemgetter(1), reverse=True))[0][:n]
            total_precision += sum(top_ys) / float(n)
        roc_auc = roc_auc_score(all_ys, all_ps)
        fpr, tpr, t = roc_curve(all_ys, all_ps)
        curve_args.append((fpr, tpr, method, COLORS[i % len(COLORS)]))
        print "Method:", method
        print " Precision @{:} = {:.4f}".format(precision_at, total_precision / len(examples))
        print " ROC Auc = {:.4f}".format(roc_auc)

    if i >= len(COLORS):
        print "Too many methods to plot all of them!"
        return

    plt.figure(figsize=(9, 9))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.title('ROC curves')
    for (fpr, tpr, label, color) in curve_args:
        plt.plot(fpr, tpr, label=label, color=color)
    plt.legend(loc="best")
    plt.show()
import util

jour = util.load_json('jour_name.json')
conf = util.load_json('conf_name.json')
print len(jour), len(conf), len(jour) + len(conf)
for k, v in jour.items():
    conf[k] = v
print len(conf)
def make_examples(data_dir, n_users=5000, min_degree=1, negative_sample_rate=0.01,
                  min_active_time=None, new_edge_only=False):
    print "Loading data..."
    # TODO: switch to networkx?
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    with open(data_dir + 'new_edges.txt') as f:
        edges = {tuple(map(int, line.split())) for line in f}
    new_edge_count = Counter()
    for (u, b) in edges:
        new_edge_count[u] += 1
    review_data = util.load_json(data_dir + 'review.json')
    n_businesses = len(util.load_json(data_dir + "business.json"))

    recently_active_users = []
    other_users = []
    print "Getting candidate set of users..."
    users = []
    for Node in util.logged_loop(G.Nodes(), util.LoopLogger(50000, G.GetNodes(), True)):
        u = Node.GetId()
        if new_edge_only and not u in new_edge_count:
            continue
        if str(u) not in review_data or Node.GetOutDeg() < min_degree:
            continue
        if min_active_time:
            recent_review = False
            for b in review_data[str(u)]:
                if (int(u), int(b)) in edges:
                    continue
                for r in review_data[str(u)][b]:
                    if get_date(r) > min_active_time:
                        users.append(u)
                        recently_active_users.append(u)
                        recent_review = True
                        break
                if recent_review:
                    break
            if not recent_review:
                other_users.append(u)
        else:
            users.append(u)

    if min_active_time:
        recent_positive = sum(new_edge_count[u] for u in recently_active_users)
        recent_examples = len(recently_active_users) * n_businesses
        other_positive = sum(new_edge_count[u] for u in other_users)
        other_examples = len(other_users) * n_businesses
        print "Positives retained from recently active filter:", \
            recent_positive / float(recent_positive + other_positive)
        print "Negatives retained from recently active filter:", \
            (recent_examples - recent_positive) / \
            float(recent_examples - recent_positive + other_examples - other_positive)

    random.seed(0)
    users = random.sample(users, n_users)

    print "Getting candidate set of edges..."
    examples = defaultdict(dict)
    for u in util.logged_loop(users, util.LoopLogger(50, n_users, True)):
        candidate_businesses = snap.TIntV()
        snap.GetNodesAtHop(G, u, 3, candidate_businesses, True)
        for b in candidate_businesses:
            if (u, b) in edges:
                examples[u][b] = 1
            elif random.random() < negative_sample_rate:
                examples[u][b] = 0

    hop3_positives = 0
    for u in examples:
        for b in examples[u]:
            hop3_positives += examples[u][b]
    hop3_examples = sum(len(examples[u]) for u in examples)
    n_positives = sum([new_edge_count[u] for u in users])
    n_examples = len(users) * n_businesses
    print "Positives retained from hop3 filter:", hop3_positives / float(n_positives)
    print "Negatives retained from hop3 filter:", (hop3_examples - hop3_positives) / \
        (negative_sample_rate * float(n_examples - n_positives))
    print "Data skew:", hop3_positives / float(hop3_examples)

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples.json')
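# The link-prediction snippets above fit together as a small pipeline:
# make_examples builds the labelled (user, business) candidates, the
# unsupervised scorers (svd, run_random_walks, ...) each write one score file,
# and X_y_e turns those scores into a feature matrix. A sketch of how the
# training side might be wired together; the driver name and the DictVectorizer
# choice are assumptions, and the neighbourhood scores (b_adamic, b_cn, ...)
# that X_y_e also reads come from scorers not reproduced here.
from sklearn.feature_extraction import DictVectorizer


def build_training_data():
    make_examples('./data/train/', n_users=5000)
    svd('train', k=50)
    run_random_walks('train', weight_edges=False)
    run_random_walks('train', weight_edges=True)
    vectorizer = DictVectorizer()
    X, y, e = X_y_e(True, vectorizer)
    return X, y, e, vectorizer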