Example #1
def load_and_process_metadata(subset):
    tf.logging.info('Processing meta data of %s...' % subset)
    t = time()
    is_test = subset.startswith('test')
    year = 2015 if is_test else 2014
    subtype = '%s%d' % (subset, year)
    ann_root = '/usr/data/fl302/code/VQA-tensorflow/data/annotations'
    datatype = 'test2015' if is_test else subtype
    # tf.logging.info('Loading annotations and questions...')
    questions = load_json(
        os.path.join(ann_root, 'MultipleChoice_mscoco_%s_questions.json' %
                     subtype))['questions']
    dataset = questions if is_test \
        else load_json(os.path.join(ann_root, 'mscoco_%s_annotations.json' % subtype))['annotations']

    meta = []
    for info, quest in zip(dataset, questions):
        ans = None if is_test else info['multiple_choice_answer']
        token_ans = None if is_test else ans
        quest_id = info['question_id']
        image_id = info['image_id']
        question = quest['question']
        mc_ans = quest['multiple_choices']
        meta.append(
            ImageMetadata(image_id, None, quest_id, question, ans, token_ans,
                          mc_ans))
    tf.logging.info('Time %0.2f sec.' % (time() - t))
    return meta
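
Note that these snippets come from different projects, and each relies on its own project-local load_json helper (often imported as util.load_json) rather than calling the json module directly; the signatures vary, and a few projects pass a default value or several path components. As a rough, minimal sketch of what the simplest of these helpers assume, a json.load/json.dump wrapper pair might look like this:

import json


def load_json(fpath):
    # Read a JSON file from disk and return the parsed object.
    with open(fpath, 'r') as f:
        return json.load(f)


def save_json(fpath, data):
    # Serialize data to a JSON file at fpath.
    with open(fpath, 'w') as f:
        json.dump(data, f)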
Example #2
    def __init__(self, g_id, addr, port, global_cfg_path, drone_cfg_path=None):
        self._g_id = g_id
        self._addr = addr
        self._port = port
        self._connection = None
        self._ctrl_station = None
        # Messages received from Dronology
        self._dronology_in = communication.core.MessageQueue()
        # Handshake messages sent by ControlStation to Dronology
        self._dronology_handshake_out = communication.core.MessageQueue()
        # State messages sent by ControlStation to Dronology
        self._dronology_state_out = communication.core.MessageQueue()
        # New vehicle messages sent by <anyone> to ControlStation.
        self._new_vehicle_in = communication.core.MessageQueue()
        self._global_cfg = util.load_json(global_cfg_path)

        if 'ardupath' not in self._global_cfg:
            _LOG.error(
                'You must specify \"ardupath\" in the global config file at {}'
                .format(global_cfg_path))
            exit(1)

        self._drone_cfgs = []
        # If a JSON drone configuration path is provided, load the drone configurations.
        # These will be used to create or connect to virtual or physical drones.
        if drone_cfg_path is not None:
            self._drone_cfgs = util.load_json(drone_cfg_path)

        self._is_alive = True
Example #3
    def __init__(self):
        gen_funcs = [
            self._gen_doc_file, self._gen_hist_file, self._gen_conf_file
        ]
        cfiles = [TORG_DOC_FILE, TORG_HIST_FILE, TORG_CONF_FILE]

        # Check for main config directory structure
        if not os.path.isdir(TORG_CONF_PATH):
            os.mkdir(TORG_CONF_PATH)
            for i in range(len(cfiles)):
                gen_funcs[i](cfiles[i])

        # Check if all config files exist
        for i in range(len(cfiles)):
            if not os.path.isfile(cfiles[i]):
                gen_funcs[i](cfiles[i])

        # Read all config files
        conf = load_json(TORG_CONF_FILE)
        hist = load_json(TORG_HIST_FILE)
        doc = load_json(TORG_DOC_FILE)
        self._conf = Map.recursive_map(Map(conf))
        self._hist = Map.recursive_map(Map(hist))
        self._doc = Map.recursive_map(Map(doc))

        return None
Example #4
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
Example #5
def svd_user_business(data_dir, k=50):
    print "Loading data and building user-business matrix..."
    users = util.load_json('./data/' + data_dir + '/user.json').keys()
    businesses = util.load_json('./data/' + data_dir + '/business.json').keys()
    examples = util.load_json('./data/' + data_dir + '/examples.json')

    user_to_row = dict(zip(users, range(len(users))))
    business_to_column = dict(zip(businesses, range(len(businesses))))

    user_business_matrix = sparse.lil_matrix((len(users), len(businesses)), dtype=float)
    with open('./data/' + data_dir + '/graph.txt') as f:
        for line in f:
            u, b = line.split()
            user_business_matrix[user_to_row[u], business_to_column[b]] = 1
    user_business_matrix = sparse.csr_matrix(user_business_matrix)

    print "Computing singular value decomposition..."
    u, s, vt = sparse.linalg.svds(user_business_matrix, k=k)
    us = u * s

    print "Writing results..."
    for u in examples:
        for b in examples[u]:
            examples[u][b] = np.dot(us[user_to_row[u], :], vt[:, business_to_column[b]])
    util.write_json(examples, './data/' + data_dir + '/svd.json')
Example #6
def intersect_datasets_on_ids(dataset1, dataset2):
    """
    Reduce dataset1 to include only those qa ids which occur in dataset2.
    This is useful, e.g., to reduce a marked dataset based on a dataset to which the
    exact-match filters from refine_json_dataset.py have been applied.
    """
    data1 = load_json(dataset1)
    data2 = load_json(dataset2)
    new_data = []

    # obtain ids from dataset2
    data2_ids = set()
    for datum in data2[DATA_KEY]:
        for qa in datum[DOC_KEY][QAS_KEY]:
            data2_ids.add(qa[ID_KEY])

    # reduce data1 based on ids from data2
    for datum in data1[DATA_KEY]:
        qas = []
        for qa in datum[DOC_KEY][QAS_KEY]:
            if qa[ID_KEY] in data2_ids:
                qas.append(qa)
            else:
                print("reduction")
        if qas:
            new_doc = document_instance(datum[DOC_KEY][CONTEXT_KEY], datum[DOC_KEY][TITLE_KEY], qas)
            new_data.append(datum_instance(new_doc, datum[SOURCE_KEY]))

    return dataset_instance(data1[VERSION_KEY], new_data)
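
A hypothetical call to this helper (the file names below are placeholders, not from the original project) would pair the marked dataset with its exact-match-filtered counterpart and write the reduced result back out:

import json

# Placeholder file names for illustration only.
reduced = intersect_datasets_on_ids('marked_dataset.json', 'filtered_dataset.json')
with open('marked_dataset_reduced.json', 'w') as fh:
    json.dump(reduced, fh)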
def X_y_e(is_train, vectorizer):
    print "Loading data..."
    dataset = "train" if is_train else "test"
    examples = util.load_json('./data/' + dataset + '/examples.json')
    users = util.load_json('./data/' + dataset + '/user.json')
    businesses = util.load_json('./data/' + dataset + '/business.json')
    unsupervised_scores = {
        f: util.load_json('./data/' + dataset + '/' + f + '.json')
        for f in [
            'svd', 'weighted_random_walks', 'random_walks', 'b_adamic', 'b_cn',
            'b_jaccard', 'u_adamic', 'u_cn', 'u_jaccard'
        ]
    }

    print "Computing features..."
    feature_dicts, y, e = [], [], []
    for u in examples:
        for b in examples[u]:
            e.append((u, b))
            y.append(examples[u][b])
            feature_dicts.append(
                get_features(u, b, unsupervised_scores, users[u],
                             businesses[b]))
    X = vectorizer.fit_transform(
        feature_dicts) if is_train else vectorizer.transform(feature_dicts)

    return X, y, e
Example #8
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
    def _load_data(self):
        meta_file = os.path.join(_DATA_ROOT,
                                 'data/%svqa_std_mscoco_%s.meta' %
                                 (self._version_suffix, self._subset))
        data_file = os.path.join(_DATA_ROOT,
                                 'data3/%svqa_mc_w2v_coding_%s.data' %
                                 (self._version_suffix, self._subset))
        cand_file = os.path.join(_DATA_ROOT,
                                 'data3/%svqa_mc_cands_%s.meta' %
                                 (self._version_suffix, self._subset))
        # load meta
        d = load_json(meta_file)
        self._images = d['images']
        self._quest_ids = np.array(d['quest_id'])
        vqa_image_ids = [find_image_id_from_fname(im_name) for im_name in self._images]
        self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)

        # load QA data
        d = load_hdf5(data_file)
        self._quests = d['quest_w2v'].astype(np.float32)
        self._answer = d['cands_w2v'].astype(np.float32)
        self._labels = d['labels']

        # load candidates
        self._cand_ans = load_json(cand_file)

        self._num = self._labels.size

        # double check question ids
        assert (np.all(self._quest_ids == d['quest_ids']))

        self._load_global_image_feature()
def run_random_walks(data_dir, weight_edges=False):
    print "Loading data and building transition matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json('./data/' + data_dir + '/review.json')
        end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory

    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[1.0 / adjacency_matrix.getrow(i).sum()
                                           for i in range(adjacency_matrix.shape[0])]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print "Running random walks..."
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(examples, './data/' + data_dir
                    + ('/weighted_random_walks.json' if weight_edges else '/random_walks.json'))
Example #11
    def __init__(self,
                 data_file_src,
                 data_file_tgt1,
                 data_file_tgt2,
                 data_file_tgt3,
                 tokenizer_enc,
                 tokenizer_dec,
                 is_train=True):
        self.src = []
        self.tgt = []
        self.src_mask = []
        self.original_target = []

        print('loading data from', data_file_src)
        with open(data_file_src) as f_src:
            lines_src = f_src.readlines()

        lines_tgt = []
        if data_file_tgt1 is not None and data_file_tgt2 is not None and data_file_tgt3 is not None:
            lines_tgt1 = []
            tgt_dialogs = load_json(data_file_tgt1)
            for d in tgt_dialogs['dialogs']:
                for turn in d['dialog']:
                    target = turn['target']
                    lines_tgt1.append(target)
            with open(data_file_tgt2) as f_tgt:
                lines_tgt2 = f_tgt.readlines()
            lines_tgt2 = [
                e.replace('<EOS>', '').replace(START_BELIEF_STATE,
                                               '').replace(END_OF_BELIEF,
                                                           '').strip()
                for e in lines_tgt2
            ]
            lines_tgt3 = []
            tgt_dialogs = load_json(data_file_tgt3)
            for d in tgt_dialogs['dialogs']:
                for turn in d['dialog']:
                    target = turn['answer'].strip()
                    lines_tgt3.append(target)
            lines_tgt = [
                '<cls> ' + lines_tgt1[idx] + ' <sep1> ' + lines_tgt2[idx] +
                ' <sep2> ' + lines_tgt3[idx] + ' <end>'
                for idx in range(len(lines_tgt1))
            ]
        else:
            lines_tgt = ['<cls> <end>' for _ in range(len(lines_src))]

        for idx in range(len(lines_src)):
            src = tokenizer_enc(lines_src[idx], add_special_tokens=True)
            src_vec = src.input_ids
            src_mask = src.attention_mask
            self.src.append(src_vec)
            self.src_mask.append(src_mask)

            self.original_target.append(lines_tgt[idx])
            if len(lines_tgt) > 0:
                tgt_vec = tokenizer_dec.encode(lines_tgt[idx],
                                               add_special_tokens=True)
                self.tgt.append(tgt_vec)
def main(args):
    raw_count, char_file, out_dir = args[0], args[1], args[2]

    raw = util.load_json(raw_count)
    char = util.load_json(char_file)
    result = count(raw, char)
    # save result
    save_result(os.path.join(out_dir, OUT_CHAR_ASCII), result['ascii'])
    save_result(os.path.join(out_dir, OUT_CHAR_CHINESE), result['chinese'])
    save_result(os.path.join(out_dir, OUT_CHAR_OTHER), result['other'])
Example #13
 def __enter__(self):
     self.tokenizer = W2VTokenizer()
     self.vocab = Vocab()
     self.bins = load_json('./data/wenmf/bins.json', [])
     status = load_json('./data/wenmf/state.json', {
         'initial_done': False,
     })
     self.topic_frames = []
     self.initial_done = status['initial_done']
     return self
def print_dataset_stats(data_dir):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    print "Num nodes:", G.GetNodes()
    print "Num edges:", G.GetEdges()

    n_users = len(util.load_json(data_dir + "user.json"))
    n_businesses = len(util.load_json(data_dir + "business.json"))
    n_edges = util.lines_in_file(data_dir + "new_edges.txt")
    print "({:} users) * ({:} businesses) = {:.3e} candidate edges".format(
        n_users, n_businesses, n_users * n_businesses)
    print "{:} edges, {:0.5f}% of candidate edges".format(n_edges, 100 * n_edges /
                                                          float(n_users * n_businesses))
Example #15
def _load_dataset():
    # load trainval
    subset = 'trainval'
    vqa_meta_file = '../iccv_vaq/data/vqa_std_mscoco_%s.meta' % subset
    vqa_meta_trainval = load_json(vqa_meta_file)

    # load dev
    subset = 'dev'
    vqa_meta_file = '../iccv_vaq/data/vqa_std_mscoco_%s.meta' % subset
    vqa_meta_dev = load_json(vqa_meta_file)

    return vqa_meta_trainval['quest_id'], vqa_meta_dev['quest_id']
Example #16
def print_dataset_stats(data_dir):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    print "Num nodes:", G.GetNodes()
    print "Num edges:", G.GetEdges()

    n_users = len(util.load_json(data_dir + "user.json"))
    n_businesses = len(util.load_json(data_dir + "business.json"))
    n_edges = util.lines_in_file(data_dir + "new_edges.txt")
    print "({:} users) * ({:} businesses) = {:.3e} candidate edges".format(
        n_users, n_businesses, n_users * n_businesses)
    print "{:} edges, {:0.5f}% of candidate edges".format(
        n_edges, 100 * n_edges / float(n_users * n_businesses))
Example #17
def on_connect(code, msg):
    print "connect"
    print(yield gen.Task(riakclient.ping))
    print(yield gen.Task(riakclient.put, "test", "one", dict(hello="world")))
    args, kwargs = yield gen.Task(riakclient.get, "test", "one")
    print load_json(args[1][0][1])
    buckets = yield gen.Task(riakclient.get_buckets)
    print "buckets:", buckets
    keys = yield gen.Task(riakclient.get_keys, "test")
    print "keys:", keys
    print(yield gen.Task(riakclient.delete, "test", "one"))

    ioloop.IOLoop.instance().stop()
Example #18
def load_config():
    def expand_paths(config):
        paths = ["history_file", "shortcuts_paths_file", "user_config_file"]
        for param in paths:
            config[param] = expanduser(config[param])
        return config

    ref_config = expand_paths(util.load_json(get_reference_config_path()))
    if os.path.exists(ref_config["user_config_file"]):
        usr_config = expand_paths(util.load_json(ref_config["user_config_file"]))
        config = util.patch_dict(ref_config, usr_config)
        config["user_config_file"] = ref_config["user_config_file"]
        return config
    return ref_config
    def _eval(self, gt_fname=None, rec_fname=None, gt=None, rec=None):
        if gt_fname and rec_fname:
            gt_playlists = load_json(gt_fname)
            rec_playlists = load_json(rec_fname)
        elif gt and rec:
            gt_playlists = gt
            rec_playlists = rec
        else:
            raise Exception("The given data is not valid.")

        gt_dict = {g["id"]: g for g in gt_playlists}
        gt_ids = set([g["id"] for g in gt_playlists])
        rec_ids = set([r["id"] for r in rec_playlists])

        if gt_ids != rec_ids:
            raise Exception("The number of playlists in the results is incorrect.")

        rec_song_counts = [len(p["songs"]) for p in rec_playlists]
        rec_tag_counts = [len(p["tags"]) for p in rec_playlists]

        if set(rec_song_counts) != set([100]):
            raise Exception("The number of recommended songs per playlist is incorrect.")

        if set(rec_tag_counts) != set([10]):
            raise Exception("The number of recommended tags per playlist is incorrect.")

        rec_unique_song_counts = [len(set(p["songs"])) for p in rec_playlists]
        rec_unique_tag_counts = [len(set(p["tags"])) for p in rec_playlists]

        if set(rec_unique_song_counts) != set([100]):
            raise Exception("Duplicate song recommendations within a playlist are not allowed.")

        if set(rec_unique_tag_counts) != set([10]):
            raise Exception("Duplicate tag recommendations within a playlist are not allowed.")

        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])

        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score
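
The evaluator above delegates to a self._ndcg helper that is not shown in this snippet; a plain-function sketch of the usual nDCG computation for this kind of song/tag scoring (an assumption, not necessarily the original implementation) is:

import numpy as np


def _ndcg(gt, rec):
    # Accumulate the discounted gain of every recommended item that appears in the ground truth.
    dcg = 0.0
    for i, r in enumerate(rec):
        if r in gt:
            dcg += 1.0 / np.log(i + 2)
    # Ideal DCG: every ground-truth item ranked at the top of the list.
    idcg = sum(1.0 / np.log(i + 2) for i in range(min(len(gt), len(rec))))
    return dcg / idcg if idcg > 0 else 0.0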
Example #20
 def _load_R(self):
     if is_precomputed(self.h, self.c, self.p_dir):
         return util.load_json(self.h, self.c, self.p_dir)
     else:
         R = precompute_c(self.c, self.h)
         util.save_json_c(self.h, self.c, R, self.p_dir)
         return R
Example #21
def smap_actuate(uuid, reading, q_url, a_url):
    q_url = q_url or smap_query_url
    a_url = a_url or smap_actuation_url
    # query the actuation stream for 'Properties'
    try:
        r = "select * where uuid = '{}'".format(uuid)
        resp = requests.post(q_url, r)
        j = load_json(resp.text)
        properties = j[0].get('Properties', {})
    except Exception as e:
        print "Error: smap_actuate --failed to extract 'propereties'"
        print e
        exit(1)

    #uuid of our stream
    #TODO: should we have a unique one for each thread?
    act_stream_uuid = "52edbddd-98e9-5cef-8cc9-9ddee810cd88"

    #construct our actuation request
    act = {'/actuate': {'uuid': act_stream_uuid,
                        'Readings': [[int(time.time()), reading]],
                        'Properties': properties,
                        'Metadata':{'override': uuid}}}

    print "sending smap actuation..."
    print requests.post(a_url, data=json.dumps(act))
Example #22
def process():
    def _parse_image_id(image):
        return int(image.split('.')[0].split('_')[-1])

    model = AttentionModel()
    ans2top_ans = AnswerTokenToTopAnswer()

    task_data_dir = '/usr/data/fl302/code/utils/bs_data_maker'
    task_data_file = os.path.join(task_data_dir, 'task_data_for_verif.json')
    task_data = load_json(task_data_file)
    is_valid = []
    num = len(task_data)
    for i, info in enumerate(task_data):
        print('%d/%d' % (i, num))
        image = info['image']
        image_id = _parse_image_id(image)
        question = info['target']
        answer = info['answer']
        scores = model.inference(image_id, question)
        scores[:, -1] = -10.
        # pdb.set_trace()
        top_ans_id = ans2top_ans.direct_query(answer)
        if top_ans_id == 2000:
            raise Exception('Warning: answer oov')
        scores = scores.flatten()
        pred_top_ans_id = scores.argmax()
        is_valid.append(int(pred_top_ans_id == top_ans_id))

    n_valid = sum(is_valid)
    print('valid: %d/%d' % (n_valid, num))
    save_json(os.path.join(task_data_dir, 'task_data_verif_state.json'),
              is_valid)
Example #23
def parse_results():
    ann_file = 'data/tmp.json'
    anno = load_json(ann_file)['annotation']
    num_confused = np.array([len(info['confused']) for info in anno],
                            dtype=np.float32)
    import pdb
    pdb.set_trace()
def get_phi(is_train):
    data_dir = "train" if is_train else "test"

    print "Loading reviews..."
    reviews = util.load_json("./data/" + data_dir + "/review.json")

    print "Building graph..."
    G = nx.read_edgelist("./data/" + data_dir + "/graph.txt", nodetype=int)
    n = G.number_of_nodes()

    print "Building feature matrices..."
    phi = defaultdict(lambda: sparse.lil_matrix((n, n), dtype=float))
    for (u, v) in G.edges():
        if str(u) not in reviews:
            u, v = v, u
        features = get_features(reviews[str(u)][str(v)], is_train)
        for feature_name, value in features.iteritems():
            phi[feature_name][u, v] = value
            phi[feature_name][v, u] = value

    print "Converting..."
    for k, m in phi.items():
        phi[k] = sparse.csr_matrix(m)

    return phi
def test():
    phi = get_phi(False)
    examples = util.load_json("./data/test/examples.json")
    w = util.load_json("./data/supervised_random_walks_weights.json")

    print "Computing Q and initializing..."
    Q = get_Q(phi, w)
    ps = {}
    for u in examples:
        p = np.zeros(phi["bias"].shape[0])
        p[int(u)] = 1.0
        ps[int(u)] = sparse.csr_matrix(p)
    get_ps(Q, ps, max_iter=20, convergence_criteria=0, log=True, examples=examples)

    print "Writing..."
    util.write_json(examples, "./data/test/supervised_random_walks.json")
Example #26
    def _load_data(self):
        meta_file = os.path.join(
            _DATA_ROOT, 'data/%svqa_std_mscoco_%s.meta' %
            (self._version_suffix, self._subset))
        data_file = os.path.join(
            _DATA_ROOT, 'data/%svqa_std_mscoco_%s.data' %
            (self._version_suffix, self._subset))
        # load meta
        d = load_json(meta_file)
        self._images = d['images']
        self._quest_ids = np.array(d['quest_id'])
        vqa_image_ids = [
            find_image_id_from_fname(im_name) for im_name in self._images
        ]
        self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)

        # load QA data
        d = load_hdf5(data_file)
        self._quest = d['quest_arr'].astype(np.int32)
        self._quest_len = d['quest_len'].astype(np.int32)
        self._answer = d['answer'].astype(np.int32)
        self._check_valid_answers()

        # self._load_caption_feature()
        self._load_global_image_feature()
Example #27
 def __init__(self, result_file, subset='val'):
     self._subset = subset
     self.results = load_json(result_file)
     self.num = len(self.results)
     self._im_root = get_image_feature_root()
     self.prog_str = ''
     self.mc_ctx = MultiChoiceQuestionManger(subset='val')
Example #28
 def _load_mapping(self):
     d = load_json('vqa_val_quest_id2im_id.json')
     mapping = {}
     for k, v in d.iteritems():
         mapping[int(k)] = u'%d' % v
     self.quest_id2image_id = mapping
     self.dummy_quest_id = self.quest_id2image_id.keys()[0]
def extract_raw_refs_from_pdfs():
    metas = util.load_json(cfg.paths['papers-metadata'])
    refs = util.parallelize(extract_raw_refs_from_pdf, metas, N_THREADS)
    refs = {m['uid']: r for m, r in zip(metas, refs)}
    util.save_json(cfg.paths['raw-papers-refs'], refs)

    print('saved raw papers refs to "{}"'.format(cfg.paths['raw-papers-refs']))
    def _load_data(self):
        meta_file = os.path.join(_DATA_ROOT,
                                 'data/%svqa_std_mscoco_%s.meta' %
                                 (self._version_suffix, self._subset))
        data_file = os.path.join(_DATA_ROOT,
                                 'data4/%svar_ivqa_%s_question_answers.data' %
                                 (self._version_suffix, self._subset))
        # load meta
        d = load_json(meta_file)
        images = d['images']
        quest_ids = np.array(d['quest_id'])
        vqa_image_ids = [find_image_id_from_fname(im_name) for im_name in images]
        quest_id2image_id = {qid: im_id for (qid, im_id) in zip(quest_ids, vqa_image_ids)}
        self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)

        # load QA data
        d = load_hdf5(data_file)
        self._quest_ids = d['ext_quest_ids'].astype(np.int32)
        self._quest = d['ext_quest_arr'].astype(np.int32)
        self._quest_len = d['ext_quest_len'].astype(np.int32)
        self._answer = d['ext_top_answer'].astype(np.int32)
        self._check_valid_answers()

        # sort images
        abs_quest_ids = self._quest_ids[:, 0]
        self._vqa_image_ids = np.array([quest_id2image_id[_id] for _id in abs_quest_ids],
                                       dtype=np.int32)

        # self._load_caption_feature()
        self._load_global_image_feature()
Example #31
 def _load_R(self, h, c):
     if is_precomputed(h, c, self.p_dir):
         return util.load_json(h, c, self.p_dir)
     else:
         R = precompute_c(c, h)
         util.save_json_c(h, c, R, self.p_dir)
         return R
Example #32
def graph_it(input_file):
    json_file = util.load_json(input_file)
    co_dict = defaultdict(int)
    co_co_dict = {}

    master_id = json_file.get('GENERAL-INFORMATION').get('ID')

    articles = json_file.get('ARTICLES')

    for a in articles:
        for co in a.get('AUTHORS'):
            co_id = co.get('ID')
            if (co_id != master_id and co_id):
                co_dict[co_id] += 1

    co_dict = dict(co_dict)

    for key in co_dict.iterkeys():

        tmp_dict = defaultdict(int)
        for a in articles:
            pre_list = [x.get('ID') for x in a.get('AUTHORS')]
            if (key in pre_list):
                for co in a.get('AUTHORS'):
                    co_id = co.get('ID')
                    if (co_id != key and co_id):
                        tmp_dict[co_id] += 1

        tmp_dict = dict(tmp_dict)
        if (len(tmp_dict) > 0):
            co_co_dict[key] = tmp_dict

    #construct_graph(co_dict,co_co_dict)
    print calculate(co_dict, co_co_dict)
Example #33
def eval_recall(results):
    over_complete = load_json(_ANNO_FILE)
    accs = []
    num_cands = 0
    for res in results:
        quest_id = str(res['question_id'])
        cands = res['answers']
        gts = over_complete[quest_id]
        cand_acc = []
        for cand in cands:
            cand = str(cand)
            cand = cand.strip()
            if cand in gts:
                _n = gts[cand]
            else:
                _n = 0
            cand_acc.append(min(1., float(_n) / 3))
        v = max(cand_acc)
        accs.append(v)
        num_cands += len(cands)
        # num_correct += (_n >= 3)
    num_tot = len(results)
    # mean_acc = num_correct / float(num_tot)
    mean_acc = np.array(accs).mean()
    print('Evaluated %d questions' % num_tot)
    print('Total number of candidates: %d (%0.2f/image)' %
          (num_cands, float(num_cands) / num_tot))
    print('Recall: %0.2f' % (100. * mean_acc))
def make_examples_simple(data_dir, n_users, negative_examples_per_user=10):
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    new_edges = defaultdict(dict)
    with open(data_dir + 'new_edges.txt') as f:
        for line in f:
            u, b = map(int, line.split())
            new_edges[u][b] = 1

    businesses = map(int, util.load_json(data_dir + 'business.json').keys())
    examples = defaultdict(dict)
    users = random.sample([NI.GetId() for NI in G.Nodes()], n_users)
    for u in users:
        examples[u] = new_edges[u]
        for i in range(negative_examples_per_user):
            b = random.choice(businesses)
            examples[u][b] = 0

    p, n = 0, 0
    for u in examples:
        for b in examples[u]:
            p += examples[u][b]
            n += 1 - examples[u][b]
    print "Positive:", p
    print "Negative:", n
    print "Data skew:", p / float(p + n)
    print "Sampling rate:", negative_examples_per_user / float(len(businesses))

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples_simple.json')
 def _load_data(self):
     meta_file = 'data/%svqa_std_mscoco_%s.meta' % (self._version_suffix,
                                                    self._subset)
     data_file = 'data/%svqa_std_mscoco_%s.data' % (self._version_suffix,
                                                    self._subset)
     self._images = load_json(meta_file)['images']
     d = load_hdf5(data_file)
     self._quest = d['quest_arr'].astype(np.int32)
     self._quest_len = d['quest_len'].astype(np.int32)
     self._answer = d['answer'].astype(np.int32)
     self._num = self._answer.size
     self._check_valid_answers()
     vqa_image_ids = [
         find_image_id_from_fname(im_name) for im_name in self._images
     ]
     self._vqa_image_ids = np.array(vqa_image_ids, dtype=np.int32)
     # load caption data
     self._load_caption_data()
     # load attributes
     if self._attr_type == 'res152':
         self._load_global_image_feature()
     else:
         self._load_attributes()
     # load answer sequences
     self._load_answer_sequence()
Example #36
 def __init__(self,
              subset='val',
              num_eval=None,
              need_im_feat=True,
              need_attr=False,
              use_ans_type=False,
              feat_type='res152'):
      anno_file = 'data/MultipleChoicesQuestionsKarpathy%sV2.0.json' % subset.title()
     self._subset = subset
     d = load_json(anno_file)
     self._id2type = d['candidate_types']
     self._annotations = d['annotation']
     if num_eval == 0:
         num_eval = len(self._annotations)
     self._num_to_eval = num_eval
     self._idx = 0
     self._need_attr = need_attr
     self._need_im_feat = need_im_feat
     self._quest_encoder = SentenceEncoder('question')
     self._answer_encoder = SentenceEncoder('answer')
     self._im_encoder = MCDataFetcher(subset='kp%s' % subset,
                                      feat_type=feat_type)
     self.num_samples = len(self._annotations)
     self._mc_ctx = MultiChoiceQuestionManger(subset='val')
     self._group_by_answer_type()
     self._use_ans_type = use_ans_type
Example #37
def get_contexts(
        dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.json",
        output_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/train1.0.txt",
        downcase=False):
    """
    Gets passage text with no concept annotations.
    """
    dataset = load_json(dataset_file)
    data = dataset[DATA_KEY]
    n_all = 0
    all_contexts = ""

    for datum in data:
        new_context = "\n" + datum[DOC_KEY][TITLE_KEY] + "\n" + datum[DOC_KEY][
            CONTEXT_KEY]
        all_contexts += remove_concept_marks(new_context)
        curr_queries = set()
        for qa in datum[DOC_KEY][QAS_KEY]:
            a = ""
            for ans in qa[ANS_KEY]:
                if ans[ORIG_KEY] == "dataset":
                    a = ans[TXT_KEY]
            assert a
            curr_queries.add(
                remove_concept_marks(qa[QUERY_KEY]).replace(
                    PLACEHOLDER_KEY, a))
        all_contexts += "\n" + "\n".join(curr_queries)
        n_all += 1
    print(n_all)

    all_contexts = all_contexts.replace("\n\n", "\n")
    with open(output_file, "w") as fh:
        fh.write(all_contexts.lower() if downcase else all_contexts)
Example #38
 def load_info(self, load_path=None):
     if load_path is None:
         with pyglet.resource.file(self.resource_path('info.json'),
                                   'r') as info_file:
             self.info = json.load(info_file)
     else:
         self.info = util.load_json(load_path)
Example #39
 def __init__(self,
              name,
              init_z=0,
              init_x=0,
              init_y=0,
              init_ox=0,
              init_oy=0,
              animations=[]):
     # SUPER CALL
     super(SpriteAnimationSet, self).__init__(name, init_z, init_x, init_y,
                                              0, 0, init_ox, init_oy)
     if len(animations) <= 0:
         raise InvalidArgumentsExecption("No animation files set.")
     # Loads animations framedata and spritesheets.
     self.animations = {}
     spritesheet = {}
     for animation in animations:
         framedata = FrameData(util.load_json(animation[1]))
         if framedata.sprite_sheet not in spritesheet.keys():
             spritesheet[framedata.sprite_sheet] = SpriteSheetManager(
                 framedata.sprite_sheet)
         self.animations[animation[0]] = SpriteAnimation(
             animation[0],
             init_z=1,
             init_x=0,
             init_y=0,
             frame_data=framedata,
             sprite_sheet=spritesheet[framedata.sprite_sheet])
     self.curr_animation = animations[0][0]
def main(example_file,graph_file,u_methods,u_outfiles,b_methods,b_outfiles):
	start = datetime.datetime.now()
	print "Loading examples..."
	examples = util.load_json(example_file)
	print "Loading graph..."
	G = snap.LoadEdgeList(snap.PUNGraph, graph_file, 0, 1)
	users(examples, G, u_methods, u_outfiles)
	business(examples, G, b_methods, b_outfiles)
Example #41
 def on_value_changed(self,text):
     try:
         node = self.get_node()
         node.parent().replace_node(
             node,
             load_json(text)
         )
     except ValueError:
         self.model().invalid_json.emit()
Example #42
 def get(self, model, key, r=None, pr=None,
         vtag=None, callback=None):
     bucket_name = model.__name__
     (code, docs), _ = yield gen.Task(super(RiakAdapter, self).get,
                                      bucket_name, str(key), r, pr, vtag)
     if docs and model:
         docs = map(lambda x: model(load_json(x[1]), x[0]), docs)
         callback(ViewResult(docs))
     else:
         callback(docs)
Example #43
def getIps(file_name):
    sl = util.load_json(file_name)
    public_list = []
    private_list = []
    for server in sl.values():
        pub_ip = server["addresses"]["public"][0]
        priv_ip = server["addresses"]["private"][0]
        public_list.append(pub_ip)
        private_list.append(priv_ip)
    return {"PUBLIC": public_list, "SERVICENET": private_list}
Example #44
 def load(self, folder_name="autosave"):
     """Returns a scene if save existed and was loaded successfully"""
     base_path = os.path.join(self.save_path, folder_name)
     scn = self.scene_handler.scene
     if not os.path.exists(base_path):
         return None
     else:
         my_info = util.load_json(os.path.join(base_path, 'game'))
         self.game_variables = my_info['game_variables']
         return scene.Scene(my_info['first_scene'], self.scene_handler, self.ui,
                            load_path=os.path.join(base_path, my_info['first_scene']))
Example #45
def get_conferences():
	files = util.listdir(CONFERENCE_FOLDER)
	util.mkdir(CONFERENCE_CRALWED_FOLDER)
	cnt = 0
	conf = util.load_json('conf_name.json')
	for file_name in files:
		save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
		if data['short'] not in conf.keys():
			continue
		html = util.get_page(data['url'])
		subs = get_subs(data['short'], html)
		data['name'] = conf[data['short']]
		data['sub'] = {}
		if len(subs) == 0:
			data['sub']['#'] = get_publications(html)
			util.save_json(save_path, data)
		cnt += 1
def X_y_e(is_train, vectorizer):
    print "Loading data..."
    dataset = "train" if is_train else "test"
    examples = util.load_json('./data/' + dataset + '/examples.json')
    users = util.load_json('./data/' + dataset + '/user.json')
    businesses = util.load_json('./data/' + dataset + '/business.json')
    unsupervised_scores = {f: util.load_json('./data/' + dataset + '/' + f + '.json') for f in
                           ['svd', 'weighted_random_walks', 'random_walks', 'b_adamic', 'b_cn',
                            'b_jaccard', 'u_adamic', 'u_cn', 'u_jaccard']}

    print "Computing features..."
    feature_dicts, y, e = [], [], []
    for u in examples:
        for b in examples[u]:
            e.append((u, b))
            y.append(examples[u][b])
            feature_dicts.append(get_features(u, b, unsupervised_scores, users[u], businesses[b]))
    X = vectorizer.fit_transform(feature_dicts) if is_train else vectorizer.transform(feature_dicts)

    return X, y, e
Example #47
def get_journals():
	files = util.listdir(JOURNAL_FOLDER)
	util.mkdir(JOURNAL_CRALWED_FOLDER)
	cnt = 0
	jour = util.load_json('jour_name.json')
	for file_name in files:
		save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
		if data['short'] not in jour.keys():
			continue
		html = util.get_page(data['url'])
		subs = get_subs(data['short'], html)
		data['name'] = jour[data['short']]
		data['sub'] = {}
		if len(subs) == 0:
			data['sub']['#'] = get_publications(html)
			util.save_json(save_path, data)
		cnt += 1
		print cnt, len(files), data['short']		
def train():
    phi = get_phi(True)

    print "Loading examples..."
    Ds, Ls = {}, {}
    examples = util.load_json("./data/train/examples.json")
    us = list(examples.keys())
    random.seed(0)
    random.shuffle(us)
    for u in us:
        D, L = set(), set()
        for b in examples[u]:
            (D if examples[u][b] == 1 else L).add(int(b))
        if len(D) > MAX_POSITIVE_EDGES_PER_USER:
            D = random.sample(D, MAX_POSITIVE_EDGES_PER_USER)
        if len(L) > MAX_NEGATIVE_EDGES_PER_USER:
            L = random.sample(L, MAX_NEGATIVE_EDGES_PER_USER)
        if len(D) > 1 and len(L) > 10:
            Ds[int(u)] = list(D)
            Ls[int(u)] = list(L)
            if len(Ds) > NUM_TRAIN_USERS:
                break

    print "Setting initial conditions..."
    ps = {}
    for u in Ds:
        p = np.zeros(phi["bias"].shape[0])
        p[u] = 1.0
        ps[u] = sparse.csr_matrix(p)

    print "Training..."
    w = INITIAL_WEIGHTS
    best_loss = 100000
    for i in range(100):
        print "ITERATION " + str(i + 1) + ": base"
        base_loss, ps = run(phi, w, Ds, Ls, ps)
        if base_loss < best_loss:
            best_loss = base_loss
            util.write_json(w, "./data/supervised_random_walks_weights.json")

        partials = {}
        for k in w:
            print "ITERATION " + str(i + 1) + ": " + k
            new_w = w.copy()
            new_w[k] += H
            new_loss, _ = run(phi, new_w, Ds, Ls, ps)
            partials[k] = (new_loss - base_loss) / H

            print partials[k] * LEARNING_RATE

        for (k, dwk) in partials.iteritems():
            w[k] -= LEARNING_RATE * dwk
Example #49
def get_journals():
	files = util.listdir(JOURNAL_FOLDER)
	util.mkdir(JOURNAL_CRALWED_FOLDER)
	cnt = 0
	for file_name in files:
		save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
		data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
		html = util.get_page(data['url'])
		full_name = get_full_name(html)
		data['name'] = full_name
		cnt += 1
		print cnt, len(files), data['short']
		data['links'] = get_links(data['short'], html)
Example #50
def load_headers():
    h = pickle.load(open("auth_headers.db","r"))
    intended_keys = ["x-auth-token"]
    headers = {}
    headers["content-type"] = "application/xml"
    headers["accept"] = "application/xml"
    if os.path.isfile("extra_headers.json"):
       extra_headers = util.load_json("extra_headers.json")
       headers.update(extra_headers)
    for auth_key in intended_keys:
        headers[auth_key]=h[auth_key]
    printf("\n\nheaders=%s\n",headers)
    return headers
Example #51
def get_authors():
	files = util.listdir(AUTHOR_FOLDER)
	util.mkdir(AUTHOR_CRALWED_FOLDER)
	for file_name in files:
		save_path = os.path.join(AUTHOR_CRALWED_FOLDER, file_name)
		if util.exists(save_path):
			continue
		data = util.load_json(os.path.join(AUTHOR_FOLDER, file_name))
		html = util.get_page(data['url'])
		full_name = get_full_name(html)
		data['name'] = full_name
		print data['short'], full_name
		data['links'] = get_links(data['short'], html)
		util.save_json(save_path, data)
Example #52
def run_generator(args):
    import json, sys
    # Grab the spec's and schema's path.
    search_paths = {}
    for filename in [args.spec, args.schema]:
        path = os.path.abspath(os.path.dirname(os.path.expanduser(filename)))
        search_paths[path] = True

    spec = load_json(args.spec)
    schema = load_json(args.schema)
    writer = OutputWriter() if not args.dryrun else DryRunWritter()
    reader = FileReader()
    generator = Generator(spec, schema,
                          writer=writer,
                          reader=reader,
                          mode=args.target,
                          paths=search_paths.keys())
    error_details = {}
    try:
        generator.generate(error_details=error_details)
        return
    except SchemaError, err:
        print 'Schema Error:', err.message
Example #53
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        for sub in subs:
            html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = jour[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
Example #54
def svd(data_dir, k=50):
    print "Loading data and building adjacency matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    adjacency_matrix = sparse.csr_matrix(nx.adjacency_matrix(G), dtype=float)

    print "Computing singular value decomposition..."
    u, s, vt = sparse.linalg.svds(adjacency_matrix, k=k)
    us = u * s

    print "Writing results..."
    for u in examples:
        for b in examples[u]:
            examples[u][b] = np.dot(us[u, :], vt[:, b])
    util.write_json(examples, './data/' + data_dir + '/svd.json')
Example #55
 def parse_file(filename):
     def fail():
         failure = (False,None)
         QtGui.QMessageBox.warning(
             self,
             'invalid json file',
             'file was not list of json objects'
         )
         return failure            
     with open(filename,'r') as f:
         try:
             json_object = load_json(f.read())
             if type(json_object) is not list:
                 return fail()
             return (True, json_object)
         except ValueError:                    
             return fail()
Example #56
def get_conferences():
	files = util.listdir(CONFERENCE_FOLDER)
	util.mkdir(CONFERENCE_CRALWED_FOLDER)
	cnt = 0
	for file_name in files:
		cnt += 1
		if cnt < 1970:
			continue
		save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
		data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
		html = util.get_page(data['url'])
		full_name = get_full_name(html)
		data['name'] = full_name
		try:
			print cnt, len(files), data['short']
		except:
			pass
		data['links'] = get_links(data['short'], html)
Example #57
def run_evaluation(examples, methods, precision_at=20):
    curve_args = []

    for i, method in enumerate(methods):
        predictions = util.load_json('./data/test/' + method + '.json')
        total_precision = 0
        all_ys, all_ps = [], []
        for u in predictions:
            ys, ps = zip(*[(examples[u][b], predictions[u][b]) for b in predictions[u]])
            all_ys += ys
            all_ps += ps

            n = min(precision_at, len(ys))
            top_ys = zip(*sorted(zip(ys, ps), key=itemgetter(1), reverse=True))[0][:n]
            total_precision += sum(top_ys) / float(n)

        roc_auc = roc_auc_score(all_ys, all_ps)
        fpr, tpr, t = roc_curve(all_ys, all_ps)
        curve_args.append((fpr, tpr, method, COLORS[i % len(COLORS)]))

        print "Method:", method
        print "  Precision @{:} = {:.4f}".format(precision_at, total_precision / len(examples))
        print "  ROC Auc = {:.4f}".format(roc_auc)

    if i >= len(COLORS):
        print "Too many methods to plot all of them!"
        return

    plt.figure(figsize=(9, 9))
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.title('ROC curves')
    for (fpr, tpr, label, color) in curve_args:
        plt.plot(fpr, tpr, label=label, color=color)
    plt.legend(loc="best")
    plt.show()
Example #58
import util

jour = util.load_json('jour_name.json')
conf = util.load_json('conf_name.json')

print len(jour), len(conf), len(jour) + len(conf)

for k, v in jour.items():
	conf[k] = v

print len(conf)
def make_examples(data_dir, n_users=5000, min_degree=1, negative_sample_rate=0.01,
                  min_active_time=None, new_edge_only=False):
    print "Loading data..."
    # TODO: switch to networkx?
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    with open(data_dir + 'new_edges.txt') as f:
        edges = {tuple(map(int, line.split())) for line in f}
    new_edge_count = Counter()
    for (u, b) in edges:
        new_edge_count[u] += 1
    review_data = util.load_json(data_dir + 'review.json')
    n_businesses = len(util.load_json(data_dir + "business.json"))

    recently_active_users = []
    other_users = []
    print "Getting candidate set of users..."
    users = []
    for Node in util.logged_loop(G.Nodes(), util.LoopLogger(50000, G.GetNodes(), True)):
        u = Node.GetId()
        if new_edge_only and not u in new_edge_count:
            continue
        if str(u) not in review_data or Node.GetOutDeg() < min_degree:
            continue
        if min_active_time:
            recent_review = False
            for b in review_data[str(u)]:
                if (int(u), int(b)) in edges:
                    continue
                for r in review_data[str(u)][b]:
                    if get_date(r) > min_active_time:
                        users.append(u)
                        recently_active_users.append(u)
                        recent_review = True
                        break
                if recent_review:
                    break
            if not recent_review:
                other_users.append(u)
        else:
            users.append(u)

    if min_active_time:
        recent_positive = sum(new_edge_count[u] for u in recently_active_users)
        recent_examples = len(recently_active_users) * n_businesses
        other_positive = sum(new_edge_count[u] for u in other_users)
        other_examples = len(other_users) * n_businesses
        print "Positives retained from recently active filter:", \
            recent_positive / float(recent_positive + other_positive)
        print "Negatives retained from recently active filter:", \
            (recent_examples - recent_positive) / \
            float(recent_examples - recent_positive + other_examples - other_positive)

    random.seed(0)
    users = random.sample(users, n_users)

    print "Getting candidate set of edges..."
    examples = defaultdict(dict)
    for u in util.logged_loop(users, util.LoopLogger(50, n_users, True)):
        candidate_businesses = snap.TIntV()
        snap.GetNodesAtHop(G, u, 3, candidate_businesses, True)
        for b in candidate_businesses:
            if (u, b) in edges:
                examples[u][b] = 1
            elif random.random() < negative_sample_rate:
                examples[u][b] = 0

    hop3_positives = 0
    for u in examples:
        for b in examples[u]:
            hop3_positives += examples[u][b]
    hop3_examples = sum(len(examples[u]) for u in examples)
    n_positives = sum([new_edge_count[u] for u in users])
    n_examples = len(users) * n_businesses
    print "Positives retained from hop3 filter:", hop3_positives / float(n_positives)
    print "Negatives retained from hop3 filter:", (hop3_examples - hop3_positives) / \
            (negative_sample_rate * float(n_examples - n_positives))
    print "Data skew:", hop3_positives / float(hop3_examples)

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples.json')