Example No. 1
def save_data(config, anew=True):
    # VK weights are not used, since 98% of them are identical

    labels = pd.Series(
        to_numpy(load_vector(config.join("labels.bin"), 'i'), 'l'))
    num_nodes = len(labels)

    nodes = None
    if not config.directed:
        nodes = to_numpy(load_vector(config.join("nodes.bin"), 'I'))

    train, test, _ = train_test_partition(config, labels, nodes, anew=anew)
    train_test = np.concatenate([train, test])
    y, num_classes, cities = get_target(config, labels, train_test)
    print("num_classes = %s" % num_classes)

    data = Data(y=y)

    data.num_nodes = num_nodes
    data.num_classes = num_classes
    data.train_mask = torch.zeros(num_nodes, dtype=torch.uint8)
    data.train_mask[train] = 1
    data.test_mask = torch.zeros(num_nodes, dtype=torch.uint8)
    data.test_mask[test] = 1
    data.cities = cities

    torch.save(data, config.join('data%s.pt' % config.postfix))
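
# Hedged sketch (not part of the original repo): the load_vector / to_numpy
# helpers from `tools` are used throughout these examples but never shown.
# They presumably wrap flat binary files of fixed-width values; the real
# implementations may differ.
import array
import numpy as np

def load_vector(path, typecode):
    # Read a flat binary file into an array.array of the given typecode.
    vec = array.array(typecode)
    with open(path, 'rb') as f:
        vec.frombytes(f.read())
    return vec

def to_numpy(vec, typecode=None):
    # Copy the array.array into a NumPy array, optionally casting the dtype.
    return np.array(vec, dtype=typecode or vec.typecode)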
Example No. 2
def analyze(basedir, typecode):
    # basedir='../data/twitter'
    # typecode = 'L'
    cities = pd.read_csv(join(basedir, "geography.csv"), header=None)[0]
    labels = to_numpy(load_vector(join(basedir, "labels.bin"), 'i'))
    nodes = tools.load_vector(join(basedir, "nodes.bin"), typecode)
    id2ix = MapU64U32()
    for ix, _id in enumerate(nodes):
        id2ix[_id] = ix

    found = array.array('f')
    with open(join(basedir, "post_geo.csv")) as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the headers
        for line in tqdm(reader):
            ix = id2ix[int(line[0])]
            label = labels[ix]
            if label > -1:
                city = cities[label]
                parts = line[1].split(',')
                try:
                    index = parts[::2].index(city)
                except ValueError:
                    continue
                counts = np.array(parts[1::2], dtype='I')
                # r = float(counts[index]) / counts.sum()
                r = counts[index]
                found.append(r)

    found = to_numpy(found)
    # stack the quantile levels with their values: row 0 = level, row 1 = value
    qs = [[.2, .3, .4, .5, .6, .7, .8, .9]]
    qs.append(np.quantile(found, qs[0]))
    return np.array(qs)
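
# Hedged usage sketch (paths and typecode taken from the comments above):
# pair each quantile level with its value; `analyze` is assumed to return qs.
qs = analyze('../data/twitter', 'L')
print(pd.DataFrame(qs, index=['quantile', 'value']))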
Example No. 3
def load_data(config: Config):
    Data.num_node_features = property(lambda self: self.x.shape[1])
    data = torch.load(config.pt_data_path)
    data.x = CSRMatrix(config.node_csr_data_path, config.node_csr_indices_path,
                       config.node_csr_indptr_path, config.cpu_count)

    if config.directed:
        Data.num_edge_features = property(lambda self: self.edge_attr.shape[1])
        data.edge_attr = CSRMatrix(config.edge_csr_data_path,
                                   config.edge_csr_indices_path,
                                   config.edge_csr_indptr_path,
                                   config.cpu_count)
        data.reverse_edge_map = to_numpy(
            load_vector(config.reverse_edge_map_path, 'I'))

    if not os.path.exists(config.stat_path):
        compute_mean_std(config, data)
    stat = torch.load(config.stat_path)
    data.stat = stat
    if config.directed:
        data.double_mu_edge = torch.cat([stat.mu_edge, stat.mu_edge])
        data.double_std_edge = torch.cat([stat.std_edge, stat.std_edge])

    for part_path in config.part_dirs:
        print(part_path)
        torch.cuda.empty_cache()
        if not config.dev_mode:
            data.predict_mask = torch.zeros(data.num_nodes, dtype=torch.uint8)
            if config.directed:
                findex = join(part_path, "predict_index.bin")
            else:
                findex = join(part_path, "nodes.bin")
            data.predict_mask[load_vector(findex, 'I')] = 1
        data.edge_index = torch.from_numpy(
            load_2d_vec(join(part_path, "colrow.bin"),
                        nrows=2,
                        typecode='i',
                        order='F'))
        deg = None
        if config.directed:
            deg = torch.from_numpy(
                to_numpy(load_vector(join(part_path, "degrees.bin"), 'I'),
                         'l'))

        nbr_sampler = NeighborSampler(config, data, deg=deg, batch_size=1000)

        yield nbr_sampler
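
# Hedged usage sketch: load_data is a generator that yields one NeighborSampler
# per partition directory; the attributes used here mirror predict() below
# (predict_mask is only populated when config.dev_mode is off).
for nbr_sampler in load_data(config):
    data = nbr_sampler.data
    for data_flow in nbr_sampler(data.predict_mask):
        pass  # run the model on each sampled mini-batch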
Example No. 4
def check_russian():
    nodes = tools.load_vector('../data/facebook/nodes.bin', 'L')
    id2ix = get_id2ix(nodes)
    labels = tools.to_numpy(
        tools.load_vector('../data/facebook/labels.bin', 'i'))

    ixs = []
    with open('../data/facebook/queue2.txt') as f:
        for s in f:
            try:
                ixs.append(id2ix[int(s)])
            except (KeyError, ValueError):
                continue

    # profiles without edges were not included, which makes sense

    sub = labels[ixs]
    # how many of the queued profiles already have a city label
    print(sub[sub > -1].shape)
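
# Hedged sketch (assumption): get_id2ix is not shown; presumably it inverts the
# node-id vector the same way Example No. 2 builds id2ix by hand, so a plain
# dict is enough for a sketch.
def get_id2ix(nodes):
    return {int(_id): ix for ix, _id in enumerate(nodes)}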
Example No. 5
def train_test_partition(config, labels, nodes=None, anew=False):
    """ Sample active users with cities, filter nonfrequent cities, return train/test split """
    ftest = config.join("test_index.bin")
    ftrain = config.join("train_index.bin")
    fpredict = config.join("predict_index.bin")

    # sample only users having at least degree_city_threshold (e.g. 30) friends with known cities
    if anew:
        degrees_cities = to_numpy(
            load_vector(config.join("degrees_with_cities.bin"), 'f'))
        is_active_cities = degrees_cities > config.degree_city_threshold

        if config.directed:
            is_node_active = is_active_cities
        else:
            degrees = to_numpy(load_vector(config.join("degrees.bin"), 'I'))
            is_active = np.logical_and(degrees > config.degree_threshold,
                                       is_active_cities)
            is_node = np.zeros(len(labels), dtype=bool)
            is_node[nodes] = True
            is_node_active = is_node & is_active

        is_city = to_numpy(load_vector(config.join("is_city.bin"), 'B'))
        train_test = labels[(is_city == 1) & is_node_active]
        # predict = labels[~is_city & is_node_active]
        predict = labels[is_node_active]

        city_freq = train_test.value_counts()
        is_valid = np.zeros(city_freq.index.max() + 1, dtype=bool)
        # exclude very infrequent
        valid_cities = city_freq[city_freq > config.city_freq_threshold].index
        is_valid[valid_cities] = True
        train_test = train_test[is_valid[train_test]]
        train_size = min(config.max_train_size,
                         len(train_test) - config.test_size)
        train, test = train_test_split(train_test,
                                       stratify=train_test,
                                       test_size=config.test_size,
                                       train_size=train_size,
                                       random_state=11)
        with open(ftest, 'wb') as f1, \
             open(ftrain,'wb') as f2, \
             open(fpredict,'wb') as f3:
            test = array.array('I', test.index)
            test.tofile(f1)
            train = array.array('I', train.index)
            train.tofile(f2)
            predict = array.array('I', predict.index)
            predict.tofile(f3)
    else:
        test = load_vector(ftest, 'I')
        train = load_vector(ftrain, 'I')
        predict = load_vector(fpredict, 'I')

    return train, test, predict
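
# Hedged usage sketch: train/test/predict hold node positions (the index of the
# pandas label Series), which save_data above turns into boolean masks, e.g.:
train, test, _ = train_test_partition(config, labels, nodes, anew=True)
train_mask = torch.zeros(len(labels), dtype=torch.uint8)
train_mask[train] = 1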
Example No. 6
def compute_mean_std(config, data):
    def compute(x, ixs, chunk_size):
        device = config.device
        total = math.ceil(len(ixs) / chunk_size)
        s1 = torch.zeros(x.shape[1], dtype=torch.float32).to(device)
        s2 = torch.zeros(x.shape[1], dtype=torch.float32).to(device)
        counts = torch.zeros(x.shape[1], dtype=torch.int64).to(device)
        for ch_ixs in tqdm(tools.grouper(ixs, chunk_size),
                           total=total,
                           desc="mean/std"):
            ch_ixs = np.array(ch_ixs, dtype='I')
            chunk = torch.from_numpy(x[ch_ixs]).to(device)
            s1 += chunk.sum(dim=0)
            s2 += torch.square(chunk).sum(dim=0)
            counts += (chunk != 0).sum(dim=0)

        mu = s1 / x.shape[0]
        var = s2 / x.shape[0] - torch.square(mu)
        std = torch.pow(var, 0.5)
        std[std == 0] = 1.
        return counts, mu, std

    if config.directed:
        ixs = range(data.x.shape[0])
    else:
        ixs = load_vector(config.join("nodes.bin"), "I")

    counts_node, mu_node, std_node = compute(data.x, ixs, 10**6)
    rare = (counts_node < config.city_freq_threshold).sum()
    if rare:
        print("node features having less than %d counts: %d" %
              (config.city_freq_threshold, rare))

    counts_edge, mu_edge, std_edge = None, None, None
    if config.directed:
        ixs = range(data.edge_attr.shape[0])
        counts_edge, mu_edge, std_edge = compute(data.edge_attr, ixs, 10**7)

    stat = Data(counts_node=counts_node,
                mu_node=mu_node,
                std_node=std_node,
                counts_edge=counts_edge,
                mu_edge=mu_edge,
                std_edge=std_edge)
    torch.save(stat, config.stat_path)
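
# Hedged sketch (assumption): the saved statistics are presumably used to
# standardize feature chunks before they reach the model; `batch_ixs` is a
# placeholder for a batch of row indices (dtype 'I'), not an original name.
stat = torch.load(config.stat_path)
chunk = torch.from_numpy(data.x[batch_ixs]).to(config.device)
chunk = (chunk - stat.mu_node) / stat.std_node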
Example No. 7
def show():
    u64_nodes = tools.load_vector('../data/facebook/nodes.bin', 'L')
    edge_index = tools.load_2d_vec(
        '../data/facebook/colrow.bin',  # edgelist_double.bin
        nrows=2,
        typecode='i',
        order='F')

    # ======================================================================

    edge_map = tools.load_vector("../data/facebook/reverse_edge_map.bin", 'I')

    print_reverse_edges(2345, edge_index, edge_map)

    # ======================================================================

    # id2ix = get_id2ix(u64_nodes)
    x = CSRMatrix('../data/facebook/train/neib_ftrs_data.bin',
                  '../data/facebook/train/neib_ftrs_indices.bin',
                  '../data/facebook/train/neib_ftrs_indptr.bin',
                  multiprocessing.cpu_count())

    labels = tools.to_numpy(
        tools.load_vector("../data/facebook/labels.bin", 'i'))

    geo_splited = np.genfromtxt('../data/facebook/geography_splited.csv', dtype=str)
    geography = np.genfromtxt('../data/facebook/geography.csv', dtype=str)

    extid = 100001046296473
    query(extid, u64_nodes, labels, geography, group=True)
    print()
    pprint(get_neib_counts(extid, x, u64_nodes, geo_splited))

    # ======================================================================

    weights = CSRMatrix('../data/facebook/edge_ftrs_data.bin',
                        '../data/facebook/edge_ftrs_indices.bin',
                        '../data/facebook/edge_ftrs_indptr.bin',
                        multiprocessing.cpu_count())

    # cond = np.all(weights, axis=0)
    # eall = edge_index[:, cond]
    # wall = weights[:, cond]

    cond = np.array([23144], dtype='I')
    print_ix(cond, edge_index, weights, u64_nodes)

    ix1 = tools.binary_search(u64_nodes, 100009116587703)
    ix2 = tools.binary_search(u64_nodes, 100001989967439)

    cond = np.where(np.logical_and(edge_index[0] == ix1,
                                   edge_index[1] == ix2))[0].astype('I')
    print_ix(cond, edge_index, weights, u64_nodes)

    cond = np.where(np.logical_and(edge_index[0] == ix2,
                                   edge_index[1] == ix1))[0].astype('I')
    print_ix(cond, edge_index, weights, u64_nodes)

    # 100003289482076  # co_Ukraine.r1_KievObl.ci_Kiev
    # 100009116587703
    # 100001989967439
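
# Hedged sketch (assumption): print_reverse_edges is not shown; presumably it
# pairs an edge with its reverse via reverse_edge_map, roughly like this.
def print_reverse_edges(e, edge_index, edge_map):
    rev = edge_map[e]
    print('edge %d: %d -> %d' % (e, edge_index[0, e], edge_index[1, e]))
    print('reverse %d: %d -> %d' % (rev, edge_index[0, rev], edge_index[1, rev]))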
Example No. 8
def predict(nbr_sampler, config):
    data = nbr_sampler.data
    model, _ = load_model(config, data)
    model.eval()
    device = config.device
    thresholds = pd.read_csv(config.join('thresholds.csv'))
    thresholds1 = load_thresholds(thresholds, config.precision_min1, device)
    thresholds2 = load_thresholds(thresholds, config.precision_min2, device)

    with open(config.join('cities_splited.csv')) as f:
        geos = f.read().split()
    if config.directed:
        nodes = to_numpy(load_vector(config.join("nodes.bin"), 'L'))

    def last_starts_with(prefix):
        return torch.tensor(
            [s.split('.')[-1].startswith(prefix) for s in geos]).to(device)

    r2_mask = last_starts_with('r2_')
    r3_mask = last_starts_with('r3_')
    city_mask = tools.get_city_mask(geos, exclude_continent=False)
    city_mask = torch.tensor(city_mask).to(device)

    no_interest = ~(r2_mask | r3_mask | city_mask)

    output_queue = multiprocessing.Queue()
    writer_process = multiprocessing.Process(target=__writer,
                                             args=(config, output_queue, geos,
                                                   city_mask.cpu().numpy()))
    writer_process.start()
    with torch.no_grad():
        total = tqdm_total(data.predict_mask, nbr_sampler.batch_size)
        for data_flow in tqdm(nbr_sampler(data.predict_mask), total=total):
            x = slice_data_flow(config, data, data_flow)
            logits = model(x, data_flow.to(device))
            logits[:, no_interest] = -inf
            logits[logits < thresholds2] = -inf
            logits[logits < thresholds1] *= -1
            cond = logits.min(dim=1)[0] < inf
            logits, ixs = torch.sort(logits[cond], dim=1)
            # break

            n_id = data_flow.n_id[cond].numpy()
            if config.directed: n_id = nodes[n_id]
            output_queue.put_nowait(
                (n_id, ixs.cpu().numpy(), logits.cpu().numpy()))

    # _id = 527
    # _ixs = ixs[_id].cpu().detach().numpy()
    # _probs = torch.exp(logits[_id]).cpu().detach().numpy()

    # df = pd.DataFrame([np.array(data.cities)[_ixs], _probs]).T
    # df.columns = ['geo', 'prob']
    # df['extid'] = data_flow.n_id[_id].detach().numpy()
    # df['th'] = torch.exp(thresholds)[_ixs].cpu().detach().numpy()
    # df['diff'] = df['prob'] - df['th']
    # df = df[['extid', 'geo', 'prob', 'th', 'diff']]
    # df.to_csv('../data/vk/example.csv', index=False)

    # signal the writer process that no more batches are coming
    output_queue.put(None)
    writer_process.join()
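
# Hedged sketch (assumption): a possible shape for the __writer consumer that
# drains output_queue until it receives the None sentinel; the output file name
# and CSV layout are placeholders, not the original implementation.
def __writer(config, output_queue, geos, city_mask):
    with open(config.join('predictions.csv'), 'w') as f:
        while True:
            item = output_queue.get()
            if item is None:
                break
            n_id, ixs, logits = item
            for node, order, scores in zip(n_id, ixs, logits):
                # torch.sort is ascending, so the last column holds the top score
                f.write('%d,%s,%f\n' % (node, geos[order[-1]], scores[-1]))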