Example #1
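All snippets below appear to come from one graph-convexity experiment code base and are shown without their import headers. A plausible header, reconstructed from the names they use (the project-local module paths are assumptions), would be:

import json
import pickle

import numpy as np
import pandas as pd
import scipy.spatial
import graph_tool.all as gt  # assumes gt.topology / gt.stats are reachable via this import

# project-local helpers; module names are guesses based on the calls below
import closure
import simplicial_vertices
from closure import compute_hull
from simplicial_vertices import simplicial_vertices
from shortest_path_cover import (shortest_path_cover_logn_apx,
                                 spc_querying_naive,
                                 spc_querying_with_closure,
                                 spc_semi_supervised_experiments,
                                 are_convex)

Note that the snippets are mutually inconsistent (some call simplicial_vertices(g), others simplicial_vertices.simplicial_vertices(g)), so they likely come from different files with different imports.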
def is_convex(dataset):
    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    #X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    y = (np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab'))

    n = 300
    for n_prime in [n]:
        print("================================")
        print("n_prime=", n_prime)
        # distances and labels do not depend on q; compute them once
        dists = scipy.spatial.distance.cdist(X, X)
        y = y[:n]
        for q in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05]:
            print("q=", q)

            # copy so the thresholding below cannot leak into dists across q values
            W = dists[:n, :n].copy()  # alternative kernel: np.exp(-dists**2 / (2 * sigma**2))
            np.fill_diagonal(W, 0)
            # keep only the smallest q-quantile of pairwise distances as edges
            W[W > np.quantile(W, q)] = np.inf
            # W2 = np.copy(W)  # fewer edges is slower, strangely
            # W2[W2 <= 0.1] = 0

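            # finite, nonzero entries of W define the graph: values become edge
            # weights, (row, col) index pairs become the edge list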
            weights = W[(W < np.inf) & (W > 0)].flatten()
            edges = np.array(np.where((W < np.inf) & (W > 0))).T

            np.random.seed(0)

            g = gt.Graph()

            # construct actual graph
            g.add_vertex(n)
            g.add_edge_list(edges)
            weight_prop = g.new_edge_property("double", vals=weights)

            comps, hist = gt.topology.label_components(g)

            print(len(simplicial_vertices(g)))
            continue  # NOTE: short-circuits here; the rest of this loop body is unreachable
            paths = shortest_path_cover_logn_apx(g, weight_prop)

            num_queries = 0  # accumulate ceil(log2(|path|)) over all paths
            for i in paths:
                num_queries += np.ceil(np.log2(len(i)))

            print("|S|=", len(paths))
            print("#queries<=", num_queries, "%:", num_queries / n)

            pos = list(np.arange(n)[y > 0])[:n_prime]
            neg = list(np.arange(n)[y <= 0])[:n_prime]

            print(n, pos, neg)
            print("p", len(pos))
            print("n", len(neg))

            pos_hull = closure.compute_hull(g, pos, weight_prop, comps, hist)
            print(np.sum(pos_hull))
            neg_hull = closure.compute_hull(g, neg, weight_prop, comps, hist)
            print(np.sum(neg_hull))
            print(
                len(
                    set(np.where(pos_hull)[0]).intersection(
                        set(np.where(neg_hull)[0]))) / n)
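A hypothetical call, assuming benchmark set 1 exists under res/benchmark/:

is_convex(1)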
Example #2

def is_convex():
    print("citeseer")
    print("weighted")
    np.random.seed(0)

    attributes_df = pd.read_csv('res/citeseer/citeseer.content',
                                sep="\t",
                                header=None,
                                dtype=str)  # np.str was removed from NumPy; plain str is equivalent
    features = attributes_df.iloc[:, 1:-1].to_numpy(dtype=int)
    labels, _ = pd.factorize(attributes_df.iloc[:, -1])
    new_ids, old_ids = pd.factorize(attributes_df.iloc[:, 0])

    edges_df = pd.read_csv('res/citeseer/citeseer.cites',
                           sep="\t",
                           header=None,
                           dtype=str)
    edges_df = edges_df[edges_df.iloc[:, 0].isin(old_ids)]
    edges_df = edges_df[edges_df.iloc[:, 1].isin(old_ids)]
    renamed = edges_df.replace(old_ids, new_ids)
    edges = renamed.to_numpy(dtype=int)
    edges = np.fliplr(edges)  # swap the two columns, i.e. reverse each edge's direction
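    # at this point the string paper IDs have been factorized to 0..n-1,
    # edges with unknown endpoints have been dropped, and both endpoint
    # columns have been remapped to the new integer IDs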
    g = gt.Graph(directed=True)

    g.add_edge_list(edges)

    weight = np.sum(np.abs(features[edges[:, 0]] - features[edges[:, 1]]),
                    axis=1)

    # NOTE: despite the "weighted" banner printed above, the feature-distance
    # weights are left unused; every edge gets weight 1
    weight_prop = g.new_edge_property("int", val=1)

    #weight = g.new_edge_property("double", vals=weight)

    comps, hist = gt.label_components(g)
    print(hist)
    dist_map = gt.shortest_distance(g, weights=weight_prop)
    simple = simplicial_vertices.simplicial_vertices(g)

    print("n=", g.num_vertices(), "s=", len(simple))

    spc = shortest_path_cover_logn_apx(g, weight_prop)

    pickle.dump(spc, open("res/citeseer/spc_directed_unweighted.p", "wb"))
    # intersection_0 = []  -- the rest of this example is truncated in the source listing
Example #3
def is_convex(directed):
    print("cora")
    np.random.seed(0)
    edges = np.genfromtxt('res/cora/cora.edges', dtype=int,
                          delimiter=',')[:, :2] - 1

    labels = np.genfromtxt('res/cora/cora.node_labels',
                           dtype=int,
                           delimiter=',')[:, 1]

    g = gt.Graph(directed=directed)

    g.add_edge_list(edges)

    weight = g.new_edge_property("double", val=1)

    comps, hist = gt.label_components(g)
    print(hist)
    dist_map = gt.shortest_distance(g, weights=weight)
    simple = simplicial_vertices.simplicial_vertices(g)

    print("n=", g.num_vertices(), "s=", len(simple))

    spc = pickle.load(open("res/cora/spc_" + str(directed) + ".p",
                           "rb"))  #shortest_path_cover_logn_apx(g, weight)

    a, b = spc_querying_naive(g, spc, labels)
    print(a)
    print(b, np.sum(b))
    print(np.sum(a == labels))
    return  # NOTE: everything below this return is unreachable

    print("len(spc)", len(spc))
    num_of_convex_paths = 0
    total_error = 0
    for p in spc:
        error = are_convex(labels[p])
        if error == 0:
            num_of_convex_paths += 1
        else:
            total_error += error

    print("#convex paths", num_of_convex_paths)
    print("total error on paths", total_error)
    return
    pickle.dump(spc, open("res/cora/spc_" + str(directed) + ".p", "wb"))

    for c in np.unique(labels):
        print("class label", c)
        print("class size: ", np.sum(labels == c))
        cls = np.where(labels == c)[0]
        for sample_size in [5, 10, 20, len(cls)]:
            print("sample_size", sample_size)
            if sample_size <= 20:
                times = 5
            else:
                times = 1
            for _ in range(times):

                sample = np.random.choice(cls, sample_size, replace=False)

                hull_p = compute_hull(g,
                                      sample,
                                      dist_map=dist_map,
                                      comps=comps,
                                      hist=hist,
                                      compute_closure=False)
                print("size interval: ", np.sum(hull_p))
                print("number of correct in interval: ", np.sum(hull_p[cls]))

                hull_p = compute_hull(g,
                                      sample,
                                      dist_map=dist_map,
                                      comps=comps,
                                      hist=hist)
                print("size hull: ", np.sum(hull_p))
                print("number of correct in interval: ", np.sum(hull_p[cls]))

    print("==================================")
Example #4
def is_convex(dir, prefix, target_column, weighted=False):
    print(dir)
    np.random.seed(0)
    edges = np.genfromtxt(dir + prefix + '_edges.csv',
                          skip_header=True,
                          dtype=int,
                          delimiter=',')

    df = pd.read_csv(dir + prefix + '_target.csv')  #.sort_values('new_id')
    print(dir, "weighted", weighted)

    weight = 1
    if weighted:
        if 'twitch' in dir:
            weight = np.zeros(edges.shape[0])
            # min-max normalize the two numeric feature columns
            col_max = df.iloc[:, 1].max()
            col_min = df.iloc[:, 1].min()
            df.iloc[:, 1] = (df.iloc[:, 1] - col_min) / (col_max - col_min)
            col_max = df.iloc[:, 3].max()
            col_min = df.iloc[:, 3].min()
            df.iloc[:, 3] = (df.iloc[:, 3] - col_min) / (col_max - col_min)

            for i, e in enumerate(edges):
                weight[i] = (df.iloc[e[0], 1] - df.iloc[e[1], 1])**2 + (
                    df.iloc[e[0], 3] - df.iloc[e[1], 3])**2

        elif 'facebook' in dir:
            attributes = json.load(
                open('res/git/' + dir + '/facebook_features.json'))
            weight = np.zeros(edges.shape[0])
            for i, e in enumerate(edges):
                weight[i] = len(
                    set(attributes[str(e[0])]).symmetric_difference(
                        attributes[str(e[1])]))

    labels, _ = pd.factorize(df.iloc[:, target_column])

    new_n = 4000  # only used by the commented-out subsampling block below
    pos_label, neg_label = np.unique(labels)  # assumes a binary labeling
    pos = np.where(labels == pos_label)[0]
    neg = np.where(labels == neg_label)[0]

    g = gt.Graph(directed=False)

    g.add_edge_list(edges)
    '''d = g.get_out_degrees(range(g.num_vertices()))
    
    

    d_pos = d[pos].argsort()[-new_n//2:][::-1]
    d_neg = d[neg].argsort()[-new_n//2:][::-1]

    d = np.append(d_pos, d_neg)

    g2 = gt.Graph(directed=False)

    edges =edges[np.isin(edges[:,0],d)&np.isin(edges[:,1],d)]

    indexes = np.unique(edges)
    labels = labels[indexes]
    for i, idx in enumerate(indexes):
        edges[edges==idx] = i

    g2.add_edge_list(edges)

    comp = gt.topology.label_largest_component(g2)
    d = np.where(comp.a == 1)[0]
    labels = labels[d]
    g3 = gt.Graph(directed=False)

    edges = edges[np.isin(edges[:, 0], d) & np.isin(edges[:, 1], d)]

    for i, idx in enumerate(np.unique(edges)):
        edges[edges == idx] = i
    g3.add_edge_list(edges)
    g = g3'''

    if weighted:
        weight = g.new_edge_property("double", vals=weight)
    else:
        weight = g.new_edge_property("double", val=1)

    comps, hist = gt.topology.label_components(g)
    #print(hist)
    #dist_map = gt.shortest_distance(g, weights=weight)
    simple = simplicial_vertices.simplicial_vertices(g)
    gt.stats.remove_self_loops(g)
    print("n=", g.num_vertices(), "simplicial=", len(simple))
    #spc = shortest_path_cover_logn_apx(g, weight)
    if weighted:
        weighted_str = "_weigted_"  # sic: spelling matches the pickle filenames used throughout
    else:
        weighted_str = ""
    #pickle.dump(spc, open(dir+'spc'+weighted_str+'.p', 'wb'))
    spc = pickle.load(open(dir + 'spc' + weighted_str + '.p', 'rb'))

    weight = None  # the hull computations below run without edge weights

    pos = np.where(labels == pos_label)[0]
    neg = np.where(labels == neg_label)[0]

    print("pos", len(pos))
    print("neg", len(neg))
    spc_semi_supervised_experiments(g, weight, labels)

    p_interval = compute_hull(g, pos, weight, compute_closure=False)
    n_interval = compute_hull(g, neg, weight, compute_closure=False)

    print("pos_interval size: ", np.sum(p_interval))
    print("neg_interval size: ", np.sum(n_interval))
    print("intersection of intervals size: ", np.sum(p_interval & n_interval))

    p_hull = compute_hull(g, pos, weight)
    n_hull = compute_hull(g, neg, weight)

    print("pos_hull size: ", np.sum(p_hull))
    print("neg_hull size: ", np.sum(n_hull))
    print("intersection of hulls size: ", np.sum(p_hull & n_hull))
Example #5

def florians_procedure(g: gt.Graph, use_simplicial):
    n = g.num_vertices()

    if not use_simplicial:
        s = simplicial_vertices(g)
        a = s[0]
        while a in s:
            a = np.random.randint(0, n)

        b = a
        while a == b or b in s:
            b = np.random.randint(0, n)

    else:
        a = np.random.randint(0, n)

        b = a
        while a == b:
            b = np.random.randint(0, n)

    A = np.zeros(n, dtype=bool)  # np.bool is a removed NumPy alias for bool
    A[a] = True
    B = np.zeros(n, dtype=bool)
    B[b] = True

    F = set(range(n)).difference(np.where(A | B)[0])

    i = 0
    while len(F) > 0:
        e = F.pop()

        if i % 2 == 0:

            A[e] = True
            A_new = compute_hull(g, np.where(A == True)[0])  # hull of A, as in the symmetric branches below
            if not np.any(B & A_new):
                A = A_new
                F = F.difference(set(np.where(A == True)[0]))
            else:
                A[e] = False
                B[e] = True
                B_new = compute_hull(g, np.where(B == True)[0])
                if not np.any(A & B_new):
                    B = B_new
                    F = F.difference(set(np.where(B == True)[0]))  # B is the set that just grew
                else:
                    B[e] = False
        else:
            B[e] = True
            B_new = compute_hull(g, np.where(B == True)[0])
            if not np.any(A & B_new):
                B = B_new
                F = F.difference(set(np.where(B == True)[0]))  # B is the set that just grew
            else:
                B[e] = False
                A[e] = True
                A_new = compute_hull(g, np.where(A == True)[0])
                if not np.any(B & A_new):
                    A = A_new
                    F = F.difference(set(np.where(A == True)[0]))

        i += 1
        print(len(F))
    return A, B
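florians_procedure greedily grows two disjoint hulls A and B from two random seed vertices (with use_simplicial=False the seeds avoid simplicial vertices), alternately trying to absorb each remaining free vertex into one side and rejecting it when the resulting hull would intersect the other side. A hypothetical call:

A, B = florians_procedure(g, use_simplicial=True)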
Example #6
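This fragment repeats the tail of Example #1 at module level; it assumes W, n, and y are already defined as in that example.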
weights = W[(W < np.inf) & (W > 0)].flatten()
edges = np.array(np.where((W < np.inf) & (W > 0))).T

np.random.seed(0)

g = gt.Graph()

# construct actual graph
g.add_vertex(n)
g.add_edge_list(edges)
weight_prop = g.new_edge_property("double", vals=weights)

comps, hist = gt.topology.label_components(g)

print(len(simplicial_vertices(g)))

paths = shortest_path_cover_logn_apx(g, weight_prop)

num_queries = 0  # accumulate ceil(log2(|path|)) over all paths
for i in paths:
    num_queries += np.ceil(np.log2(len(i)))

print("|S|=", len(paths))
print("#queries<=", num_queries, "%:", num_queries / n)

pos = list(np.arange(n)[y > 0])
neg = list(np.arange(n)[y <= 0])

print(n, pos, neg)
print("p", len(pos))
Example #7
def is_convex(dataset, q, weighted=True):
    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    #X = (X - np.min(X, axis=0)) / (np.max(X, axis=0) - np.min(X, axis=0))
    y = (np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab'))

    n = 100
    dists = scipy.spatial.distance.cdist(X, X)
    y = y[:n]
    y = (y - np.min(y)) // (np.max(y) - np.min(y))  # maps binary {-1, 1} labels to {0, 1}
    #q = 0.04
    W = dists[:n, :n]  # alternative kernel: np.exp(-dists**2 / (2 * sigma**2))
    thresh = np.quantile(W, 0.1)  # NOTE: the q argument is effectively ignored
    W[W > thresh] = np.inf
    # W2 = np.copy(W)  # fewer edges is slower, strangely
    if not weighted:
        W[W <= thresh] = 1
    np.fill_diagonal(W, 0)

    weights = W[(W < np.inf) & (W > 0)].flatten()
    edges = np.array(np.where((W < np.inf) & (W > 0))).T

    print("e", len(edges))
    #return

    np.random.seed(0)

    g = gt.Graph()

    # construct actual graph
    g.add_vertex(n)
    g.add_edge_list(edges)
    # use the extracted weights (these are all 1 when weighted=False, see above);
    # with val=1 the weighted flag would have no effect
    weight_prop = g.new_edge_property("double", vals=weights)


    comps,hist = gt.topology.label_components(g)

    simpl = simplicial_vertices(g)

    print(len(simpl), np.sum(closure.compute_hull(g, simpl, weight_prop)>0))
    #return
    paths = shortest_path_cover_logn_apx(g, weight_prop)



    num_queries = 0  # accumulate ceil(log2(|path|)) over all paths
    for i in paths:
        num_queries += np.ceil(np.log2(len(i)))

    print("|S|=", len(paths))
    print("#queries<=", num_queries, "%:", num_queries / n)


    pos = list(np.arange(n)[y > 0])[:n]
    neg = list(np.arange(n)[y <= 0])[:n]

    print(n, pos, neg)
    print("p", len(pos))
    print("n", len(neg))

    #pos_hull = closure.compute_hull(g,pos, weight_prop,comps,hist)
    #print(np.sum(pos_hull))
    #neg_hull = closure.compute_hull(g, neg, weight_prop,comps,hist)
    #print(np.sum(neg_hull))
    #print(len(set(np.where(pos_hull)[0]).intersection(set(np.where(neg_hull)[0])))/n)

    print("===============================================================")
    known_labels, budget = spc_querying_with_closure(g, paths, weight_prop, y)
    print(np.sum(np.abs(known_labels - y) / n))  # fraction of mislabeled vertices
    print(budget)
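A hypothetical call, assuming benchmark set 1 exists under res/benchmark/ (note that the q argument is currently ignored inside the function):

is_convex(1, q=0.1, weighted=False)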
Example #8
def is_convex(dir, prefix, target_column, weighted=False):
    print(dir)
    np.random.seed(0)
    edges = np.genfromtxt(dir + prefix + '_edges.csv',
                          skip_header=True,
                          dtype=int,
                          delimiter=',')

    df = pd.read_csv(dir + prefix + '_target.csv')  #.sort_values('new_id')
    print(dir, "weighted", weighted)

    weight = 1
    if weighted:
        if 'twitch' in dir:
            weight = np.zeros(edges.shape[0])
            # min-max normalize the two numeric feature columns
            col_max = df.iloc[:, 1].max()
            col_min = df.iloc[:, 1].min()
            df.iloc[:, 1] = (df.iloc[:, 1] - col_min) / (col_max - col_min)
            col_max = df.iloc[:, 3].max()
            col_min = df.iloc[:, 3].min()
            df.iloc[:, 3] = (df.iloc[:, 3] - col_min) / (col_max - col_min)

            for i, e in enumerate(edges):
                weight[i] = (df.iloc[e[0], 1] - df.iloc[e[1], 1])**2 + (
                    df.iloc[e[0], 3] - df.iloc[e[1], 3])**2

        elif 'facebook' in dir:
            attributes = json.load(
                open('res/git/' + dir + '/facebook_features.json'))
            weight = np.zeros(edges.shape[0])
            for i, e in enumerate(edges):
                weight[i] = len(
                    set(attributes[str(e[0])]).symmetric_difference(
                        attributes[str(e[1])]))

    labels, _ = pd.factorize(df.iloc[:, target_column])

    g = gt.Graph(directed=False)

    g.add_edge_list(edges)

    if weighted:
        weight = g.new_edge_property("double", vals=weight)
    else:
        weight = g.new_edge_property("double", val=1)

    #comps, hist = gt.label_components(g)
    #dist_map = gt.shortest_distance(g, weights=weight)
    simple = simplicial_vertices.simplicial_vertices(g)
    spc = shortest_path_cover_logn_apx(g, weight)
    if weighted:
        weighted_str = "_weigted_"  # sic: spelling matches the pickle filenames used throughout
    else:
        weighted_str = ""
    pickle.dump(spc, open(dir + 'spc' + weighted_str + '.p', 'wb'))

    print("n=", g.num_vertices(), "s=", len(simple))
    '''
    intersection_0 = []
    intersection_1 = []
    intersection_2 = []
    intersection_3 = []
    for c in np.unique(labels):
        print(c)
        cls = np.where(labels==c)[0][:5]
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map, comps=comps, hist=hist, compute_closure=False)
        print(np.sum(hull_p), np.sum(hull_p) / g.num_vertices())
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map, comps=comps, hist=hist)
        intersection_0.append(hull_p)
        print(np.sum(hull_p), np.sum(hull_p) / g.num_vertices())
        cls = np.where(labels == c)[0][:10]
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map, comps=comps, hist=hist, compute_closure=False)
        print(np.sum(hull_p), np.sum(hull_p) / g.num_vertices())
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map, comps=comps, hist=hist)
        intersection_1.append(hull_p)
        print(np.sum(hull_p), np.sum(hull_p) / g.num_vertices())
        cls = np.where(labels == c)[0][:50]
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map,comps=comps, hist=hist,compute_closure=False)
        print(np.sum(hull_p),np.sum(hull_p)/g.num_vertices())
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map, comps=comps, hist=hist)
        intersection_2.append(hull_p)
        print(np.sum(hull_p),np.sum(hull_p) / g.num_vertices())
        cls = np.where(labels == c)[0]
        hull_p = compute_hull(g, cls, weight,dist_map=dist_map, comps=comps, hist=hist, compute_closure=False)
        print(np.sum(hull_p), np.sum(hull_p) / g.num_vertices())
        hull_p = compute_hull(g, cls,weight, dist_map=dist_map, comps=comps, hist=hist)
        intersection_3.append(hull_p)
        print(np.sum(hull_p), np.sum(hull_p) / g.num_vertices())
        print("==========")

    print(np.sum(intersection_0[0] & intersection_0[1])/g.num_vertices())
    print(np.sum(intersection_1[0] & intersection_1[1]) / g.num_vertices())
    print(np.sum(intersection_2[0] & intersection_2[1]) / g.num_vertices())
    print(np.sum(intersection_3[0] & intersection_3[1]) / g.num_vertices())

    '''
    print("==================================")