def spc_querying_naive(g : graph_tool.Graph, paths, y, trust_own_predictions=True, weight=None, closed_interval=False):
    '''
    Label the vertices of every path by querying its two endpoints and, when
    the endpoints disagree, running binarySearch along the path.

    :param g: graph; its vertex count sizes the result arrays and it is
        forwarded to closure.compute_hull when closed_interval is set
    :param paths: list of paths, each a sequence of vertex indices
    :param y: ground truth labels indexed by vertex (it is indexed with whole
        paths, so presumably a numpy array -- confirm against callers)
    :param trust_own_predictions: if True an endpoint that already carries a
        label is not queried (and not charged) again
    :param weight: edge weights forwarded to closure.compute_hull
    :param closed_interval: if True, after each path the result of
        closure.compute_hull(..., compute_closure=False) on each of the two
        classes is used to extend the known labels
    :return: tuple (known_labels, budget); known_labels is -inf for vertices
        that never received a label, budget[i] is the number of queries
        charged to paths[i]
    '''
    num_vertices = g.num_vertices()
    known_labels = -np.ones(num_vertices) * np.inf
    # NOTE(review): sized by vertex count but indexed by path position.
    budget = np.zeros(num_vertices)

    # Loop invariant: only needed (and only evaluated) for closed_interval.
    classes = np.unique(y) if closed_interval else None

    for i, path in enumerate(paths):
        first, last = path[0], path[-1]

        if not trust_own_predictions or known_labels[first] == -np.inf:
            budget[i] += 1
            known_labels[first] = y[first]

        if not trust_own_predictions or known_labels[last] == -np.inf:
            budget[i] += 1
            known_labels[last] = y[last]

        if known_labels[first] == known_labels[last]:
            # Equal endpoint labels: the whole path gets that label.
            known_labels[path] = known_labels[first]
        else:
            label_budget, new_labels = binarySearch(y[path], 0, len(path)-1, known_labels[first], known_labels[path])
            known_labels[path] = new_labels
            budget[i] += label_budget

        # NOTE(review): this step is assumed to run once per path (inside the
        # loop) so that later paths can reuse the propagated labels -- confirm.
        if closed_interval:
            p = closure.compute_hull(g, np.where(known_labels == classes[0])[0], weight, compute_closure=False)
            n = closure.compute_hull(g, np.where(known_labels == classes[1])[0], weight, compute_closure=False)

            known_labels[p] = classes[0]
            known_labels[n] = classes[1]

    return known_labels, budget
def is_convex(dataset):
    '''
    Build quantile-thresholded distance graphs on the first 300 points of an
    SSL benchmark set and print the number of simplicial vertices of each.

    :param dataset: identifier inserted into the benchmark file names
        'res/benchmark/SSL,set=<dataset>,X.tab' and '...,y.tab'
    :return: None, results are printed
    '''
    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    y = (np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab'))

    n = 300
    y = y[:n]

    # The pairwise distances do not depend on q, so compute them once. Each
    # iteration works on its own copy because the thresholding below writes
    # into W in place.
    dists = scipy.spatial.distance.cdist(X, X)

    for n_prime in [n]:
        print("================================")
        print("n_prime=", n_prime)
        for q in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05]:
            print("q=", q)

            W = dists[:n, :n].copy()
            np.fill_diagonal(W, 0)
            # Keep only the q-quantile of smallest distances as edges.
            W[W > np.quantile(W, q)] = np.inf

            is_edge = (W < np.inf) & (W > 0)
            weights = W[is_edge].flatten()
            edges = np.array(np.where(is_edge)).T

            np.random.seed(0)

            g = gt.Graph()
            g.add_vertex(n)
            g.add_edge_list(edges)
            weight_prop = g.new_edge_property("double", vals=weights)

            comps, hist = gt.topology.label_components(g)

            print(len(simplicial_vertices(g)))
            continue

            # NOTE: everything below is unreachable because of the `continue`
            # above; it is kept as the disabled second half of the experiment.
            paths = shortest_path_cover_logn_apx(g, weight_prop)

            query_bound = 0
            for i in paths:
                query_bound += np.ceil(np.log2(len(i)))

            print("|S|=", len(paths))
            print("#queries<=", query_bound, "%:", query_bound / n)

            pos = list(np.arange(n)[y > 0])[:n_prime]
            neg = list(np.arange(n)[y <= 0])[:n_prime]

            print(n, pos, neg)

            print("p", len(pos))
            print("n", len(neg))

            pos_hull = closure.compute_hull(g, pos, weight_prop, comps, hist)
            print(np.sum(pos_hull))
            neg_hull = closure.compute_hull(g, neg, weight_prop, comps, hist)
            print(np.sum(neg_hull))
            print(
                len(
                    set(np.where(pos_hull)[0]).intersection(
                        set(np.where(neg_hull)[0]))) / n)
def is_convex(directed):
    '''
    Run spc_querying_naive on the cora graph with a pickled shortest path
    cover and print the resulting labels, budget and number of matches.

    :param directed: passed to gt.Graph(directed=...) and used in the name of
        the pickled cover 'res/cora/spc_<directed>.p'
    :return: None, results are printed
    '''
    print("cora")
    np.random.seed(0)
    # np.int was removed from NumPy; the builtin int is the same dtype.
    edges = np.genfromtxt('res/cora/cora.edges', dtype=int, delimiter=',')[:, :2] - 1
    labels = np.genfromtxt('res/cora/cora.node_labels', dtype=int, delimiter=',')[:, 1]

    g = gt.Graph(directed=directed)
    g.add_edge_list(edges)

    weight = g.new_edge_property("double", val=1)

    comps, hist = gt.label_components(g)
    print(hist)
    dist_map = gt.shortest_distance(g, weights=weight)
    simple = simplicial_vertices.simplicial_vertices(g)

    print("n=", g.num_vertices(), "s=", len(simple))

    # Precomputed cover; only load pickles produced by this project.
    with open("res/cora/spc_" + str(directed) + ".p", "rb") as spc_file:
        spc = pickle.load(spc_file)

    a, b = spc_querying_naive(g, spc, labels)
    print(a)
    print(b, np.sum(b))
    print(np.sum(a == labels))
    return

    # NOTE: everything below is unreachable because of the `return` above; it
    # is kept as the disabled remainder of the experiment.
    print("len(spc)", len(spc))
    num_of_convex_paths = 0
    total_error = 0
    for p in spc:
        error = are_convex(labels[p])
        if error == 0:
            num_of_convex_paths += 1
        else:
            total_error += error

    print("#convex paths", num_of_convex_paths)
    print("total error on paths", total_error)
    return
    with open("res/cora/spc_" + str(directed) + ".p", "wb") as spc_file:
        pickle.dump(spc, spc_file)

    for c in np.unique(labels):
        print("class label", c)
        print("class size: ", np.sum(labels == c))
        cls = np.where(labels == c)[0]
        for sample_size in [5, 10, 20, len(cls)]:
            print("sample_size", sample_size)
            if sample_size <= 20:
                times = 5
            else:
                times = 1
            for _ in range(times):
                sample = np.random.choice(cls, sample_size, replace=False)

                hull_p = compute_hull(g, sample, dist_map=dist_map, comps=comps, hist=hist, compute_closure=False)
                print("size interval: ", np.sum(hull_p))
                print("number of correct in interval: ", np.sum(hull_p[cls]))

                hull_p = compute_hull(g, sample, dist_map=dist_map, comps=comps, hist=hist)
                print("size hull: ", np.sum(hull_p))
                print("number of correct in interval: ", np.sum(hull_p[cls]))
        print("==================================")
def budgeted_heuristic_querying(g: graph_tool.Graph, y, weights=None, budget=50, compute_hulls_between_queries=False, hull_as_optimization=False, use_adjacency=False):
    '''
    Query `budget` vertex labels one at a time; each round picks an unlabelled
    vertex maximising helper_sum_sizes over its candidate hulls and prints the
    label propagation accuracy reached so far.

    :param g: graph to label
    :param y: ground truth labels indexed by vertex
    :param weights: edge weights for the shortest distances and compute_hull
    :param budget: number of queries (loop iterations)
    :param compute_hulls_between_queries: if True, vertices lying in the
        compute_hull(..., compute_closure=False) result of exactly one class
        are treated as labelled for the label propagation step
    :param hull_as_optimization: forwarded positionally to compute_hull
    :param use_adjacency: if True label propagation runs on the distance
        matrix with all entries > 1 zeroed instead of on the distances
    :return: known_labels, -inf for every vertex that was not queried
    '''
    n = g.num_vertices()

    if use_adjacency:
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(range(n)).T
        adjacency = dist_map.copy()
        adjacency[adjacency > 1] = 0
    else:
        # to prevent overflow etc.
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(
            range(n)).T.astype(np.double)
        dist_map[dist_map > n] = np.inf

    comps, hist = graph_tool.topology.label_components(g)

    classes = np.unique(y)
    known_labels = -np.ones(n) * np.inf

    # candidate_hulls[v][c]: hull of the queried c-vertices plus vertex v.
    # np.object / np.bool were removed from NumPy; use the builtins.
    candidate_hulls = np.zeros(n, dtype=object)
    candidate_hull_sizes = np.zeros(n)
    known_classes = dict()
    classes_hulls = dict()

    for j in range(n):
        candidate_hulls[j] = dict()

    for c in classes:
        known_classes[c] = set()
        classes_hulls[c] = np.zeros(n, bool)  # empty hull
        for j in range(n):
            one_hot = np.zeros(n, dtype=bool)
            one_hot[j] = True
            candidate_hulls[j][c] = one_hot  # singleton hull

    for z in range(budget):
        # compute most promising vertex
        for p in range(n):
            if known_labels[p] == -np.inf:
                candidate_hull_sizes[p] = helper_sum_sizes(candidate_hulls[p], classes_hulls)
            else:
                candidate_hull_sizes[p] = -1

        maximizers = np.where(candidate_hull_sizes == np.max(candidate_hull_sizes))[0]

        # Tie-break: prefer vertices covered by the fewest class hulls.
        classes_hulls_overlap = np.sum(np.array(list(classes_hulls.values())), axis=0)
        overlap_of_maximizers = classes_hulls_overlap[maximizers]
        maximizers = maximizers[np.where(overlap_of_maximizers == np.min(overlap_of_maximizers))[0]]

        p_star = np.random.choice(maximizers)

        # query it
        known_labels[p_star] = y[p_star]
        c = known_labels[p_star]

        # update data structures
        known_classes[c].add(p_star)
        classes_hulls[c] = candidate_hulls[p_star][c]

        # Only the hulls of the class that just grew need recomputing.
        # (A debugging array `test` was built here and never read; removed.)
        for j in range(n):
            if known_labels[j] == -np.inf:
                candidate_hulls[j][c] = compute_hull(g, list(known_classes[c].union([j])), weights, dist_map, comps, hist, hull_as_optimization)

        if compute_hulls_between_queries:
            known_labels_augmented = known_labels.copy()
            known_classes_hulls_temp = np.zeros((n, len(classes)), dtype=bool)
            for i, c in enumerate(classes):
                known_classes_hulls_temp[:, i] = compute_hull(g, np.where(known_labels_augmented == c)[0], weights, dist_map, comps, hist, compute_closure=False)

            for i, c in enumerate(classes):
                # Vertices in the hull of class c and of no other class.
                only_c = known_classes_hulls_temp[:, i] & ~(
                    np.sum(known_classes_hulls_temp[:, np.arange(len(classes)) != i], axis=1).astype(bool))
                known_labels_augmented[only_c] = c
        else:
            known_labels_augmented = known_labels

        if use_adjacency:
            prediction = label_propagation(adjacency, known_labels_augmented, y, use_adjacency=use_adjacency)
        else:
            prediction = label_propagation(dist_map, known_labels_augmented, y, use_adjacency=use_adjacency)

        print("=====")
        print(z + 1, np.sum(known_labels > -np.inf))
        print(np.sum(np.array(list(classes_hulls.values())), axis=1))
        print("accuracy", np.sum(prediction == y) / y.size)

    return known_labels
def budgeted_spc_querying(g : graph_tool.Graph, paths, y, weights=None, budget=50, compute_hulls_between_queries=False, hull_as_optimization=False, use_adjacency=False):
    '''
    Query `budget` vertex labels, one per round, where every path offers one
    candidate produced by its binarySearchGenerator; the candidate maximising
    helper_sum_sizes over its candidate hulls is queried. The label
    propagation accuracy is printed after every query.

    :param g: graph to label
    :param paths: list of paths, each a sequence of vertex indices
    :param y: ground truth labels indexed by vertex
    :param weights: edge weights for the shortest distances and compute_hull
    :param budget: number of queries (loop iterations)
    :param compute_hulls_between_queries: if True, vertices lying in the
        compute_hull(..., compute_closure=False) result of exactly one class
        are treated as labelled for the label propagation step
    :param hull_as_optimization: forwarded positionally to compute_hull
    :param use_adjacency: if True label propagation runs on the distance
        matrix with all entries > 1 zeroed instead of on the distances
    :return: known_labels, -inf for every vertex that was not queried
    '''
    n = g.num_vertices()

    if use_adjacency:
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(range(n)).T
        adjacency = dist_map.copy()
        adjacency[adjacency > 1] = 0
    else:
        #to prevent overflow etc.
        dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(
            range(n)).T.astype(np.double)
        dist_map[dist_map > n] = np.inf

    comps, hist = graph_tool.topology.label_components(g)

    classes = np.unique(y)
    known_labels = -np.ones(n)*np.inf

    # candidates[i] is a position inside paths[i], not a vertex index.
    # np.int / np.object / np.bool were removed from NumPy; use the builtins.
    candidates = np.zeros(len(paths), dtype=int)
    candidate_generators = np.zeros(len(paths), dtype=object)
    for i, path in enumerate(paths):
        candidate_generators[i] = binarySearchGenerator(known_labels, path, 0, len(path)-1)
        candidates[i] = next(candidate_generators[i])

    candidate_hulls = np.zeros(len(paths), dtype=object)
    candidate_hull_sizes = np.zeros(len(paths))
    known_classes = dict()
    classes_hulls = dict()

    for j in range(len(candidates)):
        candidate_hulls[j] = dict()

    for c in classes:
        known_classes[c] = set()
        classes_hulls[c] = np.zeros(n, dtype=bool)  #empty hulls
        for j, candidate in enumerate(candidates):
            singleton = np.zeros(n, dtype=bool)
            singleton[paths[j][candidate]] = True
            candidate_hulls[j][c] = singleton  #singleton hull

    for z in range(budget):
        #compute most promising vertex
        for p in range(len(paths)):
            if known_labels[paths[p][candidates[p]]] == -np.inf:
                candidate_hull_sizes[p] = helper_sum_sizes(candidate_hulls[p], classes_hulls)
            else:
                candidate_hull_sizes[p] = -1

        maximizers = np.where(candidate_hull_sizes == np.max(candidate_hull_sizes))[0]

        #prefer not queried paths
        if np.any(candidates[maximizers] == 0):
            maximizers = maximizers[np.where(candidates[maximizers] == 0)[0]]
        p_star = np.random.choice(maximizers)

        candidate = paths[p_star][candidates[p_star]]

        #query it
        known_labels[candidate] = y[candidate]
        queried_label = known_labels[candidate]

        #update data structures
        known_classes[queried_label].add(candidate)
        classes_hulls[queried_label] = candidate_hulls[p_star][queried_label]

        for j in range(len(candidates)):
            path = paths[j]
            # NOTE(review): `x in bool_array` tests whether any element
            # equals x, so it is not a hull membership test for vertex x;
            # classes_hulls[...][path[candidates[j]]] may have been intended.
            # Behaviour left unchanged -- confirm before altering.
            while known_labels[path[candidates[j]]] != -np.inf or path[candidates[j]] in classes_hulls[queried_label]:
                try:
                    candidates[j] = next(candidate_generators[j])
                except StopIteration:
                    break

            for c in classes:
                candidate_hulls[j][c] = compute_hull(g, list(known_classes[c].union([path[candidates[j]]])), weights, dist_map, comps, hist, hull_as_optimization)

        if compute_hulls_between_queries:
            known_labels_augmented = known_labels.copy()
            known_classes_hulls_temp = np.zeros((n, len(classes)), dtype=bool)
            for i, c in enumerate(classes):
                known_classes_hulls_temp[:,i] = compute_hull(g, np.where(known_labels_augmented == c)[0], weights, dist_map, comps, hist, compute_closure=False)

            for i, c in enumerate(classes):
                # Vertices in the hull of class c and of no other class.
                only_c = known_classes_hulls_temp[:,i] & ~(np.sum(known_classes_hulls_temp[:,np.arange(len(classes))!=i],axis=1).astype(bool))
                known_labels_augmented[only_c] = c
        else:
            known_labels_augmented = known_labels

        if use_adjacency:
            prediction = label_propagation(adjacency, known_labels_augmented, y, use_adjacency=use_adjacency)
        else:
            prediction = label_propagation(dist_map, known_labels_augmented, y, use_adjacency=use_adjacency)

        print("======")
        print(z+1, np.sum(known_labels>-np.inf))
        print("accuracy", np.sum(prediction==y)/y.size)

    return known_labels
def spc_querying_with_shadow(g: graph_tool.Graph, paths, weights, y):
    '''
    Query vertices along the given paths until the hulls of the two classes
    together contain every vertex; the next query is the path candidate with
    the largest average gain of its positive and negative shadows.

    :param g: graph to label
    :param paths: list of paths, each a sequence of vertex indices
    :param weights: edge weights for shortest distances, hulls and shadows
    :param y: ground truth labels indexed by vertex; must contain exactly two
        distinct values, and since "unknown" is encoded as -1 and "known" is
        tested with >= 0 the labels are presumably non-negative -- confirm
    :return: tuple (known_labels, budget) with budget the number of queries
    '''
    np.random.seed(55)

    #these two lines make repetitive closure computation a lot faster
    dist_map = graph_tool.topology.shortest_distance(g, weights=weights).get_2d_array(range(g.num_vertices())).T
    comps, hist = graph_tool.topology.label_components(g)

    known_labels = -np.ones(g.num_vertices())
    num_of_known_labels = 0
    budget = 0

    pos_value, neg_value = np.unique(y)

    next_candidate_queues = [Queue() for _ in paths]
    # np.int / np.bool were removed from NumPy; use the builtins.
    left = np.zeros(len(paths), dtype=int)
    right = np.array([len(p)-1 for p in paths], dtype=int)
    queue_idxs = list(range(len(paths)))

    n = g.num_vertices()

    # Every path first offers its endpoints (positions, not vertex ids).
    for i,path in enumerate(paths):
        next_candidate_queues[i].put(0)
        if len(path) > 1:
            next_candidate_queues[i].put(len(path)-1)

    starting_idx = np.random.choice(np.where(right>0)[0])
    starting_path = paths[starting_idx]

    budget += 2
    l = next_candidate_queues[starting_idx].get()
    r = next_candidate_queues[starting_idx].get()
    known_labels[starting_path[l]] = y[starting_path[l]]
    known_labels[starting_path[r]] = y[starting_path[r]]

    if known_labels[starting_path[0]] == known_labels[starting_path[-1]]:
        #color the hull of the path in the color of the endpoints
        path_closure = np.where(compute_hull(g, starting_path, weights, dist_map, comps, hist))[0]
        known_labels[path_closure] = known_labels[starting_path[0]]
        num_of_known_labels = len(path_closure)
        del queue_idxs[starting_idx]
    else:
        if (len(starting_path)>=3):
            next_candidate_queues[starting_idx].put(l + (r - l)//2)
        else:
            del queue_idxs[starting_idx]
        num_of_known_labels = 2

    pos = np.where(known_labels==pos_value)[0]
    neg = np.where(known_labels==neg_value)[0]

    candidates = np.zeros(len(paths), dtype=int)
    candidates[queue_idxs] = [next_candidate_queues[queue_idx].get() for queue_idx in queue_idxs]  #this is always relative to the path

    candidate_pos_hulls = np.zeros((len(paths),n), dtype=bool)
    temp_pos_hulls = np.zeros((n,n), dtype=bool)
    if len(pos) > 0:
        candidate_pos_hulls[queue_idxs] = [closure.compute_hull(g, np.append(pos, paths[idx][candidates[idx]]), weights, dist_map, comps, hist) for idx in queue_idxs]
    else:
        for idx in queue_idxs:
            candidate_pos_hulls[idx][paths[idx][candidates[idx]]] = True

    candidate_neg_hulls = np.zeros((len(paths),n), dtype=bool)
    temp_neg_hulls = np.zeros((n, n), dtype=bool)
    if len(neg) > 0:
        candidate_neg_hulls[queue_idxs] = [closure.compute_hull(g, np.append(neg, paths[idx][candidates[idx]]), weights, dist_map, comps, hist) for idx in queue_idxs]
    else:
        for idx in queue_idxs:
            candidate_neg_hulls[idx][paths[idx][candidates[idx]]] = True

    pos_gains = np.zeros(len(paths))
    neg_gains = np.zeros(len(paths))

    while num_of_known_labels < n:
        to_remove = []
        changed = []

        # Skip candidates whose label became known in the meantime.
        for idx in queue_idxs:
            while known_labels[paths[idx][candidates[idx]]] >= 0:
                if not next_candidate_queues[idx].empty():
                    candidates[idx] = next_candidate_queues[idx].get()
                else:
                    maybe_remove = refill_queue_for_candidate(idx, candidates[idx], candidates, known_labels, left, next_candidate_queues, paths, queue_idxs, right)
                    if maybe_remove is not None:
                        to_remove.append(maybe_remove)
                        break
                    else:
                        candidates[idx] = next_candidate_queues[idx].get()
                # NOTE(review): assumed to run once per advanced candidate
                # (inside the while loop) -- confirm against the original.
                changed.append(idx)

        for i in range(n):
            temp_pos_hulls[i] = closure.compute_hull(g, np.append(pos, i), weights, dist_map, comps, hist, True, pos if len(pos) > 0 else None)
            temp_neg_hulls[i] = closure.compute_hull(g, np.append(neg, i), weights, dist_map, comps, hist, True, neg if len(neg) > 0 else None)

        for i in changed:
            candidate_pos_hulls[i] = closure.compute_shadow(g, np.append(pos, paths[i][candidates[i]]), neg, weights, dist_map, comps, hist, B_hulls=temp_neg_hulls)
            candidate_neg_hulls[i] = closure.compute_shadow(g, np.append(neg, paths[i][candidates[i]]), pos, weights, dist_map, comps, hist, B_hulls=temp_pos_hulls)

        for i in to_remove:
            queue_idxs.remove(i)
            # Sanity check: a retired path must be completely labelled.
            if np.sum(known_labels[paths[i]] >= 0) != len(paths[i]):
                raise SystemExit(555)

        pos_gains[queue_idxs] = np.sum(candidate_pos_hulls[queue_idxs], axis=1) - len(pos)
        neg_gains[queue_idxs] = np.sum(candidate_neg_hulls[queue_idxs], axis=1) - len(neg)

        heuristic = np.average(np.array([pos_gains[queue_idxs], neg_gains[queue_idxs]]), axis=0)

        candidate_idx = queue_idxs[np.argmax(heuristic)]
        candidate_vertex = candidates[candidate_idx]
        queried_vertex = paths[candidate_idx][candidate_vertex]

        # Sanity check: never spend a query on an already known label.
        if known_labels[queried_vertex] == y[queried_vertex]:
            raise SystemExit(9)

        known_labels[queried_vertex] = y[queried_vertex]
        budget += 1

        if known_labels[queried_vertex] == pos_value:
            pos = np.where(candidate_pos_hulls[candidate_idx])[0]
            known_labels[pos] = pos_value
        else:
            neg = np.where(candidate_neg_hulls[candidate_idx])[0]
            known_labels[neg] = neg_value

        # Both branches of the original recomputed both shadow families.
        candidate_pos_hulls[queue_idxs] = [closure.compute_shadow(g, np.append(pos, paths[idx][candidates[idx]]), neg, weights, dist_map, comps, hist, temp_neg_hulls) for idx in queue_idxs]
        candidate_neg_hulls[queue_idxs] = [closure.compute_shadow(g, np.append(neg, paths[idx][candidates[idx]]), pos, weights, dist_map, comps, hist, temp_pos_hulls) for idx in queue_idxs]

        if next_candidate_queues[candidate_idx].empty():
            maybe_remove = refill_queue_for_candidate(candidate_idx, candidate_vertex, candidates, known_labels, left, next_candidate_queues, paths, queue_idxs, right)
            if maybe_remove is None:
                candidates[candidate_idx] = next_candidate_queues[candidate_idx].get()
            else:
                queue_idxs.remove(candidate_idx)
        else:
            candidates[candidate_idx] = next_candidate_queues[candidate_idx].get()

        # NOTE(review): assumed to run for either branch above -- confirm.
        candidate_pos_hulls[candidate_idx] = closure.compute_shadow(g, np.append(pos, paths[candidate_idx][candidates[candidate_idx]]), neg, weights, dist_map, comps, hist, temp_neg_hulls)
        candidate_neg_hulls[candidate_idx] = closure.compute_shadow(g, np.append(neg, paths[candidate_idx][candidates[candidate_idx]]), pos, weights, dist_map, comps, hist, temp_pos_hulls)

        pos = np.where(compute_hull(g, np.where(known_labels==pos_value)[0], weights, dist_map, comps, hist))[0]
        neg = np.where(compute_hull(g, np.where(known_labels==neg_value)[0], weights, dist_map, comps, hist))[0]
        num_of_known_labels = len(pos) + len(neg)

        print(num_of_known_labels, n)

    return known_labels, budget
def spc_querying_naive_one_convex(g : graph_tool.Graph, paths, y, convex_label, epsilon=0.5, weight=None, binary_search=False,closed_interval=False):
    '''
    Label the vertices of the given paths when only the class `convex_label`
    is assumed to be convex: each path is sampled at steps derived from
    epsilon and, once a vertex of the convex class is found, the extent of
    the convex segment is refined (binary search or one extra probe per side).

    :param g: graph; its vertex count sizes the result arrays and it is
        forwarded to closure.compute_hull
    :param paths: list of paths, each a sequence of vertex indices
    :param y: ground truth labels indexed by vertex; np.unique(y) is indexed
        with "(position of convex_label + 1) % 2", so exactly two classes are
        presumably expected -- confirm
    :param convex_label: label of the class treated as convex
    :param epsilon: sampling step as a fraction of the unlabelled path length
    :param weight: edge weights forwarded to closure.compute_hull
    :param binary_search: if True refine both ends with binarySearch
    :param closed_interval: not read anywhere in this function
    :return: tuple (known_labels, budget); known_labels is -inf for vertices
        without label, budget[i] counts the queries charged to paths[i]
    '''
    print("epsilon", epsilon)
    known_labels = -np.ones(g.num_vertices())*np.inf
    budget = np.zeros(g.num_vertices())
    non_convex_label = np.unique(y)
    # Pick the other one of the two class values.
    non_convex_label = non_convex_label[int(np.where(non_convex_label==convex_label)[0]+1)%2]
    for i, full_path in enumerate(paths):
        # Reuse labels that earlier paths already put on this path.
        if np.any(known_labels[full_path] == convex_label):
            smallest = np.min(np.where(known_labels[full_path] == convex_label)[0])
            biggest = np.max(np.where(known_labels[full_path] == convex_label)[0])
            if np.any(known_labels[full_path[:smallest]] == non_convex_label):
                known_labels[full_path[:np.max(np.where(known_labels[full_path[:smallest]] == non_convex_label)[0])]] = non_convex_label
            # NOTE(review): the np.min(...) below is a position relative to
            # full_path[biggest:] but is used as a position in full_path.
            if np.any(known_labels[full_path[biggest:]] == non_convex_label):
                known_labels[full_path[np.min(np.where(known_labels[full_path[biggest:]] == non_convex_label)[0]):]] = non_convex_label

        # Only the still unlabelled vertices of the path are considered below.
        path = np.array(full_path)[known_labels[full_path] == -np.inf]

        for z in range(1,int(np.ceil(1/epsilon))):
            j = int(z*(np.ceil(epsilon*len(path))))
            # Advance to the next unlabelled position.
            while j < len(path) and known_labels[path[j]] != -np.inf:
                j += 1
            if j >= len(path):
                break
            # NOTE(review): np.sum(np.where(mask)[0]) adds up the indices of
            # all unlabelled vertices, it does not count them.
            if np.sum(np.where(known_labels==-np.inf)[0]) <= epsilon*len(path):
                conv_region = np.where(known_labels[path] == convex_label)[0]
                if conv_region.size > 0:
                    known_labels[path] = known_labels[path[0]]
                    # NOTE(review): conv_region holds positions within `path`
                    # but indexes known_labels directly here.
                    known_labels[np.min(conv_region):np.max(conv_region)+1] = convex_label
                # NOTE(review): nesting of this break is assumed -- confirm.
                break
            known_labels[path[j]] = y[path[j]]
            budget[i] += 1

        if np.any(known_labels[path] == convex_label):
            smallest = np.min(np.where(known_labels[path] == convex_label)[0])
            biggest = np.max(np.where(known_labels[path] == convex_label)[0])
            if binary_search:
                # Left part: from the path start to the first convex vertex.
                l_path = path[:smallest+1]
                if known_labels[l_path[0]] == -np.inf:
                    known_labels[l_path[0]] = y[l_path[0]]
                    budget[i] += 1
                label_budget, new_labels = binarySearch(y[l_path], 0, len(l_path) - 1, known_labels[l_path[0]], known_labels[l_path])
                known_labels[l_path] = new_labels
                budget[i] += label_budget

                # Right part: from the last convex vertex to the path end.
                r_path = path[biggest:]
                if known_labels[r_path[-1]] == -np.inf:
                    known_labels[r_path[-1]] = y[r_path[-1]]
                    budget[i] += 1
                label_budget, new_labels = binarySearch(y[r_path], 0, len(r_path) - 1, known_labels[r_path[0]], known_labels[r_path])
                known_labels[r_path] = new_labels
                budget[i] += label_budget
            else:
                # Nearest labelled positions left and right of the segment.
                j_minus = smallest -1
                while j_minus > 0 and known_labels[path[j_minus]] == -np.inf:
                    j_minus -= 1
                j_plus = biggest+ 1
                while j_plus < len(path) and known_labels[path[j_plus]] == -np.inf:
                    j_plus += 1

                # Probe the midpoint of each gap once.
                if known_labels[path[j_minus + (smallest - j_minus)//2]] == -np.inf:
                    known_labels[path[j_minus + (smallest - j_minus)//2]] = y[path[j_minus + (smallest - j_minus)//2]]
                    budget[i] += 1
                if known_labels[path[biggest + (j_plus - biggest) // 2]] == -np.inf:
                    known_labels[path[biggest + (j_plus - biggest) // 2]] = y[path[biggest + (j_plus - biggest) // 2]]
                    budget[i] += 1

            # NOTE(review): assumed to run after both refinement variants.
            smallest = np.min(np.where(known_labels[path] == convex_label)[0])
            biggest = np.max(np.where(known_labels[path] == convex_label)[0])
            known_labels[path[smallest:biggest+1]] = convex_label
            # NOTE(review): path[:smallest-1] leaves position smallest-1
            # untouched, unlike path[biggest+1:] on the other side.
            if smallest > 0:
                known_labels[path[:smallest-1]] = non_convex_label
            if biggest < len(path)-1:
                known_labels[path[biggest+1:]] = non_convex_label
        else:
            known_labels[path] = non_convex_label

        # NOTE(review): assumed to run once per path so that later paths see
        # the extended convex class -- confirm against the original.
        convex_class = closure.compute_hull(g, np.where(known_labels == convex_label)[0], weight)
        known_labels[convex_class] = convex_label

    return known_labels, budget
def _report_hull(tag, seeds, hull, in_class, num_vertices):
    '''Print size and correctness of `hull`, overall and outside `seeds`.'''
    print(tag, seeds.size)
    print("hull size: ", np.sum(hull))
    print("hull correctness overall", np.sum(hull & in_class))
    # np.bool was removed from NumPy; the builtin bool is the same dtype.
    mask = np.ones(num_vertices, dtype=bool)
    mask[seeds] = False
    print("hull correctness on new vertices", np.sum(hull[mask] & in_class[mask]))


def spc_semi_supervised_experiments(g: gt.Graph, weight_prop: gt.EdgePropertyMap, labels):
    '''
    For several budgets print the accuracy of s2 querying and, for five
    random labelled samples each, the accuracy of label propagation without
    hulls, after adding the interval (compute_closure=False) and after adding
    the hull of the sampled vertices of each class.

    :param g: graph to label
    :param weight_prop: edge weights for distances, s2 and compute_hull
    :param labels: ground truth labels indexed by vertex; unpacked into
        exactly two distinct values
    :return: None, results are printed
    '''
    np.random.seed(1)
    n = g.num_vertices()
    dist_map = gt.topology.shortest_distance(g, weights=weight_prop)
    W = dist_map.get_2d_array(range(n))  # original distance map

    classes = np.unique(labels)
    pos_label, neg_label = classes
    is_pos = labels == pos_label
    is_neg = labels == neg_label

    # 0/1 encoding of the ground truth that the predictions are compared to.
    new_labels = np.zeros(n)
    new_labels[labels == classes[1]] = 1

    # The interval stage passes compute_closure=False, the closure stage
    # relies on the default of compute_hull.
    stages = (
        ("=============interval============", {"compute_closure": False}),
        ("==============closure=================", {}),
    )

    for budget in [10, 20, 50, 100]:
        print("========================================================")
        print("budget: ", budget, "|V|=", n)

        print("==================s2=====================")
        overall_labelling = shortest_shortest_path_querying.s2(
            g, weight_prop, labels, budget)
        print("accuracy after label_prop: ",
              np.sum(overall_labelling == new_labels) / n)

        for _ in range(5):
            starting_vertices = np.random.choice(range(n), budget, replace=False)

            known_labels = -np.ones(n) * np.inf
            known_labels[starting_vertices] = labels[starting_vertices]

            pos = np.where(known_labels == pos_label)[0]
            neg = np.where(known_labels == neg_label)[0]

            print("=============without hull===================")
            print("label propagation")
            overall_labelling = label_propagation(W, known_labels, classes)
            print("accuracy after label_prop: ",
                  np.sum(overall_labelling == new_labels) / n)

            # known_labels is extended in place, so the closure stage starts
            # from the labels already added by the interval stage.
            for banner, hull_kwargs in stages:
                print(banner)
                pos_hull = compute_hull(g, pos, weight_prop, dist_map, **hull_kwargs)
                neg_hull = compute_hull(g, neg, weight_prop, dist_map, **hull_kwargs)

                _report_hull("pos", pos, pos_hull, is_pos, n)
                known_labels[pos_hull] = pos_label

                _report_hull("neg", neg, neg_hull, is_neg, n)
                known_labels[neg_hull] = neg_label

                print("label propagation")
                overall_labelling = label_propagation(W, known_labels, classes)
                print("accuracy after label_prop: ",
                      np.sum(overall_labelling == new_labels) / n)
def is_convex(dir, prefix, target_column, weighted=False):
    '''
    Load an edge list and target table, run the semi-supervised experiments
    and print interval and hull sizes of both classes.

    :param dir: directory prefix of the data files (concatenated as a string,
        so it must end with a path separator); also decides which edge
        weighting is used ('twitch' or 'facebook' contained in it)
    :param prefix: file name prefix of '<prefix>_edges.csv' and
        '<prefix>_target.csv'
    :param target_column: positional column of the target table that holds
        the labels; it is unpacked into exactly two classes
    :param weighted: if True compute edge weights from the node attributes
        and load the weighted cover file
    :return: None, results are printed
    '''
    print(dir)
    np.random.seed(0)
    # np.int was removed from NumPy; the builtin int is the same dtype.
    edges = np.genfromtxt(dir + prefix + '_edges.csv',
                          skip_header=True,
                          dtype=int,
                          delimiter=',')

    df = pd.read_csv(dir + prefix + '_target.csv')

    print(dir, "weighted", weighted)

    weight = 1
    if weighted:
        if 'twitch' in dir:
            # Min-max normalise columns 1 and 3, then use the squared
            # distance of the two attributes as edge weight.
            for column in (1, 3):
                highest = df.iloc[:, column].max()
                lowest = df.iloc[:, column].min()
                df.iloc[:, column] = (df.iloc[:, column] - lowest) / (highest - lowest)

            first = df.iloc[:, 1].to_numpy()
            second = df.iloc[:, 3].to_numpy()
            sources, targets = edges[:, 0], edges[:, 1]
            weight = (first[sources] - first[targets])**2 + (
                second[sources] - second[targets])**2
        elif 'facebook' in dir:
            with open('res/git/' + dir + '/facebook_features.json') as feature_file:
                attributes = json.load(feature_file)

            # Weight = number of attributes the two endpoints do not share.
            weight = np.zeros(edges.shape[0])
            for i, e in enumerate(edges):
                weight[i] = len(
                    set(attributes[str(e[0])]).symmetric_difference(
                        attributes[str(e[1])]))

    labels, _ = pd.factorize(df.iloc[:, target_column])

    pos_label, neg_label = np.unique(labels)
    pos = np.where(labels == pos_label)[0]
    neg = np.where(labels == neg_label)[0]

    g = gt.Graph(directed=False)
    g.add_edge_list(edges)

    if weighted:
        weight = g.new_edge_property("double", vals=weight)
    else:
        weight = g.new_edge_property("double", val=1)

    simple = simplicial_vertices.simplicial_vertices(g)

    gt.stats.remove_self_loops(g)

    print("n=", g.num_vertices(), "simplicial=", len(simple))

    # NOTE: "_weigted_" (sic) is part of the existing file names on disk.
    if weighted:
        weighted_str = "_weigted_"
    else:
        weighted_str = ""

    # The cover is loaded but not used below; the load is kept so a missing
    # file still fails here. Only load pickles produced by this project.
    with open(dir + 'spc' + weighted_str + '.p', 'rb') as spc_file:
        spc = pickle.load(spc_file)

    # The experiments and hull computations below run without edge weights.
    weight = None

    print("pos", len(pos))
    print("neg", len(neg))

    spc_semi_supervised_experiments(g, weight, labels)

    p_interval = compute_hull(g, pos, weight, compute_closure=False)
    n_interval = compute_hull(g, neg, weight, compute_closure=False)

    print("pos_interval size: ", np.sum(p_interval))
    print("neg_interval size: ", np.sum(n_interval))
    print("intersection of intervals size: ", np.sum(p_interval & n_interval))

    p_hull = compute_hull(g, pos, weight)
    n_hull = compute_hull(g, neg, weight)

    print("pos_hull size: ", np.sum(p_hull))
    print("neg_hull size: ", np.sum(n_hull))
    print("intersection of hulls size: ", np.sum(p_hull & n_hull))
def is_convex(weighted):
    """Report how well a shortest path cover of benchmark set 1 fits its labels.

    Builds a graph on the points of ``res/benchmark/SSL,set=1,X.tab`` by
    keeping the pairs whose distance lies at or below the 0.004 quantile of
    all pairwise distances, obtains a shortest path cover, runs
    ``spc_querying_naive`` on it and prints the predicted labels, the query
    budget, the number of correctly predicted labels, and how many cover paths
    are reported as error-free by ``are_convex``.

    :param weighted: if true, edges carry their Euclidean length and the cover
        is computed with ``shortest_path_cover_logn_apx``; otherwise a cached
        cover is loaded from ``res/benchmark``.
    :return: None; all results are printed.
    """
    dataset = 1
    quantile = 0.004

    print("digit1")
    np.random.seed(0)

    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    y = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab')
    n = X.shape[0]
    labels = y[:n]

    W = scipy.spatial.distance.cdist(X, X)
    np.fill_diagonal(W, 0)
    # Pairs farther apart than the chosen quantile are not connected.
    W[W > np.quantile(W, quantile)] = np.inf
    is_edge = (W < np.inf) & (W > 0)
    weights = W[is_edge]
    edges = np.array(np.where(is_edge)).T

    g = gt.Graph()
    g.add_vertex(n)
    g.add_edge_list(edges)

    if weighted:
        # ``weights`` and ``edges`` come from the same mask, so they share
        # their ordering.
        weight_prop = g.new_edge_property("double", vals=weights)
        spc = shortest_path_cover_logn_apx(g, weight_prop)
    else:
        cache_path = ("res/benchmark/spc_" + str(dataset) + "_q_" +
                      str(quantile) + "_weighted_" + str(weighted) + ".p")
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load caches you created yourself.
        with open(cache_path, "rb") as handle:
            spc = pickle.load(handle)

    a, b = spc_querying_naive(g, spc, labels)
    print(a)
    print(b, np.sum(b))
    print(np.sum(a == labels))
    print("len(spc)", len(spc))

    num_of_convex_paths = 0
    total_error = 0
    for p in spc:
        error = are_convex(labels[p])
        if error == 0:
            num_of_convex_paths += 1
        else:
            total_error += error

    print("#convex paths", num_of_convex_paths)
    print("total error on paths", total_error)
# NOTE(review): the enclosing ``def`` header and the initialisation of the
# ``simplicial_vertices`` result list are not visible in this chunk.
for v in g.vertices():
    # Try to find a clique around v: v is collected only when every pair of
    # its neighbours is joined by an edge, i.e. the inner loop ends without
    # hitting ``break`` and the ``else`` clause runs.
    # TODO: Replace with numpy style
    for x, y in itertools.combinations(g.get_all_neighbors(v), 2):
        if g.edge(x, y) is None:
            break
    else:
        simplicial_vertices.append(int(v))
    #print(len(g.get_all_neighbors(v)))
return simplicial_vertices


# Driver: for random undirected graphs with 100, 200, ..., 9900 vertices,
# print the vertex count and the number of vertices found above, then the
# number of positive entries in the hull of those vertices and in the hull of
# an equally large random vertex sample.
# NOTE(review): ``simplicial_vertices`` is called as a function here but is
# used as a list in the loop above — confirm the local list does not shadow
# the function it lives in.
if __name__ == "__main__":
    for i in range(1, 100):
        # Degrees are drawn uniformly from [1, i * 50).
        deg_sampler = lambda: np.random.randint(1, i * 50)
        g = random_graph(i * 100, deg_sampler, directed=False)
        weight = g.new_edge_property("int", val=1)
        s = simplicial_vertices(g)
        print(i * 100, len(s))
        if len(s) > 0:
            print(np.sum(compute_hull(g, s, weight) > 0))
            print(
                np.sum(
                    compute_hull(g, np.random.randint(0, i * 100, len(s)),
                                 weight) > 0))
        print("=========================")
def _hull_if_disjoint(g, own, other, e):
    """Return the hull of ``own`` plus vertex ``e`` if it avoids ``other``, else ``None``."""
    candidate = own.copy()
    candidate[e] = True
    # Assumes compute_hull returns a length-n membership mask, which the
    # element-wise ``&`` against the other side requires.
    hull = np.asarray(compute_hull(g, np.where(candidate)[0]), dtype=bool)
    if np.any(other & hull):
        return None
    return hull


def florians_procedure(g: gt.Graph, use_simplicial):
    """Greedily grow two disjoint hull-closed vertex sets from random seeds.

    Two distinct seed vertices are drawn with ``np.random``. Every remaining
    vertex is then offered to one side first (the preferred side alternates
    between iterations) and, if the hull of that side plus the vertex would
    touch the other side, to the other side. A vertex that fits neither side
    stays unassigned.

    :param g: graph to partition.
    :param use_simplicial: if false, the vertices returned by
        ``simplicial_vertices(g)`` are never chosen as seeds; if true, any
        vertex may be a seed.
    :raises ValueError: if fewer than two vertices are eligible as seeds.
    :return: tuple ``(A, B)`` of boolean masks of length ``g.num_vertices()``.
    """
    n = g.num_vertices()

    if use_simplicial:
        excluded = set()
    else:
        # A set gives O(1) membership tests while rejection sampling.
        excluded = {int(v) for v in simplicial_vertices(g)}
    if n - len(excluded) < 2:
        # Rejection sampling below would never terminate otherwise.
        raise ValueError(
            "florians_procedure needs at least two eligible seed vertices")

    a = np.random.randint(0, n)
    while a in excluded:
        a = np.random.randint(0, n)
    b = np.random.randint(0, n)
    while b == a or b in excluded:
        b = np.random.randint(0, n)

    A = np.zeros(n, dtype=bool)
    A[a] = True
    B = np.zeros(n, dtype=bool)
    B[b] = True

    F = set(range(n)) - {a, b}
    i = 0
    while F:
        e = F.pop()
        prefer_a = i % 2 == 0
        for grow_a in (prefer_a, not prefer_a):
            own, other = (A, B) if grow_a else (B, A)
            hull = _hull_if_disjoint(g, own, other, e)
            if hull is None:
                continue
            if grow_a:
                A = hull
            else:
                B = hull
            # Everything the accepted hull swallowed is now assigned.
            F.difference_update(np.where(hull)[0].tolist())
            break
        i += 1
        print(len(F))

    return A, B
def is_convex(dataset, q, weighted=True):
    """Run closure-based label querying on a quantile graph of a benchmark set.

    Loads ``res/benchmark/SSL,set=<dataset>,{X,y}.tab``, keeps the first (at
    most) 100 points, connects the pairs whose distance lies at or below the
    ``q`` quantile of the pairwise distances, computes a shortest path cover
    and prints the query bound, the class sizes, the mean absolute label error
    of ``spc_querying_with_closure`` and the budget it used.

    :param dataset: identifier of the benchmark set used in the file names.
    :param q: quantile in ``[0, 1]`` of the pairwise distances used as the
        edge threshold.
    :param weighted: if true, edges carry their Euclidean length; otherwise
        every edge has weight 1.
    :return: None; all results are printed.
    """
    X = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',X.tab')
    y = np.genfromtxt('res/benchmark/SSL,set=' + str(dataset) + ',y.tab')

    # Never ask for more vertices than there are data points.
    n = min(100, X.shape[0])
    y = y[:n]
    # Map the two label values onto {0, 1}.
    y = (y - np.min(y)) // (np.max(y) - np.min(y))

    W = scipy.spatial.distance.cdist(X[:n], X[:n])
    # Honour the caller's quantile instead of overwriting ``q``.
    threshold = np.quantile(W, q)
    W[W > threshold] = np.inf
    if not weighted:
        W[W <= threshold] = 1
    np.fill_diagonal(W, 0)

    is_edge = (W < np.inf) & (W > 0)
    weights = W[is_edge]
    edges = np.array(np.where(is_edge)).T
    print("e", len(edges))

    np.random.seed(0)
    g = gt.Graph()
    g.add_vertex(n)
    g.add_edge_list(edges)
    # ``weights`` is all ones in the unweighted case and shares the ordering
    # of ``edges``, so it serves as the edge property in both modes.
    weight_prop = g.new_edge_property("double", vals=weights)

    simpl = simplicial_vertices(g)
    print(len(simpl),
          np.sum(closure.compute_hull(g, simpl, weight_prop) > 0))

    paths = shortest_path_cover_logn_apx(g, weight_prop)

    # A binary search along each path needs at most ceil(log2(len)) queries.
    query_bound = 0
    for path in paths:
        query_bound += np.ceil(np.log2(len(path)))
    print("|S|=", len(paths))
    print("#queries<=", query_bound, "%:", query_bound / n)

    pos = list(np.arange(n)[y > 0])
    neg = list(np.arange(n)[y <= 0])
    print(n, pos, neg)
    print("p", len(pos))
    print("n", len(neg))

    print("===============================================================")
    known_labels, budget = spc_querying_with_closure(g, paths, weight_prop, y)
    print(np.sum(np.abs(known_labels - y) / n))
    print(budget)