import datetime

import networkx as nx
from scipy import sparse

import util  # project-local helpers: load_json, write_json, logged_loop, LoopLogger
# get_date and run_random_walk are project helpers defined elsewhere
# (a sketch of run_random_walk follows this function).


def run_random_walks(data_dir, weight_edges=False):
    print("Loading data and building transition matrix...")
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json('./data/' + data_dir + '/review.json')
        end_date = datetime.date(2012, 1, 1) if data_dir == 'train' \
            else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            # Weight each edge by review recency: more recent reviews get
            # larger weights (the +90 smooths very recent dates).
            G[e[0]][e[1]]['weight'] = \
                1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory

    # Row-normalize the adjacency matrix into a random-walk transition matrix.
    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[
        1.0 / adjacency_matrix.getrow(i).sum()
        for i in range(adjacency_matrix.shape[0])
    ]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print("Running random walks...")
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        # Row of landing probabilities after a 10-step walk from user u.
        # Note: this indexes by raw node id, which assumes node ids coincide
        # with adjacency-matrix row order.
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(examples, './data/' + data_dir +
                    ('/weighted_random_walks.json' if weight_edges
                     else '/random_walks.json'))
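# run_random_walk is referenced above but not shown. Below is a minimal sketch
# consistent with how it is called: it should return a sparse 1 x n row vector
# of landing probabilities for node `start` after `n_steps` steps. The plain
# power iteration (no restart) is an assumption, not necessarily the original
# implementation.
def run_random_walk(transition_matrix, start, n_steps):
    p = sparse.csr_matrix(([1.0], ([0], [start])),
                          shape=(1, transition_matrix.shape[0]))
    for _ in range(n_steps):
        p = p.dot(transition_matrix)  # one step of the walk
    return p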
import numpy as np
from scipy.sparse import csr_matrix

import util  # project-local helpers
# similarity_matrix and the embeddings object come from the surrounding package.


def graph_propagate(embeddings, positive_seeds, negative_seeds, **kwargs):
    """
    Graph propagation method adapted from Velikovich, Leonid, et al.
    "The viability of web-derived polarity lexicons."
    http://www.aclweb.org/anthology/N10-1119
    Should be used with arccos=True
    """
    def run_graph_propagate(seeds, alpha_mat, trans_mat, T=1, **kwargs):
        def get_rel_edges(ind_set):
            # All (source, target) edges leaving the current frontier.
            rel_edges = set([])
            for node in ind_set:
                rel_edges = rel_edges.union(
                    [(node, other) for other in trans_mat[node, :].nonzero()[1]])
            return rel_edges

        for seed in seeds:
            F = set([seed])  # frontier of nodes reached from this seed
            for t in range(T):
                for edge in get_rel_edges(F):
                    # Keep the best multiplicative path score from the seed.
                    alpha_mat[seed, edge[1]] = max(
                        alpha_mat[seed, edge[1]],
                        alpha_mat[seed, edge[0]] * trans_mat[edge[0], edge[1]])
                    F.add(edge[1])
        return alpha_mat

    M = similarity_matrix(embeddings, **kwargs)
    M = (M + M.T) / 2  # symmetrize
    print("Getting positive scores...")
    pos_alpha = M.copy()
    neg_alpha = M.copy()
    M = csr_matrix(M)
    pos_alpha = run_graph_propagate(
        [embeddings.wi[seed] for seed in positive_seeds], pos_alpha, M, **kwargs)
    pos_alpha = pos_alpha + pos_alpha.T
    print("Getting negative scores...")
    neg_alpha = run_graph_propagate(
        [embeddings.wi[seed] for seed in negative_seeds], neg_alpha, M, **kwargs)
    neg_alpha = neg_alpha + neg_alpha.T

    print("Computing final scores...")
    polarities = {}
    index = embeddings.wi
    pos_pols = {w: 1.0 for w in positive_seeds}
    for w in negative_seeds:
        pos_pols[w] = 0.0
    neg_pols = {w: 1.0 for w in negative_seeds}
    for w in positive_seeds:
        neg_pols[w] = 0.0
    for w in util.logged_loop(index):
        if w not in positive_seeds and w not in negative_seeds:
            pos_pols[w] = sum(pos_alpha[index[w], index[seed]]
                              for seed in positive_seeds if seed in index)
            neg_pols[w] = sum(neg_alpha[index[w], index[seed]]
                              for seed in negative_seeds if seed in index)
    # Rescale negative mass to balance positive mass (beta in the paper),
    # then take the difference as the final polarity.
    beta = np.sum(list(pos_pols.values())) / np.sum(list(neg_pols.values()))
    for w in index:
        polarities[w] = pos_pols[w] - beta * neg_pols[w]
    return polarities
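# Hypothetical usage sketch: assumes an `embeddings` object exposing the
# word-to-index map `wi` (as the function requires) and a `similarity_matrix`
# helper in scope. The seed words and the arccos/T settings are illustrative.
positive_seeds = ['good', 'excellent', 'happy']
negative_seeds = ['bad', 'terrible', 'sad']
polarities = graph_propagate(embeddings, positive_seeds, negative_seeds,
                             arccos=True,  # per the docstring's recommendation
                             T=3)          # propagation iterations per seed
top_positive = sorted(polarities, key=polarities.get, reverse=True)[:10]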
def run_random_walks(data_dir, weight_edges=False):
    print("Loading data and building transition matrix...")
    examples = util.load_json('./data/' + data_dir + '/oag_examples_simple.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)

    # Add all nodes, but not the edges (those need to be predicted). The graph
    # was read with nodetype=int, so ids from the file must be cast to int.
    with open('./data/nid_to_id.txt', 'r') as f:
        for line in f:
            node = int(line.split()[0])
            if node not in G:
                G.add_node(node)

    # Edge weighting by review recency (see the Yelp variant above) is disabled
    # for this dataset; weight_edges only affects the output filename here.

    # Row-normalize the adjacency matrix. Nodes added above may have degree 0,
    # so guard against division by zero (their rows stay all-zero).
    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[
        1.0 / max(adjacency_matrix.getrow(i).sum(), 1)
        for i in range(adjacency_matrix.shape[0])
    ]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print("Running random walks...")
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()  # row over all nodes
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(
        examples, './data/' + data_dir +
        ('/oag_weighted_random_walks.json' if weight_edges
         else '/oag_random_walks.json'))
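# Building the inverse-degree diagonal with getrow(i).sum() performs one sparse
# row slice per node, which is slow for large graphs. The same matrix can be
# built with a few vectorized scipy operations; a sketch, with the same
# zero-degree guard as above:
def build_transition_matrix(adjacency_matrix):
    degrees = np.asarray(adjacency_matrix.sum(axis=1)).ravel()
    degrees[degrees == 0] = 1.0  # isolated nodes keep an all-zero row
    return sparse.diags(1.0 / degrees).dot(adjacency_matrix)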
import shutil

import util  # project-local helpers
# SCORE_THRESHOLD and MARGIN_THRESHOLD are module-level constants defined elsewhere.


def write_probable_pairs(dataset_name, action_space_path, scores):
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in util.logged_loop(scores):
        doc_scores = scores[did]
        # Rank candidate (antecedent, mention) pairs by their margin over the
        # scaled no-antecedent score for the mention.
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] -
                       (-1 - 0.3 * doc_scores[(-1, pr[1])]),
                       reverse=True)
        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        # For each mention, find its best competing score and prune pairs that
        # do not beat it by at least MARGIN_THRESHOLD.
        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair],
                                          -1 - 0.3 * doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]], doc_scores[pair])
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [p for p in probable_pairs[did]
                               if doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print("num docs:", len(scores))
    print("avg size without filter: {:.1f}".format(total_pairs / float(len(scores))))
    print("avg size: {:.1f}".format(total_size / float(len(scores))))
    print("margin removals size: {:.1f}".format(margin_removals / float(len(scores))))
    util.write_pickle(probable_pairs,
                      action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
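# The code above implies `scores` is keyed first by document id and then by
# (antecedent, mention) index pairs, with antecedent -1 meaning "no
# antecedent". A toy, hand-built example; the threshold values here are
# placeholders for this sketch only, not the original constants.
SCORE_THRESHOLD = 0.0
MARGIN_THRESHOLD = 0.1
scores = {
    'doc1': {
        (-1, 1): -0.5,  # mention 1 taking no antecedent
        (0, 1): 0.8,    # mention 0 as mention 1's antecedent
        (-1, 2): 0.2,
        (0, 2): -0.3,
        (1, 2): 0.4,
    },
}
write_probable_pairs('train', './action_spaces/', scores)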
import random
from collections import Counter, defaultdict

import snap  # Stanford SNAP Python bindings
import util  # project-local helpers


def make_examples(data_dir, n_users=5000, min_degree=1, negative_sample_rate=0.01,
                  min_active_time=None, new_edge_only=False):
    print("Loading data...")
    # TODO: switch to networkx?
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    with open(data_dir + 'new_edges.txt') as f:
        edges = {tuple(map(int, line.split())) for line in f}
    new_edge_count = Counter()
    for (u, b) in edges:
        new_edge_count[u] += 1
    review_data = util.load_json(data_dir + 'review.json')
    n_businesses = len(util.load_json(data_dir + 'business.json'))

    recently_active_users = []
    other_users = []
    print("Getting candidate set of users...")
    users = []
    for Node in util.logged_loop(G.Nodes(), util.LoopLogger(50000, G.GetNodes(), True)):
        u = Node.GetId()
        if new_edge_only and u not in new_edge_count:
            continue
        if str(u) not in review_data or Node.GetOutDeg() < min_degree:
            continue
        if min_active_time:
            # Keep only users with a review more recent than min_active_time
            # (ignoring reviews on held-out new edges).
            recent_review = False
            for b in review_data[str(u)]:
                if (int(u), int(b)) in edges:
                    continue
                for r in review_data[str(u)][b]:
                    if get_date(r) > min_active_time:
                        users.append(u)
                        recently_active_users.append(u)
                        recent_review = True
                        break
                if recent_review:
                    break
            if not recent_review:
                other_users.append(u)
        else:
            users.append(u)

    if min_active_time:
        recent_positive = sum(new_edge_count[u] for u in recently_active_users)
        recent_examples = len(recently_active_users) * n_businesses
        other_positive = sum(new_edge_count[u] for u in other_users)
        other_examples = len(other_users) * n_businesses
        print("Positives retained from recently active filter:",
              recent_positive / float(recent_positive + other_positive))
        print("Negatives retained from recently active filter:",
              (recent_examples - recent_positive) /
              float(recent_examples - recent_positive + other_examples - other_positive))

    random.seed(0)
    users = random.sample(users, n_users)

    print("Getting candidate set of edges...")
    examples = defaultdict(dict)
    for u in util.logged_loop(users, util.LoopLogger(50, n_users, True)):
        # Restrict candidates to businesses within 3 hops of the user.
        candidate_businesses = snap.TIntV()
        snap.GetNodesAtHop(G, u, 3, candidate_businesses, True)
        for b in candidate_businesses:
            if (u, b) in edges:
                examples[u][b] = 1
            elif random.random() < negative_sample_rate:
                examples[u][b] = 0

    hop3_positives = 0
    for u in examples:
        for b in examples[u]:
            hop3_positives += examples[u][b]
    hop3_examples = sum(len(examples[u]) for u in examples)
    n_positives = sum(new_edge_count[u] for u in users)
    n_examples = len(users) * n_businesses
    print("Positives retained from hop3 filter:", hop3_positives / float(n_positives))
    print("Negatives retained from hop3 filter:",
          (hop3_examples - hop3_positives) /
          (negative_sample_rate * float(n_examples - n_positives)))
    print("Data skew:", hop3_positives / float(hop3_examples))
    print("Writing examples...")
    util.write_json(examples, data_dir + 'examples.json')
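# Hypothetical invocation mirroring the defaults above. The cutoff date is
# illustrative, and data_dir must contain the graph.txt, new_edges.txt,
# review.json, and business.json files the function reads.
import datetime

make_examples('./data/train/',
              n_users=5000,
              min_degree=1,
              negative_sample_rate=0.01,
              min_active_time=datetime.date(2011, 7, 1),
              new_edge_only=False)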
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    # Stream one parsed JSON review per line, logging progress every 100k lines.
    return util.logged_loop(util.load_json_lines(path),
                            util.LoopLogger(100000, util.lines_in_file(path), True))
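# Example: tally reviews by star rating ('stars' is a standard field in the
# Yelp academic dataset's review records).
from collections import Counter

star_counts = Counter()
for review in reviews_iterator():
    star_counts[review['stars']] += 1
print(star_counts)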