def calculate_similarity(self, pair_key, lines):
    '''
    Sum components of each co-rating pair across all users who rated
    both item x and item y, then calculate the pairwise cosine
    similarity and co-rating counts. The similarities fall on the
    [0, 1] scale, which allows a straightforward numerical sort.

    19,21   0.4,2,[user1, user2, ...]
    21,19   0.4,2,[user1, user2, ...]
    19,70   0.6,1,[user1, user2, ...]
    70,19   0.6,1,[user1, user2, ...]
    21,70   0.1,1,[user1, user2, ...]
    70,21   0.1,1,[user1, user2, ...]
    '''
    sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
    item_xname, item_yname = pair_key
    # Accumulate the dot product and squared norms over all co-ratings.
    # (sum_x and sum_y are only needed for a Pearson variant.)
    for item_x, item_y, user_id in lines:
        sum_xx += item_x * item_x
        sum_yy += item_y * item_y
        sum_xy += item_x * item_y
        sum_y += item_y
        sum_x += item_x
        n += 1
    cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
    yield (item_xname, item_yname), (cos_sim, n)
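# `cosine` and `sqrt` above are assumed to come from a local similarity
# module and the stdlib respectively; neither is shown in this snippet.
# A minimal sketch of the three-argument helper the reducer appears to
# expect (dot product plus the two vector norms):
from math import sqrt

def cosine(dot_product, norm_x, norm_y):
    # Guard against zero-length vectors, for which cosine is undefined;
    # returning 0.0 here is an assumption, not part of the original code.
    if norm_x == 0.0 or norm_y == 0.0:
        return 0.0
    return dot_product / (norm_x * norm_y)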
def topicProj(topicMatrix, threshold=0):
    G = nx.Graph()
    G.clear()

    # ----- Add nodes
    # print('\nAdding nodes for graph...')
    for topic in topicMatrix.columns:
        # Add one node per topic column
        G.add_node(topic)
    # print('All nodes successfully added!')

    # ----- Add edges
    # print('\nAdding edges for graph...')
    for i, topic_i in enumerate(topicMatrix.columns):
        for topic_j in topicMatrix.columns[(i + 1):]:
            similarity = simi.cosine(topicMatrix[topic_i], topicMatrix[topic_j])
            if similarity > threshold:
                G.add_edge(topic_i, topic_j, weight=similarity)
    # print('All edges successfully added!')

    # G.degree() with weight='weight' sums the weights of the edges adjacent to a node
    cent = [G.degree(topic, weight='weight') for topic in topicMatrix.columns]
    cent = pd.DataFrame(cent, columns=['edgeWeightSum'], index=topicMatrix.columns)
    # Normalise by the maximum number of other nodes a node can be connected to
    cent['Degree'] = cent['edgeWeightSum'] / (len(topicMatrix.columns) - 1)
    # Add centrality information as a node attribute
    nx.set_node_attributes(G, cent['Degree'], 'degreeCent')
    return G, cent
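# `simi` is assumed to be a local similarity module whose `cosine` takes two
# equal-length numeric vectors (pandas Series or lists), as in topicProj and
# actorProj below. A minimal sketch under that assumption:
import numpy as np

def cosine(u, v):
    # Cosine of the angle between vectors u and v; 0.0 for zero vectors
    # (a defensive choice, not taken from the original module).
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(u.dot(v) / denom) if denom else 0.0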
def getItemRecommendation(matrix, item):
    totals = {}
    simSums = {}
    for other in matrix:
        if other == item:
            continue
        sim = similarity.cosine(matrix, item, other)
        # sim = similarity.pearson(matrix, item, other)
        if sim <= 0:
            continue
        for user in matrix[other]:
            # Only score users who have not already rated this item.
            if user not in matrix[item] or matrix[item][user] == 0:
                totals.setdefault(user, 0)
                totals[user] += matrix[other][user] * sim
                simSums.setdefault(user, 0)
                simSums[user] += sim
    # Normalise each user's weighted total by the sum of similarities.
    # itemEval = [(total / simSums[user], user) for user, total in totals.items()]
    itemEval = {user: total / simSums[user] for user, total in totals.items()}
    # print(str(itemEval))
    return itemEval
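# In the recommenders above, `similarity.cosine(matrix, a, b)` is assumed to
# take a dict-of-dicts preference matrix and two keys, computing cosine
# similarity over the entries the two keys share. A minimal sketch under that
# assumption (restricting the norms to shared entries is itself a guess):
from math import sqrt

def cosine(matrix, a, b):
    # Entries rated under both keys.
    shared = [k for k in matrix[a] if k in matrix[b]]
    if not shared:
        return 0.0
    dot = sum(matrix[a][k] * matrix[b][k] for k in shared)
    norm_a = sqrt(sum(matrix[a][k] ** 2 for k in shared))
    norm_b = sqrt(sum(matrix[b][k] ** 2 for k in shared))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)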
def basesdecomp(bases, vec):
    best = float('inf')
    bestans = []
    # Defensive initialisation so project/fidelity are defined even if no
    # decomposition improves on the initial best.
    project = []
    fidelity = None
    cosineval = float('inf')
    # Repeat the (randomised) decomposition and keep the one whose cosine
    # similarities to the bases have the lowest spread.
    for _ in range(1000):
        ans = getDecomp(bases, vec)
        try:
            cosineval = similarity.stddev(
                [similarity.cosine(x, y) for x, y in zip(ans, bases)])
        except TypeError:
            pass
        if cosineval < best:
            best = cosineval
            project = [similarity.cosine(x, y) for x, y in zip(ans, bases)]
            bestans = ans
    # Fidelity: cosine between the sum of consecutive base pairs and vec
    # (only the last pair's value is kept).
    for i in range(1, len(ans)):
        sum1 = [x + y for x, y in zip(bases[i], bases[i - 1])]
        fidelity = similarity.cosine(sum1, vec)
    return best, bestans, fidelity, project
def test_cosine(self):
    G = self.G
    cos = cosine(G)
    ns.assert_equal(len(cos), 7)
    for i in range(7):
        assert i in cos
    # Compare against the precomputed similarities stored on the graph.
    for i in self.G.cos_sim.keys():
        ns.assert_equal(len(self.G.cos_sim[i]), len(cos[i]))
        for j in self.G.cos_sim[i].keys():
            ns.assert_almost_equal(cos[i][j], self.G.cos_sim[i][j], places=4)
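# The `cosine(G)` under test is assumed to be a graph vertex-similarity
# function returning a dict of dicts (sim[i][j]), as the assertions suggest.
# One common definition, sketched here purely as an assumption, is the
# cosine of two nodes' adjacency vectors:
import networkx as nx
import numpy as np

def cosine(G):
    nodes = list(G.nodes())
    A = nx.to_numpy_array(G, nodelist=nodes)
    norms = np.linalg.norm(A, axis=1)
    sim = {}
    for a, u in enumerate(nodes):
        sim[u] = {}
        for b, v in enumerate(nodes):
            denom = norms[a] * norms[b]
            sim[u][v] = float(A[a].dot(A[b]) / denom) if denom else 0.0
    return sim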
def average_similarity(products, features):
    sample_size = 500
    sample = products.sample(sample_size).index.values
    sim = 0
    # Sum pairwise similarities over all ordered pairs in the sample.
    for i1 in sample:
        for i2 in sample:
            if i1 != i2:
                sim += similarity.cosine(features, products.loc[i1, :], products.loc[i2, :])
    # sample_size * (sample_size - 1) ordered pairs were compared.
    return sim / (sample_size * (sample_size - 1))
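# In the product snippets, `similarity.cosine(features, p1, p2)` is assumed
# to compare two pandas rows restricted to the columns listed in `features`.
# A minimal sketch under that assumption:
import numpy as np

def cosine(features, row_a, row_b):
    # Restrict both rows to the shared feature columns before comparing.
    u = row_a[features].to_numpy(dtype=float)
    v = row_b[features].to_numpy(dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    return float(u.dot(v) / denom) if denom else 0.0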
def get_similarity(products, features, p1, all_articles):
    sim = 0
    no_others = len(all_articles) - 1
    for other_id in all_articles:
        # If the product is in the product database, add its similarity.
        if other_id in products['article_id'].values:
            p2 = products[products['article_id'] == other_id]
            if p1['article_id'].values != p2['article_id'].values:
                sim += similarity.cosine(features, p1.iloc[0], p2.iloc[0])
        # Otherwise shrink the denominator used for the average by one.
        else:
            no_others -= 1
    if no_others > 0:
        return sim / float(no_others)
    else:
        return 0
def get_similarity(products, features):
    # Number of products
    m = len(products)
    # Save article numbers on the first line
    article_ids = products['article_id'].values
    # Similarity is calculated per product and appended to a per-product file.
    # The similarity matrix cannot be built at once because of memory constraints.
    for i in range(m):
        p1 = article_ids[i]
        # Write the header, overwriting the file if necessary
        save.array("Similarity/" + str(p1), ["article_id", "similarity"], "w+")
        for j in range(m):
            if i != j:
                p2 = article_ids[j]
                sim = similarity.cosine(features, products.loc[i, :], products.loc[j, :])
                save.array("Similarity/" + str(p1), [p2, sim])
def guess_from_l(sketch, freq, k, ref_subset, candidate_subset, unknown_node,
                 dist_mat_knownledge=1):
    dist_candidate_subset = oracle.query_subset(sketch, candidate_subset, ref_subset)
    # Only consider a partial distance matrix: keep each entry with
    # probability dist_mat_knownledge.
    partial_mask = np.around(np.random.binomial(
        n=1, p=dist_mat_knownledge,
        size=(len(candidate_subset), len(ref_subset))))
    partial_dist = np.multiply(partial_mask, dist_candidate_subset)
    # Set hidden values to the average of the known entries.
    avg = np.sum(partial_dist, axis=(0, 1)) / np.sum(partial_mask, axis=(0, 1))
    partial_dist = partial_dist + np.multiply(
        avg, (np.ones((len(candidate_subset), len(ref_subset))) - partial_mask))
    # Distance estimate vector for the unknown node.
    dist_est = sketch_pattern.distance_estimate_subset(
        sketch, freq, k / 2, np.array([unknown_node]), ref_subset)
    # Invert the values and calculate similarity.
    inv_partial_dist = np.power(partial_dist, -1)
    inv_dist_est = np.power(dist_est, -1)
    sim = similarity.cosine(inv_partial_dist, inv_dist_est)
    # Check whether the recovery succeeded.
    guessed_right = (candidate_subset[np.argmax(sim)] == unknown_node)
    return guessed_right
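# Here `similarity.cosine` is assumed to compare each row of a matrix against
# a single vector and return one similarity per row, so that np.argmax(sim)
# picks the best-matching candidate. A minimal sketch under that assumption:
import numpy as np

def cosine(matrix, vector):
    # Cosine similarity between every row of `matrix` and `vector`.
    matrix = np.atleast_2d(np.asarray(matrix, dtype=float))
    vector = np.asarray(vector, dtype=float).ravel()
    row_norms = np.linalg.norm(matrix, axis=1)
    denom = row_norms * np.linalg.norm(vector)
    # Avoid division by zero for all-zero rows (a defensive assumption).
    denom[denom == 0.0] = np.inf
    return matrix.dot(vector) / denom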
def getUserRecommendation(matrix, user):
    totals = {}
    simSums = {}
    # The target user's mean rating, used to re-centre predictions.
    u_average = sum(matrix[user][item] for item in matrix[user]) / len(matrix[user])
    for other in matrix:
        if other == user:
            continue
        sim = similarity.cosine(matrix, user, other)
        # sim = similarity.pearson(matrix, user, other)
        if sim <= 0:
            continue
        o_average = sum(matrix[other][item] for item in matrix[other]) / len(matrix[other])
        for item in matrix[other]:
            # Only score items the target user has not already rated.
            if item not in matrix[user] or matrix[user][item] == 0:
                totals.setdefault(item, 0)
                totals[item] += (matrix[other][item] - o_average) * sim
                simSums.setdefault(item, 0)
                simSums[item] += sim
    # userEval = [(u_average + total / simSums[item], item) for item, total in totals.items()]
    userEval = {item: u_average + total / simSums[item] for item, total in totals.items()}
    return userEval
                    katz, lhn, rss2, dice, inverse_log_weighted, rsa
import pickle
import pandas

### VERTEX SIMILARITY
graphs = pickle.load(open('data/graphs_networkx.pkl', 'rb'))
results = dict()
for graph_type, G in graphs.iteritems():
    print "Calculating for %s" % (graph_type)
    results[graph_type] = dict()
    print "ASCOS ---------------------------"
    results[graph_type]["ascos"] = ascos(G)
    print "COSINE ---------------------------"
    results[graph_type]["cosine"] = cosine(G)
    print "JACCARD --------------------------"
    results[graph_type]["jaccard"] = jaccard(G)
    print "KATZ -----------------------------"
    results[graph_type]["katz"] = katz(G)
    print "LHN ------------------------------"
    results[graph_type]["lhn"] = lhn(G)
    print "RSS2 -----------------------------"
    results[graph_type]["rss2"] = rss2(G)
    print "DICE -----------------------------"
    results[graph_type]["dice"] = dice(G)
    print "INVERSE LOG WEIGHTED --------------"
    results[graph_type]["inverse_log_weighted"] = inverse_log_weighted(G)
pickle.dump(results, open("data/sim_metrics.pkl", "wb"))
__author__ = 'John'
import pandas as pd
import numpy as np
import re
import sys
import cf
import similarity
import filter_demo_data

cosine = similarity.cosine()
similarity_helper = cosine


def read_input(filename, top_N=10):
    # Just reads in X and the category matrix, so loading it will not take time
    shop_mall = pd.read_csv("Demographic Filtering/mall_store_list.csv",
                            encoding="ISO-8859-1", index_col=False)
    # This is used for obtaining unique malls
    stores_db = shop_mall[["store", "store_id"]].drop_duplicates(subset=["store"])  # get unique stores
    stores_db.index = pd.Series(np.arange(stores_db.shape[0]))
    # stores_db.to_csv("command_line_files/store_list.csv", header="true")
    mall_demographic = pd.read_csv("Demographic Filtering/mall_with_demographic_category.csv",
                                   encoding="ISO-8859-1", index_col=False)
    county_db = mall_demographic[["county", "usps"]].copy(deep=True)  # reads all counties
    county_db["county"] = county_db["county"].str.lower()
    county_db["usps"] = county_db["usps"].str.lower()
    county_db = county_db.drop_duplicates(["county", "usps"])
    # county_db.to_csv("command_line_files/county_list.csv", header="true")
    # Save columns into another file

    # Read the txt file
    file = open(filename)
    entire_file = file.read()
    user = re.split('\n+', entire_file)
    # The first entry of user is their county, followed by stores
# -*- coding: utf-8 -*-
import similarity as sim

if __name__ == "__main__":
    # Preference dictionary
    dictionary = {
        "A": {"apple": 2.5, "mikan": 2.5, "banana": 5.0, "melon": 2.0},
        "B": {"apple": 2.5, "banana": 1.5, "kiui": 3.0, "melon": 4.0}
    }
    # Euclidean distance
    value = sim.euclidean(dictionary, "A", "B")
    print str(value)
    # Pearson correlation
    value = sim.pearson(dictionary, "A", "B")
    print str(value)
    # Cosine similarity
    value = sim.cosine(dictionary, "A", "B")
    print str(value)
start_time = time.time()
print('Creating the Query Vector...', end='', flush=True)
query_vec = qp.query_vector(invf_indexes, len(invf_terms))
print(f'done. ({time.time()-start_time}sec)')

if should_do_rocchio:
    # Select Relevant Documents
    relevant_postings = []
    # Similarity
    if method == 'dot':
        sim = similarity.dot(VS, query_vec, invf_indexes, merged_postings)
    else:
        sim = similarity.cosine(VS, query_vec, invf_indexes, merged_postings,
                                hybrid=True, power=2)
    # Ranking: pair each score with its posting and sort descending
    rank = []
    for i in range(len(sim)):
        rank.append((sim[i], merged_postings[i]))
    rank.sort(reverse=True)
    if num_relevant_docs > len(rank):
        num_relevant_docs = len(rank)
    for i in range(num_relevant_docs):
        relevant_postings.append(rank[i][1])

    start_time = time.time()
    print('Rocchio...', end='', flush=True)
    qp.rocchio(VS,
def actorProj(actorProfile, topicMatrix, threshold=0):
    # ================================================================================
    # ----- Error Checking
    assert (actorProfile.index).equals(
        topicMatrix.index
    ), "actorProfile and topicMatrix do not have the same indices."

    # ================================================================================
    # ----- Construct Weighted Graph
    G = nx.Graph()
    G.clear()

    # ----- Add nodes
    # print('\nAdding nodes for graph...')
    for actor, row in actorProfile.iterrows():
        # Attach each actor's profile attributes to its node
        G.add_node(
            actor,
            gender=row['Gender'],
            party=row['Party'],
            metro=row['Metro'],
            elec=row['Elec'],
            wordCount=row['WordCount'],
        )
    # print('All nodes successfully added!')

    # ----- Add edges
    # print('\nAdding edges for graph...')
    for i, (a_i, row_i) in enumerate(actorProfile.iterrows()):
        for j, (a_j, row_j) in enumerate(actorProfile[(i + 1):].iterrows()):
            # Retrieve and parse the topicVector profiles of actor_i and actor_j
            tv_i = topicMatrix.loc[a_i].tolist()
            tv_j = topicMatrix.loc[a_j].tolist()
            similarity = simi.cosine(tv_i, tv_j)
            # Both actors cannot be the same person, and similarity must exceed the threshold
            if a_i != a_j and similarity > threshold:
                # Add edge with appropriate attributes
                G.add_edge(
                    a_i,
                    a_j,
                    weight=similarity,
                    closeness=abs(1 - similarity),
                )
                # Log progress...
                # logPath = "jaccardSim"
                # kf.log(f"{i:{5}},{j:{5}} of {len(actorProfile):{5}}", logPath)
                # kf.log(f"{a_i:{30}}{a_j}", logPath)
                # kf.log(f"{row_i['Topics']:{30}}{row_i['Topics']}", logPath)
                # kf.log(f"Jaccard Similarity: {similarity}\n", logPath)
    # print('All edges successfully added!')

    # ================================================================================
    # ----- Compute degree centrality and add it as a node attribute
    # G.degree() returns the number of edges adjacent to a node, taking the edge weight into account
    cent = pd.DataFrame(
        [G.degree(actor, weight='weight') for actor in actorProfile.index],
        columns=['edgeWeightSum'],
        index=actorProfile.index)
    # Normalise by the maximum number of other nodes a node can be connected to
    cent['Degree'] = cent['edgeWeightSum'] / (len(actorProfile) - 1)
    cent['Betweenness'] = pd.Series(nx.betweenness_centrality(G))
    # cent['Betweenness'] = pd.Series(nx.betweenness_centrality(G, weight='weight'))
    # cent['Betweenness'] = pd.Series(nx.betweenness_centrality(G, weight='closeness'))
    cent['Closeness'] = pd.Series(nx.closeness_centrality(G))
    # cent['Closeness'] = pd.Series(nx.closeness_centrality(G, distance='weight'))
    # cent['Closeness'] = pd.Series(nx.closeness_centrality(G, distance='closeness'))

    # Add centrality information as node attributes
    nx.set_node_attributes(G, cent['Degree'], 'DegreeCent')
    nx.set_node_attributes(G, cent['Betweenness'], 'BtwnCent')
    nx.set_node_attributes(G, cent['Closeness'], 'ClosenessCent')

    # Find the list of connected components
    connected_components = list(nx.connected_components(G))

    # Concat centrality measures to the actor profile
    actorProfile_out = copy(actorProfile)
    actorProfile_out['DegreeCentrality'] = cent['Degree']
    actorProfile_out['BtwnCentrality'] = cent['Betweenness']
    actorProfile_out['ClosenessCentrality'] = cent['Closeness']

    # =====================================================================================
    # ----- FOR DEBUGGING
    # # Save results
    # nx.write_gpickle(G, f"{PATH}{TIME_FRAME}/ssm_weightedGraph_{TIME_FRAME}.gpickle")
    # cent.to_csv(f"{PATH}{TIME_FRAME}/ssm_centrality_{TIME_FRAME}.csv")
    # with open(f"{PATH}{TIME_FRAME}/ssm_cliques_{TIME_FRAME}.pickle", "wb") as file:
    #     pickle.dump(cliques, file)
    # ================================================================================
    return G, actorProfile_out, connected_components