def find_closest(word_vector, vocab, number, metric):
    """Return the ``number`` vocabulary entries most similar to ``word_vector``.

    Args:
        word_vector: Query vector; converted with ``np.array`` before use.
        vocab: Indexable sequence of entries whose element ``[1]`` is the
            vector handed to ``cosine_sim``.
        number: How many of the closest entries to return.
        metric: Name of the similarity metric. Only ``'cosine'`` is
            implemented; any other value scores nothing and yields two
            empty lists.

    Returns:
        A ``(top_n_sim, wiki_closest)`` tuple holding the similarity scores
        and the corresponding ``vocab`` entries, ordered by ascending
        similarity (the best match comes last).
    """
    query = np.array(word_vector)

    scores = []
    if metric == 'cosine':
        scores = [cosine_sim(entry[1], query) for entry in vocab]

    # A slice of [-0:] would select *everything*, so guard it explicitly.
    if number <= 0 or not scores:
        return [], []

    # argsort is ascending, so the tail holds the highest similarities.
    top_indices = np.argsort(scores)[-number:]
    top_n_sim = [scores[j] for j in top_indices]
    wiki_closest = [vocab[j] for j in top_indices]
    return top_n_sim, wiki_closest
Пример #2
0
def ICS(reviews):
    """Average the similarity of repeated reviews by one member on one product.

    Args:
        reviews: Iterable of mappings with the keys ``"productId"``,
            ``"memberId"`` and ``"reviewText"``. Reviews whose product or
            member id is the string ``"None"`` are ignored.

    Returns:
        A nested ``defaultdict`` ``{memberId: {productId: score}}`` where
        ``score`` is the mean ``cosine_sim`` over all ordered pairs of
        differing review texts. Only member/product combinations with at
        least one such pair get an entry; missing keys default to ``0``.
    """
    texts_by_member_product = defaultdict(lambda: defaultdict(list))
    ics_by_member_product = defaultdict(lambda: defaultdict(int))

    for review in reviews:
        if review["productId"] != "None" and review["memberId"] != "None":
            texts_by_member_product[review["memberId"]][
                review["productId"]].append(review["reviewText"])

    for member, products in texts_by_member_product.items():
        for product_id, texts in products.items():
            # Only members who reviewed the same product repeatedly matter.
            if len(texts) < 2:
                continue
            similarity_sum = 0.0
            pair_count = 0
            for first in texts:
                for second in texts:
                    # Pairs are filtered by text inequality, so byte-identical
                    # duplicate reviews are not compared with each other.
                    if first != second:
                        # Accumulate: the result is an average over all pairs.
                        similarity_sum += cosine_sim(first, second)
                        pair_count += 1
            if pair_count:
                ics_by_member_product[member][product_id] = (
                    similarity_sum / pair_count)
    return ics_by_member_product
def main():
    """Build the symmetric item-item similarity table and pickle it.

    Reads the pickled mapping ``"tagged"`` (item id -> value passed to
    ``cosine_sim``), scores every unordered pair of distinct items once, keeps
    the pairs scoring above 0.50 in both directions, and writes the resulting
    nested dict to ``"item_sim50"``.
    """
    # NOTE: pickle.load can execute arbitrary code; "tagged" must be trusted.
    with open("tagged", "rb") as tagged_file:
        tagged = pickle.load(tagged_file)

    # Convert the ids once instead of twice per pair inside the O(n^2) loop.
    numeric_id = {key: int(key) for key in tagged}

    itemsim = {}
    c = 1  # pair counter, only used for progress output
    for a1 in tagged:
        for a2 in tagged:
            if c % 1000000 == 0:
                print(c / 1000000, "M")
            # Visit each unordered pair once and never compare an item
            # with itself.
            if numeric_id[a2] > numeric_id[a1]:
                score = cosine_sim(tagged[a1], tagged[a2])
                # Only keep similarities above the threshold.
                if score > 0.50:
                    itemsim.setdefault(a1, {})[a2] = score
                    itemsim.setdefault(a2, {})[a1] = score
            c += 1

    with open("item_sim50", "wb") as out_file:
        pickle.dump(itemsim, out_file)
Пример #4
0
def GMCS(group):
    """Return the group mean of each user's average pairwise review similarity.

    Args:
        group: Sequence of users, where ``user[1]`` is a sequence of review
            mappings carrying a ``"reviewText"`` key.

    Returns:
        The mean, over all users in ``group``, of the average ``cosine_sim``
        across every unordered pair of that user's reviews. A user with fewer
        than two reviews contributes 0, and an empty group yields ``0.0``.
    """
    if not group:
        return 0.0

    total = 0.0
    for user in group:
        reviews = user[1]
        similarity_sum = 0.0
        pair_count = 0
        for x, first in enumerate(reviews):
            for second in reviews[x + 1:]:
                similarity_sum += cosine_sim(first["reviewText"],
                                             second["reviewText"])
                pair_count += 1
        # No pairs means nothing to average; avoid dividing by zero.
        if pair_count:
            total += similarity_sum / pair_count
    return total / len(group)
def main():
    """Load (or build and cache) the user similarity table and print its size.

    The table maps each user to the other users whose ``cosine_sim`` score
    exceeds 0.01. It is read from the pickle file ``"user_sim01"`` when that
    file exists; otherwise it is computed from ``get_listen()`` and written
    there.
    """
    listen = get_listen()
    try:
        # NOTE: pickle.load can execute arbitrary code; the cache file must
        # be trusted.
        with open("user_sim01", "rb") as cache_file:
            sim = pickle.load(cache_file)
    except FileNotFoundError:
        sim = {}
        for u1 in listen:
            sim[u1] = {}
            for u2 in listen:
                if u1 != u2:
                    score = cosine_sim(listen[u1], listen[u2])
                    if score > 0.01:
                        sim[u1][u2] = score
        with open("user_sim01", "wb") as cache_file:
            pickle.dump(sim, cache_file)

    total = 0  # count the total amount of similarity relations
    for u1 in sim:
        total += len(sim[u1])
    print(total)
    """u1 = "2"
Пример #6
0
def CS(reviews):
    """Score every ordered pair of review texts and aggregate with ``avg``.

    Each text is paired with every text in ``reviews``, itself included, and
    the list of ``cosine_sim`` results is handed to ``avg``.
    """
    texts = []
    for review in reviews:
        texts.append(review["reviewText"])

    pair_scores = []
    for review1 in texts:
        for review2 in texts:
            pair_scores.append(cosine_sim(review1, review2))
    return avg(pair_scores)
Пример #7
0
def recommender(user_train, user_test, movie_train, k):
    """Predict integer ratings for every unrated movie of every test user.

    Args:
        user_train: ``{user: {movie: rating}}`` for the training users; a
            rating of 0 means "not rated".
        user_test: ``{user: {movie: rating}}`` for the test users; movies
            with rating 0 are the ones to predict.
        movie_train: ``{movie: {user: rating}}`` view of the training data,
            used to compute the inverse user frequency of each movie.
        k: Maximum number of neighbours used per prediction.

    Returns:
        ``{test_user: {movie: predicted_rating}}`` where each prediction is
        the value returned by ``reco_cosine`` truncated with ``int``.
    """
    # Mean of the non-zero ratings per training user (0 if there are none).
    user_avg = {}
    for user, ratings in user_train.items():
        rated = [rating for rating in ratings.values() if rating != 0]
        if rated:
            user_avg[user] = float(sum(rated)) / float(len(rated))
        else:
            user_avg[user] = 0

    # Inverse user frequency per movie: log10(1000 / number of raters).
    # NOTE(review): 1000 is hard-coded, presumably the number of training
    # users, and "/" truncates under Python 2 -- confirm both.
    IUF_train = {}
    for movie, ratings in movie_train.items():
        raters = sum(1 for rating in ratings.values() if rating != 0)
        if raters == 0:
            # A movie nobody rated gets a weight of 0.
            IUF_train[movie] = 0
        else:
            IUF_train[movie] = log10(1000 / raters)

    # Signed and absolute similarity of each test user to every other
    # training user.
    sim_user = {}
    sim_userabs = {}
    for user in user_test:
        signed = {}
        magnitude = {}
        for other in user_train:
            if other != user:
                score = cosine_sim(user_train[other], user_test[user],
                                   IUF_train)
                signed[other] = score
                magnitude[other] = abs(score)
        sim_user[user] = signed
        sim_userabs[user] = magnitude

    reco_dict1 = {}
    for user in user_test:
        # The neighbour ranking depends only on the user, so sort it once
        # rather than once per movie.
        ranked = sorted(sim_userabs[user].items(), key=itemgetter(1),
                        reverse=True)
        predictions = {}
        for movie in user_test[user]:
            if user_test[user][movie] != 0:
                continue
            # Take the k most similar neighbours that actually rated the movie.
            topklist = []
            for neighbour, _ in ranked:
                if user_train[neighbour][movie] != 0:
                    topklist.append(neighbour)
                    if len(topklist) == k:
                        break
            rating = reco_cosine(user, movie, user_train, user_test,
                                 topklist, sim_user, user_avg, k)
            predictions[movie] = int(rating)
        reco_dict1[user] = predictions

    return reco_dict1
Пример #8
0
from util import *
from cosine_sim import cosine_sim
import sys
if __name__ == "__main__":
    cosine_sim_weights = [0.50, 0.30, 0.20]
    window_weight = 100
    window_function = (lambda v : 1.0 / v)
    dictionary = read_dictionary()
    data = sys.argv[1]
    queries = read_train_data(data)
    corpus = read_corpus()
    for q in queries:
        scored_urls = cosine_sim(q, cosine_sim_weights, dictionary, corpus)
        # output the urls in order
        print "query: " + q.query_terms
        boosted_urls = []
        for (s,u) in scored_urls:
            min_dist = u.minimum_body_window(q.query_terms)
            min_dist = min(min_dist, u.minimum_title_window(q.query_terms))
            min_dist = min(min_dist, u.minimum_anchor_window(q.query_terms))
            min_dist -= len(q.query_terms.split())
            # 1/0 isn't a thing
            min_dist += 1
            # maps it in the range [B,1]
            B = 1.0 + window_function(min_dist) * (window_weight - 1)
            boosted_urls.append((s * B, u))
        boosted_urls.sort(reverse=True)
        for (s,u) in boosted_urls:
            print " url: " + u.url
    with open('Weights','w') as f:
        cosine_sim_weights.reverse()