def classifierTest(dataSet,labels):
    # ratio=0.10
    m=dataSet.shape[0]
    neighbour=25
    # numOfTests=int(m*ratio)
    # dataSet,ranges,minVals=normalize(dataSet)
    trainData=loadData.loadTrainingData("u.data")
    # trainData=minor2.test()
    numOfErrors=0
    dictionary={}
    prev=-1
    testD=loadData.loadTrainingData("u1.test")
    for i in range(0,dataSet.shape[0]):
        user=dataSet[i,0]
        movie=dataSet[i,1]
        classifierResult=int(classify0(dataSet[i,:],trainData,neighbour))
        if i==0:
            prev=user
            dictionary[movie]=classifierResult
        else:
            if user!=prev:
                pred_rating.append(dictionary)
                dictionary={}
                dictionary[movie]=classifierResult
                prev=user
            else:
                dictionary[movie]=classifierResult
    if len(dictionary):  # flush the last user's predictions
        pred_rating.append(dictionary)
    i=0;ndcg=0
    prec=0;recall=0
    for l in pred_rating:
        # tuPlus=[]
        tuPlus=0;intersection=0
        prec_u=0;recall_u=0;dcg_u=0;idcg_u=0;ndcg_u=0
        temp=sorted(l.items(),key=operator.itemgetter(1),reverse=True)
        top_n=temp[:neighbour]
        for j in range(testD.shape[1]):
            if testD[i,j]==5:
                tuPlus+=1
                num=1
                for q in top_n:
                    idcg_u+=(1.0/math.log(num+1,2))
                    if q[0]==j+1:
                        dcg_u+=(1.0/math.log(num+1,2))
                        intersection+=1
                    num+=1
        if idcg_u:  # guard against users with no 5-rated items in the test set
            ndcg_u=dcg_u/idcg_u
        print intersection,neighbour,tuPlus
        prec_u=intersection*1.0/neighbour
        if tuPlus:
            recall_u=intersection*1.0/tuPlus
        prec+=prec_u
        recall+=recall_u
        ndcg+=ndcg_u
        i+=1
    prec=prec*1.0/i
    recall=recall*1.0/i
    ndcg=ndcg*1.0/i
    print prec,recall,ndcg
def test(): dataSet=loadData.loadTrainingData("u.data") occupation=occupationLoad() listOfKValues=[8,16,32,64] for x in listOfKValues: centroids,clusterAssignment=kMeans(dataSet,x) print "For Clusters= %d :-"%x testFile="u" avg=0.0 standardDeviation=0.0 numOfTimes=True for i in range(1,6): k1,k2=0,0 testData,testLabel=loadData.loadTestData(testFile+str(i)+".test") m = shape(dataSet)[0] totalError=0 index=0# for test Label no. predictions=[] for t in testData: user,movie=int(t[0])-1,int(t[1])-1 label=testLabel[index] clusterNum=clusterAssignment[user] # cluster number of the user to test userInCluster=[] for i in range(0,m): if clusterAssignment[i]==clusterNum: userInCluster.append(i) sumOfRatings=0 count=0 # number of users who've watched the movie count1=0 for i in userInCluster: if dataSet[i][movie]!=0:# if movie is watched sumOfRatings+=dataSet[i][movie] count+=1 if occupation[i][1]==occupation[user][1]: k1+=dataSet[i][movie] count1+=1 if count==0:# if there is no user in the cluster who've watched the movie ratingsPredicted=3 else: if count1!=0: temp1=around(sumOfRatings/count)# average of the raings.. round-off temp2=around(k1/count1) ratingsPredicted=min(temp2,temp1) else: ratingsPredicted=around(sumOfRatings/count) predictions.append(ratingsPredicted) totalError+=absolute(ratingsPredicted-label) index+=1 meanError=totalError/len(testData) if numOfTimes: print metrics.classification_report(testLabel,predictions) numOfTimes=False # print meanError avg+=meanError predictions=array(predictions) standardDeviationError+=std(predictions) print "Mean Error: ", float(avg)/5 print "Standard Deviation: ",float(standardDeviation)/5 print
def testKMeans(dataSet):
    for x in range(50,61,2):
        centroids,clusterAssignment=kMeans(dataSet,x)
        dataSet=loadData.loadTrainingData("u.data")
        testFile="u"
        avg=0.0
        standardDeviationError=0
        for i in range(1,6):
            testData,testLabel=loadData.loadTestData(testFile+str(i)+".test")
            m = shape(dataSet)[0]
            totalError=0
            index=0  # for test label no.
            predictions=[]
            for t in testData:
                user,movie=int(t[0])-1,int(t[1])-1
                label=testLabel[index]
                clusterNum=clusterAssignment[user]  # cluster number of the user to test
                userInCluster=[]
                for i in range(0,m):
                    if clusterAssignment[i]==clusterNum:
                        userInCluster.append(i)
                sumOfRatings=0
                count=0  # number of users who've watched the movie
                for i in userInCluster:
                    if dataSet[i][movie]!=0:  # if movie is watched
                        sumOfRatings+=dataSet[i][movie]
                        count+=1
                if count==0:  # if there is no user in the cluster who's watched the movie
                    ratingsPredicted=3
                else:
                    ratingsPredicted=around(sumOfRatings/count)  # average of the ratings, rounded off
                predictions.append(ratingsPredicted)
                totalError+=absolute(ratingsPredicted-label)
                index+=1
            meanError=totalError/len(testData)
            # print "Mean Absolute Error: "+str(meanError)
            # print
            avg+=meanError
            # print "Precision And Recall: "
            # print metrics.classification_report(testLabel,predictions)
            # print
            predictions=array(predictions)
            standardDeviationError+=std(predictions)
            # print "Standard Deviation: "+str(standardDeviationError)
            # meanActual=mean(array(testLabel))
            # standardDeviationActual=std(array(testLabel))
            # tValue=(meanActual-meanError)/( sqrt( (((standardDeviationActual)**2)/len(testLabel)) + (((standardDeviationError)**2)/len(predictions)) ) )
            # print "tValue: "+str(tValue)
            # break
        # break
        # print
        print "Mean Absolute Error: "+str(float(avg)/5)
        print
        print "Standard Deviation: "+str(float(standardDeviationError/5))
        print
        break
def testClassifier():
    k=25;m=943;n=1682
    trainData=loadData.loadTrainingData("u.data")
    testData=loadData.loadTrainingData("u1.test")
    trainClassifier(trainData, k)
    ndcg=0;dcg=0;idcg=0;prec=0;recall=0
    for i in range(0,testData.shape[0]):
        # print "inside i loop"
        ndcg_u=0;dcg_u=0;idcg_u=0;tuPlus=0;intersection=0;prec_u=0;recall_u=0
        for j in range(0,testData.shape[1]):
            if testData[i,j]==5:
                # print "inside if"
                num=1
                tuPlus=tuPlus+1
                for n in range (0,k):
                    idcg_u+=(1.0/math.log(num+1,2))
                # print j in topn[i]
                # only as many recommendations survived the test-set filter as top_nrecommendtest[i] holds
                for a in range(0,len(top_nrecommendtest[i])):
                    # if j in topn[i]
                    print j,top_nrecommendtest[i][a]
                    if j==top_nrecommendtest[i][a]:
                        print "inside if"
                        intersection+=1
                        dcg_u+=(1.0/math.log(num+1,2))
                    num+=1
        if idcg_u:  # guard against users with no 5-rated items in the test set
            ndcg_u=dcg_u/idcg_u
        prec_u=1.0*intersection/k
        if tuPlus:
            recall_u=1.0*intersection/tuPlus
        print prec_u,recall_u,ndcg_u
        prec+=prec_u
        recall+=recall_u
        ndcg+=ndcg_u
    ndcg=1.0*ndcg/m
    prec=1.0*prec/m
    recall=1.0*recall/m
    print prec,recall,ndcg
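
# --- Illustrative sketch, not part of the original code. ---
# The per-user metric loops above are hard to follow; the helper below shows an
# equivalent, self-contained way to compute precision, recall and NDCG@k for one
# user from a ranked recommendation list and the set of test items rated 5. It
# uses the standard binary-relevance NDCG definition, which differs slightly from
# the accumulation used above. The names (rank_metrics, ranked_items,
# relevant_items) are hypothetical, not identifiers from this code base.
def rank_metrics(ranked_items, relevant_items, k):
    # ranked_items: item ids ordered best-first; relevant_items: set of relevant item ids
    top_k = ranked_items[:k]
    hits = [item for item in top_k if item in relevant_items]
    dcg = sum(1.0 / math.log(pos + 2, 2)
              for pos, item in enumerate(top_k) if item in relevant_items)
    ideal_hits = min(len(relevant_items), k)
    idcg = sum(1.0 / math.log(pos + 2, 2) for pos in range(ideal_hits))
    precision = len(hits) * 1.0 / k
    recall = len(hits) * 1.0 / len(relevant_items) if relevant_items else 0.0
    ndcg = dcg / idcg if idcg else 0.0
    return precision, recall, ndcg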
def test(): dataSet=loadData.loadTrainingData("u.data") avg=0.0 standardDeviation=0.0 for x in range(1,6): testSet,testLabel=loadData.loadTestData("u"+str(x)+".test") # for i in range(shape(testSet)[0]): testLabel=testLabel[:100] index=0 totalError=0 mTest=0 predictions=[] for t in testSet: user,movie=int(t[0])-1,int(t[1])-1 label=testLabel[index] relation=pearson(dataSet,user) summation=0.0 answer=0.0 count=0 for j in range(0,shape(dataSet)[0]): if user!=j: if dataSet[j,movie]!=0: summation+=((dataSet[j,movie])*relation[0,j]) count+=1 if count==0: answer=3 else: answer=around(summation/(count)) # print answer predictions.append(answer) totalError+=absolute(answer-label) index+=1 mTest+=1 if mTest==100: break # stdDeviation= meanError=float(totalError)/mTest predictions=array(predictions) avg+=meanError standardDeviation+=std(predictions) print metrics.classification_report(testLabel,predictions) # print meanError print print "Mean Absolute Error: "+str(float(avg)/5) print print "Standard Deviation: "+str(float(standardDeviation/5)) print
def classifierTest(dataSet,labels):
    # ratio=0.10
    m=dataSet.shape[0]
    # numOfTests=int(m*ratio)
    # dataSet,ranges,minVals=normalize(dataSet)
    trainData=loadData.loadTrainingData("u.data")
    numOfErrors=0
    for i in range(0,dataSet.shape[0]):
        classifierResult=int(classify0(dataSet[i,:],trainData,25))
        # print classifierResult,labels[i]
        # print
        # print "the classifier came back with: %d, the real answer is: %d"% (classifierResult, labels[i])
        if classifierResult!=labels[i]:
            numOfErrors+=1
    errorRate=numOfErrors*1.0/dataSet.shape[0]
    print errorRate
def test(): dataSet=loadData.loadTrainingData("u.data") u,clusterAssignment=fcm(dataSet,2,8,2) # centroids,clusterAssignment=kMeans(dataSet,x)# 15 clusters print u # return testFile="u" avg=0.0 for i in range(1,6): testData,testLabel=loadData.loadTestData(testFile+str(i)+".test") m = shape(dataSet)[0] totalError=0 index=0# for test Label no. predictions=[] # clusterAssignedCode for t in testData: user,movie=int(t[0])-1,int(t[1])-1 label=testLabel[index] clusterNum=clusterAssignment[user] # cluster number of the user to test userInCluster=[] for i in range(0,m): if clusterAssignment[i]==clusterNum: userInCluster.append(i) sumOfRatings=0 count=0 # number of users who've watched the movie for i in userInCluster: if dataSet[i][movie]!=0:# if movie is watched sumOfRatings+=dataSet[i][movie] count+=1 if count==0:# if there is no user in the cluster who've watched the movie ratingsPredicted=3 else: ratingsPredicted=around(sumOfRatings/count)# average of the raings.. round-off predictions.append(ratingsPredicted) print ratingsPredicted totalError+=absolute(ratingsPredicted-label) index+=1 meanError=totalError/len(testData) # avg+=meanError # print metrics.classification_report(testLabel,predictions) # meanError=totalError/len(testData) print meanError
def test(): dataSet=loadData.loadTrainingData("u.data") for x in range(20,40,2): centroids,clusterAssignment=kMeans(dataSet[:100,:],x,shape(dataSet)[0])# 15 clusters emptyPool=[] for i in range(100,shape(dataSet)[0]): emptyPool.append(i) print centroids return testFile="u" avg=0.0 for i in range(1,6): testData,testLabel=loadData.loadTestData(testFile+str(i)+".test") m = shape(dataSet)[0] totalError=0 index=0# for test Label no. predictions=[] for t in testData: user,movie=int(t[0])-1,int(t[1])-1 label=testLabel[index] clusterNum=clusterAssignment[user] # cluster number of the user to test userInCluster=[] for i in range(0,m): if clusterAssignment[i]==clusterNum: userInCluster.append(i) sumOfRatings=0 count=0 # number of users who've watched the movie for i in userInCluster: if dataSet[i][movie]!=0:# if movie is watched sumOfRatings+=dataSet[i][movie] count+=1 if count==0:# if there is no user in the cluster who've watched the movie ratingsPredicted=3 else: ratingsPredicted=around(sumOfRatings/count)# average of the raings.. round-off predictions.append(ratingsPredicted) totalError+=absolute(ratingsPredicted-label) index+=1 meanError=totalError/len(testData) # print meanError avg+=meanError print print float(avg)/5
def testKMeansForPca(data): dataSet=loadData.loadTrainingData("u1.base") # centroids,clusterAssignment=kMeans(dataSet,15)# 15 clusters testData,testLabel=loadData.loadTestData("u1.test") clf=KMeans(n_clusters=15) clf.fit(data) clusterAssignment=clf.predict(data) m = shape(dataSet)[0] totalError=0 index=0# for test Label no. predictions=[] for t in testData: user,movie=int(t[0])-1,int(t[1])-1 label=testLabel[index] clusterNum=clusterAssignment[user] # cluster number of the user to test userInCluster=[] for i in range(0,m): if clusterAssignment[i]==clusterNum: userInCluster.append(i) sumOfRatings=0 count=0 # number of users who've watched the movie for i in userInCluster: if dataSet[i][movie]!=0:# if movie is watched sumOfRatings+=dataSet[i][movie] count+=1 if count==0:# if there is no user in the cluster who've watched the movie ratingsPredicted=3 else: ratingsPredicted=around(sumOfRatings/count)# average of the raings.. round-off predictions.append(ratingsPredicted) totalError+=absolute(ratingsPredicted-label) index+=1 meanError=totalError/len(testData) print meanError standardDeviation=std(predictions) print standardDeviation
def test(): dataSet=loadData.loadTrainingData("u.data")#mXn data=pca(dataSet,100) testKMeans(data)
def testPCA(): dataSet=loadData.loadTrainingData("u1.base")#mXn pca = PCA(n_components=100) # data=pca(dataSet,100) data=pca.fit_transform(dataSet) testKMeansForPca(data)
def test(): no_grids=0 trainData=loadData.loadTrainingData("u1.base") # testData,testLabels=loadData.loadTestData("u1.test") # SOM(trainData) m=shape(trainData)[0] # print testData # print testLabels # print m # centers=mat(zeros((numClusters,2)) clusters=initClusters() print clusters centers=mat(zeros((numClusters,2))) bmu_of=[0 for m in range(0,users)] radius=4 # print "In test" for i in range (0,radius): for j in range (0,radius): if no_grids<numClusters: centers[no_grids,0]=i centers[no_grids,1]=j # print centers[no_grids,0] # print centers[no_grids,1] # print no_grids+=1 else: break iterations=0 temp_difference=0 difference=[0 for m in range(numClusters)] # print shape(clusters) # for c in range(0,movies): # if clusters[0,c]==5: # print "usahiuhsauifhduisahfuiashfiuhsauifhuishfuisah" # print clusters[0,c] # print clusters[0,0] while radius>=1: for i in range (0,users): print i for j in range (0,numClusters): temp_difference=0 for k in range (0,movies): temp_difference+=(trainData[i,k]-clusters[j,k])*(trainData[i,k]-clusters[j,k]) difference[j]=sqrt(temp_difference) min_difference=10000001 for j in range (0,numClusters): if difference[j]<min_difference: min_difference=difference[j] bmu_of[i]=j currentLearningRate=update_learning_rate(learningRate, iterations, total_iterations) temp_rating=0 for j in range(0,numClusters): distance=math.sqrt((centers[bmu_of[i],0]-centers[j,0])*(centers[bmu_of[i],0]-centers[j,0])+(centers[bmu_of[i],1]-centers[j,1])*(centers[bmu_of[i],1]-centers[j,1])) if distance<radius: newInfluence=update_influence(centers, bmu_of[i], j, radius) netChange=0 for k in range (0,movies): # print "updating" # print j,k # print trainData[i,k] # print clusters[j,k] # print rating_difference=trainData[i,k]-clusters[j,k] netChange=(currentLearningRate*newInfluence*rating_difference) temp_rating=clusters[j,k] temp_rating+=netChange clusters[j,k]=temp_rating if clusters[j,k]>5: clusters[j,k]=5 if clusters[j,k]<1: clusters[j,k]=1 # break # break # break # break print radius iterations+=1 # temp_radius=update_radius(initRadius, time_constant, iterations) # radius=temp_radius radius-=1 # break # break error=0 count=0 fr=open("u1.test") lines=fr.readlines() # testMat=mat(zeros(943,1682)) for line in lines: word = line.split("\t") u=int(word[0])-1 m=int(word[1])-1 r=int(word[2]) # testMat[int(word[0])-1,int(word[1])-1]=int(word[2]) error+=abs(r-clusters[bmu_of[u],m]) count+=1 print error/count
def trainClassifier(trainData,k):
    m=trainData.shape[0]
    n=trainData.shape[1]
    norm_trainData=trainData/5.0  # normalise ratings (float divisor avoids integer division)
    # watched=[0]*943
    watched=zeros((943,1682))
    trans=norm_trainData.T
    model=mat(trans)*mat(norm_trainData)
    final_model=mat(zeros((1682,1682)))
    for i in range(0,m):
        for j in range(0,n):
            if trainData[i,j]==0:
                watched[i,j]=0
            else:
                watched[i,j]=1
    trans_watch=watched.T
    watch_together=mat(trans_watch)*mat(watched)
    # print final_model.shape[0],final_model.shape[1]
    # print model.shape[0],model.shape[1]
    # print watch_together.shape[0],watch_together.shape[1]
    # print model[0,0]
    for i in range (0,n):
        for j in range (0,n):
            if watch_together[i,j]!=0:
                final_model[i,j]=model[i,j]/watch_together[i,j]*1.0
            # else:
            #     final_model[i,j]=0.0
    for i in range(0,n):
        for j in range(0,n):
            if i==j:
                final_model[i,j]=0
    # print final_model
    u=zeros((943,1682))
    for i in range (0,m):
        for j in range (0,n):
            if trainData[i,j]==0:
                u[i,j]=1
            else:
                u[i,j]=0
    trans_u=u.T
    recommend=final_model*trans_u
    recommend=recommend.T
    for i in range (0,943):
        for j in range (0,1682):
            if u[i,j]==0:
                recommend[i,j]=0
    topn=zeros((943,1682),dtype=int)  # store movie indices as ints so they can be used for indexing
    sumi=0
    # print recommend[0]
    # print
    # for i in range (0,n):
    #     if recommend[0,i]!=0:
    #         print i
    #         sumi=sumi+1
    # print sumi
    for i in range (0,m):
        topn[i]=argsort(recommend[i])
    testData=loadData.loadTrainingData("u1.test")
    # print argmax(recommend[0])
    # print testData[0,argmax(recommend[0])]
    # for i in range (n-1,0,-1):
    #     print topn[0,i],testData[0,topn[0,i]]
    # print sumi
    # print k
    # print topn
    topn_recommendation=zeros((943,k),dtype=int)
    for i in range (0,m):
        topn_recommendation[i]=topn[i,n-k:]
    for i in range(0,topn_recommendation.shape[0]):
        temp=[]
        for j in range(0,len(topn_recommendation[i])):
            if(testData[i,topn_recommendation[i,j]]!=0):
                temp.append(topn_recommendation[i,j])
        top_nrecommendtest.append(temp)
    # testData=loadData.loadTrainingData("u1.test")
    # sum1=0
    # sum2=0
    # for i in range(0,trainData.shape[1]):
    #     if trainData[0,i]!=0:
    #         sum1=sum1+1
    # for i in range(0,testData.shape[1]):
    #     if testData[0,i]!=0:
    #         sum2=sum2+1
    # print sum1,sum2,sum1+sum2
    # print topn_recommendation
    # for i in range (0,k):
    #     print testData[0,topn_recommendation[0,i]]
    # print topn_recommendation
    return topn_recommendation
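
# --- Illustrative sketch, not part of the original code. ---
# The element-wise loops above that build the item-item model can be expressed
# with numpy array operations. The sketch below reproduces the same construction
# (co-rating dot products divided by co-watch counts, diagonal zeroed), assuming
# trainData is a dense users x movies ratings array with 0 meaning "not rated".
# The name build_item_model_sketch is hypothetical.
import numpy as np

def build_item_model_sketch(trainData):
    R = np.asarray(trainData, dtype=float) / 5.0      # normalised ratings
    watched = (R != 0).astype(float)                  # 1 where a rating exists
    model = R.T.dot(R)                                # movie x movie co-rating dot products
    watch_together = watched.T.dot(watched)           # movie x movie co-watch counts
    final_model = np.divide(model, watch_together,
                            out=np.zeros_like(model),
                            where=watch_together != 0)
    np.fill_diagonal(final_model, 0)                  # ignore self-similarity
    return final_model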
def test(): trainData=loadData.loadTrainingData("u.data") # trainClassifier(trainData, 25) testClassifier()
def test(): dataSet=loadData.loadTrainingData("u1.base")#mXn data=pca(dataSet,200) testKMeans(data)# mX200
def test(): dataSet=loadData.loadTrainingData("u.data") for clu in range(74,75,2): avg=0.0 standardDeviation=0.0 for te in range(1,6): testData,testLabel=loadData.loadTestData("u"+str(te)+".test") clusters,emptyPool,meanList=initialization(dataSet,clu) while len(emptyPool): randVar=random.randint(0,len(emptyPool)-1) user=emptyPool[randVar] randNes=random.randint(0,len(clusters)-1) mae=float(sum(abs(dataSet[user,:]-meanList[randNes])))/1682 mini=100000000 count=0 threshold=int(0.3*len(clusters[randNes])) minPerson=-1 for i in range(0,len(clusters[randNes])): mae=float(sum(abs(dataSet[clusters[randNes][i][0],:]-meanList[randNes])))/1682 if mae<mini: count+=1 mini=mae minPerson=clusters[randNes][i][0] if count: if count>=threshold: for c in clusters[randNes]: if c[0]==minPerson: q=clusters[randNes].index(c) for t in range(0,1682): add=(meanList[randNes][t]*len(meanList[randNes]))-dataSet[minPerson,t] add=add/(len(meanList[randNes])-1) meanList[randNes][t]=add del(clusters[randNes][q]) emptyPool.append(minPerson) clusters[randNes].append([user,dataSet[user,:]]) ind=emptyPool.index(user) for t in range(0,1682): add=(meanList[randNes][t]*len(meanList[randNes]))+dataSet[user,t] add=add/(len(meanList[randNes])+1) meanList[randNes][t]=add del(emptyPool[ind]) # var+=1 summation=0 for c in clusters: # print len(c) summation+=len(c) # print summation totalError=0 predictions=[] m = shape(dataSet)[0] index=0 for t in testData: user,movie=int(t[0])-1,int(t[1])-1 # print user,movie label=testLabel[index] check=False for i in range(0,len(clusters)): for j in range(0,len(clusters[i])): if clusters[i][j][0]==user: count =0 tum=0.0 for k in range(0,len(clusters[i])): if dataSet[clusters[i][k][0],movie]!=0: count+=1 tum+=dataSet[clusters[i][k][0],movie] if count!=0: tum=tum/count check=True if check: break if check: break predictions.append(tum) totalError+=absolute(tum-label) index+=1 meanError=totalError/len(testData) print "Precision And Recall: " print shape(testLabel) print metrics.classification_report(testLabel,predictions) # print return predictions=array(predictions) standardDeviation+=std(predictions) # print standardDeviation # print meanError avg+=meanError # break # break print "Standard Deviation: "+str(standardDeviation)
lines=f.readlines()
l=[]
for line in lines:
    l.append([float(v) for v in line.split("|")[5:]])
# print l
# print l[1]
for i in range(len(l)):
    count=0
    for j in range(len(l[i])):
        if l[i][j]==1:
            count+=1
    for j in range(len(l[i])):
        if l[i][j]==1:
            l[i][j]/=count
# print l[1]
trainData=loadData.loadTrainingData("u.data")
alpha=0.01
# thres=0.001
theta=mat(zeros((943,19)))
# while True:
print shape(mat(l))
# return
for i in range(10):
    for k in range(19):
        old=theta[i,k]
        thres=0.1
        sumi=0
        # T=2
        while True:
            for j in range(1682):
                if trainData[i,j]:
def test(): trainData = loadData.loadTrainingData("u.data") # create_pref_model(trainData) pref_matrix = create_pref_model(trainData) # print pref_matrix return pref_matrix
def test(): dataSet=loadData.loadTrainingData("u1.base") testSet,testLabel=loadData.loadTestData("u1.test") som(dataSet)