def _print_scores(result):
    """Print two figures derived from one PredictAndAnalyze result on one line.

    The first figure is ``accuracy_score(result[0], result[1])`` and the
    second is ``1.0 - accuracy_score(result[2], result[3])``.
    NOTE(review): presumably these are a train accuracy and a held-out error
    rate -- confirm against the PredictAndAnalyze implementation.
    """
    first = accuracy_score(result[0], result[1])
    second = 1.0 - accuracy_score(result[2], result[3])
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("%s %s" % (first, second))


def valuate(mincount2, maxcount2, PredictAndAnalyze=PredictAndAnalyze2):
    """Fit several classifiers on two feature matrices and print their scores.

    Two feature matrices are built: ``data2`` (from the per-video vectors
    returned by ``createvector``) and ``tfidfdata`` (from per-document vectors
    returned by ``maketfidfvec`` on a fitted ``TfidfVectorizer``).  Logistic
    regression and LinearSVC are run on both; the linear-kernel SVC is run on
    ``data2`` only.  Every run prints one score line.

    Args:
        mincount2: Forwarded as ``mincount`` to ``createvector`` and as the
            third argument of ``makeTfidfTextList``.
        maxcount2: Forwarded as ``maxcount`` to ``createvector`` and as the
            first argument of the matrix/target builders.
            NOTE(review): it is passed in the position where a lower bound
            would be expected -- confirm this is intentional.
        PredictAndAnalyze: Callable invoked both as ``f(data, target, clf_cv=...)``
            and as ``f(data=..., target=..., clf_cv=...)``; its result must be
            indexable at positions 0..3.

    Returns:
        Tuple ``(k2, k0, k1)``: the results on ``data2`` for logistic
        regression, linear-kernel SVC and LinearSVC respectively.  The TF-IDF
        results are printed but not returned.
    """
    # Local import: only needed for the newline-free progress write below.
    import sys

    # Second positional argument of every matrix/target builder.
    # NOTE(review): presumably an "effectively unbounded" upper limit -- confirm.
    upper = 100000000

    vectorinfo = {}
    for ID in ("0000", "0001", "0002", "0003", "0004", "0005", "0006"):
        vectorinfo[ID] = {}
        for j in textinfo[ID].keys():
            try:
                vectorinfo[ID][j] = createvector(
                    video_id=j, ID=ID, mincount=mincount2, maxcount=maxcount2)
            except Exception:
                # Best effort: an entry whose vector cannot be built is
                # reported and left out of vectorinfo.  Catching Exception
                # (not a bare except) lets Ctrl-C / SystemExit propagate.
                print("%s %s" % (ID, j))

    target2 = createtargetarray(maxcount2, upper, 10760.0, 34544)
    data2 = createtvectorMat(maxcount2, upper, vectorinfo)

    (TfidfTextList, word2freqlist) = makeTfidfTextList(
        maxcount2, upper, mincount2, maxcount2)
    tfidf = TfidfVectorizer(tokenizer=tokenize)
    # Materialise keys/values as lists so row n of tfs lines up with idlist[n]
    # and indexing works on Python 3 as well as Python 2.
    idlist = list(TfidfTextList.keys())
    tfs = tfidf.fit_transform(list(TfidfTextList.values()))
    feature_names = tfidf.get_feature_names()

    # Row count straight from the sparse matrix; no dense copy is needed.
    sample = tfs.shape[0]
    # Written without a newline so the min/max counts printed further down
    # land on the same output line, as the original trailing-comma print did.
    sys.stdout.write("%s %s " % (sample, len(feature_names)))

    tfidfvectorinfo = {}
    for n in range(sample):
        tfidfvectorinfo[idlist[n]] = maketfidfvec(
            n, feature_names=feature_names, tfs=tfs, idlist=idlist,
            word2freqlist=word2freqlist)
    tfidfdata = createtfidfvectorMat(maxcount2, upper, tfidfvectorinfo)

    print("%s %s" % (mincount2, maxcount2))

    print("logreg")
    k2 = PredictAndAnalyze(
        data2, target2, clf_cv=linear_model.LogisticRegression(C=1e1))
    _print_scores(k2)
    k22 = PredictAndAnalyze(
        data=tfidfdata, target=target2,
        clf_cv=linear_model.LogisticRegression(C=1e1))
    _print_scores(k22)

    print("svm")
    k0 = PredictAndAnalyze(
        data=data2, target=target2,
        clf_cv=svm.SVC(kernel='linear', probability=True))
    _print_scores(k0)

    print("LinearSVM")
    k1 = PredictAndAnalyze(data=data2, target=target2, clf_cv=svm.LinearSVC())
    _print_scores(k1)
    k00 = PredictAndAnalyze(
        data=tfidfdata, target=target2, clf_cv=svm.LinearSVC())
    _print_scores(k00)

    return k2, k0, k1
k1 = PredictAndAnalyze(data2,target2,clf_cv = neighbors.KNeighborsClassifier(n_neighbors=10)) k2 = PredictAndAnalyze(data2,target2,clf_cv =linear_model.LogisticRegression(C=1e1)) l = {} for ID in ["0000","0001","0002","0003"]: l[ID] = vectorinfo[ID].keys() (TfidfTextList, word2freqlist) = makeTfidfTextList(100,100000000) tfs = tfidf.fit_transform(TfidfTextList.values()) idlist = TfidfTextList.keys() tfidfvectorinfo = {} sample = tfs.toarray().shape[0] for n in range(0,sample): tfidfvectorinfo[idlist[n]] = maketfidfvec(n,100,100000000) tfidfdata = createtfidfvectorMat(100,100000000) k2 = PredictAndAnalyze(data = tfidfdata,target = target2,clf_cv =linear_model.LogisticRegression(C=1e5)) k0 = PredictAndAnalyze(data = tfidfdata,target = target2,clf_cv = svm.SVC(kernel='linear', probability=True)) #コメント数毎で比較 tfidf = TfidfVectorizer(tokenizer=tokenize) for narray in range(2,10): target2 = createtargetarray(narray * 100 - 100,narray * 100 + 100,10000,30000) data2 = createtvectorMat(narray * 100 - 100,narray * 100 + 100) (TfidfTextList, word2freqlist) = makeTfidfTextList(narray * 100 - 100,narray * 100 + 100) tfs = tfidf.fit_transform(TfidfTextList.values()) idlist = TfidfTextList.keys() tfidfvectorinfo = {} sample = tfs.toarray().shape[0] for n in range(0,sample):