def main(): cl = docclass.classifier(docclass.getwords) cl.setdb('test1.db') docclass.sampletrain(cl) print cl.fprob('quick', 'good') print cl.weighted_prob('money', 'good', cl.fprob) docclass.sampletrain(cl) print cl.weighted_prob('money', 'good', cl.fprob) clnb = docclass.naivebayes(docclass.getwords) clnb.setdb('test1.db') docclass.sampletrain(clnb) print clnb.prob('quick rabbit', 'good') print clnb.prob('quick rabbit', 'bad') print clnb.classify('quick rabbit', default='unknown') print clnb.classify('quick money', default='unknown') clnb.setthreshold('bad', 3.0) print clnb.classify('quick money', default='unknown') clfs = docclass.fisherclassifier(docclass.getwords) clfs.setdb('test1.db') docclass.sampletrain(clfs) print clfs.cprob('quick', 'good') print clfs.cprob('money', 'bad') print clfs.weighted_prob('money', 'bad', clfs.cprob) print clfs.fisherprob('quick rabbit', 'good') print clfs.fisherprob('quick rabbit', 'bad') print clfs.classify('quick rabbit') print clfs.classify('quick money') clfs2 = docclass.fisherclassifier(docclass.getwords) clfs2.setdb('test1.db') feedclassifier('feed_sample2.rss', clfs2) print clfs2.cprob('Pandas', 'python') print clfs2.cprob('python', 'python')
def testProb(self):
    """Check Fisher per-word and whole-document probabilities on the sample set."""
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias
    # (removed in Python 3.12); behavior is identical.
    self.assertAlmostEqual(0.57142857, cl.cprob('quick', 'good'))
    self.assertAlmostEqual(0.78013987, cl.fisherprob('quick rabbit', 'good'))
    self.assertAlmostEqual(0.35633596, cl.fisherprob('quick rabbit', 'bad'))
def train_fisher(train_path, test_path): cl = docclass.fisherclassifier(docclass.getwords) for filename in glob.glob(train_path): with open(filename, 'r') as f: f = f.read() label = filename.split('.')[3] cl.train(f, label) print "Train Done!" TP = 0.0 FN = 0.0 FP = 0.0 TN = 0.0 for filename in glob.glob(test_path): with open(filename, 'r') as f: f = f.read() label = filename.split('.')[3] predict = cl.classify(f) if label == 'spam' and predict == 'spam': TP += 1 elif label == 'spam' and predict == 'ham': FN += 1 elif label == 'ham' and predict == 'spam': FP += 1 elif label == 'ham' and predict == 'ham': TN += 1 else: print predict, label print "Test Done!" score(TP, FN, FP, TN)
def test_fisher_weightedprob():
    """Smoke-test the Fisher classifier's weighted probability (expected 0.75)."""
    sys.stderr.write("testing computation of fisher weightedprob...\n")
    reload(docclass)
    classifier = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(classifier)
    weighted = classifier.weightedprob('money', 'bad', classifier.cprob)
    sys.stdout.write("%f\n" % (weighted))  # 0.75
def main(): # table = string.maketrans("","") # s = '从满脸痘痘到细腻皮肤的蜕变,大S及皮肤科医生都推荐的修复面膜,[ 围观]解决皮肤的多种问题~点击查看详情:http://t.cn/zHFnve4' # for x in getWords(s): # print x # s.translate(table, string.punctuation+extra_punctuation) # regxs = {r'\[\S+?\]': ''} # for key,value in regxs.items(): # print key, value # with open("test.txt", "r") as f: # consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ') # print consumer_key,consumer_secret,key,secret,userid # run_crawler(consumer_key,consumer_secret,key,secret,'1986653865') # weibo = Sina_master(consumer_key,consumer_secret) # weibo.setToken(key, secret) # weibo.manage_access() # info = weibo.get_latest_weibo(count=5, user_id=userid) # reptile(sina_reptile,userid) # sina_reptile.connection.close() # for x in info: # print x # print x['geo']['city'] # print x['text'] # words =getWords(x['text']) # # print x['text'] cl = docclass.fisherclassifier(docclass.getWords) cl.setdb('statuses.db') # print cl.cprob('幸福', 'test') # print cl.fisherprob('幸福', 'test') # cl.train(x, 'test;up;kill;volite') dic =cl.classifypercent(input_) print sorted(dic.items(), key=lambda e:e[1], reverse=True)
def analyzeResults(settings, resuls):
    """Print one LaTeX table row per classified entry: title, predicted
    category, actual category and the Fisher probability of the actual category.

    resuls maps an arbitrary key to {'guid': ..., 'category': ...}; entry
    details are looked up in the feeds table of settings['database'].
    """
    classifier = fisherclassifier(getwords)
    classifier.setdb(settings["database"])
    for resul in resuls.keys():
        guid = resuls[resul]["guid"]
        # Parameterised query instead of str.format: avoids SQL injection and
        # quoting bugs when the guid contains a single quote.
        row = classifier.con.execute(
            "SELECT actualcategory,description,title FROM feeds WHERE guid=?",
            (guid,)).fetchone()
        actCatgory = row[0]
        descrip = row[1]
        fProb = classifier.fisherprob(descrip, actCatgory)
        predictedCategory = resuls[resul]["category"]
        # ASCII-fold the title and escape '&' for LaTeX output.
        tit = unicodedata.normalize("NFKD", row[2]).encode("ascii", "ignore")
        tit = tit.replace("&", "\&")
        print("{0} & {1} & {2} & {3} \\\\\\hline".format(tit, predictedCategory, actCatgory, fProb))
def guess_the_prof(self):
    """Guess the selected professor's department with the configured classifier.

    Trains on the other professors' documents, applies the 'score-category'
    thresholds listed in the UI listbox, stores the prediction in self.pdep
    and displays it via self.verdict().
    """
    prof_sel = self.box.get()  # professor whose department we want to guess
    doc_of_prof = self.profs_data[prof_sel]

    # Parse "score-category" listbox items once (the original duplicated this
    # parsing loop in both branches).
    thresholds = []
    for item in self.lb.get(0, END):
        merged = item.split('-')
        thresholds.append((merged[1], float(merged[0])))

    if self.method == 'naive':
        cl = docclass.naivebayes(docclass.getwords)
        self.trainer(prof_sel, cl)
        for thr, num in thresholds:
            cl.setthreshold(thr, num)  # naive Bayes uses odds thresholds
    else:
        cl = docclass.fisherclassifier(docclass.getwords)
        self.trainer(prof_sel, cl)
        for thr, num in thresholds:
            cl.setminimum(thr, num)  # Fisher uses per-category minimums
    self.pdep = cl.classify(doc_of_prof, default='unknown')
    self.verdict()
def train3():
    """Build test3.db by replaying the sample training data 2000 times."""
    import docclass as docclass
    fisher = docclass.fisherclassifier(docclass.getwords)
    fisher.setdb('test3.db')
    for _ in range(2000):
        docclass.sampletrain(fisher)
    # Persist all accumulated counts in one commit.
    fisher.con.commit()
def test_fisher_cprob():
    """Print Fisher cprob values for two known word/category pairs."""
    sys.stderr.write("testing computation of fisher cprob...\n")
    reload(docclass)
    classifier = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(classifier)
    # Expected output: 0.57142857... then 1.0.
    for word, category in (('quick', 'good'), ('money', 'bad')):
        sys.stdout.write("%f\n" % (classifier.cprob(word, category)))
def myFisherModelInTrainingAndTesting(
    trainingInputFileName, entriesXMLFileName, dbFileName, mode, maxItems, getWordGetEntryMethod="getWord"
):
    """Train or test a Fisher model over an entries XML file.

    Does nothing unless mode is 'train'/'test', the method is
    'getWord'/'getEntry', all file names are non-empty and maxItems > 0.
    """
    # Guard clauses replace the original nested-if pyramid.
    if mode not in ("train", "test"):
        return
    if getWordGetEntryMethod not in ("getWord", "getEntry"):
        return
    if len(trainingInputFileName) == 0 or len(entriesXMLFileName) == 0:
        return
    if len(dbFileName) == 0 or maxItems <= 0:
        return
    if getWordGetEntryMethod == "getWord":
        extractor = docclass.getwords
    else:
        extractor = feedfilter.entryfeatures
    cl = docclass.fisherclassifier(extractor)
    cl.setdb(dbFileName)
    feedfilter.nonInteractiveRead(
        entriesXMLFileName, cl, trainingInputFileName, mode, maxItems, getWordGetEntryMethod
    )
def test_fisher_fisherprob():
    """Print cprob and fisherprob values for the sample-trained Fisher classifier."""
    sys.stderr.write("testing computation of fisher fisherprob...\n")
    reload(docclass)
    classifier = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(classifier)
    # Single-word conditional probability.
    sys.stdout.write("%f\n" % (classifier.cprob('quick', 'good')))  # 0.57142857...
    # Whole-document Fisher probabilities.
    sys.stdout.write("%f\n" % (classifier.fisherprob('quick rabbit', 'good')))  # 0.780139
    sys.stdout.write("%f\n" % (classifier.fisherprob('quick rabbit', 'bad')))  # 0.356335
def testClassify(self):
    """Classification of 'quick money' flips as the 'bad' minimum changes."""
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    docclass.sampletrain(cl)
    # assertEqual replaces the deprecated assertEquals alias
    # (removed in Python 3.12); behavior is identical.
    self.assertEqual('good', cl.classify('quick rabbit', default='unknown'))
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
    cl.setminimum('bad', 0.8)
    self.assertEqual('good', cl.classify('quick money', default='unknown'))
    cl.setminimum('bad', 0.4)
    self.assertEqual('bad', cl.classify('quick money', default='unknown'))
def myFisherModelInTrainingAndTesting(trainingInputFileName, entriesXMLFileName,
                                      dbFileName, mode, maxItems,
                                      getWordGetEntryMethod='getWord'):
    """Run a Fisher model in 'train' or 'test' mode over an entries XML file.

    Silently ignores calls with an unsupported mode/method, an empty file
    name, or a non-positive maxItems.
    """
    valid_mode = mode in ('train', 'test')
    valid_method = getWordGetEntryMethod in ('getWord', 'getEntry')
    if valid_mode and valid_method:
        if (len(trainingInputFileName) > 0 and len(entriesXMLFileName) > 0
                and len(dbFileName) > 0 and maxItems > 0):
            # Pick the feature extractor matching the requested method.
            feature_fn = (docclass.getwords if getWordGetEntryMethod == 'getWord'
                          else feedfilter.entryfeatures)
            cl = docclass.fisherclassifier(feature_fn)
            cl.setdb(dbFileName)
            feedfilter.nonInteractiveRead(entriesXMLFileName, cl,
                                          trainingInputFileName, mode,
                                          maxItems, getWordGetEntryMethod)
def train_classifier(settings, trainingData):
    """Mark the training entries as classified in the feed database, then train
    a Fisher classifier on each entry's description/category pair.

    trainingData maps entry keys to dicts with 'description' and 'category'.
    """
    database = FeedDatabase(settings['database'])
    for key in trainingData.keys():
        database.change_classified(key, classified=True)
    database.close_database()

    classifier = fisherclassifier(getwords)
    classifier.setdb(settings['database'])
    size = len(trainingData)
    # enumerate from 1 so the progress message reads (1/size)..(size/size)
    # instead of the original off-by-one display (0/size)..(size-1/size).
    for counter, key in enumerate(trainingData.keys(), 1):
        sys.stderr.write('...Training ({0}/{1})...\n'.format(counter, size))
        classifier.train(trainingData[key]['description'], trainingData[key]['category'])
    sys.stderr.write('...Finished Training Classifier\n')
def Fisher_prediction(self):
    """Predict the selected professor's department with a Fisher classifier."""
    classifier = docclass.fisherclassifier(docclass.getwords)
    selected = self.combovar.get()
    # Train on every OTHER professor's document, labelled by department.
    for category in self.list_of_department:
        for teacher in self.dictionary_of_department_and_professor[category]:
            if teacher != selected:
                classifier.train(self.dictionary_as_database[teacher], category)
    # Apply the configured per-department minimum scores.
    for department, score in self.list_of_thresholds:
        classifier.setminimum(department, score)
    prediction = classifier.classify(self.dictionary_as_database[selected],
                                     default=None)
    self.help_to_write(prediction)
def test_fisher_classify():
    """Classify sample documents, then show how setminimum changes the result."""
    sys.stderr.write("testing fisher classification...\n")
    reload(docclass)
    classifier = docclass.fisherclassifier(docclass.getwords)
    docclass.sampletrain(classifier)
    sys.stdout.write("%s\n" % (classifier.classify('quick rabbit')))  # 'good'
    sys.stdout.write("%s\n" % (classifier.classify('quick money')))  # 'bad'
    # Raise the minimum required probability for the 'bad' category.
    classifier.setminimum('bad', 0.8)
    sys.stdout.write("%s\n" % (classifier.classify('quick money')))  # 'good'
    # And a minimum for 'good'.
    classifier.setminimum('good', 0.4)
    sys.stdout.write("%s\n" % (classifier.classify('quick money')))  # 'good'
def main(): cl = docclass.fisherclassifier(docclass.getWords) cl.setdb('statuses.db') with open("test.txt", "r") as f: consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ') weibo = Sina_master(consumer_key,consumer_secret) weibo.setToken(key, secret) info = weibo.get_latest_weibo(count=50, user_id="1906168267")#1986653865 #1794530900 with open(cache_file, 'w') as f: for x in info: p= cl.classifypercent(x['text']) f.write(json.dumps(p)+"\n") with open(cache_file, "rb") as f: dic= user_line(f.readlines()) print sorted(dic.items(), key=lambda e:e[1], reverse=True) os.remove(cache_file)
def classifyEntries(settings):
    """Classify every unpredicted feed entry; return [{'guid', 'category'}, ...].

    After each entry is classified, all results so far are checkpointed to
    script50.txt (guid|category per line) so progress survives a crash.
    """
    database = FeedDatabase(settings['database'])
    unclassifiedEntries = database.get_unpredicted_entries()
    database.close_database()
    classifier = fisherclassifier(getwords)
    classifier.setdb(settings['database'])
    counter = 0
    size = len(unclassifiedEntries)
    results = []
    for entr in unclassifiedEntries:
        category = classifier.classify(entr['description'])
        results.append({'guid': entr['guid'], 'category': category})
        counter += 1
        sys.stderr.write('...Classified {0} of {1} entries\n'.format(counter, size))
        # Checkpoint AFTER appending: the original rewrote the file before
        # classifying, so the newest result (and the final entry) was never
        # saved. 'with' also guarantees the handle is closed each iteration.
        with open('script50.txt', 'w+') as checkpoint:
            for i in results:
                checkpoint.write('{0}|{1}\n'.format(i['guid'], i['category']))
    # Returning the list is new but backward-compatible (was implicit None).
    return results
# NOTE(review): top of a feed-classification script (imports, Fisher classifier
# setup, loading 'correct.dat' answers) collapsed onto one line and cut off
# mid-loop; kept byte-identical because the missing loop body is not visible.
#!/usr/bin/python import feedfilter import docclass import feedparser import randomGram import unidecode #docclass.getwords("this") myclassifier = docclass.fisherclassifier(docclass.getwords) myclassifier.setdb('CSblogfeed.db') #feedfilter.read('CSblogfeed.xml',myclassifier) #pull in correct answers file = open('correct.dat', 'r') correctf = file.read() file.close() correct = correctf.split('\n') #for line in correct: # print line #del correct[-1] #print len(correct) #print type(correct) #print correctf #for item in correctf: # print item f = feedparser.parse('CSblogfeed.xml') count = 0 for entry in f['entries']: #train using 50 if count != 90: # print count," ",correct[count]," ",entry['title']
def main():
    """Interactively read allsports.xml into a Fisher classifier backed by allsports3.db."""
    classifier = docclass.fisherclassifier(docclass.getwords)
    classifier.setdb('allsports3.db')
    read('allsports.xml', classifier)
import docclass as d cl = d.fisherclassifier(d.getwords) d.sampletrain(cl) print cl.classify('quick rabbit') print cl.classify('quick money') cl.setminimum('bad', 0.8) print cl.classify('quick money') cl.setminimum('good', 0.4) print cl.classify('quick money') for i in range(10): d.sampletrain(cl) print cl.classify('quick money')
# NOTE(review): tail of an interactive feed-reading loop (prints each entry,
# asks for a category via raw_input, trains the classifier) plus the script
# entry point; begins mid-function, so the code is kept byte-identical.
f = feedparser.parse(feed) for entry in f['entries']: print print '----' print 'Title: ' + entry['title'].encode('utf-8') print 'Publisher: ' + entry['publisher'].encode('utf-8') print print entry['summary'].encode('utf-8') fulltext = '%s\n%s\n%s' % (entry['title'], entry['publisher'], entry['summary']) #print 'Guess: ' + str(classifier.classify(fulltext)) #cl = raw_input('Enter category: ') #classifier.train(fulltext, cl) print 'Guess: ' + str(classifier.classify(entry)) cl = raw_input('Enter category: ') classifier.train(entry, cl) if __name__ == '__main__': import docclass #cl = docclass.fisherclassifier(docclass.getwords) cl = docclass.fisherclassifier(entryfeatures) cl.setdb('python_feed.db') read('python_search.xml', cl)
def main():
    """Interactively classify entries from my_data.xml into dpaladhi.db."""
    classifier = docclass.fisherclassifier(docclass.getwords)
    classifier.setdb('dpaladhi.db')
    read('my_data.xml', classifier)
# NOTE(review): begins mid-function (tail of a train/classify loop whose
# enclosing def is not visible), then defines readVector() and runs an SVM
# cross-validation over blog data; kept byte-identical because the leading
# fragment cannot be reconstructed safely.
fisherclassifier.train(fulltext,temp) else: value1 = str(fisherclassifier.classify(fulltext)) print(value1) actual.append(int(temp)) print() return actual def readVector(filename): lines=[] for line in open(filename): lines.append(line) colnames=lines[0].strip().split('\t')[1:] rownames=[] data=[] for line in lines[1:]: p=line.strip().split('\t') rownames.append(p[0]) data.append([float(x) for x in p[1:]]) return rownames,colnames,data c2=docclass.fisherclassifier(docclass.getwords) blognames,words,data=readVector('blogdata1.txt') Yvalue = readfile("http://superchicken46.blogspot.com/feeds/posts/default?max-results=100&alt=rss", c2) X_digits = np.array(data) Y_digits = np.array(Yvalue) clf = svm.SVC(kernel='linear', C=10) clf.fit(X_digits, Y_digits) scores = cross_validation.cross_val_score(clf, X_digits, Y_digits, cv = 10) print(scores.mean()) for i in scores: print("Value:", i)
# NOTE(review): tail of an entry-feature-extraction function plus the script
# entry point; begins mid-function, so code tokens are kept byte-identical.
# Only the Chinese line comments were translated to English.
summarywords = [s.lower() for s in splitter.split(entry['summary']) if len(s) > 2 and len(s) < 20] # count uppercase words uc = 0 for i in range(len(summarywords)): w = summarywords[i] f[w] = 1 if w.isupper(): uc += 1 # use word pairs from the summary as features if i < len(summarywords)-1: twowords = ' '.join(summarywords[i:i+1]) f[twowords] = 1 # keep creator and publisher names intact f['Publisher:' + entry['publisher']] = 1 # UPPERCASE is a "virtual" word flagging excessive uppercase content if float(uc) / len(summarywords) > 0.3: f['UPPERCASE'] = 1 return f if __name__ == '__main__': # cl = docclass.fisherclassifier(docclass.getwords) cl = docclass.fisherclassifier(entryfeatures) cl.setdb('python_feed.db') feedread('python_feed.xml', cl)
def main(): cl = docclass.fisherclassifier(docclass.getwords) cl.setdb('bbokka.db') print "testing the program" feedfilter.read('test.xml', cl)
def testOneCategory(self):
    """A classifier trained on a single category always returns that category."""
    cl = docclass.fisherclassifier(docclass.getwords)
    cl.setdb('test.db')
    cl.train('hallo hallo', 'greeting')
    # assertEqual replaces the deprecated assertEquals alias
    # (removed in Python 3.12); behavior is identical.
    self.assertEqual('greeting', cl.classify('hallo world'))
# NOTE(review): begins mid-function (body of an entryfeatures-style extractor),
# followed by a dead triple-quoted block and two top-level statements; code
# tokens kept byte-identical. Only the Chinese line comments were translated;
# the triple-quoted string literal is untouched.
for i in range(len(summarywords)): w = summarywords[i] f[w] = 1 if w.isupper(): uc += 1 # use word pairs from the summary as features if i < len(summarywords) - 1: twowords = ' '.join(summarywords[i:i + 1]) f[twowords] = 1 # keep creator and publisher names intact f['Publisher:' + entry['publisher']] = 1 # UPPERCASE is a "virtual" word flagging excessive uppercase content if float(uc) / len(summarywords) > 0.3: f['UPPERCASE'] = 1 ''' # 将当前分类的最佳推测结果打印输出 print 'Guess: ' + str(classifier.classify(entry)) # 请求用户给出正确分类, 并据此进行训练 c1 = raw_input('Enter category: ') classifier.train(entry, c1) ''' return f c1 = fisherclassifier(entryfeatures) read('../data/python_search.xml', c1)
# NOTE(review): begins mid-function (tail of an entryfeatures-style extractor)
# plus the script entry point that runs both feature extractors; code tokens
# kept byte-identical, Chinese line comments translated to English.
uc = 0 for i in range(len(summarywords)): w = summarywords[i] features[w] = 1 if w.isupper(): uc += 1 # use word pairs from the summary as features if i < len(summarywords) - 1: twowords = ' '.join(summarywords[i:i + 1]) features[twowords] = 1 # keep creator and publisher names intact features['Publisher:' + entry['publisher']] = 1 # UPPERCASE is a "virtual" word flagging excessive uppercase content if float(uc) / len(summarywords) > 0.3: features['UPPERCASE'] = 1 return features if __name__ == "__main__": # runs only when this module is executed directly # classify and train on blog posts cl = docclass.fisherclassifier(docclass.getwords) cl.setdb('python_feed.db') read('python_search.xml', cl) # classify posts using the improved feature extractor cl = docclass.fisherclassifier(entryfeatures) cl.setdb('python_feed.db') read('python_search.xml', cl)
# NOTE(review): start of a web.py application (URL routes, credential loading,
# classifier and weibo client setup) collapsed onto one line and cut off inside
# Index.GET; kept byte-identical because the class body continues out of view.
urls = ( '/api/mining', 'Mining', '/api/pro', 'Resouce', '/api/traindata', 'Traindata', '/.*' , 'Index', ) with open("test.txt", "r") as f: consumer_key,consumer_secret,key,secret,userid = f.readlines()[0].strip().split(' ') render = web.template.render('templates') app = web.application(urls, globals()) db = web.database(dbn="sqlite", db=conf.db_name) cl = docclass.fisherclassifier(docclass.getWords) cl.setdb(conf.db_name) weibo = Sina_master(consumer_key,consumer_secret) weibo.setToken(key, secret) res = open(conf.pro_path).read() class Index: def GET(self): i = web.input(pageIndex=1, pageSize=5) '''select id, text from statuses order by id limit ? , ?''' posts = db.query('select id, text from statuses where status=0 order by id limit $pageIndex , $pageSize', \ vars={'pageIndex': (int(i.pageIndex)-1)*int(i.pageSize), 'pageSize': i.pageSize}) count = db.select('statuses', what='count(*) total_num', where=' status=$status', vars={'status': 0}) # print 'val:%d'%int(count.c) total_num = count[0].total_num c = total_num/int(i.pageSize)
def testingModel(dictionaryOfTitleAndClass):
    """Evaluate the stored Fisher model against politics_search2.xml titles."""
    classifier = docclass.fisherclassifier(docclass.getwords)
    classifier.setdb('politics_feed.db')  # Only if you implemented SQLite
    feedfilter.readNonInteractiveTesting(dictionaryOfTitleAndClass,
                                         'politics_search2.xml', classifier)
def main(): cl=docclass.fisherclassifier(docclass.getwords) cl.setdb('smajeti.db') print "testing the program" feedfilter.read('toiEntertainment.xml',cl)
# NOTE(review): start of a spam/ham evaluation script (classifier setup, a dead
# triple-quoted getrate sketch, and the opening of doctest()) collapsed onto
# one line and cut off mid-function; kept byte-identical.
#!/usr/bin/python2.7 # _*_ coding: utf-8 _*_ import docclass as ori import os c1 = ori.fisherclassifier(ori.getwords) c1.setdb('test1.db') ''' def getrate(self) right=sum(self.con.execute('select count from cc where wr=1',(cat,wr)).fetchall().value()) wrong=sum(self.con.execute('select count from cc where wr=0',(cat,wr)).fetchall().value()) rate=right/(right+wrong) return rate ''' def doctest(cl): right = 0.0 wrong = 0.0 dir = os.getcwd() dirham = dir + r'\data_set\hw1_data\test\ham' # print dirham+'\n' dirspam = dir + r'\data_set\hw1_data\test\spam' list1 = ori.GetFileList(dirspam, []) list2 = ori.GetFileList(dirham, []) # print list1+'\n' for item in list1: f = open(item) words = f.read() #words = textParser(f)