def f2j(title):
    title = title.decode('utf-8')
    strTemp = ''
    for j in range(len(title)):
        charTemp = title[j].encode('utf-8')
        if u'\u4E00' < title[j] < u'\u9FFF':
            charTemp = jft.f2j('utf-8', 'utf-8', charTemp)
        strTemp += charTemp
    return strTemp.decode('utf-8')
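# A minimal usage sketch, not part of the original code: calling the f2j()
# wrapper above on a UTF-8 byte string that contains Traditional characters.
# Assumes the jft module is importable and that the source file carries a
# utf-8 coding declaration; the sample text is purely illustrative.
if __name__ == '__main__':
    sample = u'\u7E41\u9AD4\u5B57\u5E55'.encode('utf-8')  # "繁體字幕" as UTF-8 bytes
    print f2j(sample).encode('utf-8')  # expected to print the Simplified form "繁体字幕"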
def rssUpdata():
    print time.strftime("%Y/%m/%d %X", time.localtime(time.time()))
    times = []
    titles = []
    btAdds = []
    dirNow = os.path.dirname(sys.argv[0])
    strfile = os.path.join(dirNow, 'list.txt')
    try:
        f2 = file(strfile, 'rb')
    except:
        # "Please create list.txt in this directory and write the keywords, one group per line."
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf-8').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList = f2.readlines()
##    d = feedparser.parse(r'http://bt.ktxp.com/rss-sort-12.xml')
    # NOTE: `a` is not defined in this snippet; it presumably holds the raw feed-entry title.
    title = re.sub(r'amp;', '', a)
##    print title.decode('utf8')
    for keywords in keywordList:
        print '\n'
        hasWord = 1
        keywords = re.sub('\r\n', '', keywords.decode('gbk').encode('utf8')).split(' ')
        for keyword in keywords:
            if title.upper().find(keyword.upper()) >= 0:
                print title.upper().find(keyword.upper()), keyword.decode('utf8').encode('gbk')
            else:
                # Retry the match on a CJK-only, Traditional-to-Simplified converted copy of the title.
                strTemp = ''
                for i in title.decode('utf8'):
                    if u'\u4E00' < i < u'\u9FFF':
                        strTemp += i
                strTemp = strTemp.encode('utf8')
                strTemp = jft.f2j('utf8', 'utf8', strTemp)
                if strTemp.find(keyword) >= 0:
                    print strTemp.find(keyword), keyword.decode('utf8').encode('gbk')
                else:
                    hasWord = 0
##            print keyword,hasWord
##        print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
        if hasWord:
            #times.append(time.strftime("%Y/%m/%d %H:%M",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f')))
            titles.append(title.decode('utf8'))
            #btAdds.append(d.entries[j].enclosures[0].href)
    print times, titles, btAdds
def dirName(title, animeNum):
    dirNow = os.getcwd()
    cf = ConfigParser.ConfigParser()
    cf.read(os.path.join(dirNow, "config.ini"))
    downloadpath = cf.get('info', 'downloadpath')
    data = xlrd.open_workbook(os.path.join(dirNow, 'dirList.xls'))
    table = data.sheets()[0]
    keywords = table.col_values(3)
    # Find the spreadsheet row whose keywords all appear in the title,
    # falling back to a CJK-only, Traditional-to-Simplified converted copy.
    for rownum in range(table.nrows):
        hasWord = 1
        for keyword in keywords[rownum].split(' '):
            keyword = keyword.encode('utf8')
            if title.upper().find(keyword.upper()) < 0:
                strTemp = ''
                for i in title.decode('utf8'):
                    if u'\u4E00' < i < u'\u9FFF':
                        strTemp += i
                strTemp = strTemp.encode('utf8')
                strTemp = jft.f2j('utf8', 'utf8', strTemp)
                if strTemp.upper().find(keyword.upper()) < 0:
                    hasWord = 0
        if hasWord == 1:
            dirWords = table.row_values(rownum)
            break
    # Build the target folder name: <group>-<episode>-<name>[tag1][tag2]...
    dirname = '%s-%s-%s' % (dirWords[0], animeNum, dirWords[1])
    for i in dirWords[2].split(' '):
        dirname += '[%s]' % i
    dirname = dirname.encode('gbk')
    name = dirWords[1].encode('gbk')
    # Look for an existing download folder that already contains the series name.
    L = os.listdir(downloadpath)
    for folder in L:
        if folder.find(name) > 0:
            oldFolder = folder
    if animeNum < '00':
        dirname = oldFolder
    # Rename the existing folder if one was found; otherwise create a new one.
    try:
        os.rename(os.path.join(downloadpath, oldFolder),
                  os.path.join(downloadpath, dirname))
    except:
        try:
            os.makedirs(os.path.join(downloadpath, dirname))
        except:
            pass
    return dirname
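# A minimal usage sketch, not part of the original code: resolving the target
# folder for one release. The title and episode number below are illustrative,
# and config.ini / dirList.xls are assumed to exist in the working directory
# exactly as the function above expects.
if __name__ == '__main__':
    sampleTitle = '[KTXP][Sample Sub Group] Sample Anime 05 720p'
    print dirName(sampleTitle, '05')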
def rssUpdata():
    print time.strftime("%Y/%m/%d %X", time.localtime(time.time()))
    times = []
    titles = []
    btAdds = []
    dirNow = os.path.dirname(sys.argv[0])
    dirNow = r'F:\Anime\ktxpRss'  # hard-coded path used for local testing
    strfile = os.path.join(dirNow, 'list.txt')
    try:
        f2 = file(strfile, 'rb')
    except:
        # "Please create list.txt in this directory and write the keywords, one group per line."
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf-8').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList = f2.readlines()
    # Hard-coded sample title used to exercise the keyword matching.
    title = re.sub(r'amp;', '', "[澄空学园&华盟字幕社] Robotics;Notes 第05话 简体 MP4 720p")
##    print title.decode('utf8')
    for keywords in keywordList:
        hasWord = 1
        keywords = re.sub('\r\n', '', keywords.decode('gbk').encode('utf8')).split(' ')
        for keyword in keywords:
            if title.upper().find(keyword.upper()) >= 0:
##                print title.find(keyword),keyword
                pass
            else:
                strTemp = ''
                for i in title.decode('utf8'):
                    if u'\u4E00' < i < u'\u9FFF':
                        strTemp += i
                strTemp = strTemp.encode('utf8')
                strTemp = jft.f2j('utf8', 'utf8', strTemp)
                if strTemp.find(keyword) >= 0:
##                    print strTemp.find(keyword),keyword
                    pass
                else:
                    hasWord = 0
##            print keyword,hasWord
##        print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
        if hasWord:
##            times.append(time.strftime("%Y/%m/%d %H:%M",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f')))
            print 1
def testClassfier():
    global n
    for i in range(testNum):
        p = [1, 1]
        pXC = [1, 1]
        has1 = []
        has1.append([])
        has1.append([])
        # Class 1 ("lost" posts): multiply in the conditional probability of each feature word.
        for j in range(featureNum):
            weiboContent = testData[i][4]
            if(i == 8):
                # Special case: this test item needs Traditional-to-Simplified conversion first.
                weiboContent = jft.f2j('utf8', 'utf8', testData[i][4])
            if(weiboContent.count(feature[j]) != 0):
                pXC[0] = pXC[0] * C1_map[j]
                if(j >= 0 and j <= 4):
                    has1[0].append(j)
                if(j >= 5 and j < featureNum):
                    has1[1].append(j)
            else:
                pXC[0] = pXC[0] * (1 - C1_map[j])
        if(len(has1[0]) != 0 and len(has1[1]) == 0):
            pXC[0] = pXC[0] * 2
        if(len(has1[0]) == len(has1[1]) == 1):
            if(weiboContent.find(feature[has1[0][0]]) < weiboContent.find(feature[has1[1][0]])):
                pXC[0] = pXC[0] * 10
        words = [w.word for w in pseg.cut(testData[i][4])]
        tags = [w.flag for w in pseg.cut(testData[i][4])]
        for lostFIndex in has1[1]:
            # Get the index of the "lost"-type feature word in the segmented weibo.
            for index in range(len(words)):
                if(words[index] == feature[lostFIndex]):
                    break
            # Check whether the word just before it is "你" (you).
            if(words[index-1] == "你"):
                pXC[0] = pXC[0] * 100
        for findFIndex in has1[0]:
            # Get the index of the "found"-type feature word in the segmented weibo.
            for index in range(len(words)):
                if(words[index] == feature[findFIndex]):
                    break
            # Check whether the word just before it is "我" (I/me).
            if(words[index-1] == "我"):
                pXC[0] = pXC[0] * 100
        p[0] = A[0] * pXC[0]
        has2 = []
        has2.append([])
        has2.append([])
        # Class 2 ("found" posts): same update with the class-2 probabilities.
        for j in range(featureNum):
            weiboContent = testData[i][4]
            if(i == 8):
                weiboContent = jft.f2j('utf8', 'utf8', testData[i][4])
            if(weiboContent.count(feature[j]) != 0):
                pXC[1] = pXC[1] * C2_map[j]
                if(j >= 0 and j <= 4):
                    has2[0].append(j)
                if(j >= 5 and j < featureNum):
                    has2[1].append(j)
            else:
                pXC[1] = pXC[1] * (1 - C2_map[j])
        if(len(has2[0]) == 0 and len(has2[1]) != 0):
            pXC[1] = pXC[1] * 2
        if(len(has2[0]) == len(has2[1]) == 1):
            if(weiboContent.find(feature[has2[0][0]]) > weiboContent.find(feature[has2[1][0]])):
                if(tag[has2[0][0]] != "n"):
                    pXC[1] = pXC[1] * 10
        for lostFIndex in has1[1]:
            # Get the index of the "lost"-type feature word in the segmented weibo.
            for index in range(len(words)):
                if(words[index] == feature[lostFIndex]):
                    break
            # Check whether the word just before it is "我" (I/me).
            if(words[index-1] == "我"):
                pXC[1] = pXC[1] * 100
        for findFIndex in has1[0]:
            # Get the index of the "found"-type feature word in the segmented weibo.
            for index in range(len(words)):
                if(words[index] == feature[findFIndex]):
                    break
            # Walk back to the nearest pronoun (r) or noun (n) before the feature word.
            while(not(tags[index-1] == 'r' or tags[index-1] == 'n')):
                index = index - 1
            indexTemp = index - 1
            while(tags[indexTemp].find('n') != -1):
                indexTemp = indexTemp - 1
            if(tags[indexTemp] == 'p'):
                indexTemp = indexTemp - 1
                while(not(tags[indexTemp] == 'r' or tags[indexTemp] == 'n')):
                    indexTemp = indexTemp - 1
            # Boost class 2 if the preceding word is "你" (you), "谁" (who) or "人" (person).
            if(words[indexTemp] == "你" or words[indexTemp] == "谁" or words[indexTemp] == "人"):
                pXC[1] = pXC[1] * 100
        p[1] = A[1] * pXC[1]
        if(p[0] > p[1]):
            if(testData[i][0] == str(1)):
                n = n + 1
            else:
                print i, testData[i][4], testData[i][0], 1, p[0], p[1], has1
                cuttest(testData[i][4])
                #tags = jieba.analyse.extract_tags(testData[i][4],50)
                #print ",".join(tags)
        else:
            if(testData[i][0] == str(2)):
                n = n + 1
            else:
                print i, testData[i][4], testData[i][0], 2, p[0], p[1], has2
                cuttest(testData[i][4])
                #tags = jieba.analyse.extract_tags(testData[i][4],50)
                #print ",".join(tags)
        has1.pop()
        has1.pop()
        has2.pop()
        has2.pop()
    print "测试微博总数:", testNum        # total number of test weibo
    print "分类正确的微博总数:", n         # number classified correctly
    # Use float division so the accuracy is not truncated to 0 by integer division.
    tp = float(n) / testNum
    fp = 1 - tp
    print "正确率:", tp * 100, '%'        # accuracy
    print "错误率:", fp * 100, '%'        # error rate
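# A minimal sketch, not part of the original code, of the core naive-Bayes score
# the classifier above computes before its positional heuristics: the class prior
# times, for every feature word, either its conditional presence probability or
# (1 - that probability) depending on whether the word occurs in the text.
# All names below are illustrative stand-ins for the globals used above
# (A, C1_map/C2_map, feature).
def bayes_score(text, features, prior, presence_prob):
    score = prior
    for j in range(len(features)):
        if text.count(features[j]) != 0:
            score *= presence_prob[j]
        else:
            score *= (1 - presence_prob[j])
    return score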
def rssUpdata():
    print time.strftime("%Y/%m/%d %X", time.localtime(time.time()))
    times = []
    titles = []
    btAdds = []
    dirNow = os.path.dirname(sys.argv[0])
    dirNow = r'F:\Anime\ktxpRss'  # hard-coded path used for local testing
    strfile = os.path.join(dirNow, 'list.txt')
    try:
        f2 = file(strfile, 'rb')
    except:
        # "Please create list.txt in this directory and write the keywords, one group per line."
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf-8').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList = f2.readlines()
    d = feedparser.parse(r'http://bt.ktxp.com/rss-sort-12.xml')
    # Walk the feed entries from oldest to newest.
    for j in range(len(d.entries))[::-1]:
        title = re.sub(r'amp;', '', d.entries[j].title.encode('utf8'))
##        print title.decode('utf8')
        for keywords in keywordList:
            hasWord = 1
            keywords = re.sub('\r\n', '', keywords.decode('gbk').encode('utf8')).split(' ')
            for keyword in keywords:
                if title.upper().find(keyword.upper()) >= 0:
##                    print title.find(keyword),keyword
                    pass
                else:
                    # Retry the match on a CJK-only, Traditional-to-Simplified converted copy of the title.
                    strTemp = ''
                    for i in title.decode('utf8'):
                        if u'\u4E00' < i < u'\u9FFF':
                            strTemp += i
                    strTemp = strTemp.encode('utf8')
                    strTemp = jft.f2j('utf8', 'utf8', strTemp)
                    if strTemp.find(keyword) >= 0:
##                        print strTemp.find(keyword),keyword
                        pass
                    else:
                        hasWord = 0
##                print keyword,hasWord
##            print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
            if hasWord:
                times.append(time.strftime("%Y/%m/%d %H:%M",
                                           time.strptime(d.entries[j].published,
                                                         '%a, %d %b %Y %X +%f')))
                # Un-escape any &quot; entities left in the feed title.
                titles.append(re.sub(r'&quot;', '"', title.decode('utf8')))
                btAdds.append(d.entries[j].enclosures[0].href)
    hasNew = dbwrite.dbwrite(times, titles, btAdds)
    miss = ktxpupdate.update()
    if miss > 0:
        # Log the entries that were missed since the last update.
        import sqlite3
        cx = sqlite3.connect("ktxp.db")
        cx.isolation_level = None
        cx.text_factory = str
        cu = cx.cursor()
        cu.execute("select * from t1 Order by id desc LIMIT %s" % miss)
        res = cu.fetchall()
        for i in res:
            f3 = file('Miss.log', 'a')
            f3.write("%s %s\n" % (i[1], i[2]))
            f3.close()
    download.downNew(hasNew + miss)
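# A minimal driver sketch, not part of the original code: run the updater once
# when this script is executed directly. It assumes the modules referenced in
# the function body above (feedparser, dbwrite, ktxpupdate, download, jft) and
# the standard imports (re, time, os, sys) are available at module level.
if __name__ == '__main__':
    rssUpdata()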
#encoding=utf-8
import jft
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

infile = open('../category_path.txt', 'rb')
outfile = open('../category_path_clean.txt', 'wb')
row_index = 0
for row in infile:
    row_index += 1
    # print row_index
    fanti_items = row.strip().split(',')
    jianti_items = []
    try:
        # Convert each Traditional Chinese item on the line to Simplified,
        # then write the cleaned line out.
        for fanti_item in fanti_items:
            fanti_item.encode('utf-8')
            jianti_item = jft.f2j('utf-8', 'gbk', fanti_item)
            jianti_item = unicode(jianti_item, 'gbk').decode('utf-8')
            jianti_items.append(jianti_item)
        outfile.write(','.join(jianti_items) + '\r\n')
    except:
        print 'ohh'
print 'finished'
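# A minimal round-trip sketch, not part of the original script. It assumes, as
# the loop above does, that jft.f2j('utf-8', 'gbk', text) takes UTF-8 bytes and
# returns the Simplified form as GBK bytes; the sample word is illustrative.
sample_fanti = u'\u96FB\u8166'.encode('utf-8')            # "電腦" as UTF-8 bytes
sample_jianti = unicode(jft.f2j('utf-8', 'gbk', sample_fanti), 'gbk')
print sample_jianti.encode('utf-8')                       # expected: "电脑"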