Exemplo n.º 1
0
def f2j(title):
    """Convert the CJK characters of a UTF-8 byte string to simplified Chinese.

    Every character in the CJK Unified Ideographs block is passed through
    jft.f2j (traditional -> simplified, UTF-8 in and out); all other
    characters are kept verbatim.  Returns a unicode string.
    """
    title = title.decode('utf-8')
    parts = []
    for ch in title:
        piece = ch.encode('utf-8')
        # Inclusive bounds: U+4E00 and U+9FFF are themselves CJK ideograph
        # code points; the original strict '<' comparison skipped them.
        if u'\u4E00' <= ch <= u'\u9FFF':
            piece = jft.f2j('utf-8', 'utf-8', piece)
        parts.append(piece)
    # ''.join avoids the quadratic cost of repeated string concatenation.
    return ''.join(parts).decode('utf-8')
Exemplo n.º 2
0
def rssUpdata():
    """Print keyword matches between a feed title and list.txt (debug excerpt).

    Reads GBK-encoded, space-separated keyword lines from list.txt next to
    the script, then checks every keyword against the title directly and,
    failing that, against a simplified-Chinese conversion of the title's
    CJK characters.  Titles whose keyword line fully matches are collected
    in ``titles``.

    NOTE(review): ``a`` in the re.sub() call below is not defined anywhere
    in this snippet, so this function raises NameError as written --
    presumably ``a`` held a feed-entry title in the full feedparser
    version; confirm before use.
    """
    print time.strftime("%Y/%m/%d %X", time.localtime(time.time()))
    times = []
    titles = []
    btAdds = []
    dirNow = os.path.dirname(sys.argv[0])
    # NOTE(review): '\l' is not an escape, so this is a literal backslash --
    # Windows-only path building; os.path.join would be portable.
    strfile = dirNow + '\list.txt'
    try:
        f2 = file(strfile, 'rb')  # Python 2 built-in file(); never closed
    except:  # NOTE(review): bare except also hides errors other than a missing file
        # GBK console message: "please create list.txt in this directory,
        # writing keywords separated by lines"
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList = f2.readlines()
    ##    d = feedparser.parse(r'http://bt.ktxp.com/rss-sort-12.xml')
    # strip HTML-escaped 'amp;' leftovers from the title
    title = re.sub(r'amp;', '', a)
    ##        print title.decode('utf8')
    for keywords in keywordList:
        print '\n'
        # assume the line matches until one of its keywords fails
        hasWord = 1
        # keyword lines are GBK on disk; compare as UTF-8 bytes
        keywords = re.sub('\r\n', '',
                          keywords.decode('gbk').encode('utf8')).split(' ')
        for keyword in keywords:
            if title.upper().find(keyword.upper()) >= 0:
                # direct case-insensitive hit: print position and keyword (GBK console)
                print title.upper().find(
                    keyword.upper()), keyword.decode('utf8').encode('gbk')
                pass
            else:
                # no direct hit: collect the title's CJK characters and
                # convert them to simplified Chinese before retrying
                strTemp = ''
                for i in title.decode('utf8'):
                    # NOTE(review): strict '<' excludes the boundary code
                    # points U+4E00/U+9FFF, which are themselves CJK chars
                    if u'\u4E00' < i < u'\u9FFF':
                        strTemp += i
                strTemp = strTemp.encode('utf8')
                strTemp = jft.f2j('utf8', 'utf8', strTemp)
                if strTemp.find(keyword) >= 0:
                    print strTemp.find(keyword), keyword.decode('utf8').encode(
                        'gbk')
                    pass
                else:
                    hasWord = 0


##                    print keyword,hasWord
##            print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
        if hasWord:
            #times.append(time.strftime("%Y/%m/%d %H:%M",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f')))
            titles.append(title.decode('utf8'))
            #btAdds.append(d.entries[j].enclosures[0].href)
    print times, titles, btAdds
Exemplo n.º 3
0
def dirName(title, animeNum):
    """Build (and create or rename to) the download folder for *title*.

    Matches *title* against the keyword rows of dirList.xls (column 3,
    space-separated; matched directly and against a simplified-Chinese
    conversion of the title's CJK characters), composes the folder name
    '<col0>-<animeNum>-<col1>[tag][tag]...' from the matching row, then
    renames an existing folder containing <col1> or creates a new one.

    Returns the GBK-encoded folder name.
    Raises LookupError when no row of dirList.xls matches the title.
    """
    dirNow = os.getcwd()
    cf = ConfigParser.ConfigParser()
    cf.read(os.path.join(dirNow, "config.ini"))
    downloadpath = cf.get('info', 'downloadpath')
    data = xlrd.open_workbook(os.path.join(dirNow, 'dirList.xls'))
    table = data.sheets()[0]
    keywords = table.col_values(3)
    dirWords = None
    for rownum in range(table.nrows):
        hasWord = 1
        for keyword in keywords[rownum].split(' '):
            keyword = keyword.encode('utf8')
            if title.upper().find(keyword.upper()) < 0:
                # no direct hit: retry against the simplified-Chinese form
                # of the title's CJK characters
                strTemp = ''
                for ch in title.decode('utf8'):
                    # inclusive bounds: U+4E00/U+9FFF are CJK ideographs
                    # themselves (the original strict '<' skipped them)
                    if u'\u4E00' <= ch <= u'\u9FFF':
                        strTemp += ch
                strTemp = jft.f2j('utf8', 'utf8', strTemp.encode('utf8'))
                if strTemp.upper().find(keyword.upper()) < 0:
                    hasWord = 0
        if hasWord == 1:
            dirWords = table.row_values(rownum)
            break
    if dirWords is None:
        # the original fell through to a NameError here; fail loudly instead
        raise LookupError('no dirList.xls row matches title %r' % title)
    dirname = '%s-%s-%s' % (dirWords[0], animeNum, dirWords[1])
    for tag in dirWords[2].split(' '):
        dirname += '[%s]' % tag
    dirname = dirname.encode('gbk')
    name = dirWords[1].encode('gbk')
    oldFolder = None
    for folder in os.listdir(downloadpath):
        # '>= 0' fixes the original '> 0', which missed folders whose
        # name starts with the series name (match at index 0)
        if folder.find(name) >= 0:
            oldFolder = folder
    if animeNum < '00':
        # special episode numbering keeps the existing folder name
        dirname = oldFolder
    try:
        # rename the previously matched folder to the new composed name
        os.rename(os.path.join(downloadpath, oldFolder),
                  os.path.join(downloadpath, dirname))
    except (OSError, AttributeError, TypeError):
        # no matching old folder (oldFolder is None) or rename failed:
        # create the directory instead, tolerating 'already exists'
        try:
            os.makedirs(os.path.join(downloadpath, dirname))
        except OSError:
            pass
    return dirname
Exemplo n.º 4
0
def rssUpdata():
    """Print keyword matches between a feed title and list.txt (debug excerpt).

    Same logic as the formatted variant elsewhere in this file: reads
    GBK keyword lines from list.txt and matches each keyword against the
    title, falling back to a simplified-Chinese conversion of the
    title's CJK characters.

    NOTE(review): ``a`` in the re.sub() call below is undefined in this
    snippet, so the function raises NameError as written -- presumably a
    feed-entry title in the full version; confirm.
    """
    print time.strftime("%Y/%m/%d %X",time.localtime(time.time()))
    times=[]
    titles=[]
    btAdds=[]
    dirNow=os.path.dirname(sys.argv[0])
    # NOTE(review): literal backslash concatenation -- Windows-only path
    strfile=dirNow+'\list.txt'
    try:
        f2 = file(strfile, 'rb')  # Python 2 built-in file(); never closed
    except:  # NOTE(review): bare except hides errors other than a missing file
        # GBK console message: "please create list.txt in this directory,
        # writing keywords separated by lines"
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList=f2.readlines()
##    d = feedparser.parse(r'http://bt.ktxp.com/rss-sort-12.xml')
    # strip HTML-escaped 'amp;' leftovers from the title
    title=re.sub(r'amp;','',a)
##        print title.decode('utf8')
    for keywords in keywordList:
        print '\n'
        # assume the line matches until one of its keywords fails
        hasWord=1
        # keyword lines are GBK on disk; compare as UTF-8 bytes
        keywords=re.sub('\r\n','',keywords.decode('gbk').encode('utf8')).split(' ')
        for keyword in keywords:
            if title.upper().find(keyword.upper())>=0:
                # direct case-insensitive hit: print position and keyword (GBK console)
                print title.upper().find(keyword.upper()),keyword.decode('utf8').encode('gbk')
                pass
            else:
                # retry against the simplified-Chinese form of the CJK chars
                strTemp=''
                for i in title.decode('utf8'):
                    # NOTE(review): strict '<' excludes boundary code points
                    # U+4E00/U+9FFF, which are themselves CJK characters
                    if u'\u4E00'<i<u'\u9FFF':
                        strTemp+=i
                strTemp=strTemp.encode('utf8')
                strTemp=jft.f2j('utf8','utf8',strTemp)
                if strTemp.find(keyword)>=0:
                    print strTemp.find(keyword),keyword.decode('utf8').encode('gbk')
                    pass
                else:
                    hasWord=0
##                    print keyword,hasWord
##            print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
        if hasWord:
            #times.append(time.strftime("%Y/%m/%d %H:%M",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f')))
            titles.append(title.decode('utf8'))
            #btAdds.append(d.entries[j].enclosures[0].href)
    print times,titles,btAdds
Exemplo n.º 5
0
def dirName(title, animeNum):
    """Build (and create or rename to) the download folder for *title*.

    Matches *title* against the keyword rows of dirList.xls (column 3,
    space-separated; matched directly and against a simplified-Chinese
    conversion of the title's CJK characters), composes the folder name
    '<col0>-<animeNum>-<col1>[tag][tag]...' from the matching row, then
    renames an existing folder containing <col1> or creates a new one.

    Returns the GBK-encoded folder name.
    Raises LookupError when no row of dirList.xls matches the title.
    """
    dirNow = os.getcwd()
    cf = ConfigParser.ConfigParser()
    cf.read(os.path.join(dirNow, "config.ini"))
    downloadpath = cf.get('info', 'downloadpath')
    data = xlrd.open_workbook(os.path.join(dirNow, 'dirList.xls'))
    table = data.sheets()[0]
    keywords = table.col_values(3)
    dirWords = None
    for rownum in range(table.nrows):
        hasWord = 1
        for keyword in keywords[rownum].split(' '):
            keyword = keyword.encode('utf8')
            if title.upper().find(keyword.upper()) < 0:
                # no direct hit: retry against the simplified-Chinese form
                # of the title's CJK characters
                strTemp = ''
                for ch in title.decode('utf8'):
                    # inclusive bounds: U+4E00/U+9FFF are CJK ideographs
                    # themselves (the original strict '<' skipped them)
                    if u'\u4E00' <= ch <= u'\u9FFF':
                        strTemp += ch
                strTemp = jft.f2j('utf8', 'utf8', strTemp.encode('utf8'))
                if strTemp.upper().find(keyword.upper()) < 0:
                    hasWord = 0
        if hasWord == 1:
            dirWords = table.row_values(rownum)
            break
    if dirWords is None:
        # the original fell through to a NameError here; fail loudly instead
        raise LookupError('no dirList.xls row matches title %r' % title)
    dirname = '%s-%s-%s' % (dirWords[0], animeNum, dirWords[1])
    for tag in dirWords[2].split(' '):
        dirname += '[%s]' % tag
    dirname = dirname.encode('gbk')
    name = dirWords[1].encode('gbk')
    oldFolder = None
    for folder in os.listdir(downloadpath):
        # '>= 0' fixes the original '> 0', which missed folders whose
        # name starts with the series name (match at index 0)
        if folder.find(name) >= 0:
            oldFolder = folder
    if animeNum < '00':
        # special episode numbering keeps the existing folder name
        dirname = oldFolder
    try:
        # rename the previously matched folder to the new composed name
        os.rename(os.path.join(downloadpath, oldFolder),
                  os.path.join(downloadpath, dirname))
    except (OSError, AttributeError, TypeError):
        # no matching old folder (oldFolder is None) or rename failed:
        # create the directory instead, tolerating 'already exists'
        try:
            os.makedirs(os.path.join(downloadpath, dirname))
        except OSError:
            pass
    return dirname
Exemplo n.º 6
0
def rssUpdata():
    print time.strftime("%Y/%m/%d %X",time.localtime(time.time()))
    times=[]
    titles=[]
    btAdds=[]
    dirNow=os.path.dirname(sys.argv[0])
    dirNow='F:\Anime\ktxpRss'
    strfile=dirNow+'\list.txt'
    try:
        f2 = file(strfile, 'rb')
    except:
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList=f2.readlines()
    title=re.sub(r'amp;','',"[澄空学园&华盟字幕社] Robotics;Notes 第05话 简体 MP4 720p")
##        print title.decode('utf8')
    for keywords in keywordList:
        hasWord=1
        keywords=re.sub('\r\n','',keywords.decode('gbk').encode('utf8')).split(' ')
        for keyword in keywords:
            if title.upper().find(keyword.upper())>=0:
##                    print title.find(keyword),keyword
                pass
            else:
                strTemp=''
                for i in title.decode('utf8'):
                    if u'\u4E00'<i<u'\u9FFF':
                        strTemp+=i
                strTemp=strTemp.encode('utf8')
                strTemp=jft.f2j('utf8','utf8',strTemp)
                if strTemp.find(keyword)>=0:
##                        print strTemp.find(keyword),keyword
                    pass
                else:
                    hasWord=0
##                    print keyword,hasWord
##            print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
        if hasWord:
##            times.append(time.strftime("%Y/%m/%d %H:%M",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f')))
            print 1
Exemplo n.º 7
0
def testClassfier():
    """Evaluate the naive-Bayes-style lost/found weibo classifier over testData.

    For each test weibo, multiplies per-feature occurrence probabilities
    (C1_map for class 1 "lost", C2_map for class 2 "found"), applies
    several hand-tuned positional/part-of-speech heuristics, scales by the
    priors in A, picks the larger score, and counts correct predictions in
    the module-global ``n``.  Prints totals, accuracy and error rate.

    NOTE(review):
      * relies on module globals: testNum, testData, featureNum, feature,
        C1_map, C2_map, A, tag, n, and helpers jft / pseg / cuttest.
      * the class-2 heuristic loops below iterate ``has1`` rather than
        ``has2`` -- possibly a copy/paste slip; confirm intent.
      * ``tag[...]`` (singular) is never assigned here; presumably a
        global list of feature POS tags -- confirm it exists, otherwise
        that branch raises NameError.
      * ``tp = n / testNum`` truncates under Python 2 integer division,
        so the accuracy prints as 0% or 100% -- likely a bug.
    """
    global n
    
    for i in range(testNum):
        # p[k]: posterior-style score for class k; pXC[k]: likelihood product
        p = [1,1]
        pXC = [1,1]
        
        # has1[0]: indices of "found"-type features (0..4) present;
        # has1[1]: indices of "lost"-type features (5..) present
        has1 = []
        has1.append([])
        has1.append([])
        for j in range(featureNum):
            weiboContent = testData[i][4]
            # hard-coded: test weibo #8 apparently needs a traditional->
            # simplified conversion first -- TODO confirm why only i == 8
            if(i == 8):
                weiboContent = jft.f2j('utf8', 'utf8', testData[i][4])
            if(weiboContent.count(feature[j]) != 0):
                pXC[0] = pXC[0] * C1_map[j]
                if(j >= 0 and j <= 4):
                    has1[0].append(j)
                if(j >= 5 and j < featureNum):
                    has1[1].append(j)             
            else:
                pXC[0] = pXC[0] * (1 - C1_map[j])              
        # boost class 1 when only group-0 features appear
        if(len(has1[0]) != 0 and len(has1[1]) == 0):
            pXC[0] = pXC[0] * 2
        # one feature from each group: boost when the group-0 feature
        # occurs earlier in the text than the group-1 feature
        if(len(has1[0]) == len(has1[1]) == 1):
            if(weiboContent.find(feature[has1[0][0]]) < 
               weiboContent.find(feature[has1[1][0]])):
                pXC[0] = pXC[0] * 10
                
        # segment the weibo into words and POS tags (pseg.cut runs twice;
        # one pass with both word and flag would suffice)
        words = [w.word for w in pseg.cut(testData[i][4])]
        tags = [w.flag for w in pseg.cut(testData[i][4])]
        for lostFIndex in has1[1]:
            # locate the index of the "lost" feature word in the segmented weibo
            for index in range(len(words)):
                if(words[index] == feature[lostFIndex]):
                    break
            # check whether the word right before the "lost" feature word
            # is one of the following ("you")
            if(words[index-1] == "你"):
                pXC[0] = pXC[0] * 100
        for findFIndex in has1[0]:
            # locate the index of the "found" feature word in the segmented weibo
            for index in range(len(words)):
                if(words[index] == feature[findFIndex]):
                    break
            # check whether the word right before the "found" feature word
            # is one of the following ("I")
            if(words[index-1] == "我"):
                pXC[0] = pXC[0] * 100
        p[0] = A[0] * pXC[0]
             
        # same bookkeeping for class 2
        has2 = []
        has2.append([])
        has2.append([])
        for j in range(featureNum):
            weiboContent = testData[i][4]
            if(i == 8):
                weiboContent = jft.f2j('utf8', 'utf8', testData[i][4])
            if(weiboContent.count(feature[j]) != 0):
                pXC[1] = pXC[1] * C2_map[j]
                if(j >= 0 and j <= 4):
                    has2[0].append(j)
                if(j >= 5 and j < featureNum):
                    has2[1].append(j)
            else:
                pXC[1] = pXC[1] * (1 - C2_map[j]) 
        if(len(has2[0]) == 0 and len(has2[1]) != 0):
            pXC[1] = pXC[1] * 2
        if(len(has2[0]) == len(has2[1]) == 1):
            if(weiboContent.find(feature[has2[0][0]]) > 
                weiboContent.find(feature[has2[1][0]])):
                # NOTE(review): ``tag`` (singular) is not defined in this
                # function -- presumably a global of feature POS tags
                if(tag[has2[0][0]] != "n"):
                    pXC[1] = pXC[1] * 10
                    
        # NOTE(review): the two loops below iterate has1, not has2 --
        # confirm whether that is intentional for the class-2 score
        for lostFIndex in has1[1]:
            # locate the index of the "lost" feature word in the segmented weibo
            for index in range(len(words)):
                if(words[index] == feature[lostFIndex]):
                    break
            # check whether the word right before the "lost" feature word
            # is one of the following ("I")
            if(words[index-1] == "我"):
                pXC[1] = pXC[1] * 100
        for findFIndex in has1[0]:
            # locate the index of the "found" feature word in the segmented weibo
            for index in range(len(words)):
                if(words[index] == feature[findFIndex]):
                    break
            # walk backwards to the nearest pronoun ('r') or noun ('n') tag
            while(not(tags[index-1] == 'r' or tags[index-1] == 'n')):
                index = index - 1
            indexTemp = index - 1
            # skip over a run of noun-ish tags
            while(tags[indexTemp].find('n') != -1):
                indexTemp = indexTemp - 1
            # if a preposition precedes, keep walking back to the next
            # pronoun/noun
            if(tags[indexTemp] == 'p'):
                indexTemp = indexTemp - 1
                while(not(tags[indexTemp] == 'r' or tags[indexTemp] == 'n')):
                    indexTemp = indexTemp - 1
            # boost class 2 when the governing word is "you"/"who"/"person"
            if(words[indexTemp] == "你" or words[indexTemp] == "谁" or words[indexTemp] == "人"):
                pXC[1] = pXC[1] * 100
                     
        p[1] = A[1] * pXC[1]

        # pick the class with the larger score; testData[i][0] holds the
        # ground-truth label as a string
        if(p[0] > p[1]):
            if(testData[i][0]==str(1)):
                n = n + 1
            else:
                # misclassified: dump diagnostics and re-segment for inspection
                print i,testData[i][4],testData[i][0],1,p[0],p[1],has1
                cuttest(testData[i][4])
                #tags = jieba.analyse.extract_tags(testData[i][4],50)
                #print ",".join(tags)
  
        else:
            if(testData[i][0]==str(2)):
                n = n + 1     
            else:
                print i,testData[i][4],testData[i][0],2,p[0],p[1],has2
                cuttest(testData[i][4])
                #tags = jieba.analyse.extract_tags(testData[i][4],50)
                #print ",".join(tags)
        
        # NOTE(review): these pops empty locals that are about to be
        # rebound next iteration anyway -- they have no observable effect
        has1.pop()
        has1.pop()
        has2.pop()
        has2.pop()
    
    # summary (messages are GBK/UTF-8 Chinese: totals, accuracy, error rate)
    print "测试微博总数:",testNum
    print "分类正确的微博总数:",n
    tp = n / testNum
    fp = 1 - tp
    print "正确率:",tp * 100 , '%'
    print "错误率:",fp * 100, '%'
Exemplo n.º 8
0
def rssUpdata():
    """Fetch the ktxp RSS feed, filter entries by list.txt keywords, and download.

    Full pipeline: reads GBK keyword lines from list.txt, parses the RSS
    feed, matches each entry title against every keyword line (directly,
    then via simplified-Chinese conversion of the title's CJK chars),
    records matching entries via dbwrite, reconciles misses via
    ktxpupdate, logs missed rows from the local SQLite db, and hands the
    count of new items to download.downNew.

    NOTE(review): depends on project modules jft, dbwrite, ktxpupdate and
    download, plus feedparser; behavior of those calls is assumed, not
    visible here.
    """
    print time.strftime("%Y/%m/%d %X", time.localtime(time.time()))
    times = []
    titles = []
    btAdds = []
    dirNow = os.path.dirname(sys.argv[0])
    dirNow = 'F:\Anime\ktxpRss'  # NOTE(review): debug override makes the line above dead
    # NOTE(review): literal backslash concatenation -- Windows-only path
    strfile = dirNow + '\list.txt'
    try:
        f2 = file(strfile, 'rb')  # Python 2 built-in file(); never closed
    except:  # NOTE(review): bare except hides errors other than a missing file
        # GBK console message: "please create list.txt in this directory,
        # writing keywords separated by lines"
        print "请在目录下创建list.txt,并以行为分隔写入关键词".decode('utf').encode('gbk')
        os.system('pause')
        sys.exit(0)
    keywordList = f2.readlines()
    d = feedparser.parse(r'http://bt.ktxp.com/rss-sort-12.xml')
    # iterate entries oldest-first (feed order reversed)
    for j in range(len(d.entries))[::-1]:
        # strip HTML-escaped 'amp;' leftovers from the entry title
        title = re.sub(r'amp;', '', d.entries[j].title.encode('utf8'))
        ##        print title.decode('utf8')
        for keywords in keywordList:
            # assume the line matches until one of its keywords fails
            hasWord = 1
            # keyword lines are GBK on disk; compare as UTF-8 bytes
            keywords = re.sub('\r\n', '',
                              keywords.decode('gbk').encode('utf8')).split(' ')
            for keyword in keywords:
                if title.upper().find(keyword.upper()) >= 0:
                    ##                    print title.find(keyword),keyword
                    pass
                else:
                    # retry against the simplified-Chinese form of the
                    # title's CJK characters
                    strTemp = ''
                    for i in title.decode('utf8'):
                        # NOTE(review): strict '<' excludes boundary code
                        # points U+4E00/U+9FFF, themselves CJK characters
                        if u'\u4E00' < i < u'\u9FFF':
                            strTemp += i
                    strTemp = strTemp.encode('utf8')
                    strTemp = jft.f2j('utf8', 'utf8', strTemp)
                    if strTemp.find(keyword) >= 0:
                        ##                        print strTemp.find(keyword),keyword
                        pass
                    else:
                        hasWord = 0


##                    print keyword,hasWord
##            print time.strftime("%Y/%m/%d %X",time.strptime(d.entries[j].published,'%a, %d %b %Y %X +%f'))
            if hasWord:
                # record publish time, cleaned title and torrent URL
                times.append(
                    time.strftime(
                        "%Y/%m/%d %H:%M",
                        time.strptime(d.entries[j].published,
                                      '%a, %d %b %Y %X +%f')))
                titles.append(re.sub(r'&quot;', '"', title.decode('utf8')))
                btAdds.append(d.entries[j].enclosures[0].href)
    hasNew = dbwrite.dbwrite(times, titles, btAdds)
    miss = ktxpupdate.update()
    if miss > 0:
        # log the most recent `miss` rows from the local db to Miss.log
        import sqlite3
        cx = sqlite3.connect("ktxp.db")
        cx.isolation_level = None
        cx.text_factory = str
        cu = cx.cursor()
        # NOTE(review): %-interpolated LIMIT -- safe only if miss is an
        # int from ktxpupdate.update(); a parameterized query would be safer
        cu.execute("select * from t1 Order by id desc LIMIT %s" % miss)
        res = cu.fetchall()
        for i in res:
            # NOTE(review): reopening Miss.log per row; opening once
            # outside the loop would suffice
            f3 = file('Miss.log', 'a')
            f3.write("%s %s\n" % (i[1], i[2]))
            f3.close()
    download.downNew(hasNew + miss)
Exemplo n.º 9
0
#encoding=utf-8
# Script: convert comma-separated traditional-Chinese category paths in
# ../category_path.txt to simplified Chinese, writing the result to
# ../category_path_clean.txt (one line per path, CRLF-terminated).
import jft
import sys
# NOTE(review): the setdefaultencoding hack below is discouraged; the
# unicode(...).decode('utf-8') round-trip further down silently depends
# on it (decoding a unicode first re-encodes with the default codec).
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE(review): neither file is ever closed; bare except below hides errors.
infile = open('../category_path.txt','rb')
outfile = open('../category_path_clean.txt','wb')
row_index = 0
for row in infile:
	row_index += 1
	# print row_index
	fanti_items = row.strip().split(',')
	jianti_items = []
	try:
		for fanti_item in fanti_items:
			# NOTE(review): the encode() result is discarded -- this line is a no-op
			fanti_item.encode('utf-8')
			# jft.f2j: traditional (UTF-8 in) -> simplified (GBK out)
			jianti_item = jft.f2j('utf-8','gbk',fanti_item)
			# GBK bytes -> unicode; the trailing .decode('utf-8') on a
			# unicode object works only via the default-encoding hack above
			jianti_item = unicode(jianti_item,'gbk').decode('utf-8')
			jianti_items.append(jianti_item)
		outfile.write(','.join(jianti_items)+'\r\n')
	except:
		# swallow any conversion failure for this row and continue
		print 'ohh'

# NOTE(review): 'finsihed' is a typo in the output message
print 'finsihed'
Exemplo n.º 10
0
#encoding=utf-8
import jft
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

infile = open('../category_path.txt', 'rb')
outfile = open('../category_path_clean.txt', 'wb')
row_index = 0
for row in infile:
    row_index += 1
    # print row_index
    fanti_items = row.strip().split(',')
    jianti_items = []
    try:
        for fanti_item in fanti_items:
            fanti_item.encode('utf-8')
            jianti_item = jft.f2j('utf-8', 'gbk', fanti_item)
            jianti_item = unicode(jianti_item, 'gbk').decode('utf-8')
            jianti_items.append(jianti_item)
        outfile.write(','.join(jianti_items) + '\r\n')
    except:
        print 'ohh'

print 'finsihed'