class Translator(object):
    """Translation component.

    Looks up every tag that has no Chinese name yet, asks the translation
    service at ``config.transURL`` for one and writes it back through the
    project's ``Database`` helper.
    """

    def __init__(self, config):
        super(Translator, self).__init__()
        self.config = config
        self.log = Log()

    @staticmethod
    def _pick_translation(result):
        """Return the first translation in a decoded API reply, or ''.

        When the reply has a 'translation' key only that list is consulted;
        otherwise the first value of the first 'web' entry is used. A
        missing, empty or malformed entry yields '' instead of raising.
        """
        if not isinstance(result, dict):
            return ''
        try:
            if 'translation' in result:
                return result['translation'][0] or ''
            if 'web' in result:
                return result['web'][0]['value'][0] or ''
        except (IndexError, KeyError, TypeError):
            # The reply has the key but not the expected shape: treat it as
            # "no translation available".
            return ''
        return ''

    def _fetch(self, url):
        """Return the raw body served at *url*, always closing the response."""
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        response = opener.open(urllib2.Request(url=url))
        try:
            return response.read()
        finally:
            response.close()

    def transTag(self):
        """Translate all tags without a Chinese name and store the results.

        Every failure is written to the log with level 'ERROR' and never
        propagates to the caller. A failure while handling one tag does not
        prevent the remaining tags from being processed.
        """
        try:
            db = Database(self.log, self.config)
            tags = db.getAllTagWithoutCN()
        except Exception as TransErr:
            self.log.takeLog('ERROR', 'Translator Error in tag:' + str(TransErr))
            return
        if not tags:
            return

        for tagOne in tags:
            try:
                # tagOne[1] is the text sent to the service, tagOne[0] is
                # what updateTagCN receives to identify the tag.
                # NOTE(review): only spaces are escaped here; other reserved
                # URL characters in a tag are sent as-is - confirm whether
                # tags can contain them.
                postfix = tagOne[1].replace(' ', '+')
                htmlData = self._fetch(self.config.transURL + postfix)
                t_name_cn = self._pick_translation(json.loads(htmlData))
                if t_name_cn:
                    db.updateTagCN(tagOne[0], t_name_cn)
            except Exception as TransErr:
                self.log.takeLog('ERROR', 'Translator Error in tag:' + str(TransErr))
class Reader(object):
    """Reads the all-shows HTML index pages and stores the shows they list."""

    def __init__(self, config):
        # Python 2 idiom: make utf-8 the process-wide implicit str codec.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        self.config = config
        self.log = Log()
        # Alphabetical index of show names; each entry is appended to
        # config.urlAllShows to address one index page.
        self.AllShowsList = ['0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                             'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
                             'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

    def _fetch(self, url):
        """Return the raw body served at *url*, always closing the response."""
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        response = opener.open(urllib2.Request(url=url))
        try:
            return response.read()
        finally:
            response.close()

    def allShowsWork(self):
        """Fetch every index page and insert the shows and tags found on it.

        A connection failure is logged with level 'ERROR' and that page is
        skipped. A failure while parsing or storing a page is logged with
        level 'ERROR' and then re-raised to the caller.
        """
        for letter in self.AllShowsList:
            # Reset for every page so a failed request can neither leave the
            # name unbound nor reuse the HTML of the previous page.
            htmlData = ""
            try:
                htmlData = self._fetch(self.config.urlAllShows + letter)
            except Exception as connErr:
                self.log.takeLog('ERROR', 'Connection Error:' + str(connErr))
            if not htmlData:
                continue

            try:
                boxes = BeautifulSoup(htmlData).findAll(
                    'div', attrs={'class': 'contbox prembox removed'})
                # Every box describes one show.
                for OneBox in boxes:
                    self._store_box(OneBox)
            except Exception as syntaxErr:
                # Log before re-raising; a bare raise keeps the traceback.
                self.log.takeLog('ERROR', 'Syntax Tree Error:' + str(syntaxErr))
                raise

    def _store_box(self, OneBox):
        """Extract one show from its box element and insert it with its tag."""
        # The box is serialised and parsed again, once, and every field is
        # read from that single re-parsed tree.
        box = BeautifulSoup(str(OneBox))
        showName = box.h2.get_text()
        # The image URL is embedded in the link's style attribute.
        imageURL = str(box.a['style'])
        # Serialised status span; str(None) when the span is absent, which
        # the slice below turns into an empty tag.
        statusString = str(box.find('span', attrs={'class': 'hil selby'}))
        # Drop the 24-character opening tag and the last 8 characters, then
        # replace the stray 'Â ' sequence with a plain space.
        tag = statusString[24:-8].replace('Â ', ' ')

        aShow = {
            's_name': showName,
            # Strips a fixed 22-character prefix and 2-character suffix
            # (presumably "background-image: url(" and ");" - confirm).
            's_sibox_image': imageURL[22:-2],
            'link': box.a['href'],
        }
        if aShow['s_name'] == '' or aShow['link'] == '' or aShow['s_sibox_image'] == '':
            self.log.takeLog(
                'WARNING',
                'allShowsWork function cannot collect data correctly, the vars are like below:\n \n'
                's_name=%s,s_sibox_image=%s,link=%s'
                % (aShow['s_name'], aShow['s_sibox_image'], aShow['link']))

        # NOTE(review): an incomplete show is still inserted after the
        # warning - confirm this is intended.
        db = Database(self.log, self.config)
        show_id = db.insertShowFirstTime(aShow)
        if tag:
            db.insertTag(show_id, tag)
class Tools(object):
    """A class very similar to the ReadHtml class.

    The difference is that it handles data manually, one step (one index
    page) at a time.
    """

    def __init__(self, config):
        # Python 2 idiom: make utf-8 the process-wide implicit str codec.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        self.config = config
        self.log = Log()
        # Alphabetical index of show names.
        self.AllShowsList = ['0', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                             'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
                             'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
        # Month abbreviation -> two-digit month number.
        # Deprecated; kept only for compatibility.
        self.month = {
            'Jan': '01',
            'Feb': '02',
            'Mar': '03',
            'Apr': '04',
            'May': '05',
            'Jun': '06',
            'Jul': '07',
            'Aug': '08',
            'Sep': '09',
            'Oct': '10',
            'Nov': '11',
            'Dec': '12',
        }

    def _fetch(self, url):
        """Return the raw body served at *url*, always closing the response."""
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        response = opener.open(urllib2.Request(url=url))
        try:
            return response.read()
        finally:
            response.close()

    def flush_one_page(self, character):
        """Re-read one index page and print every show found on it.

        character is appended (via str()) to config.urlAllShows to build
        the page URL; the original comment describes it as a capital letter.

        A connection failure is logged with level 'ERROR' and the method
        returns. Parsing errors are not caught and propagate to the caller.
        Always returns None.
        """
        # Bound before the request so a failure cannot leave it undefined.
        htmlData = ""
        try:
            htmlData = self._fetch(self.config.urlAllShows + str(character))
        except Exception as connErr:
            self.log.takeLog('ERROR', 'Connection Error:' + str(connErr))
        if not htmlData:
            return

        boxes = BeautifulSoup(htmlData).findAll(
            'div', attrs={'class': 'contbox prembox removed'})
        # Every box describes one show.
        for OneBox in boxes:
            # Serialise and re-parse the box once; all fields come from it.
            box = BeautifulSoup(str(OneBox))
            # Single quotes in the name are backslash-escaped.
            showName = box.h2.get_text().replace("'", "\\'")
            # The image URL is embedded in the link's style attribute.
            imageURL = str(box.a['style'])

            aShow = {
                's_name': showName,
                # Strips a fixed 22-character prefix and 2-character suffix
                # (presumably "background-image: url(" and ");" - confirm).
                's_sibox_image': imageURL[22:-2],
                'link': box.a['href'],
            }
            print(aShow)

            if aShow['s_name'] == '' or aShow['link'] == '' or aShow['s_sibox_image'] == '':
                self.log.takeLog(
                    'WARNING',
                    'flush_one_page function cannot collect data correctly, the vars are like below:\n '
                    's_name=%s,s_sibox_image=%s,link=%s'
                    % (aShow['s_name'], aShow['s_sibox_image'], aShow['link']))
            # NOTE: this tool only prints what it finds. Writing the show
            # and its status tag to the database is disabled here.
        return