Exemplo n.º 1
0
class Translator(object):
    """Translation component.

    Looks up a translated name for every tag returned by
    ``Database.getAllTagWithoutCN`` and stores it through
    ``Database.updateTagCN``.
    """

    def __init__(self, config):
        """Keep the configuration and create a logger.

        Args:
            config: configuration object. ``transTag`` reads its ``transURL``
                attribute and hands the whole object to ``Database``.
        """
        super(Translator, self).__init__()
        self.config = config
        self.log = Log()

    def transTag(self):
        """Translate every tag that has no translated name yet.

        Each tag row is indexed positionally: element 0 is passed to
        ``updateTagCN`` as the key and element 1 is the text sent to the
        translation service (presumably id and name -- confirm against
        ``Database.getAllTagWithoutCN``).

        Failures are logged, never raised. A failure on one tag no longer
        prevents the remaining tags from being processed.

        Returns:
            None.
        """
        try:
            db = Database(self.log, self.config)
            tags = db.getAllTagWithoutCN()
        except Exception as TransErr:
            self.log.takeLog('ERROR', 'Translator Error in tag:' + str(TransErr))
            return

        if not tags:
            return

        for tagOne in tags:
            # Isolate errors per tag so that one bad response (network error,
            # invalid JSON, empty result list) does not abort the whole run.
            try:
                t_name_cn = self._fetchTranslation(tagOne[1])
                if t_name_cn:
                    db.updateTagCN(tagOne[0], t_name_cn)
            except Exception as TransErr:
                self.log.takeLog('ERROR', 'Translator Error in tag:' + str(TransErr))

    def _fetchTranslation(self, tagName):
        """Query the translation service for one tag and return the result.

        Args:
            tagName: tag text; spaces are replaced by ``+`` before it is
                appended to ``config.transURL``.

        Returns:
            The first entry of the response's ``translation`` list if that
            key is present, otherwise the first ``value`` of the first
            ``web`` entry, otherwise an empty string.

        Raises:
            Whatever the HTTP request, JSON decoding or indexing raises;
            the caller logs it.
        """
        url = self.config.transURL + tagName.replace(' ', '+')
        cookie = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
        response = opener.open(urllib2.Request(url=url))
        try:
            # Raw body returned by the service.
            htmlData = response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()

        result = json.loads(htmlData)
        if not result:
            return ''
        if 'translation' in result:
            return result['translation'][0]
        if 'web' in result:
            return result['web'][0]['value'][0]
        return ''
Exemplo n.º 2
0
class Reader(object):
    """Reads the show-listing HTML pages and stores the shows found in them."""

    def __init__(self, config):
        """Keep the configuration, create a logger and set up the page list.

        Args:
            config: configuration object. ``allShowsWork`` reads its
                ``urlAllShows`` attribute and hands the whole object to
                ``Database``.
        """
        # Python 2 idiom: reloading sys makes setdefaultencoding available
        # again so the process-wide default encoding can be set to UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        self.config = config
        self.log = Log()
        # Alphabetical index of show names; each entry is appended to
        # config.urlAllShows to form one listing-page URL.
        self.AllShowsList = ['0','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']

    def allShowsWork(self):
        """Fetch every listing page and insert the shows and tags it contains.

        For each entry of ``AllShowsList`` the page is downloaded; a download
        failure is logged and that page is skipped. Every
        ``div.contbox.prembox.removed`` box on the page is turned into a show
        record and written with ``Database.insertShowFirstTime``; a non-empty
        tag is written with ``Database.insertTag``.

        Returns:
            None.

        Raises:
            Exception: any error raised while parsing a page or writing to
                the database is logged and then re-raised.
        """
        for letter in self.AllShowsList:
            # Reset per page so a failed request can never leave the previous
            # page's HTML (or an unbound name) behind.
            htmlData = ""
            try:
                cookie = cookielib.CookieJar()
                opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
                # Listing URL from the config plus the index letter,
                # e.g. 'http://www.pogdesign.co.uk/cat/all-shows/0'
                req = urllib2.Request(
                    url = self.config.urlAllShows+letter
                )
                response = opener.open(req)
                try:
                    # Raw page data
                    htmlData = response.read()
                finally:
                    response.close()
            except Exception as connErr:
                self.log.takeLog('ERROR','Connection Error:' + str(connErr))

            if not htmlData:
                continue

            try:
                # Each matching div is one box, and each box is one show.
                for OneBox in BeautifulSoup(htmlData).findAll('div', attrs={'class' : 'contbox prembox removed'}):
                    # Parse the box once and reuse the tree for every lookup.
                    box = BeautifulSoup(str(OneBox))
                    showName = box.h2.get_text()
                    # The style attribute embeds the image URL and is trimmed below.
                    imageURL = str(box.a['style'])
                    # Span that carries the show's status text.
                    statusStringArray = box.find('span',attrs={'class':'hil selby'})

                    # Note from 2016-05-20: the page format changed; revisit
                    # this extraction if the page changes again.
                    statusString = str(statusStringArray)
                    # Drop the first 24 characters (the length of
                    # '<span class="hil selby">') and the last 8 characters
                    # (presumably '</span>' plus one trailing character).
                    tag = statusString[24:-8]
                    # Whitespace filtering, kept verbatim from the original.
                    # NOTE(review): confirm the first argument is the intended
                    # (possibly non-breaking) space character.
                    tag = tag.replace(' ',' ')

                    aShow = {
                        's_name' : showName,
                        # Presumably strips a 'background-image: url(' prefix
                        # (22 characters) and a trailing ');' -- confirm.
                        's_sibox_image' : imageURL[22:-2],
                        'link' : box.a['href']
                    }
                    if aShow['s_name'] == '' or aShow['link'] == '' or aShow['s_sibox_image'] == '':
                        self.log.takeLog('WARNING','''allShowsWork function cannot collect data correctly, the vars are like below:\n s_name=%s,s_sibox_image=%s,link=%s'''%(aShow['s_name'],aShow['s_sibox_image'],aShow['link']))
                    db = Database(self.log,self.config)
                    Id = db.insertShowFirstTime(aShow)
                    if len(tag) != 0:
                        db.insertTag(Id,tag)
            except Exception as syntaxErr:
                # Log before re-raising; the original raised first, which made
                # the log call unreachable.
                self.log.takeLog('ERROR','Syntax Tree Error:' + str(syntaxErr))
                raise syntaxErr
Exemplo n.º 3
0
class Tools(object):
    """A class which is very similar to the ReadHtml class.

    The difference is that it uses a step-forward method to handle data
    manually: one listing page is processed per call and the extracted
    records are printed instead of being written to the database.
    """

    def __init__(self, config):
        """Keep the configuration, create a logger and set up lookup tables.

        Args:
            config: configuration object. ``flush_one_page`` reads its
                ``urlAllShows`` attribute.
        """
        # Python 2 idiom: reloading sys makes setdefaultencoding available
        # again so the process-wide default encoding can be set to UTF-8.
        reload(sys)
        sys.setdefaultencoding('utf-8')
        self.config = config
        self.log = Log()
        # Alphabetical index of show names
        self.AllShowsList = ['0','A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
        # Month-name to month-number table.
        # Deprecated; kept only for compatibility.
        self.month = {
            'Jan' : '01',
            'Feb' : '02',
            'Mar' : '03',
            'Apr' : '04',
            'May' : '05',
            'Jun' : '06',
            'Jul' : '07',
            'Aug' : '08',
            'Sep' : '09',
            'Oct' : '10',
            'Nov' : '11',
            'Dec' : '12'
        }

    def flush_one_page(self,character):
        """Refresh all shows of one listing page.

        Downloads ``config.urlAllShows + str(character)``, extracts one record
        per ``div.contbox.prembox.removed`` box and prints it. A download
        failure is logged and ends the call; parse errors propagate to the
        caller.

        Args:
            character: index of the page; per the original note, an
                upper-case letter. It is converted with ``str()``.

        Returns:
            None.
        """
        # Bind before the try block so a failure in the request setup cannot
        # leave the name undefined.
        htmlData = ""
        try:
            cookie = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
            # Listing URL from the config plus the index letter,
            # e.g. 'http://www.pogdesign.co.uk/cat/all-shows/0'
            req = urllib2.Request(
                url = self.config.urlAllShows+str(character)
            )
            response = opener.open(req)
            try:
                # Raw page data
                htmlData = response.read()
            finally:
                response.close()
        except Exception as connErr:
            self.log.takeLog('ERROR','Connection Error:' + str(connErr))

        if not htmlData:
            return

        # Each matching div is one box, and each box is one show.
        for OneBox in BeautifulSoup(htmlData).findAll('div', attrs={'class' : 'contbox prembox removed'}):
            # Parse the box once and reuse the tree for every lookup.
            box = BeautifulSoup(str(OneBox))
            showName = box.h2.get_text()
            showName = showName.replace("'","\\'")
            # The style attribute embeds the image URL and is trimmed below.
            imageURL = str(box.a['style'])
            # Span that carries the show's status text.
            statusStringArray = box.find('span',attrs={'class':'hil selby'})

            # Notes from 2016-05-20 / 2016-05-30: the page format changed
            # twice; this extraction flow still needs to be revised.
            statusString = str(statusStringArray)
            # Drop the first 24 characters (the length of
            # '<span class="hil selby">') and the last 8 characters
            # (presumably '</span>' plus one trailing character).
            tag = statusString[24:-8]
            # Whitespace filtering, kept verbatim from the original.
            # NOTE(review): confirm the first argument is the intended
            # (possibly non-breaking) space character.
            tag = tag.replace(' ',' ')
            tag = tag.replace("\'","\\'")

            aShow = {
                's_name' : showName,
                # Presumably strips a 'background-image: url(' prefix
                # (22 characters) and a trailing ');' -- confirm.
                's_sibox_image' : imageURL[22:-2],
                'link' : box.a['href'],
            }

            print(aShow)
            if aShow['s_name'] == '' or aShow['link'] == '' or aShow['s_sibox_image'] == '':
                # The message now names this method; it previously said
                # 'allShowsWork', copied from the sibling reader class.
                self.log.takeLog('WARNING','''flush_one_page function cannot collect data correctly, the vars are like below:\n s_name=%s,s_sibox_image=%s,link=%s'''%(aShow['s_name'],aShow['s_sibox_image'],aShow['link']))
            # Database write intentionally disabled in this manual tool;
            # `tag` is computed above so it can be re-enabled as-is:
            # db = Database(self.log,self.config)
            # Id = db.insertShowFirstTime(aShow)
            # if len(tag) != 0:
            #     db.insertTag(Id,tag)