예제 #1
0
    def parse(self, task):
        """Process one fetched page of the Shanghai stock list.

        ``task['__data']`` is handed to ``self.parseContent``. When the
        returned status is 'OK', every entry of ``ret['stockList']`` gets its
        ``href`` resolved against ``task.url`` (in place) and a
        ``ShStockBasicInfoHandler`` task is created for it. The list is then
        dumped to ``sh_stock_list_page_<key>.json`` / ``.txt`` under
        ``iPapa.iTsOutputPath`` and, if ``ret['nextPage']`` is not empty, an
        ``InitShHandler`` task carrying ``key + 1`` is created for it.

        ``task.status`` becomes 'failed' when the status is not 'OK' or when
        a dump helper returns anything other than True.

        Returns ``{'newTasks': [...]}`` if any task was created, else ``{}``.
        """
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        spawned = []
        stocks = ret['stockList']
        # The absolute URL is written back into the entry, so the dumped
        # files below contain resolved links as well.
        for stock in stocks:
            stock['href'] = urlparse.urljoin(task.url, stock['href'])
            spawned.append(Task(-1,
                                url=stock['href'],
                                handler='ShStockBasicInfoHandler',
                                ref=task.url))

        outDir = iPapa.iTsOutputPath
        page = task['key']
        jsonPath = os.path.join(outDir, 'sh_stock_list_page_%d.json' % page)
        txtPath = os.path.join(outDir, 'sh_stock_list_page_%d.txt' % page)

        # Both dumps are always attempted; either one failing marks the task.
        jsonDumped = util.dump2JsonFile(stocks, jsonPath)
        if jsonDumped != True:
            task.status = 'failed'
        txtDumped = util.dumpDictList2TxtFile(stocks, txtPath)
        if txtDumped != True:
            task.status = 'failed'

        # Chain to the following list page when the parser reported one.
        nextLink = ret['nextPage']
        if nextLink != '':
            follow = Task(-1,
                          url=urlparse.urljoin(task.url, nextLink),
                          handler='InitShHandler',
                          ref=task.url)
            follow['key'] = page + 1
            spawned.append(follow)

        return {'newTasks': spawned} if spawned else {}
예제 #2
0
    def parse(self, task):
        """Dump a parsed stock list and queue one task per unseen stock page.

        ``task['__data']`` is parsed with ``self.parseContent``. On an 'OK'
        status ``ret['stockList']`` is written to
        ``hk_gem_stock_list_cn.json`` / ``.txt`` under
        ``iPapa.iTsOutputPath``; then, for every key of
        ``ret['stockInfoPage']`` that is not in ``self.historyKeys``, a
        ``HkStockInfoCnPageHandler`` task is created with
        ``key = "hk_stock_info_" + <stock key>``.

        ``task.status`` becomes 'failed' when the status is not 'OK' or when
        a dump helper returns anything other than True.

        Returns ``{'newTasks': [...]}`` if any task was created, else ``{}``.
        """
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        outDir = iPapa.iTsOutputPath
        jsonPath = os.path.join(outDir, 'hk_gem_stock_list_cn.json')
        txtPath = os.path.join(outDir, 'hk_gem_stock_list_cn.txt')

        stockList = ret['stockList']
        if util.dump2JsonFile(stockList, jsonPath) != True:
            task.status = 'failed'
        if util.dump2TxtFile(stockList, txtPath) != True:
            task.status = 'failed'

        spawned = []
        infoPages = ret['stockInfoPage']
        for stockKey in infoPages:
            # Keys recorded in historyKeys are not queued again.
            if stockKey in self.historyKeys:
                continue
            infoTask = Task(-1,
                            url=infoPages[stockKey],
                            handler='HkStockInfoCnPageHandler',
                            ref=task.url)
            infoTask['key'] = "hk_stock_info_" + stockKey
            spawned.append(infoTask)

        return {'newTasks': spawned} if spawned else {}
예제 #3
0
    def dealWithParserOutput(self, task):
        """Book-keep a task returned by its parser and archive it as JSON.

        Behaviour by ``task.status``:

        * ``"done"`` / ``"ignore"``: increment the matching counter, remove
          the task via ``self.rmTask``, log at info level and dump only
          ``task.exprMe()`` to ``doneTask.<id>.json`` /
          ``ignoreTask.<id>.json``.
        * anything else is handled as a failure: ``self.returnVal`` is set to
          False, the failed counter is incremented, the task is removed, an
          error is logged and the whole task (with its ``__data`` entry
          removed, if present) is dumped to ``failedTask.<id>.json``.

        All files are written under ``iPapa.iTsOutputPath``. In every case
        ``task['__expr__']`` is set to ``task.exprMe()``. Returns None.
        """
        state = task.status
        if state == "done":
            outcome = (self.taskDoneCounter,
                       "task [%d] Done [%s]",
                       "doneTask.%d.json")
        elif state == "ignore":
            outcome = (self.taskIgnoreCounter,
                       "task [%d] Ignored [%s]",
                       "ignoreTask.%d.json")
        else:
            outcome = None

        if outcome is not None:
            # "done" and "ignore" differ only in counter, log text and file name.
            counter, logFormat, dumpPattern = outcome
            counter.inc()
            self.rmTask(task)
            myLogger.info(logFormat % (task.id, task.exprMe()))
            task["__expr__"] = task.exprMe()
            target = os.path.join(iPapa.iTsOutputPath, dumpPattern % task.id)
            util.dump2JsonFile(task["__expr__"], target)
            return

        # Failure path: flag the overall run result and keep the full task.
        self.returnVal = False
        self.taskFailedCounter.inc()
        self.rmTask(task)
        myLogger.error("task [%d] Failed [%s], dump it" % (task.id, task.exprMe()))
        task["__expr__"] = task.exprMe()
        # The "__data" entry is dropped before the task itself is dumped.
        if "__data" in task:
            del task["__data"]
        target = os.path.join(iPapa.iTsOutputPath, "failedTask.%d.json" % task.id)
        util.dump2JsonFile(task, target)
        # TODO: if the repeat argument is ever supported, take care of the
        #  function flush.
예제 #4
0
 def dealWithParserOutput(self, task):
     """Record the outcome of a parsed task and archive it as JSON.

     Behaviour by ``task.status``:

     * ``'done'`` / ``'ignore'``: increment the matching counter, remove
       the task via ``self.rmTask``, log at info level and dump only
       ``task.exprMe()`` to ``doneTask.<id>.json`` /
       ``ignoreTask.<id>.json``.
     * anything else is handled as a failure: ``self.returnVal`` is set to
       False, the failed counter is incremented, the task is removed, an
       error is logged and the whole task (with its ``__data`` entry
       removed, if present) is dumped to ``failedTask.<id>.json``.

     All files are written under ``iPapa.iTsOutputPath``. In every case
     ``task['__expr__']`` is set to ``task.exprMe()``. Returns None.
     """
     state = task.status
     if state == 'done':
         outcome = (self.taskDoneCounter,
                    "task [%d] Done [%s]",
                    "doneTask.%d.json")
     elif state == 'ignore':
         outcome = (self.taskIgnoreCounter,
                    "task [%d] Ignored [%s]",
                    "ignoreTask.%d.json")
     else:
         outcome = None

     if outcome is not None:
         # 'done' and 'ignore' differ only in counter, log text and file name.
         counter, logFormat, dumpPattern = outcome
         counter.inc()
         self.rmTask(task)
         myLogger.info(logFormat % (task.id, task.exprMe()))
         task['__expr__'] = task.exprMe()
         target = os.path.join(iPapa.iTsOutputPath, dumpPattern % task.id)
         util.dump2JsonFile(task['__expr__'], target)
         return

     # Failure path: flag the overall run result and keep the full task.
     self.returnVal = False
     self.taskFailedCounter.inc()
     self.rmTask(task)
     myLogger.error("task [%d] Failed [%s], dump it" % (task.id, task.exprMe()))
     task['__expr__'] = task.exprMe()
     # The '__data' entry is dropped before the task itself is dumped.
     if '__data' in task:
         del task['__data']
     target = os.path.join(iPapa.iTsOutputPath, "failedTask.%d.json" % task.id)
     util.dump2JsonFile(task, target)
     # TODO: if the repeat argument is ever supported, take care of the
     #  function flush.
    def parse(self, task):
        """Dump the basic info of one stock to JSON and text files.

        ``task['__data']`` is parsed with ``self.parseContent``. On an 'OK'
        status ``ret['stockBasicInfo']`` is written to
        ``sh_stock_basic_info_<code>.json`` / ``.txt`` under
        ``iPapa.iTsOutputPath``, where ``<code>`` is the info's ``'code'``
        entry.

        ``task.status`` becomes 'failed' when the status is not 'OK' or when
        a dump helper returns anything other than True.

        Returns ``{}`` always: this handler creates no follow-up tasks.
        """
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        info = ret['stockBasicInfo']
        code = info['code']
        outDir = iPapa.iTsOutputPath
        jsonPath = os.path.join(outDir, 'sh_stock_basic_info_%s.json' % code)
        txtPath = os.path.join(outDir, 'sh_stock_basic_info_%s.txt' % code)

        # Both dumps are always attempted; anything other than True from a
        # helper is treated as a failure.
        if util.dump2JsonFile(info, jsonPath) != True:
            task.status = 'failed'
        if util.dumpDict2TxtFile(info, txtPath) != True:
            task.status = 'failed'

        # No new tasks are ever produced here, so there is nothing to report.
        return {}
예제 #6
0
    def parse(self, task):
        """Handle a fetched Shanghai stock-list page and fan out new tasks.

        Runs ``self.parseContent`` on ``task['__data']``. If the status is
        'OK':

        * each ``ret['stockList']`` entry has its ``href`` made absolute
          relative to ``task.url`` (stored back into the entry) and gets a
          ``ShStockBasicInfoHandler`` task;
        * the list is dumped to ``sh_stock_list_page_<key>.json`` and
          ``.txt`` under ``iPapa.iTsOutputPath``;
        * a non-empty ``ret['nextPage']`` yields an ``InitShHandler`` task
          whose ``key`` is the current ``key`` plus one.

        ``task.status`` is set to 'failed' if the status is not 'OK' or a
        dump helper returns anything other than True.

        Returns ``{'newTasks': [...]}`` when tasks were created, else ``{}``.
        """
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        queued = []
        listing = ret['stockList']
        for entry in listing:
            # Keep the resolved link in the entry so the dumps contain it.
            absolute = urlparse.urljoin(task.url, entry['href'])
            entry['href'] = absolute
            detailTask = Task(-1,
                              url=entry['href'],
                              handler='ShStockBasicInfoHandler',
                              ref=task.url)
            queued.append(detailTask)

        baseDir = iPapa.iTsOutputPath
        pageNo = task['key']
        jsonLoc = os.path.join(baseDir,
                               'sh_stock_list_page_%d.json' % pageNo)
        txtLoc = os.path.join(baseDir,
                              'sh_stock_list_page_%d.txt' % pageNo)

        wroteJson = util.dump2JsonFile(listing, jsonLoc)
        if wroteJson != True:
            task.status = 'failed'
        wroteTxt = util.dumpDictList2TxtFile(listing, txtLoc)
        if wroteTxt != True:
            task.status = 'failed'

        # Follow the pager, numbering the next page one past this one.
        nextHref = ret['nextPage']
        if nextHref != '':
            pagerTask = Task(-1,
                             url=urlparse.urljoin(task.url, nextHref),
                             handler='InitShHandler',
                             ref=task.url)
            pagerTask['key'] = pageNo + 1
            queued.append(pagerTask)

        if not queued:
            return {}
        return {'newTasks': queued}
    def parse(self, task):
        """Dump the parsed info page of one stock to JSON and text files.

        ``task['__data']`` is parsed with ``self.parseContent``. On an 'OK'
        status the whole result is written to
        ``hk_stock_info_cn_<key>.json`` and, as a list of ``(key, value)``
        pairs, to ``hk_stock_info_cn_<key>.txt`` under
        ``iPapa.iTsOutputPath``; ``<key>`` is ``task['key']``.

        ``task.status`` becomes 'failed' when the status is not 'OK' or when
        a dump helper returns anything other than True.

        Returns ``{}`` always: this handler creates no follow-up tasks.
        """
        # Single pre-formatted argument: prints the same line under the
        # Python 2 print statement and the Python 3 print function.
        print("HkStockInfoCnPageHandler parse %s %s" % (task.url, task['key']))

        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        key = task['key']
        outDir = iPapa.iTsOutputPath
        jsonPath = os.path.join(outDir, 'hk_stock_info_cn_' + key + '.json')
        txtPath = os.path.join(outDir, 'hk_stock_info_cn_' + key + '.txt')

        if util.dump2JsonFile(ret, jsonPath) != True:
            task.status = 'failed'

        # list(items()) yields the same (key, value) pairs the former
        # zip(keys(), values()) produced, as a real list on any interpreter.
        if util.dump2TxtFile(list(ret.items()), txtPath) != True:
            task.status = 'failed'

        # Nothing is ever scheduled from here, whatever the final status.
        return {}
예제 #8
0
    def parse(self, task):
        """Write one stock's basic info to JSON and text files.

        Runs ``self.parseContent`` on ``task['__data']``. If the status is
        'OK', ``ret['stockBasicInfo']`` is dumped to
        ``sh_stock_basic_info_<code>.json`` and ``.txt`` under
        ``iPapa.iTsOutputPath``; ``<code>`` comes from the info's ``'code'``
        entry.

        ``task.status`` is set to 'failed' if the status is not 'OK' or a
        dump helper returns anything other than True.

        Returns ``{}`` always: no follow-up tasks are created here.
        """
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        basicInfo = ret['stockBasicInfo']
        code = basicInfo['code']
        baseDir = iPapa.iTsOutputPath
        jsonLoc = os.path.join(baseDir,
                               'sh_stock_basic_info_%s.json' % code)
        txtLoc = os.path.join(baseDir,
                              'sh_stock_basic_info_%s.txt' % code)

        # Each dump is attempted regardless of the other's outcome; only an
        # explicit True from the helper counts as success.
        if util.dump2JsonFile(basicInfo, jsonLoc) != True:
            task.status = 'failed'
        if util.dumpDict2TxtFile(basicInfo, txtLoc) != True:
            task.status = 'failed'

        # No new tasks are ever produced, so the result is always empty.
        return {}
예제 #9
0
    def parse(self, task):
        """Write the parsed info page of one stock to JSON and text files.

        Runs ``self.parseContent`` on ``task['__data']``. If the status is
        'OK', the whole result is dumped to ``hk_stock_info_cn_<key>.json``
        and, as a list of ``(key, value)`` pairs, to
        ``hk_stock_info_cn_<key>.txt`` under ``iPapa.iTsOutputPath``;
        ``<key>`` is ``task['key']``.

        ``task.status`` is set to 'failed' if the status is not 'OK' or a
        dump helper returns anything other than True.

        Returns ``{}`` always: no follow-up tasks are created here.
        """
        # One pre-formatted argument keeps this line valid, with identical
        # output, for both the Python 2 statement and the Python 3 function.
        print("HkStockInfoCnPageHandler parse %s %s" % (task.url, task['key']))

        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        key = task['key']
        baseDir = iPapa.iTsOutputPath
        jsonLoc = os.path.join(baseDir,
                               'hk_stock_info_cn_' + key + '.json')
        txtLoc = os.path.join(baseDir,
                              'hk_stock_info_cn_' + key + '.txt')

        if util.dump2JsonFile(ret, jsonLoc) != True:
            task.status = 'failed'

        # Same (key, value) pairs as zip(keys(), values()), materialised as
        # a list so the helper receives a sequence on any interpreter.
        pairs = list(ret.items())
        if util.dump2TxtFile(pairs, txtLoc) != True:
            task.status = 'failed'

        # The result is empty regardless of the final task status.
        return {}
예제 #10
0
    def parse(self, task):
        """Dump a parsed stock list and queue one task per unseen stock page.

        ``task['__data']`` is parsed with ``self.parseContent``. On an 'OK'
        status ``ret['stockList']`` is written to
        ``hk_main_stock_list_cn.json`` / ``.txt`` under
        ``iPapa.iTsOutputPath``; then, for every key of
        ``ret['stockInfoPage']`` that is not in ``self.historyKeys``, a
        ``HkStockInfoCnPageHandler`` task is created with
        ``key = "hk_stock_info_" + <stock key>``.

        ``task.status`` becomes 'failed' when the status is not 'OK' or when
        a dump helper returns anything other than True.

        Returns ``{'newTasks': [...]}`` if any task was created, else ``{}``.
        """
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        outDir = iPapa.iTsOutputPath
        jsonPath = os.path.join(outDir, 'hk_main_stock_list_cn.json')
        txtPath = os.path.join(outDir, 'hk_main_stock_list_cn.txt')

        stockList = ret['stockList']
        if util.dump2JsonFile(stockList, jsonPath) != True:
            task.status = 'failed'
        if util.dump2TxtFile(stockList, txtPath) != True:
            task.status = 'failed'

        spawned = []
        infoPages = ret['stockInfoPage']
        for stockKey in infoPages:
            # Keys recorded in historyKeys are not queued again.
            if stockKey in self.historyKeys:
                continue
            infoTask = Task(-1,
                            url=infoPages[stockKey],
                            handler='HkStockInfoCnPageHandler',
                            ref=task.url)
            infoTask['key'] = "hk_stock_info_" + stockKey
            spawned.append(infoTask)

        return {'newTasks': spawned} if spawned else {}
예제 #11
0
    def parse(self, task):
        print "ContentPageHandler parse", task.url, task['key']
        newTasks = []
        ret, status = self.parseContent(task['__data'])
        meta = {}
        if status == 'OK':
            key = task['key']
            keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)
            #siteTile 
            meta['siteTitle'] = ret['siteTitle']
            #title
            meta['title'] = ret['title']
            # url
            meta['url'] = task.url
            # date
            meta['date'] = ret['date']
            #contentPics
            #record and new task to download it
            meta['contentPics'] = ret['contentPics']
            meta['contentPicCaptions'] = ret['contentPicCaptions']
            meta['embPics'] = ret['embPics']
            #create new tasks here
            if len(ret['contentPics']) and len(ret['contentPicCaptions']):
                picUrl = ret['contentPics'][0]
                #only the first pic here, ignore others now
                dest = os.path.join(key, util.getUrlFileName(picUrl)) 
                newT = Task(-1, url=picUrl, handler='PicHandler', taskType='media', ref=task.url, dest=dest)  
                newT['key'] = task['key']
                newT['picType'] = 'contentPic'
                newTasks.append(newT)

            #for content, we store it 
            contentLoc = os.path.join(keyOutputPath, 'content.html')
            util.writeFile(contentLoc, ret['content'])
            # contentMp3 
            if 'contentMp3' in ret:
                url = ret['contentMp3']
                dest = os.path.join(key, util.getUrlFileName(url)) 
                newT = Task(-1, url=url, handler='AudioHandler', taskType='media', ref=task.url, dest=dest)  
                newT['key'] = task['key']
                newT['audioType'] = os.path.splitext(dest)[1].upper()
                newTasks.append(newT)
            elif 'contentMp3Page' in ret: #always be with big file, we ignore it 
                #url = ret['contentMp3Page']
                #newT = Task(-1, url=urlparse.urljoin(task.url, url), handler='ContentMp3PageHandler', ref=task.url,) 
                #newT['key'] = task['key']
                #newTasks.append(newT)
                task.status = 'ignore' 
                meta['isIgnore'] = True
                meta['ignoreMsg'] = "Audio file is too big, we should ignore this now."
                task.msg = 'Audio file is too big, we should ignore this now.' 
            else:
                #Failed
                task.status = 'failed' 
                task.msg = 'failed in Findding a Audio' 

            # download here 
            # embPics
            for embPic in ret['embPics']:
                picUrl = embPic
                #only the first pic here, ignore others now
                dest = os.path.join(key, util.getUrlFileName(picUrl)) 
                newT = Task(-1, url=picUrl, handler='PicHandler', taskType='media', ref=task.url, dest=dest)  
                newT['key'] = task['key']
                newT['picType'] = 'embPic'
                newTasks.append(newT)

            # store meta file
            metaLoc = os.path.join(keyOutputPath, 'meta.json') 
            if util.dump2JsonFile(meta, metaLoc) != True:
                task.status = 'failed'    

        else:
            task.status = 'failed'
        if task.status == 'ignore': 
            return {}
        if newTasks != []:
            return {'newTasks': newTasks}
        return {}
예제 #12
0
    def parse(self, task):
        print "ContentPageHandler parse", task.url, task['key']
        newTasks = []
        ret, status = self.parseContent(task['__data'])
        meta = {}
        if status == 'OK':
            key = task['key']
            keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)
            #siteTile
            meta['siteTitle'] = ret['siteTitle']
            #title
            meta['title'] = ret['title']
            # url
            meta['url'] = task.url
            # date
            meta['date'] = ret['date']
            #contentPics
            #record and new task to download it
            meta['contentPics'] = ret['contentPics']
            meta['contentPicCaptions'] = ret['contentPicCaptions']
            meta['embPics'] = ret['embPics']
            #create new tasks here
            if len(ret['contentPics']) and len(ret['contentPicCaptions']):
                picUrl = ret['contentPics'][0]
                #only the first pic here, ignore others now
                dest = os.path.join(key, util.getUrlFileName(picUrl))
                newT = Task(-1,
                            url=picUrl,
                            handler='PicHandler',
                            taskType='media',
                            ref=task.url,
                            dest=dest)
                newT['key'] = task['key']
                newT['picType'] = 'contentPic'
                newTasks.append(newT)

            #for content, we store it
            contentLoc = os.path.join(keyOutputPath, 'content.html')
            util.writeFile(contentLoc, ret['content'])
            # contentMp3
            if 'contentMp3' in ret:
                url = ret['contentMp3']
                dest = os.path.join(key, util.getUrlFileName(url))
                newT = Task(-1,
                            url=url,
                            handler='AudioHandler',
                            taskType='media',
                            ref=task.url,
                            dest=dest)
                newT['key'] = task['key']
                newT['audioType'] = os.path.splitext(dest)[1].upper()
                newTasks.append(newT)
            elif 'contentMp3Page' in ret:  #always be with big file, we ignore it
                #url = ret['contentMp3Page']
                #newT = Task(-1, url=urlparse.urljoin(task.url, url), handler='ContentMp3PageHandler', ref=task.url,)
                #newT['key'] = task['key']
                #newTasks.append(newT)
                task.status = 'ignore'
                meta['isIgnore'] = True
                meta[
                    'ignoreMsg'] = "Audio file is too big, we should ignore this now."
                task.msg = 'Audio file is too big, we should ignore this now.'
            else:
                #Failed
                task.status = 'failed'
                task.msg = 'failed in Findding a Audio'

            # download here
            # embPics
            for embPic in ret['embPics']:
                picUrl = embPic
                #only the first pic here, ignore others now
                dest = os.path.join(key, util.getUrlFileName(picUrl))
                newT = Task(-1,
                            url=picUrl,
                            handler='PicHandler',
                            taskType='media',
                            ref=task.url,
                            dest=dest)
                newT['key'] = task['key']
                newT['picType'] = 'embPic'
                newTasks.append(newT)

            # store meta file
            metaLoc = os.path.join(keyOutputPath, 'meta.json')
            if util.dump2JsonFile(meta, metaLoc) != True:
                task.status = 'failed'

        else:
            task.status = 'failed'
        if task.status == 'ignore':
            return {}
        if newTasks != []:
            return {'newTasks': newTasks}
        return {}