Example #1
 def process(self, params):
     # S2 query process
     if constant.SPIDER_CHANNEL_S2 == SpiderConfigure.getinstance().getchannel():
         if constant.SPIDER_S2_WEBSITE_TYPE not in params.customized:
             return True
     xparser = XPathUtility(params.content)
     pageinfo = PageBasicInfo()
     template = None
     for template in TemplateManager.getxpaths(params.url):
         Logger.getlogging().debug('URL_TEMPLATE {url}\t{template}'.format(
             url=params.url,
             template=template[TemplateManager.XPATH_KEY_URL_TEMPLATE]))
         pageinfo, items = self.parsefromcontent(params, template, xparser)
         if constant.SPIDER_S2_WEBSITE_TYPE in params.customized:
             pageinfo.type = params.customized[
                 constant.SPIDER_S2_WEBSITE_TYPE]
     #if not params.page_title and not pageinfo.title and not params.lastretry:
     #    return False
     if template is None:
         Logger.log(params.url, constant.ERRORCODE_SITE_NOGET_TEMPLATE)
     # Value overrides: fill in any fields the templates left empty
     pageinfo.url = params.url
     if not pageinfo.title:
         pageinfo.title = params.page_title
     if not pageinfo.body:
         pageinfo.body = params.page_body
     if not pageinfo.pubtime:
         pageinfo.pubtime = params.html_time
     NewsStorage.seturlinfos(pageinfo)
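
The tail of process() is a fill-if-empty merge: parsed values win, and the raw request fields (page_title, page_body, html_time) only fill in the blanks. A minimal, runnable sketch of that pattern; PageInfo and merge_fallbacks are hypothetical stand-ins, not the project's classes:

    class PageInfo(object):
        def __init__(self):
            self.url = ''
            self.title = ''
            self.body = ''
            self.pubtime = ''

    def merge_fallbacks(pageinfo, url, page_title, page_body, html_time):
        # The request URL always wins; other fields fall back only when empty.
        pageinfo.url = url
        pageinfo.title = pageinfo.title or page_title
        pageinfo.body = pageinfo.body or page_body
        pageinfo.pubtime = pageinfo.pubtime or html_time
        return pageinfo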
Example #2
 def __storeurllist__(self, urllist, type=constant.SPIDER_S2_WEBSITE_VIDEO, referlist=None):
     # Use None instead of a mutable default argument; referlist is only
     # consumed by the commented-out refer-URL logic below.
     if referlist is None:
         referlist = []
     count = 0
     index = 0
     for url in urllist:
         params = PageBasicInfo()
         params.url = url
         params.type = type
         # Check whether the URL already exists in the cold database;
         # if not, insert it into the hot database
         if not NewsStorage.exist_cold(url):
             NewsStorage.seturlinfos(params)
         #params = {constant.SPIDER_S2_WEBSITE_TYPE: type,
         #constant.SPIDER_CHANNEL: constant.SPIDER_CHANNEL_S1}            
         #url = url.strip()
         #if not URLManager.getinstance().exist(url):
         #count += 1
         #if referlist:
         #params[SiteS2Query.REFER_URL] = referlist[index]
         #urlcontext = URLContext()
         #urlcontext.url = url
         #urlcontext.type = URLContext.S1_MAIN_BODY
         #urlcontext.originalurl = url
         #urlcontext.customized = params
         #URLManager.getinstance().storeurl(url, urlcontext, constant.REQUEST_TYPE_WEBKIT) 
         index += 1
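
The check-then-insert above treats cold storage as the dedupe source of truth. A runnable sketch of that flow, with a plain set and dict standing in for NewsStorage's cold and hot databases (store_if_new is a made-up name, not the real API):

    cold_db = {'http://example.com/seen'}   # URLs already archived as cold
    hot_db = {}                             # hot store receiving new URLs

    def store_if_new(url, info):
        # Insert into hot storage only when the URL is absent from cold storage.
        if url not in cold_db:
            hot_db[url] = info

    store_if_new('http://example.com/new', {'type': 'video'})
    store_if_new('http://example.com/seen', {'type': 'video'})
    assert 'http://example.com/seen' not in hot_db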
Example #3
 def __storeurl__(self, url, publishdate, type=constant.SPIDER_S2_WEBSITE_VIDEO):
     params = PageBasicInfo()
     params.url = url
     params.type = type
     params.pubtime = publishdate
     # Check whether the URL already exists in the cold database;
     # if not, insert it into the hot database
     if not NewsStorage.exist_cold(url):
         NewsStorage.seturlinfos(params)
 def process(self, params):
     title = params.page_title
     if not title:
         title = XPathUtility(params.content).gettitle('/html/head/title')
     # Set the URL's title, body, and publish time
     urlinfo = {
         MongoDAO.SPIDER_COLLECTION_NEWS_TITLE: title,
         MongoDAO.SPIDER_COLLECTION_NEWS_BODY: params.page_body,
         MongoDAO.SPIDER_COLLECTION_NEWS_PUBLISH_DATE: params.html_time
     }
     NewsStorage.seturlinfos(params.url, urlinfo)
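
gettitle('/html/head/title') presumably pulls the <title> text out of the raw HTML when the upstream page_title is missing. A rough, runnable equivalent using lxml (an assumption about what XPathUtility does here, not its actual implementation):

    from lxml import html

    def get_title(content):
        # Parse the document and return the first <title> text, or '' if absent.
        nodes = html.fromstring(content).xpath('/html/head/title/text()')
        return nodes[0].strip() if nodes else ''

    print(get_title('<html><head><title> Demo </title></head></html>'))  # Demo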
Example #5
    def wb_analysis(self, filepath):
        Logger.getlogging().info(
            'Now, starting to analyze waibu file {fl}'.format(fl=filepath))
        if '302_tencent_video' in filepath:
            type = constant.SPIDER_S2_WEBSITE_VIDEO
        else:
            type = constant.SPIDER_S2_WEBSITE_NEWS

        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        lines = FileUtility.readlines(filepath)
        tempwaibustorage = {}
        for line in lines:
            try:
                line = json.loads(line)
                params = PageBasicInfo()
                params.query = line['query']
                params.url = line['url']
                params.title = Common.strfilter(line['title'])
                params.body = Common.strfilter(line['body'])
                params.pubtime = line['pubtime']
                clicknum = line.get('clicknum', 0)
                if clicknum:
                    params.clicknum = int(clicknum)
                params.type = type
                if params.query not in URLManager.waibustorage:
                    URLManager.waibustorage[params.query] = []
                if params.query not in tempwaibustorage:
                    tempwaibustorage[params.query] = []
                URLManager.waibustorage[params.query].append(params)
                tempwaibustorage[params.query].append(params)
            except Exception:
                Logger.printexception()

        Logger.getlogging().debug(
            'Now selecting URLs to insert and update for uploading WAIBU data')
        for query, paramslist in tempwaibustorage.items():
            for params in paramslist:
                self.conf.setquery(query)
                NewsStorage.seturlinfos(params)
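
The two "if query not in …: … = []" blocks above implement grouping-by-key; collections.defaultdict(list) expresses the same thing without the membership checks. A sketch of the equivalent idiom (records is a made-up stand-in for the parsed params objects):

    from collections import defaultdict

    records = [('q1', 'url-a'), ('q2', 'url-b'), ('q1', 'url-c')]
    by_query = defaultdict(list)
    for query, url in records:
        # append() on a missing key creates the empty list automatically
        by_query[query].append(url)

    assert by_query['q1'] == ['url-a', 'url-c']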
Example #6
    def dumpurls(self):
        # Dump the url list for this machine's queries and store it in the
        # corresponding file
        s2file = SpiderConfigure.getinstance().gets2file()
        s2temppath = Storage.getstoragelocation(
            const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
        #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
        querys = QueryStorage.getinstance().getlocalquerys(
            s2temppath, ETLController.LOCALMACHINEFLAG)
        for query in querys:
            Logger.getlogging().debug(
                'Now, Starting Select url to Insert and Update for uploading location urlfile!'
            )
            self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
            self.conf.setquery(query)
            # Do not delete the comments in this block
            # 1. Convert data within the period:
            #    1.1 publishdate is set: keep records from the last week
            #    2.1 publishdate is 0: use the creation time, keep records from the last week
            #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
            #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
            #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
            #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
            #key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
            #createdate = SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #starttime = SpiderConfigure.getinstance().starttime(),
            #time0='\"'+TimeUtility.getuniformtime(0)+'\"',
            #time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
            #time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
            #timeformat = '\"'+TimeUtility.SQLTIMEFORMAT+'\"',
            #secs =self.period * 24*60*60
            #)
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG
            }
            Logger.getlogging().debug(
                'Query condition: {where}'.format(where=str(where)))
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urltemplist = []
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
                createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                # Keep the record if it falls within the period: use publishdate
                # when set, otherwise fall back to the creation date
                if (publishdate == TimeUtility.getintformtime(0) and SQLDAO.gettime() - createdate <= self.period * 24*60*60) or \
                   (publishdate != TimeUtility.getintformtime(0) and SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24*60*60):
                    if url not in urltemplist:
                        urltemplist.append(url)
                        params = PageBasicInfo()
                        params.url = url
                        NewsStorage.seturlinfos(params)

            # 2. Extract records whose createdate equals this run's start time
            URLFileManager.getinstance().generateurlfilepath()
            where = {
                SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()
            }
            results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS,
                                                where)
            urllist = []
            linecount = 0
            for result in results:
                data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                          result)
                url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
                urllist.append(url)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[
                    SQLDAO.SPIDER_TABLE_NEWS_TYPE]
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context,
                                                  constant.REQUEST_TYPE_WEBKIT)
                linecount += 1
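
The filter at the heart of dumpurls() keeps a record when its publish date falls within the last period days, falling back to the creation date when publishdate is the zero value. A standalone sketch of that predicate with plain epoch seconds (in_window and ZERO_TIME are hypothetical names, not the project's API):

    import time

    ZERO_TIME = 0  # stand-in for TimeUtility.getintformtime(0)

    def in_window(publish_ts, create_ts, period_days, now=None):
        # Use the publish time when present, the creation time otherwise.
        now = time.time() if now is None else now
        effective = create_ts if publish_ts == ZERO_TIME else publish_ts
        return now - effective <= period_days * 24 * 60 * 60

    print(in_window(ZERO_TIME, time.time() - 3600, 7))  # True: created an hour ago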