def storeurl(self, url, originalurl, step, others={}):
    urlparam = URLContext()
    urlparam.url = url
    urlparam.originalurl = originalurl
    urlparam.step = step
    urlparam.type = URLContext.S1_MAIN_BODY
    urlparam.customized = others
    URLManager.getinstance().storeurl(url, urlparam, constant.REQUEST_TYPE_IMG)
def storeurl(self, url, originalurl, step, others={}):
    urlparam = URLContext()
    urlparam.url = url
    urlparam.originalurl = originalurl
    urlparam.step = step
    urlparam.type = URLContext.S1_COMMENTS
    urlparam.customized = others
    URLManager.getinstance().storeurl(url, urlparam)
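# Usage sketch (illustrative only, not code from this repo): a site template
# that has parsed a news page would typically queue the matching comments api
# with one of the storeurl helpers above, e.g.
#
#   COMMENTS_URL = 'http://comment.example.com/api?sid={sid}&page={page}'   # hypothetical pattern
#   def process(self, params):
#       commenturl = COMMENTS_URL.format(sid='123', page=1)
#       self.storeurl(commenturl, params.originalurl, self.STEP_COMMENT_FIRST_PAGE)   # step name assumed
#
# URLManager later hands the stored URLContext back, so step/customized can
# drive the next parsing stage for that url.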
def storeposturl(self, url, originalurl, step, data, others={}):
    urlcontext = URLContext()
    # Pack the target url and the urlencoded form payload into one JSON string
    # so the downloader can replay the POST request later.
    urlcontext.url = json.dumps({'url': url, 'data': urllib.urlencode(data)})
    urlcontext.originalurl = originalurl
    urlcontext.step = step
    urlcontext.type = URLContext.S1_COMMENTS
    urlcontext.customized = others
    URLManager.getinstance().storeurl(urlcontext.url, urlcontext, constant.REQUEST_TYPE_POST)
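# A minimal sketch of what storeposturl stores (assumed example values, not
# from this repo): the queued "url" is a JSON string carrying both the target
# url and the already-urlencoded form body.
#
#   data = {'topic_id': '42', 'pageno': '2'}                        # hypothetical payload
#   self.storeposturl('http://bbs.example.com/api/comments',        # hypothetical url
#                     params.originalurl, self.STEP_NEXT, data)     # step name assumed
#   # urlcontext.url == '{"url": "http://bbs.example.com/api/comments", "data": "topic_id=42&pageno=2"}'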
def __storeqeuryurl__(self, url, step, data, customized=None):
    # Use None instead of a mutable default: the dict is modified below, and a
    # shared default would leak the S2 channel flag across calls.
    if customized is None:
        customized = {}
    customized[constant.SPIDER_CHANNEL] = constant.SPIDER_CHANNEL_S2
    urlcontext = URLContext()
    urlcontext.url = json.dumps({'url': url, 'data': urllib.urlencode(data)})
    urlcontext.originalurl = self.fakeoriginalurl
    urlcontext.step = step
    urlcontext.type = URLContext.S2_QUERY
    urlcontext.customized = customized
    URLManager.getinstance().storeurl(urlcontext.url, urlcontext, constant.REQUEST_TYPE_POST)
def s1upload(self, sfile):
    if FileUtility.exists(sfile):
        lines = FileUtility.readlines(sfile)
        self.conf.setchannel(SPIDER_CHANNEL_S1)
        self.conf.setquery('')
        URLFileManager.getinstance().generateurlfilepath()
        for line in lines:
            try:
                url = line.strip()
                params = PageBasicInfo()
                params.url = url
                #NewsStorage.seturlinfos(params)
                context = URLContext()
                context.originalurl = url
                context.type = URLContext.S1_MAIN_BODY
                Logger.getlogging().debug(url)
                URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            except:
                Logger.printexception()
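# Usage sketch for s1upload (assumption: sfile is a plain-text url list, one
# url per line, as implied by readlines()/strip() above; "etl" is a placeholder
# for whatever object owns these methods):
#
#   # urls.txt
#   #   http://news.example.com/a/2018-01-01/001.html
#   #   http://news.example.com/a/2018-01-02/002.html
#   etl.s1upload('urls.txt')
#
# Every line is queued as an S1_MAIN_BODY URLContext and fetched with
# REQUEST_TYPE_WEBKIT; a malformed line only logs an exception and is skipped.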
def __storeqeuryurllist__(self, urllist, step, customized=None):
    # Avoid a mutable default argument: the dict is modified below and shared
    # by every URLContext queued in this call.
    if customized is None:
        customized = {}
    customized[constant.SPIDER_CHANNEL] = constant.SPIDER_CHANNEL_S2
    for url in urllist:
        urlcontext = URLContext()
        urlcontext.url = url
        urlcontext.originalurl = self.fakeoriginalurl
        urlcontext.type = URLContext.S2_QUERY
        urlcontext.step = step
        urlcontext.customized = customized
        URLManager.getinstance().storeurl(url, urlcontext)
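# Usage sketch for the S2 query helpers (illustrative values only): a search
# template would queue every result page of a query, and the SPIDER_CHANNEL_S2
# flag set above tells downstream stages the url came from a query search.
#
#   urls = ['http://so.example.com/s?q=keyword&page=%d' % p for p in range(1, 4)]   # hypothetical
#   self.__storeqeuryurllist__(urls, self.STEP_SEARCH_RESULT)                       # step name assumed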
def storeurl(self, url, originalurl, step, customized={}):
    urlparam = URLContext()
    urlparam.url = url
    urlparam.originalurl = originalurl
    urlparam.step = step
    urlparam.customized = customized
    URLManager.getinstance().storeurl(url, urlparam)
def dumpurls(self):
    # Dump the url list for the queries assigned to this machine and store it
    # into the corresponding file.
    s2file = SpiderConfigure.getinstance().gets2file()
    s2temppath = Storage.getstoragelocation(const.SPIDER_QUERY_TEMP_PATH) + FileUtility.getfilename(s2file)
    #querys = [''] + QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    querys = QueryStorage.getinstance().getlocalquerys(s2temppath, ETLController.LOCALMACHINEFLAG)
    for query in querys:
        Logger.getlogging().debug('Now selecting urls to insert/update for the local url file!')
        self.conf.setchannel(constant.SPIDER_CHANNEL_S2)
        self.conf.setquery(query)
        # Do not delete the commented query below.
        # 1. Convert the data inside the period:
        #    1.1 publishdate exists: keep records from the last week
        #    1.2 publishdate is 0: fall back to the create time, last week
        #wheref = '{key1}={val1} and {key2}={val2} and {createdate}!={starttime} and \
        #(({time1}!={time0} and TIMESTAMPDIFF(SECOND, now(), {time1}) <= {secs}) or \
        #({time1}={time0} and TIMESTAMPDIFF(SECOND, now(), FROM_UNIXTIME({time2}, {timeformat})) <= {secs}))'
        #where = wheref.format(key1=SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG, val1=ETLController.LOCALMACHINEFLAG,
        #                      key2=SQLDAO.SPIDER_TABLE_NEWS_QUERY, val2='\"'+query+'\"',
        #                      createdate=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #                      starttime=SpiderConfigure.getinstance().starttime(),
        #                      time0='\"'+TimeUtility.getuniformtime(0)+'\"',
        #                      time1=SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
        #                      time2=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
        #                      timeformat='\"'+TimeUtility.SQLTIMEFORMAT+'\"',
        #                      secs=self.period * 24 * 60 * 60)
        where = {SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                 SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG}
        Logger.getlogging().debug('Query condition: {where}'.format(where=str(where)))
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urltemplist = []
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            publishdate = data[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE]
            createdate = data[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE]
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            # Keep records whose publish time (or create time when the publish
            # time is unset) falls inside the configured period.
            if (publishdate == TimeUtility.getintformtime(0) and
                SQLDAO.gettime() - createdate <= self.period * 24 * 60 * 60) or \
               (publishdate != TimeUtility.getintformtime(0) and
                SQLDAO.gettime() - TimeUtility.getinttime(publishdate) <= self.period * 24 * 60 * 60):
                if url not in urltemplist:
                    urltemplist.append(url)
                    params = PageBasicInfo()
                    params.url = url
                    NewsStorage.seturlinfos(params)
        # 2. Extract the records whose createdate equals this run's start time.
        URLFileManager.getinstance().generateurlfilepath()
        where = {SQLDAO.SPIDER_TABLE_NEWS_QUERY: query,
                 SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: ETLController.LOCALMACHINEFLAG,
                 SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: SpiderConfigure.getinstance().starttime()}
        results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where)
        urllist = []
        linecount = 0
        for result in results:
            data = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
            url = data[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
            urllist.append(url)
            context = URLContext()
            context.originalurl = url
            context.type = URLContext.S1_MAIN_BODY
            context.customized[constant.SPIDER_S2_WEBSITE_TYPE] = data[SQLDAO.SPIDER_TABLE_NEWS_TYPE]
            Logger.getlogging().debug(url)
            URLManager.getinstance().storeurl(url, context, REQUEST_TYPE_WEBKIT)
            linecount += 1
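# A minimal standalone sketch of the selection rule used in step 1 of dumpurls
# (names are illustrative, not part of this repo): a record is kept when its
# publish time lies inside the period, falling back to the create time when the
# publish time is unset (equal to TimeUtility.getintformtime(0)).
#
#   def inperiod(publishdate, createdate, now, period_days, unset):
#       seconds = period_days * 24 * 60 * 60
#       if publishdate == unset:
#           return now - createdate <= seconds
#       return now - TimeUtility.getinttime(publishdate) <= seconds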