def storecmt(url, content, pubdate, user):
    """Store one comment in the comments table unless it is already there.

    ``content`` and ``user`` are passed through ``Common.strfilter`` and
    ``pubdate`` through ``TimeUtility.getuniformtime`` first, so both the
    existence check and the inserted row use the normalized values.

    Args:
        url: URL the comment belongs to.
        content: Comment text.
        pubdate: Publication time of the comment.
        user: Name of the comment author.
    """
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)

    # An identical comment is already recorded: nothing to log or insert.
    if CMTStorage.exist(url, content, pubdate, user):
        return

    Logger.getlogging().debug(
        'url:{url}, content:{content}, pubdate:{pubdate}, user:{user}'.format(
            url=url, content=content, pubdate=pubdate, user=user))

    comment_id = CMTStorage.getid(url, content, pubdate, user)
    row = {
        SQLDAO.SPIDER_TABLE_COMMENTS_ID: comment_id,
        SQLDAO.SPIDER_TABLE_COMMENTS_URL: url,
        SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE: pubdate,
        SQLDAO.SPIDER_TABLE_COMMENTS_USER: user,
        SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT: content,
        # The creation date is the start time reported by SpiderConfigure.
        SQLDAO.SPIDER_TABLE_COMMENTS_CREATE_DATE:
            SpiderConfigure.getinstance().starttime(),
    }
    SQLDAO.getinstance().insert(
        SQLDAO.SPIDER_TABLE_COMMENTS,
        SQLDAO.SPIDER_TABLE_COMMENTS_KEYS,
        SQLDAO.getvaluesfromkeys(row, SQLDAO.SPIDER_TABLE_COMMENTS_KEYS))
def seturlinfos(params):
    """Write the crawled details carried by ``params`` to the news table.

    The row is identified by ``NewsStorage.getid(params.url)``. When
    ``NewsStorage.exist`` reports the URL as known, the row is updated;
    otherwise a complete new row is inserted.

    Attributes read from ``params``: ``url``, ``type``, ``title``, ``body``,
    ``pubtime``, ``cmtnum``, ``clicknum``, ``fansnum``, ``votenum`` and, for
    new rows only, ``query``, ``channel`` and ``createtime``.
    """
    news_id = NewsStorage.getid(params.url)

    if not NewsStorage.exist(params.url):
        # Unknown URL: build the full row, identity and bookkeeping included.
        row = {
            SQLDAO.SPIDER_TABLE_NEWS_TYPE: params.type,
            SQLDAO.SPIDER_TABLE_NEWS_TITLE: Common.strfilter(params.title),
        }
        # The body is skipped when the type is SPIDER_S2_WEBSITE_VIDEO.
        if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
            row[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)
        row.update({
            SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE:
                TimeUtility.getuniformtime(params.pubtime),
            SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: params.cmtnum,
            SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: params.clicknum,
            SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: params.fansnum,
            SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: params.votenum,
            SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
            SQLDAO.SPIDER_TABLE_NEWS_ID: news_id,
            SQLDAO.SPIDER_TABLE_NEWS_URL: params.url,
            SQLDAO.SPIDER_TABLE_NEWS_QUERY: params.query,
            SQLDAO.SPIDER_TABLE_NEWS_CHANNEL: params.channel,
            SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE: params.createtime,
            SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
        })
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(row))
        return

    # Known URL: refresh the mutable columns only. The type column is
    # deliberately left untouched on this path.
    doc = NewsStorage.getdoc(params.url)
    changes = {
        SQLDAO.SPIDER_TABLE_NEWS_TITLE: Common.strfilter(params.title),
    }
    if params.type != constant.SPIDER_S2_WEBSITE_VIDEO:
        changes[SQLDAO.SPIDER_TABLE_NEWS_BODY] = Common.strfilter(params.body)

    # The publish date is only written while the stored value is missing or
    # still equals TimeUtility.getintformtime(0) (presumably the "unset"
    # placeholder - confirm).
    stored_pubdate = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE,
                             TimeUtility.getintformtime(0))
    if stored_pubdate == TimeUtility.getintformtime(0):
        changes[SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE] = (
            TimeUtility.getuniformtime(params.pubtime))

    changes.update({
        SQLDAO.SPIDER_TABLE_NEWS_CMTNUM: params.cmtnum,
        SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM: params.clicknum,
        SQLDAO.SPIDER_TABLE_NEWS_FANSNUM: params.fansnum,
        SQLDAO.SPIDER_TABLE_NEWS_VOTENUM: params.votenum,
        SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
    })
    SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                changes)
def storetiebaquery(self, query, queryurl,
                    machineflaglist=MACHINEFLAGLIST_TIEBA):
    """Record a tieba query, refreshing it if it is already stored.

    If the query is found among ``machineflaglist`` its update date is
    refreshed and it is marked valid. Otherwise the query is stored on the
    machine that currently holds the fewest queries according to
    ``self.querystorage_tieba``.

    Args:
        query: Query text; surrounding whitespace is stripped.
        queryurl: URL associated with the query; also stripped.
        machineflaglist: Machine flags to search for an existing entry.
    """
    query = query.strip()
    queryurl = queryurl.strip()
    found = QueryStorage.find(query, machineflaglist,
                              table=SQLDAO.SPIDER_TABLE_QUERYS_TIEBA)

    if not found:
        # Not stored yet: pick the machine with the smallest query count.
        machine = min(self.querystorage_tieba.iteritems(),
                      key=lambda entry: entry[1])[0]
        row = {
            SQLDAO.SPIDER_TABLE_QUERYS_ID: QueryStorage.getid(query, machine),
            SQLDAO.SPIDER_TABLE_QUERYS_QUERY: query,
            SQLDAO.SPIDER_TABLE_QUERYS_CREATEDATE:
                SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG: machine,
            SQLDAO.SPIDER_TABLE_QUERYS_QUERYURL: queryurl,
            SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1,
        }
        SQLDAO.getinstance().insert(
            SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
            SQLDAO.SPIDER_TABLE_QUERYS_KEYS,
            SQLDAO.getvaluesfromkeys(row, SQLDAO.SPIDER_TABLE_QUERYS_KEYS))
        # Keep the live per-machine query count in step with what was stored.
        # NOTE(review): assumed to apply to newly inserted queries only -
        # confirm it is not also meant to run after the refresh path below.
        self.querystorage_tieba[machine] = (
            self.querystorage_tieba.get(machine, 0) + 1)
        return

    # Already stored: refresh the update date on the machine that owns it.
    stored = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_QUERYS_KEYS, found)
    machine = stored[SQLDAO.SPIDER_TABLE_QUERYS_MACHINEFLAG]
    query_id = QueryStorage.getid(query, machine)
    SQLDAO.getinstance().update(
        SQLDAO.SPIDER_TABLE_QUERYS_TIEBA,
        {SQLDAO.SPIDER_TABLE_QUERYS_ID: query_id},
        {
            SQLDAO.SPIDER_TABLE_QUERYS_UPDATEDATE:
                SpiderConfigure.getinstance().starttime(),
            SQLDAO.SPIDER_TABLE_QUERYS_VALID: 1,
        })
def storeurl(url):
    """Insert a bare news row for ``url`` if the URL is not stored yet.

    Only identity and bookkeeping columns are filled in: id, url, the
    query and channel reported by ``SpiderConfigure``, the local machine
    flag, and the create/update dates. Nothing happens for a known URL.
    """
    news_id = NewsStorage.getid(url)
    if NewsStorage.exist(url):
        return

    row = {
        SQLDAO.SPIDER_TABLE_NEWS_ID: news_id,
        SQLDAO.SPIDER_TABLE_NEWS_URL: url,
        SQLDAO.SPIDER_TABLE_NEWS_QUERY:
            SpiderConfigure.getinstance().getquery(),
        SQLDAO.SPIDER_TABLE_NEWS_CHANNEL:
            SpiderConfigure.getinstance().getchannel(),
        SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG: NewsStorage.LOCALMACHINEFLAG,
        SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE:
            SpiderConfigure.getinstance().starttime(),
        SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
    }
    SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                SQLDAO.getvaluesfromkeys(row))
def seturlinfo(url, key=None, value=None, data=None):
    """Set one column, or a prepared set of columns, on the news row of ``url``.

    Args:
        url: URL identifying the news row (via ``NewsStorage.getid``).
        key: Column name to set when ``data`` is empty.
        value: New value for ``key``. When ``key`` is the publish-date
            column the value is normalized with
            ``TimeUtility.getuniformtime`` first.
        data: Optional dict of column -> value. When non-empty it is
            written as-is with a single update and ``key``/``value`` are
            ignored. On this path no existence check is made and the
            update-date column is not touched.

    The default for ``data`` is ``None`` rather than a shared ``{}`` so no
    mutable object is reused between calls; an empty or missing ``data``
    behaves exactly as before.
    """
    news_id = NewsStorage.getid(url)

    # Bulk path: the caller supplied the columns to write.
    if data:
        SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                    {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                    data)
        return

    if SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE == key:
        value = TimeUtility.getuniformtime(value)

    if NewsStorage.exist(url):
        # Known URL: write only when the stored value actually differs.
        doc = NewsStorage.getdoc(url)
        if doc.get(key, '') != value:
            changes = {
                key: value,
                SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE: SQLDAO.gettime(),
            }
            SQLDAO.getinstance().update(SQLDAO.SPIDER_TABLE_NEWS,
                                        {SQLDAO.SPIDER_TABLE_NEWS_ID: news_id},
                                        changes)
    else:
        # Unknown URL: create the row with bookkeeping columns plus key/value.
        row = {}
        row[SQLDAO.SPIDER_TABLE_NEWS_ID] = news_id
        row[SQLDAO.SPIDER_TABLE_NEWS_URL] = url
        row[SQLDAO.SPIDER_TABLE_NEWS_QUERY] = (
            SpiderConfigure.getinstance().getquery())
        row[SQLDAO.SPIDER_TABLE_NEWS_CHANNEL] = (
            SpiderConfigure.getinstance().getchannel())
        row[SQLDAO.SPIDER_TABLE_NEWS_MACHINEFLAG] = NewsStorage.LOCALMACHINEFLAG
        row[SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE] = (
            SpiderConfigure.getinstance().starttime())
        row[key] = value
        row[SQLDAO.SPIDER_TABLE_NEWS_UPDATE_DATE] = SQLDAO.gettime()
        SQLDAO.getinstance().insert(SQLDAO.SPIDER_TABLE_NEWS,
                                    SQLDAO.SPIDER_TABLE_NEWS_KEYS,
                                    SQLDAO.getvaluesfromkeys(row))