def getid(url, content, pubdate, user):
    """Build a stable md5 identifier for one comment record.

    The content and user strings are filtered and the publish date is
    normalized before hashing, so equivalent records hash identically.
    """
    filtered_content = Common.strfilter(content)
    filtered_user = Common.strfilter(user)
    uniform_date = TimeUtility.getuniformtime(pubdate)
    raw = ''.join([Common.urlenc(url),
                   Common.urlenc(filtered_content),
                   uniform_date,
                   Common.urlenc(filtered_user)])
    return Common.md5(raw)
def getfilename(self, url):
    """Compose the cache-file path for *url* from the current spider context."""
    config = SpiderConfigure.getinstance()
    # channel / S2 query / S2 page type of the running job
    self.channel = config.getchannel()
    self.query = config.getquery()
    self.type = config.gettype()
    # S2 queries can contain arbitrary text, so hash them for the path
    if self.channel == SPIDER_CHANNEL_S2:
        query_part = Common.md5(self.query)
    else:
        query_part = self.query
    return Storage.SPIDER_STORE_FILENAME_FORMAT.format(
        path=self.cache_path,
        date=TimeUtility.getcurrentdate(),
        channel=self.channel,
        query=query_part,
        filename=Common.md5(url))
def getid(url):
    """Derive a news id from machine flag, current query, url and start time."""
    # url and query are url-encoded so the separator '_' stays unambiguous
    raw = '{machine}_{query}_{url}_{starttime}'.format(
        machine=NewsStorage.LOCALMACHINEFLAG,
        query=Common.urlenc(SpiderConfigure.getinstance().getquery()),
        url=Common.urlenc(url),
        starttime=SpiderConfigure.getinstance().starttime())
    return Common.md5(raw)
def aggregate_curcmtnum(self):
    """Count, per url, the comments not yet pushed (key1 IS NULL)."""
    sqlf = 'SELECT {url},count(*) from {table} where {key1} is null group by {url}'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                      url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                      key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
    for row in SQLDAO.getinstance().execute(sql, find=True):
        urlmd5 = Common.md5(row[0].strip())
        # keep only the first count seen for a given url
        if urlmd5 not in self.url_curcmtnum_map:
            self.url_curcmtnum_map[urlmd5] = int(row[1])
def storeurl(self, url, urlcontext, request=constant.REQUEST_TYPE_COMMON):
    """Append *url* to the current url file and remember its crawl context.

    Blank urls are ignored; the url file is rolled over when full.
    """
    stripped = url.strip()
    if not stripped:
        return
    manager = URLFileManager.getinstance()
    urlfile = manager.geturlfilepath(request)
    # roll over to a fresh url file when the current one would overflow
    if FileUtility.geturlfilelines(urlfile) + 1 > URLFileManager.URL_FILE_LINES_MAX_NUMBER:
        manager.generateurlfilepath()
        urlfile = manager.geturlfilepath(request)
    FileUtility.writeline(urlfile, url)
    key = Common.md5(stripped)
    self.urlcontextdict.setdefault(key, []).append(urlcontext)
def aggregate_beforenewsnum(self):
    """Count, per url, how many news rows were already pushed before this run.

    key1 = 1 marks a news id as already pushed; NULL means not pushed yet.
    """
    sqlf = 'SELECT {url},count(*) from {table} where {key1}=1 group by {url}'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                      url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                      key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    for row in SQLDAO.getinstance().execute(sql, find=True):
        urlmd5 = Common.md5(row[0].strip())
        # keep only the first count seen for a given url
        if urlmd5 not in self.url_beforenewsnum_map:
            self.url_beforenewsnum_map[urlmd5] = int(row[1])
def s2queryurl(query, website, url, onlywrite=False):
    """Record one uploaded S2 url under its site and append it to the s2 url file.

    With onlywrite=True only the file line is written; the in-memory
    site map and counters are left untouched.
    """
    sitename = str(website)
    # keep only the last dotted component, e.g. 'news.site' -> 'site'
    if '.' in sitename:
        sitename = sitename[sitename.rindex('.') + 1:]
    if not onlywrite:
        SpiderReport.removequerysite(query, sitename)
        SpiderReport.getinstance().s2urlsitemap[Common.md5(url.strip())] = sitename
        SpiderReport.updates2site(query, sitename, SpiderReport.URL_UPLOAD, 1)
    line = SpiderReport.S2URL_FORMAT.format(query=query,
                                            website=sitename,
                                            url=url)
    FileUtility.writeline(SpiderReport.getinstance().s2urlfilepath, line)
def __getcontent(self, url, method):
    """Fetch *url* (with a bsddb-backed cache) and return its content as unicode.

    On a cache hit the stored url-encoded utf-8 text is decoded and returned.
    On a miss the page is downloaded according to *method* (POST expects *url*
    to be a JSON object with 'url' and 'data'; IMG hex-encodes the bytes),
    decoded via the detected charset, remembered in self.urlmap, and written
    back to the cache only when larger than 2000 utf-8 bytes.
    Returns None when the download yields no content.

    Fix: the original left the first bsddb handle open for the whole download
    and opened a second handle on top of it for the write-back (handle leak /
    double open). The handle is now always closed via try/finally, and the
    dead 'line' dict feeding a commented-out write was removed.
    """
    key = Common.md5(url)
    database = bsddb.btopen(self.file, 'c')
    try:
        if database.has_key(key):
            # cache hit: stored value is url-encoded utf-8 text
            return Common.urldec(database[key]).decode(CHARSET_DEFAULT)
    finally:
        database.close()
    # cache miss: download according to the request method
    if method == constant.REQUEST_TYPE_POST:
        js = json.loads(url)
        content = HttpUtility().post(js['url'], js['data'])
    elif method == constant.REQUEST_TYPE_WEBKIT:
        content = HttpUtility().wget(url)
    elif method == constant.REQUEST_TYPE_IMG:
        content = binascii.b2a_hex(HttpUtility().get(url))
    else:
        content = HttpUtility().get(url)
    if content is None:
        return None
    charset = RegexUtility().getid('charset', content)
    unic = Common.trydecode(content, charset)
    utf8str = unic.encode(CHARSET_UTF8)
    self.urlmap[key] = unic
    # persist only sizeable pages to keep the cache database small
    if len(utf8str) > 2000:
        database = bsddb.btopen(self.file, 'c')
        try:
            database[key] = Common.urlenc(utf8str)
        finally:
            database.close()
    return utf8str.decode(CHARSET_UTF8)
def aggregate_curcomments(self):
    """Collect, per url, the not-yet-pushed comments as 'content_publishtime' strings."""
    sqlf = 'SELECT {url},{content},{publish} from {table} where {key1} is null'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                      url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                      content=SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT,
                      publish=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
                      key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
    for row in SQLDAO.getinstance().execute(sql, find=True):
        urlmd5 = Common.md5(row[0])
        filtered = self.strfilter(row[1])
        publish = TimeUtility.getinttime(row[2])
        entry = filtered + '_' + str(int(publish))
        self.url_curcmtcontent_map.setdefault(urlmd5, []).append(entry)
def generateurlfilepath(self, retrytimes=0):
    """Create a fresh url-file path for the current channel/query and register it."""
    context = URLFileContext()
    config = SpiderConfigure.getinstance()
    context.channel = config.getchannel()
    context.query = config.getquery()
    context.retry = retrytimes
    # an identical timestamp would produce the same file name:
    # wait one second before taking a new timestamp
    if self.urlfiletimestamp == int(time.time()):
        time.sleep(1)
    self.urlfiletimestamp = int(time.time())
    self.urlsfile = URLFileManager.URLS_FILE_PATTERN.format(
        path=self.tempurldir,
        channel=context.channel,
        query=Common.md5(context.query),
        ts=self.urlfiletimestamp)
    context.filename = self.urlsfile
    self.urlsfilemap[FileUtility.getfilename(self.urlsfile)] = context
    Logger.getlogging().info(self.urlsfile)
    return self.urlsfile
def update(channelorquery, type, key, delta, url=None):
    """Accumulate *delta* into the per-channel/query report, the S2 per-site
    report (when the url's site is known) and the grand total."""
    inst = SpiderReport.getinstance()
    if channelorquery not in inst.reportlist:
        inst.reportlist[channelorquery] = {}
    reports = inst.reportlist[channelorquery]
    if type not in reports:
        reports[type] = Report()
    report = reports[type]
    # anything that is not the S1 channel is an S2 query
    if channelorquery != constant.SPIDER_CHANNEL_S1:
        report.channel = constant.SPIDER_CHANNEL_S2
        report.query = channelorquery
    report.type = type
    report.values[key] += delta
    # propagate to the S2 per-site report when this url's site was recorded
    if channelorquery != constant.SPIDER_CHANNEL_S1 and url is not None:
        urlmd5 = Common.md5(url.strip())
        website = inst.s2urlsitemap.get(urlmd5)
        if website is not None:
            SpiderReport.updates2site(channelorquery, website, key, delta)
    # update the overall total
    inst.totalreport.values[key] += delta
def getid(query, machine):
    """Hash the url-encoded query together with the machine flag into an id."""
    encoded_query = Common.urlenc(query)
    return Common.md5(encoded_query + machine)
def exist(self, url):
    """Return True when at least one stored context remains for *url*."""
    # missing key and empty list both count as "does not exist"
    return bool(self.urlcontextdict.get(Common.md5(url)))
def seturlcontext(self, url, urlcontext):
    """Append *urlcontext* to the context list kept for *url* (keyed by md5)."""
    key = Common.md5(url.strip())
    self.urlcontextdict.setdefault(key, []).append(urlcontext)
def geturlcontext(self, url):
    """Pop and return the most recently stored context for *url*.

    Returns None when the url has no remaining context. Fix: the original
    indexed self.urlcontextdict directly and raised KeyError for a url that
    was never stored, while an empty list quietly returned None — the two
    "nothing stored" cases now behave the same.
    """
    contexts = self.urlcontextdict.get(Common.md5(url))
    if contexts:
        return contexts.pop()
    return None
def fileformat(self):
    """Aggregate news and comment data, then write one formatted line per
    not-yet-pushed news row.

    Rows without a title go to the error file, the rest to the output file;
    every processed id/url is remembered so its pushed flag (key1) can be
    set to 1 afterwards. A failure on one row is logged and skipped.
    """
    # pre-aggregate all per-url information keyed by url md5
    self.aggregate_beforenewsinfo()
    self.aggregate_beforenewsnum()
    self.aggregate_curcomments()
    self.aggregate_curcmtnum()
    self.aggregate_beforecmtsnum()
    self.dereplicate()
    urllist = []
    idlist = []
    # only news rows not yet pushed (key1 is null)
    newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
    for result in results:
        doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
        url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
        try:
            urlmd5 = Common.md5(url)
            channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
            title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
            body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
            commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
            comments = ' '.join(commentlist)
            pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
            crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
            type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
            query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
            # Incremental comment-count push:
            #  - first push (no comment rows flagged key1=1 yet): push the full
            #    count — prefer cmtnum from the news row when >= 0, otherwise
            #    the count aggregated into url_curcmtnum_map
            #  - later pushes (some comment rows already flagged key1=1): push
            #    only the increment aggregated into url_curcmtnum_map
            cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
            if cmtkey1flag <= 0:
                cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                if cmtnum < 0:
                    cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            else:
                cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            # other counters are pushed as increments over the last pushed value
            clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
            clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
            votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
            votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
            fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
            fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
            string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                           url=url,
                                                           title=self.strfilter(title),
                                                           body=self.strfilter(body),
                                                           comments=comments,
                                                           cmtnum=cmtnum,
                                                           clicknum=clicknum,
                                                           votenum=votenum,
                                                           fansnum=fansnum,
                                                           pubtime=TimeUtility.getinttime(pubtime),
                                                           crawlertime=crawlertime,
                                                           type=type,
                                                           query=self.strfilter(query))
            Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
            # rows lacking a title are considered bad and go to the error file
            if not title:
                FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8))
            else:
                FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))
            if id not in idlist:
                idlist.append(id)
            # only urls with a title and freshly pushed comments get their
            # comment rows flagged below
            if title and commentlist:
                if url not in urllist:
                    urllist.append(url)
        except:
            Logger.getlogging().error(str(result))
            Logger.printexception()
    # everything just extracted is marked as pushed (key1 set to 1)
    self.updatenewsflag(idlist)
    self.updatecommentsflag(urllist)
def aggregate_beforenewsinfo(self):
    """Pick, per url, the latest valid counter values among already-pushed rows.

    Strategy: keep every positive value keyed by its crawl time, then for
    each url take the value stored under the greatest crawl-time key.
    """
    sqlf = 'SELECT {url},{createtime},{cmtnum}, {clicknum},{votenum},{fansnum} from {table} where {key1}=1'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                      url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                      createtime=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
                      cmtnum=SQLDAO.SPIDER_TABLE_NEWS_CMTNUM,
                      clicknum=SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
                      votenum=SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
                      fansnum=SQLDAO.SPIDER_TABLE_NEWS_FANSNUM,
                      key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    rows = SQLDAO.getinstance().execute(sql, find=True)
    # result columns 2..5 correspond to these four counter fields, in order
    fields = (SQLDAO.SPIDER_TABLE_NEWS_CMTNUM,
              SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
              SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
              SQLDAO.SPIDER_TABLE_NEWS_FANSNUM)
    history = dict((field, {}) for field in fields)
    for row in rows:
        urlmd5 = Common.md5(row[0].strip())
        timekey = str(row[1])
        for offset, field in enumerate(fields):
            perurl = history[field].setdefault(urlmd5, {})
            value = row[offset + 2]
            # remember only valid (>0) values, keyed by crawl time
            if value > 0 and perurl.get(timekey, 0) <= value:
                perurl[timekey] = value
    for field in fields:
        for urlmd5, timed in history[field].iteritems():
            if not timed:
                continue
            # the greatest crawl-time key (string comparison, as before) wins
            self.url_beforenewsinfo_map[field][urlmd5] = timed[max(timed)]