def getid(url, content, pubdate, user):
    # Normalize the inputs, then hash the concatenation of the
    # percent-encoded fields into a stable comment id.
    content = Common.strfilter(content)
    user = Common.strfilter(user)
    pubdate = TimeUtility.getuniformtime(pubdate)
    return Common.md5(
        Common.urlenc(url) + Common.urlenc(content) + pubdate +
        Common.urlenc(user))
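Every snippet on this page leans on a handful of project helpers. A minimal, self-contained sketch of what this example assumes them to do; md5 and urlenc below are hypothetical stand-ins for Common.md5 and Common.urlenc, not the project's code:

import hashlib
import urllib.parse

def md5(text):
    # Hex digest of the UTF-8 bytes of the input.
    if isinstance(text, str):
        text = text.encode('utf-8')
    return hashlib.md5(text).hexdigest()

def urlenc(text):
    # Percent-encode arbitrary text (non-ASCII, separators) into plain ASCII.
    return urllib.parse.quote(text, safe='')

# A comment id built the same way as above, with a pre-normalized date.
print(md5(urlenc('http://example.com/news/1') + urlenc('nice post') +
          '2024-01-01 00:00:00' + urlenc('alice')))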
Example #2
def getfilename(self, url):
    # Channel
    self.channel = SpiderConfigure.getinstance().getchannel()
    # S2 query string
    self.query = SpiderConfigure.getinstance().getquery()
    # S2 page type
    self.type = SpiderConfigure.getinstance().gettype()
    # For S2, hash the free-text query so it is safe to use in a path.
    if self.channel == SPIDER_CHANNEL_S2:
        q = Common.md5(self.query)
    else:
        q = self.query
    return Storage.SPIDER_STORE_FILENAME_FORMAT.format(
        path=self.cache_path,
        date=TimeUtility.getcurrentdate(),
        channel=self.channel,
        query=q,
        filename=Common.md5(url))
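Hashing the S2 query keeps free text (spaces, slashes, CJK) out of the file name, and the url md5 yields a fixed-length name. A sketch under an assumed layout; the real Storage.SPIDER_STORE_FILENAME_FORMAT template is project-specific:

import hashlib

def md5(s):
    return hashlib.md5(s.encode('utf-8')).hexdigest()

# Hypothetical template; the real SPIDER_STORE_FILENAME_FORMAT may differ.
SPIDER_STORE_FILENAME_FORMAT = '{path}/{date}_{channel}_{query}_{filename}'

print(SPIDER_STORE_FILENAME_FORMAT.format(
    path='/data/cache',
    date='2024-01-01',
    channel='202',
    query=md5(u'site:example.com spider'),  # free text made path-safe
    filename=md5('http://example.com/news/1')))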
def getid(url):
    idformat = '{machine}_{query}_{url}_{starttime}'
    idstr = idformat.format(
        machine=NewsStorage.LOCALMACHINEFLAG,
        query=Common.urlenc(SpiderConfigure.getinstance().getquery()),
        url=Common.urlenc(url),
        starttime=SpiderConfigure.getinstance().starttime())
    return Common.md5(idstr)
Example #4
def aggregate_curcmtnum(self):
    # Count, per url, the comments that have not been pushed yet
    # (key1 is null means "not pushed").
    sqlf = 'SELECT {url},count(*) from {table} where {key1} is null group by {url}'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                      url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                      key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
    results = SQLDAO.getinstance().execute(sql, find=True)
    for result in results:
        key = Common.md5(result[0].strip())
        if key not in self.url_curcmtnum_map:
            self.url_curcmtnum_map[key] = int(result[1])
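The same is-null / GROUP BY aggregation end to end against an in-memory sqlite3 database; the schema below is an illustrative stand-in for the project's comments table:

import hashlib
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE comments (url TEXT, content TEXT, key1 INTEGER)')
conn.executemany('INSERT INTO comments VALUES (?, ?, ?)', [
    ('http://a.example/1', 'first', None),   # not pushed yet
    ('http://a.example/1', 'second', None),  # not pushed yet
    ('http://a.example/1', 'older', 1),      # already pushed
])

url_curcmtnum_map = {}
sql = 'SELECT url, count(*) FROM comments WHERE key1 IS NULL GROUP BY url'
for url, num in conn.execute(sql):
    key = hashlib.md5(url.strip().encode('utf-8')).hexdigest()
    url_curcmtnum_map.setdefault(key, int(num))

print(url_curcmtnum_map)  # one entry: md5 of the url -> 2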
def storeurl(self, url, urlcontext, request=constant.REQUEST_TYPE_COMMON):
    if url.strip():
        urlfile = URLFileManager.getinstance().geturlfilepath(request)
        # Rotate to a fresh url file once the current one is full.
        if FileUtility.geturlfilelines(
                urlfile) + 1 > URLFileManager.URL_FILE_LINES_MAX_NUMBER:
            URLFileManager.getinstance().generateurlfilepath()
            urlfile = URLFileManager.getinstance().geturlfilepath(request)
        FileUtility.writeline(urlfile, url)
        # Queue the context under the url's md5 for later retrieval.
        key = Common.md5(url.strip())
        if key not in self.urlcontextdict:
            self.urlcontextdict[key] = []
        self.urlcontextdict[key].append(urlcontext)
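The rotation rule in isolation: once writing one more line would push the current url file past the limit, switch to a fresh file. The file names and the limit below are illustrative:

import os

URL_FILE_LINES_MAX_NUMBER = 3  # illustrative; the real limit is configured

def geturlfilelines(path):
    # Count existing lines; a missing file counts as empty.
    if not os.path.exists(path):
        return 0
    with open(path) as f:
        return sum(1 for _ in f)

urlfile = 'urls_0.txt'
for url in ['http://a/1', 'http://a/2', 'http://a/3', 'http://a/4']:
    if geturlfilelines(urlfile) + 1 > URL_FILE_LINES_MAX_NUMBER:
        urlfile = 'urls_1.txt'  # rotate: the fourth url starts a new file
    with open(urlfile, 'a') as f:
        f.write(url + '\n')
print(geturlfilelines('urls_0.txt'), geturlfilelines('urls_1.txt'))  # 3 1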
Example #6
def aggregate_beforenewsnum(self):
    # Count, per url, how many times it has already been pushed.
    # key1=1 marks a url's news id as pushed; null means not pushed yet.
    sqlf = 'SELECT {url},count(*) from {table} where {key1}=1 group by {url}'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                      url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                      key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().execute(sql, find=True)
    for result in results:
        key = Common.md5(result[0].strip())
        if key not in self.url_beforenewsnum_map:
            self.url_beforenewsnum_map[key] = int(result[1])
def s2queryurl(query, website, url, onlywrite=False):
    # The site label is the segment after the last '.' in the website name.
    sitename = str(website)
    if '.' in sitename:
        sitename = sitename[sitename.rindex('.') + 1:]
    if not onlywrite:
        SpiderReport.removequerysite(query, sitename)
        SpiderReport.getinstance().s2urlsitemap[Common.md5(url.strip())] = sitename
        SpiderReport.updates2site(query, sitename, SpiderReport.URL_UPLOAD, 1)
    FileUtility.writeline(SpiderReport.getinstance().s2urlfilepath,
                          SpiderReport.S2URL_FORMAT.format(
                              query=query,
                              website=sitename,
                              url=url))
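In isolation, the rindex slice keeps only the segment after the last dot, so a module-style website identifier collapses to a short site label (the value below is made up):

website = 'spider.site.weibo'  # illustrative identifier
sitename = str(website)
if '.' in sitename:
    sitename = sitename[sitename.rindex('.') + 1:]
print(sitename)  # -> weibo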
def __getcontent(self, url, method):
    database = bsddb.btopen(self.file, 'c')
    # Cache hit: return the stored page without touching the network.
    if database.has_key(Common.md5(url)):
        content = Common.urldec(
            database[Common.md5(url)]).decode(CHARSET_DEFAULT)
        database.close()
        return content
    if method == constant.REQUEST_TYPE_POST:
        js = json.loads(url)
        content = HttpUtility().post(js['url'], js['data'])
    elif method == constant.REQUEST_TYPE_WEBKIT:
        content = HttpUtility().wget(url)
    elif method == constant.REQUEST_TYPE_IMG:
        content = HttpUtility().get(url)
        content = binascii.b2a_hex(content)
    else:
        content = HttpUtility().get(url)
    if content is None:
        database.close()
        return None
    # Detect the page charset, decode, and keep a UTF-8 copy.
    charset = RegexUtility().getid('charset', content)
    unic = Common.trydecode(content, charset)
    utf8str = unic.encode(CHARSET_UTF8)
    self.urlmap[Common.md5(url)] = unic
    # Cache only substantial pages; reuse the open handle instead of
    # reopening the database, and always close it.
    if len(utf8str) > 2000:
        database[Common.md5(url)] = Common.urlenc(utf8str)
    database.close()
    return utf8str.decode(CHARSET_UTF8)
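The same read-through cache pattern in a self-contained form, using the standard dbm module in place of the Python 2-only bsddb, and a stub fetch callable standing in for HttpUtility:

import dbm
import hashlib

def md5key(url):
    # md5 hex digest as bytes, usable as a dbm key.
    return hashlib.md5(url.encode('utf-8')).hexdigest().encode('ascii')

def getcontent(dbfile, url, fetch):
    database = dbm.open(dbfile, 'c')
    try:
        key = md5key(url)
        if key in database:
            return database[key].decode('utf-8')   # cache hit
        content = fetch(url)
        if content is None:
            return None
        if len(content) > 2000:                    # cache only substantial pages
            database[key] = content.encode('utf-8')
        return content
    finally:
        database.close()                           # one handle, always closed

page = getcontent('cache_demo', 'http://example.com',
                  lambda u: '<html>' + 'x' * 3000 + '</html>')
print(len(page))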
Example #9
def aggregate_curcomments(self):
    # Collect the not-yet-pushed comments, grouped by url md5.
    sqlf = 'SELECT {url},{content},{publish} from {table} where {key1} is null'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_COMMENTS,
                      url=SQLDAO.SPIDER_TABLE_COMMENTS_URL,
                      content=SQLDAO.SPIDER_TABLE_COMMENTS_CONTENT,
                      publish=SQLDAO.SPIDER_TABLE_COMMENTS_PUBLISH_DATE,
                      key1=SQLDAO.SPIDER_TABLE_COMMENTS_KEY1)
    cmtsresults = SQLDAO.getinstance().execute(sql, find=True)
    for cmtsresult in cmtsresults:
        urlmd5 = Common.md5(cmtsresult[0])
        content = self.strfilter(cmtsresult[1])
        publish = TimeUtility.getinttime(cmtsresult[2])
        if urlmd5 not in self.url_curcmtcontent_map:
            self.url_curcmtcontent_map[urlmd5] = []
        self.url_curcmtcontent_map[urlmd5].append(content + '_' + str(int(publish)))
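The grow-a-list-per-key step these aggregators repeat can also be written with collections.defaultdict; a standalone sketch over made-up rows:

from collections import defaultdict

url_curcmtcontent_map = defaultdict(list)
rows = [('http://a.example/1', 'great read', 1700000000),
        ('http://a.example/1', 'agreed', 1700000100)]
for url, content, publish in rows:
    url_curcmtcontent_map[url].append('%s_%d' % (content, publish))

print(dict(url_curcmtcontent_map))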
Example #10
def generateurlfilepath(self, retrytimes=0):
    context = URLFileContext()
    context.channel = SpiderConfigure.getinstance().getchannel()
    context.query = SpiderConfigure.getinstance().getquery()
    context.retry = retrytimes
    # Avoid generating two url files with the same name: if the
    # second-granularity timestamp collides, wait 1 second and re-read it.
    if self.urlfiletimestamp == int(time.time()):
        time.sleep(1)
    self.urlfiletimestamp = int(time.time())
    self.urlsfile = URLFileManager.URLS_FILE_PATTERN.format(
        path=self.tempurldir,
        channel=context.channel,
        query=Common.md5(context.query),
        ts=self.urlfiletimestamp)
    context.filename = self.urlsfile
    self.urlsfilemap[FileUtility.getfilename(self.urlsfile)] = context
    Logger.getlogging().info(self.urlsfile)
    return self.urlsfile
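The collision guard on its own: with second-granularity timestamps, two calls in the same second would name the same file, so the second caller sleeps until the clock moves on:

import time

lasttimestamp = int(time.time())

def nexttimestamp():
    # If the previous stamp equals the current second, wait it out.
    global lasttimestamp
    if lasttimestamp == int(time.time()):
        time.sleep(1)
    lasttimestamp = int(time.time())
    return lasttimestamp

print(nexttimestamp(), nexttimestamp())  # two distinct second-level stamps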
def update(channelorquery, type, key, delta, url=None):
    # Update the per-channel/query report list.
    if channelorquery not in SpiderReport.getinstance().reportlist:
        SpiderReport.getinstance().reportlist[channelorquery] = {}
    r = SpiderReport.getinstance().reportlist[channelorquery]
    if type not in r:
        r[type] = Report()
    if channelorquery != constant.SPIDER_CHANNEL_S1:
        r[type].channel = constant.SPIDER_CHANNEL_S2
        r[type].query = channelorquery
        r[type].type = type
    r[type].values[key] += delta
    # Update the per-site S2 report list.
    if channelorquery != constant.SPIDER_CHANNEL_S1 and url is not None:
        urlmd5 = Common.md5(url.strip())
        if urlmd5 in SpiderReport.getinstance().s2urlsitemap:
            website = SpiderReport.getinstance().s2urlsitemap[urlmd5]
            SpiderReport.updates2site(channelorquery, website, key, delta)
    # Update the overall totals.
    SpiderReport.getinstance().totalreport.values[key] += delta
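The nested counters this maintains (channel or query, then type, then metric) can be modeled with defaultdict and Counter; a minimal sketch, not the project's Report class:

from collections import Counter, defaultdict

# reportlist[channelorquery][type] is a Counter of metric -> count.
reportlist = defaultdict(lambda: defaultdict(Counter))
totalreport = Counter()

def update(channelorquery, type_, key, delta):
    reportlist[channelorquery][type_][key] += delta
    totalreport[key] += delta

update('my query', 'news', 'URL_UPLOAD', 1)
update('my query', 'news', 'URL_UPLOAD', 2)
print(reportlist['my query']['news']['URL_UPLOAD'], totalreport['URL_UPLOAD'])  # 3 3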
Example #12
def getid(query, machine):
    return Common.md5(Common.urlenc(query) + machine)

def exist(self, url):
    # True only when at least one context is queued for this url.
    return bool(self.urlcontextdict.get(Common.md5(url)))

def seturlcontext(self, url, urlcontext):
    key = Common.md5(url.strip())
    if key not in self.urlcontextdict:
        self.urlcontextdict[key] = []
    self.urlcontextdict[key].append(urlcontext)

def geturlcontext(self, url):
    # Pop the most recent context; guard against urls that were never stored.
    key = Common.md5(url)
    if self.urlcontextdict.get(key):
        return self.urlcontextdict[key].pop()
Example #16
def fileformat(self):
    self.aggregate_beforenewsinfo()
    self.aggregate_beforenewsnum()
    self.aggregate_curcomments()
    self.aggregate_curcmtnum()
    self.aggregate_beforecmtsnum()
    self.dereplicate()
    urllist = []
    idlist = []
    newscond = '{key} is null'.format(key=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().find(SQLDAO.SPIDER_TABLE_NEWS, where=newscond)
    for result in results:
        doc = SQLDAO.getdictdata(SQLDAO.SPIDER_TABLE_NEWS_KEYS, result)
        id = doc[SQLDAO.SPIDER_TABLE_NEWS_ID]
        url = doc[SQLDAO.SPIDER_TABLE_NEWS_URL].strip()
        try:
            urlmd5 = Common.md5(url)
            channel = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CHANNEL, '201')
            title = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TITLE, '')
            body = doc.get(SQLDAO.SPIDER_TABLE_NEWS_BODY, '')
            commentlist = self.url_curcmtcontent_map.get(urlmd5, [])
            comments = ' '.join(commentlist)
            pubtime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_PUBLISH_DATE, TimeUtility.getintformtime(0))
            crawlertime = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE, TimeUtility.getintformtime(0))
            type = doc.get(SQLDAO.SPIDER_TABLE_NEWS_TYPE, '')
            query = doc.get(SQLDAO.SPIDER_TABLE_NEWS_QUERY, '')
            # Incremental push of the comment count:
            # - First push (no comments for this url marked key1=1 yet):
            #   push the full count -- news.cmtnum when present (>= 0),
            #   otherwise the comment-table count already aggregated into
            #   url_curcmtnum_map.
            # - Later pushes (some comments already marked key1=1):
            #   push only the increment, i.e. the comment-table count from
            #   url_curcmtnum_map.
            cmtkey1flag = self.url_beforecmtnum_map.get(urlmd5, -1)
            if cmtkey1flag <= 0:
                cmtnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, -1)
                if cmtnum < 0:
                    cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            else:
                cmtnum = self.url_curcmtnum_map.get(urlmd5, 0)
            # Other incremental counters.
            clicknum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, -1)
            clicknum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknum)
            votenum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, -1)
            votenum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenum)
            fansnum = doc.get(SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, -1)
            fansnum = self.increment(urlmd5, SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnum)
            string = FileFormat.DEFAULT_NEWS_FORMAT.format(channel=channel,
                                                           url=url,
                                                           title=self.strfilter(title),
                                                           body=self.strfilter(body),
                                                           comments=comments,
                                                           cmtnum=cmtnum,
                                                           clicknum=clicknum,
                                                           votenum=votenum,
                                                           fansnum=fansnum,
                                                           pubtime=TimeUtility.getinttime(pubtime),
                                                           crawlertime=crawlertime,
                                                           type=type,
                                                           query=self.strfilter(query))
            Logger.getlogging().info(u'{channel}\t{query}\t{url}'.format(channel=channel, query=query, url=url).encode(constant.CHARSET_UTF8))
            # Records without a title go to the error file.
            if not title:
                FileUtility.writeline(self.errorinfopath, string.encode(constant.CHARSET_UTF8))
            else:
                FileUtility.writeline(self.outputpath, string.encode(constant.CHARSET_UTF8))
            if id not in idlist:
                idlist.append(id)
            if title and commentlist:
                if url not in urllist:
                    urllist.append(url)
        except Exception:
            Logger.getlogging().error(str(result))
            Logger.printexception()
    # Everything extracted in this run gets its key1 flag set to 1.
    self.updatenewsflag(idlist)
    self.updatecommentsflag(urllist)
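The comment-count decision above, extracted into a pure function for clarity (the names are mine, not the project's):

def choose_cmtnum(before_pushed, news_cmtnum, cur_new_cmts):
    # First push: send the full count, preferring a valid news.cmtnum.
    if before_pushed <= 0:
        if news_cmtnum >= 0:
            return news_cmtnum
        return cur_new_cmts
    # Later pushes: send only the count of new comments.
    return cur_new_cmts

print(choose_cmtnum(0, 12, 5))   # 12: first push, news count is valid
print(choose_cmtnum(0, -1, 5))   # 5: first push, fall back to comment table
print(choose_cmtnum(3, 12, 5))   # 5: incremental push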
Example #17
def aggregate_beforenewsinfo(self):
    # Extract, for urls already pushed (key1=1), the latest valid value
    # of each counter:
    # 1. keep only valid values (>0), recording each value's crawl time;
    # 2. per url, take the value recorded at the latest crawl time.
    sqlf = 'SELECT {url},{createtime},{cmtnum},{clicknum},{votenum},{fansnum} from {table} where {key1}=1'
    sql = sqlf.format(table=SQLDAO.SPIDER_TABLE_NEWS,
                      url=SQLDAO.SPIDER_TABLE_NEWS_URL,
                      createtime=SQLDAO.SPIDER_TABLE_NEWS_CREATE_DATE,
                      cmtnum=SQLDAO.SPIDER_TABLE_NEWS_CMTNUM,
                      clicknum=SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM,
                      votenum=SQLDAO.SPIDER_TABLE_NEWS_VOTENUM,
                      fansnum=SQLDAO.SPIDER_TABLE_NEWS_FANSNUM,
                      key1=SQLDAO.SPIDER_TABLE_NEWS_KEY1)
    results = SQLDAO.getinstance().execute(sql, find=True)
    cmtnumlist = {}
    clicknumlist = {}
    votenumlist = {}
    fansnumlist = {}
    for result in results:
        url = result[0].strip()
        urlmd5 = Common.md5(url)
        createtime = result[1]
        cmtnum = result[2]
        clicknum = result[3]
        votenum = result[4]
        fansnum = result[5]
        if urlmd5 not in cmtnumlist:
            cmtnumlist[urlmd5] = {}
        if urlmd5 not in clicknumlist:
            clicknumlist[urlmd5] = {}
        if urlmd5 not in votenumlist:
            votenumlist[urlmd5] = {}
        if urlmd5 not in fansnumlist:
            fansnumlist[urlmd5] = {}
        # Store valid values (>0) keyed by their crawl time.
        if cmtnum > 0:
            if cmtnumlist[urlmd5].get(str(createtime), 0) <= cmtnum:
                cmtnumlist[urlmd5][str(createtime)] = cmtnum
        if clicknum > 0:
            if clicknumlist[urlmd5].get(str(createtime), 0) <= clicknum:
                clicknumlist[urlmd5][str(createtime)] = clicknum
        if votenum > 0:
            if votenumlist[urlmd5].get(str(createtime), 0) <= votenum:
                votenumlist[urlmd5][str(createtime)] = votenum
        if fansnum > 0:
            if fansnumlist[urlmd5].get(str(createtime), 0) <= fansnum:
                fansnumlist[urlmd5][str(createtime)] = fansnum
    # Per url, keep the value recorded at the latest crawl time.
    for field, numlist in ((SQLDAO.SPIDER_TABLE_NEWS_CMTNUM, cmtnumlist),
                           (SQLDAO.SPIDER_TABLE_NEWS_CLICKNUM, clicknumlist),
                           (SQLDAO.SPIDER_TABLE_NEWS_VOTENUM, votenumlist),
                           (SQLDAO.SPIDER_TABLE_NEWS_FANSNUM, fansnumlist)):
        for urlmd5, value in numlist.iteritems():
            if not value:
                continue
            self.url_beforenewsinfo_map[field][urlmd5] = value[max(value)]
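Step 2 on its own: each url keeps a dict of crawl time to value, and the winner is the value under the largest key. max over the stringified times is correct as long as they are fixed-width, which uniform timestamps are:

cmtnumlist = {
    'urlmd5-a': {'20240101000000': 10, '20240102000000': 14},
    'urlmd5-b': {},                      # no valid (>0) samples recorded
}
url_beforenews_cmtnum = {}
for urlmd5, value in cmtnumlist.items():
    if not value:
        continue
    # max(value) is the latest crawl time; value[max(value)] its count.
    url_beforenews_cmtnum[urlmd5] = value[max(value)]
print(url_beforenews_cmtnum)  # {'urlmd5-a': 14}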