Пример #1
0
 def cache_url(self, url, status, response=None):
     """Record ``url`` as a Page and optionally store a PageVersion for it.

     The Page is looked up by ``url_hash`` (the MD5 hex digest of the URL);
     a new Page is created when none is found. The Page is saved on every
     call. When ``response`` is truthy, a new PageVersion holding the
     response code, body, body hash and URL is created and saved as well.

     Args:
         url: The URL, as a byte string or ``unicode``. Byte strings are
             decoded as UTF-8 before being stored.
         status: Not used by this method.
         response: Optional object exposing ``code``, ``body`` and ``url``.
             When falsy, only the Page is saved.

     Returns:
         None.

     Raises:
         UnicodeDecodeError: If a byte-string URL or body is not valid UTF-8.
     """
     def _to_unicode(value):
         # Byte strings are decoded as UTF-8; unicode passes through as-is.
         return value if isinstance(value, unicode) else value.decode('utf-8')

     # Hash the UTF-8 bytes explicitly: handing a non-ASCII unicode URL
     # straight to md5() triggers an implicit ASCII encode and raises
     # UnicodeEncodeError. ASCII and byte-string URLs hash as before.
     url_bytes = url.encode('utf-8') if isinstance(url, unicode) else url
     url_hash = md5(url_bytes).hexdigest()

     page = Page().one({'url_hash': url_hash})
     if not page:
         page = New(Page())
         page.url = _to_unicode(url)
         page.url_hash = _to_unicode(url_hash)
     # Saved whether the page was just created or already existed.
     page.save()

     if not response:
         return

     body = _to_unicode(response.body)
     page_version = New(PageVersion())
     page_version.page = page._id
     page_version.code = response.code
     page_version.raw = body
     # The content hash is computed over the UTF-8 encoding of the body.
     page_version.content_hash = _to_unicode(
         sha256(body.encode('utf-8')).hexdigest())
     page_version.url = _to_unicode(response.url)
     page_version.save()
Пример #2
0
 def dump(self, mario, rss=None):
     """Persist the crawl results of this object as a PageSandbox.

     For every entry in ``self.results`` the matching Page (looked up by the
     MD5 hex digest of its effective URL) is fetched or created, the anchors
     recorded for that URL in ``mario.link_title_db.dic`` are merged into
     ``page.anchors``, and one PageVersion id is appended to the sandbox:
     an existing version if one is stored for the page, otherwise a newly
     created one built from the result.

     Args:
         mario: Object exposing ``link_title_db.dic``, a mapping from URL to
             an iterable of ``(link, title, context)`` tuples.
         rss: Optional sequence whose first two items are the RSS URL and
             RSS body. Attached to the sandbox on a best-effort basis.

     Returns:
         None.

     Raises:
         UnicodeDecodeError: If a byte string taken from the results or the
             anchors is not valid UTF-8.
     """
     def _to_unicode(value):
         # Byte strings are decoded as UTF-8; unicode passes through as-is.
         return value if isinstance(value, unicode) else value.decode('utf-8')

     page_sandbox = New(PageSandbox())
     if self.analysis:
         page_sandbox.analyzer = 1
     if self.mixed:
         page_sandbox.mixed = 1
     page_sandbox.identifier = _to_unicode(self.identifier)
     page_sandbox.starturl = _to_unicode(self.starturl)

     for r in self.results:
         # Hash the UTF-8 bytes explicitly: md5() on a non-ASCII unicode
         # string would raise UnicodeEncodeError via the implicit ASCII
         # encode. ASCII and byte-string URLs hash as before.
         effective_url = r.effective_url
         if isinstance(effective_url, unicode):
             url_bytes = effective_url.encode('utf-8')
         else:
             url_bytes = effective_url
         url_hash = md5(url_bytes).hexdigest()

         # Outside analysis mode, URLs already present in a sandbox are
         # skipped.
         if not self.analysis and self.check_duplicate_sandbox(_to_unicode(r.url)):
             logger.debug("Existed in sandbox: %s", r.url)
             continue

         page = Page().one({'url_hash': url_hash})
         if not page:
             page = New(Page())
             page.effective_url = _to_unicode(r.effective_url)
             page.url_hash = _to_unicode(url_hash)

         # Merge every (link, title) recorded for this URL into the page's
         # anchors, skipping entries that are already present.
         for url, links in mario.link_title_db.dic.items():
             if url != r.effective_url:
                 continue
             for link, title, context in links:
                 lt = {'name': _to_unicode(title), 'url': _to_unicode(link)}
                 if lt not in page.anchors:
                     page.anchors.append(lt)
         page.save()

         page_versions = PageVersion().find({'page': page._id}).limit(1)
         if page_versions.count() > 0:
             # Reuse the first stored version of this page.
             for page_version in page_versions:
                 page_sandbox.page_versions.append(page_version._id)
                 break
         else:
             # No version stored yet: create one from this result. This used
             # to be guarded by ``if not page_versions``, which tests the
             # cursor object itself; a pymongo-style cursor is always truthy,
             # so the branch could never run.
             body = _to_unicode(r.body)
             page_version = New(PageVersion())
             page_version.page = page._id
             page_version.code = r.code
             page_version.raw = body
             # The content hash is computed over the UTF-8 encoding of the body.
             page_version.content_hash = _to_unicode(
                 sha256(body.encode('utf-8')).hexdigest())
             page_version.url = _to_unicode(r.url)
             page_version.save()
             page_sandbox.page_versions.append(page_version._id)

     if rss:
         # Best effort: a malformed ``rss`` value must not prevent the sandbox
         # from being saved, but the failure is logged instead of being
         # silently discarded.
         try:
             page_sandbox.rss.url = _to_unicode(rss[0])
             page_sandbox.rss.body = _to_unicode(rss[1])
         except Exception:
             logger.warning("Could not attach RSS data to sandbox", exc_info=True)
     page_sandbox.save()