def cache_url(self, url, status, response=None):
    """Make sure a Page exists for ``url`` and optionally store a version of it.

    The page is looked up by the MD5 hex digest of ``url``. When no page
    matches, a new one is created with its ``url`` and ``url_hash`` fields
    set and saved. When ``response`` is truthy, a new PageVersion pointing
    at the page is saved with the response's code, body, final URL and the
    SHA-256 hex digest of the body.

    Args:
        url: URL used as the lookup key; byte string or unicode.
        status: Accepted for interface compatibility; not used by this method.
        response: Optional object exposing ``code``, ``body`` and ``url``.
            When falsy, only the Page lookup/creation is performed.

    Returns:
        None.
    """
    def _text(value):
        # Stored fields are unicode; byte strings are decoded as UTF-8.
        return value if isinstance(value, unicode) else value.decode('utf-8')

    # Hash explicit UTF-8 bytes: handing a unicode URL with non-ASCII
    # characters straight to md5() triggers an implicit ASCII encode and
    # raises UnicodeEncodeError. Byte-string and ASCII inputs hash exactly
    # as before.
    url_bytes = url.encode('utf-8') if isinstance(url, unicode) else url
    url_hash = md5(url_bytes).hexdigest()

    page = Page().one({'url_hash': url_hash})
    if not page:
        page = New(Page())
        page.url = _text(url)
        page.url_hash = _text(url_hash)
        page.save()

    if not response:
        return

    page_version = New(PageVersion())
    page_version.page = page._id
    page_version.code = response.code
    body = _text(response.body)
    page_version.raw = body
    # The content hash is always computed over the UTF-8 encoding of the body.
    page_version.content_hash = _text(sha256(body.encode('utf-8')).hexdigest())
    page_version.url = _text(response.url)
    page_version.save()
def dump(self, mario, rss=None):
    """Persist the collected results as one PageSandbox record.

    A new PageSandbox is filled from ``self.identifier``, ``self.starturl``
    and the ``self.analysis`` / ``self.mixed`` flags. For every entry in
    ``self.results`` the matching Page (keyed by the MD5 hex digest of the
    entry's ``effective_url``) is fetched or created, its ``anchors`` list
    is extended with the link/title pairs recorded for that URL in
    ``mario.link_title_db.dic``, and the id of one PageVersion of that page
    is appended to the sandbox. Finally the optional rss pair is attached
    and the sandbox is saved.

    Args:
        mario: Object exposing ``link_title_db.dic``, a mapping from URL to
            an iterable of ``(link, title, context)`` triples.
        rss: Optional indexable pair ``(url, body)``; attached on a
            best-effort basis.

    Returns:
        None.
    """
    def _text(value):
        # Stored fields are unicode; byte strings are decoded as UTF-8.
        return value if isinstance(value, unicode) else value.decode('utf-8')

    def _raw(value):
        # Hash functions need bytes. Encoding explicitly avoids the implicit
        # ASCII encode that fails on non-ASCII unicode input.
        return value.encode('utf-8') if isinstance(value, unicode) else value

    page_sandbox = New(PageSandbox())
    if self.analysis:
        page_sandbox.analyzer = 1
    if self.mixed:
        page_sandbox.mixed = 1
    page_sandbox.identifier = _text(self.identifier)
    page_sandbox.starturl = _text(self.starturl)

    for r in self.results:
        url_hash = md5(_raw(r.effective_url)).hexdigest()

        # Outside analysis mode, results already present in a sandbox are skipped.
        if not self.analysis and self.check_duplicate_sandbox(_text(r.url)):
            logger.debug("Existed in sandbox: %s"%(r.url))
            continue

        page = Page().one({'url_hash': url_hash})
        if not page:
            page = New(Page())
            page.effective_url = _text(r.effective_url)
            page.url_hash = _text(url_hash)

        # Merge the anchors recorded for this URL, skipping ones already stored.
        for links in (links for url, links in mario.link_title_db.dic.items()
                      if url == r.effective_url):
            for link, title, _context in links:
                anchor = {'name': _text(title), 'url': _text(link)}
                if anchor not in page.anchors:
                    page.anchors.append(anchor)
        page.save()

        page_versions = PageVersion().find({'page': page._id}).limit(1)
        if page_versions.count() > 0:
            # Reuse the first stored version of this page.
            for page_version in page_versions:
                page_sandbox.page_versions.append(page_version._id)
                break
        else:
            # NOTE(review): the original guarded this branch with
            # ``if not page_versions`` on the query result itself. If that
            # object does not implement truthiness, the branch could never
            # run; deciding on the count makes "no version yet" reachable.
            # Confirm against the PageVersion.find() implementation.
            page_version = New(PageVersion())
            page_version.page = page._id
            page_version.code = r.code
            body = _text(r.body)
            page_version.raw = body
            page_version.content_hash = _text(sha256(body.encode('utf-8')).hexdigest())
            page_version.url = _text(r.url)
            page_version.save()
            page_sandbox.page_versions.append(page_version._id)

    if rss:
        try:
            page_sandbox.rss.url = _text(rss[0])
            page_sandbox.rss.body = _text(rss[1])
        except Exception as exc:
            # Attaching the feed stays best-effort, but the failure is now
            # recorded and interpreter-exit signals are no longer swallowed.
            logger.debug("Could not attach rss to sandbox: %r" % (exc,))

    page_sandbox.save()