def finalized(self):
    """Send a completion email for this Pipeline.

    Automatically called from ``finalized`` for root Pipelines that do
    not override the default action. Looks up the crawl results for the
    job's URL and mails a rendered report to the registered address,
    attaching the fetched page content when available.
    """
    status = "successful"
    if self.was_aborted:
        status = "aborted"
    # The target URL and recipient were stashed in memcache when the
    # job was started -- TODO(review): confirm both keys are always set.
    url = memcache.get("url")
    email = memcache.get("email")
    base_dir = os.path.realpath(os.path.dirname(__file__))
    # Configure jinja for internal templates.
    env = Environment(
        autoescape=True,
        extensions=["jinja2.ext.i18n"],
        loader=FileSystemLoader(os.path.join(base_dir, "templates")),
    )
    subject = "Your Fetcher Job is " + status
    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    crawl_db_datum = crawl_db_datums[0]
    # Start the child-entity queries concurrently, then resolve the
    # futures before rendering: the template iterates real lists, and an
    # unresolved ndb Future is not iterable.
    content_future = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    link_future = LinkDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    attachments = []
    if fetched_db_datums:
        fetched_db_datum = fetched_db_datums[0]
        attachments.append(
            ("fetched_content.html", fetched_db_datum.fetched_content))
    html = env.get_template("mail_template.html").render(
        url=url,
        contents=content_future.get_result(),
        links=link_future.get_result(),
    )
    attachments.append(("sendmail.html", html))
    sender = "*****@*****.**"
    mail.send_mail(
        sender=sender,
        to=email,
        subject=subject,
        body="FetchResults",
        html=html,
        attachments=attachments,
    )
def htmlParser(key, content):
    """Extract link targets from an HTML document.

    Every ``href`` value found in *content* is stored as a LinkDbDatum
    child entity of *key*; the datastore write is issued asynchronously
    and not awaited (fire-and-forget). Returns the list of ``src``
    attribute values (embedded resources such as images and scripts).
    """
    anchor_urls = re.findall(r'href=[\'"]?([^\'" >]+)', content)
    datums = [LinkDbDatum(parent=key, link_url=u) for u in anchor_urls]
    ndb.put_multi_async(datums)
    return re.findall(r'src=[\'"]?([^\'" >]+)', content)
def testFetchEndToEnd(self):
    """End-to-end test of the fetcher job.

    Mocks robots.txt, an HTML page, and an image response, runs the
    FetcherPipeline to completion, then verifies that crawl, fetched,
    link, and content entities were written to the datastore.
    """
    createMockCrawlDbDatum("http://foo.com/bar.html")
    static_robots = ("User-agent: test\nDisallow: /content_0\n"
                     "Disallow: /content_1\nDisallow: /content_3")
    self.setReturnValue(
        url="http://foo.com/robots.txt",
        content=static_robots,
        headers={"Content-Length": len(static_robots)})
    # Static resource is read from resource.
    resource = self.getResource("sample_content.html")
    static_content = resource.read()
    static_content_length = len(static_content)
    self.setReturnValue(
        url="http://foo.com/bar.html",
        content=static_content,
        headers={
            "Content-Length": static_content_length,
            "Content-Type": "text/html",
        })
    resource_image = self.getResource("slide1.png")
    static_content_image = resource_image.read()
    static_content_length = len(static_content_image)
    self.setReturnValue(
        url="http://foo.com/images/slide1.png",
        content=static_content_image,
        headers={
            "Content-Length": static_content_length,
            "Content-Type": "image/png",
        })
    p = pipelines.FetcherPipeline(
        "FetcherPipeline",
        params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
        parser_params={"text/html": __name__ + ".htmlParser"},
        shards=2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    crawl_db_datums = CrawlDbDatum.query(
        CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
    crawl_db_datum = crawl_db_datums[0]
    # assertTrue(a, b) treats b as the failure message and always passes
    # for truthy a -- these must be equality assertions.
    self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    fetched_db_datum = fetched_db_datums[0]
    self.assertIsNotNone(fetched_db_datum)
    self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
    link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertGreater(len(link_db_datums), 0)
    contents_db_datums = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    self.assertGreater(len(contents_db_datums), 0)
def testFetchEndToEnd(self):
    """End-to-end test of the fetcher job.

    Mocks robots.txt, an HTML page, and an image response, runs the
    FetcherPipeline to completion, then verifies that crawl, fetched,
    link, and content entities were written to the datastore.
    """
    createMockCrawlDbDatum("http://foo.com/bar.html")
    static_robots = ("User-agent: test\nDisallow: /content_0\n"
                     "Disallow: /content_1\nDisallow: /content_3")
    self.setReturnValue(
        url="http://foo.com/robots.txt",
        content=static_robots,
        headers={"Content-Length": len(static_robots)})
    # Static resource is read from resource.
    resource = self.getResource("sample_content.html")
    static_content = resource.read()
    static_content_length = len(static_content)
    self.setReturnValue(
        url="http://foo.com/bar.html",
        content=static_content,
        headers={"Content-Length": static_content_length,
                 "Content-Type": "text/html"})
    resource_image = self.getResource("slide1.png")
    static_content_image = resource_image.read()
    static_content_length = len(static_content_image)
    self.setReturnValue(
        url="http://foo.com/images/slide1.png",
        content=static_content_image,
        headers={"Content-Length": static_content_length,
                 "Content-Type": "image/png"})
    p = pipelines.FetcherPipeline(
        "FetcherPipeline",
        params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
        parser_params={"text/html": __name__ + ".htmlParser"},
        shards=2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    crawl_db_datums = CrawlDbDatum.query(
        CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
    crawl_db_datum = crawl_db_datums[0]
    # assertTrue(a, b) treats b as the failure message and always passes
    # for truthy a -- these must be equality assertions.
    self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    fetched_db_datum = fetched_db_datums[0]
    self.assertIsNotNone(fetched_db_datum)
    self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
    link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertGreater(len(link_db_datums), 0)
    contents_db_datums = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    self.assertGreater(len(contents_db_datums), 0)
def finalized(self):
    """Send a completion email for this Pipeline.

    Automatically called from ``finalized`` for root Pipelines that do
    not override the default action. Looks up the crawl results for the
    job's URL and mails a rendered report to the registered address,
    attaching the fetched page content when available.
    """
    status = "aborted" if self.was_aborted else "successful"
    # URL and recipient were stashed in memcache when the job started --
    # TODO(review): confirm both keys are always present.
    url = memcache.get("url")
    email = memcache.get("email")
    base_dir = os.path.realpath(os.path.dirname(__file__))
    # Configure jinja for internal templates.
    env = Environment(
        autoescape=True,
        extensions=["jinja2.ext.i18n"],
        loader=FileSystemLoader(os.path.join(base_dir, "templates")))
    subject = "Your Fetcher Job is " + status
    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    crawl_db_datum = crawl_db_datums[0]
    # Issue the child-entity queries concurrently and resolve the
    # futures before rendering: jinja cannot iterate an ndb Future.
    content_future = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    link_future = LinkDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    attachments = []
    if fetched_db_datums:
        attachments.append(
            ("fetched_content.html", fetched_db_datums[0].fetched_content))
    html = env.get_template("mail_template.html").render(
        url=url,
        contents=content_future.get_result(),
        links=link_future.get_result())
    attachments.append(("sendmail.html", html))
    sender = "*****@*****.**"
    mail.send_mail(
        sender=sender,
        to=email,
        subject=subject,
        body="FetchResults",
        html=html,
        attachments=attachments)