def testSuccessfulRun(self):
  createMockCrawlDbDatum(2, 2, True)
  file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
  file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
  static_content = "<html><body>TestContent</body></html>"
  self.setReturnValue(content=static_content,
                      headers={"Content-Length": len(static_content),
                               "Content-Type": "text/html"})
  p = pipelines._FetchPagePipeline("FetchPipeline",
                                   [file_name1, file_name2],
                                   2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)
  # The output files can be opened and live in blobstore.
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))
  entities = CrawlDbDatum.query(
      CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
  entity = entities[0]
  fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
  # fetch() returns a list, so assert it is non-empty instead of
  # comparing the list against None (which is always true).
  self.assertTrue(len(fetched_datums) > 0)
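# createMockData is a test helper not shown in this snippet. A minimal
# sketch follows, assuming the App Engine mapreduce test convention of
# writing a KeyValue-encoded record into a blobstore file; the helper
# name comes from the test above, but the files/records usage here is an
# assumption, not the project's confirmed implementation.
from google.appengine.api import files
from google.appengine.api.files import file_service_pb, records

def createMockData(self, data):
  """Writes one (url, fetch_flag) record and returns the file name."""
  url, flag = data
  file_path = files.blobstore.create("text/plain")
  with files.open(file_path, "a") as fp:
    writer = records.RecordsWriter(fp)
    proto = file_service_pb.KeyValue()
    proto.set_key(str(url))
    proto.set_value(str(flag))
    writer.write(proto.Encode())
  files.finalize(file_path)
  blob_key = files.blobstore.get_blob_key(file_path)
  # The returned name starts with "/blobstore/", matching the assertion
  # on the pipeline's output paths in the test above.
  return files.blobstore.get_file_name(blob_key)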
def finalized(self):
  """Sends an email to admins indicating this Pipeline has completed.

  For developer convenience. Automatically called from finalized for
  root Pipelines that do not override the default action.
  """
  status = "successful"
  if self.was_aborted:
    status = "aborted"
  url = memcache.get("url")
  email = memcache.get("email")
  base_dir = os.path.realpath(os.path.dirname(__file__))
  # Configure jinja for internal templates.
  env = Environment(
      autoescape=True,
      extensions=["jinja2.ext.i18n"],
      loader=FileSystemLoader(os.path.join(base_dir, "templates")))
  subject = "Your Fetcher Job is " + status
  crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
  crawl_db_datum = crawl_db_datums[0]
  # Kick off the asynchronous queries first, then run the synchronous one.
  content_db_future = ContentDbDatum.query(
      ancestor=crawl_db_datum.key).fetch_async()
  link_db_future = LinkDbDatum.query(
      ancestor=crawl_db_datum.key).fetch_async()
  fetched_db_datums = FetchedDbDatum.query(
      ancestor=crawl_db_datum.key).fetch()
  attachments = []
  if len(fetched_db_datums) > 0:
    fetched_db_datum = fetched_db_datums[0]
    attachments.append(
        ("fetched_content.html", fetched_db_datum.fetched_content))
  # Resolve the futures before handing the results to the template;
  # rendering a raw Future would fail.
  html = env.get_template("mail_template.html").render(
      url=url,
      contents=content_db_future.get_result(),
      links=link_db_future.get_result())
  attachments.append(("sendmail.html", html))
  sender = "*****@*****.**"
  mail.send_mail(
      sender=sender,
      to=email,
      subject=subject,
      body="FetchResults",
      html=html,
      attachments=attachments)
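# finalized() reads "url" and "email" back from memcache. A sketch of
# the counterpart writes, assuming the values are stored when the job is
# kicked off; the helper name _store_notification_params is hypothetical.
from google.appengine.api import memcache

def _store_notification_params(url, email):
  # Stored under plain, unkeyed names, mirroring the "url"/"email" keys
  # read in finalized(); concurrent jobs would overwrite each other.
  memcache.set("url", url)
  memcache.set("email", email)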
def _extract_content_urls_map(data):
  """Map function that extracts outlinks from fetched content.

  The fetched content is parsed by a user-defined function (UDF)
  registered per mime type, which stores the outlinks and returns the
  URLs of embedded content to fetch. For example, a parser UDF for HTML
  could be implemented like this::

    def htmlParser(key, content):
      outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
      link_datums = []
      for link in outlinks:
        link_datum = LinkDbDatum(parent=key, link_url=link)
        link_datums.append(link_datum)
      ndb.put_multi_async(link_datums)
      content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
      return content_links

  Note: the URLs returned by the UDF are the targets that will be
  fetched in the next job (FetchContentPipeline).

  Args:
    data: key-value data, where the key is a position and the value is
      a URL.

  Yields:
    Tuples of (page url, content url).
  """
  k, url = data
  query = CrawlDbDatum.query(CrawlDbDatum.url == url)
  crawl_db_datum = query.fetch()
  key = crawl_db_datum[0].key
  fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
  fetched_datum = fetched_datums[0]
  content = None
  if fetched_datum is not None:
    content = fetched_datum.fetched_content
    mime_type = fetched_datum.content_type
  if content is not None:
    parsed_obj = None
    params = None
    try:
      params = _get_parser_param(_PARSER_PARAM_KEY)
      parsed_obj = util.handler_for_name(params[mime_type])(key, content)
    except Exception as e:
      # Log str(e) rather than e.message: the message attribute is
      # deprecated and not present on every exception type.
      logging.warning("Can not handle content for %s [params: %s]: %s"
                      % (mime_type, params, e))
    if parsed_obj is not None:
      for content_urls in parsed_obj:
        yield (url, content_urls)
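# _get_parser_param and _PARSER_PARAM_KEY are referenced above but not
# defined in this snippet. A minimal sketch, assuming the root pipeline
# stashes the mime-type-to-UDF mapping under a fixed memcache key; both
# the key value and the storage mechanism are assumptions.
from google.appengine.api import memcache

_PARSER_PARAM_KEY = "fetcher_parser_params"

def _get_parser_param(key):
  """Returns a dict mapping mime types to UDF names,
  e.g. {"text/html": "mymodule.htmlParser"}."""
  params = memcache.get(key)
  if params is None:
    raise ValueError("no parser params stored under %s" % key)
  return params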
def testFetchEndToEnd(self):
  """End-to-end test of the fetcher job."""
  createMockCrawlDbDatum("http://foo.com/bar.html")
  static_robots = ("User-agent: test\nDisallow: /content_0\n"
                   "Disallow: /content_1\nDisallow: /content_3")
  self.setReturnValue(url="http://foo.com/robots.txt",
                      content=static_robots,
                      headers={"Content-Length": len(static_robots)})
  # The static content is read from test resource files.
  resource = self.getResource("sample_content.html")
  static_content = resource.read()
  static_content_length = len(static_content)
  self.setReturnValue(url="http://foo.com/bar.html",
                      content=static_content,
                      headers={"Content-Length": static_content_length,
                               "Content-Type": "text/html"})
  resource_image = self.getResource("slide1.png")
  static_content_image = resource_image.read()
  static_content_length = len(static_content_image)
  self.setReturnValue(url="http://foo.com/images/slide1.png",
                      content=static_content_image,
                      headers={"Content-Length": static_content_length,
                               "Content-Type": "image/png"})
  p = pipelines.FetcherPipeline(
      "FetcherPipeline",
      params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
      parser_params={"text/html": __name__ + ".htmlParser"},
      shards=2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  crawl_db_datums = CrawlDbDatum.query(
      CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
  crawl_db_datum = crawl_db_datums[0]
  # assertEqual, not assertTrue: assertTrue's second argument is only a
  # failure message, so the original call never compared anything.
  self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
  fetched_db_datums = FetchedDbDatum.query(
      ancestor=crawl_db_datum.key).fetch()
  self.assertTrue(len(fetched_db_datums) > 0)
  fetched_db_datum = fetched_db_datums[0]
  self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
  link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
  self.assertTrue(len(link_db_datums) > 0)
  contents_db_datums = ContentDbDatum.query(
      ancestor=crawl_db_datum.key).fetch()
  self.assertTrue(len(contents_db_datums) > 0)
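# The test above registers __name__ + ".htmlParser" as the text/html
# parser UDF. A module-level sketch following the UDF contract from
# _extract_content_urls_map's docstring: persist outlinks as LinkDbDatum
# children of the page's key, and return embedded-content URLs so the
# next job can fetch them.
import re
from google.appengine.ext import ndb

def htmlParser(key, content):
  outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
  link_datums = [LinkDbDatum(parent=key, link_url=link)
                 for link in outlinks]
  ndb.put_multi_async(link_datums)
  # src attributes (images, scripts) become the next fetch targets.
  return re.findall(r'src=[\'"]?([^\'" >]+)', content)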
def testSuccessfulRun(self):
  """Test extracting outlinks with a UDF."""
  resource_neg = self.getResource("cloudysunny14.html")
  static_content = resource_neg.read()
  createMockFetchedDatum("http://cloudysunny14.html",
                         static_content,
                         pipelines.FETCHED)
  file_name = self.createMockDataLine("http://cloudysunny14.html\n")
  p = pipelines._ExtractOutlinksPipeline(
      "ExtractOutlinksPipeline",
      file_names=[file_name],
      parser_params={"text/html": __name__ + "._htmlOutlinkParser"})
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  entities = CrawlDbDatum.query(
      CrawlDbDatum.url == "http://cloudysunny14.html").fetch()
  entity = entities[0]
  fetched_datums = FetchedDbDatum.query(ancestor=entity.key).fetch()
  # fetch() returns a list, so test for emptiness rather than None.
  self.assertTrue(len(fetched_datums) > 0)
  qry = CrawlDbDatum.query(
      CrawlDbDatum.last_status == pipelines.UNFETCHED)
  crawl_db_datums = qry.fetch()
  self.assertTrue(len(crawl_db_datums) == 0)
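# createMockFetchedDatum is not shown in this snippet. A sketch inferred
# from the queries in these tests: a CrawlDbDatum with the given status
# and a child FetchedDbDatum carrying the content. Any model fields
# beyond those actually queried in the tests are assumptions.
def createMockFetchedDatum(url, fetched_content, status):
  crawl_db_datum = CrawlDbDatum(url=url, last_status=status)
  key = crawl_db_datum.put()
  FetchedDbDatum(parent=key,
                 fetched_url=url,
                 fetched_content=fetched_content,
                 content_type="text/html").put()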