def createLinkDatum(parent_url, url):
    """Create link CrawlDbDatum mock data."""
    key = ndb.Key(CrawlDbDatum, parent_url)
    CrawlDbDatum.get_or_insert(url, parent=key, url=url,
                               last_status=pipelines.UNFETCHED)
def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Create CrawlDbDatum mock data."""
    for d in range(domain_count):
        for n in range(url_count):
            url = "http://hoge_%d.com/content_%d" % (d, n)
            extracted_url = None
            if isExtracted:
                extracted_url = "http://hoge_%d.com" % d
            datum = CrawlDbDatum(
                parent=ndb.Key(CrawlDbDatum, url),
                url=url,
                extract_domain_url=extracted_url,
                last_status=pipelines.UNFETCHED)
            datum.put()
def testSuccessfulRun(self):
    file_name1 = self.createMockData(
        ("https://developers.google.com/appengine/",
         "http://k.yimg.jp/images/top/sp/logo.gif"))
    file_name2 = self.createMockData(
        ("https://developers.google.com/appengine/",
         "/appengine/images/slide1.png"))
    datum = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, "https://developers.google.com/appengine/"),
        url="https://developers.google.com/appengine/",
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED)
    datum.put()
    resource = self.getResource("slide1.png")
    static_content = resource.read()
    self.setReturnValue(
        content=static_content,
        headers={"Content-Length": len(static_content),
                 "Content-Type": "image/png"})
    p = pipelines._FetchContentPipeline("FetchContentPipeline",
                                        [file_name1, file_name2])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._FetchSetsBufferPipeline.from_id(p.pipeline_id)

    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))

    reader = input_readers.RecordsReader(file_paths, 0)
    for binary_record in reader:
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        key = proto.key()
        value = proto.value()
        self.assertTrue(key is not None)
        self.assertTrue(value is not None)

    query = CrawlDbDatum.query(
        CrawlDbDatum.url == "https://developers.google.com/appengine/")
    crawl_db_datums = query.fetch()
    self.assertTrue(len(crawl_db_datums) > 0)
    key = crawl_db_datums[0].key
    content_datums = ContentDbDatum.query(ancestor=key).fetch()
    self.assertEqual(2, len(content_datums))
def get(self):
    url = self.request.get("target", default_value=None)
    email = self.request.get("email", default_value=None)
    if url is None:
        url = memcache.get("url")
    else:
        memcache.set(key="url", value=url)
    if email is None:
        return
    data = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, url),
        url=url,
        last_status=pipelines.UNFETCHED)
    data.put()
    pipeline = FecherJobPipeline(email)
    pipeline.start()
    path = pipeline.base_path + "/status?root=" + pipeline.pipeline_id
    self.redirect(path)
def testSuccessfulRun(self):
    createMockCrawlDbDatum(2, 6, False)
    p = pipelines._ExactDomainMapreducePipeline(
        "ExactDomainMapreducePipeline",
        params={
            "entity_kind": "lakshmi.datum.CrawlDbDatum",
        },
        shard_count=3)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))

    for file_path in file_paths:
        blob_key = files.blobstore.get_blob_key(file_path)
        reader = input_readers.BlobstoreLineInputReader(blob_key, 0, 100)
        u = 0
        for content in reader:
            self.assertTrue(content[1] is not None)
            u += 1
        self.assertEqual(2, u)

    query = CrawlDbDatum.query(
        CrawlDbDatum.extract_domain_url == "http://hoge_0.com")
    entities = query.fetch()
    for entity in entities:
        self.assertEquals("http://hoge_0.com", entity.extract_domain_url)
def testSuccessfulRun(self):
    createMockCrawlDbDatum(2, 2, True)
    file_name1 = self.createMockData(("http://hoge_0.com/content_0", True))
    file_name2 = self.createMockData(("http://hoge_1.com/content_0", False))
    static_content = "<html><body>TestContent</body></html>"
    self.setReturnValue(
        content=static_content,
        headers={"Content-Length": len(static_content),
                 "Content-Type": "text/html"})
    p = pipelines._FetchPagePipeline("FetchPipeline",
                                     [file_name1, file_name2], 2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._FetchPagePipeline.from_id(p.pipeline_id)

    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))

    entities = CrawlDbDatum.query(
        CrawlDbDatum.url == "http://hoge_0.com/content_0").fetch()
    entity = entities[0]
    fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch()
    self.assertTrue(fetched_datum is not None)
def finalized(self):
    """Sends an email to admins indicating this Pipeline has completed.

    For developer convenience. Automatically called from finalized for root
    Pipelines that do not override the default action.
    """
    status = "successful"
    if self.was_aborted:
        status = "aborted"
    url = memcache.get("url")
    email = memcache.get("email")
    base_dir = os.path.realpath(os.path.dirname(__file__))
    # Configure jinja for internal templates
    env = Environment(
        autoescape=True,
        extensions=["jinja2.ext.i18n"],
        loader=FileSystemLoader(os.path.join(base_dir, "templates")))
    subject = "Your Fetcher Job is " + status
    crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url == url).fetch()
    crawl_db_datum = crawl_db_datums[0]
    content_db_datums = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    attachments = []
    if len(fetched_db_datums) > 0:
        fetched_db_datum = fetched_db_datums[0]
        attachments.append(
            ("fetched_content.html", fetched_db_datum.fetched_content))
    link_db_datums = LinkDbDatum.query(
        ancestor=crawl_db_datum.key).fetch_async()
    html = env.get_template("mail_template.html").render(
        url=url, contents=content_db_datums, links=link_db_datums)
    attachments.append(("sendmail.html", html))
    sender = "*****@*****.**"
    mail.send_mail(
        sender=sender,
        to=email,
        subject=subject,
        body="FetchResults",
        html=html,
        attachments=attachments)
def _fetchContentMap(binary_record):
    """Map function of fetch content.

    Fetched content will be stored to blobstore.

    Args:
        binary_record: key-value data whose key is the url of the target page
            and whose value is the url of the content to fetch.

    Returns:
        url: fetched url.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()
    # Fetch the CrawlDbDatum for the page.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:" +
                        page_url + ":" + e.message)

    # Start fetch.
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    if re.match("^/", target_url):
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

    fetch_result = None
    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            # Storing to blobstore
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        logging.warning("Fetch Error Occurs:" + e.message)

    # Put content to datastore.
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)
def createMockFetchedDatum(url, html_text, status):
    """Create FetchedDatum mock data."""
    key = ndb.Key(CrawlDbDatum, url)
    crawl = CrawlDbDatum.get_or_insert(url, parent=key, url=url,
                                       last_status=status)
    if status != pipelines.UNFETCHED:
        fetched_datum = FetchedDbDatum(parent=crawl.key,
                                       url=url,
                                       fetched_url=url,
                                       fetched_content=html_text,
                                       content_type="text/html")
        fetched_datum.put()
def testSuccessfulRun(self): """Test clean pipeline.""" createMockFetchedDatum("http://foo.html", "Content", pipelines.FETCHED) createMockFetchedDatum("http://bar.html", "Content", pipelines.SKIPPED) createMockFetchedDatum("http://baz.html", "Content", pipelines.UNFETCHED) p = pipelines.CleanDatumPipeline("CleanDatumPipeline", params={ "entity_kind": "lakshmi.datum.CrawlDbDatum", }, shards=3) p.start() test_support.execute_until_empty(self.taskqueue) entities = CrawlDbDatum.query(CrawlDbDatum.url== "http://foo.html").fetch() self.assertEquals(0, len(entities)) entities = CrawlDbDatum.query(CrawlDbDatum.url== "http://bar.html").fetch() self.assertEquals(0, len(entities)) entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://baz.html").fetch() self.assertEquals(1, len(entities))
def testSuccessfulRun(self): """Test extract outlinks by UDF.""" resource_neg = self.getResource("cloudysunny14.html") static_content = resource_neg.read() createMockFetchedDatum("http://cloudysunny14.html", static_content, pipelines.FETCHED) file_name = self.createMockDataLine("http://cloudysunny14.html\n") p = pipelines._ExtractOutlinksPipeline("ExtractOutlinksPipeline", file_names=[file_name], parser_params={ "text/html": __name__+"._htmlOutlinkParser" }) p.start() test_support.execute_until_empty(self.taskqueue) entities = CrawlDbDatum.query(CrawlDbDatum.url=="http://cloudysunny14.html").fetch() entity = entities[0] fetched_datum = FetchedDbDatum.query(ancestor=entity.key).fetch() self.assertTrue(fetched_datum!=None) qry = CrawlDbDatum.query(CrawlDbDatum.last_status == pipelines.UNFETCHED) crawl_db_datums = qry.fetch() self.assertTrue(len(crawl_db_datums)==0)
def testFetchEndToEnd(self):
    """End-to-end test of the fetcher job."""
    createMockCrawlDbDatum("http://foo.com/bar.html")
    static_robots = ("User-agent: test\nDisallow: /content_0\n"
                     "Disallow: /content_1\nDisallow: /content_3")
    self.setReturnValue(url="http://foo.com/robots.txt",
                        content=static_robots,
                        headers={"Content-Length": len(static_robots)})
    # Static resources are read from the test resources.
    resource = self.getResource("sample_content.html")
    static_content = resource.read()
    static_content_length = len(static_content)
    self.setReturnValue(url="http://foo.com/bar.html",
                        content=static_content,
                        headers={"Content-Length": static_content_length,
                                 "Content-Type": "text/html"})
    resource_image = self.getResource("slide1.png")
    static_content_image = resource_image.read()
    static_content_length = len(static_content_image)
    self.setReturnValue(url="http://foo.com/images/slide1.png",
                        content=static_content_image,
                        headers={"Content-Length": static_content_length,
                                 "Content-Type": "image/png"})
    p = pipelines.FetcherPipeline(
        "FetcherPipeline",
        params={"entity_kind": "lakshmi.datum.CrawlDbDatum"},
        parser_params={"text/html": __name__ + ".htmlParser"},
        shards=2)
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    crawl_db_datums = CrawlDbDatum.query(
        CrawlDbDatum.url == "http://foo.com/bar.html").fetch()
    crawl_db_datum = crawl_db_datums[0]
    self.assertEqual(pipelines.FETCHED, crawl_db_datum.last_status)
    fetched_db_datums = FetchedDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    fetched_db_datum = fetched_db_datums[0]
    self.assertTrue(fetched_db_datum is not None)
    self.assertEqual("http://foo.com/bar.html", fetched_db_datum.fetched_url)
    link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(link_db_datums) > 0)
    contents_db_datums = ContentDbDatum.query(
        ancestor=crawl_db_datum.key).fetch()
    self.assertTrue(len(contents_db_datums) > 0)
def _extract_content_urls_map(data):
    """Map function that extracts outlinks from fetched content.

    The fetched content is parsed with a user-defined parser (UDF) to extract
    the urls of content to fetch. For example, if you specified a parser UDF
    for HTML and would like to fetch content from the target page while
    storing its outlinks, a default implementation looks like this::

        def htmlParser(key, content):
            outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
            link_datums = []
            for link in outlinks:
                link_datum = LinkDbDatum(parent=key, link_url=link)
                link_datums.append(link_datum)
            ndb.put_multi_async(link_datums)
            content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content)
            return content_links

    Note: the function above returns the urls of the content that will be
    fetched in the next job (FetchContentPipeline).

    Args:
        data: key-value data whose key is the position and whose value is the url.

    Returns:
        url: The page url.
    """
    k, url = data
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum = query.fetch()
    key = crawl_db_datum[0].key
    fetched_datums = FetchedDbDatum.query(ancestor=key).fetch()
    fetched_datum = fetched_datums[0]
    content = None
    if fetched_datum is not None:
        content = fetched_datum.fetched_content
        mime_type = fetched_datum.content_type
    if content is not None:
        parsed_obj = None
        try:
            params = _get_parser_param(_PARSER_PARAM_KEY)
            parsed_obj = util.handler_for_name(params[mime_type])(key, content)
        except Exception as e:
            logging.warning("Can not handle for %s[params:%s]:%s" %
                            (mime_type, params, e.message))
        if parsed_obj is not None:
            for content_urls in parsed_obj:
                yield (url, content_urls)
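For reference, a minimal parser UDF matching the contract described in the docstring above, wired into the pipeline through parser_params the same way the tests in this section do. This is only a sketch: the import path for LinkDbDatum is assumed to be lakshmi.datum, the module the tests reference for CrawlDbDatum.

import re
from google.appengine.ext import ndb
from lakshmi.datum import LinkDbDatum  # assumed location, same module as CrawlDbDatum


def htmlParser(key, content):
    """Sample UDF: store outlinks under the page's CrawlDbDatum key and
    return the urls of embedded content for FetchContentPipeline."""
    outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content)
    ndb.put_multi_async(
        [LinkDbDatum(parent=key, link_url=link) for link in outlinks])
    # src urls (images etc.) become the fetch targets of the next job.
    return re.findall(r'src=[\'"]?([^\'" >]+)', content)

# Registered by dotted name, as in the tests:
#   parser_params={"text/html": __name__ + ".htmlParser"}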
def testSuccessfulRun(self): """Test clean pipeline.""" createMockFetchedDatum("http://foo.html", "Content", pipelines.FETCHED) createMockFetchedDatum("http://bar.html", "Content", pipelines.SKIPPED) createMockFetchedDatum("http://baz.html", "Content", pipelines.UNFETCHED) p = pipelines.CleanDatumPipeline("CleanDatumPipeline", params={ "entity_kind": "lakshmi.datum.CrawlDbDatum", }, shards=3) p.start() test_support.execute_until_empty(self.taskqueue) entities = CrawlDbDatum.query( CrawlDbDatum.url == "http://foo.html").fetch() self.assertEquals(0, len(entities)) entities = CrawlDbDatum.query( CrawlDbDatum.url == "http://bar.html").fetch() self.assertEquals(0, len(entities)) entities = CrawlDbDatum.query( CrawlDbDatum.url == "http://baz.html").fetch() self.assertEquals(1, len(entities))
def _extract_content_urls_map(data): """Map function of extract outlinks from content. Function to be extracted and parsed to extract contents url with UDF. For example, You specified parser UDF for HTML, would like to fetch content from target page, and storing outlinks. implement default like this:: def htmlParser(key, content): outlinks = re.findall(r'href=[\'"]?([^\'" >]+)', content) link_datums = [] for link in outlinks: link_datum = LinkDbDatum(parent=key, link_url=link) link_datums.append(link_datum) ndb.put_multi_async(link_datums) content_links = re.findall(r'src=[\'"]?([^\'" >]+)', content) return content_links Note:Note:The above function to return the URL of the target of url that will fetch in the next job(FetchContentPipeline) Args: data: key value data, that key is position, value is url. Returns: url: The page url. """ k, url = data query = CrawlDbDatum.query(CrawlDbDatum.url==url) crawl_db_datum = query.fetch() key = crawl_db_datum[0].key fetched_datums = FetchedDbDatum.query(ancestor=key).fetch() fetched_datum = fetched_datums[0] content = None if fetched_datum is not None: content = fetched_datum.fetched_content mime_type = fetched_datum.content_type if content is not None: parsed_obj = None try: params = _get_parser_param(_PARSER_PARAM_KEY) parsed_obj = util.handler_for_name(params[mime_type])(key, content) except Exception as e: logging.warning("Can not handle for %s[params:%s]:%s"%(mime_type, params, e.message)) if parsed_obj is not None: for content_urls in parsed_obj: yield (url, content_urls)
def testFetchEndToEnd(self): """Test for through of fetcher job""" createMockCrawlDbDatum("http://foo.com/bar.html") static_robots = "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3" self.setReturnValue(url="http://foo.com/robots.txt", content=static_robots, headers={"Content-Length": len(static_robots)}) #static resource is read from resource resource = self.getResource("sample_content.html") static_content = resource.read() static_content_length = len(static_content) self.setReturnValue(url="http://foo.com/bar.html", content=static_content, headers={"Content-Length": static_content_length, "Content-Type": "text/html"}) resource_image = self.getResource("slide1.png") static_content_image = resource_image.read() static_content_length = len(static_content_image) self.setReturnValue(url="http://foo.com/images/slide1.png", content=static_content_image, headers={"Content-Length": static_content_length, "Content-Type": "image/png"}) p = pipelines.FetcherPipeline("FetcherPipeline", params={ "entity_kind": "lakshmi.datum.CrawlDbDatum" }, parser_params={ "text/html": __name__ + ".htmlParser" }, shards=2) p.start() test_support.execute_until_empty(self.taskqueue) crawl_db_datums = CrawlDbDatum.query(CrawlDbDatum.url=="http://foo.com/bar.html").fetch() crawl_db_datum = crawl_db_datums[0] self.assertTrue(pipelines.FETCHED, crawl_db_datum.last_status) fetched_db_datums = FetchedDbDatum.query(ancestor=crawl_db_datum.key).fetch() fetched_db_datum = fetched_db_datums[0] self.assertTrue(fetched_db_datum is not None) self.assertTrue("http://foo.com/bar.html", fetched_db_datum.fetched_url) link_db_datums = LinkDbDatum.query(ancestor=crawl_db_datum.key).fetch() self.assertTrue(len(link_db_datums)>0) contents_db_datums = ContentDbDatum.query(ancestor=crawl_db_datum.key).fetch() self.assertTrue(len(contents_db_datums)>0)
def _makeFetchSetBufferMap(binary_record):
    """Map function that creates fetch buffers.

    The output is one or more urls, each marked as fetch or skip.

    Args:
        binary_record: key-value data whose key is the extract domain url and
            whose value is the content of robots.txt.

    Returns:
        url: url to fetch.
        fetch_or_unfetch: boolean value; True means fetch, False means skip.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    extract_domain_url = proto.key()
    content = proto.value()
    # Extract urls from CrawlDbDatum.
    try:
        query = CrawlDbDatum.query(
            CrawlDbDatum.extract_domain_url == extract_domain_url)
        crawl_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Fetch error occurs from CrawlDbDatum" + e.message)

    can_fetch = False
    # Get the fetcher policy from resource.
    user_agent = fetcher_policy_yaml.fetcher_policy.agent_name
    rp = robotparser.RobotFileParser()
    try:
        rp.parse(content.split("\n").__iter__())
    except Exception as e:
        logging.warning("RobotFileParser raises exception:" + e.message)

    for crawl_datum in crawl_datum_future.get_result():
        url = crawl_datum.url
        try:
            can_fetch = rp.can_fetch(user_agent, url)
        except Exception as e:
            logging.warning("RobotFileParser raises exception:" + e.message)
            url = ""
        yield (url, can_fetch)
def createMockCrawlDbDatum(url):
    """Create CrawlDbDatum mock data."""
    CrawlDbDatum.get_or_insert(url, url=url, last_status=pipelines.UNFETCHED)
def _fetchMap(binary_record):
    """Map function that creates the fetch result.

    Creates a FetchedDbDatum entity and stores it to the datastore.

    Args:
        binary_record: key-value data whose key is the url to fetch and whose
            value is a boolean indicating whether it can be fetched.

    Returns:
        url: url to fetch.
        fetch_result: the result of the fetch.
    """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Fetch the CrawlDbDatum for the url.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:" +
                        url + ":" + e.message)
        could_fetch = False

    if could_fetch:
        # Start fetch.
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                # Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                fetched_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetched_datum.put()
                # Update time of last fetched.
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = "%s\n" % url
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:" + e.message)
            result = FAILED
    else:
        result = FAILED

    # Update status on all datums.
    crawl_db_datums = crawl_db_datum_future.get_result()
    for datum in crawl_db_datums:
        datum.last_status = result
        datum.last_fetched = fetch_date
    ndb.put_multi(crawl_db_datums)
    yield fetched_url
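For reference, the result dict returned by SimpleHttpFetcher.get() is read by _fetchMap and _fetchContentMap through the keys below. The sketch only collects the accesses that appear in those functions; the per-key descriptions are assumptions inferred from usage, not documented behaviour of the fetcher.

# Assumed shape of fetch_result, inferred from the .get() calls above:
# fetch_result = {
#     "fetched_url": ...,      # url that was actually fetched
#     "content": ...,          # response body
#     "mime_type": ...,        # content type of the response
#     "content_length": ...,   # size of the body (used for ContentDbDatum.content_size)
#     "read_rate": ...,        # read rate of the fetch (stored as response_rate)
#     "time": ...,             # fetch time (stored as fetch_time)
#     "headers": ...,          # response headers, stringified before storing
# }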