def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Populate the datastore with CrawlDbDatum fixtures.

    Writes url_count UNFETCHED entities for each of domain_count fake
    "hoge" domains; when isExtracted is true, each entity also records
    its domain URL in extract_domain_url.
    """
    for domain_index in range(domain_count):
        # Hoist the per-domain prefix; page URLs are derived from it.
        domain = "http://hoge_%d.com" % domain_index
        for content_index in range(url_count):
            page_url = "%s/content_%d" % (domain, content_index)
            CrawlDbDatum(
                parent=ndb.Key(CrawlDbDatum, page_url),
                url=page_url,
                extract_domain_url=domain if isExtracted else None,
                last_status=pipelines.UNFETCHED).put()
def createMockCrawlDbDatum(domain_count, url_count, isExtracted):
    """Create CrawlDbDatum mock entities for tests.

    Builds the full (url, domain index) cross product up front, then
    persists one UNFETCHED datum per URL, keyed by the URL itself.
    """
    pairs = [
        ("http://hoge_%d.com/content_%d" % (d, n), d)
        for d in range(domain_count)
        for n in range(url_count)
    ]
    for url, d in pairs:
        extracted = "http://hoge_%d.com" % (d) if isExtracted else None
        datum = CrawlDbDatum(
            parent=ndb.Key(CrawlDbDatum, url),
            url=url,
            extract_domain_url=extracted,
            last_status=pipelines.UNFETCHED)
        datum.put()
def testSuccessfulRun(self):
    """End-to-end check of _FetchContentPipeline over two mock inputs.

    Seeds a CrawlDbDatum for the target page, runs the pipeline against
    two fetch-set input files, then verifies the blobstore output
    records and the two ContentDbDatum children of the crawl datum.
    """
    target_url = "https://developers.google.com/appengine/"
    input_file1 = self.createMockData(
        (target_url, "http://k.yimg.jp/images/top/sp/logo.gif"))
    input_file2 = self.createMockData(
        (target_url, "/appengine/images/slide1.png"))
    CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, target_url),
        url=target_url,
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED).put()
    # Stub the fetcher so every fetch returns the PNG fixture.
    static_content = self.getResource("slide1.png").read()
    self.setReturnValue(
        content=static_content,
        headers={"Content-Length": len(static_content),
                 "Content-Type": "image/png"})
    pipeline = pipelines._FetchContentPipeline(
        "FetchContentPipeline", [input_file1, input_file2])
    pipeline.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._FetchSetsBufferPipeline.from_id(
        pipeline.pipeline_id)
    # The pipeline output must name at least one blobstore file.
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))
    # Every output record must round-trip through the KeyValue proto.
    for binary_record in input_readers.RecordsReader(file_paths, 0):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        self.assertTrue(proto.key() is not None)
        self.assertTrue(proto.value() is not None)
    # Two content entities should hang off the seeded crawl datum.
    crawl_db_datums = CrawlDbDatum.query(
        CrawlDbDatum.url == target_url).fetch()
    self.assertTrue(len(crawl_db_datums) > 0)
    content_datums = ContentDbDatum.query(
        ancestor=crawl_db_datums[0].key).fetch()
    self.assertEqual(2, len(content_datums))
def get(self):
    """Handle a fetch-job request.

    Reads the crawl target from the "target" query parameter, falling
    back to the last target cached in memcache (and caching any newly
    supplied target). Stores an UNFETCHED CrawlDbDatum for the target,
    starts a FecherJobPipeline that reports to the "email" address, and
    redirects to the pipeline's status page.
    """
    url = self.request.get("target", default_value=None)
    email = self.request.get("email", default_value=None)
    if url is None:
        url = memcache.get("url")
    else:
        memcache.set(key="url", value=url)
    # Bail out when no target is available (neither supplied nor cached)
    # or no notification address was given; ndb.Key(CrawlDbDatum, None)
    # would otherwise raise on the line below.
    if url is None or email is None:
        return
    data = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, url),
        url=url,
        last_status=pipelines.UNFETCHED)
    data.put()
    pipeline = FecherJobPipeline(email)
    pipeline.start()
    path = pipeline.base_path + "/status?root=" + pipeline.pipeline_id
    self.redirect(path)
def get(self):
    """Start a crawl job for the requested target URL.

    The target comes from the "target" parameter or, when absent, from
    the memcache-cached previous target; a freshly supplied target is
    cached for later requests. Persists an UNFETCHED CrawlDbDatum,
    launches FecherJobPipeline(email), and redirects to its status page.
    """
    url = self.request.get("target", default_value=None)
    email = self.request.get("email", default_value=None)
    if url is None:
        url = memcache.get("url")
    else:
        memcache.set(key="url", value=url)
    # Guard both required inputs: without a url, ndb.Key(CrawlDbDatum,
    # None) would raise; without an email the pipeline has no recipient.
    if url is None or email is None:
        return
    data = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, url),
        url=url,
        last_status=pipelines.UNFETCHED)
    data.put()
    pipeline = FecherJobPipeline(email)
    pipeline.start()
    path = pipeline.base_path + "/status?root=" + pipeline.pipeline_id
    self.redirect(path)
def testSuccessfulRun(self):
    """Verify _FetchContentPipeline fetches content and stores results.

    Two mock input files point at the same crawl page; after the
    pipeline drains the task queue, the blobstore output must contain
    valid KeyValue records and the crawl datum must gain two
    ContentDbDatum children.
    """
    page = "https://developers.google.com/appengine/"
    mock_files = [
        self.createMockData((page, "http://k.yimg.jp/images/top/sp/logo.gif")),
        self.createMockData((page, "/appengine/images/slide1.png")),
    ]
    datum = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum, page),
        url=page,
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED)
    datum.put()
    # All fetches are stubbed to return the bundled PNG fixture.
    resource = self.getResource("slide1.png")
    static_content = resource.read()
    self.setReturnValue(
        content=static_content,
        headers={"Content-Length": len(static_content),
                 "Content-Type": "image/png"})
    p = pipelines._FetchContentPipeline("FetchContentPipeline", mock_files)
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._FetchSetsBufferPipeline.from_id(p.pipeline_id)
    # Output files exist and live in blobstore.
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))
    reader = input_readers.RecordsReader(file_paths, 0)
    for binary_record in reader:
        # Each record parses as a KeyValue proto with both fields set.
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        self.assertTrue(proto.key() is not None)
        self.assertTrue(proto.value() is not None)
    results = CrawlDbDatum.query(CrawlDbDatum.url == page).fetch()
    self.assertTrue(len(results) > 0)
    # The two fetched resources become children of the crawl datum.
    children = ContentDbDatum.query(ancestor=results[0].key).fetch()
    self.assertEqual(2, len(children))