def run(self, job_name, sequence_num, time_started, namespace, output):
    """Collect reducer output and mark the durable job complete or failed.

    Reads every record the map/reduce run wrote to blobstore, parses each
    back into a Python object, and records the result (plus elapsed time)
    on the DurableJobEntity inside the job's datastore namespace.

    Args:
        job_name: str. Name keying the DurableJobEntity to update.
        sequence_num: int. Sequence number distinguishing runs of the job.
        time_started: float. Epoch seconds when the job started.
        namespace: str. Datastore namespace in which the entity lives.
        output: Map/reduce output readable by input_readers.RecordsReader.
    """
    results = []
    # TODO(mgainer): Notice errors earlier in pipeline, and mark job
    # as failed in that case as well.
    try:
        iterator = input_readers.RecordsReader(output, 0)
        for item in iterator:
            # Map/reduce puts reducer output into blobstore files as a
            # string obtained via "str(result)". Use AST as a safe
            # alternative to eval() to get the Python object back.
            results.append(ast.literal_eval(item))
        time_completed = time.time()
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._complete_job, job_name, sequence_num,
                MapReduceJob.build_output(self.root_pipeline_id, results),
                long(time_completed - time_started))
    # Don't know what exceptions are currently, or will be in future,
    # thrown from Map/Reduce or Pipeline libraries; these are under
    # active development.
    #
    # pylint: disable=broad-except
    # NOTE: 'except Exception as ex' (PEP 3110) replaces the Python-2-only
    # 'except Exception, ex' spelling; behavior is identical and the code
    # stays valid on Python 2.6+ and forward.
    except Exception as ex:
        time_completed = time.time()
        with Namespace(namespace):
            db.run_in_transaction(
                DurableJobEntity._fail_job, job_name, sequence_num,
                MapReduceJob.build_output(
                    self.root_pipeline_id, results, str(ex)),
                long(time_completed - time_started))
def testSuccessfulRun(self):
    """_FetchSetsBufferPipeline writes readable KeyValue records.

    Seeds two mock robots.txt payloads plus crawl-db data, runs the
    pipeline to completion, then verifies the output files live in
    blobstore and every record parses as a KeyValue proto with a
    non-None key and value.
    """
    file_name1 = self.createMockData((
        "http://hoge_0.com",
        "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
    ))
    file_name2 = self.createMockData((
        "http://hoge_1.com",
        "User-agent: test\nAllow: /content_0\nAllow: /content_1\nDisallow: /content_3"
    ))
    createMockCrawlDbDatum(2, 6, True)
    p = pipelines._FetchSetsBufferPipeline("FetchSetsBufferPipeline",
        [file_name1, file_name2])
    p.start()
    # Drain the task queue so the pipeline runs to completion.
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._FetchSetsBufferPipeline.from_id(
        p.pipeline_id)
    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))
    reader = input_readers.RecordsReader(file_paths, 0)
    for binary_record in reader:
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        key = proto.key()
        value = proto.value()
        # assertIsNotNone gives a clearer failure message than
        # assertTrue(x is not None).
        self.assertIsNotNone(key)
        self.assertIsNotNone(value)
def testFetchError(self):
    """Invalid URLs fall back to a disallow-everything robots policy.

    Feeds the robots-fetch pipeline data whose URL scheme is invalid and
    verifies each output record maps the bad URL to the conservative
    "Disallow: /" rule.
    """
    blob_keys = self.createInvalidMockData()
    static_content = "User-agent: *\nDisallow: /search\nDisallow: /sdch\nDisallow: /groups"
    self.setReturnValue(content=static_content,
        headers={"Content-Length": len(static_content),
            "Content-Type": "text/html"})
    p = pipelines._RobotsFetchPipeline("RobotsFetchPipeline", blob_keys, 2)
    p.start()
    # Drain the task queue so the pipeline runs to completion.
    test_support.execute_until_empty(self.taskqueue)
    finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)
    # Can open files
    file_list = finished_map.outputs.default.value
    self.assertTrue(len(file_list) > 0)
    reader = input_readers.RecordsReader(file_list, 0)
    for binary_record in reader:
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        key = proto.key()
        value = proto.value()
        # assertEqual: 'assertEquals' is a deprecated alias.
        self.assertEqual("invalidScheme://test_url.com", key)
        self.assertEqual("User-agent: *\nDisallow: /", value)
def testSuccessfulRun(self):
    """_FetchContentPipeline fetches content and records it in datastore.

    Seeds a crawl-db entry and a mock PNG response, runs the content-fetch
    pipeline, then checks that the output files are readable KeyValue
    records and that two ContentDbDatum children were stored under the
    crawled URL's entity.
    """
    file_name1 = self.createMockData(
        ("https://developers.google.com/appengine/",
            "http://k.yimg.jp/images/top/sp/logo.gif"))
    file_name2 = self.createMockData(
        ("https://developers.google.com/appengine/",
            "/appengine/images/slide1.png"))
    datum = CrawlDbDatum(
        parent=ndb.Key(CrawlDbDatum,
            "https://developers.google.com/appengine/"),
        url="https://developers.google.com/appengine/",
        extract_domain_url="https://developers.google.com",
        last_status=pipelines.UNFETCHED)
    datum.put()
    resource = self.getResource("slide1.png")
    static_content = resource.read()
    self.setReturnValue(content=static_content,
        headers={"Content-Length": len(static_content),
            "Content-Type": "image/png"})
    p = pipelines._FetchContentPipeline("FetchContentPipeline",
        [file_name1, file_name2])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    # BUG FIX: the finished pipeline must be looked up via the class that
    # was started (_FetchContentPipeline), not _FetchSetsBufferPipeline —
    # the original was a copy-paste slip from the buffering test.
    finished_map = pipelines._FetchContentPipeline.from_id(
        p.pipeline_id)
    # Can open files
    file_paths = finished_map.outputs.default.value
    self.assertTrue(len(file_paths) > 0)
    self.assertTrue(file_paths[0].startswith("/blobstore/"))
    reader = input_readers.RecordsReader(file_paths, 0)
    for binary_record in reader:
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        key = proto.key()
        value = proto.value()
        # assertIsNotNone gives a clearer failure message than
        # assertTrue(x is not None).
        self.assertIsNotNone(key)
        self.assertIsNotNone(value)
    query = CrawlDbDatum.query(
        CrawlDbDatum.url == "https://developers.google.com/appengine/")
    crawl_db_datums = query.fetch()
    self.assertTrue(len(crawl_db_datums) > 0)
    key = crawl_db_datums[0].key
    content_datums = ContentDbDatum.query(ancestor=key).fetch()
    self.assertEqual(2, len(content_datums))
def run(self, job_id, job_class_str, output):
    """Deliver map/reduce output to the owning job class.

    Parses each reducer output record back into a Python object and
    notifies the job class of completion; on any error, logs the
    traceback and registers the failure instead.

    Args:
        job_id: str. ID of the job whose output this is.
        job_class_str: str. Dotted path of the job class to notify.
        output: Map/reduce output readable by input_readers.RecordsReader.
    """
    job_class = mapreduce_util.for_name(job_class_str)
    try:
        iterator = input_readers.RecordsReader(output, 0)
        results_list = []
        for item in iterator:
            # Map/reduce puts reducer output into blobstore files as a
            # string obtained via "str(result)". Use AST as a safe
            # alternative to eval() to get the Python object back.
            results_list.append(ast.literal_eval(item))
        job_class.register_completion(job_id, results_list)
    except Exception as e:
        logging.error(traceback.format_exc())
        # Lazy %-style logging args: the message is only formatted if the
        # record is actually emitted.
        logging.error(
            'Job %s failed at %s', job_id,
            utils.get_current_time_in_millisecs())
        job_class.register_failure(
            job_id, '%s\n%s' % (unicode(e), traceback.format_exc()))