def start(self, from_latest_checkpoint=False):
    """Build, launch, configure and unpause the job.

    The sequence is:

    1. Build the job through ``self.heritrix`` and wait for ``NASCENT``.
    2. Launch it (optionally from the latest checkpoint) and wait for
       ``PAUSED``.
    3. Call ``write_act_info()`` and ``run_job_script()``.
    4. If ``self.use_credentials`` is set, replace every entry of
       ``self.info`` that carries a non-empty ``secretId`` with the result
       of ``credentials.handle_credentials``.
    5. Sleep for ten seconds, unpause the job and wait for ``RUNNING``.

    Args:
        from_latest_checkpoint: When true, the job is launched with
            ``launch_from_latest_checkpoint`` instead of ``launch_job``.
    """
    logger.info("Building %s", self.name)
    self.heritrix.build_job(self.name)
    self.waitfor("NASCENT")

    logger.info("Launching %s", self.name)
    if from_latest_checkpoint:
        self.heritrix.launch_from_latest_checkpoint(self.name)
    else:
        self.heritrix.launch_job(self.name)
    self.waitfor("PAUSED")

    self.write_act_info()
    logger.info("Running scripts for %s", self.name)
    self.run_job_script()

    # NOTE: The below line is a kludge to avoid an issue in the
    # AsynchronousMQExtractor; it is currently disabled. TODO Remove?
    # self.heritrix.execute_script(self.name, "groovy", "appCtx.getBean(\"extractorMq\").setupChannel();")

    if self.use_credentials:
        for i, target in enumerate(self.info):
            # A missing, None or empty secretId all mean "no credentials".
            if target["watchedTarget"].get("secretId"):
                logger.info("Getting credentials for %s...", target["title"])
                self.info[i] = credentials.handle_credentials(
                    target, self.name, self.heritrix
                )

    # Wait for a moment (attempting to avoid some race-condition starting
    # up with WARC writers):
    time.sleep(10)

    # And unpause:
    logger.info("Unpausing %s", self.name)
    self.heritrix.unpause_job(self.name)
    self.waitfor("RUNNING")
def start(self):
    """Build, launch, configure and unpause the job.

    The sequence is:

    1. Build the job through ``self.heritrix`` and wait for ``NASCENT``.
    2. Launch it and wait for ``PAUSED``.
    3. Call ``write_act_info()`` and ``run_job_script()``.
    4. Execute a groovy script that calls ``setupChannel()`` on the
       ``extractorMq`` bean (a workaround, see the TODO below).
    5. If ``self.use_credentials`` is set, replace every entry of
       ``self.info`` that carries a non-empty ``secretId`` with the result
       of ``credentials.handle_credentials``.
    6. Unpause the job and wait for ``RUNNING``.
    """
    logger.info("Building %s", self.name)
    self.heritrix.build(self.name)
    self.waitfor("NASCENT")

    logger.info("Launching %s", self.name)
    self.heritrix.launch(self.name)
    self.waitfor("PAUSED")

    self.write_act_info()
    logger.info("Running scripts for %s", self.name)
    self.run_job_script()

    # TODO: The below call is a kludge to avoid an issue in the
    # AsynchronousMQExtractor...
    self.heritrix.execute(
        engine="groovy",
        script="appCtx.getBean(\"extractorMq\").setupChannel();",
        job=self.name,
    )

    if self.use_credentials:
        for i, target in enumerate(self.info):
            # A missing, None or empty secretId all mean "no credentials".
            if target["watchedTarget"].get("secretId"):
                logger.info("Getting credentials for %s...", target["title"])
                self.info[i] = credentials.handle_credentials(
                    target, self.name, self.heritrix
                )

    logger.info("Unpausing %s", self.name)
    self.heritrix.unpause(self.name)
    self.waitfor("RUNNING")
def start(self, from_latest_checkpoint=False):
    """Build, launch, configure and unpause the job.

    The job is built and launched through ``self.heritrix``; between the
    ``PAUSED`` and ``RUNNING`` states this method calls
    ``write_act_info()`` and ``run_job_script()`` and, when
    ``self.use_credentials`` is set, swaps each ``self.info`` entry that
    has a non-empty ``secretId`` for the value returned by
    ``credentials.handle_credentials``. It then sleeps for ten seconds
    before unpausing.

    Args:
        from_latest_checkpoint: When true, the job is launched with
            ``launch_from_latest_checkpoint`` instead of ``launch_job``.
    """
    logger.info("Building %s", self.name)
    self.heritrix.build_job(self.name)
    self.waitfor("NASCENT")

    logger.info("Launching %s", self.name)
    if from_latest_checkpoint:
        self.heritrix.launch_from_latest_checkpoint(self.name)
    else:
        self.heritrix.launch_job(self.name)
    self.waitfor("PAUSED")

    self.write_act_info()
    logger.info("Running scripts for %s", self.name)
    self.run_job_script()

    # NOTE: The below line is a kludge to avoid an issue in the
    # AsynchronousMQExtractor; it is currently disabled. TODO Remove?
    # self.heritrix.execute_script(self.name, "groovy", "appCtx.getBean(\"extractorMq\").setupChannel();")

    if self.use_credentials:
        for i, target in enumerate(self.info):
            # .get() covers both "key absent" and "key present but falsy".
            if target["watchedTarget"].get("secretId"):
                logger.info("Getting credentials for %s...", target["title"])
                self.info[i] = credentials.handle_credentials(
                    target, self.name, self.heritrix
                )

    # Wait for a moment (attempting to avoid some race-condition starting
    # up with WARC writers):
    time.sleep(10)

    # And unpause:
    logger.info("Unpausing %s", self.name)
    self.heritrix.unpause_job(self.name)
    self.waitfor("RUNNING")
def start(self):
    """Build, launch, configure and unpause the job.

    The job is built and launched through ``self.heritrix``; between the
    ``PAUSED`` and ``RUNNING`` states this method calls
    ``write_act_info()`` and ``run_job_script()``, executes a groovy
    script that calls ``setupChannel()`` on the ``extractorMq`` bean, and,
    when ``self.use_credentials`` is set, swaps each ``self.info`` entry
    that has a non-empty ``secretId`` for the value returned by
    ``credentials.handle_credentials``.
    """
    logger.info("Building %s", self.name)
    self.heritrix.build(self.name)
    self.waitfor("NASCENT")

    logger.info("Launching %s", self.name)
    self.heritrix.launch(self.name)
    self.waitfor("PAUSED")

    self.write_act_info()
    logger.info("Running scripts for %s", self.name)
    self.run_job_script()

    # TODO: The below call is a kludge to avoid an issue in the
    # AsynchronousMQExtractor...
    self.heritrix.execute(
        engine="groovy",
        script="appCtx.getBean(\"extractorMq\").setupChannel();",
        job=self.name,
    )

    if self.use_credentials:
        for i, target in enumerate(self.info):
            # .get() covers both "key absent" and "key present but falsy".
            if target["watchedTarget"].get("secretId"):
                logger.info("Getting credentials for %s...", target["title"])
                self.info[i] = credentials.handle_credentials(
                    target, self.name, self.heritrix
                )

    logger.info("Unpausing %s", self.name)
    self.heritrix.unpause(self.name)
    self.waitfor("RUNNING")