示例#1
0
 def start(self, from_latest_checkpoint=False):
     """Build, launch, prepare and unpause this job.

     The steps run strictly in order; after each state-changing request
     ``self.waitfor`` is called with the state name shown:

     1. Build the job                          -> "NASCENT"
     2. Launch it                              -> "PAUSED"
     3. Call ``write_act_info`` and ``run_job_script`` and, when
        ``self.use_credentials`` is set, pass every target that has a
        non-empty ``watchedTarget["secretId"]`` through
        ``credentials.handle_credentials``.
     4. Sleep ten seconds, then unpause        -> "RUNNING"

     :param from_latest_checkpoint: when true, launch with
         ``launch_from_latest_checkpoint`` rather than ``launch_job``.
     """
     job_name = self.name

     logger.info("Building %s" % job_name)
     self.heritrix.build_job(job_name)
     self.waitfor("NASCENT")

     logger.info("Launching %s" % job_name)
     launcher = (
         self.heritrix.launch_from_latest_checkpoint
         if from_latest_checkpoint
         else self.heritrix.launch_job
     )
     launcher(job_name)
     self.waitfor("PAUSED")

     self.write_act_info()
     logger.info("Running scripts for %s" % job_name)
     self.run_job_script()
     # NOTE: The call below was a kludge to dodge an issue in the
     # AsynchronousMQExtractor and is currently disabled. TODO: remove?
     #self.heritrix.execute_script(self.name, "groovy", "appCtx.getBean(\"extractorMq\").setupChannel();")

     if self.use_credentials:
         for index, entry in enumerate(self.info):
             watched = entry["watchedTarget"]
             # Nothing to do for targets with a missing or empty secret ID.
             if "secretId" not in watched or not watched["secretId"]:
                 continue
             logger.info("Getting credentials for %s..." % entry["title"])
             # Replace the stored entry with whatever the handler returns.
             self.info[index] = credentials.handle_credentials(
                 entry, job_name, self.heritrix
             )

     # Brief pause before unpausing: an attempt to avoid a start-up
     # race condition with the WARC writers.
     time.sleep(10)

     logger.info("Unpausing %s" % job_name)
     self.heritrix.unpause_job(job_name)
     self.waitfor("RUNNING")
示例#2
0
 def start(self):
     """Build, launch, prepare and unpause this job.

     The steps run strictly in order; after each state-changing request
     ``self.waitfor`` is called with the state name shown:

     1. Build the job                          -> "NASCENT"
     2. Launch it                              -> "PAUSED"
     3. Call ``write_act_info`` and ``run_job_script``, execute the
        ``extractorMq`` channel-setup Groovy snippet and, when
        ``self.use_credentials`` is set, pass every target that has a
        non-empty ``watchedTarget["secretId"]`` through
        ``credentials.handle_credentials``.
     4. Unpause                                -> "RUNNING"
     """
     job_name = self.name

     logger.info("Building %s" % job_name)
     self.heritrix.build(job_name)
     self.waitfor("NASCENT")

     logger.info("Launching %s" % job_name)
     self.heritrix.launch(job_name)
     self.waitfor("PAUSED")

     self.write_act_info()
     logger.info("Running scripts for %s" % job_name)
     self.run_job_script()

     # TODO: This script call is a kludge to avoid an issue in the
     # AsynchronousMQExtractor.
     setup_channel = "appCtx.getBean(\"extractorMq\").setupChannel();"
     self.heritrix.execute(engine="groovy", script=setup_channel, job=job_name)

     if self.use_credentials:
         for index, entry in enumerate(self.info):
             watched = entry["watchedTarget"]
             # Nothing to do for targets with a missing or empty secret ID.
             if "secretId" not in watched or not watched["secretId"]:
                 continue
             logger.info("Getting credentials for %s..." % entry["title"])
             # Replace the stored entry with whatever the handler returns.
             self.info[index] = credentials.handle_credentials(
                 entry, job_name, self.heritrix
             )

     logger.info("Unpausing %s" % job_name)
     self.heritrix.unpause(job_name)
     self.waitfor("RUNNING")
示例#3
0
 def start(self, from_latest_checkpoint=False):
     """Take the job from unbuilt to running.

     Sequence: build, launch, write the ACT info and run the job script,
     optionally process credentials, sleep ten seconds, unpause. After
     the build, launch and unpause requests ``self.waitfor`` is called
     with "NASCENT", "PAUSED" and "RUNNING" respectively.

     :param from_latest_checkpoint: if true the job is launched with
         ``launch_from_latest_checkpoint``; otherwise with ``launch_job``.
     """
     job_name = self.name

     # --- Build ---------------------------------------------------------
     logger.info("Building %s" % job_name)
     self.heritrix.build_job(job_name)
     self.waitfor("NASCENT")

     # --- Launch --------------------------------------------------------
     logger.info("Launching %s" % job_name)
     if not from_latest_checkpoint:
         self.heritrix.launch_job(job_name)
     else:
         self.heritrix.launch_from_latest_checkpoint(job_name)
     self.waitfor("PAUSED")

     # --- Configure -----------------------------------------------------
     self.write_act_info()
     logger.info("Running scripts for %s" % job_name)
     self.run_job_script()
     # NOTE: The disabled call below was a kludge for an issue in the
     # AsynchronousMQExtractor. TODO: remove?
     #self.heritrix.execute_script(self.name, "groovy", "appCtx.getBean(\"extractorMq\").setupChannel();")
     if self.use_credentials:
         for position, target in enumerate(self.info):
             watched_target = target["watchedTarget"]
             # Only targets carrying a non-empty secret ID are processed.
             has_secret = "secretId" in watched_target and watched_target["secretId"]
             if has_secret:
                 logger.info("Getting credentials for %s..." % target["title"])
                 updated = credentials.handle_credentials(target, job_name, self.heritrix)
                 self.info[position] = updated

     # --- Unpause -------------------------------------------------------
     # Pause briefly first; this tries to sidestep a race condition while
     # the WARC writers start up.
     time.sleep(10)
     logger.info("Unpausing %s" % job_name)
     self.heritrix.unpause_job(job_name)
     self.waitfor("RUNNING")
示例#4
0
 def start(self):
     """Take the job from unbuilt to running.

     Sequence: build, launch, write the ACT info and run the job script,
     execute the ``extractorMq`` channel-setup Groovy snippet, optionally
     process credentials, unpause. After the build, launch and unpause
     requests ``self.waitfor`` is called with "NASCENT", "PAUSED" and
     "RUNNING" respectively.
     """
     job_name = self.name

     # --- Build ---------------------------------------------------------
     logger.info("Building %s" % job_name)
     self.heritrix.build(job_name)
     self.waitfor("NASCENT")

     # --- Launch --------------------------------------------------------
     logger.info("Launching %s" % job_name)
     self.heritrix.launch(job_name)
     self.waitfor("PAUSED")

     # --- Configure -----------------------------------------------------
     self.write_act_info()
     logger.info("Running scripts for %s" % job_name)
     self.run_job_script()
     # TODO: The script call below is a kludge to avoid an issue in the
     # AsynchronousMQExtractor.
     self.heritrix.execute(
         engine="groovy",
         script="appCtx.getBean(\"extractorMq\").setupChannel();",
         job=job_name,
     )
     if self.use_credentials:
         for position, target in enumerate(self.info):
             watched_target = target["watchedTarget"]
             # Only targets carrying a non-empty secret ID are processed.
             has_secret = "secretId" in watched_target and watched_target["secretId"]
             if has_secret:
                 logger.info("Getting credentials for %s..." % target["title"])
                 updated = credentials.handle_credentials(target, job_name, self.heritrix)
                 self.info[position] = updated

     # --- Unpause -------------------------------------------------------
     logger.info("Unpausing %s" % job_name)
     self.heritrix.unpause(job_name)
     self.waitfor("RUNNING")