Exemplo n.º 1
0
    def get_h3_status(self, job, server):
        """Query a Heritrix3 (H3) instance for the state of one crawl job.

        Returns a dict with:
          - 'details': raw job-info payload from H3 (when reachable)
          - 'status': upper-cased controller state, or "DOWN" if unreachable
          - 'status-class': CSS class summarising health
          - 'rate': docs/second (only when RUNNING)
          - 'error': message (only when unreachable)
        """
        # Set up connection to H3:
        h = hapyx.HapyX(server['url'], username=server['user'], password=server['pass'], timeout=5)
        state = {}
        try:
            logger.info("Getting status for job %s on %s" % (job, server))
            info = h.get_job_info(job)
            state['details'] = info
            # NOTE: was `info.has_key('job')`, which is Python-2-only and
            # raised AttributeError on Python 3, masking every status as DOWN.
            if 'job' in info:
                state['status'] = info['job'].get("crawlControllerState", None)
                if not state['status']:
                    state['status'] = info['job'].get("statusDescription", None)
                state['status'] = state['status'].upper()
        except Exception as e:
            # Any failure (connection, missing fields) is reported as DOWN.
            state['status'] = "DOWN"
            state['error'] = "Could not reach Heritrix! %s" % e
            # app.logger.exception(e)
        # Classify (use .get: 'status' is absent if the payload had no 'job' key):
        if state.get('status') == "DOWN":
            state['status-class'] = "status-oos"
        elif state.get('status') == "RUNNING":
            # Replacing RUNNING with docs/second rate
            rate = state['details']['job']['rateReport']['currentDocsPerSecond']
            state['rate'] = "%.1f" % float(rate)
            if rate < 1.0:
                state['status-class'] = "status-warning"
            else:
                state['status-class'] = "status-good"
        else:
            state['status-class'] = "status-warning"

        return state
Exemplo n.º 2
0
    def run(self):
        """Launch this task's Heritrix3 crawl job from the exported targets.

        Reads the Targets and never-crawl lists from this task's inputs,
        builds a W3actJob, starts it (optionally from the latest checkpoint),
        and records the resulting launch as 'started'.
        """
        # Set up connection to H3:
        hapy = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                           username=h3().username,
                           password=h3().password)

        logger.info("Starting %s" % (self.job.name))

        # Load the crawl definition inputs produced by upstream tasks:
        target_list = json.load(self.input()[1].open('r'))
        blocked_list = json.load(self.input()[2].open('r'))
        logger.debug("Found %s Targets in date range." % len(target_list))

        crawl_job = W3actJob(target_list,
                             self.job.name,
                             heritrix=hapy,
                             heritrix_job_dir=h3().local_job_folder,
                             nevercrawl=blocked_list)

        logger.info("Got current job status: %s" % hapy.status(self.job.name))

        logger.info("Starting job %s (from checkpoint = %s)..." %
                    (crawl_job.name, self.from_latest_checkpoint))
        crawl_job.start(from_latest_checkpoint=self.from_latest_checkpoint)

        launch_id = hapy.get_launch_id(self.job.name)
        logger.info("Launched job %s/%s with %s seeds." %
                    (crawl_job.name, launch_id, len(crawl_job.seeds)))

        # Record an output file that can be use as a Target by a different task.:
        mark_job_as(crawl_job, launch_id, 'started')

        return
Exemplo n.º 3
0
def unpause_dc(jobs=('dc0-2016', 'dc1-2016', 'dc2-2016', 'dc3-2016')):
    """Unpause a set of domain-crawl jobs on their Heritrix3 servers.

    :param jobs: iterable of job keys (as found under services['jobs']);
                 defaults to the 2016 domain-crawl shards for backward
                 compatibility.
    :returns: redirect to the 'status' page.
    """
    servers = json.load(systems().servers)
    services = json.load(systems().services)
    for job in jobs:
        # Look up the job's record once, then the server it runs on:
        job_conf = services['jobs'][job]
        server = servers[job_conf['server']]
        h = hapyx.HapyX(server['url'],
                        username=server['user'],
                        password=server['pass'])
        h.unpause_job(job_conf['name'])
    return redirect(url_for('status'))
Exemplo n.º 4
0
    def complete(self):
        """Report whether this task is done.

        The task is complete when H3 either does not know the job at all,
        or knows it but reports an empty (not-running) status.
        """
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                        username=h3().username,
                        password=h3().password)

        # Unknown job: nothing is running, so we are done.
        if self.job.name not in h.list_jobs():
            return True

        # Known job: complete only once its status string is empty.
        return h.status(self.job.name) == ""
Exemplo n.º 5
0
 def setup_heritrix(self,
                    api=None,
                    host=None,
                    port=None,
                    user="******",
                    passwd="bl_uk"):
     """Attach a Heritrix API client to this object and register its job dir.

     If `api` is supplied it is used directly; otherwise a new HapyX client
     is constructed against the engine endpoint at https://host:port/engine.
     """
     if api is None:
         # NOTE(review): hard-coded default credentials — should come from config.
         api = hapyx.HapyX(host="https://%s:%s/engine" % (host, port),
                           user=user,
                           passwd=passwd,
                           verbose=False,
                           verify=False)
     self.heritrix = api
     self.heritrix.add_job_directory(self.job_dir)
Exemplo n.º 6
0
    def run(self):
        """Ensure this launch of the job is no longer running, then mark it stopped.

        If H3 reports the job as running AND its launch ID matches ours, this
        yields an external task (we wait for someone else to stop it).
        Otherwise the job is recorded as stopped via the task's output file.
        """
        # Set up connection to H3:
        hapy = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                           username=h3().username,
                           password=h3().password)

        # Still running, and still the same launch we were asked about?
        if hapy.status(self.job.name) != "" \
                and hapy.get_launch_id(self.job.name) == self.launch_id:
            # Declare that we are awaiting an external process to stop this job:
            yield StopJobExternalTask(self.job, self.launch_id)

        # Not running, so mark as stopped:
        with self.output().open('w') as f:
            f.write('{} {}\n'.format(self.job.name, self.launch_id))
Exemplo n.º 7
0
    def run(self):
        """Stop this task's H3 crawl job if it is currently running.

        Stops the job, removes its action files, and records the launch as
        'stopped'. Logs a warning if there was nothing to stop.
        """
        # Set up connection to H3:
        hapy = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                           username=h3().username,
                           password=h3().password)

        logger.info("I'm stopping %s" % (self.job.name))

        # Guard: nothing to do unless the job exists and is running.
        if self.job.name not in hapy.list_jobs() or hapy.status(self.job.name) == "":
            logger.warning("No {} job to be stopped!".format(self.job.name))
            return

        # Stops a running job, cleans up the directory, initiates job assembly.
        launch_id = hapy.get_launch_id(self.job.name)
        crawl_job = W3actJob.from_directory(
            "%s/%s" % (h3().local_job_folder, self.job.name), heritrix=hapy)
        crawl_job.stop()
        remove_action_files(self.job.name,
                            HERITRIX_JOBS=h3().local_job_folder)

        # Record an output file that can be use as a Target by a different task:
        mark_job_as(crawl_job, launch_id, 'stopped')
Exemplo n.º 8
0
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Restarts the job for a particular frequency.

    :param frequency: the crawl frequency / job name (e.g. 'daily').
    :param start: reference datetime for the restart; defaults to "now".
        (Was `start=datetime.utcnow()`, which is evaluated once at import
        time, so every call silently shared the module-load timestamp.)
    :param restart: if False, only stop the job without relaunching it.
    :returns: a human-readable status string.
    :raises: Celery Retry via `self.retry` on any failure.
    """
    if start is None:
        # Evaluate "now" per call, not per module import.
        start = datetime.utcnow()
    try:
        logger.info("Stopping/starting %s at %s" % (frequency, start))

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'),
                  cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" %
                        (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'),
                        password=cfg.get('h3', 'password'))

        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            """Stops a running job, notifies RabbitMQ and cleans up the directory."""
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w,
                                          "%s/%s" % (HERITRIX_JOBS, frequency),
                                          heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" %
                        (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None

        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." %
                        (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id,
                                                          len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." %
                            (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name,
                                                                    launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # Was `exe=e` — a typo: Celery's Task.retry takes `exc=`, so the
        # original exception was silently dropped into **options.
        raise self.retry(countdown=10, exc=e)
Exemplo n.º 9
0
def get_hapy_for_job(job):
    """Build a HapyX client for the per-job H3 host (host name embeds the job name)."""
    endpoint = "https://%s-%s:%s" % (h3().host, job.name, h3().port)
    client = hapyx.HapyX(endpoint, username=h3().username, password=h3().password)
    return client