Exemplo n.º 1
0
    def run(self):
        """Build the crawl job from the task inputs and launch it in Heritrix.

        The target list is read as JSON from ``self.input()[1]`` and the
        never-crawl list from ``self.input()[2]``. A ``W3actJob`` named
        ``self.job.name`` is then started (from the latest checkpoint when
        ``self.from_latest_checkpoint`` is set) and marked as 'started'.
        """
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (h3().host, h3().port),
                        username=h3().username,
                        password=h3().password)

        logger.info("Starting %s" % (self.job.name))

        # Read both inputs inside context managers so the file handles are
        # closed as soon as they are parsed, rather than being leaked.
        inputs = self.input()
        with inputs[1].open('r') as targets_file:
            targets = json.load(targets_file)
        with inputs[2].open('r') as nevercrawl_file:
            nevercrawl = json.load(nevercrawl_file)
        logger.debug("Found %s Targets in date range." % len(targets))

        job = W3actJob(targets,
                       self.job.name,
                       heritrix=h,
                       heritrix_job_dir=h3().local_job_folder,
                       nevercrawl=nevercrawl)

        # The pre-launch status is only reported, it does not gate the start.
        status = h.status(self.job.name)
        logger.info("Got current job status: %s" % status)

        logger.info("Starting job %s (from checkpoint = %s)..." %
                    (job.name, self.from_latest_checkpoint))
        job.start(from_latest_checkpoint=self.from_latest_checkpoint)
        launch_id = h.get_launch_id(self.job.name)

        logger.info("Launched job %s/%s with %s seeds." %
                    (job.name, launch_id, len(job.seeds)))

        # Record an output file that can be used as a Target by a different task:
        mark_job_as(job, launch_id, 'started')
Exemplo n.º 2
0
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Stops the running job for a particular frequency and optionally restarts it.

    :param frequency: crawl frequency; also used as the Heritrix job name.
    :param start: timestamp reported in the log; defaults to the current UTC
        time, taken when the function is called (not when it is defined).
    :param restart: when True, launch a fresh job after stopping any running one.
    :return: a short human-readable summary of what happened.
    :raises: whatever ``self.retry`` raises, after the failure has been logged.
    """
    # A ``datetime.utcnow()`` default in the signature would be evaluated only
    # once, at import time, so resolve the default per call instead.
    if start is None:
        start = datetime.utcnow()
    try:
        logger.info("Stopping/starting %s at %s" % (frequency, start))

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act','url'),cfg.get('act','username'),cfg.get('act','password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3','host'), cfg.get('h3','port')), username=cfg.get('h3','username'), password=cfg.get('h3','password'))

        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency,launch_id)
        else:
            job = None

        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED" )
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                # ``launch_id`` is always bound here: ``job`` is only truthy
                # when the stop branch above ran.
                logger.info("Stopped job %s/%s without restarting..." % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        # NOTE(review): BaseException also covers KeyboardInterrupt/SystemExit;
        # kept as-is, confirm that retrying on those is intended.
        logger.exception(e)
        # Presumably a bound Celery task: the keyword that carries the
        # original exception is ``exc`` (it was misspelled as ``exe``).
        raise self.retry(countdown=10, exc=e)
Exemplo n.º 3
0
def stop_running_job(frequency, heritrix):
    """Stop the job for ``frequency``, announce it on the queues, then tidy up.

    The job is loaded from its directory under ``settings.HERITRIX_JOBS`` and
    stopped. A ``<frequency>/<launchid>`` message is then sent first to the SIP
    queue and then to the QA queue, and finally the job's action files are
    removed.
    """
    launch = heritrix.launchid(frequency)
    payload = "%s/%s" % (frequency, launch)

    # Load the existing job from disk and stop it.
    job_dir = "%s/%s" % (settings.HERITRIX_JOBS, frequency)
    running = W3actJob.from_directory(job_dir, heritrix=heritrix)
    running.stop()

    # Both queues receive the same payload: SIP first, then QA.
    logger.info("Sending SIP message: %s" % payload)
    send_message(settings.QUEUE_HOST,
                 settings.SIP_QUEUE_NAME,
                 settings.SIP_QUEUE_KEY,
                 payload)

    logger.info("Sending QA message: %s" % payload)
    send_message(settings.QUEUE_HOST,
                 settings.QA_QUEUE_NAME,
                 settings.QA_QUEUE_KEY,
                 payload)

    remove_action_files(frequency)
Exemplo n.º 4
0
    def run(self):
        """Stop the Heritrix job named ``self.job.name`` if it is active.

        A job counts as active when Heritrix lists it and reports a non-empty
        status. An active job is stopped, its action files are removed and it
        is marked as 'stopped'; otherwise only a warning is logged.
        """
        # Set up connection to H3:
        heritrix = get_hapy_for_job(self.job)
        job_name = self.job.name

        logger.info("I'm stopping %s" % (job_name))

        # Nothing to do unless the job is both known and has a status.
        if job_name not in heritrix.list_jobs() or heritrix.status(job_name) == "":
            logger.warning("No {} job to be stopped!".format(job_name))
            return

        # Stops a running job, cleans up the directory, initiates job assembly.
        launch_id = heritrix.get_launch_id(job_name)
        job_dir = "%s/%s" % (h3().local_job_folder, job_name)
        running = W3actJob.from_directory(job_dir, heritrix=heritrix)
        running.stop()
        remove_action_files(job_name, HERITRIX_JOBS=h3().local_job_folder)

        # Record an output file that can be used as a Target by a different task:
        mark_job_as(running, launch_id, 'stopped')
Exemplo n.º 5
0
def stop_running_job(frequency, heritrix):
    """Halt the crawl job for ``frequency`` and notify downstream queues.

    Steps, in order: look up the launch id, stop the job loaded from
    ``settings.HERITRIX_JOBS/<frequency>``, send ``<frequency>/<launchid>`` to
    the SIP queue and then to the QA queue, and remove the job's action files.
    """
    current_launch = heritrix.launchid(frequency)
    job_path = "%s/%s" % (settings.HERITRIX_JOBS, frequency)

    crawl_job = W3actJob.from_directory(job_path, heritrix=heritrix)
    crawl_job.stop()

    # Identifier of the stopped launch, shared by both notifications.
    notice = "%s/%s" % (frequency, current_launch)

    logger.info("Sending SIP message: %s" % notice)
    send_message(settings.QUEUE_HOST, settings.SIP_QUEUE_NAME,
                 settings.SIP_QUEUE_KEY, notice)

    logger.info("Sending QA message: %s" % notice)
    send_message(settings.QUEUE_HOST, settings.QA_QUEUE_NAME,
                 settings.QA_QUEUE_KEY, notice)

    remove_action_files(frequency)
Exemplo n.º 6
0
def _in_crawl_window(target, when):
    """Return True when ``when`` falls inside the target's crawl date range.

    A bound that is ``None`` is treated as open-ended.
    """
    return (target["crawlStartDateISO"] is None
            or dateutil.parser.parse(target["crawlStartDateISO"]) < when) and (
                target["crawlEndDateISO"] is None
                or dateutil.parser.parse(target["crawlEndDateISO"]) > when)


def restart_job(frequency, start=None):
    """Restarts the job for a particular frequency.

    Exports the targets for ``frequency`` from W3ACT, keeps those whose crawl
    window contains ``start``, stops the matching Heritrix job if it is
    active, and starts a new one (or, when ``args.test`` is set, only logs
    what would be started).

    :param frequency: crawl frequency; also used as the Heritrix job name.
    :param start: reference time for the crawl-window filter; defaults to the
        current UTC time, taken when the function is called.

    Failures are logged and swallowed (best effort); nothing is returned.
    """
    # A ``datetime.utcnow()`` default in the signature would be frozen at
    # import time, so a long-running process would filter against a stale date.
    if start is None:
        start = datetime.utcnow()
    logger.info("Restarting %s at %s" % (frequency, start))
    try:
        w = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)

        export = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in export." % len(export))
        targets = [t for t in export if _in_crawl_window(t, start)]
        logger.debug("Found %s Targets in date range." % len(targets))

        h = hapy.Hapy("https://%s:%s" % (args.host, args.port),
                      username=args.user,
                      password=args.password)
        if frequency in h.listjobs() and h.status(frequency) != "":
            stop_running_job(frequency, h)
            #TODO: Automated QA
        job = W3actJob(targets, name=frequency, heritrix=h)
        if not args.test:
            logger.debug("Starting job %s with %s seeds." %
                         (job.name, len(job.seeds)))
            job.start()
        else:
            # Dry run: report what would be crawled without launching.
            logger.debug("Would start job %s with %s seeds." %
                         (job.name, len(job.seeds)))
            logger.debug("Seeds:")
            for surl in job.seeds:
                logger.debug("- %s" % surl)

    except Exception:
        # Still best effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt/SystemExit can propagate.
        logger.error("%s: %s" % (frequency, str(sys.exc_info())))
        logger.error("%s: %s" % (frequency, traceback.format_exc()))
Exemplo n.º 7
0
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Stops the running job for a particular frequency and optionally restarts it.

    :param frequency: crawl frequency; also used as the Heritrix job name.
    :param start: timestamp reported in the log; defaults to the current UTC
        time, taken when the function is called (not when it is defined).
    :param restart: when True, launch a fresh job after stopping any running
        one.
    :return: a short human-readable summary of what happened.
    :raises: whatever ``self.retry`` raises, after the failure has been logged.
    """
    # Resolve the default per call; a ``datetime.utcnow()`` default in the
    # signature is evaluated only once, when the module is imported.
    if start is None:
        start = datetime.utcnow()
    try:
        logger.info("Stopping/starting %s at %s" % (frequency, start))

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'),
                  cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" %
                        (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'),
                        password=cfg.get('h3', 'password'))

        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the
            # directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w,
                                          "%s/%s" % (HERITRIX_JOBS, frequency),
                                          heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" %
                        (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None

        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." %
                        (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id,
                                                          len(job.seeds))
        else:
            if job:
                # ``launch_id`` is always bound here: ``job`` is only truthy
                # when the stop branch above ran.
                logger.info("Stopped job %s/%s without restarting..." %
                            (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name,
                                                                    launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        # NOTE(review): BaseException also covers KeyboardInterrupt/SystemExit;
        # kept as-is, confirm that retrying on those is intended.
        logger.exception(e)
        # Presumably a bound Celery task: the keyword that carries the
        # original exception is ``exc`` (it was misspelled as ``exe``).
        raise self.retry(countdown=10, exc=e)