def restart_job(frequency, start=None):
    """Restart the crawl job for a particular frequency.

    Fetches the W3ACT export for ``frequency``, keeps the Targets whose crawl
    window contains ``start``, calls ``stop_running_job`` if the Heritrix
    instance already lists the job and reports a non-empty status for it, and
    then builds a ``W3actJob`` from the selected Targets. The job is started
    unless ``args.test`` is set, in which case the job and its seeds are only
    logged.

    Args:
        frequency: Name of the crawl frequency; also used as the job name.
        start: Reference time for the crawl-window filter. When omitted,
            ``datetime.utcnow()`` is evaluated at call time.

    Returns:
        None. Any ``Exception`` raised while restarting is logged (summary and
        traceback) and not propagated to the caller.
    """
    # Resolve the default per call: a ``datetime.utcnow()`` default in the
    # signature is evaluated only once, when the function is defined, and
    # would go stale in a long-running process.
    if start is None:
        start = datetime.utcnow()
    logger.info("Restarting %s at %s" % (frequency, start))

    def in_crawl_window(target):
        """Return True if ``start`` falls strictly inside the target's window."""
        # A missing (None) bound means the window is open on that side.
        # NOTE(review): parsed dates are compared directly with ``start``;
        # presumably both are naive UTC -- confirm against the export format.
        begins = target["crawlStartDateISO"]
        if begins is not None and not dateutil.parser.parse(begins) < start:
            return False
        ends = target["crawlEndDateISO"]
        return ends is None or dateutil.parser.parse(ends) > start

    try:
        w = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)
        export = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in export." % len(export))
        targets = [t for t in export if in_crawl_window(t)]
        logger.debug("Found %s Targets in date range." % len(targets))

        h = hapy.Hapy(
            "https://%s:%s" % (args.host, args.port),
            username=args.user,
            password=args.password,
        )
        # Only stop the job when Heritrix knows it and reports some status.
        if frequency in h.listjobs() and h.status(frequency) != "":
            stop_running_job(frequency, h)
        # TODO: Automated QA

        job = W3actJob(targets, name=frequency, heritrix=h)
        if not args.test:
            logger.debug("Starting job %s with %s seeds." % (job.name, len(job.seeds)))
            job.start()
        else:
            # Test mode: report what would be launched without starting it.
            logger.debug("Would start job %s with %s seeds." % (job.name, len(job.seeds)))
            logger.debug("Seeds:")
            for surl in job.seeds:
                logger.debug("- %s" % surl)
    except Exception:
        # Best-effort: a failure for one frequency is logged, not raised.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit still terminate the process.
        logger.error("%s: %s" % (frequency, str(sys.exc_info())))
        logger.error("%s: %s" % (frequency, traceback.format_exc()))
def restart_job(frequency, start=None):
    """Restart the crawl job for a particular frequency.

    Fetches the W3ACT export for ``frequency``, keeps the Targets whose crawl
    window contains ``start``, calls ``stop_running_job`` if the Heritrix
    instance already lists the job and reports a non-empty status for it, and
    then builds a ``W3actJob`` from the selected Targets. The job is started
    unless ``args.test`` is set, in which case the job and its seeds are only
    logged.

    Args:
        frequency: Name of the crawl frequency; also used as the job name.
        start: Reference time for the crawl-window filter. When omitted,
            ``datetime.utcnow()`` is evaluated at call time.

    Returns:
        None. Any ``Exception`` raised while restarting is logged (summary and
        traceback) and not propagated to the caller.
    """
    # Resolve the default per call: a ``datetime.utcnow()`` default in the
    # signature is evaluated only once, when the function is defined, and
    # would go stale in a long-running process.
    if start is None:
        start = datetime.utcnow()
    logger.info("Restarting %s at %s" % (frequency, start))

    def in_crawl_window(target):
        """Return True if ``start`` falls strictly inside the target's window."""
        # A missing (None) bound means the window is open on that side.
        # NOTE(review): parsed dates are compared directly with ``start``;
        # presumably both are naive UTC -- confirm against the export format.
        begins = target["crawlStartDateISO"]
        if begins is not None and not dateutil.parser.parse(begins) < start:
            return False
        ends = target["crawlEndDateISO"]
        return ends is None or dateutil.parser.parse(ends) > start

    try:
        w = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)
        export = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in export." % len(export))
        targets = [t for t in export if in_crawl_window(t)]
        logger.debug("Found %s Targets in date range." % len(targets))

        h = hapy.Hapy(
            "https://%s:%s" % (args.host, args.port),
            username=args.user,
            password=args.password,
        )
        # Only stop the job when Heritrix knows it and reports some status.
        if frequency in h.listjobs() and h.status(frequency) != "":
            stop_running_job(frequency, h)
        # TODO: Automated QA

        job = W3actJob(targets, name=frequency, heritrix=h)
        if not args.test:
            logger.debug("Starting job %s with %s seeds." % (job.name, len(job.seeds)))
            job.start()
        else:
            # Test mode: report what would be launched without starting it.
            logger.debug("Would start job %s with %s seeds." % (job.name, len(job.seeds)))
            logger.debug("Seeds:")
            for surl in job.seeds:
                logger.debug("- %s" % surl)
    except Exception:
        # Best-effort: a failure for one frequency is logged, not raised.
        # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
        # and SystemExit still terminate the process.
        logger.error("%s: %s" % (frequency, str(sys.exc_info())))
        logger.error("%s: %s" % (frequency, traceback.format_exc()))
# Optional output path for the Watched Targets SURT file.
parser.add_argument(
    "-W",
    "--watched-surt-file",
    dest="watched_surt_file",
    type=str,
    help="SURT file to write Watched Targets, for scoping document extraction [default: %(default)s]",
    default=None,
)
# Required positional argument: the queue that receives the seeds.
parser.add_argument('queue', metavar='queue', help="Name of queue to send seeds to.")

args = parser.parse_args()

# Get all the frequently-crawled items
act = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)
targets = act.get_ld_export(args.frequency)
logger.info("Got %s targets" % len(targets))

destination = args.destination  # or use "h3" for message suitable for h3

# Update scope file, if enabled:
if args.surt_file:
    logger.debug("Writing surt targets to %s" % args.surt_file)
    write_surt_file(targets, args.surt_file)

# Update watched target scope file, if enabled:
if args.watched_surt_file:
    # Report the watched-SURT path, i.e. the file written on the next line
    # (not the general scope file handled above).
    logger.debug("Writing watched targets to %s" % args.watched_surt_file)
    write_watched_surt_file(targets, args.watched_surt_file)

# Set up launcher:
# Command-line options controlling which targets are exported and where the
# resulting seeds and scope files go.
parser.add_argument(
    "-f",
    "--frequency",
    dest="frequency",
    type=str,
    help="Frequency to look at. Use 'frequent' for all valid frequencies. [default: %(default)s]",
    default='frequent',
)
parser.add_argument(
    "-d",
    "--destination",
    dest="destination",
    type=str,
    default='har',
    help="Destination, implying message format to use: 'har' or 'h3'. [default: %(default)s]",
)
parser.add_argument(
    "-tid",
    "--target-id",
    dest="target_id",
    type=int,
    help="Target ID to allow to launch (for testing purposes). [default: %(default)s]",
)
parser.add_argument(
    "-S",
    "--surt-file",
    dest="surt_file",
    type=str,
    help="SURT file to write to, for scoping Heritrix crawls [default: %(default)s]",
    default=None,
)
parser.add_argument(
    "-W",
    "--watched-surt-file",
    dest="watched_surt_file",
    type=str,
    help="SURT file to write Watched Targets, for scoping document extraction [default: %(default)s]",
    default=None,
)
# Required positional argument: the queue that receives the seeds.
parser.add_argument('queue', metavar='queue', help="Name of queue to send seeds to.")

args = parser.parse_args()

# Get all the frequently-crawled items
act = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)
targets = act.get_ld_export(args.frequency)
logger.info("Got %s targets" % len(targets))

destination = args.destination  # or use "h3" for message suitable for h3

# Update scope file, if enabled:
if args.surt_file:
    logger.debug("Writing surt targets to %s" % args.surt_file)
    write_surt_file(targets, args.surt_file)

# Update watched target scope file, if enabled:
if args.watched_surt_file:
    # Report the watched-SURT path, i.e. the file written on the next line
    # (not the general scope file handled above).
    logger.debug("Writing watched targets to %s" % args.watched_surt_file)
    write_watched_surt_file(targets, args.watched_surt_file)

# Set up launcher: