Пример #1
0
def restart_job(frequency, start=None):
    """Restart the crawl job for a particular frequency.

    Fetches the W3ACT export for ``frequency``, keeps the targets whose
    crawl window contains ``start``, stops the job named ``frequency`` if
    Heritrix lists it with a non-empty status, and then builds a new
    ``W3actJob`` from those targets. The job is started unless
    ``args.test`` is set, in which case the seeds are only logged.

    Args:
        frequency: Frequency name; passed to ``get_ld_export`` and used as
            the Heritrix job name.
        start: Datetime the crawl start/end dates are compared against.
            Defaults to the current UTC time, evaluated on every call
            (not once at import time).

    Any ``Exception`` raised while restarting is logged and swallowed, so
    one failing frequency does not abort the caller.
    """
    # A default of datetime.utcnow() in the signature would be evaluated
    # only once, when the function is defined; resolve it per call instead.
    if start is None:
        start = datetime.utcnow()

    def _in_crawl_window(target):
        # A missing (None) bound means the window is open on that side.
        # NOTE(review): `start` is a naive datetime by default; if the
        # export's ISO strings carry a UTC offset the comparison would
        # raise TypeError -- confirm the export format.
        begins = target["crawlStartDateISO"]
        ends = target["crawlEndDateISO"]
        started = begins is None or dateutil.parser.parse(begins) < start
        return started and (ends is None or dateutil.parser.parse(ends) > start)

    logger.info("Restarting %s at %s" % (frequency, start))
    try:
        w = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)

        export = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in export." % len(export))
        targets = [t for t in export if _in_crawl_window(t)]
        logger.debug("Found %s Targets in date range." % len(targets))
        h = hapy.Hapy("https://%s:%s" % (args.host, args.port), username=args.user, password=args.password)
        # Stop the existing job first if Heritrix reports any status for it.
        if frequency in h.listjobs() and h.status(frequency) != "":
            stop_running_job(frequency, h)
            #TODO: Automated QA
        job = W3actJob(targets, name=frequency, heritrix=h)
        if not args.test:
            logger.debug("Starting job %s with %s seeds." % (job.name, len(job.seeds)))
            job.start()
        else:
            # Test mode: report what would be launched without starting it.
            logger.debug("Would start job %s with %s seeds." % (job.name, len(job.seeds)))
            logger.debug("Seeds:")
            for surl in job.seeds:
                logger.debug("- %s" % surl)

    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still propagate.
        logger.error("%s: %s" % (frequency, str(sys.exc_info())))
        logger.error("%s: %s" % (frequency, traceback.format_exc()))
Пример #2
0
def restart_job(frequency, start=None):
    """Restart the crawl job for a particular frequency.

    Fetches the W3ACT export for ``frequency``, keeps the targets whose
    crawl window contains ``start``, stops the job named ``frequency`` if
    Heritrix lists it with a non-empty status, and then builds a new
    ``W3actJob`` from those targets. The job is started unless
    ``args.test`` is set, in which case the seeds are only logged.

    Args:
        frequency: Frequency name; passed to ``get_ld_export`` and used as
            the Heritrix job name.
        start: Datetime the crawl start/end dates are compared against.
            Defaults to the current UTC time, evaluated on every call
            (not once at import time).

    Any ``Exception`` raised while restarting is logged and swallowed, so
    one failing frequency does not abort the caller.
    """
    # A default of datetime.utcnow() in the signature would be evaluated
    # only once, when the function is defined; resolve it per call instead.
    if start is None:
        start = datetime.utcnow()

    def _in_crawl_window(target):
        # A missing (None) bound means the window is open on that side.
        # NOTE(review): `start` is a naive datetime by default; if the
        # export's ISO strings carry a UTC offset the comparison would
        # raise TypeError -- confirm the export format.
        begins = target["crawlStartDateISO"]
        ends = target["crawlEndDateISO"]
        started = begins is None or dateutil.parser.parse(begins) < start
        return started and (ends is None
                            or dateutil.parser.parse(ends) > start)

    logger.info("Restarting %s at %s" % (frequency, start))
    try:
        w = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)

        export = w.get_ld_export(frequency)
        logger.debug("Found %s Targets in export." % len(export))
        targets = [t for t in export if _in_crawl_window(t)]
        logger.debug("Found %s Targets in date range." % len(targets))
        h = hapy.Hapy("https://%s:%s" % (args.host, args.port),
                      username=args.user,
                      password=args.password)
        # Stop the existing job first if Heritrix reports any status for it.
        if frequency in h.listjobs() and h.status(frequency) != "":
            stop_running_job(frequency, h)
            #TODO: Automated QA
        job = W3actJob(targets, name=frequency, heritrix=h)
        if not args.test:
            logger.debug("Starting job %s with %s seeds." %
                         (job.name, len(job.seeds)))
            job.start()
        else:
            # Test mode: report what would be launched without starting it.
            logger.debug("Would start job %s with %s seeds." %
                         (job.name, len(job.seeds)))
            logger.debug("Seeds:")
            for surl in job.seeds:
                logger.debug("- %s" % surl)

    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt/SystemExit still propagate.
        logger.error("%s: %s" % (frequency, str(sys.exc_info())))
        logger.error("%s: %s" % (frequency, traceback.format_exc()))
Пример #3
0
    parser.add_argument(
        "-W",
        "--watched-surt-file",
        dest="watched_surt_file",
        type=str,
        help=
        "SURT file to write Watched Targets, for scoping document extraction [default: %(default)s]",
        default=None)
    parser.add_argument('queue',
                        metavar='queue',
                        help="Name of queue to send seeds to.")

    args = parser.parse_args()

    # Get all the frequently-crawled items
    act = w3act(args.w3act_url, args.w3act_user, args.w3act_pw)
    targets = act.get_ld_export(args.frequency)
    logger.info("Got %s targets" % len(targets))
    destination = args.destination  # or use "h3" for message suitable for h3

    # Update scope file, if enabled:
    if args.surt_file:
        logger.debug("Writing surt targets to %s" % args.surt_file)
        write_surt_file(targets, args.surt_file)

    # Update watched target scope file, if enabled:
    if args.watched_surt_file:
        # Log the path that is actually written (the watched SURT file,
        # not the plain SURT file).
        logger.debug("Writing watched targets to %s" % args.watched_surt_file)
        write_watched_surt_file(targets, args.watched_surt_file)

    # Set up launcher:
Пример #4
0
	parser.add_argument("-f", "--frequency", dest="frequency", type=str, 
					help="Frequency to look at. Use 'frequent' for all valid frequencies. [default: %(default)s]", default='frequent')
	parser.add_argument("-d", "--destination", dest="destination", type=str, default='har',
					help="Destination, implying message format to use: 'har' or 'h3'. [default: %(default)s]")
	parser.add_argument("-tid", "--target-id", dest="target_id", type=int,
					help="Target ID to allow to launch (for testing purposes). [default: %(default)s]")
	parser.add_argument("-S", "--surt-file", dest="surt_file", type=str, 
					help="SURT file to write to, for scoping Heritrix crawls [default: %(default)s]", default=None)	
	parser.add_argument("-W", "--watched-surt-file", dest="watched_surt_file", type=str, 
					help="SURT file to write Watched Targets, for scoping document extraction [default: %(default)s]", default=None)	
	parser.add_argument('queue', metavar='queue', help="Name of queue to send seeds to.")
	
	args = parser.parse_args()
	
	# Get all the frequently-crawled items
	act = w3act(args.w3act_url,args.w3act_user,args.w3act_pw)
	targets = act.get_ld_export(args.frequency)
	logger.info("Got %s targets" % len(targets))
	destination = args.destination # or use "h3" for message suitable for h3
	
	# Update scope file, if enabled:
	if args.surt_file:
		logger.debug("Writing surt targets to %s" % args.surt_file)
		write_surt_file(targets, args.surt_file)
	
	# Update watched target scope file, if enabled:
	if args.watched_surt_file:
		# Log the path that is actually written (the watched SURT file,
		# not the plain SURT file).
		logger.debug("Writing watched targets to %s" % args.watched_surt_file)
		write_watched_surt_file(targets, args.watched_surt_file)
		
	# Set up launcher: