def _xhr_running_jobs_table():
    """Returns a JSON with a list of running jobs data and a rendered running jobs table"""
    running_jobs = list()
    for job in db.session.query(Jobs).filter(
            Jobs.spider_status == SpiderStatus.RUNNING).all():
        running_jobs.append({
            "id": job.id,
            "spider_name": job.spider_name,
            "scrape_type": ScrapeType(job.scrape_type),
            "use_proxies": bool(job.use_proxies),
            "date_started": job.date_started,
            "save_to_file": bool(job.file),
            "save_to_db": bool(job.db),
        })
    util_proc = utility_processor()
    html = running_jobs_template.render(
        running_jobs=running_jobs,
        now=datetime.now(),
        time_delta=util_proc["time_delta"],
        str_date=util_proc["str_date"],
        scrape_type=ScrapeType,
    )
    return jsonify({
        "running_jobs": running_jobs,
        "html": html,
    })
def _xhr_enabled_jobs_table():
    """Returns a JSON with a list of enabled periodic jobs data and a rendered enabled jobs table"""
    enabled_jobs = list()
    for job in db.session.query(PeriodicJobs).filter(
            PeriodicJobs.enabled == 1).all():
        enabled_jobs.append({
            "id": job.id,
            "spider_name": job.spider_name,
            "scrape_type": ScrapeType(job.scrape_type),
            "use_proxies": bool(job.use_proxies),
            "date_started": job.date_started,
            "save_to_file": bool(job.file),
            "save_to_db": bool(job.db),
            "repeat_time": job.repeat_time,
        })
    util_proc = utility_processor()
    html = enabled_jobs_template.render(
        enabled_jobs=enabled_jobs,
        now=datetime.now(),
        str_date=util_proc["str_date"],
        next_date_event=util_proc["next_date_event"],
        scrape_type=ScrapeType,
    )
    return jsonify({
        "enabled_jobs": enabled_jobs,
        "html": html,
    })
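# NOTE: illustrative wiring sketch, not taken from the original source. The
# `_xhr_*` helpers above return `jsonify(...)` responses, so they are presumably
# exposed as Flask endpoints that the dashboard polls via XHR. The route paths
# and view names below are hypothetical placeholders; the real URL rules may
# differ.
@app.route("/xhr/jobs/running")
def xhr_running_jobs_table():
    return _xhr_running_jobs_table()


@app.route("/xhr/jobs/enabled")
def xhr_enabled_jobs_table():
    return _xhr_enabled_jobs_table()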
def run_periodic_job(job_id: int, params: dict):
    """
    Run periodic scrape job managed by APScheduler

    :param int job_id: row ID of a job from the periodic_jobs table in webui.db
    :param dict params: scrapy spider parameters
    """
    # Each run of a periodic job is recorded as a new row in the jobs table;
    # job_id (the PeriodicJobs row ID) is passed by the scheduler but not used here.
    time_stamp = datetime.datetime.now()
    file_name = "{} {}".format(params["spider_name"],
                               time_stamp.strftime(TIMESTAMP_FORMAT2))
    log_file = file_name + ".log"
    feed_file = file_name + ".json"
    job = Jobs(
        task_id=run_periodic_job.request.id,
        spider_name=params["spider_name"],
        spider_status=SpiderStatus.RUNNING,
        scrape_type=params["scrape_type"],
        use_proxies=params["use_proxies"],
        file=params["save_to_feed"],
        db=params["save_to_db"],
        # images=params["images"],
        date_started=datetime.datetime.now(),
    )
    db_session.add(job)
    db_session.commit()

    params["log"] = join(SPIDER_LOG_DIR, log_file)
    params["save_to_feed"] = None
    params["feed_file"] = join(FEEDS_DIR, feed_file)
    params["scrape_type"] = ScrapeType(params["scrape_type"])
    params["job_id"] = job.id
    params["task_id"] = job.task_id
    # print(params)
    run_crawler_process(params)
def run_job(job_id: int, params: dict):
    """
    Run one scrape job

    :param int job_id: row ID of a job from the jobs table in webui.db
    :param dict params: scrapy spider parameters
    """
    # Update job
    job = db_session.query(Jobs).filter_by(id=job_id).first()
    if job is None:
        return False
    job.task_id = run_job.request.id
    time_stamp = datetime.datetime.now()
    job.date_started = time_stamp
    file_name = "{} {}".format(job.spider_name,
                               time_stamp.strftime(TIMESTAMP_FORMAT2))
    log_file = file_name + ".log"
    feed_file = file_name + ".json"
    job.spider_status = SpiderStatus.RUNNING
    db_session.commit()

    params["log"] = join(SPIDER_LOG_DIR, log_file)
    params["feed_file"] = join(FEEDS_DIR, feed_file)
    params["scrape_type"] = ScrapeType(params["scrape_type"])
    params["job_id"] = job_id
    params["task_id"] = job.task_id
    # print(params)
    run_crawler_process(params)
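# NOTE: illustrative sketch, not taken from the original source. The calls to
# `run_job.delay(...)` elsewhere in the WebUI and the use of `run_job.request.id`
# inside the task bodies imply that run_job and run_periodic_job are registered
# as Celery tasks, roughly along these lines (app name and broker URL are
# hypothetical):
#
#     from celery import Celery
#
#     celery = Celery("webui", broker="redis://localhost:6379/0")
#
#     @celery.task
#     def run_job(job_id: int, params: dict):
#         ...
#
# With that decoration, `run_job.delay(job.id, params)` enqueues a task and
# `run_job.request.id` resolves to the current task's ID while the task runs.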
def _xhr_next_jobs_table():
    """Returns a JSON with a list of next jobs data and a rendered next jobs table"""
    next_jobs = list()
    for job in db.session.query(Jobs).filter(
            Jobs.spider_status == SpiderStatus.PENDING).all():
        next_jobs.append({
            "id": job.id,
            "spider_name": job.spider_name,
            "scrape_type": ScrapeType(job.scrape_type),
            "use_proxies": bool(job.use_proxies),
            "save_to_file": bool(job.file),
            "save_to_db": bool(job.db),
        })
    html = next_jobs_template.render(
        next_jobs=next_jobs,
        scrape_type=ScrapeType,
    )
    return jsonify({
        "next_jobs": next_jobs,
        "html": html,
    })
def _periodic_jobs_add():
    """Adds a periodic job to WebUI DB and job scheduler"""
    global settings
    try:
        # Selected spiders
        selected_spiders = request.form.get("_selected_spiders", '')
        selected_spiders = selected_spiders.split(',') if selected_spiders else []
        if all(selected_spiders):
            selected_spiders = list(map(int, selected_spiders))
        if selected_spiders:
            settings.set_key("selected_spiders", selected_spiders)
        else:
            msg = "Please select at least one spider!"
            app.logger.warning(msg)
            flash(msg, "warning")

        # Selected countries
        selected_countries = request.form.getlist("countries")
        if selected_countries:
            settings.set_key("selected_countries", selected_countries)
        else:
            msg = "Please select at least one country!"
            app.logger.warning(msg)
            flash(msg, "warning")

        # Keywords
        keywords = list(text_to_unique_lines(request.form.get("keywords", '')))
        if keywords:
            settings.set_key("keywords", keywords)
        else:
            msg = "Please enter at least one keyword!"
            app.logger.warning(msg)
            flash(msg, "warning")

        scrape_type = ScrapeType(int(request.form.get("scrape_type", '')))
        save_to_feed = bool(request.form.get("save_to_feed", ''))
        save_to_db = bool(request.form.get("save_to_db", ''))
        use_proxies = bool(request.form.get("use_proxies", ''))

        cron_minutes = int(request.form.get("cron_minutes"))
        cron_hour = int(request.form.get("cron_hour"))
        if cron_minutes == 0 and cron_hour == 0:
            flash("Please choose repeat time", "warning")
            return redirect("/jobs/periodic")
        repeat_time = cron_hour * 60 + cron_minutes

        settings.set_key("scrape_type", scrape_type)
        settings.set_key("save_to_feed", save_to_feed)
        settings.set_key("save_to_db", save_to_db)
        settings.set_key("use_proxies", use_proxies)
        # settings.set_key("repeat_time", repeat_time)
        settings.save(SETTINGS_FILE)

        # Periodic spider jobs
        if selected_spiders and keywords and selected_countries and repeat_time:
            params = {
                # Global
                "db_host": settings.key("db_host"),
                "db_port": settings.key("db_port"),
                "db_name": settings.key("db_name"),
                "db_user": settings.key("db_user"),
                "db_pass": settings.key("db_pass"),
                "redis_host": settings.key("redis_host"),
                "redis_port": settings.key("redis_port"),
                "delay": settings.key("delay"),
                "timeout": settings.key("timeout"),
                "retries": settings.key("retries"),
                "concurrent_requests": settings.key("concurrent_requests"),
                # Job specific
                "spider": None,
                "spider_name": None,
                "scrape_type": scrape_type.value,
                "save_to_feed": save_to_feed,
                "save_to_db": save_to_db,
                "use_proxies": use_proxies,
                "keywords": keywords,
                "selected_countries": selected_countries,
            }
            app.logger.debug(params)
            # pprint(params, indent=4)

            # Set periodic jobs (spider IDs from the form are 1-based)
            for spider_id in selected_spiders:
                if spider_id - 1 < len(SPIDERS):
                    spider_name = SPIDERS[spider_id - 1][0]
                    try:
                        # Get periodic job by spider ID
                        job = db.session.query(PeriodicJobs).filter(
                            PeriodicJobs.id == spider_id).first()
                        # Update periodic job settings
                        job.scrape_type = params["scrape_type"]
                        job.use_proxies = int(params["use_proxies"])
                        job.file = int(params["save_to_feed"])
                        job.db = int(params["save_to_db"])
                        job.repeat_time = repeat_time
                        job.enabled = OK.YES
                        job.date_started = datetime.now()
                        db.session.commit()
                        # Add periodic job to schedule
                        spider_params = deepcopy(params)
                        spider_params["spider"] = spider_id - 1
                        spider_params["spider_name"] = spider_name
                        scheduler.add_job(
                            func=run_periodic_job.delay,
                            trigger=IntervalTrigger(minutes=repeat_time),
                            id=job.spider_name,
                            args=(job.id, spider_params),
                            name=job.spider_name,
                            replace_existing=True,
                        )
                        flash("Successfully added periodic job {}".format(job.id),
                              "success")
                    except ConflictingIdError:
                        job.enabled = OK.NO
                        db.session.commit()
                        msg = "Job ID {} already exists".format(job.id)
                        app.logger.error(msg, exc_info=True)
                        flash(msg, "danger")
                    except Exception as e:
                        job.enabled = OK.NO
                        db.session.commit()
                        app.logger.error(
                            "Failed starting the job ID {}!".format(job.id),
                            exc_info=True)
                        flash("Failed starting the job ID {}, details: {}".format(
                            job.id, e), "danger")
    except Exception as e:
        app.logger.error("Failed to add the scrape job!", exc_info=True)
        flash("Failed to add the scrape job, details: {}".format(e), "danger")
    return redirect(request.referrer)
def _job_add():
    """Adds a job to WebUI DB and runs it in a background task"""
    global settings
    try:
        # Selected spiders
        selected_spiders = request.form.get("_selected_spiders", '')
        selected_spiders = selected_spiders.split(',') if selected_spiders else []
        if all(selected_spiders):
            selected_spiders = list(map(int, selected_spiders))
        if selected_spiders:
            settings.set_key("selected_spiders", selected_spiders)
        else:
            msg = "Please select at least one spider!"
            app.logger.warning(msg)
            flash(msg, "warning")

        # Selected countries
        selected_countries = request.form.getlist("countries")
        if selected_countries:
            settings.set_key("selected_countries", selected_countries)
        else:
            msg = "Please select at least one country!"
            app.logger.warning(msg)
            flash(msg, "warning")

        # Keywords
        keywords = list(text_to_unique_lines(request.form.get("keywords", '')))
        if keywords:
            settings.set_key("keywords", keywords)
        else:
            msg = "Please enter at least one keyword!"
            app.logger.warning(msg)
            flash(msg, "warning")

        scrape_type = ScrapeType(int(request.form.get("scrape_type", '')))
        save_to_feed = bool(request.form.get("save_to_feed", ''))
        save_to_db = bool(request.form.get("save_to_db", ''))
        use_proxies = bool(request.form.get("use_proxies", ''))

        settings.set_key("scrape_type", scrape_type)
        settings.set_key("save_to_feed", save_to_feed)
        settings.set_key("save_to_db", save_to_db)
        settings.set_key("use_proxies", use_proxies)
        settings.save(SETTINGS_FILE)

        # Spider jobs
        if selected_spiders and keywords and selected_countries:
            params = {
                # Global
                "db_host": settings.key("db_host"),
                "db_port": settings.key("db_port"),
                "db_name": settings.key("db_name"),
                "db_user": settings.key("db_user"),
                "db_pass": settings.key("db_pass"),
                "redis_host": settings.key("redis_host"),
                "redis_port": settings.key("redis_port"),
                "delay": settings.key("delay"),
                "timeout": settings.key("timeout"),
                "retries": settings.key("retries"),
                "concurrent_requests": settings.key("concurrent_requests"),
                # "countries": settings.key("countries"),
                # Job specific
                "spider": None,
                "spider_name": None,
                "scrape_type": scrape_type.value,
                "save_to_feed": save_to_feed,
                "save_to_db": save_to_db,
                "use_proxies": use_proxies,
                "keywords": keywords,
                "selected_countries": selected_countries,
            }
            # pprint(params, indent=4)
            app.logger.debug("Job add params {}".format(params))

            # Set jobs (spider indexes from the form are 0-based)
            jobs = []
            for spider_idx in selected_spiders:
                if spider_idx < len(SPIDERS):
                    spider_params = deepcopy(params)
                    spider_params["spider"] = spider_idx
                    spider_params["spider_name"] = SPIDERS[spider_idx][0]
                    job = Jobs(
                        task_id='',
                        spider_name=spider_params["spider_name"],
                        spider_status=SpiderStatus.PENDING,
                        scrape_type=spider_params["scrape_type"],
                        use_proxies=int(spider_params["use_proxies"]),
                        file=int(spider_params["save_to_feed"]),
                        db=int(spider_params["save_to_db"]),
                    )
                    jobs.append((job, spider_params))

            # Run spiders
            if jobs:
                # Save to WebUI DB
                for job, spider_params in jobs:
                    db.session.add(job)
                db.session.commit()
                msg = "Successfully added {} scrape job(s)".format(len(jobs))
                app.logger.info(msg)
                flash(msg, "success")
                # Run jobs as background tasks
                for job, spider_params in jobs:
                    run_job.delay(job.id, spider_params)
    except Exception as e:
        msg = "Failed adding scrape job, details: {}".format(e)
        app.logger.error(msg, exc_info=True)
        flash(msg, "danger")
    return redirect(request.referrer)