def retrieve_reports(url, market, dst_file, date_val, cols) -> DownloadDetails:
    request_data = dict(market=market, date=date_val)
    response = requests.post(url=url, headers=headers.get_random_headers(), data=request_data)
    if response.status_code == requests.codes.ok:
        text = response.text
        try:
            json_val = json.loads(text)
            results = json_val['results']
        except (ValueError, KeyError) as e:
            app_logger.error(e, exc_info=True)
        else:
            app_logger.info("Downloaded entries for date %s OK" % date_val)
            # save if there's some non-empty data
            if results:
                if save_to_file(dst_csv_file_name=dst_file, csv_cols=cols, data=results):
                    app_logger.info("Saved entries for date %s OK" % date_val)
                    return DownloadDetails(skipped=False, status=True)
            else:
                app_logger.warning("Skipped empty entries for date %s" % date_val)
                return DownloadDetails(skipped=True, status=True)
    else:
        app_logger.error(
            "Data for %s is not available, request returned %d status"
            % (date_val, response.status_code))
    return DownloadDetails(skipped=False, status=False)
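# Usage sketch (illustrative only): the endpoint URL, market code, date and
# column names below are placeholders, not values taken from this project.
# Assumes DownloadDetails exposes the 'skipped' and 'status' fields used above.
# The helper itself is hypothetical and is not called anywhere.
def _example_retrieve_reports_usage():
    details = retrieve_reports(
        url="https://example.com/api/reports",
        market="XYZ",
        dst_file="reports_2020-01-01.csv",
        date_val="2020-01-01",
        cols=["symbol", "open", "close", "volume"],
    )
    if details.status and not details.skipped:
        app_logger.info("Report stored")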
def enable_instance(instance_id):
    """Mark an instance as enabled."""
    app_logger.info("Enabling instance, iid:{0}".format(instance_id))
    try:
        with get_db() as session:
            CRUD_Instances_Info.enable_instance(session, instance_id)
    except Exception as ex:
        app_logger.error("Failed to enable instance, iid:{0}, ex:{1}".format(instance_id, ex))
    return 0
def disable_job(job_id):
    """Mark a job as disabled."""
    app_logger.info("Disabling job, jid:{0}".format(job_id))
    try:
        with get_db() as session:
            CRUD_Analysis_Job.disable_job(session, job_id)
    except Exception as ex:
        app_logger.error("Failed to disable job, jid:{0}, ex:{1}".format(job_id, ex))
    return 0
def parse_info(text, scrape_pages_count: bool) -> Optional[ResultSummary]:
    bs = bs4.BeautifulSoup(text, features='html.parser')
    content = bs.find("div", {"id": "Content"})
    if not content:
        app_logger.error("Invalid response HTML format, 'Content' element not found")
        return None
    if not scrape_pages_count:
        pages_count = None
    else:
        pages_count = 1
        pagelist = content.find("div", {"class": "pagelist"})
        if pagelist:
            pages_info = pagelist.find("p", attrs={'class': None})
            if pages_info:
                pages_val = pages_info.text.rstrip()
                pattern_match = PostCodesSearcher.RX_PAGES_INFO_PATTERN.match(pages_val)
                if pattern_match:
                    pages_count = int(pattern_match.group(2))
    search_results_tbl = content.find("table", {"title": "Search results"})
    if not search_results_tbl:
        app_logger.error("Invalid response HTML format, no 'Search results' table found")
        return None
    search_results_tbl_tbody = search_results_tbl.find("tbody")
    if not search_results_tbl_tbody:
        app_logger.error("Invalid response HTML format, no 'Search results' table's tbody found")
        return None
    result = []
    trs = search_results_tbl_tbody.find_all("tr")
    if trs:
        for tr in trs:
            tds = tr.find_all("td")
            if len(tds) != 4:
                app_logger.error("Incorrect result table structure, should contain 4 columns, but %d found" % len(tds))
                continue
            address_href = tds[0].find("a")
            if not address_href:
                app_logger.error("Incorrect result table structure, no 'address' element found")
                continue
            address_val = address_href.text.strip()
            council_tax_band_val = tds[1].text.strip()
            local_auth_ref_number_val = tds[3].text.strip()
            result.append(ResultItem(address=address_val,
                                     council_tax_band=council_tax_band_val,
                                     local_auth_ref_number=local_auth_ref_number_val))
    return ResultSummary(items=result, pages_count=pages_count)
def init_handler():
    processor = PriceDataProcessor()
    if processor.fetch_data():
        if processor.get_errors():
            problem_urls = processor.get_problem_urls()
            app_logger.warning("The following urls were not downloaded: %s" % ", ".join(problem_urls))
        processor.preprocess_data()
    else:
        app_logger.error("All downloads failed")
def save_to_file(dst_csv_file_name, csv_cols, data: list):
    file_exists = os.path.isfile(dst_csv_file_name)
    try:
        # newline='' prevents the csv module from inserting blank lines on Windows
        with open(dst_csv_file_name, 'a', newline='') as f:
            dict_writer = csv.DictWriter(f, fieldnames=csv_cols)
            if not file_exists:
                dict_writer.writeheader()
            for val in data:
                dict_writer.writerow(val)
        return True
    except IOError as e:
        app_logger.error(e, exc_info=True)
        return False
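# Usage sketch (illustrative only): the file name, columns and rows below are
# placeholders; the helper is hypothetical and is not called anywhere. It shows
# that 'data' is expected to be an iterable of row dicts keyed by 'csv_cols'.
def _example_save_to_file_usage():
    rows = [
        {"date": "2020-01-01", "price": 10.5},
        {"date": "2020-01-02", "price": 11.0},
    ]
    if save_to_file(dst_csv_file_name="results.csv",
                    csv_cols=["date", "price"],
                    data=rows):
        app_logger.info("Rows appended")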
def alarm_big_key(begin_time=None):
    # Compute the default at call time; a "datetime.now() - timedelta(hours=12)"
    # default argument would be evaluated only once, at import time, and go stale.
    if begin_time is None:
        begin_time = datetime.now() - timedelta(hours=12)
    begin_time_str = begin_time.strftime("%Y-%m-%d %H:%M:%S")
    app_logger.info("Start sending alarms")
    title = "Big key alarm"
    try:
        with get_db() as session:
            alarm_logs = CRUD_Alarm_Log.get_not_sended_log(session, begin_time_str)
    except Exception as ex:
        app_logger.error("Failed to fetch alarm info! ex:{0}".format(ex), exc_info=True)
        raise Exception("Failed to fetch alarm info")
    if len(alarm_logs) == 0:
        app_logger.info("No unsent alarms")
    for alarm_log in alarm_logs:
        sleep(2)
        try:
            dingclient.sendto_ding(title, alarm_log.message)
        except Exception as ex:
            app_logger.error("Failed to send alarm! ex:{0}, message:{1}".format(
                ex, alarm_log.message), exc_info=True)
            continue
        try:
            with get_db() as session:
                CRUD_Alarm_Log.set_log_is_sended(session, alarm_log.log_id)
        except Exception as ex:
            app_logger.error("Failed to update alarm message status! ex:{0}, message:{1}".format(
                ex, alarm_log.message), exc_info=True)
            continue
def run(self):
    postcodes_file_path = PROCESSED_DATA_DIR / "single.csv"
    if not postcodes_file_path.exists():
        app_logger.error("No 'single.csv' file %s found. Exiting..." % postcodes_file_path)
        return
    # order of column names is important
    columns = ["Address", "Postcode", "Council Tax band", "Local authority reference number"]
    for chunk_df in pd.read_csv(postcodes_file_path, chunksize=100, header=None, usecols=[0]):
        for _, row in chunk_df.iterrows():
            postcode = row[0]
            result_file = RESULTS_DIR / "{}.csv".format(postcode.replace(" ", "_"))
            if result_file.exists():
                app_logger.warning("Skipping result file %s, already exists" % result_file)
                continue
            app_logger.info("Scraping %s postcode started" % postcode)
            items = self.query(postcode=postcode)
            result_list = [[result_item.address, postcode, result_item.council_tax_band,
                            result_item.local_auth_ref_number]
                           for result_item in items] if items else []
            result_df = pd.DataFrame(result_list, columns=columns)
            result_df.to_csv(result_file, index=False)
            if items:
                app_logger.info("Scraping %s postcode completed" % postcode)
            else:
                app_logger.info("Scraping %s postcode completed, but it discovered no entries" % postcode)
            # sleep (5, 60) seconds randomly
            secs = random.randint(5, 60)
            app_logger.info("Sleeping %d seconds" % secs)
            time.sleep(secs)
def reflush_redis_instances():
    app_logger.info("Start refreshing instances!")
    try:
        all_instances_info = ALIREDIS.get_all_instances_info()
    except Exception as ex:
        app_logger.error("Failed to fetch instance info, ex:{0}".format(ex), exc_info=True)
        raise Exception("Failed to fetch instance info")
    app_logger.info("Instance info fetched")
    with get_db() as session:
        for instances_info in all_instances_info:
            app_logger.info("Start updating instance info, instances_info:{0}".format(instances_info))
            try:
                CRUD_Instances_Info.in_update_notin_insert(session, instances_info)
            except Exception as ex:
                app_logger.error("Failed to update instance info! instances_info:{0}".format(instances_info),
                                 exc_info=True)
    app_logger.info("Refreshing instances completed")
    return 0
def preprocess_data() -> bool:
    app_logger.info("Preparing data started...")
    start = time.time()
    series: List[dd.Series] = []
    for f in os.listdir(constants.DATA_DIR):
        if not f.endswith(".csv"):
            app_logger.warning("non-CSV file found in DATA_DIR: %s" % f)
            continue
        app_logger.info("Processing %s" % f)
        try:
            if len(series) < 2:
                df = dd.read_csv(constants.DATA_DIR / f, header=None)
                if len(df.columns) != PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT:
                    app_logger.error(
                        "File %s has insufficient amount of columns: required %d, found %d"
                        % (f, PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT, len(df.columns)))
                    continue
                # we are interested in the 4th column's values
                fourth_col: dd.Series = df.iloc[:, 3]
                unique_vals_series = fourth_col.drop_duplicates()
                series.append(unique_vals_series)
            if len(series) == 2:
                # merge two Series into one and remove duplicates
                s = dd.concat(series).drop_duplicates()
                # keep the result Series in the first list's element
                del series[-1]
                series[0] = s
        except Exception as e:
            app_logger.error("Processing file %s had errors: %s" % (f, e))
        app_logger.info("Processing %s done" % f)
    if series:
        s: dd.Series = series[0]
        s.to_csv(constants.PROCESSED_DATA_DIR / "single.csv",
                 single_file=True, index=False, header=False)
    else:
        app_logger.error("Prepare data: could not generate the result CSV file")
    app_logger.info("Preparing data completed in %s seconds" % str(time.time() - start))
    return bool(series)
def query_impl(self, postcode, page) -> Optional[ResultSummary]:
    if page == 1:
        request_data = dict(btnPush=1,
                            txtRedirectTo='InitS.asp',
                            txtStartKey="0",
                            txtPageNum="0",
                            txtPageSize="",
                            intNumFound="",
                            txtPostCode=postcode)
        request_url = self.SEARCH_INIT_URL.format(PostCodesSearcher.gen_rand_number(12))
    else:
        request_data = dict(lstPageSize="20",
                            txtRefSPostCode=postcode,
                            txtStartKey=str((page - 1) * 20),
                            txtPageNum=str(page),
                            txtPageSize="20",
                            txtPostCode=postcode)
        request_url = self.NEXT_RESULTS_URL
    h = headers.get_random_headers()
    h['Referer'] = PostCodesSearcher.SEARCH_INIT_URL
    try:
        resp = requests.post(url=request_url, data=request_data, headers=h)
        if resp.status_code != HTTPStatus.OK:
            app_logger.error("Request error: bad response code " + str(resp.status_code))
            self.errors = True
            return None
        return self.parse_info(text=resp.text, scrape_pages_count=page == 1)
    except requests.exceptions.RequestException as e:
        app_logger.error("Request error: " + str(e))
        self.errors = True
        return None
    except Exception as e:
        app_logger.error("General error: " + str(e))
        self.errors = True
        return None
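# Sketch of a possible query() wrapper (illustrative only): run() above calls
# self.query(postcode=...), which is not shown in this excerpt. The body below
# assumes the first page's pages_count drives the remaining requests; it is an
# assumption, not the original implementation.
def query(self, postcode) -> List[ResultItem]:
    items: List[ResultItem] = []
    first = self.query_impl(postcode=postcode, page=1)
    if not first:
        return items
    items.extend(first.items)
    # pages_count may be None when only a single page of results exists
    for page in range(2, (first.pages_count or 1) + 1):
        summary = self.query_impl(postcode=postcode, page=page)
        if summary:
            items.extend(summary.items)
    return items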
def create_analysis_job(instance_id=None):
    instances = []
    if instance_id:
        try:
            job_info = ALIREDIS.create_cache_analysisjob(instance_id)
            # If saving to the database fails, the record has to be patched manually from the logs
            app_logger.info("New job info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.in_update_notin_insert(session, job_info)
        except Exception as ex:
            app_logger.error("Failed to create job / update job table, ID:{0} ex:{1}".format(
                instance_id, ex), exc_info=True)
    else:
        try:
            with get_db() as session:
                instances = CRUD_Instances_Info.get_all_instances(session)
        except Exception as ex:
            app_logger.error("Failed to fetch instance list", exc_info=True)
            raise Exception("Failed to fetch instance list")
    if len(instances) == 0:
        app_logger.info("Instance list is empty")
        return 0
    for instance in instances:
        try:
            job_info = ALIREDIS.create_cache_analysisjob(instance.instance_id)
            app_logger.info("New job info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.in_update_notin_insert(session, job_info)
        except Exception as ex:
            app_logger.error("Failed to create job / update job table, ID:{0} ex:{1}".format(
                instance.instance_id, ex), exc_info=True)
    return 0
def sync_and_udpate_day_job(day=None):
    # Compute the default at call time; a str(date.today()) default argument
    # would be evaluated only once, at import time.
    if day is None:
        day = str(date.today())
    app_logger.info("Start updating job statuses")
    with get_db() as session:
        try:
            jobs = CRUD_Analysis_Job.get_currentday_not_finnish_job(session, day)
        except Exception as ex:
            app_logger.error("Failed to fetch the list of unfinished jobs!", exc_info=True)
            raise Exception("Failed to fetch the list of unfinished jobs!")
    if len(jobs) == 0:
        app_logger.info("No jobs to update")
        return 0
    for job in jobs:
        try:
            job_info, job_data = ALIREDIS.get_analysisjob_info(job.instance_id, job.job_id)
        except RequestJobError as ex:
            app_logger.error("Failed to fetch job info! iid:{0},jid:{1},ex:{2}".format(
                job.instance_id, job.job_id, ex), exc_info=True)
            with get_db() as session:
                CRUD_Instances_Info.disable_instance(session, job.instance_id)
                CRUD_Analysis_Job.disable_job(session, job.job_id)
            app_logger.error(
                "Job API call failed, the instance is suspected to be empty; "
                "the instance and the job have been disabled!! iid:{0},jid:{1}".format(
                    job.instance_id, job.job_id))
            continue
        except Exception as ex:
            app_logger.error("Failed to fetch job info! iid:{0},jid:{1},ex:{2}".format(
                job.instance_id, job.job_id, ex), exc_info=True)
            continue
        try:
            with get_db() as session:
                instance = CRUD_Instances_Info.get_instance(session, job.instance_id)
        except Exception as ex:
            app_logger.error("Failed to fetch instance info! iid:{0},jid:{1},ex:{2}".format(
                job.instance_id, job.job_id, ex), exc_info=True)
            continue
        if len(instance) == 0:
            continue
        try:
            app_logger.info("Start updating job status, job_info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.in_update_notin_insert(session, job_info)
        except Exception as ex:
            app_logger.error("Failed to update job status! job_info:{0}".format(job_info), exc_info=True)
        # skip the rest if the analysis has not finished yet
        if job_data is None:
            continue
        try:
            app_logger.info("Start updating key info, job_info:{0}, job_data:{1}".format(job_info, job_data))
            format_response_dict = Redis_BigKeys.format_big_keys_info(
                job_data, day, instance.instance_id, instance.instance_name)
            if format_response_dict["keys_info"]:
                with get_influxdb() as session:
                    Redis_BigKeys.slave_to_influxdb(session, format_response_dict["keys_info"])
                with get_influxdb() as session:
                    Redis_BigKeys.slave_to_influxdb(session, format_response_dict["keyprefixes"])
        except Exception as ex:
            app_logger.error(
                "Failed to store key info! job_info:{0}, day:{1}, iid:{2}, iname:{3}, ex:{4}".format(
                    job_info, day, instance.instance_id, instance.instance_name, ex), exc_info=True)
            continue
        try:
            app_logger.info("Updating savedata status, job_info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.update_job_save_data_status(session, job.job_id, 1)
        except Exception as ex:
            app_logger.error("Failed to update savedata status! jid:{0},ex:{1}".format(
                job.job_id, ex), exc_info=True)
            continue
        try:
            app_logger.info("Generating alarm info, jid:{0}, data:{1}".format(
                job.job_id, format_response_dict["keys_info"]))
            alarm_log_dict = check_keys(format_response_dict["keys_info"])
            # do not save alarm info if no threshold was triggered
            if len(alarm_log_dict) == 0:
                continue
            with get_db() as session:
                CRUD_Alarm_Log.insert(session, alarm_log_dict)
        except Exception as ex:
            app_logger.error("Failed to save alarm log, jid:{0},ex:{1}".format(
                job.job_id, ex), exc_info=True)
    return 0
def fetch_data(self) -> bool:
    self.errors = False
    self.problem_urls = []
    self.clear_dir(constants.DATA_DIR)
    self.clear_dir(constants.PROCESSED_DATA_DIR)
    app_logger.info("Fetching started")
    if not constants.DOWNLOAD_LINKS:
        app_logger.warning("no links to download")
        self.errors = True
        return False
    start = time.time()
    for url in constants.DOWNLOAD_LINKS:
        app_logger.info("downloading %s..." % url)
        name = self.get_file_name(url)
        h = headers.get_random_headers()
        h['Referer'] = self.REFERER
        try:
            resp = requests.get(url=url, stream=True, headers=h)
            if resp.status_code != HTTPStatus.OK:
                app_logger.error("Request error: bad response code " + str(resp.status_code))
                self.problem_urls.append(url)
                self.errors = True
                continue
            app_logger.info("saving %s..." % url)
            with open(constants.DATA_DIR / name, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=self.CHUNK_SIZE):
                    if chunk:
                        f.write(chunk)
            app_logger.info("saved")
        except requests.exceptions.RequestException as e:
            app_logger.error("Request error: " + str(e))
            self.problem_urls.append(url)
            self.errors = True
            # remove file data leftovers in case of errors
            # (it may be corrupted, incomplete, etc)
            self.del_file(constants.DATA_DIR / name)
        except Exception as e:
            app_logger.error("General error: " + str(e))
            self.problem_urls.append(url)
            self.errors = True
            # remove file data leftovers in case of errors
            # (it may be corrupted, incomplete, etc)
            self.del_file(constants.DATA_DIR / name)
    app_logger.info("Fetching data completed in %s seconds" % str(time.time() - start))
    # check if at least some urls have been downloaded without problems
    return not self.errors or (len(self.problem_urls) < len(constants.DOWNLOAD_LINKS))
from sqlalchemy.sql import expression
from sqlalchemy.sql.functions import array_agg
from sqlalchemy.sql.sqltypes import String
from sqlalchemy.util.compat import with_metaclass

from alicache import AliyunRedis, RequestJobError, CreateJobError
from model import CRUD_Instances_Info, CRUD_Analysis_Job, CRUD_Alarm_Log
from logger import app_logger
from session import get_db, get_influxdb
from savedata import Redis_BigKeys
from dingpush import dingclient
from alarm import check_keys

try:
    ALIREDIS = AliyunRedis()
except Exception as ex:
    app_logger.error("Failed to initialize the AliyunRedis class, ex:{0}".format(ex), exc_info=True)
# @web.middleware marks this as a new-style (request, handler) middleware for aiohttp 3.x
@web.middleware
async def json_error_middleware(request, handler):
    try:
        return await handler(request)
    except web.HTTPException as ex:
        logger.error(ex.text)
        return web.json_response({'error': ex.text}, status=ex.status)
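# Registration sketch (illustrative only): the health handler and the route are
# placeholders; this shows one way the middleware above can be attached to an
# aiohttp application.
async def health(request):
    return web.json_response({"status": "ok"})

def make_app() -> web.Application:
    app = web.Application(middlewares=[json_error_middleware])
    app.router.add_get("/health", health)
    return app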
def save_last_date(fname, val: str):
    try:
        with open(fname, "w") as f:
            f.write("%s\n" % val)
    except Exception as e:
        app_logger.error(e, exc_info=True)