Example No. 1
def retrieve_reports(url, market, dst_file, date_val, cols) -> DownloadDetails:
    request_data = dict(market=market, date=date_val)

    response = requests.post(url=url,
                             headers=headers.get_random_headers(),
                             data=request_data)

    if response.status_code == requests.codes.ok:
        text = response.text
        try:
            json_val = json.loads(text)
            results = json_val['results']
        except (ValueError, KeyError) as e:
            app_logger.error(e, exc_info=True)
        else:
            app_logger.info("Downloaded entries for date %s OK" % date_val)
            # save if there's some non-empty data
            if results:
                if save_to_file(dst_csv_file_name=dst_file,
                                csv_cols=cols,
                                data=results):
                    app_logger.info("Saved entries for date %s OK" % date_val)
                    return DownloadDetails(skipped=False, status=True)
            else:
                app_logger.warning("Skipped empty entries for date %s" %
                                   date_val)
                return DownloadDetails(skipped=True, status=True)
    else:
        app_logger.error(
            "Data for %s is not available, request returned %d status" %
            (date_val, response.status_code))

    return DownloadDetails(skipped=False, status=False)
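A minimal caller sketch for retrieve_reports, looping over a range of dates; REPORTS_URL, MARKET and REPORT_COLUMNS below are hypothetical placeholders, not taken from the source.

# Hypothetical usage sketch; the endpoint, market code and CSV columns are
# placeholder values, not part of the original project.
from datetime import date, timedelta

REPORTS_URL = "https://example.com/api/reports"
MARKET = "spot"
REPORT_COLUMNS = ["id", "price", "volume"]

def download_last_days(days_back=7):
    for offset in range(days_back):
        date_val = (date.today() - timedelta(days=offset)).isoformat()
        details = retrieve_reports(url=REPORTS_URL,
                                   market=MARKET,
                                   dst_file="reports_%s.csv" % date_val,
                                   date_val=date_val,
                                   cols=REPORT_COLUMNS)
        if not details.status:
            app_logger.warning("Download for %s failed" % date_val)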
Example No. 2
def enable_instance(instance_id):
    """标记实例为可用"""
    app_logger.info("启用实例, iid:{0}".format(instance_id))
    try:
        with get_db() as session:
            CRUD_Instances_Info.enable_instance(session, instance_id)
    except Exception as ex:
        app_logger.error("启用实例失败, iid:{0}, ex:{1}".format(instance_id, ex))
    return 0
Example No. 3
def disable_job(job_id):
    """标记job为不可用"""
    app_logger.info("禁用job, jid:{0}".format(job_id))
    try:
        with get_db() as session:
            CRUD_Analysis_Job.disable_job(session, job_id)
    except Exception as ex:
        app_logger.error("禁用job失败, iid:{0}, ex:{1}".format(job_id, ex))
    return 0
Example No. 4
    @staticmethod
    def parse_info(text, scrape_pages_count: bool) -> Optional[ResultSummary]:

        bs = bs4.BeautifulSoup(text, features='html.parser')

        content = bs.find("div", {"id": "Content"})
        if not content:
            app_logger.error("Invalid response HTML format, 'Content' element not found")
            return None

        if not scrape_pages_count:
            pages_count = None
        else:
            pages_count = 1
            pagelist = content.find("div", {"class": "pagelist"})
            if pagelist:
                pages_info = pagelist.find("p", attrs={'class': None})
                if pages_info:
                    pages_val = pages_info.text.rstrip()
                    pattern_match = PostCodesSearcher.RX_PAGES_INFO_PATTERN.match(pages_val)
                    if pattern_match:
                        pages_count = int(pattern_match.group(2))

        search_results_tbl = content.find("table", {"title": "Search results"})
        if not search_results_tbl:
            app_logger.error("Invalid response HTML format, no 'Search results' table found")
            return None

        search_results_tbl_tbody = search_results_tbl.find("tbody")
        if not search_results_tbl_tbody:
            app_logger.error("Invalid response HTML format, no 'Search results' table's tbody found")
            return None

        result = []
        trs = search_results_tbl_tbody.find_all("tr")
        if trs:
            for tr in trs:
                tds = tr.find_all("td")
                if len(tds) != 4:
                    app_logger.error("Incorrect result table structure, should contain 4 columns, but %d found"
                                     % len(tds))
                    continue

                address_href = tds[0].find("a")
                if not address_href:
                    app_logger.error("Incorrect result table structure, no 'address' element found")
                    continue

                address_val = address_href.text.strip()
                council_tax_band_val = tds[1].text.strip()
                local_auth_ref_number_val = tds[3].text.strip()

                result.append(ResultItem(address=address_val,
                                         council_tax_band=council_tax_band_val,
                                         local_auth_ref_number=local_auth_ref_number_val))

        return ResultSummary(items=result, pages_count=pages_count)
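The RX_PAGES_INFO_PATTERN constant used above is not included in these snippets; a hypothetical pattern with a second capture group holding the total page count (assuming page text such as "Page 1 of 5") might look like:

import re

# Hypothetical definition; the real class constant is not shown in these
# examples. parse_info only relies on group(2) holding the total page count.
RX_PAGES_INFO_PATTERN = re.compile(r"Page\s+(\d+)\s+of\s+(\d+)")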
Example No. 5
def init_handler():

    processor = PriceDataProcessor()
    if processor.fetch_data():
        if processor.get_errors():
            problem_urls = processor.get_problem_urls()
            app_logger.warning("The following urls were not downloaded: %s" %
                               ", ".join(problem_urls))

        processor.preprocess_data()
    else:
        app_logger.error("All downloads failed")
Example No. 6
def save_to_file(dst_csv_file_name, csv_cols, data: list):
    file_exists = os.path.isfile(dst_csv_file_name)

    try:
        with open(dst_csv_file_name, 'a', newline='') as f:
            dict_writer = csv.DictWriter(f, fieldnames=csv_cols)

            if not file_exists:
                dict_writer.writeheader()

            for val in data:
                dict_writer.writerow(val)

            return True
    except IOError as e:
        app_logger.error(e, exc_info=True)
        return False
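A short usage sketch for save_to_file; the column names and rows are hypothetical, not taken from the source.

sample_cols = ["date", "price"]
sample_rows = [{"date": "2021-01-01", "price": 10.5},
               {"date": "2021-01-02", "price": 11.0}]

# appends to prices.csv, writing the header only if the file is new
if save_to_file(dst_csv_file_name="prices.csv",
                csv_cols=sample_cols,
                data=sample_rows):
    app_logger.info("Sample rows saved")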
Example No. 7
def alarm_big_key(begin_time=None):
    # default to the last 12 hours, computed at call time to avoid the
    # eagerly evaluated default argument pitfall
    if begin_time is None:
        begin_time = datetime.now() - timedelta(hours=12)
    begin_time_str = begin_time.strftime("%Y-%m-%d %H:%M:%S")
    app_logger.info("Sending alarms started")
    title = "Big key alarm"
    try:
        with get_db() as session:
            alarm_logs = CRUD_Alarm_Log.get_not_sended_log(
                session, begin_time_str)

    except Exception as ex:
        app_logger.error("获取报警信息失败! ex:{0}".format(ex), exc_info=True)
        raise Exception("获取报警信息失败")

    if len(alarm_logs) == 0:
        app_logger.info("无未发送报警")

    for alarm_log in alarm_logs:
        sleep(2)
        try:
            dingclient.sendto_ding(title, alarm_log.message)
        except Exception as ex:
            app_logger.error("发送报警失败! ex:{0}, message:{1}".format(
                ex, alarm_log.message),
                             exc_info=True)
            continue

        try:
            with get_db() as session:
                CRUD_Alarm_Log.set_log_is_sended(session, alarm_log.log_id)
        except Exception as ex:
            app_logger.error("跟新报警消息状态失败! ex:{0}, message:{1}".format(
                ex, alarm_log.message),
                             exc_info=True)
            continue
Example No. 8
    def run(self):
        postcodes_file_path = PROCESSED_DATA_DIR / "single.csv"

        if not postcodes_file_path.exists():
            app_logger.error("No 'single.csv' file %s found. Exiting..." % postcodes_file_path)
            return

        # order of column names is important
        columns = ["Address", "Postcode", "Council Tax band", "Local authority reference number"]

        for chunk_df in pd.read_csv(postcodes_file_path, chunksize=100, header=None, usecols=[0]):
            for _, row in chunk_df.iterrows():

                postcode = row[0]
                result_file = RESULTS_DIR / "{}.csv".format(postcode.replace(" ", "_"))
                if result_file.exists():
                    app_logger.warning("Skipping result file %s, already exists" % result_file)
                    continue

                app_logger.info("Scraping %s postcode started" % postcode)
                items = self.query(postcode=postcode)

                result_list = [[result_item.address,
                                postcode,
                                result_item.council_tax_band,
                                result_item.local_auth_ref_number] for result_item in items] if items else []

                result_df = pd.DataFrame(result_list, columns=columns)
                result_df.to_csv(result_file, index=False)
                if items:
                    app_logger.info("Scraping %s postcode completed" % postcode)
                else:
                    app_logger.info("Scraping %s postcode completed, but it discovered no entries" % postcode)

                # sleep a random 5-60 seconds between requests
                secs = random.randint(5, 60)
                app_logger.info("Sleeping %d seconds" % secs)
                time.sleep(secs)
Example No. 9
def reflush_redis_instances():
    app_logger.info("开始刷新实例!")
    try:
        all_instances_info = ALIREDIS.get_all_instances_info()
    except Exception as ex:
        app_logger.error("获取实例信息失败, ex:{0}".format(ex), exc_info=True)
        raise Exception("获取实例信息失败")
    app_logger.info("刷新实例!完成")

    with get_db() as session:
        for instances_info in all_instances_info:
            app_logger.info(
                "开始更新实例信息,instances_info:{0}".format(instances_info))
            try:
                CRUD_Instances_Info.in_update_notin_insert(
                    session, instances_info)
            except Exception as ex:
                app_logger.error(
                    "更新实例信息失败! instances_info:{0}".format(instances_info),
                    exc_info=True)
        app_logger.info("刷新实例完成")

    return 0
Example No. 10
    @staticmethod
    def preprocess_data() -> bool:
        app_logger.info("Preparing data started...")

        start = time.time()

        series: List[dd.Series] = []
        for f in os.listdir(constants.DATA_DIR):
            if not f.endswith(".csv"):
                app_logger.warning("non-CSV file found in DATA_DIR: %s" % f)
                continue

            app_logger.info("Processing %s" % f)
            try:
                if len(series) < 2:
                    df = dd.read_csv(constants.DATA_DIR / f, header=None)

                    required = PriceDataProcessor.REQUIRED_CSV_FORMAT_COLUMNS_COUNT
                    if len(df.columns) != required:
                        app_logger.error(
                            "File %s has an incorrect number of columns: required %d, found %d"
                            % (f, required, len(df.columns)))
                        continue

                    # we are interested in the 4th column's values
                    fourth_col: dd.Series = df.iloc[:, 3]
                    unique_vals_series = fourth_col.drop_duplicates()
                    series.append(unique_vals_series)

                if len(series) == 2:
                    # merge two Series into one and remove duplicates
                    s = dd.concat(series).drop_duplicates()

                    # keep the result Series in the first list's element
                    del series[-1]
                    series[0] = s

            except Exception as e:
                app_logger.error("Processing file %f had errors: " + str(e))

            app_logger.info("Processing %s done" % f)

        if series:
            s: dd.Series = series[0]
            s.to_csv(constants.PROCESSED_DATA_DIR / "single.csv",
                     single_file=True,
                     index=False,
                     header=False)
        else:
            app_logger.error(
                "Prepare data: could not generate the result CSV file")

        app_logger.info("Preparing data completed in %s seconds" %
                        str(time.time() - start))
        return bool(series)
Example No. 11
    def query_impl(self, postcode, page) -> Optional[ResultSummary]:

        if page == 1:
            request_data = dict(btnPush=1,
                                txtRedirectTo='InitS.asp',
                                txtStartKey="0",
                                txtPageNum="0",
                                txtPageSize="",
                                intNumFound="",
                                txtPostCode=postcode)
            request_url = self.SEARCH_INIT_URL.format(PostCodesSearcher.gen_rand_number(12))
        else:
            request_data = dict(lstPageSize="20",
                                txtRefSPostCode=postcode,
                                txtStartKey=str((page - 1) * 20),
                                txtPageNum=str(page),
                                txtPageSize="20",
                                txtPostCode=postcode)
            request_url = self.NEXT_RESULTS_URL

        h = headers.get_random_headers()
        h['Referer'] = PostCodesSearcher.SEARCH_INIT_URL

        try:
            resp = requests.post(url=request_url,
                                 data=request_data,
                                 headers=h)
            if resp.status_code != HTTPStatus.OK:
                app_logger.error("Request error: bad response code " + str(resp.status_code))
                self.errors = True
                return None

            return self.parse_info(text=resp.text, scrape_pages_count=page == 1)

        except requests.exceptions.RequestException as e:

            app_logger.error("Request error: " + str(e))
            self.errors = True
            return None
        except Exception as e:

            app_logger.error("General error: " + str(e))
            self.errors = True
            return None
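The query() method that run() in Example No. 8 relies on is not shown in these snippets; a hypothetical pagination wrapper around query_impl, assuming typing.List and ResultItem are imported, could look like this sketch.

    def query(self, postcode) -> List[ResultItem]:
        # Hypothetical sketch, not the project's actual query(); it walks the
        # pages reported by the first ResultSummary and collects all items.
        items: List[ResultItem] = []
        summary = self.query_impl(postcode=postcode, page=1)
        if summary is None:
            return items
        items.extend(summary.items)
        for page in range(2, (summary.pages_count or 1) + 1):
            next_summary = self.query_impl(postcode=postcode, page=page)
            if next_summary is None:
                break
            items.extend(next_summary.items)
        return items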
Example No. 12
def create_analysis_job(instance_id=None):
    instances = []
    if instance_id:
        try:
            job_info = ALIREDIS.create_cache_analysisjob(instance_id)
            # if saving to the database fails, restore it manually from the logs
            app_logger.info("New job info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.in_update_notin_insert(session, job_info)
        except Exception as ex:
            app_logger.error("创建job,跟新job表失败, ID:{0} ex:{1}".format(
                instance_id, ex),
                             exc_info=True)
    else:
        try:
            with get_db() as session:
                instances = CRUD_Instances_Info.get_all_instances(session)
        except Exception as ex:
            app_logger.error("获取实例列表出错", exc_info=True)
            raise Exception("获取实例列表出错")

    if len(instances) == 0:
        app_logger.info("实例id为空值")
        return 0

    for instance in instances:
        try:
            job_info = ALIREDIS.create_cache_analysisjob(instance.instance_id)
            app_logger.info("新job信息:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.in_update_notin_insert(session, job_info)
        except Exception as ex:
            app_logger.error("创建job,跟新job表失败, ID:{0} ex:{1}".format(
                instance.instance_id, ex),
                             exc_info=True)

    return 0
Example No. 13
def sync_and_udpate_day_job(day=None):
    # default to today's date, computed at call time to avoid the
    # eagerly evaluated default argument pitfall
    if day is None:
        day = str(date.today())
    app_logger.info("Updating job status started")
    with get_db() as session:
        try:
            jobs = CRUD_Analysis_Job.get_currentday_not_finnish_job(
                session, day)
        except Exception as ex:
            app_logger.error("获取未完成作业列表失败!", exc_info=True)
            raise Exception("获取未完成作业列表失败!")

    if len(jobs) == 0:
        app_logger.info("无未更新job")
        return 0

    for job in jobs:
        try:
            job_info, job_data = ALIREDIS.get_analysisjob_info(
                job.instance_id, job.job_id)
        except RequestJobError as ex:
            app_logger.error("获取job信息失败! iid:{0},jid:{1},ex:{2}".format(
                job.instance_id, job.job_id, ex),
                             exc_info=True)
            with get_db() as session:
                CRUD_Instances_Info.disable_instance(session, job.instance_id)
                CRUD_Analysis_Job.disable_job(session, job.job_id)
            app_logger.error(
                "因作业接口调用失败,怀疑实例为空实例,已经禁用实例与作业!! iid:{0},jid:{1}".format(
                    job.instance_id, job.job_id))
            continue
        except Exception as ex:
            app_logger.error("获取job信息失败! iid:{0},jid:{1},ex:{2}".format(
                job.instance_id, job.job_id, ex),
                             exc_info=True)
            continue

        try:
            with get_db() as session:
                instance = CRUD_Instances_Info.get_instance(
                    session, job.instance_id)
        except Exception as ex:
            app_logger.error("获取实例信息失败! iid:{0},jid:{1},ex:{2}".format(
                job.instance_id, job.job_id, ex),
                             exc_info=True)
            continue

        if not instance:
            continue

        try:
            app_logger.info("开始更新job状态, job_info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.in_update_notin_insert(session, job_info)
        except Exception as ex:
            app_logger.error("更新job状态失败!job_info:{0}".format(job_info),
                             exc_info=True)

        # skip this job if the analysis has not finished yet
        if job_data is None:
            continue

        try:
            app_logger.info("开始更新key信息, job_info:{0}, job_data:{1}".format(
                job_info, job_data))
            format_response_dict = Redis_BigKeys.format_big_keys_info(
                job_data, day, instance.instance_id, instance.instance_name)

            if format_response_dict["keys_info"]:
                with get_influxdb() as session:
                    Redis_BigKeys.slave_to_influxdb(
                        session, format_response_dict["keys_info"])

                with get_influxdb() as session:
                    Redis_BigKeys.slave_to_influxdb(
                        session, format_response_dict["keyprefixes"])

        except Exception as ex:
            app_logger.error(
                "存入key信息失败! job_info:{0}, day:{1}, iid:{2}, iname:{3}, ex:{4}".
                format(job_info, day, instance.instance_id,
                       instance.instance_name, ex),
                exc_info=True)
            continue

        try:
            app_logger.info("更新savedata状态,job_info:{0}".format(job_info))
            with get_db() as session:
                CRUD_Analysis_Job.update_job_save_data_status(
                    session, job.job_id, 1)
        except Exception as ex:
            app_logger.error("更新savedata状态失败! jid:{0},ex:{1}".format(
                job.job_id, ex),
                             exc_info=True)
            continue

        try:
            app_logger.info("生成报警信息,jid:{0}, date:{1}".format(
                job.job_id, format_response_dict["keys_info"]))
            alarm_log_dict = check_keys(format_response_dict["keys_info"])
            # if no threshold was triggered, do not save alarm info
            if len(alarm_log_dict) == 0:
                continue

            with get_db() as session:
                CRUD_Alarm_Log.insert(session, alarm_log_dict)
        except Exception as ex:
            app_logger.error("保存alarmlog失败, jid:{0},ex:{1}".format(
                job.job_id, ex),
                             exc_info=True)

    return 0
Example No. 14
    def fetch_data(self) -> bool:
        self.errors = False
        self.problem_urls = []

        self.clear_dir(constants.DATA_DIR)
        self.clear_dir(constants.PROCESSED_DATA_DIR)

        app_logger.info("Fetching started")

        if not constants.DOWNLOAD_LINKS:
            app_logger.warning("no links to download")
            self.errors = True
            return False

        start = time.time()

        for url in constants.DOWNLOAD_LINKS:
            app_logger.info("downloading %s..." % url)

            name = self.get_file_name(url)

            h = headers.get_random_headers()
            h['Referer'] = self.REFERER

            try:
                resp = requests.get(url=url, stream=True, headers=h)
                if resp.status_code != HTTPStatus.OK:
                    app_logger("Request error: bad response code " +
                               str(resp.status_code))
                    self.problem_urls.append(url)
                    self.errors = True
                    continue

                app_logger.info("saving %s..." % url)

                with open(constants.DATA_DIR / name, 'wb') as f:
                    for chunk in resp.iter_content(chunk_size=self.CHUNK_SIZE):
                        if chunk:
                            f.write(chunk)

                app_logger.info("saved")

            except requests.exceptions.RequestException as e:
                app_logger.error("Request error: " + str(e))
                self.problem_urls.append(url)
                self.errors = True
            except Exception as e:
                app_logger.error("General error: " + str(e))
                self.problem_urls.append(url)
                self.errors = True

                # remove file data leftovers in case of errors
                # (it may be corrupted, incomplete, etc)
                self.del_file(constants.DATA_DIR / name)

        app_logger.info("Fetching data completed in %s seconds" %
                        str(time.time() - start))

        # check if at least some urls have been downloaded without problem
        return not self.errors or (len(self.problem_urls) < len(
            constants.DOWNLOAD_LINKS))
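The get_file_name helper used above is not included in these snippets; a hypothetical version that derives the destination file name from the URL path might look like:

    @staticmethod
    def get_file_name(url):
        # Hypothetical helper, not the project's actual implementation;
        # it keeps the last path component of the URL.
        from urllib.parse import urlparse
        from os.path import basename
        return basename(urlparse(url).path) or "download.dat"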
Example No. 15
from sqlalchemy.sql import expression
from sqlalchemy.sql.functions import array_agg
from sqlalchemy.sql.sqltypes import String
from sqlalchemy.util.compat import with_metaclass
from alicache import AliyunRedis, RequestJobError, CreateJobError
from model import CRUD_Instances_Info, CRUD_Analysis_Job, CRUD_Alarm_Log
from logger import app_logger
from session import get_db, get_influxdb
from savedata import Redis_BigKeys
from dingpush import dingclient
from alarm import check_keys

try:
    ALIREDIS = AliyunRedis()
except Exception as ex:
    app_logger.error("初始化AliyunRedis类失败, ex:{0}".format(ex), exc_info=True)


def reflush_redis_instances():
    app_logger.info("开始刷新实例!")
    try:
        all_instances_info = ALIREDIS.get_all_instances_info()
    except Exception as ex:
        app_logger.error("获取实例信息失败, ex:{0}".format(ex), exc_info=True)
        raise Exception("获取实例信息失败")
    app_logger.info("刷新实例!完成")

    with get_db() as session:
        for instances_info in all_instances_info:
            app_logger.info(
                "开始更新实例信息,instances_info:{0}".format(instances_info))
Example No. 16
@web.middleware
async def json_error_middleware(request, handler):
    try:
        return await handler(request)
    except web.HTTPException as ex:
        logger.error(ex.text)
        return web.json_response({'error': ex.text}, status=ex.status)
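A minimal sketch of wiring the middleware into an aiohttp application; the commented-out route and handler are hypothetical, not from the source.

from aiohttp import web

app = web.Application(middlewares=[json_error_middleware])
# app.add_routes([web.get("/status", status_handler)])  # hypothetical route
# web.run_app(app)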
Example No. 17
def save_last_date(fname, val: str):
    try:
        with open(fname, "w") as f:
            f.write("%s\n" % val)
    except Exception as e:
        app_logger.error(e, exc_info=True)
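A hypothetical counterpart that reads the saved value back; it is not part of the original examples.

def load_last_date(fname):
    # Hypothetical reader matching save_last_date above; returns None when
    # the file is missing or cannot be read.
    try:
        with open(fname) as f:
            return f.readline().strip() or None
    except Exception as e:
        app_logger.error(e, exc_info=True)
        return None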