Example #1
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ) -> None:
        super().__init__()
        self.config = config
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.taein_client = TaeinClient(
            client_delay=self.config["CLIENT_DELAY"],
            proxy=random.choice(self.config["PROXY_HOST_LIST"]),
        )
        self.s3_client = S3Client(config)
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        # Derive the start time from crawling_date so both reflect one clock read.
        self.crawling_start_time: str = str(timestamp(self.crawling_date))
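
A minimal sketch of the config dict this constructor expects; the key names are taken from the snippet above, while every value (and the TaeinCrawler class name) is a hypothetical placeholder:

config = {
    "SLACK_CHANNEL": "#crawler-alerts",    # hypothetical
    "SLACK_API_TOKEN": "xoxb-...",         # hypothetical placeholder
    "CLIENT_DELAY": 1.0,                   # assumed: request delay for TaeinClient
    "PROXY_HOST_LIST": ["10.0.0.1:8080"],  # assumed: pool sampled via random.choice
}
crawler = TaeinCrawler(config)             # hypothetical class name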
Example #2
    def update_crawler_log(self, run_by: str, name_type: str) -> None:
        """
        크롤러 로그는 기존 크롤러 로그를 업데이트하는 방식으로 작성되어집니다.
        """
        region_date_list: typing.List[CrawlerRegionDate] = []
        total_statistics = attr.asdict(self.total_statistics)
        if name_type == "토지이용계획정보":
            total_statistics["land_feature_zip_count"] = 0
        elif name_type == "토지특성정보":
            total_statistics["land_use_zip_count"] = 0
        elif name_type == "토지이용계획정보없음":
            total_statistics["land_feature_zip_count"] = 0
            total_statistics["land_use_zip_count"] = 0
            name_type = "토지이용계획정보"
        elif name_type == "토지특성정보없음":
            total_statistics["land_feature_zip_count"] = 0
            total_statistics["land_use_zip_count"] = 0
            name_type = "토지특성정보"

        if name_type == "토지이용계획정보":
            for region, date in self.region_land_use_dict.items():
                if date != "0001-01-01":
                    region_date_list.append(
                        CrawlerRegionDate(region=region, date=date)
                    )
        elif name_type == "토지특성정보":
            for region, date in self.region_land_feature_dict.items():
                if date != "0001-01-01":
                    region_date_list.append(
                        CrawlerRegionDate(region=region, date=date)
                    )

        data = {
            "time_stamp": self.crawling_start_time,
            "run_by": run_by,
            "finish_time_stamp": str(timestamp(tznow())),
            "total_statistics": total_statistics,
            "region_date": [vars(x) for x in region_date_list],
        }

        folder_name = (
            f"{self.config['ENVIRONMENT']}/"
            f"{self.crawling_date.year}/"
            f"{self.crawling_date.month:02}/"
            f"{self.crawling_date.day:02}/"
            f"{str(self.crawling_start_time)}/"
            f"{name_type}/"
            f"crawler-log"
        )

        file_name = f"{self.crawling_start_time}.json"

        self.s3_client.upload_s3(
            folder_name, file_name, data, "application/json", encoding="utf-8"
        )
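
For a concrete illustration of the resulting S3 key, assume ENVIRONMENT is "prod", timestamp() returns a Unix epoch string, and the crawl started at 1600000000 (2020-09-13 KST) with name_type "토지이용계획정보"; all of these values are hypothetical:

prod/2020/09/13/1600000000/토지이용계획정보/crawler-log/1600000000.json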
Example #3
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ) -> None:
        super().__init__()
        self.config = config
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.nsdi_client = NsdiClient(config)
        self.s3_client = S3Client(config)
        self.region_land_use_dict: typing.Dict[str, str] = dict()
        self.region_land_feature_dict: typing.Dict[str, str] = dict()
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        # Derive the start time from crawling_date so both reflect one clock read.
        self.crawling_start_time: str = str(timestamp(self.crawling_date))
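
Both region dicts start empty here. Example #2 above filters out the "0001-01-01" sentinel when it reads them, which suggests entries of the following shape once the crawl has run (the populating code is not shown, so the values below are assumptions):

self.region_land_use_dict = {
    "서울특별시": "2020-09-13",      # hypothetical: region crawled on this date
    "세종특별자치시": "0001-01-01",  # hypothetical: sentinel for "no data"
}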
Example #4
    def __init__(self, config: typing.Dict[str, typing.Any]) -> None:
        super().__init__()
        self.config = config
        self.session_factory = create_session_factory(config)
        self.s3_client = S3Client(config)
        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.storing_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        self.region_level_1 = self.config["REGION_REGEX_LEVEL_1"]
        self.region_level_2 = self.config["REGION_REGEX_LEVEL_2"]
        self.region_level_3 = self.config["REGION_REGEX_LEVEL_3"]
        self.completed_sido_ids: typing.Dict[str, int] = dict()
        self.completed_gugun_ids: typing.Dict[str, int] = dict()
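
A hedged sketch of the three region-regex config entries this constructor reads; the key names come from the snippet, while the patterns themselves are hypothetical stand-ins for matching sido / gugun / dong level region names:

config = {
    "REGION_REGEX_LEVEL_1": r".+(시|도)$",     # hypothetical: sido level
    "REGION_REGEX_LEVEL_2": r".+(시|군|구)$",  # hypothetical: gugun level
    "REGION_REGEX_LEVEL_3": r".+(읍|면|동)$",  # hypothetical: dong level
}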
Example #5
    def __init__(
        self,
        config: typing.Dict[str, typing.Any],
    ) -> None:
        super().__init__()
        self.config = config

        self.slack_client = SlackClient(
            config.get("SLACK_CHANNEL"), config.get("SLACK_API_TOKEN")
        )
        self.info_care_client = InfocareClient(config)
        self.s3_client = S3Client(config)
        self.total_statistics = CrawlerStatistics()
        self.failure_statistics = CrawlerStatistics()
        self.crawling_date: datetime.datetime = tznow(
            pytz.timezone("Asia/Seoul")
        )
        self.crawling_start_time: str = str(timestamp(self.crawling_date))
Example #6
    def update_crawler_log(self, run_by: str) -> None:
        total_statistics = attr.asdict(self.total_statistics)

        data = {
            "time_stamp": self.crawling_start_time,
            "run_by": run_by,
            "finish_time_stamp": str(timestamp(tznow())),
            "total_statistics": total_statistics,
        }

        folder_name = (f"{self.config['ENVIRONMENT']}/"
                       f"{self.crawling_date.year}/"
                       f"{self.crawling_date.month:02}/"
                       f"{self.crawling_date.day:02}/"
                       f"{str(self.crawling_start_time)}/"
                       f"crawler-log")

        file_name = f"{self.crawling_start_time}.json"

        self.s3_client.upload_json(folder_name=folder_name,
                                   file_name=file_name,
                                   data=data)
Example #7
    def upload_crawler_log_to_s3(self, run_by: str) -> None:
        total_statistics = attr.asdict(self.total_statistics)
        area_step = self.config["BUILDING_AREA_STEP"]
        area_start = self.config["BUILDING_AREA_START"]
        area_end = self.config["BUILDING_AREA_END"]
        # Build fixed-width area buckets; the final bucket starting at
        # area_end is open-ended and capped at 1000.
        area_range = [
            {"start_area": area_end, "end_area": 1000}
            if x == area_end
            else {"start_area": x, "end_area": x + area_step}
            for x in range(area_start, area_end + area_step, area_step)
        ]
        data = {
            "time_stamp": self.crawling_start_time,
            "run_by": run_by,
            "finish_time_stamp": str(timestamp(tznow())),
            "total_statistics": total_statistics,
            "area_range": area_range,
        }

        folder_name = (f"{self.config['ENVIRONMENT']}/"
                       f"{self.crawling_date.year}/"
                       f"{self.crawling_date.month:02}/"
                       f"{self.crawling_date.day:02}/"
                       f"{str(self.crawling_start_time)}/"
                       f"crawler-log")

        file_name = f"{self.crawling_start_time}.json"

        self.s3_client.upload_json(folder_name=folder_name,
                                   file_name=file_name,
                                   data=data)

        logger.info(
            "Upload crawler log to s3",
            folder_name=folder_name,
            file_name=file_name,
        )
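
To make the bucketing concrete, here is what area_range evaluates to for hypothetical config values BUILDING_AREA_START=0, BUILDING_AREA_STEP=100, BUILDING_AREA_END=300:

area_range = [
    {"start_area": 0, "end_area": 100},
    {"start_area": 100, "end_area": 200},
    {"start_area": 200, "end_area": 300},
    {"start_area": 300, "end_area": 1000},  # final bucket capped at 1000
]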