class BaseCrawler(metaclass=ABCMeta):
    """
    If you want to use posts_soup separately from soup (post_soup), change
    `self.soup = ...` to `self.posts_soup = ...` inside set_posts_soup.
    get_posts can then work on self.posts_soup independently.
    """

    def __init__(
        self,
        table: NamedTuple,
        brand_name: str,
        main_url: str,
        base_page_url: Optional[str] = "",
        model_date_format: str = "%Y-%m-%d",
        encoding: Optional[str] = None,
        custom_config: Optional[Config] = None,
    ):
        self.brand_name = brand_name
        if custom_config is None:
            self.config = Config()
        else:
            self.config = custom_config
        self.init_logger()
        self.base_page_url = base_page_url
        self.curr_ctgr_url = None
        self.curr_post_url = None
        self.table = table
        if self.config.is_test is True:
            self.table_name = 'test.' + self.table.__name__
        else:
            self.table_name = 'public.' + self.table.__name__
        self.conn = DB(table=table, table_name=self.table_name)
        self.encoding = encoding
        self.session = RetrySession(encoding=encoding)
        self.main_url = main_url
        self.model_date_format = model_date_format
        self.soup = None
        self.posts_soup = None
        self.metas = {}
        if not self.config.is_valid():
            raise ValueError(f"Invalid Config: \n {self.config!r}")

    def __getstate__(self):
        # Loggers are not picklable; store the logger name instead.
        d = self.__dict__.copy()
        if "logger" in d:
            d["logger"] = d["logger"].name
        return d

    def __setstate__(self, d):
        if "logger" in d:
            d["logger"] = getLogger(d["logger"])
        self.__dict__.update(d)

    def get_exist_ids(self):
        # add_query must filter by media_name: news_id is only unique
        # together with the brand (news_id, media_name).
        rows = self.conn.select(
            fields=["news_id"],
            table=self.table_name,
            add_query=f" WHERE media_name='{self.brand_name}'",
        )
        return set(map(lambda r: int(r[0]), rows))

    # ======================== FLOW =============================
    def go(self):
        self.logger.info(
            f"{self.brand_name} Crawler Start at {datetime.now()}")
        self.page_parse()

    def page_parse(self) -> None:
        datas = []
        stop_crawling = False
        for page in self.gen_pages():
            if stop_crawling:
                break
            self.curr_ctgr_url = page
            self.set_posts_soup(page)
            posts = self.get_posts()
            self.logger.debug(
                f"Num of Crawled Posts: {len(posts)} in Page: {page}")
            for url in posts:
                self.curr_post_url = url
                try:
                    data: NamedTuple = self.detail_parse()
                except Exception as e:
                    self.logger.error(
                        f"Error During Detail Parse \nURL: {url} \nDetail: {e}"
                    )
                    continue
                # If the current item is older than the limit date,
                # stop crawling without saving it.
                if self.get_date() < self.config.end_date:
                    stop_crawling = True
                    break
                datas.append(tuple(data))
        self.save(datas=datas)

    @retry(8, 3)
    def set_soup(self, url: str) -> None:
        res = self.session.get(url)
        try:
            # RetrySession may already return decoded markup.
            self.soup = BeautifulSoup(res, "html.parser")
        except TypeError:
            # Fall back to the body text when a response object is returned.
            self.soup = BeautifulSoup(res.text, "html.parser")

    @retry(5, 3)
    def set_posts_soup(self, url: str) -> None:
        self.set_soup(url)

    def set_metas(self) -> None:
        # Collect <meta> tags keyed by property / name / http-equiv.
        metas = {}
        for m in self.soup.find_all("meta"):
            if not isinstance(m, Tag):
                continue
            k = m.get("property", m.get("name", m.get("http-equiv", None)))
            v = m.get("content", None)
            if k is not None and v is not None:
                metas[k] = v
        self.metas = metas

    @staticmethod
    def get_id(url: str) -> str:
        return re.findall(r"\d+", url)[-1]

    def data_cleaning(self, datas: List[Tuple]) -> List[Tuple]:
        # Drop duplicates within this run and rows already stored in the DB.
        result = []
        id_idx = 1
        unique_ids = set(map(lambda d: d[id_idx], datas))
        exist_ids = self.get_exist_ids()
        new_ids = unique_ids.difference(exist_ids)
        for d in datas:
            new_id = d[id_idx]
            if new_id in new_ids:
                result.append(d)
                new_ids.remove(new_id)
        return result

    def save(self, datas: List[Tuple]):
        self.logger.info(
            f"{self.brand_name} Crawler Finished at {datetime.now()}")
        self.logger.debug(f"Length Of Datas Before Cleaning: {len(datas)}")
        clean_datas = self.data_cleaning(datas)
        self.logger.debug(
            f"Length Of Datas After Cleaning: {len(clean_datas)}")
        if self.config.is_test is True:
            self.save_datas_as_pickle(clean_datas)
            assert self.table_name == f"test.{self.table.__name__}"
        self.conn.insert_magazine(clean_datas)
        self.logger.debug(f"Inserted Data to DB \n {self.conn.__dict__}")

    def detail_parse(self):
        # Parse one post page into a row of self.table.
        post_url = self.curr_post_url
        self.set_soup(post_url)
        self.set_metas()
        return self.table(
            self.brand_name,
            int(self.get_id(post_url)),
            self.get_date(),
            self.get_title(),
            self.get_content(),
            self.curr_post_url,
            self.get_keywords(),
            self.get_post_type(),
        )

    def gen_pages(self) -> Generator:
        page = 0
        while True:
            page += 1
            yield self.base_page_url + str(page)

    @abstractmethod
    def get_date(self) -> date:
        pass

    @abstractmethod
    def get_posts(self) -> List[str]:
        pass

    @abstractmethod
    def get_title(self) -> str:
        pass

    @abstractmethod
    def get_content(self) -> str:
        pass

    @abstractmethod
    def get_post_type(self) -> Optional[str]:
        pass

    @abstractmethod
    def get_keywords(self) -> str:
        pass
    # ======================== FLOW END =============================

    # ======================== Utils[Optional] ======================
    def save_datas_as_pickle(self, datas):
        pkl_save_path = Path(self.config.TESTFILE_SAVE_PATH)
        pkl_save_path.mkdir(parents=True, exist_ok=True)
        with open(pkl_save_path / f"{self.brand_name}.pkl", "wb") as f:
            pickle.dump(datas, f)

    @staticmethod
    def clean_date_txt(txt):
        return txt.lower()

    @staticmethod
    def get_clean_txt(txt):
        return "".join(txt.split())

    @check_return(date)
    def extract_time(self, text: str) -> Optional[date]:
        clean_txt = self.clean_date_txt(text)
        date_time = next(datefinder.find_dates(clean_txt), None)
        if date_time is None:
            return None
        return date_time.date()

    def furbish_link(self, link: str, prefix=None) -> str:
        # Normalize protocol-relative, prefixed and relative links.
        if link.startswith("//"):
            link = "http:" + link
        if prefix is not None and isinstance(prefix, str):
            link = prefix + link
        if "http" not in link or link.startswith("/"):
            link = urljoin(self.main_url, link)
        return link

    def get_links(self, attrs) -> Dict[str, str]:
        # attrs: dictionary-like object (must support .get()).
        suspectors = ["href", "src", "ec-data-src"]
        links = defaultdict(str)
        for s in suspectors:
            link = attrs.get(s, None)
            if link is not None and isinstance(link, str):
                links[s] = self.furbish_link(link)
        return links

    @staticmethod
    def attrs_to_text(attrs) -> str:
        # attrs: dictionary-like object (must support .values()).
        all_hints = []
        for v in attrs.values():
            if isinstance(v, list):
                all_hints.append(" ".join(v))
            else:
                all_hints.append(v)
        return " ".join(all_hints)

    def init_logger(self):
        if self.config is None or self.brand_name is None:
            raise SyntaxError(
                "Must be called after initializing config and brand_name")
        l_path = self.config.LOG_SAVE_PATH
        os.makedirs(l_path, exist_ok=True)
        self.logger = named_logger(
            f"{opjoin(l_path, self.brand_name)}.log", "crawl_logger")
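

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): a minimal concrete crawler built on
# BaseCrawler. The ExampleMagazine row layout, the example.com URLs, the CSS
# selectors and the meta keys below are assumptions for demonstration, not
# part of this project; a real subclass supplies its own table NamedTuple and
# site-specific parsing.
# ---------------------------------------------------------------------------
class ExampleMagazine(NamedTuple):
    """Hypothetical table row matching the field order used in detail_parse()."""
    media_name: str
    news_id: int
    post_date: date
    title: str
    content: str
    url: str
    keywords: str
    post_type: Optional[str]


class ExampleCrawler(BaseCrawler):
    """Hypothetical crawler for an imaginary magazine site."""

    def get_posts(self) -> List[str]:
        # Collect article links from the current listing-page soup.
        return [self.furbish_link(a["href"])
                for a in self.soup.select("a.article-link")  # assumed selector
                if a.get("href")]

    def get_date(self) -> date:
        return self.extract_time(
            self.metas.get("article:published_time", ""))  # assumed meta key

    def get_title(self) -> str:
        return self.metas.get("og:title", "")

    def get_content(self) -> str:
        body = self.soup.select_one("div.article-body")  # assumed selector
        return body.get_text(" ", strip=True) if body else ""

    def get_keywords(self) -> str:
        return self.metas.get("keywords", "")

    def get_post_type(self) -> Optional[str]:
        return None


# Example driver (not executed here; crawls and writes to the configured DB):
# ExampleCrawler(
#     table=ExampleMagazine,
#     brand_name="example",
#     main_url="https://example.com",
#     base_page_url="https://example.com/news?page=",
# ).go()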