import logging
import os
from datetime import datetime
from operator import eq

import requests
from lxml import html
from lxml.html import HtmlElement

# Project-local dependencies (not shown here): ConfigManager,
# CrawlConfiguration, DatabaseManager, Utils


class CategoryCrawl(object):
    CATEGORY_ID = 50000000  # Default value
    COLLECTION = 'category'
    _DELIMITER = 'category?catId='
    _PATH_TOKEN = '#'

    def __init__(self):
        # Crawl configuration manager - singleton
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager - all reads and writes go through here - singleton
        self.database_manager = DatabaseManager()
        # Preload the stored results so duplicates can be detected.
        self._category_list: list = list(
            self.database_manager.find_all_mongo(self.COLLECTION))
        self.CATEGORY_ID = self.crawl_config.category_id

    def _update(self, cid, name, paths: str):
        _query = self.database_manager.find_query('cid', cid)
        _update_data = dict()
        _update_data['name'] = name
        _update_data['paths'] = paths
        _update_data['update_time'] = datetime.now()
        # MongoDB's update operator is "$set", not "&set".
        return self.database_manager.update(self.COLLECTION, _query,
                                            {"$set": _update_data})

    def _insert(self, cid, name, paths: str, is_root: bool = False):
        """Mongo Database Insert"""
        for item in self._category_list:
            _name = item['name']
            _cid = item['cid']
            _paths = item['paths']
            if is_root:
                if eq(_name, name):
                    self._category_list.remove(item)
                    return
            else:
                if eq(_cid, cid):
                    if eq(_name, name) and eq(_paths, paths):
                        self._category_list.remove(item)
                        return
                    else:
                        # Pass the changed values through to the update.
                        self._update(cid, name, paths)
                        self._category_list.remove(item)
                        return
        _category_document = dict()
        _category_document['cid'] = cid
        _category_document['name'] = name
        _category_document['paths'] = paths
        _category_document['insert_time'] = datetime.now()
        return self.database_manager.insert_one_mongo(self.COLLECTION,
                                                      _category_document)

    def _is_exists(self, field, value: str):
        """Check MongoDB for a document whose field matches the given value."""
        _query = self.database_manager.find_query(field, value)
        return self.database_manager.count_document('category', _query) > 0

    def _parse_category(self, element: HtmlElement, root_paths: str):
        ul_tag: HtmlElement = element.find('ul')
        if ul_tag is not None:
            li_tags = ul_tag.findall('li')
            li: HtmlElement
            for li in li_tags:
                li_a_tag = li.find('a')
                if li_a_tag is not None:
                    _name = li_a_tag.text
                    _href = li_a_tag.get('href')
                    _cid = Utils.separate_right(_href, self._DELIMITER)
                    _paths = Utils.join_path(self._PATH_TOKEN, root_paths, _name)
                    self._insert(_cid, _name, _paths)
                    div_tag = li.find('div')
                    if div_tag is not None:
                        self._parse_category(div_tag, _paths)
                    if li.find('ul') is not None:
                        self._parse_category(li, _paths)

    def parse(self):
        for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
            _url = 'https://search.shopping.naver.com/category/category/{0}'
            logging.info("PID >> %s | CategoryID >> %d" % (os.getpid(), category_id))
            request = requests.get(_url.format(category_id))
            Utils.take_a_sleep(0, 1)
            # Status check
            if request.status_code != 200:
                return
            try:
                _content = request.content
                tree: HtmlElement = html.fromstring(_content)
                header_xpath = '//*[@id="__next"]/div/div[2]/h2'
                _root_name = tree.xpath(header_xpath)[0].text
                self._insert(str(category_id), _root_name, None, True)
                xpath = '//*[@id="__next"]/div/div[2]/div/div'
                elements: list = tree.xpath(xpath)
                element: HtmlElement
                for element in elements:
                    if element.find('div') is not None:
                        a_tag: HtmlElement = element[0].find('h3').find('a')
                        _name = a_tag.find('strong').text
                        _href = a_tag.get('href')
                        _cid = Utils.separate_right(_href, self._DELIMITER)
                        _paths = Utils.join_path(self._PATH_TOKEN, _root_name, _name)
                        self._insert(_cid, _name, _paths)
                        self._parse_category(element[0], _paths)
                    else:
                        logging.info('Element does not exist')
            except Exception as e:
                logging.error(str(e))
        # Remove category items that are no longer needed.
        for item in self._category_list:
            _query = self.database_manager.find_query('_id', item['_id'])
            self.database_manager.delete_one(self.COLLECTION, _query)

    def run(self):
        pass
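# ---------------------------------------------------------------------------
# The classes in this module lean on a project-local Utils helper whose
# implementation is not shown. A minimal sketch of what those helpers
# presumably do, inferred from the call sites above (the names match, but the
# bodies are assumptions, not the project's actual code):
import math
import random
import time


class Utils:
    @staticmethod
    def separate_right(source: str, delimiter: str) -> str:
        # Everything to the right of the delimiter, e.g.
        # separate_right('category?catId=50000003', 'category?catId=') -> '50000003'
        return source.split(delimiter, 1)[1] if delimiter in source else source

    @staticmethod
    def join_path(token: str, source: str, value: str) -> str:
        # Join path segments with the token; a root category has no parent.
        # join_path('#', '패션의류', '여성의류') -> '패션의류#여성의류'
        return value if source is None else source + token + value

    @staticmethod
    def calc_page(total_count: int, view_size: int) -> int:
        # Pages needed to cover total_count items at view_size items per page.
        return math.ceil(total_count / view_size)

    @staticmethod
    def take_a_sleep(low: float, high: float):
        # Sleep a random interval to spread requests out.
        time.sleep(random.uniform(low, high))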
import asyncio
import json
import logging
import math
import re
import time
from datetime import datetime
from operator import eq
from typing import Optional

import requests
from bs4 import BeautifulSoup

# Project-local dependencies (not shown here): ConfigManager,
# CrawlConfiguration, DatabaseManager, Utils


class ProductCrawl:
    """Crawler that collects product information from product list pages."""

    PRODUCT_COLLECTION = "product"
    CRAWL_CONFIG_COLLECTION = "crawl_config"
    _excepted_data_count = 0

    def __init__(self):
        logging.info('start product crawl')
        # Database manager - all reads and writes go through here - singleton
        self.database_manager = DatabaseManager()
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Start page default
        self._paging_start: int = 1
        self._view_size: int = 80  # self.crawl_config.crawl_count
        # Must be checked first: on re-crawl, load the config from the DB.
        # TODO: handle later
        # self._check_crawl_configuration()
        self._result: list = []
        self._category: Optional[dict] = None
        self.productInfo_arr = []
        self._current_page: int = 0
        self.last_crawled_date_time = datetime.now()

    def _upsert_crawl_configuration(self, start_page):
        """Update the config document once the whole run has finished."""
        # Selection criteria
        _filter = {}
        # Changed data
        _config = dict()
        _config['start_page'] = start_page
        self.database_manager.update(self.CRAWL_CONFIG_COLLECTION, _filter, _config)

    def _check_crawl_configuration(self):
        """Load the stored crawl configuration."""
        _config: dict = self.database_manager.find_one(self.CRAWL_CONFIG_COLLECTION)
        if _config.get('start_page') is not None:
            self._paging_start = _config['start_page']
        if _config.get('crawl_category_list') is not None:
            self.crawl_config.crawl_category = _config['crawl_category_list']

    def _category_getter(self, crawl_category: list) -> list:
        """Look up the categories to analyse.

        :return: list of category documents
        """
        _categories: list = []
        if crawl_category is None:
            crawl_category = self.crawl_config.crawl_category
        for item in crawl_category:
            query = self.database_manager.keyword_query('paths', item)
            _categories.extend(list(self.database_manager.find('category', query=query)))
        return _categories

    def make_url(self, paging_index: int, frm: str = "NVSHMDL", _filter: str = "") -> str:
        """Build a URL from the category id, page size, and page number."""
        _url = ("https://search.shopping.naver.com/search/category"
                "?catId={0}&frm={1}{2}&origQuery&pagingIndex={3}&pagingSize={4}"
                "&productSet=model&query&sort=rel&timestamp=&viewType=list")
        _cid = self._category['cid']
        return _url.format(_cid, frm, _filter, paging_index, self._view_size)

    async def parse(self, identifier: str, context: dict, crawl_category: list = None):
        """Entry point called from outside to start parsing."""
        _categories: list = self._category_getter(crawl_category)
        for category in _categories:
            await asyncio.sleep(1)
            jobs = context['jobs']
            job_info = jobs[identifier]
            job_info['status'] = 'in Progress'
            job_info['category'] = category.get('name')
            self._category = category
            # Start of the parsing process
            self._current_page = 0
            # Default = 1
            _url = self.make_url(paging_index=1)
            _total_count, _filter = self._get_base_data(_url)
            # The page condition may need changing.
            _is_oversize = _total_count > 8000
            # Page calculation
            _page_size = Utils.calc_page(_total_count, self._view_size)
            if _is_oversize:
                await self._filter_parse(_filter)
            else:
                await self._execute_parse(_page_size)
            logging.info('>>> end childCategory: ' + self._category.get('name')
                         + ' Pg.' + str(self._current_page))
        job_info['status'] = 'done'

    def _make_list(self, _min, _max, _half):
        # Split [min, max] into two sub-ranges around the midpoint.
        return [[_min, _half], [_half, _max]]

    async def _filter_parse_recursive(self, min_value, max_value):
        _param = "&maxPrice={0}&minPrice={1}".format(str(max_value), str(min_value))
        _url = self.make_url(1, "NVSHPRC", _param)
        _total_count, _filter = self._get_base_data(_url)
        _is_oversize = _total_count > 8000
        _page_size = Utils.calc_page(_total_count, self._view_size)
        if _is_oversize:
            # Still too many results: halve the price range and recurse.
            half_price = math.ceil((min_value + max_value) / 2)
            _range = self._make_list(min_value, max_value, half_price)
            for value in _range:
                await self._filter_parse_recursive(value[0], value[1])
        else:
            await self._execute_parse(_page_size, _param)

    async def _filter_parse(self, filters: list):
        # Called only once per oversized category.
        for _filter in filters:
            _filterAction = _filter.get('filterAction')
            _separator = "-"  # default separator
            if _filterAction is not None:
                _separator = _filterAction.get('separator')
            # Split the price range
            _value: str = _filter.get('value')
            _min = 0
            _max = 0
            if _value is not None:
                _min, _max = (int(_price) for _price in _value.split(_separator))
            logging.info("Filter Parse >> min {0} / max {1}".format(_min, _max))
            await self._filter_parse_recursive(_min, _max)

    async def _execute_parse(self, page_count, filter_param: str = ""):
        # Pages are 1-indexed; include the final page.
        for page_number in range(1, page_count + 1):
            try:
                _url = self.make_url(page_number, _filter=filter_param)
                self.parse_data(self._get_product_json(_url))
                logging.info(">>> URL : " + _url)
                logging.info('>>> start parsing: ' + self._category.get('name')
                             + ' Pg.' + str(page_number))
                self._current_page = page_number
            except Exception as e:
                logging.debug(">>> Category Collect Err " + str(self._current_page)
                              + " name: " + self._category.get('name')
                              + " Err :" + str(e))

    def _get_product_json(self, url) -> dict:
        """Fetch the product information.

        :param url: request URL
        :return: data_dict with the product information
        """
        try:
            _headers = {'Content-Type': 'application/json;'}
            # Headers must be passed by keyword; the second positional
            # argument of requests.get is `params`, not `headers`.
            req = requests.get(url, headers=_headers)
            soup = BeautifulSoup(req.text, 'html.parser')
            json_data = soup.find('script', text=re.compile('application/json'))
            data_dict = json.loads(str(json_data.contents[0]))
        except Exception as e:
            # Sleep time may need tuning - 8 seconds can be too short.
            time.sleep(8)
            # An abnormal request was detected - request the URL again.
            logging.error("no find Data request Error >> {0} | URL >> {1}".format(e, url))
            return self._get_product_json(url)
        return data_dict

    def parse_data(self, data_dict):
        """Parse the collected data."""
        product_info: dict = self._get_data(data_dict, 'products')
        if product_info is not None:
            # Data was collected.
            product_list: list = product_info.get('list')
            self._excepted_data_count = 0
            logging.info("Collection started - product count: " + str(len(product_list)))
            if len(product_list) > 0:
                for product in product_list:
                    product_data = dict()
                    product_item = product.get('item')
                    if product_item.get('adId') is None:
                        # Collect only non-advertisement items.
                        # Set the category information
                        self._set_category_info(product_data)
                        # Set the product information
                        self._set_product_info(product_data, product_item)
                        self._insert_product_info(product_data)
                    else:
                        self._excepted_data_count += 1
            else:
                logging.error('!!! Exception: no product information.')
            # if len(product_list) != len(products_data) + self._excepted_data_count:
            #     logging.error("!!! Exception: the data counts need checking.")
            # logging.info("collected: " + str(len(products_data)))
            # logging.info("excluded: " + str(self._excepted_data_count))
        else:
            logging.error('!!! Exception: no data was collected.')

    def _set_category_info(self, product_data: dict):
        """Attach the category information to the product record.

        :param product_data: product record being built
        """
        product_data['n_cid'] = self._category.get('cid')
        product_data['cid'] = self._category.get('_id')
        product_data['paths'] = self._category.get('paths')
        product_data['cname'] = self._category.get('name')

    def _set_product_info(self, product_data: dict, product_item):
        product_data['n_id'] = product_item.get('id')
        product_data['imageUrl'] = product_item.get('imageUrl')
        product_data['title'] = product_item.get('productTitle')
        product_data['price'] = product_item.get('price')
        _attribute: str = product_item.get('attributeValue', "")
        _attribute_value: str = product_item.get('characterValue', "")
        if (_attribute != "") and (_attribute_value != ""):
            # Option information is present.
            product_option_key: list = _attribute.split('|')           # option keys
            product_option_value: list = _attribute_value.split('|')   # option values
            product_data['option'] = dict(zip(product_option_key, product_option_value))

    def _insert_product_info(self, value: dict):
        """Insert the record into the DB."""
        try:
            # TODO: where should the value comparison happen?
            _selection = self.database_manager.find_query("n_id", value.get("n_id"))
            self.database_manager.update(self.PRODUCT_COLLECTION, _selection, value)
        except Exception as e:
            logging.error('!!! Fail: Insert data to DB: %s', e)

    def _get_base_data(self, url):
        _data = self._get_product_json(url)
        _total_count = 0
        value_filters: Optional[list] = None
        if _data is not None:
            products = self._get_data(_data, 'products')
            if products is not None:
                _total_count = products.get('total')
                _total_count = int(_total_count) if _total_count is not None else 0
            filters = self._get_data(_data, 'mainFilters')
            if filters is not None:
                value_filters = self._get_filter(filters)
        return _total_count, value_filters

    def _get_data(self, data: dict, _type: str):
        return data.get('props', {}).get('pageProps', {}) \
                   .get('initialState', {}).get(_type)

    def _get_filter(self, main_filters: list) -> Optional[list]:
        value_filters = None
        for _filter in main_filters:
            _filterType: str = _filter.get('filterType')
            if (_filterType is not None) and eq(_filterType, 'price'):
                value_filters = _filter.get('filterValues')
                if value_filters is not None:
                    break
        return value_filters
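# ---------------------------------------------------------------------------
# ProductCrawl.parse is a coroutine that reports progress through the mutable
# context['jobs'] mapping; when a category reports more than 8,000 items it
# bisects the price range via _filter_parse_recursive until each slice fits.
# A minimal invocation sketch (the job identifier, context layout, and
# category keyword below are illustrative assumptions, not project fixtures):
import asyncio


async def run_product_crawl():
    crawler = ProductCrawl()
    context = {'jobs': {'job-1': {'status': 'queued'}}}
    # Categories are matched by keyword against their stored 'paths' field.
    await crawler.parse('job-1', context, crawl_category=['패션의류'])
    print(context['jobs']['job-1']['status'])  # 'done' after every category


# asyncio.run(run_product_crawl())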
import logging
import re
import time
from datetime import datetime
from operator import eq

from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.remote.webelement import WebElement

# Project-local dependencies (not shown here): ConfigManager,
# CrawlConfiguration, DatabaseManager, Selenium, Utils


class CategoryCrawl(object):
    URL = 'https://search.shopping.naver.com/category/category/{0}'
    CATEGORY = 50000000
    DELIMITER = 'cat_id='
    COLLECTION = 'category'

    def __init__(self):
        # Chrome Selenium driver - singleton
        self.driver = Selenium().driver
        # Crawl configuration manager - singleton
        self.crawl_config: CrawlConfiguration = ConfigManager().crawl_config
        # Database manager - all reads and writes go through here - singleton
        self.database_manager = DatabaseManager()
        # Preload the stored results so duplicates can be detected.
        self._category_list: list = list(
            self.database_manager.find_all_mongo(self.COLLECTION))

    def _update(self, cid, name, paths: str):
        _query = self.database_manager.find_query('cid', cid)
        _update_data = dict()
        _update_data['name'] = name
        _update_data['paths'] = paths
        _update_data['update_time'] = datetime.now()
        # MongoDB's update operator is "$set", not "&set".
        return self.database_manager.update(self.COLLECTION, _query,
                                            {"$set": _update_data})

    def _insert(self, cid, name, paths: str, is_root: bool = False):
        """Mongo Database Insert"""
        for item in self._category_list:
            _name = item['name']
            _cid = item['cid']
            _paths = item['paths']
            if is_root:
                if eq(_name, name):
                    self._category_list.remove(item)
                    return
            else:
                if eq(_cid, cid):
                    if eq(_name, name) and eq(_paths, paths):
                        self._category_list.remove(item)
                        return
                    else:
                        # Pass the changed values through to the update.
                        self._update(cid, name, paths)
                        self._category_list.remove(item)
                        return
        _category_document = dict()
        _category_document['cid'] = cid
        _category_document['name'] = name
        _category_document['paths'] = paths
        _category_document['insert_time'] = datetime.now()
        return self.database_manager.insert_one_mongo(self.COLLECTION,
                                                      _category_document)

    def _is_exists(self, field, value: str):
        """Check MongoDB for a document whose field matches the given value."""
        _query = self.database_manager.find_query(field, value)
        return self.database_manager.count_document('category', _query) > 0

    def parse(self):
        self.driver.get(self.URL)
        try:
            for category in self.driver.find_elements_by_xpath(
                    '//*[@id="home_category_area"]/div[1]/ul/li'):
                time.sleep(1)
                self._parse_root(category)
            # Remove category items that are no longer needed.
            for item in self._category_list:
                _query = self.database_manager.find_query('_id', item['_id'])
                self.database_manager.delete_one(self.COLLECTION, _query)
        except Exception as e:
            logging.error(str(e))

    def _parse_root(self, category: WebElement):
        # Root name
        root_name: str = category.text
        # root_name = root_name.replace('/', '-')
        logging.info('rootName : ' + root_name)
        for exclude_category in self.crawl_config.exclude_category:
            if eq(root_name, exclude_category):
                return None
        class_att = category.get_attribute('class')
        click_xpath = '//*[@id="home_{0}"]'.format(class_att)
        self.driver.implicitly_wait(5)
        # Try clicking first.
        self.driver.find_element_by_xpath(click_xpath).send_keys(Keys.ENTER)
        # Build the inner xpath from class_att.
        time.sleep(1)
        xpath_cate = '//*[@id="home_{0}_inner"]/div[1]'.format(class_att)
        # Root category
        element: WebElement = None
        while element is None:
            # If the click event did not register, keep clicking.
            self.driver.find_element_by_xpath(click_xpath).send_keys(Keys.ENTER)
            self.driver.implicitly_wait(4)
            time.sleep(1)
            element = self.driver.find_element_by_xpath(xpath_cate)
        self._insert(None, root_name, None, True)
        # Root -> sub
        co_col_elements = element.find_elements(By.CLASS_NAME, 'co_col')
        self._parse_co_col(co_col_elements, root_name)

    def _parse_co_cel(self, co_cel_elements, root_name):
        co_cel: WebElement
        for co_cel in co_cel_elements:
            # href
            sub_href = co_cel.find_element_by_tag_name('a').get_attribute('href')
            # cid
            _cid = Utils.separate_right(sub_href, self.DELIMITER)
            sub_element: WebElement = co_cel.find_element_by_tag_name('strong')
            # name
            _name = sub_element.find_element_by_tag_name('a').text
            _name = re.sub("전체보기", "", _name)  # strip the "view all" label
            # paths
            _paths = Utils.join_path(token='#', source=root_name, value=_name)
            # cid, name, paths
            self._insert(_cid, _name, _paths)
            # Child category list
            child_items: list = co_cel.find_elements(By.TAG_NAME, 'li')
            self._parse_child(child_items, _paths)

    def _parse_co_col(self, sub_category, root_name):
        co_col: WebElement
        for co_col in sub_category:
            time.sleep(1)
            # Mid-level categories
            co_cel_elements = co_col.find_elements_by_class_name('co_cel')
            self._parse_co_cel(co_cel_elements, root_name)

    def _parse_child(self, child_items, sub_paths):
        child_item: WebElement
        for child_item in child_items:
            time.sleep(1)
            # href
            _href = child_item.find_element_by_tag_name('a').get_attribute('href')
            # cid
            _cid = Utils.separate_right(_href, self.DELIMITER)
            # name
            _name = child_item.text
            # paths
            _paths = Utils.join_path(token='#', source=sub_paths, value=_name)
            self._insert(_cid, _name, _paths)
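# ---------------------------------------------------------------------------
# A minimal run sketch for the Selenium-based variant (it assumes the
# project's Selenium singleton is configured with a working ChromeDriver):
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawler = CategoryCrawl()
    crawler.parse()  # walk the category tree and sync the 'category' collection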