# Shared imports for the snippets in this section (project-local helpers such
# as `Utils`, `database_manager`, and the parser base classes are omitted).
import asyncio
import json
import logging
import math
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from lxml import html
from lxml.html import HtmlElement
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement


async def parse(self, identifier: str, context: dict):
    logging.info("Category Crawl Start >> WEB")
    _url = 'https://search.shopping.naver.com/category/category/{0}'
    for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
        await asyncio.sleep(1)
        logging.info("PID >> %s | CategoryID >> %d", os.getpid(), category_id)
        jobs = context['jobs']
        job_info = jobs[identifier]
        request = requests.get(_url.format(category_id))
        # Status check: skip this category on a non-200 response
        # (an early `return` here would leave the job stuck "in progress").
        if request.status_code != 200:
            continue
        try:
            _content = request.content
            tree: HtmlElement = html.fromstring(_content)
            header_xpath = '//*[@id="__next"]/div/div[2]/h2'
            _root_name = tree.xpath(header_xpath)[0].text
            job_info['status'] = 'in progress'
            job_info['name'] = _root_name
            self.crawl_status(str(category_id), _root_name, request.status_code)
            self._insert(str(category_id), _root_name, None, True)
            xpath = '//*[@id="__next"]/div/div[2]/div/div'
            elements: list[HtmlElement] = tree.xpath(xpath)
            element: HtmlElement
            for element in elements:
                if element.find('div') is not None:
                    a_tag: HtmlElement = element[0].find('h3').find('a')
                    _name = a_tag.find('strong').text
                    _href = a_tag.get('href')
                    _cid = Utils.separate_right(_href, self._DELIMITER)
                    _paths = Utils.join_path(self._PATH_TOKEN, _root_name, _name)
                    self._insert(_cid, _name, _paths)
                    self._parse_category(element[0], _paths)
                else:
                    logging.info('Element does not exist')
        except Exception as e:
            logging.error(str(e))
    # Remove category items that are no longer needed.
    for item in self._category_list:
        _query = self.database_manager.find_query('_id', item['_id'])
        self.database_manager.delete_one(self.COLLECTION, _query)
    logging.info("Category Crawl END >> WEB")
    context['jobs'][identifier]['status'] = 'done'
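# The parsers in this section lean on a small `Utils` helper that is not shown
# here. A minimal sketch of the two string helpers, with names and behavior
# inferred from the call sites -- an illustration, not the actual implementation:
class Utils:
    @staticmethod
    def separate_right(source: str, delimiter: str) -> str:
        # Return the text to the right of the delimiter,
        # e.g. separate_right('/category?catId=50000167', 'category?catId=') -> '50000167'
        return source.split(delimiter, 1)[1] if delimiter in source else source

    @staticmethod
    def join_path(token: str, source: str, value: str) -> str:
        # Join path segments with a token,
        # e.g. join_path('#', '패션의류', '여성의류') -> '패션의류#여성의류'
        return f"{source}{token}{value}" if source else value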
def _parse_child(self, child_items, sub_paths):
    child_item: WebElement
    for child_item in child_items:
        time.sleep(1)
        # href (Selenium 4 style: find_element(By.TAG_NAME, ...) replaces the
        # removed find_element_by_tag_name helper)
        _href = child_item.find_element(By.TAG_NAME, 'a').get_attribute('href')
        # cid
        _cid = Utils.separate_right(_href, self.DELIMITER)
        # name
        _name = child_item.text
        # paths
        _paths = Utils.join_path(token='#', source=sub_paths, value=_name)
        self._insert(_cid, _name, _paths)
async def parse(self, identifier: str, context: dict, crawl_category: list = None):
    """Entry point called from outside to run the parsing."""
    _categories: list = self._category_getter(crawl_category)
    for category in _categories:
        await asyncio.sleep(1)
        jobs = context['jobs']
        job_info = jobs[identifier]
        job_info['status'] = 'in progress'
        job_info['category'] = category.get('name')
        self._category = category
        # Start the parsing process.
        self._current_page = 0  # default page index is 1
        _url = self.make_url(paging_index=1)
        _total_count, _filter = self._get_base_data(_url)
        # TODO: the page-size threshold may need revisiting.
        _is_oversize = _total_count > 8000
        # Compute the number of pages.
        _page_size = Utils.calc_page(_total_count, self._view_size)
        if _is_oversize:
            # Awaited on the assumption it is a coroutine like _execute_parse
            # (see the async _filter_parse_recursive below).
            await self._filter_parse(_filter)
        else:
            await self._execute_parse(_page_size)
        logging.info('>>> end childCategory: %s Pg.%d',
                     self._category.get('name'), self._current_page)
    job_info['status'] = 'done'
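# A minimal sketch of how the async `parse` entry point above might be driven.
# The shape of the `jobs` dict ('status'/'category' keys per job id) comes from
# the method body; the job id value and `run_crawl` name are made up for
# illustration.
async def run_crawl(parser):
    context = {'jobs': {'job-1': {}}}
    await parser.parse('job-1', context)
    print(context['jobs']['job-1']['status'])  # 'done' once all categories finish

# asyncio.run(run_crawl(parser))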
def parse(self):
    _url = 'https://search.shopping.naver.com/category/category/{0}'
    for category_id in range(self.CATEGORY_ID, self.CATEGORY_ID + 11):
        logging.info("PID >> %s | CategoryID >> %d", os.getpid(), category_id)
        request = requests.get(_url.format(category_id))
        Utils.take_a_sleep(0, 1)
        # Status check: skip this category on a non-200 response.
        if request.status_code != 200:
            continue
        try:
            _content = request.content
            tree: HtmlElement = html.fromstring(_content)
            header_xpath = '//*[@id="__next"]/div/div[2]/h2'
            _root_name = tree.xpath(header_xpath)[0].text
            self._insert(str(category_id), _root_name, None, True)
            xpath = '//*[@id="__next"]/div/div[2]/div/div'
            elements: list[HtmlElement] = tree.xpath(xpath)
            element: HtmlElement
            for element in elements:
                if element.find('div') is not None:
                    a_tag: HtmlElement = element[0].find('h3').find('a')
                    _name = a_tag.find('strong').text
                    _href = a_tag.get('href')
                    _cid = Utils.separate_right(_href, self._DELIMITER)
                    _paths = Utils.join_path(self._PATH_TOKEN, _root_name, _name)
                    self._insert(_cid, _name, _paths)
                    self._parse_category(element[0], _paths)
                else:
                    logging.info('Element does not exist')
        except Exception as e:
            logging.error(str(e))
    # Remove category items that are no longer needed.
    for item in self._category_list:
        _query = self.database_manager.find_query('_id', item['_id'])
        self.database_manager.delete_one(self.COLLECTION, _query)
def category(i):
    # URL = "https://search.shopping.naver.com/category/category/" + str(i)
    URL = "https://search.shopping.naver.com/too-many-request"
    headers = {'Content-Type': 'application/json;'}
    # Pass headers by keyword; the second positional argument of requests.get
    # is `params`, not `headers`.
    req = requests.get(URL, headers=headers)
    content = req.content
    soup = BeautifulSoup(content, 'html.parser')  # parse the response with html.parser
    json_data = soup.find('script', text=re.compile('application/json'))
    try:
        data_dict = json.loads(str(json_data.contents[0]))
    except Exception as e:
        print(e)
    # tree: HtmlElement = etree.fromstring(content)
    tree: HtmlElement = html.fromstring(content)
    header_xpath = '//*[@id="__next"]/div/div[2]/h2'
    header = tree.xpath(header_xpath)[0].text
    xpath = '//*[@id="__next"]/div/div[2]/div/div'
    elements: list[HtmlElement] = tree.xpath(xpath)
    element: HtmlElement
    for idx, element in enumerate(elements):  # avoid shadowing the parameter `i`
        print(idx)
        try:
            if element.find('div') is not None:
                a_tag: HtmlElement = element[0].find('h3').find('a')
                href = a_tag.get('href')
                _cid = Utils.separate_right(href, "category?catId=")
                h3_tag = a_tag.find('strong').text
                paths = Utils.join_path('#', header, h3_tag)
                sub_category(element[0], paths)
        except Exception as e:
            print(e)
def _parse_category(self, element: HtmlElement, root_paths: str):
    ul_tag: HtmlElement = element.find('ul')
    if ul_tag is not None:
        li_tags = ul_tag.findall('li')
        li: HtmlElement
        for li in li_tags:
            li_a_tag = li.find('a')
            if li_a_tag is not None:
                _name = li_a_tag.text
                _href = li_a_tag.get('href')
                _cid = Utils.separate_right(_href, self._DELIMITER)
                _paths = Utils.join_path(self._PATH_TOKEN, root_paths, _name)
                self._insert(_cid, _name, _paths)
                # Recurse into nested category blocks; kept inside the `if`
                # so `_paths` is always bound when it is passed down.
                div_tag = li.find('div')
                if div_tag is not None:
                    self._parse_category(div_tag, _paths)
                if li.find('ul') is not None:
                    self._parse_category(li, _paths)
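# The recursive walk above expects markup shaped roughly like this, inferred
# from the find()/findall() calls (the real page is more complex):
#
#   <div>
#     <ul>
#       <li><a href="...catId=50000167">여성의류</a>
#         <div>
#           <ul>
#             <li><a href="...catId=50000168">원피스</a></li>
#           </ul>
#         </div>
#       </li>
#     </ul>
#   </div>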
def _parse_co_cel(self, co_cel_elements, root_name):
    co_cel: WebElement
    for co_cel in co_cel_elements:
        # href (Selenium 4 style: find_element(By.TAG_NAME, ...))
        sub_href = co_cel.find_element(By.TAG_NAME, 'a').get_attribute('href')
        # cid
        _cid = Utils.separate_right(sub_href, self.DELIMITER)
        sub_element: WebElement = co_cel.find_element(By.TAG_NAME, 'strong')
        # name: strip the "전체보기" ("view all") label from the link text
        _name = sub_element.find_element(By.TAG_NAME, 'a').text
        _name = re.sub("전체보기", "", _name)
        # paths
        _paths = Utils.join_path(token='#', source=root_name, value=_name)
        # cid, name, paths
        self._insert(_cid, _name, _paths)
        # List of child categories
        child_items: list[WebElement] = co_cel.find_elements(By.TAG_NAME, 'li')
        self._parse_child(child_items, _paths)
async def _filter_parse_recursive(self, min_value, max_value):
    # Declared async: the `await` calls below are a SyntaxError in a plain def.
    _param = "&maxPrice={0}&minPrice={1}".format(str(max_value), str(min_value))
    _url = self.make_url(1, "NVSHPRC", _param)
    _total_count, _filter = self._get_base_data(_url)
    _is_oversize = _total_count > 8000
    _page_size = Utils.calc_page(_total_count, self._view_size)
    if _is_oversize:
        # Too many results: split the price range in half and recurse
        # until each sub-range fits under the threshold.
        half_price = math.ceil((min_value + max_value) / 2)
        _range = self._make_list(min_value, max_value, half_price)
        for value in _range:
            await self._filter_parse_recursive(value[0], value[1])
    else:
        await self._execute_parse(_page_size, _param)
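# `_make_list` is not shown in this section. From the recursion above it must
# split [min_value, max_value] into sub-ranges around `half_price`, so each
# recursive call covers a disjoint half of the price range. A plausible sketch
# under that assumption, not the actual implementation:
def _make_list(self, min_value, max_value, half_price):
    # Two (min, max) pairs that together cover the original range.
    return [(min_value, half_price), (half_price + 1, max_value)]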
def _start_parsing_process(self):
    """Start the parsing process."""
    self._current_page = 0  # default page index is 1
    _url = self.make_url(paging_index=1)
    _total_count, _filter = self._get_base_data(_url)
    # TODO: the page-size threshold may need revisiting.
    _is_oversize = _total_count > 8000
    # Compute the number of pages.
    _page_size = Utils.calc_page(_total_count, self._view_size)
    if _is_oversize:
        self._filter_parse(_filter)
    else:
        self._execute_parse(_page_size)
    logging.info('>>> end childCategory: %s Pg.%d',
                 self._category.get('name'), self._current_page)
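# `Utils.calc_page`, used above and in the async variant, is not shown either.
# Given a total result count and the view size, it presumably computes the
# page count by ceiling division -- a sketch assuming exactly that:
import math

def calc_page(total_count: int, view_size: int) -> int:
    # e.g. calc_page(8100, 80) -> 102 pages
    return math.ceil(total_count / view_size)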
def sub_category(element: HtmlElement, root_path: str):
    ul_tag: HtmlElement = element.find('ul')
    if ul_tag is not None:
        li_tags = ul_tag.findall('li')
        li: HtmlElement
        for li in li_tags:
            try:
                li_a_tag = li.find('a')
                if li_a_tag is not None:
                    href = li_a_tag.get('href')
                    text = li_a_tag.text
                    paths = Utils.join_path('#', root_path, text)
                    div_tag = li.find('div')
                    if div_tag is not None:
                        sub_category(div_tag, paths)
                    if li.find('ul') is not None:
                        sub_category(li, paths)
            except Exception as e:
                print(e)