def spider_opened(self, spider):
    """Log that *spider* was opened and initialise ``UrlsState``.

    Presumably connected to Scrapy's ``spider_opened`` signal -- the
    connection itself is not visible in this part of the file.

    Args:
        spider: The opened spider; its ``logger`` and ``name`` are used.
    """
    # Pass the name as a logging argument (the same form spider_closed uses)
    # so the message is only interpolated when the record is emitted.
    spider.logger.info('Spider opened: %s', spider.name)
    UrlsState.init()
    # Debug output: show UrlsState.parsed_urls as it stands right after init().
    print(UrlsState.parsed_urls)
    print(
        '##########################################################################################'
    )
def __is_first_request(self, url):
    """Tell whether *url* is being requested for the first time.

    When ``UrlsState`` has no state for *url* yet, the state is created and
    the answer is ``True``. Otherwise the value stored under
    ``UrlsState.KEY_IS_FIRST_REQUEST`` for that url is returned as-is.

    Args:
        url: Key used to look up the state in ``UrlsState``.

    Returns:
        ``True`` for a url without state, else the stored flag.
    """
    print('#_is_first_request')

    # Unknown url: register it and report a first request.
    if not UrlsState.exist_url_state(url):
        UrlsState._create_parsed_url_state(url)
        return True

    return UrlsState.get_url_data(url, UrlsState.KEY_IS_FIRST_REQUEST)
def __create_a_new_tab(self, url):
    """Assign a tab index to *url*, open it in the driver and clear its first-request flag.

    Args:
        url: The url to load; also the key used for the tab index lookup.
    """
    UrlsState.set_new_index_tab(url)
    needs_extra_window = UrlsState.get_index_tab(url) != 0

    # Index 0 reuses the window the driver already has; any other index
    # gets a fresh blank window opened through JavaScript first.
    if needs_extra_window:
        self.driver.execute_script("window.open('');")

    self.__select_tab(url)
    self.driver.get(url)

    # NOTE(review): this writes to self.parsed_urls while the rest of the
    # method goes through UrlsState -- confirm the attribute still exists
    # and is the intended store for the flag.
    self.parsed_urls[url]['is_first_request'] = False
def process_spider_input(self, response, spider):
    """Update ``UrlsState`` from a response on its way into the spider.

    * A url containing ``IeSpiderMiddleware.JOBS_URL_PATH`` updates the state
      of the start url with the total results carried in ``response.meta``.
    * A url containing ``'?page=0'`` is reported as "page not found" and the
      state of its start url is reset.
    * Any other url is left untouched.

    Args:
        response: The response being processed; ``url`` and ``meta`` are read.
        spider: Unused by this method.

    Returns:
        Always ``None``.
    """
    response_url = response.url
    print("#IeSpiderMiddleware.process_spider_input: {}".format(response_url))

    if IeSpiderMiddleware.JOBS_URL_PATH in response_url:
        start_url = response.meta[UrlsState.KEY_START_URL]
        total_results = response.meta[UrlsState.KEY_TOTAL_RESULTS]
        UrlsState.update_url_state(start_url, total_results)
        return None

    if '?page=0' in response_url:
        print('# page not found')
        # Debug print only: a failure here is reported and does not stop
        # the reset below (which performs the same lookup unguarded).
        try:
            print(response.meta[UrlsState.KEY_START_URL])
        except Exception as e:
            print(f'Error!!! {e}')
        UrlsState.reset_url_state(response.meta[UrlsState.KEY_START_URL])

    # Should return None or raise an exception.
    return None
def process_start_requests(self, start_requests, spider):
    """Yield the spider's start requests with their urls swapped for pending urls.

    Each request's url is replaced by whatever ``UrlsState.get_pending_url``
    returns for it (per the original note: the url carrying the pending
    page query), then the same request object is yielded.

    Args:
        start_requests: Iterable of the spider's start requests.
        spider: Unused by this method.

    Yields:
        The incoming request objects, mutated in place. Only requests
        (not items) are produced.
    """
    print('# ###process_start_requests')
    for start_request in start_requests:
        pending_url = UrlsState.get_pending_url(start_request.url)
        # NOTE(review): _set_url is a private method of the request object;
        # Request.replace(url=...) is the public way to change a url.
        start_request._set_url(pending_url)
        print(start_request)
        print('end')
        yield start_request
def __select_tab(self, url):
    """Switch the driver to the window whose index ``UrlsState`` stores for *url*.

    Args:
        url: Key used to look up the tab index in ``UrlsState``.
    """
    switcher = self.driver.switch_to
    handles = self.driver.window_handles
    tab_index = UrlsState.get_index_tab(url)
    switcher.window(handles[tab_index])
def process_request(self, request, spider):
    """Serve job-results pages from ``self.driver`` instead of the downloader.

    Requests whose url fails ``__is_a_valid_job_results_url`` only produce
    debug output and return ``None``. For every other request the wanted
    page is reached through ``self.driver`` and the driver's current page
    source is returned wrapped in an ``HtmlResponse``.

    Args:
        request: The request being processed; ``request.url`` and
            ``request.meta`` are read.
        spider: Unused by this method.

    Returns:
        ``None`` for urls that are not handled here, otherwise an
        ``HtmlResponse`` whose body is ``self.driver.page_source``.
    """
    # Called for each request that goes through the downloader
    # middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called

    # The request.url must be:
    # - a job url
    # - a job results page url
    url = request.url
    # Url reported on the returned response; only overwritten below when a
    # first request jumps straight to a page between 2 and 499.
    url_to_parse = request.url
    print('url: %s' % url)
    print('url_to_parse: %s' % url_to_parse)
    if not self.__is_a_valid_job_results_url(url):
        print('#This url will not process by selenium: %s' % url)
        # Everything in this try block is debug output; the bare except
        # below swallows any failure so the request always continues.
        try:
            print(request)
            try:
                print(request.meta)
                print(request.meta[UrlsState.KEY_START_URL])
            except:
                print('1')
            clean_url = self.__clean_url(
                request.meta[UrlsState.KEY_START_URL])
            print('#parsing job from: %s' % clean_url)
            ## self.__update_parsed_url(clean_url)
            print()
            print()
            print('# request.meta: %s' %
                  request.meta[UrlsState.KEY_START_URL])
            print()
            print()
        except:
            pass
        return None
    else:
        print('IM SELENIUM PROCESSING THE REQUEST: %s' % request.url)
        # NOTE(review): the lazy driver creation below is commented out, so
        # self.driver is assumed to be initialised elsewhere -- confirm.
        ## if not self.driver: self.driver = self.__get_chrome_browser(request)
        clean_url = self.__clean_url(url)
        page = self.__get_number_page_from_the_url(url)
        print('clean url: %s' % clean_url)
        print('page: %i' % page)
        try:
            if self.__is_first_request(clean_url):
                print(
                    'Selenium is going to get the page %i of %s for first time'
                    % (page, url))
                # NOTE(review): the tab is created with the raw url while
                # the first-request check and later tab selection use
                # clean_url -- confirm both keys resolve to the same state.
                self.__create_a_new_tab(url)
                if page == 0:
                    # Page 0 on a first request: continue from the page
                    # UrlsState reports as pending.
                    page = UrlsState.get_pending_page(clean_url)
                if page > 1 and page < 500:
                    self.__go_to_the_page(page)
                    url_to_parse = '%s%s/' % (clean_url, str(page))
                elif page >= 500:
                    # Pages from 500 on are reached by jumping to 499 and
                    # stepping forward one page at a time; presumably a
                    # direct jump is not possible that far -- TODO confirm.
                    self.__go_to_the_page(499)
                    next_page = 500
                    while next_page <= page:
                        actual_page = self._get_actual_page_from_driver()
                        # NOTE(review): if the helper can return None, this
                        # %i raises TypeError before the truthiness check
                        # below; the outer except then reboots the tab.
                        print('actual_page: %i, next_page: %i' %
                              (actual_page, next_page))
                        if actual_page and ((actual_page + 1) == next_page):
                            self.__go_to_the_next_page(next_page)
                            next_page += 1
                        else:
                            print('#something wrong')
                            break
            else:
                print('Selenium is going to get the next page %i of %s' %
                      (page, url))
                if page == 0:
                    # Page 0 on a later request is treated as "all pages
                    # parsed": the state is reset and the driver is sent to
                    # a page number 555 past the pending one (presumably
                    # chosen so that it does not exist -- TODO confirm).
                    inexistent_page = UrlsState.get_pending_page(
                        clean_url) + 555
                    # NOTE(review): reset uses the raw url, not clean_url.
                    UrlsState.reset_url_state(url)
                    print('# Im selenium, all the pages have been parsed')
                    self.__go_to_the_page(inexistent_page)
                else:
                    self.__select_tab(clean_url)
                    actual_page = self._get_actual_page_from_driver()
                    print('#actual_page: %i' % actual_page)
                    # Step forward only when the driver sits exactly one
                    # page before the requested one; otherwise start over.
                    if actual_page and ((actual_page + 1) == page):
                        self.__go_to_the_next_page(page)
                    else:
                        self.__reboot_tab(clean_url, page)
        except Exception as e:
            # Any failure while navigating falls back to rebooting the tab.
            print('#Exception __go_to_the_page: %s %i' % (clean_url, page))
            print('#Exception raised: %s' % e)
            self.__reboot_tab(clean_url, page)
        body = self.driver.page_source
        return HtmlResponse(url_to_parse,
                            body=body,
                            encoding='utf-8',
                            request=request)
def spider_closed(self, spider):
    """Log that *spider* was closed and close ``UrlsState``.

    Presumably connected to Scrapy's ``spider_closed`` signal -- the
    connection itself is not visible in this part of the file.

    Args:
        spider: The closed spider; its ``logger`` and ``name`` are used.
    """
    # The message previously began with a stray "y "; it now mirrors the
    # 'Spider opened: %s' message logged by spider_opened.
    spider.logger.info('Spider closed: %s', spider.name)
    UrlsState.close()