def process_finished(self, wrapped_result):
    """
    Handles the outcome of one crawl request.

    A request that produced no result is reset in the search session so it
    can be processed again; otherwise the result is attached to the request
    and the request is stored in the session history. In both cases the
    global progress is refreshed and the optional ``on_process_finished``
    callback is invoked.

    :param wrapped_result: indexable holding the search request at index 0
                           and its crawl result (or None) at index 1.
    :return:
    """
    search_request, crawl_result = wrapped_result[0], wrapped_result[1]

    if crawl_result is not None:
        search_request.associate_result(crawl_result)

        # Keep track of the fulfilled request in the session.
        self.search_session.add_history_entry(search_request)

        message = "[{}%] Results for request {} retrieved: {}.".format(
            self.search_session.get_completion_progress(),
            search_request,
            len(crawl_result))
    else:
        # Nothing came back: flag the request as invalid so that it gets
        # reprocessed later.
        self.search_session.reset_search_request(search_request)

        message = "[{}%] Request {} could not be retrieved. Reseted.".format(
            self.search_session.get_completion_progress(),
            search_request)

    logging.info(message)

    progress = self.search_session.get_completion_progress()
    global_status.update_proc_progress(
        "Retrieving data from search engines...", progress)

    # The progress is queried again on purpose: it is read right before the
    # completion check.
    if self.search_session.get_completion_progress() == 100:
        logging.info("Crawler finished.")

    if self.on_process_finished:
        self.on_process_finished(search_request, crawl_result)
def _cache_all_page(self, search_words):
    """
    Scrolls the results page down until enough content has been loaded.

    This search engine appends content dynamically while the page is being
    scrolled, so scroll-downs are simulated and the "dg_u" elements are
    counted after each one. Scrolling stops when the count exceeds
    MAX_IMAGES_PER_REQUEST or when the count has stayed unchanged for more
    than MAX_SCROLL_NO_UPDATE_IMAGES_THRESHOLD consecutive checks.

    :param search_words: words of the request; only used in status messages.
    :return:
    """
    status_label = "{} ({}) *Caching page*".format(self.__class__.__name__,
                                                   search_words)

    last_count = -1
    cached_count = 0
    stalled_scrolls = 0

    while True:
        logging.info("images cached previously: {}, images cached currently: {}".format(last_count, cached_count))
        global_status.update_proc_progress(status_label, cached_count,
                                           max=MAX_IMAGES_PER_REQUEST)

        if cached_count > MAX_IMAGES_PER_REQUEST:
            break

        if stalled_scrolls > MAX_SCROLL_NO_UPDATE_IMAGES_THRESHOLD:
            break

        # Count consecutive scrolls that did not bring any new element.
        if last_count == cached_count:
            stalled_scrolls += 1
        else:
            stalled_scrolls = 0

        last_count = cached_count

        self.transport_core.scroll_to_bottom()
        cached_count = len(
            self.transport_core.get_elements_html_by_class("dg_u"))

    global_status.update_proc_progress(status_label, 100)
def _retrieve_image_links_data(self, search_words, search_options):
    """
    Retrieves the image entries that Bing images shows for the given words.

    The search URL is built and loaded through the transport core, the page
    is scrolled until no more content needs to be cached, and every "dg_u"
    element found is converted with ``_build_json_for``.

    :param search_words: text to search for. It is URL-encoded before being
                         placed in the query string.
    :param search_options: container of option flags. If it contains
                           'face', the face filter is appended to the URL.
    :return: list with the result of ``_build_json_for`` for each element.
    """
    # Imported locally so this method does not depend on the module-level
    # imports.
    from urllib.parse import quote_plus

    engine_name = self.__class__.__name__

    # The words must be encoded: spaces or reserved characters such as '&',
    # '#' or '+' would otherwise corrupt the query string.
    url = "http://www.bing.com/images/search?&q={}".format(
        quote_plus(search_words))

    # We enable the face option if needed.
    if 'face' in search_options:
        url += "&qft=+filterui:face-face"

    logging.info("Built url ({}) for request.".format(url))
    global_status.update_proc_progress(
        "{} ({}) *Built url ({}) for request*".format(
            engine_name, search_words, url), 0)

    global_status.update_proc_progress(
        "{} ({}) *Retrieving URL*".format(engine_name, search_words), 0)
    self.transport_core.get(url)
    self.transport_core.wait_for_elements_from_class("dg_u")
    global_status.update_proc_progress(
        "{} ({}) *Retrieved URL*".format(engine_name, search_words), 0)

    # Force the page to load as much dynamic content as possible.
    self._cache_all_page(search_words)

    logging.info("Get done. Loading elements JSON")
    global_status.update_proc_progress(
        "{} ({}) *Generating JSON*".format(engine_name, search_words), 100)

    dg_u_elements = [
        BeautifulSoup(html_element, 'html.parser').find()
        for html_element in
        self.transport_core.get_elements_html_by_class("dg_u", False)
    ]

    logging.info("dg_elements loaded. Building json for each element...")
    result = [
        self._build_json_for(element, search_words)
        for element in dg_u_elements
    ]

    logging.info("Retrieved {} elements".format(len(result)))
    global_status.update_proc_progress(
        "{} ({}) *Generated content for {} elements*".format(
            engine_name, search_words, len(result)), 100)

    return result
def retrieve(self, search_request):
    """
    Runs a retrieval against yahoo images for the given search request.

    :param search_request: A search request instance filled with the
        keywords and the options for the desired search. The following
        options are currently accepted:

            'face' -> Select the face option as a search filter. Yahoo
                      images contains a portrait option in the search that
                      this flag enables. It will result in images of faces
                      related with the given keywords.
    :return: whatever ``_retrieve_image_links_data`` returns for the words
             and options of the request.
    """
    # The transport core is cached between calls; a new one is only built
    # when there is none yet or when the request asks for a different class.
    core_is_stale = (
        not self.transport_core
        or search_request.get_transport_core_proto() != self.transport_core.__class__
    )

    if core_is_stale:
        core_factory = search_request.get_transport_core_proto()
        self.transport_core = core_factory()
        logging.debug("Transport core created from proto.")

    status_label = "{} ({})".format(self.__class__.__name__,
                                    search_request.get_words())
    global_status.update_proc_progress(status_label, 0)

    logging.info(
        "Retrieving image links from request {}.".format(search_request))

    return self._retrieve_image_links_data(search_request.get_words(),
                                           search_request.get_options())
def _retrieve_image_links_data(self, search_words, search_options):
    """
    Retrieves the image entries that yahoo images shows for the given words.

    The search is performed through the page's own search form instead of a
    hand-built URL. The page is then scrolled to cache its content, and
    every "ld " element found is converted with ``_build_json_for``.

    :param search_words: text typed into the search input box.
    :param search_options: container of option flags. If it contains
                           'face', the portrait option is clicked.
    :return: list with the result of ``_build_json_for`` for each element.
    """
    engine_name = self.__class__.__name__
    url = "https://es.images.search.yahoo.com"

    logging.info("Built url ({}) for request.".format(url))
    built_label = "{} ({}) *Built url ({}) for request*".format(
        engine_name, search_words, url)
    global_status.update_proc_progress(built_label, 0)

    retrieving_label = "{} ({}) *Retrieving URL*".format(engine_name,
                                                         search_words)
    global_status.update_proc_progress(retrieving_label, 0)
    self.transport_core.get(url)

    retrieved_label = "{} ({}) *Retrieved URL*".format(engine_name,
                                                       search_words)
    global_status.update_proc_progress(retrieved_label, 0)

    # Yahoo builds the search url dynamically per client request, so the
    # search has to go through its form: fill the input box, then click.
    self.transport_core.wait_for_elements_from_class("ygbt")
    # The words are sent as they are (not URL-encoded) into the input box.
    self.transport_core.send_text_to_input_by_id("yschsp", search_words)
    self.transport_core.click_button_by_class("ygbt")

    if 'face' in search_options:
        # We enable the portrait option if needed.
        self.transport_core.wait_for_elements_from_class("portrait")
        self.transport_core.click_button_by_class("portrait")

    self._cache_all_page(search_words)

    generating_label = "{} ({}) *Generating JSON*".format(engine_name,
                                                          search_words)
    global_status.update_proc_progress(generating_label, 100)
    logging.info("Get done. Loading elements JSON")

    elements_holder = self.transport_core.get_elements_html_by_class("ld ",
                                                                     False)
    logging.info("Retrieved all the elements holders.")

    ld_elements = []
    for html_element in elements_holder:
        ld_elements.append(BeautifulSoup(html_element, 'html.parser').find())

    logging.info("Building json...")
    result = []
    for element in ld_elements:
        result.append(self._build_json_for(element, search_words))

    generated_label = "{} ({}) *Generated content for {} elements*".format(
        engine_name, search_words, len(result))
    global_status.update_proc_progress(generated_label, 100)

    logging.info(
        "Retrieved {} elements in JSON format successfully".format(
            len(result)))

    return result
def _retrieve_image_links_data(self, search_words, search_options):
    """
    Retrieves the image entries that google images shows for the given words.

    The search URL is built with the URL-encoded words and loaded through
    the transport core. The page is scrolled to cache its content, and the
    content of every "rg_meta" element is parsed as JSON.

    :param search_words: text to search for.
    :param search_options: container of option flags. If it contains
                           'face', the face filter is appended to the URL.
    :return: list of dicts with the keys 'url', 'width', 'height', 'desc',
             'searchwords' and 'source', one per parsed element.
    """
    engine_name = self.__class__.__name__

    encoded_words = urllib.parse.quote_plus(search_words)
    url = "https://www.google.es/search?q={}&site=webhp&source=Lnms&tbm=isch".format(
        encoded_words)

    if 'face' in search_options:
        url += '&tbs=itp:face'

    logging.info("Built url ({}) for request.".format(url))
    built_label = "{} ({}) *Built url ({}) for request*".format(
        engine_name, search_words, url)
    global_status.update_proc_progress(built_label, 0)

    retrieving_label = "{} ({}) *Retrieving URL*".format(engine_name,
                                                         search_words)
    global_status.update_proc_progress(retrieving_label, 0)
    self.transport_core.get(url)

    retrieved_label = "{} ({}) *Retrieved URL*".format(engine_name,
                                                       search_words)
    global_status.update_proc_progress(retrieved_label, 0)

    self._cache_all_page(search_words)

    logging.info("Get done. Loading elements JSON")
    generating_label = "{} ({}) *Generating JSON*".format(engine_name,
                                                          search_words)
    global_status.update_proc_progress(generating_label, 100)

    raw_metadata = self.transport_core.get_elements_html_by_class("rg_meta")
    json_elements = list(map(json.loads, raw_metadata))

    generated_label = "{} ({}) *Generated content for {} elements*".format(
        engine_name, search_words, len(json_elements))
    global_status.update_proc_progress(generated_label, 100)
    logging.info("Retrieved {} elements".format(len(json_elements)))

    # Translate the metadata keys into the crawler's own field names.
    images = []
    for image in json_elements:
        images.append({
            'url': image['ou'],
            'width': image['ow'],
            'height': image['oh'],
            'desc': image['pt'],
            'searchwords': search_words,
            'source': 'google'
        })

    return images
def _cache_all_page(self, search_words):
    """
    Scrolls the results page down until no more content is added.

    This search engine appends content dynamically while the page is being
    scrolled. The page is scrolled to the bottom and the "rg_meta" elements
    are counted, again and again, for as long as each scroll yields more
    elements than the previous one.

    :param search_words: words of the request; only used in status messages.
    :return:
    """
    status_label = "{} ({}) *Caching page*".format(self.__class__.__name__,
                                                   search_words)
    seen_count = 0

    while True:
        self.transport_core.scroll_to_bottom()
        found_count = len(
            self.transport_core.get_elements_html_by_class("rg_meta"))

        # We know maximum is 400 for google
        global_status.update_proc_progress(status_label, found_count,
                                           max=400)

        # Stop as soon as a scroll did not bring any new element.
        if found_count <= seen_count:
            break

        seen_count = found_count
def _feed_crawler(self, selected_session):
    """
    Feeds the crawler with the specified session and waits for it.

    The crawler service is created and started on the first call; later
    calls only point the existing service to the new session. The method
    then blocks until the session reports a completion progress of 100 or
    the stop flag is raised, pinging the service in the meantime.

    :param selected_session: session to crawl. Its ``backend_url`` is used
                             in the status and log messages.
    :return:
    """
    if self.crawler_service is None:
        self.crawler_service = CrawlerService(
            selected_session, processes=self.crawler_processes)
        self.crawler_service.start()
    else:
        self.crawler_service.update_search_session(
            search_session=selected_session)

    global_status.update_proc(
        "Waiting for session \"{}\"".format(selected_session.backend_url))
    global_status.update_proc_progress(
        "Retrieving data from search engines...",
        selected_session.get_completion_progress())

    while (selected_session.get_completion_progress() != 100
           and not self.__get_stop_flag__()):
        seconds_frozen = 0

        # The current timestamp doubles as the ping token: the service is
        # considered responsive once it hands the same value back.
        ping_done = time()
        self.crawler_service.do_ping(ping_done)

        while (self.crawler_service.get_pong() != ping_done
               and not self.__get_stop_flag__()):
            sleep(1)
            seconds_frozen += 1

            if seconds_frozen > 60:
                print("Crawler seems completely frozen!!!")

        sleep(1)

    # The original message also formatted an undefined `name` variable,
    # which made this call fail; only the session is reported now.
    logging.info("Finished crawling session ({}).".format(
        selected_session.backend_url))