def get_recent_result(self, key):
    """output: urls, size_ratio"""
    if key not in self.__url_map:
        return None, None
    [retrieved_date, new_result, urls, size_ratio] = self.__url_map[key]
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        show(get_msg(Msg.use_previous_search_result))
        # though size_ratio may still be valid, we do not return it because the caller
        # is not expected to use it in this case
        return urls, None
    # spec.: we execute a new search when the previous search yielded enough new results
    # => if the previous search had n new results out of m total, search again after m/n days
    # => if every previous result was new, search again after 1 day
    # => if no previous result was new, search again after 'TARGET_SEARCH_RESULT_SIZE' days
    valid_day_size = len(urls) / new_result if new_result > 0 else \
        1 if NA is new_result else \
        TARGET_SEARCH_RESULT_SIZE  # new_result == 0 => no new result last time
    from util.global_def import get_search_latency
    valid_day_size *= get_search_latency()
    current_date = datetime.today()
    date_diff = current_date - retrieved_date
    if date_diff > timedelta(days=valid_day_size):  # 'valid_day_size' is the valid duration of the search result
        return None, size_ratio
    to_next_query = timedelta(days=valid_day_size) - date_diff
    hours, remainder = divmod(to_next_query.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    show(get_msg(Msg.to_next_search), to_next_query.days, get_msg(Msg.day), hours, get_msg(Msg.hour),
         minutes, get_msg(Msg.minute), seconds, (get_msg(Msg.second) + ","),
         get_msg(Msg.current_url_count), len(urls))
    # though size_ratio may still be valid, we do not return it because the caller
    # is not expected to use it in this case
    return urls, None
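# Illustrative sketch (not part of the class): how the expiry window computed above
# behaves. The numbers are hypothetical and 'latency' stands in for get_search_latency().
from datetime import datetime, timedelta

def _expiry_window_example():
    urls_count, new_result, latency = 30, 10, 1             # 10 of 30 cached urls were new
    retrieved_date = datetime.today() - timedelta(days=4)   # cached 4 days ago
    valid_day_size = (urls_count / new_result) * latency    # 30/10 -> result stays valid 3 days
    # True here: the cached result has expired, so the caller would trigger a fresh search
    return datetime.today() - retrieved_date > timedelta(days=valid_day_size)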
def set_graph(self, image_obj, graph_file=NA):
    self.__cur_image_obj = image_obj
    digest = None
    if NA == graph_file:
        graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
            GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
    if NA == graph_file:
        return False
    show(graph_file)
    with open(graph_file, 'rb') as f:
        try:
            image = GraphViewer.get_image(f)
        except IOError as e:
            f.close()  # close f here because we are going to delete the file below
            # some images cannot be opened (perhaps not an image format at all);
            # the error message is 'cannot identify image file'
            info(get_msg(Msg.fail_to_open_image), str(e))
            GraphFetcher().handle_image(graph_file, DELETE)
            return False
        # we met "Decompressed Data Too Large" for ~/Inside Out/Image_124.jpg...
        except ValueError as e:
            info(get_msg(Msg.fail_to_open_image), str(e))
            return False
    self.__cur_graph_file = graph_file
    self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
    if digest:
        digest_str = digest + "\n"
    else:
        digest_str = "%s:%s\n" % (get_msg(Msg.path), graph_file)
    self.__cur_digest = digest_str + "%s:%sx%s" % (get_msg(Msg.size), image.size[0], image.size[1])
    self.select_phrase(image_obj.pattern)
    return self.set_graph_content(graph_file, image)
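# Standalone sketch of the failure modes handled above, assuming GraphViewer.get_image
# wraps PIL: PIL raises IOError ("cannot identify image file") for non-image payloads
# and may raise ValueError for corrupt or oversized data. '_probe_image' is hypothetical.
from PIL import Image

def _probe_image(path):
    try:
        image = Image.open(path)
        image.load()  # force the decode so errors surface here rather than later
        return image
    except (IOError, ValueError):
        return None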
def select_pattern(self):
    if self.__arbitrator.is_active():
        choice_pattern = None
        while not choice_pattern:
            choice_pattern = self.__arbitrator.arbitrate()
            if not choice_pattern:
                show(get_msg(Msg.no_available_image_wait_10_minutes))
                self.__root.withdraw()
                import time
                time.sleep(600)
                self.__root.deiconify()
        return self.__cur_image_obj_dict[choice_pattern]
    image_obj_size = len(self.__cur_image_obj_list)
    return self.__cur_image_obj_list[random.randrange(0, image_obj_size)]
def load(pickle_file):
    """output: is_exist, value"""
    try:
        pickle_fd = open(pickle_file, "rb")  # binary mode: pickle data is not plain text
    except IOError as err:
        if errno.ENOENT == err.errno:
            show(get_msg(Msg.cache_file_does_not_exist), pickle_file)
            return False, None
        assert False
    try:
        value = cPickle.load(pickle_fd)
        return True, value
    except (ValueError, UnpicklingError, EOFError):
        error(get_msg(Msg.cannot_read_pickle_file), pickle_file,
              get_msg(Msg.suggest_re_fetch_pickle_file))
        assert False
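# Round-trip sketch of the cache handling above (hypothetical path and payload);
# binary mode matters on platforms where text mode would mangle the pickle bytes.
import cPickle

def _pickle_round_trip(path="/tmp/example.pickle"):
    with open(path, "wb") as fd:
        cPickle.dump({"example": 1}, fd, cPickle.HIGHEST_PROTOCOL)
    with open(path, "rb") as fd:
        return cPickle.load(fd)  # -> {"example": 1}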
def fetch(self, pattern):
    self.__has_write = False
    new_objs, old_objs = self.get_updated_url(pattern)
    show(get_msg(Msg.total_data_count), len(new_objs) + len(old_objs))
    url = self.choose_url(new_objs, old_objs)
    if NA == url:
        return NA, NA
    image_objs = old_objs
    image_objs.update(new_objs)
    image_slot = image_objs[url]
    graph_file, new_encoding = self.get_graph_file(pattern, url, image_slot.encoding)
    new_slot = ImageSlot(image_slot.timestamp, new_encoding, image_slot.rank)
    image_objs[url] = new_slot
    if self.__has_write:
        save(GraphFetcher.get_cache_file(pattern), image_objs)
    return graph_file, GraphFetcher.get_graph_digest(graph_file, image_objs[url])
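# For reference, a slot shaped like the attribute accesses above (timestamp / encoding /
# rank); the project's actual ImageSlot definition may differ.
from collections import namedtuple

ImageSlotSketch = namedtuple("ImageSlotSketch", ["timestamp", "encoding", "rank"])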
def crawl(self, pattern, size_list, option="", print_url=False):
    """output: urls, is_new_result"""
    show(get_msg(Msg.search_target), "\"" + pattern + "\"")
    key = Crawler.get_search_key(pattern, option)
    urls, size_ratio = self.get_recent_result(key)
    if urls:
        return urls, False
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        return None, False
    assert size_list and (not size_ratio or isinstance(size_ratio, dict))
    dice = Crawler.get_dice(size_list, size_ratio)
    urls = []
    next_size_ratio = {size: 0 for size in size_list}  # key: size, value: number of new results (initially 0)
    start = {size: 1 for size in size_list}  # key: size, value: next search start offset (google starts from 1)
    tried_size = 0
    while tried_size < TARGET_SEARCH_RESULT_SIZE:
        chosen_size = get_weighted_random_dict_key(dice)
        this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option)
        if not success:
            break
        urls += this_urls
        new_result = self.get_this_time_new_result_num(key, this_urls)
        next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
        start[chosen_size] += G_SEARCH_PER_REQ_SIZE
        tried_size += G_SEARCH_PER_REQ_SIZE
    # use 'set' to filter out duplicates (not expected, but google search may return duplicated results)
    urls = list(set(urls))
    if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
        info("%s:%s, %s:%i" % (
            get_msg(Msg.target), pattern, get_msg(Msg.acquired_url_count), len(urls)))
    if print_url:
        for url in urls:
            show(url)
    if success:
        next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                           for size in next_size_ratio}
        self.cache_url(key, urls, next_size_ratio)
    return urls, success
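# A minimal sketch of a weighted pick consistent with how 'dice' is consumed above;
# the project's get_weighted_random_dict_key may be implemented differently.
import random

def _weighted_pick_sketch(dice):
    total = sum(dice.values())
    point = random.uniform(0, total)
    for key, weight in dice.items():
        point -= weight
        if point <= 0:
            return key
    return key  # floating point fallback: return the last key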
def prepare_for_next_view(self, wait_time, msg=None):
    if msg:
        show(msg)
    job = self.__root.after(int(wait_time), lambda: self.timer_action())
    self.__pending_jobs.append(job)
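# Companion sketch (standalone, assuming the Tkinter root used above): keeping the id
# returned by after() is what makes a pending job cancellable later via after_cancel().
import Tkinter as tk

def _after_cancel_demo():
    root = tk.Tk()
    job = root.after(1000, root.quit)  # schedule a callback in 1 second
    root.after_cancel(job)             # the kept id lets us cancel before it fires
    root.destroy()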