def arbitrate(self):
    """Select a pattern from the ranks that apply at the current date/time.

    Selection happens in two draws through ``get_weighted_random_dict_key``:

    1. Choose between the PERCENTAGE and the WEIGHT category. PERCENTAGE
       is ranked by the summed percentage (general + currently timed);
       WEIGHT is ranked by what is left up to 100. When the summed
       percentage exceeds 100 the ceiling is raised to that sum (so WEIGHT
       is ranked 0) and a warning is emitted once per process.
    2. Choose a pattern among the general holders and timed ranks of the
       category picked in step 1.

    Returns:
        The chosen pattern, or ``None`` when the picked category has
        neither general holders nor timed ranks.
    """
    timed_pct_rank, timed_wt_rank = self.__get_current_timed_rank()

    timed_pct_total = sum(timed_pct_rank[name].value for name in timed_pct_rank)
    pct_total = self.__general_percentage_count + timed_pct_total

    ceiling = 100
    if pct_total > ceiling:
        # Over-subscribed percentages: stretch the ceiling so the WEIGHT
        # share below never becomes negative.
        ceiling = pct_total
        if not RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__:
            warning(
                "total percentage count value '%s' is greater than 100" % pct_total)
            RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__ = True

    category_dice = {
        PERCENTAGE: RankHolder(pct_total),
        WEIGHT: RankHolder(ceiling - pct_total),
    }
    category = get_weighted_random_dict_key(category_dice)

    if PERCENTAGE is category:
        general_holders = self.__general_percentage_holder
        timed_ranks = timed_pct_rank
    else:
        general_holders = self.__general_weight_holder
        timed_ranks = timed_wt_rank

    # Nothing registered for the drawn category.
    if not (general_holders or timed_ranks):
        return None

    pattern_dice = self.__get_dice(general_holders, timed_ranks)
    return get_weighted_random_dict_key(pattern_dice)
def arbitrate(self):
    """Return the pattern selected for the current date/time, or None.

    The percentage total is the general percentage count plus the values
    of all currently active timed percentage ranks. A first draw decides
    between the PERCENTAGE category (ranked by that total) and the WEIGHT
    category (ranked by the remainder up to the ceiling, normally 100).
    A second draw picks the pattern inside the drawn category.

    If the percentage total is above 100, the ceiling becomes the total
    itself and a one-time "[warning]" line is printed.

    Returns:
        The chosen pattern; ``None`` if the drawn category has no general
        holders and no timed ranks.
    """
    current_timed = self.__get_current_timed_rank()
    timed_pct_rank, timed_wt_rank = current_timed

    pct_sum = self.__general_percentage_count + sum(
        timed_pct_rank[key].value for key in timed_pct_rank
    )

    upper_bound = 100
    exceeds = pct_sum > upper_bound
    if exceeds:
        upper_bound = pct_sum
        already_warned = RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__
        if not already_warned:
            print("[warning] total percentage count value '%s' is greater than 100" % pct_sum)
            RankArbitrator.__HAS_SHOWN_PERCENTAGE_WARNING__ = True

    # First draw: which category supplies the pattern.
    drawn = get_weighted_random_dict_key({
        PERCENTAGE: RankHolder(pct_sum),
        WEIGHT: RankHolder(upper_bound - pct_sum),
    })
    use_percentage = PERCENTAGE is drawn

    holders = self.__general_percentage_holder if use_percentage else self.__general_weight_holder
    timed = timed_pct_rank if use_percentage else timed_wt_rank

    if holders or timed:
        # Second draw: which pattern inside the category.
        return get_weighted_random_dict_key(self.__get_dice(holders, timed))
    return None
def choose_url(new_objs, old_objs):
    """Pick one key from the new and/or old object dictionaries.

    Rules, applied in order:

    * If there are new objects and at least as many of them as old ones,
      the old objects are merged INTO ``new_objs`` (the caller's dict is
      updated in place) and a key is drawn with ``get_random_dict_key``.
    * Otherwise, if there are no old objects, ``NA`` is returned.
    * Otherwise a 50/50 coin decides between a random new key (only
      possible when new objects exist) and a weighted draw among the old
      objects, bypassing old entries whose ``encoding`` equals ``NA``.

    Returns:
        The chosen dictionary key, or ``NA`` when nothing can be chosen.
    """
    # ... support setting...
    new_count = len(new_objs)
    old_count = len(old_objs)
    has_new = new_count > 0

    # Equivalent to "old_count <= new_count" whenever new objects exist.
    if has_new and new_count + old_count <= new_count * 2:
        new_objs.update(old_objs)
        return get_random_dict_key(new_objs)

    if old_count <= 0:
        return NA

    # Throw a 50%/50% dice; the dice is only thrown when new objects exist.
    if has_new and random.randrange(0, 2) == 1:
        return get_random_dict_key(new_objs)

    def has_no_encoding(slot):
        return NA == slot.encoding

    return get_weighted_random_dict_key(old_objs, bypass=has_no_encoding)
def crawl(self, pattern, size_list, option="", print_url=False):
    """Search image urls for ``pattern`` and cache the result.

    Args:
        pattern: the search target text.
        size_list: non-empty collection of sizes to search with; one size
            is drawn per request from the dice built by ``Crawler.get_dice``.
        option: extra search option; also part of the cache key.
        print_url: when true, every acquired url is passed to ``show``.

    Returns:
        A ``(urls, is_new_result)`` tuple:

        * ``(cached_urls, False)`` when a recent result exists for the key;
        * ``(None, False)`` when the network is unreachable or searching
          has been stopped;
        * ``(urls, success)`` otherwise, where ``urls`` is the
          de-duplicated list gathered so far and ``success`` is the status
          of the last search request (``False`` if no request was made).
    """
    show(get_msg(Msg.search_target), "\"" + pattern + "\"")
    key = Crawler.get_search_key(pattern, option)
    urls, size_ratio = self.get_recent_result(key)
    if urls:
        return urls, False
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        return None, False
    assert size_list and (not size_ratio or isinstance(size_ratio, dict))
    dice = Crawler.get_dice(size_list, size_ratio)
    urls = []
    # key: size, value: number of new result (initial with 0)
    next_size_ratio = {size: 0 for size in size_list}
    # key: size, value: next search start offset (start from 1 by google)
    start = {size: 1 for size in size_list}
    tried_size = 0
    # Bind 'success' up front: the loop body is its only other binding, so
    # a non-positive target size would otherwise raise UnboundLocalError at
    # the 'if success' check below.
    success = False
    while tried_size < TARGET_SEARCH_RESULT_SIZE:
        chosen_size = get_weighted_random_dict_key(dice)
        this_urls, success = Crawler.crawl_by_asking_google_search(
            pattern, start[chosen_size], chosen_size, option)
        if not success:
            break
        urls += this_urls
        new_result = self.get_this_time_new_result_num(key, this_urls)
        # Fall back to the raw result count when the new-result number is NA.
        next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
        start[chosen_size] += G_SEARCH_PER_REQ_SIZE
        tried_size += G_SEARCH_PER_REQ_SIZE
    # 'set' to filter out duplicated item (though not expected, but we found g-search may give duplicated result)
    urls = list(set(urls))
    if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
        info("%s:%s, %s:%i" % (
            get_msg(Msg.target), pattern, get_msg(Msg.acquired_url_count), len(urls)))
    if print_url:
        for url in urls:
            show(url)
    if success:
        # Every size is cached with a ratio of at least 1.
        next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                           for size in next_size_ratio}
        self.cache_url(key, urls, next_size_ratio)
    return urls, success
def crawl(self, pattern, size_list, option="", print_url=False):
    """Search image urls for ``pattern`` and cache the result.

    Args:
        pattern: the search target text.
        size_list: non-empty collection of sizes to search with; one size
            is drawn per request from the dice built by ``Crawler.get_dice``.
        option: extra search option; also part of the cache key.
        print_url: when true, every acquired url is written to the debug
            log.

    Returns:
        A ``(urls, is_new_result)`` tuple:

        * ``(cached_urls, False)`` when a recent result exists for the key;
        * ``(None, False)`` when the network is unreachable or searching
          has been stopped;
        * ``(urls, success)`` otherwise, where ``urls`` is the
          de-duplicated list gathered so far and ``success`` is the status
          of the last search request (``False`` if no request was made).
    """
    debug("[search] search target: \"%s\"" % pattern)
    key = Crawler.get_search_key(pattern, option)
    urls, size_ratio = self.get_recent_result(key)
    if urls:
        return urls, False
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        return None, False
    assert size_list and (not size_ratio or isinstance(size_ratio, dict))
    dice = Crawler.get_dice(size_list, size_ratio)
    urls = []
    # key: size, value: number of new result (initial with 0)
    next_size_ratio = {size: 0 for size in size_list}
    # key: size, value: next search start offset (start from 1 by google)
    start = {size: 1 for size in size_list}
    tried_size = 0
    # Bind 'success' up front: get_search_size() is evaluated at runtime,
    # and a non-positive value skips the loop entirely, which used to leave
    # 'success' unbound and raise UnboundLocalError at 'if success' below.
    success = False
    while tried_size < get_search_size():
        chosen_size = get_weighted_random_dict_key(dice)
        this_urls, success = Crawler.crawl_by_asking_google_search(
            pattern, start[chosen_size], chosen_size, option)
        if not success:
            break
        urls += this_urls
        new_result = self.get_this_time_new_result_num(key, this_urls)
        # Fall back to the raw result count when the new-result number is NA.
        next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls))
        start[chosen_size] += G_SEARCH_PER_REQ_SIZE
        tried_size += G_SEARCH_PER_REQ_SIZE
    # 'set' to filter out duplicated item (though not expected, but we found g-search may give duplicated result)
    urls = list(set(urls))
    if not Crawler._HAS_SHOW_NO_SEARCH_MSG:
        info("target:%s, acquired url count:%i" % (pattern, len(urls)))
    if print_url:
        for url in urls:
            debug("[search] %s" % url)
    if success:
        # Every size is cached with a ratio of at least 1.
        next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size]
                           for size in next_size_ratio}
        self.cache_url(key, urls, next_size_ratio)
    return urls, success
def get_graph(self):
    """Draw one graph file from the status cache.

    Returns:
        ``(NA, NA)`` when this object is not valid. Otherwise a tuple of
        the graph file's full path (location + delimiter + file name) and
        the digest reported by ``get_graph_digest`` for that file name.
    """
    if self.__valid:
        chosen_file = get_weighted_random_dict_key(self.__status_cache)
        full_path = self.__location + get_delim() + chosen_file
        digest = self.get_graph_digest(chosen_file)
        return full_path, digest
    return NA, NA