def set_graph(self, image_obj, graph_file=NA):
    self.__cur_image_obj = image_obj
    digest = None
    if NA == graph_file:
        graph_file, digest = GraphDirHandler(image_obj.location).get_graph() if image_obj.location else \
            GraphFetcher(size=image_obj.size, option=image_obj.option).fetch(image_obj.pattern)
    if NA == graph_file:
        return False
    show(graph_file)
    with open(graph_file, 'rb') as f:
        try:
            image = GraphViewer.get_image(f)
        except IOError as e:
            f.close()  # close f here for we are going to delete the file below
            # some image cannot be opened (maybe it's not image format?), err msg is 'cannot identify image file'
            info(get_msg(Msg.fail_to_open_image), str(e))
            GraphFetcher().handle_image(graph_file, DELETE)
            return False
        # we met "Decompressed Data Too Large" for ~/Inside Out/Image_124.jpg...
        except ValueError as e:
            info(get_msg(Msg.fail_to_open_image), str(e))
            return False
    self.__cur_graph_file = graph_file
    self.__graph_history.append([self.__cur_image_obj, self.__cur_graph_file])
    if digest:
        digest_str = digest + "\n"
    else:
        digest_str = "%s:%s\n" % (get_msg(Msg.path), graph_file)
    self.__cur_digest = digest_str + "%s:%sx%s" % (get_msg(Msg.size), image.size[0], image.size[1])
    self.select_phrase(image_obj.pattern)
    return self.set_graph_content(graph_file, image)

def handle_image(location, graph_file, action):
    assert action in [INC_RANK, DEC_RANK]
    handler = GraphDirHandler(location)
    assert handler.__valid
    base_file = graph_file.replace(location + get_delim(), "")
    for image in handler.__status_cache:
        if image == base_file:
            has_change = True
            status = handler.__status_cache[image]
            if INC_RANK == action:
                status.rank += 1
                msg = get_msg(Msg.change_rank_to) + str(status.rank)
            else:
                if 1 == status.rank:
                    msg = get_msg(Msg.cannot_lower_down_rank_as_it_is_already_the_lowest)
                    has_change = False
                else:
                    status.rank -= 1
                    msg = get_msg(Msg.change_rank_to) + str(status.rank)
            if has_change:
                handler.__status_cache[image] = status
                cache_file = location + get_delim() + GraphDirHandler.CACHE_FILE
                # TODO: it seems the timestamp does not change...
                timestamp = time.ctime(os.path.getmtime(location))
                save(cache_file, [timestamp, handler.__status_cache])
            return msg
    assert False

def handle_image(graph_file, action):
    assert action in [DELETE, DISCARD, INC_RANK, DEC_RANK]
    key_str = get_delim() + "image_"
    end_pos = graph_file.find(key_str)
    assert -1 != end_pos
    begin_pos = graph_file[:end_pos].rfind(get_delim())
    assert -1 != begin_pos
    pattern = graph_file[begin_pos + 1:end_pos]
    has_cache, cached_objs = load(GraphFetcher.get_cache_file(pattern))
    assert has_cache
    file_encoding = graph_file[graph_file.find(key_str) + len(key_str):graph_file.find(".jpg")]
    for url in cached_objs:
        image_slot = cached_objs[url]
        if image_slot.encoding == file_encoding:
            new_encoding = NA if DELETE == action else \
                None if DISCARD == action else \
                image_slot.encoding  # no change
            new_rank = image_slot.rank + 1 if INC_RANK == action else \
                image_slot.rank - 1 if DEC_RANK == action and image_slot.rank != 1 else \
                image_slot.rank  # no change
            updated_slot = ImageSlot(timestamp=image_slot.timestamp, encoding=new_encoding, rank=new_rank)
            cached_objs[url] = updated_slot
            save(GraphFetcher.get_cache_file(pattern), cached_objs)
            if action in [DELETE, DISCARD]:
                os.remove(graph_file)
            msg = "" if action in [DELETE, DISCARD] else \
                get_msg(Msg.change_rank_to) + str(new_rank) + "!" if new_rank != image_slot.rank else \
                get_msg(Msg.cannot_lower_down_rank_as_it_is_already_the_lowest) if image_slot.rank == 1 else \
                None
            assert msg is not None
            return msg
    assert False

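# Illustrative walk-through of the rank update above (values are made up):
# a cached slot with rank 3 and DEC_RANK yields new_rank 2 and the
# "change rank to 2!" message, while a slot already at rank 1 keeps its rank and
# the "cannot lower" message is returned; DELETE/DISCARD rewrite the encoding
# (to NA or None respectively), remove the file, and return an empty message.
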
def get_graph_digest(self, graph_file):
    if NA == graph_file:
        return "NA"
    full_graph_file = self.__location + get_delim() + graph_file
    timestamp = time.ctime(os.path.getmtime(full_graph_file))
    return "%s:%s\n%s:%s\n%s:%s" % (
        get_msg(Msg.location), full_graph_file,
        get_msg(Msg.timestamp), timestamp,
        get_msg(Msg.rank), self.__status_cache[graph_file].rank)

def get_graph_digest(graph_file, url_obj):
    if NA == graph_file:
        return "NA"
    relative_pos = graph_file.find(pic_home())
    assert 0 == relative_pos
    relative_graph_file = graph_file[len(pic_home()):]
    return "%s:%s\n%s:%s\n%s:%s" % (
        get_msg(Msg.location), relative_graph_file,
        get_msg(Msg.timestamp), url_obj.timestamp.strftime("%B %d, %Y"),
        get_msg(Msg.rank), url_obj.rank)

def check_access_status():
    print(get_msg(Msg.check_network_connection))
    try:
        urllib2.urlopen('http://google.com', timeout=3)
        print(get_msg(Msg.network_status_succeed))
        return True
    except urllib2.URLError:
        pass
    print(get_msg(Msg.network_status_fail))
    return False

def load(pickle_file):
    """output: is_exist, value"""
    try:
        pickle_fd = open(pickle_file, "r")
    except IOError as err:
        if errno.ENOENT == err.errno:
            show(get_msg(Msg.cache_file_does_not_exist), pickle_file)
            return False, None
        assert False
    try:
        value = cPickle.load(pickle_fd)
        return True, value
    except (ValueError, UnpicklingError, EOFError):
        error(get_msg(Msg.cannot_read_pickle_file), pickle_file,
              get_msg(Msg.suggest_re_fetch_pickle_file))
        assert False

def set_graph_content(self, graph_file, image=None):
    if image is None:
        try:
            image = GraphViewer.get_image(graph_file)
        except IOError as e:
            error(str(e))
            assert False
    self.__root.geometry(self.__full_geom if self.__fullscreen_mode else
                         '%dx%d+0+0' % (image.size[0], image.size[1]))
    if self.__fullscreen_mode:
        resize_width, resize_height, x_pos, y_pos = self.get_adjusted_geom(image.size[0], image.size[1])
        try:
            resized = image.resize((resize_width, resize_height), Image.ANTIALIAS)
        except IOError as e:  # 'incomplete downloaded image' may go here
            info(get_msg(Msg.fail_to_convert_image_to_fullscreen), str(e))
            GraphFetcher().handle_image(graph_file, DISCARD)
            return False
        image = resized
    self.__root.title(self.__cur_image_obj.group_name)
    tk_image_obj = ImageTk.PhotoImage(image)
    self.__tk_obj_ref = tk_image_obj
    self.__canvas.delete('all')
    self.__canvas.create_image(x_pos if self.__fullscreen_mode else 0,
                               y_pos if self.__fullscreen_mode else 0,
                               image=tk_image_obj, anchor=Tkinter.NW)
    self.show_onscreen_help()
    self.show_onscreen_info()
    self.show_onscreen_phrase()
    return True

def save(pickle_file, value):
    pickle_fd = open(pickle_file, "w")
    try:
        cPickle.dump(value, pickle_fd)
    except AttributeError as msg:
        error(get_msg(Msg.fail_to_write_cache), str(msg))
    pickle_fd.close()

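# A minimal, hypothetical round-trip sketch for the save()/load() helpers above;
# the file name and payload below are made-up examples, not part of the tool.
def _demo_cache_round_trip():
    demo_file = "demo.cache"
    save(demo_file, ["Mon Jan  6 10:00:00 2020", {"image_1.jpg": 1}])
    exists, value = load(demo_file)
    if exists:
        show("cached value:", value)
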
def __init__(self, in_file):
    self.__fd = open(in_file)
    try:
        self.__json_data = json.load(self.__fd)
    except Exception as e:
        error(get_msg(Msg.fail_read_file) + "\"", in_file, "\"")
        error(str(e))
        assert False

def delete_image(self, *unused):
    if self.__cur_image_obj.location:
        return  # spec.: removing an image that the user 'specified' is not supported
    info(get_msg(Msg.remove_image), self.__cur_graph_file)
    self.__graph_history.remove([self.__cur_image_obj, self.__cur_graph_file])
    GraphFetcher.handle_image(self.__cur_graph_file, DELETE)
    self.cancel_pending_jobs()
    self.timer_action(True)

def decrement_rank(self, *unused):
    info(get_msg(Msg.decrease_rank), self.__cur_graph_file)
    if self.__cur_image_obj.location:
        msg = GraphDirHandler.handle_image(self.__cur_image_obj.location, self.__cur_graph_file, DEC_RANK)
    else:
        msg = GraphFetcher.handle_image(self.__cur_graph_file, DEC_RANK)
    self.__cur_digest += "\n%s" % msg
    self.show_onscreen_info()

def get_recent_result(self, key):
    """output: urls, size_ratio"""
    if key not in self.__url_map:
        return None, None
    [retrieved_date, new_result, urls, size_ratio] = self.__url_map[key]
    if not self.__network_reachable or Crawler.__STOP_SEARCH:
        show(get_msg(Msg.use_previous_search_result))
        # though size_ratio can be valid, we do not return it because the caller is not expected to use it
        return urls, None
    # spec.: we execute a new search once enough of the previous search's results were new
    # => if the previous search had n new results out of m total, a new search happens after m/n days
    # => if all previous results were new, a new search happens after 1 day
    # => if no previous result was new, a new search happens after 'TARGET_SEARCH_RESULT_SIZE' days
    valid_day_size = len(urls) / new_result if new_result > 0 else \
        1 if NA == new_result else \
        TARGET_SEARCH_RESULT_SIZE  # new_result = 0 => no new result before
    from util.global_def import get_search_latency
    valid_day_size *= get_search_latency()
    current_date = datetime.today()
    date_diff = current_date - retrieved_date
    if date_diff > timedelta(days=valid_day_size):  # 'valid_day_size' is the valid duration of the search result
        return None, size_ratio
    to_next_query = timedelta(days=valid_day_size) - date_diff
    hours, remainder = divmod(to_next_query.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    show(get_msg(Msg.to_next_search), to_next_query.days, get_msg(Msg.day), hours, get_msg(Msg.hour),
         minutes, get_msg(Msg.minute), seconds, (get_msg(Msg.second) + ","),
         get_msg(Msg.current_url_count), len(urls))
    # though size_ratio can be valid, we do not return it because the caller is not expected to use it
    return urls, None

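# Worked example of the validity window above (numbers are illustrative only):
# with 40 cached urls of which 8 were new on the previous search,
# valid_day_size = 40 / 8 = 5; multiplied by get_search_latency() (say 1),
# a new search is issued only once more than 5 days have passed since
# retrieved_date; otherwise the cached urls are returned and the remaining
# wait time is printed.
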
def timer_action(self, user_next_image=False):
    if not user_next_image and self.__pause_slideshow:
        self.prepare_for_next_view(get_slideshow_frequency() * 1000)
        return
    success = self.set_graph(self.select_pattern())
    if not success:
        self.prepare_for_next_view(1, get_msg(Msg.try_fetch_image_again))
        return
    self.prepare_for_next_view(get_slideshow_frequency() * 1000)

def __load_or_create_status(self):
    status_cache = {}  # key: image_file, value: status
    cache_file = self.__location + get_delim() + GraphDirHandler.CACHE_FILE
    cache_existed = os.path.exists(cache_file)
    if cache_existed:
        success, cache_data = load(cache_file)
        assert success
        [timestamp, status_cache] = cache_data
        if not self.dir_changed(timestamp):
            return status_cache
        else:
            info(get_msg(Msg.directory), self.__location, get_msg(Msg.has_changed_update_cache_file))
    else:
        info("%s%s" % (get_msg(Msg.create_new_cache_file_for_directory), self.__location))
    image_files = []
    for root, _, files in os.walk(self.__location):
        assert len(root) >= 1
        if root[-1] != get_delim():
            root += get_delim()
        for base_file in files:
            basename, ext = os.path.splitext(base_file)
            if ext.replace(".", "") in GraphDirHandler.RECOGNIZED_IMAGE_EXT:
                image_files.append((root + base_file).replace(self.__location, ""))
    if not image_files:
        if cache_existed:
            os.remove(cache_file)
        self.__valid = False
        return None
    existed_image = {}
    for image in image_files:
        existed_image[image] = 1  # 1 is just a dummy value
        if image not in status_cache:
            status_cache[image] = Status()
    to_be_deleted = []
    for image in status_cache:  # this check works when some image is deleted
        if image not in existed_image:
            to_be_deleted.append(image)
    for image in to_be_deleted:
        status_cache.pop(image)
    # TODO: this makes the directory 'always' look changed the 2nd time
    timestamp = time.ctime(os.path.getmtime(self.__location))
    save(cache_file, [timestamp, status_cache])
    return status_cache

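# Illustrative cache payload for the directory handler above (values made up):
# the cache file stores a two-element list, e.g.
#   ["Mon Jan  6 10:00:00 2020", {"img_001.jpg": Status(), "sub" + get_delim() + "img_002.jpg": Status()}]
# i.e. the directory's mtime at save time plus the per-image Status map.
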
def get_graph_file(self, pattern, url, cached_encoding):
    """output: graph_file, encoding"""
    if NA == cached_encoding:  # means this url is not retrievable
        return NA, NA
    file_encoding = cached_encoding
    if not file_encoding:
        file_encoding = GraphFetcher.get_file_encoding(pattern)
    graph_dir = GraphFetcher.get_graph_dir(pattern)
    if not os.path.exists(graph_dir):
        try:
            os.makedirs(graph_dir)
        except OSError as e:
            error(get_msg(Msg.cannot_create_directory), str(e))
            import sys
            sys.exit()
    abs_graph_file = graph_dir + "image_" + file_encoding + ".jpg"
    if os.path.exists(abs_graph_file):
        return abs_graph_file, file_encoding
    if not self.__network_reachable:
        info(get_msg(Msg.give_up_fetch_image))
        return NA, None
    self.__has_write = True
    try:
        info(get_msg(Msg.fetch_image), url)
        try:
            web_content = urllib2.urlopen(url, timeout=10)
        except httplib.BadStatusLine:
            info(get_msg(Msg.obtain_unrecognized_status_code), url)
            return NA, NA
        fd = open(abs_graph_file, 'wb')
        fd.write(web_content.read())
        fd.close()
        assert os.path.exists(abs_graph_file)
        if os.stat(abs_graph_file).st_size <= 10240:
            info(get_msg(Msg.give_up_acquired_image_with_size), os.stat(abs_graph_file).st_size, "Bytes")
            info(get_msg(Msg.remove_image), abs_graph_file)
            os.remove(abs_graph_file)
            return NA, NA
        info(get_msg(Msg.fetch_succeed))
        return abs_graph_file, file_encoding
    except (IOError, httplib.IncompleteRead, ssl.CertificateError) as e:
        info(get_msg(Msg.failed_url), url)
        info(get_msg(Msg.error_message), str(e))
        if os.path.exists(abs_graph_file):
            fd.close()
            os.remove(abs_graph_file)
        return NA, NA

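# For reference, a fetched image ends up at a path of the form
#   get_graph_dir(pattern) + "image_<encoding>.jpg"
# e.g. ".../Inside Out/image_7.jpg" (example value only), where the numeric
# encoding comes from get_file_encoding() below when no cached encoding exists.
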
def crawl_by_asking_google_search(pattern, start, size, option=""): assert type(pattern) in [str, unicode] from util.global_def import get_api_key, get_cx api_key = get_api_key() cx = get_cx() if not api_key or not cx: if not Crawler._HAS_SHOW_NO_SEARCH_MSG: Crawler._HAS_SHOW_NO_SEARCH_MSG = True info(get_msg(Msg.no_search_due_to_no_api_key_and_cx)) return [], False size_option = "&imgSize=" + size if size else "" full_option = size_option + (option if option else "") base_url = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&searchType=image&num=%d' \ '&q=' + pattern + '&start=%d' + full_option request_str = base_url % (api_key, cx, G_SEARCH_PER_REQ_SIZE, start) urls = [] success = True try: r = requests.get(request_str) res = json.loads(r.text) if "error" in res: Crawler.print_error(res["error"]) if "This API requires billing to be enabled on the project" in res["error"]["message"]: # this is the 'out of quota' message Crawler.__STOP_SEARCH = True return urls, False if 'items' not in res: info(get_msg(Msg.cannot_fetch_image_url), "empty query") return urls, True # return 'True' is okay? for image_info in res['items']: assert 'link' in image_info url = image_info['link'] urls.append(url) except TypeError as e: # for unhandled error... info(get_msg(Msg.cannot_fetch_image_url), str(e)) success = False except requests.ConnectionError as e: info(get_msg(Msg.cannot_fetch_image_url), str(e)) success = False return urls, success
def view(self, image_obj_list, phrase_obj_list):
    if not phrase_obj_list:  # Python's 'mutable default argument' pitfall is why [] is not the default here
        phrase_obj_list = []
    if not image_obj_list:
        info(get_msg(Msg.not_any_image_specified_program_exit))
        sys.exit()
    self.setup_image_stuff(image_obj_list)
    self.setup_phrase_stuff(image_obj_list, phrase_obj_list)
    while True:
        self.timer_action(True)
        self.__root.mainloop()
        self.cancel_pending_jobs()

def crawl(self, pattern, size_list, option="", print_url=False): """output: urls, is_new_result""" show(get_msg(Msg.search_target), "\"" + pattern + "\"") key = Crawler.get_search_key(pattern, option) urls, size_ratio = self.get_recent_result(key) if urls: return urls, False if not self.__network_reachable or Crawler.__STOP_SEARCH: return None, False assert size_list and (not size_ratio or isinstance(size_ratio, dict)) dice = Crawler.get_dice(size_list, size_ratio) urls = [] next_size_ratio = {size: 0 for size in size_list} # key: size, value: number of new result (initial with 0) start = {size: 1 for size in size_list} # key: size, value: next search start offset (start from 1 by google) tried_size = 0 while tried_size < TARGET_SEARCH_RESULT_SIZE: chosen_size = get_weighted_random_dict_key(dice) this_urls, success = Crawler.crawl_by_asking_google_search(pattern, start[chosen_size], chosen_size, option) if not success: break urls += this_urls new_result = self.get_this_time_new_result_num(key, this_urls) next_size_ratio[chosen_size] += (new_result if NA != new_result else len(this_urls)) start[chosen_size] += G_SEARCH_PER_REQ_SIZE tried_size += G_SEARCH_PER_REQ_SIZE # 'set' to filter out duplicated item (though not expected, but we found g-search may give duplicated result) urls = list(set(urls)) if not Crawler._HAS_SHOW_NO_SEARCH_MSG: info("%s:%s, %s:%i" % ( get_msg(Msg.target), pattern, get_msg(Msg.acquired_url_count), len(urls))) if print_url: for url in urls: show(url) if success: next_size_ratio = {size: 1 if 0 == next_size_ratio[size] else next_size_ratio[size] for size in next_size_ratio} self.cache_url(key, urls, next_size_ratio) return urls, success
def select_pattern(self):
    if self.__arbitrator.is_active():
        choice_pattern = None
        while not choice_pattern:
            choice_pattern = self.__arbitrator.arbitrate()
            if not choice_pattern:
                show(get_msg(Msg.no_available_image_wait_10_minutes))
                self.__root.withdraw()
                import time
                time.sleep(600)
                self.__root.deiconify()
        return self.__cur_image_obj_dict[choice_pattern]
    image_obj_size = len(self.__cur_image_obj_list)
    return self.__cur_image_obj_list[random.randrange(0, image_obj_size)]

def __parse_config(self, config_file):
    from util.config import Config
    config = Config(config_file)
    config.set_general_setting()
    image_target = config.get_setting("image", "target")
    if not image_target:
        print(get_msg(Msg.not_any_image_specified_program_exit))
        sys.exit()
    phrase_target = config.get_setting("phrase", "target")
    import glob
    self.__image_setting += glob.glob(image_target)
    self.__phrase_setting += glob.glob(phrase_target) if phrase_target else []

def fetch(self, pattern):
    self.__has_write = False
    new_objs, old_objs = self.get_updated_url(pattern)
    show(get_msg(Msg.total_data_count), len(new_objs) + len(old_objs))
    url = self.choose_url(new_objs, old_objs)
    if NA == url:
        return NA, NA
    image_objs = old_objs
    image_objs.update(new_objs)
    image_slot = image_objs[url]
    graph_file, new_encoding = self.get_graph_file(pattern, url, image_slot.encoding)
    new_slot = ImageSlot(image_slot.timestamp, new_encoding, image_slot.rank)
    image_objs[url] = new_slot
    if self.__has_write:
        save(GraphFetcher.get_cache_file(pattern), image_objs)
    return graph_file, GraphFetcher.get_graph_digest(graph_file, image_objs[url])

@staticmethod
def get_file_encoding(pattern):
    # TODO: add a file to keep the last largest number, to avoid a possibly long glob time...
    file_list = glob.glob(GraphFetcher.get_graph_dir(pattern) + "image_*.jpg")
    largest_idx = 0
    # noinspection PyShadowingNames
    for graph_file in file_list:
        begin_pos = graph_file.find("image_")
        end_pos = graph_file.find(".jpg")
        assert -1 != begin_pos and -1 != end_pos
        begin_pos += len("image_")
        iter_idx = int(graph_file[begin_pos:end_pos])
        assert iter_idx > 0
        largest_idx = iter_idx if iter_idx > largest_idx else largest_idx
    largest_idx += 1
    return str(largest_idx)


if __name__ == '__main__':
    from util.global_def import config_action
    config_action()
    # name '_' before the 'obj' to let python not free the imported module before __del__ is called
    # (or we will get "'NoneType' object has no attribute 'dump'" from cPickle.dump)
    _obj = GraphFetcher()
    graph_file, digest = _obj.fetch("Inside Out")
    if NA == graph_file:
        print(get_msg(Msg.fetch_image_fail))
    else:
        print(graph_file)

def help_str():
    return get_msg(Msg.help_message)

def print_error(data):
    assert isinstance(data, dict) and "message" in data
    error(get_msg(Msg.search_engine_err_msg), data["message"])