import json
import os
import re
import threading
import urllib2
from urlparse import urlparse

import imagehash
from PIL import Image

# MyOpener, ImgurFetcher, get_random_file_name, XKCD_EXPLAINED_URL and
# XKCD_JSON_API_URL are assumed to be provided elsewhere in this project.
def get_image_hash(url):
    """Download the image behind an imgur link and return its dhash as a string, or None on failure."""
    file_name = '/tmp/' + get_random_file_name()
    imgur = ImgurFetcher()
    myopener = MyOpener()
    # urlparse only fills in netloc/path if the URL has a scheme (or at least '//').
    if not url.startswith('http'):
        url = '//' + url
    parsed = urlparse(url)
    # Resolve the imgur page path (e.g. '/AbC123') to a direct image URL.
    imgur_url = imgur.get_image_url(parsed.path[1:])
    try:
        myopener.retrieve(imgur_url, file_name)
        return str(imagehash.dhash(Image.open(file_name)))
    except Exception:
        return None
    finally:
        # The download may have failed before the file was ever created.
        if os.path.exists(file_name):
            os.remove(file_name)
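# Side note (not what the fetcher below does -- it matches stored hashes by exact
# string equality): hashes serialised with str() as above can be re-parsed with
# imagehash.hex_to_hash() and compared by Hamming distance, which is handy when
# checking whether two downloads are "the same" picture. A small illustrative
# helper, not part of the original module:
def _hashes_probably_match(hash_a, hash_b, max_distance=5):
    """Return True if two hex hash strings differ in at most `max_distance` bits."""
    return imagehash.hex_to_hash(hash_a) - imagehash.hex_to_hash(hash_b) <= max_distance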
class XkcdFetcher(object):

    def __init__(self, bot):
        self.bot = bot
        self.next_index = 1
        self.reverse_image_index = {}   # image path -> comic id
        self.reverse_hash_index = {}    # average image hash -> comic id
        self.index_json = {}            # comic id -> parsed xkcd JSON
        self.myopener = MyOpener()
        self.lock = threading.Lock()

    def get_json(self, url):
        """Return the xkcd JSON for a comic page, a direct image link, or a known imgur rehost."""
        if not url.startswith('http'):
            url = '//' + url
        parsed = urlparse(url)
        if not parsed:
            return None
        # Direct image link (imgs.xkcd.com): look up by image path.
        if re.match(r'^(www\.)?imgs\.xkcd\.com$', parsed.netloc):
            if parsed.path not in self.reverse_image_index:
                self._load_reverse()
            comic_id = self.reverse_image_index.get(parsed.path)
            return self.index_json.get(comic_id) if comic_id else None
        # Comic page link (xkcd.com/<id>): look up by comic id.
        if re.match(r'^(www\.)?xkcd\.com$', parsed.netloc) and re.match(r'^/\d+/?$', parsed.path):
            m = re.search(r'^/(\d+)/?$', parsed.path)
            comic_id = int(m.group(1))
            if comic_id not in self.index_json:
                self._load_reverse()
            return self.index_json.get(comic_id) if comic_id else None
        # Imgur rehost: map the imgur URL to an image hash, then the hash to a comic id.
        if re.match(r'^imgur\.com$', parsed.netloc):
            image_hash = self.bot.imgur_lookup.get(url)
            if image_hash:
                if image_hash not in self.reverse_hash_index:
                    self._load_reverse()
                comic_id = self.reverse_hash_index.get(image_hash)
                j = self.index_json.get(comic_id) if comic_id else None
                if j:
                    j['from_external'] = True
                return j
        return None

    def get_explained_link(self, comic_id):
        return XKCD_EXPLAINED_URL.format(comic_id=comic_id)

    def _load_reverse(self):
        """Walk comics from next_index upward, caching their JSON and building the reverse indexes."""
        self.lock.acquire()
        data_store = self.bot._get_new_data_store_connection()
        try:
            while True:
                meta = self._get_meta(data_store, self.next_index)
                if not meta:
                    # Not cached yet: fetch from xkcd and persist it.
                    self._insert_meta(data_store, self.next_index)
                    meta = self._get_meta(data_store, self.next_index)
                    if not meta:
                        # The comic does not exist (yet); stop here.
                        return
                if meta[1]:  # json
                    self.index_json[self.next_index] = json.loads(meta[1])
                if meta[3] and meta[3] not in self.reverse_hash_index:  # hash_avg
                    self.reverse_hash_index[meta[3]] = self.next_index
                if meta[1]:  # json
                    parsed = urlparse(self.index_json[self.next_index].get('img', ''))
                    if parsed and parsed.path and parsed.path not in self.reverse_image_index:
                        self.reverse_image_index[parsed.path] = self.next_index
                self.next_index += 1
        finally:
            self.lock.release()
            data_store.close()

    def _get_meta(self, data_store, comic_id):
        return data_store.get_xkcd_meta(comic_id)

    def _insert_meta(self, data_store, comic_id):
        j = self._get_xkcd_json(comic_id)
        hash_avg = ''
        hash_d = ''
        hash_p = ''
        if not j:
            return
        if j.get('img'):
            file_name = '/tmp/' + get_random_file_name()
            try:
                self.myopener.retrieve(j.get('img'), file_name)
                hash_avg = imagehash.average_hash(Image.open(file_name))
                hash_d = imagehash.dhash(Image.open(file_name))
                hash_p = imagehash.phash(Image.open(file_name))
            except Exception:
                pass
            finally:
                # The download may have failed before the file was ever created.
                if os.path.exists(file_name):
                    os.remove(file_name)
        data_store.insert_xkcd_meta(comic_id, json.dumps(j), str(hash_avg), str(hash_d), str(hash_p))

    def _get_xkcd_json(self, comic_id):
        # xkcd #404 intentionally does not exist; return a stub so the index walk does not stop there.
        if int(comic_id) == 404:
            return {'title': '404', 'transcript': '404', 'alt': '404'}
        try:
            response = urllib2.urlopen(XKCD_JSON_API_URL.format(comic_id=comic_id))
            body = response.read()
            return json.loads(body)
        except Exception:
            return None
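# --- Illustrative usage (not part of the original module) -------------------
# A minimal sketch of how the fetcher might be poked at directly. Only methods
# that do not touch the bot's data store are used, so None stands in for the
# bot; XKCD_JSON_API_URL and XKCD_EXPLAINED_URL are assumed to be the module
# constants referenced above, and comic 353 is just an arbitrary example id.
if __name__ == '__main__':
    fetcher = XkcdFetcher(bot=None)
    comic = fetcher._get_xkcd_json(353)   # hits the live xkcd JSON endpoint
    if comic:
        print(comic.get('title'))
        print(comic.get('alt'))
    print(fetcher.get_explained_link(353))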