def test_url_save_guess_file(self):
    """Check that url_save_guess_file derives the same filename MD5SUMS lists first."""
    listing = url_get_content(
        URL_DEBIAN_CD_PATH.format('MD5SUMS'), fake_headers())
    md5, file = listing.splitlines()[0].split()
    Log.d(TAG, 'md5={}, file={}'.format(md5, file))
    guessed = url_save_guess_file(URL_DEBIAN_CD_PATH.format(file))[0]
    self.assertEqual(file, guessed)
def test_url_save(self):
    """Download the first file listed in MD5SUMS and verify its MD5 digest."""
    md5, file = url_get_content(
        URL_DEBIAN_CD_PATH.format('MD5SUMS'),
        fake_headers()).splitlines()[0].split()
    Log.d(TAG, 'md5={}, file={}'.format(md5, file))
    file_actual, size = url_save(
        URL_DEBIAN_CD_PATH.format(file),
        reporthook=lambda a, b: Log.d(
            TAG, '{:>5}% downloaded'.format(round(a * 100 / b, 1))))
    Log.d(TAG, 'file size: {} MiB'.format(round(size / 1024 / 1024, 1)))
    digest = hashlib.md5()
    # hash the download in 512 KiB chunks to keep memory bounded
    with open(file_actual, 'rb') as f:
        for chunk in iter(lambda: f.read(512 * 1024), b''):
            digest.update(chunk)
    self.assertEqual(md5, digest.hexdigest())
async def do_get(word: str) -> List[str]:
    """Fetch one card for *word*, bounded by the shared semaphore.

    Runs the blocking ``self.get_card`` in the default executor. On failure
    the word is appended to ``skipped`` (under ``lock``) and None is
    returned implicitly; on success the progress bar is advanced and, for a
    first-seen headword, the card fields are returned.

    NOTE(review): relies on closure variables ``sem``, ``lock``, ``skipped``,
    ``bar``, ``visited`` and ``self`` from the enclosing scope.
    """
    async with sem:
        try:
            # get_card does synchronous urllib I/O — offload it so the
            # event loop stays responsive
            actual, fields = await asyncio.get_running_loop().run_in_executor(
                None, self.get_card, word)
        except Exception as e:
            Log.e(TAG, 'can\'t get card: "{}", {}'.format(word, e))
            async with lock:
                skipped.append(word)
            Log.e(TAG, 'skipped: "{}"'.format(word))
        else:
            async with lock:
                bar.extra = actual
                bar.increment()
                # dedupe on the redirected headword: only the first fetch of
                # a given card yields fields; repeats return None and are
                # filtered out by the caller
                if actual not in visited:
                    visited.add(word)
                    visited.add(actual)
                    return fields
def get_card(self, word: str) -> Tuple[str, List[str]]:
    """Query the dictionary for *word*; return (canonical headword, card fields).

    Follows the server redirect to the canonical entry and raises
    WordNotFoundError when the final URL path has no headword segment.
    """
    Log.i(TAG, 'querying "{}"'.format(word))
    response = urlopen_with_retry(
        URL_QUERY.format(urllib.parse.quote(word.replace('/', ' '))),
        fake_headers())
    # the final URL's last path segment is the dash-separated headword
    last_segment = urllib.parse.urlsplit(
        response.geturl()).path.rsplit('/', 1)[-1]
    actual = last_segment.replace('-', ' ')
    if not actual:
        raise WordNotFoundError('can\'t find: "{}"'.format(word))
    if actual != word:
        Log.i(TAG, 'redirected to: "{}"'.format(actual))
    content = url_get_content(response)
    fields = self._extract_fields(content)
    Log.i(TAG, 'parsed: "{}"'.format(actual))
    return actual, fields
def generate_cards(self, *words: str):
    """Concurrently fetch cards for *words* and append them to the cards CSV.

    Lookups run through a bounded executor pool; failed words are collected
    in ``skipped`` and reported at the end, and duplicate headwords (after
    server redirects) produce no card.
    """
    Log.i(TAG, 'generating {} cards'.format(len(words)))
    file = valid_path(self.cards_file)
    # region Access with lock in coroutines
    visited = set()
    skipped = []
    bar = ProgressBar(len(words))
    lock = asyncio.Lock()
    # endregion

    async def do_generate():
        sem = asyncio.Semaphore(DEFAULT_CONCURRENCY)

        async def do_get(word: str) -> List[str]:
            async with sem:
                try:
                    # get_card blocks on urllib — run it in the executor
                    actual, fields = await asyncio.get_running_loop().run_in_executor(
                        None, self.get_card, word)
                except Exception as e:
                    Log.e(TAG, 'can\'t get card: "{}", {}'.format(word, e))
                    async with lock:
                        skipped.append(word)
                    Log.e(TAG, 'skipped: "{}"'.format(word))
                else:
                    async with lock:
                        bar.extra = actual
                        bar.increment()
                        # first fetch of a headword wins; repeats yield None
                        # and are filtered out below
                        if actual not in visited:
                            visited.add(word)
                            visited.add(actual)
                            return fields

        # gather all tasks to keep results stable
        return await asyncio.gather(*[do_get(w) for w in words])

    bar.update()
    cards = asyncio.run(do_generate())
    cards = [card for card in cards if card]
    bar.done()
    # newline='' is required by the csv module: it performs its own line
    # terminator handling, and a text-mode default would write '\r\r\n'
    # rows on Windows
    with open(file, 'a', encoding='utf8', newline='') as fp:
        writer = csv.writer(fp)
        writer.writerows(cards)
    Log.i(TAG, 'generated {} cards to: {}'.format(len(cards), file))
    if skipped:
        Log.e(TAG, 'skipped {} words:\n{}'.format(
            len(skipped), '\n'.join(skipped)))
def _retrieve_styling(self) -> str:
    """Download the card CSS and scripts and inline them as one HTML string.

    Saves the web font into the media folder (prefixed with '_' so Anki
    treats it as template-used) and rewrites the CSS ``url()`` reference to
    point at the local copy.

    :return: ``<style>…</style>`` plus two ``<script>`` tags.
    """
    Log.i(TAG, 'retrieving styling')
    style = url_get_content(URL_STYLE, fake_headers())
    font = url_save_guess_file(URL_FONT, fake_headers())[0]
    # add '_' to tell Anki that the file is used by template
    _font = url_save(
        URL_FONT, headers=fake_headers(),
        filename=valid_path(os.path.join(self.media_path, '_' + font)))[0]
    Log.i(TAG, 'saved font file to: {}'.format(_font))
    _font = os.path.basename(_font)
    # re.escape: the filename is data, not a pattern — without escaping, the
    # '.' in e.g. 'font.woff' matches any character
    style = re.sub(r'url\([\S]*?/{}'.format(re.escape(font)),
                   'url({}'.format(_font), style)
    style = '<style>{}</style>'.format(style)
    style += '<script type="text/javascript">{}</script>'.format(
        url_get_content(URL_AMP, fake_headers()))
    style += '<script type="text/javascript">{}</script>'.format(
        url_get_content(URL_AMP_ACCORDION, fake_headers()))
    Log.i(TAG, 'retrieved styling')
    return style
def test_url_get_content(self):
    """Smoke-test url_get_content by dumping the MD5SUMS listing."""
    content = url_get_content(
        URL_DEBIAN_CD_PATH.format('MD5SUMS'), fake_headers())
    Log.d(TAG, content)
def test_urlopen_with_retry(self):
    """Open a dictionary query URL and log the response metadata."""
    query_url = URL_CAMBRIDGE_QUERY.format(urllib.parse.quote('cater to'))
    with urlopen_with_retry(query_url, fake_headers()) as response:
        Log.d(TAG, 'headers={}'.format(response.headers))
        Log.d(TAG, 'status={}, url={}'.format(response.status, response.url))
def collapse2(h):
    """Wrap the middle capture of *h* in the collapse template, keeping head and tail."""
    match = parse_tag.match(h)
    head, body, tail = match.group(1), match.group(2), match.group(3)
    Log.d(TAG, '{}\n{}\n{}'.format(head, body, tail))
    return head + HTML_COLLAPSE.format(body) + tail
def generate_styling(self):
    """Write the prepared styling string out to the styling file."""
    Log.i(TAG, 'generating styling')
    path = valid_path(self.styling_file)
    with open(path, 'w', encoding='utf8') as out:
        out.write(self._styling)
    Log.i(TAG, 'generated styling to: {}'.format(path))
def test_findall(self):
    """Log every <a> element found in the sample HTML."""
    for element in htmls.findall(self.HTML, 'a'):
        Log.d(TAG, element)
def generate_front_template(self):
    """Write the prepared front template out to its file."""
    Log.i(TAG, 'generating front template')
    path = valid_path(self.front_template_file)
    with open(path, 'w', encoding='utf8') as out:
        out.write(self._front_template)
    Log.i(TAG, 'generated front template to: {}'.format(path))
def generate_cards(self, *words: str):
    """Sequentially fetch cards for *words* and append them (TSV) to the cards file.

    Duplicate words and previously-seen redirect targets are skipped
    silently; failed lookups are collected in ``skipped`` and reported
    at the end.
    """
    Log.i(TAG, 'trying to generate {} cards'.format(len(words)))
    visited = set()
    skipped = []
    generated = 0
    cf = valid_path(self.cards_file)
    with open(cf, 'a', encoding='utf8') as fp:
        for word in words:
            if word in visited:
                Log.i(TAG, 'skipping duplicate: "{}"'.format(word))
                continue
            try:
                actual, fields = self.get_card(word)
            except Exception as e:
                Log.e(TAG, e)
                skipped.append(word)
                Log.w(TAG, 'skipped: "{}"'.format(word))
            else:
                # separate records with a newline only when the file
                # already has content
                if fp.tell():
                    fp.write('\n')
                fp.write('\t'.join(fields))
                visited.add(word)
                visited.add(actual)
                generated += 1
    if skipped:
        Log.w(
            TAG,
            'skipped {} words:\n'.format(len(skipped)) + '\n'.join(skipped))
    # report the number of cards actually written; the previous
    # len(words) - len(skipped) over-counted skipped duplicates
    Log.i(TAG, 'generated {} cards to: {}'.format(generated, cf))
def test_removeall(self):
    """Log the sample HTML with every <a> element stripped out."""
    stripped = htmls.removeall(self.HTML, 'a')
    Log.d(TAG, stripped)
def test_sub(self):
    """Replace the matching <a> element with its inner text and log the result."""
    def strip_tags(fragment):
        # keep only the content between the opening and closing tags
        return re.sub(r'<[\s\S]*?>([\s\S]*)<[\s\S]*>', r'\g<1>', fragment)

    Log.d(TAG, htmls.sub(
        self.HTML, strip_tags, 'a', 'href="http://example.com/"'))
def generate_back_template(self):
    """Write the prepared back template out to its file."""
    Log.i(TAG, 'generating back template')
    path = valid_path(self.back_template_file)
    with open(path, 'w', encoding='utf8') as out:
        out.write(self._back_template)
    Log.i(TAG, 'generated back template to: {}'.format(path))
def test_find_positions(self):
    """Log the (start, end) index pairs of matching <a> elements."""
    positions = htmls.find_positions(
        self.HTML, 'a', 'href="http://example.org/"')
    for start, end in positions:
        Log.d(TAG, 'i={}, j={}'.format(start, end))