# Requires: aiohttp. `QueuedPatchedAioHttpTransport` and `Transport` are
# defined elsewhere in this project.
import asyncio
import inspect
import socket
from typing import Any, Optional

from aiohttp import ClientSession, TCPConnector


class QueuedKeepaliveAioHttpTransport(QueuedPatchedAioHttpTransport):
    DNS_CACHE_TTL = 600
    DNS_CACHE = True
    TCP_CONNECTION_LIMIT = 32
    TCP_CONNECTION_LIMIT_HOST = 8
    WORKERS = 1
    QUEUE_SIZE = 1000

    def __init__(self, *args: Any, family: int = socket.AF_UNSPEC,
                 loop: Optional[asyncio.AbstractEventLoop] = None,
                 dns_cache: bool = DNS_CACHE,
                 dns_cache_ttl: int = DNS_CACHE_TTL,
                 connection_limit: int = TCP_CONNECTION_LIMIT,
                 connection_limit_host: int = TCP_CONNECTION_LIMIT_HOST,
                 workers: int = WORKERS,
                 qsize: int = QUEUE_SIZE,
                 **kwargs: Any):
        self.connection_limit = connection_limit
        self.connection_limit_host = connection_limit_host
        self.dns_cache = dns_cache
        self.dns_cache_ttl = dns_cache_ttl
        super().__init__(*args, family=family, loop=loop, keepalive=True,
                         workers=workers, qsize=qsize, **kwargs)

    def _client_session_factory(self) -> ClientSession:
        # Share one keep-alive connector across sessions; the transport,
        # not the session, owns (and closes) it.
        self.connector = TCPConnector(
            family=self.family,
            limit=self.connection_limit,
            limit_per_host=self.connection_limit_host,
            ttl_dns_cache=self.dns_cache_ttl,
            use_dns_cache=self.dns_cache,
            verify_ssl=self.verify_ssl,
        )
        return ClientSession(
            connector=self.connector,
            connector_owner=False,
        )

    async def _close(self) -> Transport:
        transport = await super()._close()
        # Connector.close() became a coroutine in newer aiohttp releases;
        # check the method itself (not its return value) and await if needed.
        if inspect.iscoroutinefunction(self.connector.close):
            await self.connector.close()
        else:
            self.connector.close()
        return transport
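
# Standalone sketch of the connector-sharing pattern used by the transport
# above: one keep-alive TCPConnector reused by sessions created with
# connector_owner=False and closed explicitly by its owner. The function
# name and URL are illustrative only, not part of the original code.
async def demo_shared_connector():
    connector = TCPConnector(limit=32, limit_per_host=8,
                             use_dns_cache=True, ttl_dns_cache=600)
    try:
        async with ClientSession(connector=connector, connector_owner=False) as session:
            async with session.get("https://example.com/") as response:
                print(response.status)
    finally:
        # Same version-tolerant close as in _close() above.
        if inspect.iscoroutinefunction(connector.close):
            await connector.close()
        else:
            connector.close()
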

# Requires: aiohttp. `parsers`, `_URL_ROOT` and `_LIST_PAGE` are defined
# elsewhere in this project; `client` is aiohttp's module-level client API
# (aiohttp.client.request).
import asyncio

from aiohttp import TCPConnector, client


class DownloadDispatcher:
    def __init__(self, config):
        self.config = config
        self.pageAdviser = parsers.PageAdviser(config)
        self._loop = asyncio.get_event_loop()
        self.pages = []
        self.pic_page_urls = []
        self.conn = TCPConnector(ssl=False, limit=10, use_dns_cache=True)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        self._loop.run_until_complete(self.conn.close())
        self._loop.close()

    def dispatch(self):
        self.pages = self.pageAdviser.get_pages()
        print("%d images are expected to be downloaded."
              % self.pageAdviser.get_pic_number())
        print("parsing list pages, %d list pages found" % len(self.pages))
        work_limit = min(len(self.pages), self.config.maxr)
        self._start_tasks(work_limit, self._handle_list_page)
        self._check_pic_number()
        work_limit = min(len(self.pic_page_urls), self.config.maxr)
        print("started to download images")
        self._start_tasks(work_limit, self._handle_pic_page)

    def _start_tasks(self, work_limit, coro_func):
        # Spawn `work_limit` independent workers. Each call creates a fresh
        # coroutine; reusing a single coroutine object would not fan out the
        # work and fails on the second await.
        tasks = [coro_func() for _ in range(work_limit)]
        self._loop.run_until_complete(asyncio.wait(tasks))

    def _check_pic_number(self):
        print("parsing list pages finished. %d pic urls found"
              % len(self.pic_page_urls))
        missed_number = self.pageAdviser.get_pic_number() - len(self.pic_page_urls)
        if missed_number > 0:
            print("%d image(s) could not be found." % missed_number)

    async def _handle_list_page(self):
        # Worker: pop list pages and collect the image page URLs they reference.
        while len(self.pages) > 0:
            page = self.pages.pop()
            content = await self._get_request_text(
                "%s%s&page=%d" % (_URL_ROOT, _LIST_PAGE, page))
            parser = parsers.ListPageParser()
            parsed_urls = parser.parse_img_urls(content)
            parser.close()
            for date_str, url in parsed_urls:
                if self.pageAdviser.is_required_img(date_str):
                    self.pic_page_urls.append((date_str, url))
            await asyncio.sleep(0.1)

    async def _handle_pic_page(self):
        # Worker: pop image page URLs, resolve the image source and download it.
        while len(self.pic_page_urls) > 0:
            img_name, url = self.pic_page_urls.pop()
            content = await self._get_request_text(_URL_ROOT + url)
            parser = parsers.ImgViewPageParser()
            src = parser.get_img_src(content)
            parser.close()
            if src:
                await self._download_img(src, img_name)
            await asyncio.sleep(0.1)

    async def _download_img(self, src, img_name):
        file_path = "%s%s.png" % (self.config.directory, img_name)
        content = None
        async with client.request("GET", src, connector=self.conn) as response:
            content = await response.read()
        if content:
            with open(file_path, 'wb') as fd:
                fd.write(content)
            print(" :=> downloaded wallpaper of date: %s" % img_name)

    async def _get_request_text(self, url):
        async with client.request("GET", url, connector=self.conn) as response:
            text = await response.text()
            return text
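
# Minimal usage sketch based on the context-manager interface above.
# `Config` stands in for whatever object carries `maxr` and `directory`
# in this project; its name and values here are illustrative only.
if __name__ == "__main__":
    config = Config(maxr=5, directory="./wallpapers/")
    with DownloadDispatcher(config) as dispatcher:
        dispatcher.dispatch()
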

# Requires: requests, aiohttp (written against an older aiohttp release:
# conn_timeout and the synchronous `with ClientSession(...)`). `call`,
# `headers2`, `replayCoro`, `postProcessChat` and `ChatEngine` are defined
# elsewhere in this project.
from asyncio import get_event_loop, new_event_loop, set_event_loop, wait
from json import JSONDecodeError, loads
from re import sub
from subprocess import Popen, run
from tempfile import NamedTemporaryFile
from time import sleep

from aiohttp import ClientSession, TCPConnector
from requests import get, post


def downloadReplay(bcst, info):
    # 1. Download replay info
    resp = call('getAccessPublic', {'broadcast_id': bcst})
    if 'cookies' not in resp:
        print('==> "%s" (@%s) has been deleted!' % (info['status'], info['username']))
        return
    cookies = {i['Name']: i['Value'] for i in resp['cookies']}
    baseUrl = '/'.join(resp['replay_url'].split('/')[:-1]) + '/chunk_%d.ts'
    m3u8 = get(resp['replay_url'], cookies=cookies).text
    nbChunks = int(m3u8.split('chunk_')[-1].split('.')[0]) + 1

    # 2. Download the TS chunks in parallel and mux them into an mp4
    set_event_loop(new_event_loop())
    connector = TCPConnector(conn_timeout=10, limit=6)
    resps = [None] * nbChunks
    with ClientSession(connector=connector, cookies=cookies, headers=headers2) as client:
        get_event_loop().run_until_complete(
            wait([replayCoro(baseUrl % i, resps, i, client) for i in range(nbChunks)]))
    connector.close()
    if None in resps:
        print('==> "%s" (@%s) has been deleted!' % (info['status'], info['username']))
        return
    with NamedTemporaryFile(suffix='.ts') as tmp:
        for chunk in resps:
            tmp.write(chunk)
        tmp.flush()
        run(['ffmpeg', '-y', '-v', 'fatal', '-i', tmp.name,
             '-bsf:a', 'aac_adtstoasc', '-c', 'copy',
             'storage/live/' + bcst + '.mp4'], check=True)
    del resps

    # 3. Generate a thumbnail and post-process the chat
    Popen(['ffmpeg', '-y', '-v', '-8', '-i', 'storage/live/' + bcst + '.mp4',
           '-vframes', '1', '-ss', '5',
           '-vf', 'crop=in_w:1/PHI*in_w, scale=-1:65',
           'storage/thumb/' + bcst + '.jpg'])
    postProcessChat(bcst)

    # 4. Download chat info (other thread?)
    cursor = None
    retries = 0
    chat = ChatEngine(bcst, info)
    while cursor != '':
        hist = post(resp['endpoint'] + '/chatapi/v1/history', json={
            'access_token': resp['access_token'],
            'cursor': cursor,
            'duration': 9999999,
            'since': 0
        })
        hist.encoding = 'utf-8'
        if hist.text.strip() == 'list room events in progress' and retries < 20:
            sleep(5)
            retries += 1
            continue
        retries = 0
        try:
            hist = loads(hist.text)
        except JSONDecodeError:
            # Try to repair broken escaping around non-ASCII characters
            # before giving up.
            try:
                hist = loads(sub(r'([\u007f-\uffff])\\*("\}?,)(\\+)', r'\1\3\2\3', hist.text))
            except JSONDecodeError:
                print('=> Retrieval of "%s" (@%s) failed with: %s (%d)'
                      % (info['status'], info['username'], repr(hist.text), hist.status_code))
                return
        for msg in hist['messages']:
            chat.parse(msg=msg)
        cursor = hist['cursor']
    chat.save()
    print('=> Ended up downloading: "%s" (@%s)' % (info['status'], info['username']))
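
# Hypothetical sketch of the `replayCoro` helper referenced above (the real
# one lives elsewhere in this project); shown only to illustrate the expected
# contract: fetch one chunk URL and store its body at index `i` of `resps`,
# leaving None on failure so the caller treats the replay as deleted.
async def replayCoro_sketch(url, resps, i, client):
    try:
        async with client.get(url) as response:
            if response.status == 200:
                resps[i] = await response.read()
    except Exception:
        # Keep resps[i] as None; downloadReplay() checks for missing chunks.
        pass
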

# Requires: aiohttp, html2text. `Redis`, `get_chatbase_url`, `utils`, `log`
# and the async-aware `cache` decorator are provided elsewhere in this project.
import re
from asyncio import gather
from functools import partial, reduce
from typing import Match, Tuple
from urllib.parse import quote

import html2text
from aiohttp import ClientResponse, ClientResponseError, ClientSession, TCPConnector


class MarkdownFormatter:
    def __init__(self, base_url, redis=None):
        self.base_url = base_url
        self.matcher = {}
        self.re = re.compile(r"(\(https?://\S+\))")
        self.connector = TCPConnector()
        self.redis: Redis = redis

    def __del__(self):
        self.connector.close()

    @cache
    async def get_shorten_link(self, url) -> Tuple[str, str]:
        if '@' in url:
            return url, url
        res = await self.get_click(url)
        # clck.ru rejected the URL (it returned an error page instead of a
        # short link), so fall back to is.gd.
        if len(res) > 30:
            try:
                res = await self.get_isgd(url)
                return url, res
            except ClientResponseError:
                return url, url
        return url, res

    async def get_click(self, url):
        try:
            chatbase = quote(get_chatbase_url(url), encoding="ascii")
            req_url = f'https://clck.ru/--?url={chatbase}'
        except UnicodeEncodeError:
            log.exception('Failed to encode url %r', url)
            req_url = url
        async with ClientSession(raise_for_status=True,
                                 connector=self.connector,
                                 connector_owner=False) as session:
            async with session.get(req_url) as r:  # type: ClientResponse
                return await r.text()

    async def get_isgd(self, url):
        try:
            chatbase = quote(get_chatbase_url(url), encoding="ascii")
            req_url = f'https://is.gd/create.php?format=simple&url={chatbase}'
        except UnicodeEncodeError:
            log.exception('Failed to encode url %r', url)
            req_url = url
        async with ClientSession(raise_for_status=True,
                                 connector=self.connector,
                                 connector_owner=False) as session:
            async with session.get(req_url) as r:  # type: ClientResponse
                return await r.text()

    def collect_matches(self, match: Match[str]):
        self.matcher[match.group(0)[1:-1]] = ""
        # re.sub() requires the replacement callable to return a string, so
        # return the match unchanged; only the side effect matters here.
        return match.group(0)

    async def convert_links(self, markdown: str) -> str:
        self.matcher = {}
        # noinspection PyTypeChecker
        self.re.sub(partial(self.collect_matches), markdown)
        tasks = [self.get_shorten_link(url) for url in self.matcher.keys()]
        res = await gather(*tasks)
        return reduce(lambda md, b: md.replace(f'({b[0]})', f'({b[1]})'), res, markdown)

    async def parse_markdown(self, html, width=269, max_length=4096) -> str:
        h = html2text.HTML2Text(baseurl=self.base_url, bodywidth=width)
        # Small workaround for <li> elements: convert them to <div> before parsing.
        html_to_parse = str(html).replace('<li', '<div').replace("</li>", "</div>")
        if '[' in html_to_parse and ']' in html_to_parse:
            html_to_parse = html_to_parse.replace('[', '{').replace(']', '}')
        html_to_parse = html_to_parse.replace('lite.mfd.ru', 'forum.mfd.ru')
        html_to_parse = utils.transform_emoji(html_to_parse)
        md = await self.convert_links(h.handle(html_to_parse).strip())
        if len(md) > max_length:
            md = f'{md[:max_length - 60]}...'
        return md
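
# Minimal usage sketch (assumptions: an async context, a reachable shortener
# service, and illustrative base_url/HTML values that are not part of the
# original code). parse_markdown() converts HTML to Markdown and shortens
# any links it finds, so this performs real HTTP requests.
async def demo_markdown_formatter():
    formatter = MarkdownFormatter(base_url="https://forum.mfd.ru")
    md = await formatter.parse_markdown(
        "<p>See <a href='https://example.com/page'>this post</a></p>")
    print(md)
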