# Shared imports for the examples in this section.
import asyncio
import logging
import os
import re
from typing import List, Tuple
from urllib.parse import quote_plus

import pyppeteer
from pyppeteer import launch
from requests_html import HTML, AsyncHTMLSession


async def logic(urls):
    try:
        # Give the calling thread its own event loop; these scrapers are
        # typically driven from a synchronous web framework.
        new_loop = asyncio.new_event_loop()
        asyncio.set_event_loop(new_loop)
        session = AsyncHTMLSession()
        browser = await launch({
            'ignoreHTTPSErrors': True,
            'headless': True,
            'handleSIGINT': False,
            'handleSIGTERM': False,
            'handleSIGHUP': False,
            'args': ['--no-sandbox', '--disable-setuid-sandbox'],
        })
        session._browser = browser
        # `fetch` is a helper defined elsewhere that scrapes the e-mail
        # addresses found on one page.
        emails1 = []
        for url in urls.split(','):
            emails = await fetch(url, session)
            for email in emails:
                emails1.append(email)
        # The scrape appears to yield each address twice in a row; keep one
        # of each pair. (The original popped from the list while iterating
        # over it, which skips entries and raises IndexError past two pairs.)
        emails1 = emails1[::2]
        returndict = {}
        for i in range(len(emails1)):
            returndict['email' + str(i + 1)] = emails1[i]
        return returndict
    except Exception as e:
        print(e)
        return []
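# Usage sketch (hypothetical URLs; assumes the `fetch` helper referenced
# above is defined). `logic` manages its own event loop, so a plain
# synchronous caller can drive it:
if __name__ == '__main__':
    result = asyncio.run(
        logic('https://example.com/contact,https://example.com/about'))
    print(result)  # e.g. {'email1': '...', 'email2': '...'}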
async def get_site(self):
    # Fresh event loop for the calling thread.
    new_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(new_loop)
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False,
    })
    session._browser = browser
    # Fetch the TMX Money quote page for this instance's ticker symbol and
    # run its JavaScript before returning the rendered response.
    url = 'https://money.tmx.com/en/quote/' + self.symbol
    resp_page = await session.get(url)
    await resp_page.html.arender()
    return resp_page
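# Usage sketch: `get_site` is written as a method, so it expects an object
# carrying a `symbol` attribute; for a quick test, a duck-typed stand-in
# works (the SimpleNamespace stand-in and the 'SHOP' symbol are
# illustrative, not part of the original class):
from types import SimpleNamespace

async def demo_get_site():
    page = await get_site(SimpleNamespace(symbol='SHOP'))
    print(page.html.find('title', first=True).text)

asyncio.run(demo_get_site())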
async def get_post(your_query_url):
    new_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(new_loop)
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False,
    })
    try:
        session._browser = browser
        resp_page = await session.get(your_query_url)
        await resp_page.html.arender()
        await browser.close()
        return resp_page
    except TimeoutError:
        # Close the browser before propagating; a bare `raise` preserves the
        # original traceback (the original re-raised a fresh TimeoutError).
        await browser.close()
        raise
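# Usage sketch: the bare `raise` above propagates the render timeout, so the
# caller decides how to recover (the URL below is hypothetical):
async def demo_get_post():
    try:
        page = await get_post('https://example.com/some-post')
        print(page.html.text[:200])
    except TimeoutError:
        print('page render timed out')

asyncio.run(demo_get_post())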
async def get_search_results_async(query: str) -> List[Tuple[str, str]]:
    encoded_query = quote_plus(query)
    session = None
    try:
        browser = await pyppeteer.launch(ignoreHTTPSErrors=False,
                                         headless=True,
                                         args=['--no-sandbox'],
                                         options=OPTIONS)
        session = AsyncHTMLSession(browser_args=OPTIONS)
        session._browser = browser
        # The library is doing sketchy stuff that breaks static analysis here.
        # noinspection PyUnresolvedReferences
        response = await session.get(SEARCH_BASE + encoded_query)
        html: HTML = response.html
        await html.arender(timeout=RENDER_TIMEOUT_SECS)
        attrs = [
            link.attrs
            for link in html.find(SIMPLE_SEARCH_RESULT_CLASS_FINGERPRINT)
        ]
        if attrs:
            pairs = ((attr["title"], attr["href"]) for attr in attrs
                     if "title" in attr and "href" in attr)
            # Pull the video codes out of the hrefs.
            return [(title, href.split("=")[-1]) for title, href in pairs]
        else:
            logging.warning(f"Scraping returned 0 results for {query}.")
            return []
    except TimeoutError:
        logging.warning(
            f"Rendering of request for {query} timed out "
            f"(max {RENDER_TIMEOUT_SECS} seconds).")
        return []
    finally:
        if session:
            await session.close()
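# The names SEARCH_BASE, RENDER_TIMEOUT_SECS, OPTIONS and
# SIMPLE_SEARCH_RESULT_CLASS_FINGERPRINT are module-level constants defined
# elsewhere in the original source. A minimal driver, assuming they are in
# scope (the query string is illustrative):
async def demo_search():
    for title, code in await get_search_results_async('lo-fi beats'):
        print(title, code)

asyncio.run(demo_search())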
async def getposts_async():
    base_url = 'https://mbasic.facebook.com'
    base_page_url = 'https://mbasic.facebook.com/%E8%BB%9F%E9%AB%94%E6%AF%8F%E6%97%A5%E6%96%B0%E8%81%9E-102888707949836/'
    urls = []
    next_page_url = base_page_url

    # Collect the post URLs by paging through the profile.
    regex_story = re.compile(r'^/story\.php.*=%2As-R$')
    regex_next_page = re.compile(r'^/profile\.php\?sectionLoadingID.*$')
    session = AsyncHTMLSession()
    browser = await pyppeteer.launch({
        'ignoreHTTPSErrors': True,
        'headless': True,
        'handleSIGINT': False,
        'handleSIGTERM': False,
        'handleSIGHUP': False,
        'args': ['--no-sandbox', '--disable-setuid-sandbox'],
    })
    session._browser = browser

    # Rebuild a cookie dict from the raw "name=value; ..." header stored in
    # the COOKIES_STR environment variable.
    cookies_str = os.environ.get('COOKIES_STR')
    cookie_dict = {}
    for kv_pair in cookies_str.split('; '):
        kv = kv_pair.split('=')
        cookie_dict[kv[0]] = kv[1]

    while True:
        resp_page = await session.get(next_page_url, cookies=cookie_dict)
        resp_page.encoding = 'utf-8'
        links = resp_page.html.links
        url_tags = [link for link in links if regex_story.search(link)]
        next_page_tags = [link for link in links if regex_next_page.search(link)]
        for url_tag in url_tags:
            urls.append('{base_url}{href}'.format(base_url=base_url,
                                                  href=url_tag))
        next_page_url = (None if len(next_page_tags) == 0 else
                         '{base_url}{href}'.format(base_url=base_url,
                                                   href=next_page_tags[0]))
        if next_page_url is None:
            break

    # Collect the post contents.
    posts = []
    for url in urls:
        resp = await session.get(url, cookies=cookie_dict)
        resp.encoding = 'utf-8'
        posts.append({
            'title': '',
            'url': url,  # the original stored '' here, dropping the link
            'content': resp.html.find('div.bc')[0].text,
        })
    return posts
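# Usage sketch: COOKIES_STR must carry a logged-in mbasic.facebook.com cookie
# header ("name1=value1; name2=value2"), e.g.:
#   export COOKIES_STR='c_user=...; xs=...'
if __name__ == '__main__':
    for post in asyncio.run(getposts_async()):
        print(post['url'], post['content'][:80])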