def __init__(self, starturl, depth=10, release=False):
    """Initialize crawler state around a single seed URL.

    starturl -- first URL to crawl; its netloc defines the crawl host.
    depth    -- maximum crawl depth (default 10).
    release  -- flag stored for later use by the crawler (default False).
    """
    # Async session shared by all subsequent page fetches.
    self.session = requests_html.AsyncHTMLSession()
    self.host = urlparse.urlparse(starturl).netloc
    self.depth = depth
    self.release = release
    # Work queue starts with the seed; completed URLs accumulate separately.
    self.urls = [starturl]
    self.urls_done = []
def aggregate_zip_codes(state_links, state_list):
    """Scrape every state concurrently, then tear the session down.

    One aggregate_helper job is scheduled per entry of *state_list*, each
    bound to the shared *state_links* mapping and a shared async session.
    A fixed 20-second pause precedes close() so in-flight renderer work
    can drain (behavior kept from the original).
    """
    asess = requests_html.AsyncHTMLSession()
    asess.run(
        *(partial(aggregate_helper, state_links, st, asess) for st in state_list)
    )
    sleep(20)
    asess.close()
async def main():
    """Fetch the Sina news roll page, render its JS, and dump the HTML.

    Writes the rendered markup to ./tmp.html in the page's own encoding,
    printing the detected encoding, final URL, and redirect flag.
    """
    s = requests_html.AsyncHTMLSession()
    SINA_URL = 'https://news.sina.com.cn/roll/'
    try:
        r = await s.get(SINA_URL)
        await r.html.arender()
        print(r.html.encoding)
        # with-block guarantees the file handle is closed even if write()
        # raises (the original left it dangling on error).
        with open("./tmp.html", "w", encoding=r.html.encoding) as f:
            f.write(r.html.html)
        print(r.url, r.is_redirect)
    finally:
        # Close the session's headless browser; mirrors the pattern the
        # file already uses in get_news_links_from_page_url.
        await s.close()
async def get_news_links_from_page_url(url: str) -> list:
    """Return every valid hyperlink on the JS-rendered page at *url*.

    Raises Exception when *url* itself fails __isValidUrl__. The HTTP
    session is always closed, success or failure.
    """
    if not __isValidUrl__(url):
        raise Exception(f"not valid url :|{url}|")
    s = requests_html.AsyncHTMLSession()
    try:
        r = await s.get(url)
        await r.html.arender()
        # Normalize the payload to UTF-8 text. The original computed this
        # value and then parsed the raw markup anyway — feed it to the
        # parser as intended.
        html = r.html.html.encode(r.html.encoding).decode('utf8', 'ignore')
        bs = bs4.BeautifulSoup(html, 'html.parser')
        res = []
        for anchor in bs.find_all('a'):
            # .get() tolerates anchors with no href (the original raised
            # KeyError on them); the loop variable no longer shadows the
            # *url* parameter.
            href = anchor.get('href')
            if href and __isValidUrl__(href):
                res.append(href)
        return res
    finally:
        await s.close()
async def get_page_html(page_link: str = '', html_element: bool = True) -> lxml.html.Element:
    """Fetch a JS-rendered page and return its parsed HTML.

    Must be awaited. With html_element=True the rendered markup is parsed
    into an lxml element; with html_element=False the requests_html
    Response object is returned as-is (the original docstring claimed
    bytes here — it was wrong; the behavior is unchanged). Returns None
    when page_link is empty or the request fails.
    """
    res = None
    if page_link:
        asession = reqHTML.AsyncHTMLSession()
        try:
            # A plain browser UA avoids trivial bot blocking.
            r = await asession.get(page_link, headers={'User-Agent': 'Mozilla/5.0'})
            await r.html.arender()
            res = lxml.html.fromstring(r.html.raw_html) if html_element else r
        except requests.exceptions.RequestException as e:
            # Best-effort: log and fall through to the None return,
            # preserving the original's error handling.
            print(e)
        finally:
            # The original leaked the session's headless browser process.
            await asession.close()
    return res
def __init__(self, start_session, end_session):
    """Record the session bounds and prepare lazily-populated state.

    Trade-file URLs and asset details are resolved on demand later, so
    they begin as None and an empty cache respectively.
    """
    self._start_session = start_session
    self._end_session = end_session
    # Shared async HTTP session used for all downstream requests.
    self._session = requests_html.AsyncHTMLSession()
    # Lazy state: filled in by later lookups.
    self._trade_file_urls = None
    self._cached_asset_details = {}
from starlette.applications import Starlette
from starlette.templating import Jinja2Templates
from starlette.requests import Request
import json
import requests_html
import asyncio

templates = Jinja2Templates(directory='templates')
app = Starlette()
session = requests_html.AsyncHTMLSession()
ENDPOINT = "http://52.35.39.131:1337/text-gen/predict"


@app.route('/', methods=["GET", "POST"])
async def homepage(request: Request):
    """Serve the text-gen form on GET; collect its submission on POST."""
    if request.method == "GET":
        # Guard clause: render the empty form and bail out early.
        return templates.TemplateResponse('index.html', {
            'result': ["result will appear here"],
            "request": request,
        })
    form = await request.form()
    data = {
        "input": json.dumps({
            "text": form["text"],
            "num_words": int(form["num_words"]),
            "num_tries": 3,
        })
    }
    # NOTE(review): the POST path builds `data` but never uses or returns
    # it — the source appears truncated here; confirm against the full file.
def __init__(self, **kwargs):
    """Initialize from keyword arguments and immediately kick off run().

    Required kwargs: 'url' and 'width' (a missing key raises KeyError,
    as in the original); any extra kwargs are silently ignored.
    """
    self.session = requests_html.AsyncHTMLSession()
    self.url, self.width = kwargs['url'], kwargs['width']
    # The original design starts the work directly from the constructor.
    self.run()