def get_articles(profile):
    session = requests_html.HTMLSession()
    medium_link = f'https://medium.com/feed/@{profile}'.replace('@@', '@')
    r = session.get(medium_link)
    for i in r.html.find('channel')[0].find('item'):
        link = i.find('link')[0].html.replace('<link/>', '')
        pub_date = i.find('pubDate')[0].text
        categories = i.find('category')
        yield link, get_date_formatted(pub_date), categories
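A minimal caller sketch for the generator above; the profile name is illustrative, and `get_date_formatted` is assumed to be defined alongside the function.

# Hypothetical usage: iterate over the Medium feed entries lazily.
for link, published, categories in get_articles('some-profile'):
    print(published, link, [c.text for c in categories])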
def getPricesOnline():
    for key in stocks:
        url = 'https://in.finance.yahoo.com/quote/' + key
        session = requests_html.HTMLSession()
        r = session.get(url)
        content = BeautifulSoup(r.content, 'lxml')
        price = str(content).split('data-reactid="32"')[4].split('</span>')[0].replace('>', '')
        price = float(price.replace(',', ''))
        stocks[key] = price
def __init__(self):
    self.db_init = DBInit()
    self.edb_dao = EDBDao(self.db_init.session)
    self.session = requests_html.HTMLSession()
    self.session.keep_alive = False
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36',
    }
def __init__(self):
    """ Crawls. """
    self.http_session = requests_html.HTMLSession()
    logger.debug("Sending GET request to start_url...")
    self.start_response = self.http_session.get(url=config.start_url)
    logger.debug("Received %s response.", self.start_response.status_code)
    self.request_count = 1
def __init__(self, docentes_dataset, basename='scholar', delay_bounds_secs=(7, 27),
             short_fraction=4, **kwargs):
    super().__init__(basename + '.csv', None, **kwargs)
    self.basename = basename
    self.docentes_dataset = docentes_dataset
    self.delay_bounds_secs = delay_bounds_secs
    self.short_fraction = short_fraction
    self.session = requests_html.HTMLSession()
    self.delay_pending = False
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word) or _skip_date(date, config.cut_off_date):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            yield word, pron
def getkvalue(keywords):
    s = requests_html.HTMLSession()
    url = "http://search.anccnet.com/searchResult2.aspx?keyword={}".format(keywords)
    header = {
        "referer": "http://www.anccnet.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
    }
    rsp = s.get(url, headers=header)
    k = rsp.html.find('#yanzhengLabel')[0].attrs['data-site-key']
    s.close()
    return k
def __init__(self):
    db_init = DBInit()
    self.msf_dao = MSFDao(db_init.session)
    self.session = requests_html.HTMLSession()
    self.session.keep_alive = False
    self.headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.170 Safari/537.36',
    }
    logging.basicConfig(level=logging.INFO)
def __init__(self):
    """Initialize the class, session, etc."""
    self.logged_in = False
    self.session = requests_html.HTMLSession(mock_browser=False)
    self.session.headers['User-Agent'] = constants.REQ_USER_AGENT
    self.ibank_url = constants.BNI_IBANK_URL
    self.ib_account_name = ''
    self.first_url = ''
    self.last_req = None
    self.session_ttl = constants.SESSION_TTL
    self.session_time = None
def start_requests(self):
    urls = []
    cur_url = "http://club.xywy.com/keshi/{}.html"
    sess = requests_html.HTMLSession()
    for i in range(1, 65):
        response = sess.get(cur_url.format(i))
        tmp_urls = response.html.xpath(
            "//ul[@class='club_Date clearfix']/li/a/@href")
        urls += tmp_urls
    for url in urls:
        yield scrapy.Request(url, callback=self.parse)
def __init__(self):
    # HTMLSession lives in the requests_html package, not in requests itself
    self.session = requests_html.HTMLSession()
    self.token = None
    self.datalist = list()
    self.headers_data = {
        'Host': 'webshare.cz',
        'Referer': Webshare.webshare_url + '/',
        'Origin': Webshare.webshare_url,
        'X-Requested-With': 'XMLHttpRequest',
        'Accept': 'text/xml; charset=UTF-8'
    }
def get_sse_one_page(page_num: int = 1, page_size: int = 15) -> pd.DataFrame:
    ss = requests_html.HTMLSession()
    ss.headers['Referer'] = 'http://www.sse.com.cn/disclosure/credibility/supervision/inquiries/'
    url = f'http://query.sse.com.cn/commonSoaQuery.do?siteId=28&sqlId=BS_KCB_GGLL&channelId=10743%2C10744%2C10012&extGGDL=&order=createTime|desc%2Cstockcode|asc&isPagination=true&pageHelp.pageSize={page_size}&pageHelp.pageNo={page_num}'
    res = ss.get(url)
    df = pd.DataFrame(res.json()['result'])
    df = df[[
        'stockcode', 'extGSJC', 'cmsOpDate', 'extWTFL', 'docTitle', 'docURL'
    ]]
    df['docURL'] = df['docURL'].apply(lambda x: x.split('/')[-1])
    return df
def requests(url_list, progressbar):
    html = []
    with requests_html.HTMLSession() as session:
        for url in url_list:
            r = session.get(url)
            html.append(r.text)
            progressbar.UpdateBar(
                round(100 / len(url_list) * (url_list.index(url) + 1)))
    return html
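A self-contained sketch of how the helper above might be driven; the progress-bar object is a hypothetical stand-in, since the function only relies on an UpdateBar(percent) method.

# Hypothetical progress sink: any object exposing UpdateBar(percent) works,
# e.g. a GUI progress-bar element.
class PrintBar:
    def UpdateBar(self, percent):
        print(f'{percent}% fetched')

pages = requests(['https://example.com', 'https://example.org'], PrintBar())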
def htmlRequest(url: str) -> bytes:
    nvlog.debug("Starting requests session...")
    session = requests_html.HTMLSession()
    nvlog.info(f"Making GET request: url={url}")
    response = session.get(url)
    nvlog.info(f"Response received ({response.status_code})")
    nvlog.debug("Rendering dynamic content...")
    response.html.render()
    nvlog.debug("Rendered dynamic content")
    session.close()
    nvlog.debug("Session closed")
    # The raw rendered markup is returned, so the annotation is bytes rather than HTMLResponse
    return response.html.raw_html
def index_request(self):
    session = requests_html.HTMLSession()
    r = session.get(
        'https://cl.cfbf.xyz/thread0806.php?fid=20&search=&page=2')
    novel_list = r.html.find(
        '#ajaxtable > tbody:nth-child(2) > tr > td.tal > h3')
    novel_list = novel_list[5:-1]
    for x in novel_list:
        title = x.text
        for reallink in x.links:
            url = 'https://cl.cfbf.xyz/' + reallink
            self.content_request(title, url)
def login(self, username, password):
    """ Creates authenticated session. """
    self._html_session = requests_html.HTMLSession()
    self._html_session.get(
        url='https://api.librus.pl/OAuth/Authorization?client_id=46&response_type=code&scope=mydata')
    response = self._html_session.post(
        url='https://api.librus.pl/OAuth/Authorization?client_id=46',
        data={'action': 'login', 'login': username, 'pass': password})
    if not response.json().get('status') == 'ok' or not response.json().get('goTo'):
        raise RuntimeError("Login failed")
    self._html_session.get(url=urljoin(response.url, response.json()['goTo']))
def main():
    with requests_html.HTMLSession() as sess:
        res = sess.get(SCRAPE_URL)
        state_links = get_state_links(res.html)
        states = list(state_links.keys())
        for state in states:
            global_storage[state] = list()
        for package in bunch(states, 5):
            aggregate_zip_codes(state_links, package)
def regular_season(requests_date):
    with requests_html.HTMLSession() as session:
        session.headers = headers
        session.mount('http://', HTTPAdapter(max_retries=5))
        session.mount('https://', HTTPAdapter(max_retries=5))
        # Year/month index of the regular season to scrape
        url_format = 'http://nba.win0168.com/jsData/matchResult/%s/l1_1_20%s_10.js?version=2018112112' % (
            requests_date, requests_date[:2])
        r = session.get(url_format, timeout=6)
        # Parse the year-month list
        year_month = map(lambda x: x.split(','),
                         r.html.search('ymList = [[{}]];')[0].split('],['))
        workbook = xw.Workbook('NBA.xlsx')
        worksheet = workbook.add_worksheet()
        worksheet.set_column('A:A', 15)
        worksheet.set_column('H:H', 15)
        worksheet.set_column('I:I', 15)
        worksheet.set_column('J:J', 15)
        worksheet.write(0, 0, "赛事")
        worksheet.write(0, 1, "时间")
        worksheet.write(0, 2, "队伍1")
        worksheet.write(0, 3, "队伍2")
        worksheet.write(0, 4, "第一节")
        worksheet.write(0, 5, "第二节")
        worksheet.write(0, 6, "第三节")
        worksheet.write(0, 7, "第四节")
        worksheet.write(0, 8, "第一节")
        worksheet.write(0, 9, "第二节")
        worksheet.write(0, 10, "第三节")
        worksheet.write(0, 11, "第四节")
        worksheet.write(0, 12, "半场")
        worksheet.write(0, 13, "全场")
        starkey1 = 0
        for ym in year_month:
            # if ym == ['2018', '12']:
            #     return 0
            # Match-result file for this year-month (format: year - month)
            url = 'http://nba.win0168.com/jsData/matchResult/%s/l1_1_%s_%s.js?version=2018112112' % (
                requests_date, ym[0], ym[1])
            r = session.get(url, timeout=6)
            # Game ids for this year-month
            play_id = map(lambda x: x.split(',')[0],
                          r.html.search('arrData = [[{}]];')[0].split('],['))
            play_id = list(play_id)
            # No data after the current game; trim the list
            # if ym == ['2018', '11']:
            #     play_id = play_id[:play_id.index('325827')]
            # Fill the worksheet
            starkey1 = create_execl(play_id, worksheet, session, starkey1)
            print('完成', ym[0], ym[1])
        # Close and save
        workbook.close()
def favicon(website):
    session = requests_html.HTMLSession()
    try:
        r = session.get(website, headers={'user-agent': USER_AGENT}, timeout=TIMEOUT)
    except requests.exceptions.RequestException:
        r = None
    try:
        if r:
            favicon = r.html.find('link[rel="shortcut icon"]', first=True)
            if favicon is not None and "href" in favicon.attrs:
                return urllib.parse.urljoin(website, favicon.attrs['href'])
            favicon = r.html.find('link[rel="icon"]', first=True)
            if favicon is not None:
                return urllib.parse.urljoin(website, favicon.attrs['href'])
    except:  # LATER: logging
        r = None
    favicon = urllib.parse.urljoin(website, "favicon.ico")
    try:
        r = requests.get(favicon, headers={'user-agent': USER_AGENT}, timeout=TIMEOUT)
    except requests.exceptions.RequestException:
        return "-"
    if r.status_code == 200 and 'content-type' in r.headers and r.headers[
            'content-type'].startswith("image/"):
        return favicon
    parts = urllib.parse.urlparse(website)
    if parts.path != '/':
        favicon = urllib.parse.urlunparse(
            (parts.scheme, parts.netloc, "/favicon.ico", "", "", ""))
        try:
            r = requests.get(favicon, headers={'user-agent': USER_AGENT}, timeout=TIMEOUT)
        except requests.exceptions.RequestException:
            return "-"
        if r.status_code == 200 and r.headers['content-type'].startswith("image/"):
            return favicon
    return "-"
def login(username, password):
    user_data = {
        'account_name': username,
        'user_pwd': password,
        'remember_me': 'N'
    }
    session = requests_html.HTMLSession()
    login_res = session.post(
        "https://www.mosoteach.cn/web/index.php?c=passport&m=account_login",
        data=user_data)
    if login_res.json()['result_code'] == 1007 or login_res.json()['result_code'] == 1001:
        return None
    else:
        return login_res.cookies
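A hedged follow-up sketch: the cookie jar returned by login() can be attached to later requests. The credentials and the second URL are illustrative only.

# Hypothetical authenticated follow-up using the returned cookies.
cookies = login('user@example.com', 'secret')
if cookies is not None:
    session = requests_html.HTMLSession()
    r = session.get('https://www.mosoteach.cn/web/index.php?c=interaction', cookies=cookies)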
def _get_rendered_html_handler(url):
    '''
    get the rendered html version of the site -
    includes the modifications JS makes dynamically
    '''
    try:
        session = requests_html.HTMLSession()
        raw_html = session.get(url)
        raw_html.html.render()
        return BeautifulSoup(raw_html.html.html, 'html.parser')
    except requests.exceptions.ConnectionError:
        raise ValueError("Can't open playlist URL. Please check the URL again")
def content_request(self, title, url):
    session = requests_html.HTMLSession()
    r = session.get(url)
    content = r.html.find('#main > div:nth-child(4)')
    content = 'url:' + url + '\n' + content[0].text
    fileName = 'D:/cl/' + title + ".txt"
    if not os.path.exists('D:/cl'):
        os.mkdir('D:/cl')
    print("正在保存小说文件:" + fileName)
    with open(fileName, "w", encoding="utf-8") as f:
        f.write(content)
def get_page(url: str) -> Response:
    headers: dict = {"User-Agent": requests_html.user_agent()}
    with requests_html.HTMLSession() as s:
        resp: Response = s.get(url, headers=headers)
        try:
            resp.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(e)
            return None
        return resp
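A short caller sketch: get_page returns None when raise_for_status() fails, so callers should guard before touching .html. The URL is illustrative.

# Hypothetical usage with the None guard.
page = get_page('https://example.com')
if page is not None:
    title = page.html.find('title', first=True)
    print(title.text if title else 'no <title> found')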
def __init__(self, encoding: str = None, headers=None, cookies: str = None):
    self._session = s = requests_html.HTMLSession()
    s.mount('http://', requests.adapters.HTTPAdapter(max_retries=5))
    s.mount('https://', requests.adapters.HTTPAdapter(max_retries=5))
    self._encoding = encoding
    if headers:
        self._session.headers.update(headers)
    if cookies:
        cs = {}
        for item in cookies.split(';'):
            key, value = item.split('=', 1)
            cs[key.strip()] = value.strip()
        # `session` was undefined here; the cookie jar belongs on the session created above
        s.cookies = requests.cookies.cookiejar_from_dict(cs)
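An illustrative instantiation of the constructor above; the class name Fetcher and all argument values are assumptions, not taken from the source.

# Hypothetical wrapper class name and values; shows the expected cookie-string format.
fetcher = Fetcher(encoding='utf-8',
                  headers={'User-Agent': 'Mozilla/5.0'},
                  cookies='sessionid=abc123; lang=en')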
def redfin(url=None):
    if url is None:
        url = 'https://www.redfin.com/stingray/api/gis-csv?al=1&market=dc&max_price=500000&min_stories=1&num_homes=350&ord=redfin-recommended-asc&page_number=1&region_id=20065&region_type=6&sf=1,2,3,5,6,7&status=9&uipt=1,2,3,4,5,6&v=8'
        # url = 'https://www.redfin.com/city/20065/MD/Upper-Marlboro/filter/max-price=500k'
    with requests_html.HTMLSession() as session:
        r = session.get(url)
        # r.html.render()
        df = pd.read_csv(url)
        df.columns = df.columns.str.strip().str.lower().str.replace(
            ' ', '_').str.replace('(', '').str.replace(')', '')
        # data = pd.read_html(r.html.html)
    return df
def get_full_article(self):
    """Get the full article from the url """
    art_sess = requests_html.HTMLSession()
    art_res = art_sess.get(self.url)
    try:
        self.date = art_res.html.find('time', first=True).attrs['datetime']
        full_text = []
        for el in art_res.html.find('.article-main-body', first=True).find('p'):
            full_text.append(el.text)
        self.text = '\n'.join(full_text)
    except KeyError as e:
        print(f'KeyError: {e}')
def __init__(self, mlock, iqueue, oqueue, max_retry=3):
    super(GetWordProcess, self).__init__()
    self.lock = mlock
    self.iqueue = iqueue
    self.oqueue = oqueue
    self.lhp = LexicoHTMLParser()
    self.words_to_get = list()
    self.sess = requests_html.HTMLSession()
    self.max_retry = max_retry
    self.max_sleep = 3
    self.t_sleep = 20
    self.words_failed = dict()
    self.words_collected = list()
async def lucario(ctx):
    """posts a random (safe) Lucario image from e621"""
    response = requests_html.HTMLSession().get(
        "https://e621.net/posts.json?"
        "tags=lucario+rating:safe+score:%3E=50+-type:mp4+-type:swf+-type:webm+-type:zip+order:random&limit=1"
    )
    json = response.json()
    post = next(iter(json["posts"]))
    id = post["id"]
    url = post["file"]["url"]
    embed = discord.Embed(title="a wild lucario appeared!")
    embed.set_image(url=post["file"]["url"])
    embed.set_author(name=post['id'], icon_url="https://e621.net/favicon.ico")
    await ctx.send(embed=embed)
def handle(self, *args, **options):
    session = requests_html.HTMLSession()
    r = session.get(CCEW_DATA_URL)
    for d in r.html.find("blockquote.download"):
        for p in d.find("p"):
            if "Charity register extract" in p.text:
                links = p.absolute_links
                for link in links:
                    f, created = CcewDataFile.objects.update_or_create(
                        title=d.find("h4", first=True).text,
                        defaults=dict(url=link, description=p.text),
                    )
                    print("{} ({})".format(
                        f, "created" if created else "updated"))
def _scrape_once(data, config: Config) -> Iterator[WordPronPair]:
    session = requests_html.HTMLSession()
    for member in data["query"]["categorymembers"]:
        word = member["title"]
        date = member["timestamp"]
        if _skip_word(word, config.no_skip_spaces_word) or _skip_date(
            date, config.cut_off_date
        ):
            continue
        request = session.get(_PAGE_TEMPLATE.format(word=word), timeout=10)
        for word, pron in config.extract_word_pron(word, request, config):
            # Pronunciation processing is done in NFD-space;
            # we convert back to NFC afterwards.
            yield word, unicodedata.normalize("NFC", pron)