def get_player_stats():
    player_stats = []
    composition_stats = []
    player_hero_stats = []
    session = HTMLSession()
    res = session.get(player_stats_url())
    map_divs = res.html.find('.map-wrapper')
    table_divs = res.html.find('.side-by-side-stats')
    category = 'Allmaps'
    for div in res.html.find('.match-div > div'):
        if 'map-wrapper' in div.attrs.get('class', []):
            map_name = div.find(
                '.label-info', first=True).text.lower().replace(' ', '_')
        elif 'side-by-side-stats' in div.attrs.get('class', []):
            composition_stat, hero_stat = parse_overall_hero_stat_div(
                div, category=category, map_name=map_name)
            composition_stats += composition_stat
            player_hero_stats += hero_stat
            player_stats += parse_overall_stat_div(
                div, category=category, map_name=map_name)
        else:
            category = div.text
    write_json('stats/composition_stats.json', composition_stats)
    write_json('stats/player_hero_stats.json', player_hero_stats)
    write_json('stats/player_stats.json', player_stats)
def get_media(user):
    # the original defined this UA string but then sent 'My User Agent 1.0';
    # use the full Chrome UA it clearly intended
    ua = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
          "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36")
    headers = {
        'User-Agent': ua,
    }
    proxies = {
        'http': 'http://1.20.102.177:30106',
        'https': 'https://1.20.102.177:30106',
    }
    url = 'https://www.instagram.com/' + user
    session = HTMLSession()
    req = session.get(url, headers=headers, proxies=proxies)
    media = []
    scripts = req.html.xpath('//script[@type]')
    for s in scripts:
        content = s.text
        if "csrf_token" in content:
            # strip the trailing ';' and keep the JSON assigned to window._sharedData
            content = content[:-1].split("window._sharedData = ")[1]
            data = json.loads(content)
            user_data = data["entry_data"]["ProfilePage"][0]["graphql"]["user"]
            recent_media = user_data["edge_owner_to_timeline_media"]["edges"]
            for r in recent_media:
                media.append({
                    "username": user_data["username"],
                    "image": r["node"]["thumbnail_src"],
                    "timestamp": r["node"]["taken_at_timestamp"],
                    'permalink': r["node"]["display_url"],
                    'caption': r["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"],
                    'shortcode': r["node"]["shortcode"]
                })
    return media
def main(dir: str):
    files = [x for x in os.listdir(dir) if x.lower().endswith("json")]
    session = HTMLSession()
    os.makedirs(f"{dir}_solutions", exist_ok=True)
    for crossword_file in files:
        with open(os.path.join(dir, crossword_file)) as f:
            crossword = json.load(f)
        timestamp = crossword["dateSolutionAvailable"]
        year, month, day = parse_timestamp(timestamp)
        number = crossword["number"]
        crossword_type = crossword["crosswordType"]
        url = (f"https://www.theguardian.com/crosswords/{year}/{month}/{day}"
               f"/annotated-solutions-for-{crossword_type}-{number}")
        print(crossword["solutionAvailable"], url)
        result = session.get(url)
        if result.status_code >= 300:
            continue
        html = result.html
        relevant_divs = html.find(
            "div.content__main-column.content__main-column--article.js-content-main-column")
        if len(relevant_divs) != 1:
            print(relevant_divs)
        solutions = [x.text for x in relevant_divs[0].find("p") if x.text]
        parsed = parse_solutions(solutions)
        # save into the directory created above (the original wrote to a
        # hard-coded "crosswords/prize_solutions" path)
        save_name = os.path.join(f"{dir}_solutions", f"{number}_solution.json")
        with open(save_name, "w+") as file:
            json.dump(parsed, file, indent=4)
def show_datetime_for(name, url):
    session = HTMLSession()
    r = session.get(url)
    time = r.html.find('#ct', first=True).text        # e.g. '03:37:58'
    timezone = r.html.find('#cta', first=True).text   # e.g. 'PDT'
    date = r.html.find('#ctdat', first=True).text     # e.g. 'Saturday, 16 June 2018'
    print(f'{name:12}: {time} {date} {timezone}')
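# Usage sketch for show_datetime_for. The element ids queried above (#ct,
# #cta, #ctdat) match timeanddate.com clock pages, so these URLs are
# plausible but unverified examples.
if __name__ == '__main__':
    show_datetime_for('Seattle', 'https://www.timeanddate.com/worldclock/usa/seattle')
    show_datetime_for('Berlin', 'https://www.timeanddate.com/worldclock/germany/berlin')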
class MensaBase(object):

    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language-specific endpoints: { Language: url-string }
        self.endpoints = endpoints
        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
        self.session = HTMLSession()
        self.session.mount('https://', adapter)

    def retrieve(self, datum=None, language=None, meals=None, emojize=None) -> Plan:
        # overwrite this in subclasses
        # TODO how to make the design more pythonic?
        # In Java terms: abstract class -> two implementation classes
        pass

    # Helper method to make a language-specific request
    def do_request(self, language=Language.DE):
        resp = self.session.get(self.endpoints[language.name])
        code = resp.status_code
        if code != 200:
            logger.warning(f'Non-200 status: {code}')
        logger.debug(f'Status Code: {code}')
        return resp.html

    @staticmethod
    def _normalize_key(k: str) -> str:
        return None if not k else k.strip().lower().replace(' ', '_')

    @staticmethod
    def _strip_additives(text: str) -> str:
        # raw strings avoid invalid-escape warnings in the regexes below
        return re.sub(r'\((\s*(\d+)?[a-z]?[,.]?\s*)+\)', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        return re.sub(r'\s{2,}', ' ', text)

    @staticmethod
    def _normalize_orthography(text: str) -> str:
        return re.sub(r'\s,', ',', text)

    @staticmethod
    def _clean_text(text: str) -> str:
        return MensaBase._normalize_orthography(
            MensaBase._normalize_whitespace(
                MensaBase._strip_additives(text.strip())))

    @staticmethod
    def _text_replace(text: str) -> str:
        return re.sub('Züricher', "Zürcher", text)
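# A quick sanity check of the static text-cleaning helpers above; the menu
# line is an invented example, not data from a real endpoint.
assert MensaBase._clean_text('Zürcher Geschnetzeltes (1, 2a) , mit Rösti') == \
    'Zürcher Geschnetzeltes, mit Rösti'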
def fetch_ads(url='http://tankeogteknikk.no/qmedia/oslo.php'):
    """Crawl tankeogteknikk web site and fetch current ads"""
    try:
        r = HTMLSession().get(url)
    except RequestException:  # requests' base exception (was the undefined RequestError)
        logger.exception('failed to fetch ads')
        return []
    r.raise_for_status()  # raise exception if 404 or other non-ok http status
    subs = r.html.find('table.sub')
    ads = [_parse_sub_advert(sub) for sub in subs]
    for ad in ads:
        # use absolute and quoted urls
        ad['image'] = parse.quote(parse.urljoin(url, ad['image']), safe='/:')
    return ads
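# Hypothetical driver for fetch_ads; the 'image' key comes from
# _parse_sub_advert, which is defined elsewhere in this module.
if __name__ == '__main__':
    for ad in fetch_ads():
        print(ad['image'])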
def handle(cls, *args, **kwargs):
    session = HTMLSession()
    for operator in Operator.objects.filter(
            service__current=True, twitter='').exclude(url='').distinct():
        try:
            r = session.get(operator.url, timeout=10)
        except RequestException:
            operator.url = ''
            operator.save()
            continue
        for link in r.html.links:
            twitter = cls.get_from_link(link)
            if twitter:
                operator.twitter = twitter
                operator.save()
                break
def get_teams_and_matches():
    session = HTMLSession()
    res = session.get(owl_index_url())
    res.html.render(timeout=60)
    match_rows = res.html.find(
        '.tab-pane#past')[0].find('table')[0].find('.past-matches-row')
    updated = True
    # TODO get match data in future
    # for row in match_rows:
    #     if parse_match_row(row):
    #         updated = True
    if updated:
        teams = {td.text: td.absolute_links.pop()
                 for td in res.html.find('td.team')}
        write_json('stats/team_hero_stats.json',
                   [parse_team(team_name, team_url)
                    for team_name, team_url in teams.items()])
def fetch_hpps_streamflow(dst_dir, url=None):
    """
    Fetch streamflow data from chmi fetch_hpps_data
    """
    session = HTMLSession()
    n_charts = 0
    datatype_prefix = 'streamflow'
    agency = 'chmi'
    pagesize = 50
    n_pages = 20
    for page in range(0, n_pages):
        subpage_url = ("http://hydro.chmi.cz/hpps/hpps_oplist.php"
                       "?startpos={0}&recnum={1}").format(page * pagesize, pagesize)
        print("----------------------------------------------------")
        print(subpage_url)
        print("----------------------------------------------------")
        r = session.get(subpage_url)
        for lnk in r.html.absolute_links:
            if 'prfdyn' in lnk:
                print(lnk)
                station_seq = lnk.split('=')[-1]
                print(station_seq)
                data_dir = dst_dir / datatype_prefix / agency / station_seq
                if not os.path.exists(data_dir):
                    os.makedirs(data_dir)
                utc_timestamp_text = datetime.utcnow().strftime('%Y-%m-%dT%H0000z.html')
                html_filename = "prfdata_" + station_seq + "_" + utc_timestamp_text
                html_path = data_dir / html_filename
                # save the HTML with the seven-day table
                lnk_table = lnk.replace('prfdyn', 'prfdata')
                print(lnk_table)
                html_response = get(lnk_table)  # requests.get
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
class MM(object):

    def __init__(self):
        self.__page = 1
        self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
        self.__session = HTMLSession()
        self.__headers = {
            'Referer': 'http://www.mm131.com/qingchun/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.__imagePath = r'D:/Photo/MM'
        self.__confirmPath()

    def __confirmPath(self):
        if not os.path.exists(self.__imagePath):
            os.makedirs(self.__imagePath)

    def download(self, link, fileName):
        try:
            with open(self.__imagePath + '/' + fileName + '.jpg', 'wb') as f:
                f.write(self.__session.request(
                    'get', link, headers=self.__headers,
                    allow_redirects=False).content)
        except Exception as e:
            print(str(e))

    def parseData(self):
        start = time.time()
        while self.__page < 12:
            if self.__page == 1:
                self.__url = "http://www.mm131.com/qingchun/"
            else:
                self.__url = 'http://www.mm131.com/qingchun/list_1_{}.html'.format(self.__page)
            r = self.__session.get(self.__url)
            main = r.html.find(".main", first=True)
            dl = main.find('dl')[0]
            dds = dl.find('dd')
            for dd in dds[:-1]:
                attr = dd.find('img')[0].attrs
                imageLink = attr['src']
                title = attr['alt']
                self.download(imageLink, title)
            self.__page += 1
        end = time.time() - start
        print("Crawl time:", end)
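# Typical (blocking, single-threaded) driver for the MM crawler above.
if __name__ == '__main__':
    mm = MM()
    mm.parseData()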
def get_hero_stats():
    hero_stats = []
    session = HTMLSession()
    res = session.post(hero_stats_url(), data={
        'event[]': 86,
        'teamcompTypes': 1})
    player_heros = []
    team_heros = []
    # keys: ['gameNumber', 'roundtype', 'player', 'team', 'hero',
    #        'timePlayed', 'matchID', 'playerPic', 'playerName', 'teamPic',
    #        'nameCSFriendly', 'map', 'teamName']
    for result in res.html.search_all("heroStatsArr.concat({})"):
        player_heros += json.loads(result[0])
    # keys: ['gameNumber', 'roundtype', 'team', 'tcString',
    #        'gameWasPlayed', 'map', 'maptype', 'timePlayed', 'matchID']
    for result in res.html.search_all("teamcompsArr.concat({})"):
        team_heros += json.loads(result[0])
    write_json('stats/player_heros.json', player_heros)
    write_json('stats/team_heros.json', team_heros)
def parse_match_row(row):
    match_path = os.path.join(
        'stats', 'matches', row.attrs['matchid'] + '.json')
    if os.path.exists(match_path):
        return False
    match = {}
    session = HTMLSession()
    match_res = session.get(match_url(row.attrs['matchid']))
    render_result = match_res.html.render(timeout=600)
    print(render_result)
    team_names = [{'name': team_name_div.text,
                   'id': team_name_div.links.pop().split('id=')[-1]}
                  for team_name_div in match_res.html.find(
                      '.names-and-score', first=True).find('div')[1::2]]
    maps = []
    for map_div in match_res.html.find('.map-wrapper'):
        map_data = {'name': map_div.find('.mapname', first=True).text,
                    'teams': []}
        mapping = {'name': 3, 'score': 4, 'progress': 5, 'fights': 6, 'kills': 7}
        for i in range(1, 3):
            team_data = {}
            for key, index in mapping.items():
                team_data[key] = map_div.find('div')[index].text.split('\n')[i]
            map_data['teams'].append(team_data)
        maps.append(map_data)
    stat_divs = match_res.html.find('.side-by-side-stats')
    overall_stats = parse_stat_div(stat_divs.pop(0))
    for i, map_stat_div in enumerate(stat_divs):
        maps[i]['stats'] = parse_stat_div(map_stat_div)
    hero_stats = parse_hero_stat_div(match_res.html.find(
        '#allMapsAllRoundsAllTeams', first=True))
    hero_stats_by_team = []
    # TODO FIX the script problem
    # for team in team_names:
    #     hero_stats_by_team.append(parse_hero_stat_div(match_res.html.find(
    #         '#allMapsAllRoundsTeam' + team['id'], first=True)))
    write_json(match_path, {'maps': maps,
                            'stats': overall_stats,
                            'hero_stats': hero_stats,
                            'hero_stats_by_team': hero_stats_by_team,
                            'teams': team_names,
                            'date': row.find('td')[0].text})
    return True
def main(crossword_types: List[str]):
    session = HTMLSession()
    for crossword_type in crossword_types:
        if crossword_type not in CROSSWORDS.keys():
            raise ValueError(f"crossword type must be one of {CROSSWORDS.keys()}")
        start, end = CROSSWORDS[crossword_type]
        os.makedirs(f"crosswords/{crossword_type}", exist_ok=True)
        for crossword_no in reversed(range(start, end)):
            try:
                url = ("https://www.theguardian.com/crosswords/"
                       + crossword_type + "/" + str(crossword_no))
                result = session.get(url)
                if result.status_code >= 300:
                    continue
                html = result.html
                try:
                    relevant_divs = html.find("div.js-crossword")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]
                except (IndexError, KeyError):
                    # fall back to the grouped-clues variant of the page
                    # (the original selector had a space, which never matches)
                    relevant_divs = html.find("div.js-crossword.has-grouped-clues")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]
                clues_json = json.loads(clues)
                save_name = clues_json["id"] + ".json"
                with open(save_name, "w+") as file:
                    json.dump(clues_json, file, indent=4)
            except IndexError:
                print("couldn't find crossword no:{}".format(crossword_no))
                with open("crosswords/" + crossword_type + "/missing_ids.txt", "a+") as file:
                    file.write(str(crossword_no) + "\n")
def get_event_player_rank():
    session = HTMLSession()
    res = session.get(player_rank_url())
    table = res.html.find('table.ranking-table', first=True)
    player_ranks = []
    hero_ranks = []
    for tr in table.find('tr')[2:]:
        overall_rank = int(tr.find('td.rank', first=True).text)
        overall_rating = int(tr.find('.rating-number', first=True).text)
        team_name = tr.find('.small-team-logo',
                            first=True).attrs['title'].split(': ')[-1]
        stars = int(tr.find('.star-rating', first=True)
                    .attrs['class'][-1].replace('star', '').split('-')[0])
        info_div, heros_div = tr.find('.team-info-td > div')
        name = info_div.find('a', first=True).text
        time, sos_rank, win_percent = [div.text.split(': ')[-1]
                                       for div in info_div.find('.secondary-stats')]
        rank_data = {'overall_rank': overall_rank,
                     'overall_rating': overall_rating,
                     'team_name': team_name,
                     'stars': stars,
                     'name': name,
                     'time': time,
                     'sos_rank': int(sos_rank),
                     'win_percent': win_percent,
                     'hero_ranks': []}
        for span in heros_div.find('span.secondary-ranking'):
            hero_name = span.attrs['title'].split(' Rank:')[0].lower()
            hero_rank_by_total, hero_rating, hero_time, hero_win_percent = [
                text.split(': ')[-1] for text in span.attrs['title'].split('\n')]
            hero_rank, total_count = hero_rank_by_total.split('/')
            hero_rank_data = {'hero_name': hero_name,
                              'hero_rank_by_total': hero_rank_by_total,
                              'hero_rating': int(hero_rating),
                              'hero_time': hero_time,
                              'hero_win_percent': hero_win_percent,
                              'hero_rank': int(hero_rank),
                              'total_count': int(total_count),
                              'name': name,
                              'overall_rank': overall_rank,
                              'overall_rating': overall_rating,
                              'team_name': team_name,
                              'stars': stars}
            hero_ranks.append(hero_rank_data)
            rank_data['hero_ranks'].append(hero_rank_data)
        player_ranks.append(rank_data)
    write_json('stats/player_ranks.json', player_ranks)
    write_json('stats/hero_ranks.json', hero_ranks)
def get(spell_id):
    session = HTMLSession()
    r = session.get("https://cn.wowhead.com/spell={}".format(spell_id))
    en_url = r.html.find('link[hreflang="en"]', first=True).attrs['href']
    zh_url = r.url
    return Spell(spell_id, parse(en_url), parse(zh_url))
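# Usage sketch: 133 is an arbitrary example spell id; Spell and parse are
# defined elsewhere in this module.
spell = get(133)
print(spell)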
import os

from requests_html import HTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return session.get(url)


def test_file_get():
    r = get()
    assert r.status_code == 200


def test_css_selector():
    r = get()
    about = r.html.find('#about', first=True)
    for menu_item in (
        'About', 'Applications', 'Quotes', 'Getting Started', 'Help',
        'Python Brochure'
    ):
        assert menu_item in about.text.split('\n')
def fetch_vodagov_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from voda.gov.cz

    fetch_vodagov_charts(dst_dir='/home/jiri/meteodata',
                         agency='pod',
                         base_url='http://www.pvl.cz/portal/SaP/pc/?',
                         subpages=['oid=1', 'oid=2'],
                         datatype_prefix='streamflow')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                     http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """
    # if datatype_prefix == 'streamflow':
    #     pvl_base = 'http://sap.poh.cz/portal/SaP/pc/?'
    # else:
    #     pvl_base = 'http://sap.poh.cz/portal/Srazky/PC/?'
    session = HTMLSession()
    n_charts = 0
    for subpage in subpages:
        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)
        for lnk in r.html.absolute_links:
            # the original `if 'Mereni.aspx?id=' or 'mereni.aspx?id=' in lnk`
            # was always True; this is the intended membership test
            if 'Mereni.aspx?id=' in lnk or 'mereni.aspx?id=' in lnk:
                try:
                    r_st = session.get(lnk)
                    images = r_st.html.find('img')
                    for img in images:
                        if 'src' not in img.attrs:
                            continue
                        src = img.attrs['src']
                        if ('graf' in src or 'Graf' in src) and ('miniatury' not in src) \
                                and ("&" not in src) and (".ashx" not in src):
                            if 'maska' in src:
                                continue
                            img_src_absolute = urljoin(lnk, src)
                            img_response = get(img_src_absolute)  # requests.get
                            if img_response.status_code == 200:
                                img_dir = os.path.join(
                                    dst_dir, datatype_prefix, agency,
                                    os.path.splitext(os.path.basename(img_src_absolute))[0])
                                if not os.path.exists(img_dir):
                                    os.makedirs(img_dir)
                                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.png')
                                img_filename = os.path.basename(img_src_absolute).replace('.png', utc_timestamp_text)
                                img_path = os.path.join(img_dir, img_filename)
                                print(img_path)
                                with open(img_path, 'wb') as f:
                                    f.write(img_response.content)
                                # also save the HTML
                                html_path = img_path.replace('.png', '.html')
                                html_response = get(lnk)
                                if html_response.status_code == 200:
                                    print(html_path)
                                    with open(html_path, 'wb') as f:
                                        f.write(html_response.content)
                                n_charts += 1
                except ValueError:
                    print('ERROR fetching ' + lnk)
    return n_charts
from requests_html import HTMLSession, HTMLResponse
import urllib.request

session = HTMLSession()  # create an HTMLSession object

urls = ['http://books.toscrape.com/catalogue/page-1.html']
for i in range(1, 2):
    # the original string was missing the f prefix, so '{i}' was appended literally
    urls.append(f'http://books.toscrape.com/catalogue/page-{i}.html')

for url in urls:
    response = session.get(url)
    source = response.html  # the parsed HTML of the response
    block = source.find('ol.row', first=True)  # first=True ==> same as block[0]
    names = block.find('li h3 a', first=True)
    print(names.attrs['title'])
    # names.text gives the content of the <a> tag
    # names.attrs['href'] gives the link

titles = []
cost = []
format=LOG_FORMAT)
logger = logging.getLogger()

# Test the logger
# logger.info("Our first message!")

# In[3]:

# pip3 install requests-html
from requests_html import HTMLSession
import re  # regular expression module
import zipfile
from zipfile import ZipFile
import os

session = HTMLSession()

# input & read
logger.debug("# Read the CIK and acc_no from console")
cik = input("Please input the CIK:")
acc_no_test = input("Please input the document accession number:")
print('The CIK and Acc_no you entered is:', cik, acc_no_test)
# CIK = '51143'
# acc_no = '000005114313000007/0000051143-13-000007'

logger.debug("# Get the HTML page")
CIK = cik
acc_no = acc_no_test
html_tail = '-index.html'
url_company = "http://www.sec.gov/Archives/edgar/data/" + CIK + "/" + acc_no + html_tail
from requests_html import HTML, HTMLSession

session = HTMLSession()
req = session.get('https://httpbin.org')
# print(req.status_code)
# print(req.text)

html = req.html
title = html.find('title', first=True)
print(title.text)
print("==============================================")

wrappers_first = html.find('div.wrapper', first=True)
print(wrappers_first.html)
# print("==============================================")
# info = html.find('div.info')
# print(info)
# print("==============================================")
# for inf in info:
#     info_head = inf.find('h2', first=True)
#     print(info_head.text)
# wrappers = html.find('div.wrapper')
# print(wrappers)
# print("==============================================")
def __init__(self):
    self.showing = None
    self.coming_up = None
    self.session = HTMLSession()
import os
from datetime import date

from requests_html import HTMLSession
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Scrape Anaconda main page for latest version of Python, with link to list
    session = HTMLSession()
    url = 'https://docs.anaconda.com/anaconda/packages/pkg-docs/'
    r = session.get(url, verify=False)
    links = list(r.html.absolute_links)
    # search for latest win-64 version package list and save link
    link = ''.join(sorted([i for i in links if "win-64" in i])[-1])

    # Scrape website with Anaconda package list using latest list
    r = session.get(link, verify=False)

    # Get the table of packages
    bs = BeautifulSoup(r.text, "lxml")
    tr_elements = bs.find_all('table')

    # Reduce to package names, removing HTML formatting strings
    for row in tr_elements:
        elements = row.find_all('td')
        elements = [x.text.strip() for x in elements]

    # Every 4th item is package name in table
    pkgs = []
The following code uses the requests_html library to search for and download wallpapers.
Install with: pip install requests-html
'''
from requests_html import HTMLSession
import requests


# save the image to the picture/ directory
def save_image(url, title):
    img_response = requests.get(url)
    with open('./picture/' + title + '.jpg', 'wb') as file:
        file.write(img_response.content)


# wallpaper listing page; this one selects 1920*1080 wallpapers
url = "http://www.win4000.com/wallpaper_2358_0_10_1.html"
session = HTMLSession()
r = session.get(url)

# find the thumbnails on the page, follow each link to the detail page,
# and grab the full-size image URL there
items_img = r.html.find('ul.clearfix > li > a')
for img in items_img:
    img_url = img.attrs['href']
    if "/wallpaper_detail" in img_url:
        r = session.get(img_url)
        item_img = r.html.find('img.pic-large', first=True)
        url = item_img.attrs['src']
        title = item_img.attrs['title']
        print(url + title)
        save_image(url, title)
class F5Downloads:

    def __init__(self, username, password, default_location='IRELAND'):
        self.username = username
        self.password = password
        self.default_location = default_location
        self._session = None
        self._version_pages = None
        self.new_files = []

    @property
    def session(self):
        if not self._session:
            self._session = HTMLSession()
            self._session.post(
                'https://api-u.f5.com/auth/pub/sso/login/user',
                headers={'Content-Type': 'application/x-www-form-urlencoded'},
                data={
                    'userid': self.username,
                    'passwd': self.password,
                })
        return self._session

    def find_links(self, page, pattern):
        return [(l.text, next(iter(l.absolute_links)))
                for l in page.html.find('a')
                if l.text and l.absolute_links and re.match(pattern, l.text)]

    def follow_specific_link(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        matching_links = self.find_links(page, pattern)
        # To proceed in the chain we need exactly one match
        if len(matching_links) != 1:
            # the original log call was missing the f prefix
            logger.error(
                f'Found {len(matching_links)} matches for url {page.url} '
                f'and pattern {pattern}, unable to proceed')
            logger.error('Files found:')
            logger.error(matching_links)
            raise Exception(f'Expected exactly one link matching {pattern}')
        name, url = matching_links[0]
        logger.debug(f'Following {name} with {url}')
        return self.get_page(url)

    def pick_latest_version(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        matching_links = self.find_links(page, pattern)
        if not len(matching_links):
            raise Exception(
                f'No versions matching {pattern} found on page {page}')
        versionDict = {}
        # This is an ugly one. Treat the versions as a decimal number and
        # increase the worth of each version number by a factor of 10,
        # then return the sum
        for version, url in matching_links:
            number = version.replace('.', '')
            versionDict[number] = (version, url)
        # Pick the highest number
        version, url = versionDict[max(versionDict, key=int)]
        logger.debug(f'Picking {version} as latest version')
        return self.get_page(url)

    def follow_path(self, page, steps):
        step = steps.pop(0)
        f = step['f']
        args = step['args'] | {'page': page}
        result = f(**args)
        if not len(steps):
            return result
        elif result:
            return self.follow_path(result, steps)

    # Detect if the EULA exists and circle around it
    def get_page(self, url):
        page = self.session.get(url)
        if len(page.html.find('input#accept-eula')):
            logger.debug('EULA encountered, accepting it')
            page = self.session.get(
                url.replace('https://downloads.f5.com/esd/ecc.sv',
                            'https://downloads.f5.com/esd/eula.sv'))
        return page

    def download_files(self, **kwargs):
        page = kwargs['page']
        pattern = kwargs['pattern']
        download_folder = kwargs['download_folder']
        cb = kwargs['cb']
        # Create folders if needed
        pathlib.Path(download_folder).mkdir(parents=True, exist_ok=True)
        matching_links = self.find_links(page, pattern)
        for name, url in matching_links:
            md5_name, md5_url = next(
                iter(self.find_links(page, rf'^{name}.md5$')), (None, None))
            # Only download if there's a matching md5 file
            if not md5_name:
                raise Exception(f'No matching md5 file found for {name}')
            file_path = f'{download_folder}{name}'
            md5_path = f'{download_folder}{md5_name}'
            self.download_file(md5_path, md5_url)
            if self.md5_sum_ok(md5_path, file_path):
                logger.info('The newest file already exists on disk')
                return file_path
            else:
                self.download_file(file_path, url)
                logger.info(f'Validating {name} against the supplied md5')
                if self.md5_sum_ok(md5_path, f'{download_folder}{name}'):
                    logger.info('Downloaded file successfully')
                    if cb:
                        cb(file_path)
                    return file_path
                else:
                    raise Exception(f'Failed to download file {name}')

    def md5_sum_ok(self, md5_file, file):
        if not os.path.exists(md5_file):
            raise Exception(f'{md5_file} does not exist')
        if not os.path.exists(file):
            logger.info(f'{file} does not exist')
            return False
        with open(md5_file, 'r') as f:
            md5sum = re.sub(r' .+\n$', '', f.read())
        file_sum = self.md5(file)
        return md5sum == file_sum

    def md5(self, file_name):
        hash_md5 = hashlib.md5()
        with open(file_name, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.hexdigest()

    def download_file(self, file_path, url):
        if os.path.exists(file_path):
            os.remove(file_path)
        page = self.get_page(url)
        name, download_url = next(
            iter(self.find_links(page, rf'{self.default_location}')),
            (None, None))
        if download_url:
            logger.debug(f'Saving file as ./{file_path}')
            with self.session.get(download_url, stream=True) as r:
                r.raise_for_status()
                with open(file_path, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)

    def download_geoipdb(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {
                    'pattern': rf'BIG-IP v{version}.x.+'
                },
            }, {
                'f': self.follow_specific_link,
                'args': {
                    'pattern': r'GeoLocationUpdates',
                }
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^ip-geolocation-.+\.zip$',
                    'download_folder': f'./downloads/GeoIP/v{version}/',
                    'cb': cb
                }
            }])

    def download_latest_version(self, version, cb=None):
        return self.follow_path(
            self.get_page('https://downloads.f5.com/esd/productlines.jsp'),
            [{
                'f': self.follow_specific_link,
                'args': {
                    'pattern': rf'BIG-IP v{version}.x.+'
                },
            }, {
                'f': self.pick_latest_version,
                'args': {
                    'pattern': rf'^{version}[\.0-9]+$',
                }
            }, {
                'f': self.download_files,
                'args': {
                    'pattern': rf'^BIGIP-{version}[\.0-9]+.+iso$',
                    'download_folder': f'./downloads/BIG-IP/v{version}/',
                    'cb': cb
                }
            }])
class TwitterScrap:

    def __init__(self):
        self.session = HTMLSession()
        self.x_guest_token = None
        self.headers = {"User-Agent": USER_AGENT}

    def get_profile(self, username):
        profile = Profile()
        profile.profile_url = f"https://twitter.com/{username}/"
        self.__get_token(profile.profile_url)
        self.headers["x-guest-token"] = self.x_guest_token
        self.headers["Authorization"] = "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs=1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA"
        self.headers["Referer"] = profile.profile_url
        params = {
            'variables': json.dumps({
                'screen_name': username,
                'withHighlightedLabel': True
            }, separators=(',', ':'))
        }
        prepared_request = self.session.prepare_request(
            requests.Request(
                "GET",
                "https://api.twitter.com/graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName",
                headers=self.headers,
                params=urllib.parse.urlencode(params, quote_via=urllib.parse.quote)))
        res = self.session.send(prepared_request, allow_redirects=True, timeout=15)
        profile.parse_profile(res.json())
        return profile

    def get_tweets(self, profile: Profile, include_replies=False,
                   include_rt=False, count=40):
        prepared_request = self.session.prepare_request(
            requests.Request(
                "GET",
                f"https://twitter.com/i/api/2/timeline/profile/{profile.id}.json",
                headers=self.headers,
                params=urllib.parse.urlencode(
                    get_params(profile.id, include_replies, count),
                    quote_via=urllib.parse.quote)))
        res = self.session.send(prepared_request, allow_redirects=True, timeout=10)
        if res.status_code != 200:
            raise Exception(
                f"Could not get tweets, status_code {res.status_code}")
        data = res.json()
        tweets = parse_tweets(data, include_rt, profile.id)
        return tweets

    def __get_token(self, url):
        if self.x_guest_token is None:
            request = self.session.prepare_request(
                requests.Request("GET", url, headers=self.headers))
            res = self.session.send(request, allow_redirects=True, timeout=20)
            possible_token = re.search(
                r'decodeURIComponent\("gt=(\d+); Max-Age=10800;', res.text)
            if possible_token:
                self.x_guest_token = possible_token.group(1)
                self.session.cookies.set("gt", self.x_guest_token,
                                         domain='.twitter.com', path='/',
                                         secure=True,
                                         expires=time.time() + 10800)
            else:
                raise Exception(
                    f"Could not retrieve guest token, status_code {res.status_code}")
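# Usage sketch for TwitterScrap; the hard-coded bearer token and guest-token
# flow mirror the old Twitter web client and may be rejected by the current API.
scraper = TwitterScrap()
profile = scraper.get_profile('jack')
for tweet in scraper.get_tweets(profile, count=10):
    print(tweet)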
import os
from functools import partial

import pytest
import psutil
from pyppeteer.browser import Browser
from pyppeteer.page import Page

from requests_html import HTMLSession, AsyncHTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return session.get(url)


@pytest.fixture
def async_get(event_loop):
    """AsyncSession cannot be created global since it will create
    a different loop from pytest-asyncio."""
    async_session = AsyncHTMLSession()
    async_session.mount('file://', FileAdapter())
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return partial(async_session.get, url)
def scrape(url):
    """
    Scrape comments from Amazon.com, write them (comments and rating) into a csv file
    :param url: the link from Amazon
    :return: None
    """
    hs = HTMLSession()
    try:
        url = url.replace("dp", "product-reviews")
    except Exception as e:
        print(e)
        quit()
    r = hs.get(
        url=url,
        headers={
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7,zh-TW;q=0.6',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '******',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
        })
    comments = r.html.find('div.a-section.review.aok-relative')
    fw = open('reviews.csv', 'a', encoding='utf8')  # output file
    writer = csv.writer(fw, lineterminator='\n')
    for a in comments:
        comment, star = 'NA', 'NA'  # defaults if a field is missing
        commentChunk = a.find(
            'span.a-size-base.review-text.review-text-content > span')
        if commentChunk:
            comment = commentChunk[0].text.strip()
        starChunk = a.find('i > span.a-icon-alt')
        if starChunk:
            star = starChunk[0].text.strip()
        writer.writerow([comment, star])
    fw.close()
    sleep(.75)
    pagination(r)
    r.close()
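# Hypothetical entry point; the ASIN below is a placeholder. Note that
# str.replace swaps every occurrence of 'dp', so URLs whose other path
# segments contain 'dp' would be mangled.
if __name__ == '__main__':
    scrape('https://www.amazon.com/dp/B000000000')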
import re
from datetime import datetime

from requests_html import HTMLSession, HTML

session = HTMLSession()


def get_tweets(user, pages=25):
    """Gets tweets for a given user, via the Twitter frontend API."""
    url = (f'https://twitter.com/i/profiles/show/{user}/timeline/tweets'
           f'?include_available_features=1&include_entities=1&include_new_items_bar=true')
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': f'https://twitter.com/{user}',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8',
        'X-Twitter-Active-User': '******',
        'X-Requested-With': 'XMLHttpRequest'
    }

    def gen_tweets(pages):
        r = session.get(url, headers=headers)
        while pages > 0:
            try:
                html = HTML(html=r.json()['items_html'],
                            url='bunk', default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{user}" does not exist or is private.')
def new_search(request):
    search = request.POST.get('search')
    models.Search.objects.create(search=search)
    final_url = BASE_URL.format(quote_plus(search))
    final_url2 = BASE_URL2.format(requote_uri(search))
    print(final_url)
    print(final_url2)

    response = requests.get(final_url)
    headers = requests.utils.default_headers()
    headers.update({'user-agent': 'GoogleChrome'})

    response2 = requests.get(final_url2.replace(u'\ufeff', ''))
    headers = requests.utils.default_headers()
    headers.update({'user-agent': 'GoogleChrome'})

    session = HTMLSession()
    resp = session.get(final_url2)
    print(resp)
    soup4 = BeautifulSoup(resp.html.html, "lxml")
    # image = soup4.find_all(class_='BaseCardstyle__ListingPosterMainWrapper-LdsSD KSRNa')
    image2 = soup4.find_all(class_='img-wrapper')
    for gambar in image2:
        link_gambar = gambar.find(class_='Rumah Cimanggis Pinggir Jalan Dekat Pintu Tol di Depok, Cimanggis, Depok 1')
        print(link_gambar)

    final_post = []
    final_post2 = []
    print(final_post2)

    data = response.text
    soup = BeautifulSoup(data, features='lxml')
    post_listing = soup.find_all(class_='card ListingCell-content js-MainListings-container ListingCell-wrapper')

    data2 = response2.text
    soup2 = BeautifulSoup(data2, features='lxml')
    post_listing2 = soup2.find_all(class_='BaseCardstyle__ListingContainer-pryVa gCOzDl')
    print('*' * 100)

    for post in post_listing:
        post_title = post.find(class_='ListingCell-KeyInfo-title').text
        post_title2 = post_title.strip()
        post_link = post.find('a').get('href')
        if post.find(class_='PriceSection-FirstPrice'):
            post_price = post.find(class_='PriceSection-FirstPrice').text
        else:
            post_price = 'N/A'
        if post.find(class_='ListingCell-image'):
            post_image_id = post.find(class_='ListingCell-image').img['data-src']
        else:
            post_image_id = 'https://craigslist.org/images/peace.jpg'
        final_post.append((post_title2, post_price, post_link, post_image_id))

    # rumah123.com part
    for postrumah123 in post_listing2:
        post_titlerumah123 = postrumah123.find(class_='BaseCardstyle__ListingTitleWrapper-bFjnJr hTMjgq')
        if post_titlerumah123 is not None:
            post_titlerumah123 = post_titlerumah123.text
        else:
            post_titlerumah123 = 'rumah dijual di daerah strategis'
        post_linkrumah123 = postrumah123.find('a').get('href')
        if '.com' not in post_linkrumah123:
            post_linkrumah123_revisi = urljoin(PARENT_URL2, post_linkrumah123)
        else:
            post_linkrumah123_revisi = post_linkrumah123
        if postrumah123.find(class_='listing-primary-price ListingPrice__Wrapper-FYsEL cpaEEX'):
            post_pricerumah123 = postrumah123.find(class_='listing-primary-price-item ListingPrice__ItemWrapper-egelzL fnIFZc').get_text()
        else:
            # the original assigned post_price here, leaving post_pricerumah123 unbound
            post_pricerumah123 = 'N/A'
        final_post2.append((post_titlerumah123, post_pricerumah123, post_linkrumah123_revisi))

    for_frontend = {
        'search': search,
        'final_post': final_post,
        'final_post2': final_post2,
    }
    return render(request, 'apaan/new_search.html', for_frontend)
def search_data(self, crawl_rules, arguments):
    if crawl_rules['method'] == 'post':
        crawl_rules = self.parse_rules(crawl_rules, arguments['keyword'], 'form_data')
        crawl_rules['form_data'] = json.loads(crawl_rules['form_data'])
        res = requests.post(crawl_rules['request_url'], data=crawl_rules['form_data'])
        try:
            content = json.loads(res.text)
        except json.decoder.JSONDecodeError:
            print("Check your keyword and parameter")
            print(res.text)
            return 0
        if crawl_rules['result_list_param'] is not None:
            last_page = int(content[crawl_rules['result_list_param']][0][crawl_rules['result_total_page_param']])
            result_data = []
            for line in content[crawl_rules['result_list_param']]:
                result_data.append(
                    str(line[crawl_rules['result_code_param']]) + " "
                    + str(line[crawl_rules['result_name_param']]))
            for i in range(2, last_page + 1):
                crawl_rules['form_data'][crawl_rules['result_current_page_param']] = i
                res = requests.post(crawl_rules['request_url'], data=crawl_rules['form_data'])
                if type(res.content) is bytes:
                    content = json.loads(res.content.decode('utf8'))
                    for line in content[crawl_rules['result_list_param']]:
                        result_data.append(
                            str(line[crawl_rules['result_code_param']]) + " "
                            + str(line[crawl_rules['result_name_param']]))
    elif crawl_rules['method'] == 'get':
        crawl_rules = self.parse_rules(crawl_rules, arguments['keyword'], 'request_url')
        sess = HTMLSession()
        res = sess.get(crawl_rules['request_url'])
        try:
            res.html.render()
        except MaxRetries:
            print("MaxRetries...")
            print('Do you want to reload?')
            ans = input('(Y/N) << ').lower()
            if ans in ['yes', 'y']:
                self.get_get_data(crawl_rules, arguments)
            elif ans in ['no', 'n']:
                return 0
        soup = BeautifulSoup(res.html.html, 'lxml')
        if arguments['stage'] == "select":
            result_data = ''
            if crawl_rules['label_css_path'] is not None:
                result_data += re.sub(r"[\n]", " ", soup.select(crawl_rules['label_css_path'])[0].text) + ": "
                result_data += re.sub(r"[^\d\.%]", "", soup.select(crawl_rules['value_css_path'])[0].text)
            else:
                result_data = soup.select(crawl_rules['value_css_path'])[0].text
    return result_data
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import csv

# urls that will be parsed
urls = [
    'https://iaclarington.com/en/price-performance/fund?fund_id=4201&series=4401',
    'https://iaclarington.com/en/price-performance/fund?fund_id=4303&series=7400',
    'https://iaclarington.com/en/price-performance/fund?fund_id=4509',
    'https://iaclarington.com/en/price-performance/fund?fund_id=4215'
]

session = HTMLSession()

# handling the csv file
csv_file = open('performance.csv', 'w', newline='')
csv_writer = csv.writer(csv_file)
csv_writer.writerow([
    "Name", "Value", "Performance Date",
    "Performance (1mo)", "Performance (3mo)"
])

# iterating through the url list
for url in urls:
    print(url)
    response = session.get(url)
    response.html.render()
    soup = BeautifulSoup(response.html.html, 'lxml')
    main = soup.find('div', class_='main-content')
import numpy as np
import os.path
import cv2
import pandas as pd
import time
import json
import sys

from requests_html import HTMLSession

s = HTMLSession()

z = int(sys.argv[1])
if not os.path.exists("all_ships.zoom=%s" % z):
    os.mkdir("all_ships.zoom=%s" % z)


# A function to convert values from string to the most suitable format
# for that value - str, int or float
def mangle_type(val):
    try:
        if str(int(val)) == val.strip():
            return int(val)
    except:
        pass
    try:
        if str(float(val)) == val.strip():
            return float(val)
    except:
        pass
    return val
def parse_team(team_name, team_url):
    session = HTMLSession()
    return {'id': team_url.split('id=')[-1],
            'name': team_name,
            'heros': [{'win_rate': div.text,
                       'heros': [span.attrs['title'].replace('soldier76', 'soldier-76')
                                 for span in div.find('span')]}
                      for div in session.get(team_url).html.find(
                          '.team-comp-wrapper > .team-comp')]}
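# Sketch of calling parse_team directly; the URL shape (ending in
# 'id=<team id>') is inferred from the id-parsing above, and the domain
# and id here are placeholders.
team = parse_team('Example Team', 'https://example.com/team.php?id=42')
print(team['id'], len(team['heros']))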
def __init__(self):
    """INITIALISATION"""
    self.session = HTMLSession()  # our "virtual browser"
def parse_single_url(url, region):
    with HTMLSession() as session:
        response = session.get(url=url)
        response = response.html
    try:
        # note: 'longtitude' is the (misspelled) key used by the site itself
        result = re.findall(
            r'\"location\":{\"latitude\":\d{2}[.]\d{0,7},\"longtitude\":\d{2}[.]\d{0,7}',
            response.xpath("/html/body/script[19]/text()")[0])[0]
        latitude = re.findall(r'\"latitude\":\d{2}[.]\d{0,7}', result)[0].split(":")[1]
        longtitude = re.findall(r'\"longtitude\":\d{2}[.]\d{0,7}', result)[0].split(":")[1]
    except BaseException:
        latitude = "Null"
        longtitude = "Null"
    area = "No information"
    floor = "No information"
    try:
        for value in response.xpath(
                "//li[@class='card-living-content-params-list__item']"):
            text = value.text
            if "площадь" in text:      # "area"
                area = text.split('площадь')[1].split()[0]
            elif "Этаж" in text:       # "floor"
                floor = text.split('Этажей')[1]
    except BaseException:
        pass
    try:
        who = (response.xpath("//div[@class='offer-card-contacts__person _type']")[0].text
               + ", "
               + response.xpath("//div[@class='offer-card-contacts__person']//span[@class='ui-kit-link__inner']")[0].text)
    except BaseException:
        who = "No information"
    try:
        price = response.xpath("//div[@class='price']")[0].text.split(
            "₽")[0].replace(" ", "")
    except BaseException:
        price = "No information"
    try:
        link = response.xpath(
            "//a[@class='ui-kit-link offer-card-contacts__owner-name _type-common _color-blue']/@href")[0]
    except IndexError:
        host = url.split(".ru/")[0] + ".ru"
        link = host + response.xpath(
            "//a[@class='ui-kit-link offer-card-contacts__link _agency-name _type-common _color-blue']/@href")[0]
    try:
        contacts = response.xpath(
            "//a[@class='offer-card-contacts-phones__phone']/@href")[0].split("+")[1]
    except BaseException:
        contacts = "No information"
    # the CSV field names below are Russian listing attributes: region,
    # listing URL, sale/rent, type, latitude/longitude, area, floor, price,
    # agency name, agency link, agency phone, download date
    info = {
        "Регион": region,
        "Ссылка на объявление": url,
        "Продажа/Аренда": "Продажа",
        "Тип": "Коттеджи",
        "Широта/Долгота": latitude + ", " + longtitude,
        "Площадь": area,
        "Этаж": floor,
        "Цена": price,
        "название агенства недвижимости": who,
        "Ссылка на агенство": link,
        "Телефон агенства": contacts,
        "Дата скачивания": time,
    }
    with locker:
        with open(write_to, mode='a') as write_file:
            fieldnames = ["Регион", "Ссылка на объявление", "Продажа/Аренда",
                          "Тип", "Широта/Долгота", "Площадь", "Этаж", "Цена",
                          "название агенства недвижимости", "Ссылка на агенство",
                          "Телефон агенства", "Дата скачивания"]
            writer = csv.DictWriter(write_file, delimiter=';', fieldnames=fieldnames)
            writer.writerow(info)
class LostFilmParser:
    source_url = 'https://www.lostfilm.tv/'
    tv_shows_list_part_url = 'https://www.lostfilm.tv/ajaxik.php'
    part_step = 10

    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)

    def get_links(self):
        return self.news_data.html.links

    def get_title_en(self, href):
        try:
            result = search(r'/series/([^/]+)/', href)
            title_en = result.group(1)
            tv_show_link = self.source_url.rstrip('/') + result.group()
        except AttributeError:
            title_en = None
            tv_show_link = None
        return title_en, tv_show_link

    def get_new_shows_episodes(self):
        clear_data = []
        news_block = self.news_data.html.find('.new-movies-block', first=True)
        movies = news_block.find('a.new-movie')
        for movie in movies:
            title_en, show_link = self.get_title_en(movie.attrs['href'])
            clear_data.append(
                {
                    'title_ru': movie.attrs['title'],
                    'title_en': title_en,
                    'jpg': 'http:' + movie.find('img', first=True).attrs['src'],
                    'season': movie.find('.title', first=True).text,
                    'date': movie.find('.date', first=True).text,
                    'episode_link': self.source_url.rstrip('/') + movie.attrs['href'],
                    'tv_show_link': show_link,
                }
            )
        return clear_data

    def load_part_list(self, step):
        url = self.source_url + 'ajaxik.php'
        request_data = self.session.post(
            url=url,
            data={'act': 'serial', 'o': step, 's': 3, 't': 0, 'type': 'search'}
        )
        return json.loads(request_data.content)['data']

    def get_tv_shows_list(self):
        """Page through 10 -> 20 -> 30 -> ... until an empty list comes back."""
        step = 0
        shows_list = []
        request_result = self.load_part_list(step)
        while request_result:
            for result in request_result:
                shows_list.append(result)
            step += self.part_step
            sleep(1)
            request_result = self.load_part_list(step)
        return shows_list
class Scraper:
    # Initializes the scraper C3PO
    def __init__(self, url, budget, u_email):
        # Attributes about product
        self.url = url
        self.budget = budget
        # Setting user email
        self.u_email = u_email
        # Attributes about scraping
        self.session = HTMLSession()
        self.webpage = self.session.get(self.url).content
        self.parser = 'lxml'
        self.soup = BeautifulSoup(self.webpage, self.parser)

    # Prints the object
    def __str__(self):
        return self.soup.prettify()

    # Stores the title of the product
    def get_title(self):
        try:
            temp_title = self.soup.find('span', id='productTitle').text.strip()
            temp_list_title = []
            for x in temp_title:
                if x == '(':
                    break
                temp_list_title.append(x)
            self.product_title = ''.join(temp_list_title)
            return self.product_title
        except Exception:
            print("\n")
            print("ERROR - We weren't able to find the name of the product")
            print("\n")
            print("Exiting the script")
            exit()

    # Stores the price of the product after filtering the string and
    # converting it to an integer
    def get_price(self):
        price_raw = self.soup.find('span', id='priceblock_ourprice').text.strip()
        price_filtered = price_raw[2:len(price_raw) - 3]
        self.product_price = int(''.join(
            [x for x in price_filtered if x != ',']))
        return

    # Prints product title
    def print_title(self):
        print(self.product_title)
        return

    # Prints product price
    def print_price(self):
        print(self.product_price)
        return

    # Checks if the price of the product is below the budget
    def is_below_budget(self):
        if self.product_price <= self.budget:
            return True
        else:
            return False

    # Runs the scraper
    def run(self):
        self.get_title()
        self.get_price()
        self.alert = self.is_below_budget()
        self.status = False
        if self.alert:
            self.status = self.send_email()
        return self.status

    # Sends an email when the condition is satisfied. Under testing!
    def send_email(self):
        # Attributes for email sending
        port = 587
        smtp_server = 'smtp.gmail.com'
        self.email = str(os.environ.get('DEVELOPER_MAIL'))
        self.app_pw = str(os.environ.get('DEVELOPER_PASS'))
        # Message details
        subject = f'The price of {self.get_title()} is within your budget!'
        body_start = """Hey there!\n The price is now within your budget. Here is the link, buy it now!\n"""
        body_mid = self.url
        body_end = '\n\nRegards\nYour friendly neighbourhood programmer'
        body = str(body_start) + str(body_mid) + str(body_end)
        message = f"Subject: {subject}\n\n{body}"
        # Establishing server
        context = ssl.create_default_context()
        self.server = smtplib.SMTP(smtp_server, port)
        # Mail sending
        self.server.ehlo()
        self.server.starttls(context=context)
        self.server.ehlo()
        self.server.login(self.email, self.app_pw)
        self.server.sendmail(self.email, self.u_email, message)
        print("Email sent successfully!")
        self.server.quit()
        return True
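# Hypothetical usage; the product URL is a placeholder, and the e-mail step
# needs DEVELOPER_MAIL / DEVELOPER_PASS set in the environment.
c3po = Scraper('https://www.amazon.in/dp/B000000000', budget=15000,
               u_email='user@example.com')
print(c3po.run())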
def fetch_pmo_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from pmo (Povodi Moravy) water board

    fetch_pmo_charts(dst_dir='/home/jiri/meteodata',
                     agency='pmo',
                     base_url='http://www.pmo.cz/portal/srazky/en/',
                     subpages=['prehled_tab_1_chp.htm', 'prehled_tab_2_chp.htm',
                               'prehled_tab_3_chp.htm'],
                     datatype_prefix='precip')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                     http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """
    agency = "pmo"
    session = HTMLSession()
    n_charts = 0
    for subpage in subpages:
        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)
        anchors = r.html.find('a')
        a_hrefs = [a for a in anchors if "DoMereni" in a.attrs["href"]]
        for a in a_hrefs:
            id = a.attrs["href"].split("'")[1]
            url_html = '{:s}/en/mereni_{:s}.htm'.format(base_url, id)
            print(url_html)
            if datatype_prefix == 'precip':
                url_img = '{:s}/grafy/sr{:s}_en.gif'.format(base_url, id)
            else:
                url_img = '{:s}/grafy/{:s}.gif'.format(base_url, id)
            print(url_img)
            img_response = get(url_img)  # requests.get
            if img_response.status_code == 200:
                img_dir = os.path.join(dst_dir, datatype_prefix, agency,
                                       os.path.splitext(os.path.basename(url_img))[0])
                if not os.path.exists(img_dir):
                    os.makedirs(img_dir)
                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.gif')
                img_filename = os.path.basename(url_img).replace('.gif', utc_timestamp_text)
                img_path = os.path.join(img_dir, img_filename)
                print(img_path)
                with open(img_path, 'wb') as f:
                    f.write(img_response.content)
                n_charts += 1
                # also save the HTML
                html_path = img_path.replace('.gif', '.htm')
                html_response = get(url_html)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
    return n_charts
from requests_html import HTMLSession
import time
import sys

session = HTMLSession()

pages = 1
nxt = "https://old.reddit.com/r/ShrugLifeSyndicate/"
follow_posts = True

while pages <= 10:
    r = session.get(nxt)
    articles = r.html.find("article")
    titles = r.html.find(".title")
    for t in titles:
        title = t.text
        url = t.attrs.get("href")
        if url:
            print(t.text)
            if follow_posts:
                r = session.get("https://old.reddit.com/" + url)
                body = r.html.find(".usertext-body")
                # the first usertext-body is the sidebar
                if len(body) > 1:
                    print(body[1].text)
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://reddit.com')

# HTML objects are iterable: each iteration follows the page's "next" link
for html in r.html:
    print(html)
def handle(self, *args, **options):
    print('Start')
    Enterprises.objects.all().delete()
    session = HTMLSession()
    resp = session.get('http://west-info.ua/katalog-predpriyatij/')
    a = 1
    start_urls = []
    while True:
        b = 1
        try:
            links = resp.html.xpath(
                f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/a/text()')
            print('Category: {}'.format(links[0]))
            while True:
                try:
                    link = resp.html.xpath(
                        f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/ul/li[{b}]/a')
                    print('Subcategory -- {}'.format(link[0]))
                    url = str(link[0]).split("href=")[1].rstrip('\'>').lstrip("\'")
                    dom = "http://west-info.ua"
                    full_url = f"{dom}{url}"
                    cat = resp.html.xpath(
                        f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/a/text()')
                    pod = resp.html.xpath(
                        f'/html/body/header/div[3]/div/nav/ul/li[2]/ul/li[{a}]/ul/li[{b}]/a/text()')
                    page = 1
                    page_one = 1
                    while True:
                        try:
                            max_page = []
                            i = 20
                            while i >= 0:
                                full_url_page = f"{full_url}?page={page_one}"
                                session = HTMLSession()
                                resp = session.get(full_url_page)
                                tit = resp.html.xpath(
                                    '/html/body/main/div/section/div/ul/li[5]/a/text()')
                                max_page.append(tit)
                                i -= 1
                                page_one += 1
                            max_page = int(str(max_page[-1]).strip("[']"))
                            if page <= max_page:
                                full_url_page = f"{full_url}?page={page}"
                                session = HTMLSession()
                                resp = session.get(full_url_page)
                                print(full_url_page)
                                try:
                                    code = resp.text
                                    company_url = re.findall(r'<a href="/company/.+', code)
                                    for i in company_url:
                                        url = i.lstrip('<a href="').rstrip()
                                        url = url.split("/")
                                        url = f'{dom}/{url[1]}/{url[2]}/'
                                        print(url)
                                        try:
                                            session = HTMLSession()
                                            resp = session.get(url)
                                            code = resp.text
                                            company_name = re.findall(
                                                r'<h1 class="main-ttl"><span>.+', code)
                                            company_name = company_name[0].split('span')
                                            company_name = company_name[1]
                                            company_name = company_name.strip('></')
                                            print('Name: ', company_name)
                                            city = re.findall(r'<b>Город:</b>.+', code)
                                            city = city[0].split('<')
                                            city = city[2]
                                            city = city.split('>')
                                            city = city[1]
                                            print('City: ', city)
                                            phone = re.findall(r'<b>Телефон</b> .+', code)
                                            phone = phone[0].split('<b>Телефон</b> ')
                                            phone = phone[1]
                                            phone = phone.split('</')
                                            phone = phone[0]
                                            print('Phones: ', phone)
                                            content = re.findall(r'<p>.+', code)
                                            content = content[0]
                                            content = content.split('p>')
                                            content = content[1]
                                            content = content.split('<')
                                            content = content[0]
                                            print('Content:', content)
                                            address = re.findall(r'Адрес:</b> .+', code)
                                            address = address[0].split('Адрес:</b> ')
                                            address = address[1]
                                            address = address.split('</')
                                            address = address[0]
                                            print('Address: ', address)
                                            image = re.findall(
                                                r'/admin/uploads/products/images/.+', code)
                                            image = image[0]
                                            image = image.split('"')
                                            image = image[0]
                                            image = f'{dom}{image}'
                                            print('Image: ', image)
                                            subcat = Subcategory.objects.all()
                                            firm = Enterprises()
                                            firm.name = company_name
                                            firm.content = content
                                            firm.image = image
                                            firm.phone = phone
                                            firm.address = address
                                            for i in subcat:
                                                if i.name == pod[0]:
                                                    firm.subcategory = i
                                            # firm.category = cat
                                            firm.save()
                                        except:
                                            break
                                except:
                                    break
                                page += 1
                            else:
                                break
                        except:
                            break
                    b += 1
                except:
                    break
            a += 1
        except:
            break
    print(start_urls)
class Switter:

    def __init__(self, *, proxies: Optional[Dict[str, str]] = None):
        self._session = HTMLSession()
        self._session.headers.update({'User-Agent': _CHROME_USER_AGENT})
        if proxies:
            self._session.proxies.update(proxies)
        self._enable_legacy_site()

    def _enable_legacy_site(self):
        self._session.cookies.set('m5', 'off')

    def _profile_html(self, screen_name: str) -> HTML:
        url = f'https://twitter.com/{screen_name}'
        response = self._session.get(url)
        response.raise_for_status()
        return response.html

    def _search_json(self, query: str, max_position: Optional[int] = None) -> dict:
        url = 'https://twitter.com/i/search/timeline'
        response = self._session.get(url, params={
            'q': query,
            'f': 'tweets',
            'max_position': max_position or -1
        })
        response.raise_for_status()
        return response.json()

    def profile(self, screen_name: str) -> dict:
        document = self._profile_html(screen_name)
        data = json.loads(
            html.unescape(
                document.find('input.json-data[id=init-data][type=hidden]',
                              first=True).attrs['value']))
        user = data['profile_user']
        date_format = r'%a %b %d %H:%M:%S %z %Y'
        return dict(
            id=user['id'],
            name=user['name'],
            screen_name=user['screen_name'],
            location=user['location'],
            website=user['url'],
            description=user['description'],
            created_at=datetime.datetime.strptime(user['created_at'], date_format),
            following_count=user['friends_count'],
            followers_count=user['followers_count'],
            favorites_count=user['favourites_count'],
            tweets_count=user['statuses_count'],
            private=user['protected'],
        )

    def followers(self, screen_name: str) -> Iterable[str]:
        cursor: Optional[int] = INITIAL_CURSOR
        while cursor is not None:
            screen_names, cursor = self.followers_page(screen_name, cursor)
            yield from screen_names

    def followers_page(
            self, screen_name: str,
            cursor: int = INITIAL_CURSOR) -> Tuple[List[str], Optional[int]]:
        response = self._session.get(
            f'https://mobile.twitter.com/{screen_name}/followers',
            params={'cursor': cursor} if cursor != INITIAL_CURSOR else None,
        )
        response.raise_for_status()
        document = response.html
        screen_names = _parse_followers_screen_names(document)
        next_cursor = _parse_followers_cursor(document)
        return screen_names, next_cursor

    def search(self, query: str, *, limit=20) -> Iterable[dict]:
        assert limit > 0
        count = 0
        position = -1
        while True:
            data = self._search_json(query, max_position=position)
            html = HTML(html=data['items_html'])
            tweets = _extract_tweets(html)
            yield from map(_parse_tweet, tweets[:limit - count])
            count += len(tweets)
            if not data['has_more_items'] or count >= limit:
                break
            position = data['min_position']
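# Usage sketch for Switter; the endpoints above belong to the legacy Twitter
# web frontend and may no longer respond as this code expects.
switter = Switter()
print(switter.profile('jack')['followers_count'])
for tweet in switter.search('python', limit=5):
    print(tweet)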
# (truncated statement) ... 9:streaminfo.find('target="') - 2])
display.stop()

print("\nOpening top available stream...")
streamopen = False
for s in streamurls:
    if "buffstreamz.com" in s:
        os.system("xdg-open " + s + " &> /dev/null")
        streamopen = True
    elif "ripple.is" in s:
        os.system("xdg-open " + s + " &> /dev/null")
        streamopen = True
    elif "nbastreams.xyz" in s:
        for line in HTMLSession().get(s).text.split():
            if 'src="http://crackstreams.ga/nba/' in line:
                for pline in (HTMLSession().get(
                        line[line.find('src="http://crackstreams.ga/nba/') + 5:-1]).text.split()):
                    if '"http://' in pline:
                        os.system("nohup mpv " + pline[1:-2] + "&> /dev/null")
                        exit()
if not streamopen:
    os.system("xdg-open " + streamurls[0] + " &> /dev/null")
if input("Does this stream work? (y/n): ").lower() == "n":
    with open("streams.txt", "w+") as f:
        for stream in streamurls:
            f.write(stream + "\n")
class XHSpider(Process):

    def __init__(self, url):
        # call the parent class's __init__
        super(XHSpider, self).__init__()
        self.url = url
        self.session = HTMLSession()
        self.headers = {
            'Host': 'news.daxues.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        self.path = "D:/Photo/"
        self.check_file_path(self.path)

    def check_file_path(self, path):
        '''Create the target directory if it does not exist.'''
        if not os.path.exists(path):
            os.makedirs(path)

    def run(self):
        self.parse_page()

    def send_request(self, url):
        '''Send a request, retrying up to 3 times on errors.'''
        i = 0
        while i < 3:
            try:
                print('Requesting url:', url)
                # the page is utf-8 encoded
                return self.session.get(url, headers=self.headers).html
            except Exception as e:
                print('send_request error:', str(e))
                i += 1

    def parse_page(self):
        '''Parse the page source and extract images with requests-html.'''
        html = self.send_request(self.url)
        imgs = html.find('dl a.p img')
        for img in imgs:
            href = img.attrs['src']
            alt = img.attrs['alt']
            self.save_image('http://news.daxues.cn' + href, alt)

    def save_image(self, url, name):
        '''Save an image to disk.'''
        content = self.session.get(url, headers=self.headers).content
        with open(self.path + name + '.jpg', 'wb') as f:
            f.write(content)

    def parse(self, url):
        self.url = url
        self.parse_page()
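# Hypothetical launch of the spider process; the listing URL is a placeholder
# path on the site hard-coded in the headers above.
if __name__ == '__main__':
    spider = XHSpider('http://news.daxues.cn/xiaohua/')
    spider.start()
    spider.join()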
def __init__(self, **kw):
    self.session = HTMLSession()
def retrieve(self, job_state):
    from requests_html import HTMLSession
    session = HTMLSession()
    response = session.get(self.navigate)
    return response.html.html
from requests_html import HTMLSession
from bs4 import BeautifulSoup

session = HTMLSession()
resp = session.get("https://www.desales.edu/directory")
resp.html.render()  # forces js to run

soup = BeautifulSoup(resp.html.html, "lxml")
emails = []
for td in soup.find_all('td'):
    if "@desales.edu" in td.text:
        emails.append(td.text)
print(emails)
import random
from time import sleep
from pprint import pprint

from requests_html import HTMLSession

keywords = ('buy essays online', 'buy essay', 'write my essay', 'write history essay')
session = HTMLSession()
SERP = {}

for key in keywords:
    print(f'Send request to Google: [{key}]')
    resp = session.get(f'https://www.google.com/search?q={key}&num=100&hl=en')
    links = resp.html.xpath('//div[@class="r"]/a[1]/@href')
    SERP[key] = [x.split('/')[2] for x in links if 'http' in x]
    sleep_seconds = random.randint(1, 10)
    print(f'Sleep: {sleep_seconds}')
    sleep(sleep_seconds)

pprint(SERP)

# Write your code here...
domains = [set(l) for l in SERP.values()]
all_domains = sorted(set.union(*domains))
int_domains = sorted(set.intersection(*domains))
print(f'Unique and sorted domains:\n {all_domains}')
from requests_html import HTMLSession
import pandas as pd

lista = []
# url = 'https://www.coppel.com/refrigeradores-y-congeladores'
s = HTMLSession()


def request(url):
    res = s.get(url)
    res.html.render(sleep=1)
    print(res.status_code)
    return res.html.xpath(
        '//*[@id="searchBasedNavigation_widget_6_2303"]/div[1]/div[2]',
        first=True)


def parse(products):
    for product in products.absolute_links:
        try:
            res = s.get(product)
            name = res.html.find('div.top.namePartPriceContainer.clearfix',
                                 first=True).text
        except:
            print("Product not found!!")
        if res.html.find('div.p_oferta'):
            price = res.html.find('div.tam_normal', first=True).text.replace(
                "de contado", "").replace(' ', '')
            oferta = "Product on sale"
import csv
from requests_html import HTMLSession

session = HTMLSession()
URL = "http://vbpl.vn/TW/Pages/vanban.aspx?fromyear=01/01/2011&toyear=31/12/2020&dvid=13&Page="
MAX_PAGES = 875

with open("list.csv", mode="w", encoding="utf-8-sig", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(
        ["sokyhieu", "trichyeu", "ngaybanhanh", "ngayhieuluc", "word"])
    for page in range(1, MAX_PAGES + 1):
        try:
            r = session.get(f"{URL}{page}")
        except Exception as e:
            r = None
        if r:
            items = r.html.find(".item")
            for item in items:
                try:
                    title = item.find(".title", first=True).text
                    description = item.find(".des", first=True).text
                    right_column = item.find(".green")
                    publish_date = right_column[0].text[-10:]
                    valid_date = right_column[1].text[-10:]
                    doc_file = item.find(".fileAttack a", first=True)
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import pandas as pd
from requests_html import HTMLSession
import time
from openpyxl import load_workbook

session = HTMLSession()

lst2 = []
url2 = 'http://www.wsj.com/mdc/public/page/2_3062-shtnyse_0_9-listing.html'
r2 = session.get(url2)

hd = r2.html.find('.colhead')
for x in range(len(hd)):
    if '\n' in hd[x].text:
        s = hd[x].text.split("\n")
        lst2.append(' '.join(s))
    else:
        lst2.append(hd[x].text)
time.sleep(.5)

headers = lst2
exchange = ['shtnyse_', 'shtnasdaq_', 'shtamex_']
sheet = ['NYSE', 'NASDAQ', 'AMEX']
#!/usr/bin/env python
# coding=utf-8
# author: zengyuetian
# fetch district information from the Lianjia site

from requests_html import HTMLSession

if __name__ == '__main__':
    session = HTMLSession()
    # fetch the Shanghai Lianjia residential-compound (xiaoqu) listing page
    r = session.get('https://sh.lianjia.com/xiaoqu/')
    # get the list of Shanghai districts
    elements = r.html.xpath('/html/body/div[3]/div[1]/dl[2]/dd/div/div/a')

    # lists of district names in English and Chinese
    en_names = list()
    ch_names = list()

    # each element's html looks like:
    # <a href="/xiaoqu/pudong/" title="上海浦东小区二手房 ">浦东</a>
    for element in elements:
        for link in element.absolute_links:  # iterate over the link set
            en_names.append(link.split('/')[-2])
            ch_names.append(element.text)

    # print the English and Chinese district names
    for index, name in enumerate(en_names):
        print(name, ch_names[index])
    """
    pudong 浦东
    minhang 闵行
    """