def get_player_stats():
    player_stats = []
    composition_stats = []
    player_hero_stats = []
    session = HTMLSession()
    res = session.get(player_stats_url())
    map_divs = res.html.find('.map-wrapper')
    table_divs = res.html.find('.side-by-side-stats')
    category = 'Allmaps'
    for div in res.html.find('.match-div > div'):
        if 'map-wrapper' in div.attrs.get('class', []):
            map_name = div.find(
                '.label-info', first=True).text.lower().replace(' ', '_')
        elif 'side-by-side-stats' in div.attrs.get('class', []):
            composition_stat, hero_stat = parse_overall_hero_stat_div(
                div, category=category, map_name=map_name)
            composition_stats += composition_stat
            player_hero_stats += hero_stat
            player_stats += parse_overall_stat_div(
                div, category=category, map_name=map_name)
        else:
            category = div.text
    write_json('stats/composition_stats.json', composition_stats)
    write_json('stats/player_hero_stats.json', player_hero_stats)
    write_json('stats/player_stats.json', player_stats)
def get_media(user):
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    headers = {
        'User-Agent': 'My User Agent 1.0',
    }
    proxies = {
        'http': 'http://1.20.102.177:30106',
        'https': 'https://1.20.102.177:30106',
    }
    url = 'https://www.instagram.com/' + user
    session = HTMLSession()
    req = session.get(url, headers=headers, proxies=proxies)
    media = []
    scripts = req.html.xpath('//script[@type]')
    for s in scripts:
        content = s.text
        if "csrf_token" in content:
            content = content[:-1].split("window._sharedData = ")[1]
            data = json.loads(content)
            recent_media = data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["edge_owner_to_timeline_media"]["edges"]
            for r in recent_media:
                media.append({
                    "username": data["entry_data"]["ProfilePage"][0]["graphql"]["user"]["username"],
                    "image": r["node"]["thumbnail_src"],
                    "timestamp": r["node"]["taken_at_timestamp"],
                    'permalink': r["node"]["display_url"],
                    'caption': r["node"]["edge_media_to_caption"]["edges"][0]["node"]["text"],
                    'shortcode': r["node"]["shortcode"]
                })
    return media
def main(dir: str):
    files = [x for x in os.listdir(dir) if x.lower()[-4:] == "json"]
    session = HTMLSession()
    os.makedirs(f"{dir}_solutions", exist_ok=True)
    for crossword_file in files:
        crossword = json.load(open(os.path.join(dir, crossword_file)))
        timestamp = crossword["dateSolutionAvailable"]
        year, month, day = parse_timestamp(timestamp)
        number = crossword["number"]
        crossword_type = crossword["crosswordType"]
        url = f"https://www.theguardian.com/crosswords/{year}/{month}/{day}/annotated-solutions-for-{crossword_type}-{number}"
        print(crossword["solutionAvailable"], url)
        result = session.get(url)
        if result.status_code >= 300:
            continue
        html = result.html
        relevant_divs = html.find("div.content__main-column.content__main-column--article.js-content-main-column")
        if len(relevant_divs) != 1:
            print(relevant_divs)
        solutions = [x.text for x in relevant_divs[0].find("p") if x.text]
        parsed = parse_solutions(solutions)
        save_name = os.path.join("crosswords/prize_solutions", f"{number}_solution.json")
        with open(save_name, "w+") as file:
            json.dump(parsed, file, indent=4)
def show_datetime_for(name, url):
    session = HTMLSession()
    r = session.get(url)
    # '03:37:58'
    time = r.html.find('#ct', first=True).text
    # 'PDT'
    timezone = r.html.find('#cta', first=True).text
    # 'Saturday, 16 June 2018'
    date = r.html.find('#ctdat', first=True).text
    print(f'{name:12}: {time} {date} {timezone}')
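# Usage sketch (not in the original): the function assumes a world-clock page that
# exposes the #ct, #cta and #ctdat elements used above; the URL below is purely
# illustrative.
show_datetime_for('Vancouver', 'https://www.timeanddate.com/worldclock/canada/vancouver')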
class MensaBase(object):

    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language specific endpoints
        # { Language : url-string }
        self.endpoints = endpoints
        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
        self.session = HTMLSession()
        self.session.mount('https://', adapter)

    def retrieve(self, datum=None, language=None, meals=None, emojize=None) -> Plan:
        # overwrite this
        # TODO how to make design more pythonic?
        # In Java terms: abstract class -> two implementation classes
        pass

    # Helper method to make a language-specific request
    def do_request(self, language=Language.DE):
        resp = self.session.get(self.endpoints[language.name])
        code = resp.status_code
        if code != 200:
            logger.warning(f'Non-200 status: {code}')
        logger.debug(f'Status Code: {code}')
        return resp.html

    @staticmethod
    def _normalize_key(k: str) -> str:
        return None if not k else k.strip().lower().replace(' ', '_')

    @staticmethod
    def _strip_additives(text: str) -> str:
        return re.sub(r'\((\s*(\d+)?[a-z]?[,.]?\s*)+\)', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        return re.sub(r'\s{2,}', ' ', text)

    @staticmethod
    def _normalize_orthography(text: str) -> str:
        return re.sub(r'\s,', ',', text)

    @staticmethod
    def _clean_text(text: str) -> str:
        return MensaBase._normalize_orthography(
            MensaBase._normalize_whitespace(
                MensaBase._strip_additives(text.strip())))

    @staticmethod
    def _text_replace(text: str) -> str:
        return re.sub('Züricher', "Zürcher", text)
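# Minimal sketch (not part of the original) of how a concrete implementation could
# override MensaBase.retrieve(). The '.meal' selector and the Plan(meals=...) call
# are assumptions for illustration, not the real markup or API.
class ExampleMensa(MensaBase):

    def retrieve(self, datum=None, language=Language.DE, meals=None, emojize=None) -> Plan:
        html = self.do_request(language=language)
        names = [MensaBase._clean_text(el.text) for el in html.find('.meal')]
        return Plan(meals=names)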
def get_teams_and_matches():
    session = HTMLSession()
    res = session.get(owl_index_url())
    res.html.render(timeout=60)
    match_rows = res.html.find(
        '.tab-pane#past')[0].find('table')[0].find('.past-matches-row')
    updated = True
    # TODO get match data in future
    # for row in match_rows:
    #     if parse_match_row(row):
    #         updated = True
    if updated:
        teams = {td.text: td.absolute_links.pop()
                 for td in res.html.find('td.team')}
        write_json('stats/team_hero_stats.json',
                   [parse_team(team_name, team_url)
                    for team_name, team_url in teams.items()])
def handle(cls, *args, **kwargs):
    session = HTMLSession()
    for operator in Operator.objects.filter(
            service__current=True, twitter='').exclude(url='').distinct():
        try:
            r = session.get(operator.url, timeout=10)
        except RequestException:
            operator.url = ''
            operator.save()
            continue
        for link in r.html.links:
            twitter = cls.get_from_link(link)
            if twitter:
                operator.twitter = twitter
                operator.save()
                break
def fetch_hpps_streamflow(dst_dir, url=None):
    """
    Fetch streamflow data from chmi fetch_hpps_data
    """
    session = HTMLSession()
    n_charts = 0
    datatype_prefix = 'streamflow'
    agency = 'chmi'
    pagesize = 50
    n_pages = 20
    for page in range(0, n_pages):
        subpage_url = "http://hydro.chmi.cz/hpps/hpps_oplist.php?startpos={0}&recnum={1}".format(page*pagesize, pagesize)
        print("----------------------------------------------------")
        print(subpage_url)
        print("----------------------------------------------------")
        session = HTMLSession()
        r = session.get(subpage_url)
        for lnk in r.html.absolute_links:
            if 'prfdyn' in lnk:
                print(lnk)
                station_seq = lnk.split('=')[-1]
                print(station_seq)
                data_dir = dst_dir / datatype_prefix / agency / station_seq
                if not os.path.exists(data_dir):
                    os.makedirs(data_dir)
                utc_timestamp_text = datetime.utcnow().strftime('%Y-%m-%dT%H0000z.html')
                html_filename = "prfdata_" + station_seq + "_" + utc_timestamp_text
                html_path = data_dir / html_filename
                # save the HTML with seven-day table
                lnk_table = lnk.replace('prfdyn', 'prfdata')
                print(lnk_table)
                html_response = get(lnk_table)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
class MM(object):

    def __init__(self):
        self.__page = 1
        self.__url = "http://www.mm131.com/qingchun/list_1_{}.html"
        self.__session = HTMLSession()
        self.__headers = {
            'Referer': 'http://www.mm131.com/qingchun/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36'
        }
        self.__imagePath = r'D:/Photo/MM'
        self.__confirmPath()

    def __confirmPath(self):
        if not os.path.exists(self.__imagePath):
            os.makedirs(self.__imagePath)

    def download(self, link, fileName):
        try:
            with open(self.__imagePath + '/' + fileName + '.jpg', 'wb') as f:
                f.write(self.__session.request('get', link, headers=self.__headers, allow_redirects=False).content)
        except Exception as e:
            print(str(e))

    def parseData(self):
        start = time.time()
        while self.__page < 12:
            if self.__page == 1:
                self.__url = "http://www.mm131.com/qingchun/"
            else:
                self.__url = 'http://www.mm131.com/qingchun/list_1_{}.html'.format(self.__page)
            r = self.__session.get(self.__url)
            main = r.html.find(".main", first=True)
            dl = main.find('dl')[0]
            dds = dl.find('dd')
            for dd in dds[:-1]:
                attr = dd.find('img')[0].attrs
                imageLink = attr['src']
                title = attr['alt']
                self.download(imageLink, title)
            self.__page += 1
        end = time.time() - start
        print("Crawl time:", end)  # elapsed seconds (message translated from Chinese)
def parse_match_row(row):
    match_path = os.path.join(
        'stats', 'matches', row.attrs['matchid'] + '.json')
    if os.path.exists(match_path):
        return False
    match = {}
    session = HTMLSession()
    match_res = session.get(match_url(row.attrs['matchid']))
    render_result = match_res.html.render(timeout=600)
    print(render_result)
    team_names = [{'name': team_name_div.text,
                   'id': team_name_div.links.pop().split('id=')[-1]}
                  for team_name_div in match_res.html.find(
                      '.names-and-score', first=True).find('div')[1::2]]
    maps = []
    for map_div in match_res.html.find('.map-wrapper'):
        map_data = {'name': map_div.find(
            '.mapname', first=True).text, 'teams': []}
        mapping = {'name': 3, 'score': 4, 'progress': 5, 'fights': 6, 'kills': 7}
        for i in range(1, 3):
            team_data = {}
            for key, index in mapping.items():
                team_data[key] = map_div.find('div')[index].text.split('\n')[i]
            map_data['teams'].append(team_data)
        maps.append(map_data)
    stat_divs = match_res.html.find('.side-by-side-stats')
    overall_stats = parse_stat_div(stat_divs.pop(0))
    for i, map_stat_div in enumerate(stat_divs):
        maps[i]['stats'] = parse_stat_div(map_stat_div)
    hero_stats = parse_hero_stat_div(match_res.html.find(
        '#allMapsAllRoundsAllTeams', first=True))
    hero_stats_by_team = []
    # TODO FIX the script problem
    # for team in team_names:
    #     hero_stats_by_team.append(parse_hero_stat_div(match_res.html.find(
    #         '#allMapsAllRoundsTeam' + team['id'], first=True)))
    write_json(match_path, {'maps': maps,
                            'stats': overall_stats,
                            'hero_stats': hero_stats,
                            'hero_stats_by_team': hero_stats_by_team,
                            'teams': team_names,
                            'date': row.find('td')[0].text})
    return True
def main(crossword_types: List[str]):
    session = HTMLSession()
    for crossword_type in crossword_types:
        if crossword_type not in CROSSWORDS.keys():
            raise ValueError(f"crossword type must be one of {CROSSWORDS.keys()}")
        start, end = CROSSWORDS[crossword_type]
        os.makedirs(f"crosswords/{crossword_type}", exist_ok=True)
        for crossword_no in reversed(range(start, end)):
            try:
                url = "https://www.theguardian.com/crosswords/" + crossword_type + "/" + str(crossword_no)
                result = session.get(url)
                if result.status_code >= 300:
                    continue
                html = result.html
                try:
                    relevant_divs = html.find("div.js-crossword")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]
                except:
                    relevant_divs = html.find("div.js-crossword has-grouped-clues")
                    if len(relevant_divs) != 1:
                        print(relevant_divs)
                    clues = relevant_divs[0].attrs["data-crossword-data"]
                clues_json = json.loads(clues)
                save_name = clues_json["id"] + ".json"
                with open(save_name, "w+") as file:
                    json.dump(clues_json, file, indent=4)
            except IndexError:
                print("couldn't find crossword no:{}".format(crossword_no))
                with open("crosswords/" + crossword_type + "/missing_ids.txt", "a+") as file:
                    file.write(str(crossword_no) + "\n")
def get_event_player_rank():
    session = HTMLSession()
    res = session.get(player_rank_url())
    table = res.html.find('table.ranking-table', first=True)
    player_ranks = []
    hero_ranks = []
    for tr in table.find('tr')[2:]:
        overall_rank = int(tr.find('td.rank', first=True).text)
        overall_rating = int(tr.find('.rating-number', first=True).text)
        team_name = tr.find('.small-team-logo', first=True).attrs['title'].split(': ')[-1]
        stars = int(tr.find(
            '.star-rating', first=True).attrs['class'][-1].replace('star', '').split('-')[0])
        info_div, heros_div = tr.find('.team-info-td > div')
        name = info_div.find('a', first=True).text
        time, sos_rank, win_percent = [div.text.split(
            ': ')[-1] for div in info_div.find('.secondary-stats')]
        rank_data = {'overall_rank': overall_rank,
                     'overall_rating': overall_rating,
                     'team_name': team_name,
                     'stars': stars,
                     'name': name,
                     'time': time,
                     'sos_rank': int(sos_rank),
                     'win_percent': win_percent,
                     'hero_ranks': []}
        for span in heros_div.find('span.secondary-ranking'):
            hero_name = span.attrs['title'].split(' Rank:')[0].lower()
            hero_rank_by_total, hero_rating, hero_time, hero_win_percent = [
                text.split(': ')[-1] for text in span.attrs['title'].split('\n')]
            hero_rank, total_count = hero_rank_by_total.split('/')
            hero_rank_data = {'hero_name': hero_name,
                              'hero_rank_by_total': hero_rank_by_total,
                              'hero_rating': int(hero_rating),
                              'hero_time': hero_time,
                              'hero_win_percent': hero_win_percent,
                              'hero_rank': int(hero_rank),
                              'total_count': int(total_count),
                              'name': name,
                              'overall_rank': overall_rank,
                              'overall_rating': overall_rating,
                              'team_name': team_name,
                              'stars': stars,
                              }
            hero_ranks.append(hero_rank_data)
            rank_data['hero_ranks'].append(hero_rank_data)
        player_ranks.append(rank_data)
    write_json('stats/player_ranks.json', player_ranks)
    write_json('stats/hero_ranks.json', hero_ranks)
import sys
import json

from requests_html import HTMLSession

GOOGLE = 'https://www.google.com/search?tbm=isch&q='

if len(sys.argv) == 2:
    q = str(sys.argv[1])
    session = HTMLSession()
    r = session.get(GOOGLE + q)
    r.html.render()
    colors = r.html.search_all("rgb({rgb})")
    palette = []
    for c in colors:
        palette.append(c['rgb'])
    wordPalette = {q: palette}
    print(json.dumps(wordPalette))
def fetch_vodagov_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from voda.gov.cz

    fetch_vodagov_charts(dst_dir='/home/jiri/meteodata',
                         agency='pod',
                         base_url='http://www.pvl.cz/portal/SaP/pc/?',
                         subpages=['oid=1', 'oid=2'],
                         datatype_prefix='streamflow')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                     http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """
    # if datatype_prefix == 'streamflow':
    #     pvl_base = 'http://sap.poh.cz/portal/SaP/pc/?'
    # else:
    #     pvl_base = 'http://sap.poh.cz/portal/Srazky/PC/?'
    session = HTMLSession()
    n_charts = 0
    for subpage in subpages:
        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)
        for lnk in r.html.absolute_links:
            # note: the original test `if 'Mereni.aspx?id=' or 'mereni.aspx?id=' in lnk`
            # was always true; check both spellings explicitly instead
            if 'Mereni.aspx?id=' in lnk or 'mereni.aspx?id=' in lnk:
                try:
                    r_st = session.get(lnk)
                    images = r_st.html.find('img')
                    for img in images:
                        if 'src' not in img.attrs:
                            continue
                        src = img.attrs['src']
                        if ('graf' in src or 'Graf' in src) and ('miniatury' not in src) and ("&" not in src) and (".ashx" not in src):
                            if 'maska' in src:
                                continue
                            img_src_absolute = urljoin(lnk, src)
                            img_response = get(img_src_absolute)
                            if img_response.status_code == 200:
                                img_dir = os.path.join(dst_dir, datatype_prefix, agency,
                                                       os.path.splitext(os.path.basename(img_src_absolute))[0])
                                if not os.path.exists(img_dir):
                                    os.makedirs(img_dir)
                                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.png')
                                img_filename = os.path.basename(img_src_absolute).replace('.png', utc_timestamp_text)
                                img_path = os.path.join(img_dir, img_filename)
                                print(img_path)
                                with open(img_path, 'wb') as f:
                                    f.write(img_response.content)
                                # also save the HTML
                                html_path = img_path.replace('.png', '.html')
                                html_response = get(lnk)
                                if html_response.status_code == 200:
                                    print(html_path)
                                    with open(html_path, 'wb') as f:
                                        f.write(html_response.content)
                                n_charts += 1
                except ValueError:
                    print('ERROR fetching ' + lnk)
    return n_charts
def start_request(self, url):
    session = HTMLSession()
    header = dict()
    header['user-agent'] = random.choice(self.USER_AGENT_LIST)
    response = session.get(url, headers=header)
    return response
def parse_team(team_name, team_url):
    session = HTMLSession()
    return {'id': team_url.split('id=')[-1],
            'name': team_name,
            'heros': [{'win_rate': div.text,
                       'heros': [span.attrs['title'].replace('soldier76', 'soldier-76')
                                 for span in div.find('span')]}
                      for div in session.get(team_url).html.find('.team-comp-wrapper > .team-comp')]}
from requests_html import HTMLSession

session = HTMLSession()
r = session.get("http://money.finance.sina.com.cn/corp/go.php/" +
                "vMS_MarketHistory/stockid/601006.phtml?year=2018&jidu=2")
table = r.html.xpath("//*[@id='FundHoldSharesTable']")[0]
trArray = table.xpath("//tr")
trArray = trArray[2:len(trArray)]
gpList = list()
for tr in trArray:
    gp = list()
    for td in tr.xpath("//td"):
        gp.append(td.text)
    gpList.append(gp)
for gp in gpList:
    print(gp)
# -*- coding:utf8 -*-
# @Time: 2021/10/21 10:15 AM
# @Author: Huang Jeff
from requests_html import HTMLSession

name = "猫"  # search keyword ("cat")
url = f"https://unsplash.com/s/photos/{name}"
session = HTMLSession()
result = session.get(url)
print(result.status_code)
print(
    result.html.xpath('//figure[@itemprop="image"]//a[@rel="nofollow"]/@href'))

# down_list = result.html.xpath('//figure[@itemprop="image"]//a[@rel="nofollow"]/@href')
#
# def get_picID_from_url():
#
# def down_one_pic(url):
#     result = session.get(url)
#     filename = get_picID_from_url(url)
#     with open(filename, "wb") as f:
#         f.write(result.content)
#
# for one_url in down_list:
#     down_one_pic(one_url)
def update_econproj(url, baseline, text_args): """ Function that will read new CBO economic projections and update CBO_baseline.csv accordingly Parameters ---------- url: URL linking to IRS website with projections of federal tax filings baseline: CBO baseline we're updaint text_args: Dictionary containing the arguments that will be passed to the documentation template Returns ------- baseline: Updated baseline numbers text_args: Updated dictionary with text aruments to fill in the template """ print("Updating CBO Economic Projections") # pull all of the latest CBO reports and use them for needed updates session = HTMLSession() r = session.get(url) divs = r.html.find("div.view.view-recurring-data") revprojections = divs[4] # both assertions are there to throw errors if the order of sections change # revenue projections used for capital gains projections assert "Revenue Projections" in revprojections.text latest_revprojections = revprojections.find("div.views-col.col-1")[0] rev_link = latest_revprojections.find("a")[0] _rev_report = datetime.strptime(rev_link.text, "%b %Y") rev_report = datetime.strftime(_rev_report, "%B %Y") rev_url = rev_link.attrs["href"] econprojections = divs[8] assert "10-Year Economic Projections" in econprojections.text latest_econprojections = econprojections.find("div.views-col.col-1")[0] econ_link = latest_econprojections.find("a")[0] _cbo_report = datetime.strptime(econ_link.text, "%b %Y") cbo_report = datetime.strftime(_cbo_report, "%B %Y") econ_url = econ_link.attrs["href"] if cbo_report == text_args["current_cbo"]: print("\tNo new data since last update") else: # read in economic projections econ_proj = pd.read_excel(econ_url, sheet_name="2. Calendar Year", skiprows=6, index_col=[0, 1, 2, 3]) # extract values for needed rows in the excel file # some variables have a missing value in the multi-index. Use iloc # to extract needed variables from them. 
gdp = econ_proj.loc["Output"].loc["Gross Domestic Product (GDP)"].iloc[ 0] income = econ_proj.loc["Income"] tpy = income.loc["Income, Personal"].iloc[0] wages = income.loc["Wages and Salaries"].iloc[0] billions = "Billions of dollars" var = "Proprietors' income, nonfarm, with IVA & CCAdj" schc = income.loc["Nonwage Income"].loc[var].loc[billions] var = "Proprietors' income, farm, with IVA & CCAdj" schf = income.loc["Nonwage Income"].loc[var].loc[billions] var = "Interest income, personal" ints = income.loc["Nonwage Income"].loc[var].loc[billions] var = "Dividend income, personal" divs = income.loc["Nonwage Income"].loc[var].loc[billions] var = "Income, rental, with CCAdj" rents = income.loc["Nonwage Income"].loc[var].loc[billions] book = income.loc["Profits, Corporate, With IVA & CCAdj"].iloc[0] var = "Consumer Price Index, All Urban Consumers (CPI-U)" cpiu = econ_proj.loc["Prices"].loc[var].iloc[0] var_list = [gdp, tpy, wages, schc, schf, ints, divs, rents, book, cpiu] var_names = [ "GDP", "TPY", "Wages", "SCHC", "SCHF", "INTS", "DIVS", "RENTS", "BOOK", "CPIU", ] df = pd.DataFrame(var_list, index=var_names).round(1) df.columns = df.columns.astype(str) df_cols = set(df.columns) baseline_cols = set(baseline.columns) for col in df_cols - baseline_cols: baseline[col] = None baseline.update(df) text_args["previous_cbo"] = text_args["current_cbo"] text_args["current_cbo"] = cbo_report if rev_report == text_args["cgns_prev_report"]: print("\tNo new data since last update") return baseline, text_args elif rev_link.text == "Mar 2020": msg = ( "\nCapital gains realizations are not included in CBO's March 2020" " revenue projections publication. It's unclear if this is a " "permanent change or due to the pandemic. For now, we will skip " "this update and re-evaluate when they release their next " "projections.\n") print(msg) return baseline, text_args else: # Extract capital gains data cg_proj = pd.read_excel( rev_url, sheet_name="6. Capital Gains Realizations", skiprows=7, header=[0, 1], ) cg_proj.index = cg_proj[cg_proj.columns[0]] var = "Capital Gains Realizationsa" cgns = cg_proj[var]["Billions of Dollars"].loc[list(range(2017, 2032))] var_list = [cgns] var_names = ["CGNS"] df = pd.DataFrame(var_list, index=var_names).round(1) df.columns = df.columns.astype(str) # update baseline file with the new data # add a column for any years that are in the update but not yet in the # CBO baseline file before updating the values df_cols = set(df.columns) baseline_cols = set(baseline.columns) for col in df_cols - baseline_cols: baseline[col] = None baseline.update(df) text_args["cgns_prev_report"] = text_args["cgns_cur_report"] text_args["cgns_prev_url"] = text_args["cgns_cur_url"] text_args["cgns_cur_report"] = rev_report text_args["cgns_cur_url"] = rev_url return baseline, text_args
from requests_html import HTMLSession
import json
from pprint import pprint
import io

session = HTMLSession()
r = session.get('https://www.qiushibaike.com/text/')

# # page HTML
# print(r.html.html)
# # links on the page
# print(r.html.links)
# print(r.html.absolute_links)
# # text of the home-page menu
# print(r.html.find('div#menu', first=True).text)
# # first element of the home-page menu
# print(r.html.find('div#menu a', first=True))
# # joke contents
# print(list(map(lambda x: x.text, r.html.find('div.content span'))))

# print(r.html.xpath("//div[@id='menu']", first=True).text)
# print(r.html.xpath("//div[@id='menu']/a"))
# print(r.html.xpath("//div[@class='content']/span/text()"))

# get a single element
# e = r.html.find("div#hd_logo", first=True)
# print(e.text)
# print(e.attrs)
# print(e.absolute_links)
# print(e.links)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

__author__ = 'ipetrash'

# pip install requests-html
from requests_html import HTMLSession

session = HTMLSession()
rs = session.get('https://coronavirus-monitor.ru/statistika/')

with open('rs_before_js.html', 'w', encoding='utf-8') as f:
    f.write(rs.html.html)

rs.html.render()  # without this, the page's JS is never executed

with open('rs_after_js.html', 'w', encoding='utf-8') as f:
    f.write(rs.html.html)
#!/usr/bin/python
# -*- coding:utf-8 -*-
from requests_html import HTMLSession
import json

dict2json = {}
session = HTMLSession()
r = session.get('https://ckb.jax.org/gene/grid')
print(len(r.html.links))
gene_link_dic = {gene.text: tuple(gene.absolute_links)[0]
                 for gene in r.html.find('div.container-fluid div:nth-child(3) a')}
del gene_link_dic['']
del gene_link_dic['Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License']
for gene, link in gene_link_dic.items():
    r = session.get(link)
    if 'Additional content available in ' in r.html.html:
        pass
    else:
        selector2tr = 'div.container-fluid div:nth-child(4) div.col-lg-12 div.tab-content div#geneVariants tbody tr'
        gene_variants_tr_list = r.html.find(selector2tr)
        # gene_variants_tr_html = HTML(gene_variants_tr_list)
        dict2json[gene] = []
        for variant in gene_variants_tr_list:
            variant_info_list = variant.find('td')
            dict2json[gene].append({'Variant': variant_info_list[0].text,
                                    'Impact': variant_info_list[1].text,
                                    'Protein Effect': variant_info_list[2].text,
                                    'Variant Description': variant_info_list[3].text,
                                    'Associated with drug Resistance': variant_info_list[4].text})
with open('gene_data.json', 'w') as f:
    json.dump(dict2json, f)
class Spider(object): """ sina.com.cn Spider """ def __init__(self, db): self.url = args.url self.depth = args.depth self.keyword = args.keyword self.db = db self.analysed_url = {} self.url_visiting = {} self.url_keyword = {} self.lock = threading.Lock() self.rlock = threading.RLock() self._session = HTMLSession() self._thread_pool = ThreadPool(args.thread, fn=self.analyse) self._thread_pool.sumbit((args.url, 0)) logging.debug('start Spider url={} keyword={} depth={}'.format( self.url, self.keyword, self.keyword, self.depth)) def analyse(self, task): """ get html response :param task: (url, level) :return: """ url, level = task if not self.is_visited(url, level): logging.debug( 'get task {} which has been visited. Please check mutex use right or not' .format(task)) return try: response = self._session.get(url) logging.debug('GET {}'.format(url)) except Exception as e: logging.error("requests error %s" % e) else: content_type = self.content_type(response) if '/html' in content_type: has_key = self.has_keyword(response) if has_key: try: self.lock.acquire() self.url_keyword[url] = level logging.info('url {} with keyword={}'.format( url, self.keyword)) except Exception as e: logging.error(e) finally: self.lock.release() links = self.extract_link(response) self.submit_links2queue(links, level + 1) else: logging.warning('url {} Content-Type={} not supported'.format( url, content_type)) finally: self.add_url2analysed(url, level) def is_visited(self, url, level): # 判断urk是否抓取过 flag = False try: self.lock.acquire() if url not in self.url_visiting and url not in self.analysed_url: self.url_visiting[url] = level flag = True except Exception as e: pass finally: self.lock.release() return flag def content_type(self, r: HTMLResponse) -> str: # 网页类型 return r.headers.get('Content-Type', '') def has_keyword(self, r): # 是否含关键词 text = "" try: text = r.html.text except Exception as e: logging.error(e) if self.keyword in text: logging.debug('{} in url {}'.format(self.keyword, r.url)) return True else: logging.debug('{} not in url {}'.format(self.keyword, r.url)) return False def submit_links2queue(self, links, level): """ 向队列中添加新任务 :param links: 本次爬取页面中的所有链接 :param level: 本次页面所处深度 :return: """ if level > self.depth: logging.debug('links {} beyond max_depth={}'.format(links, level)) return for link in links: if link.startswith(self.url): # 只搜索同一个域下的内容 if link not in self.url_visiting and link not in self.analysed_url: # 提交到队列中 self._thread_pool.sumbit((link, level)) def add_url2analysed(self, url, level): """ 将 url 加入到已经分析的 url 字典中 :param url: :return: """ try: self.lock.acquire() # 先加到 analysed_url 再从 url_visiting 删除,防止重复爬取 self.analysed_url[url] = level del self.url_visiting[url] except Exception as e: logging.error(e) finally: self.lock.release() logging.info('finish url {} in level {}'.format(url, level)) def extract_link(self, r: HTMLResponse): """ 提取 html 中的链接,需要分析相对链接和绝对链接 :return: """ relative_links = r.html.links - r.html.absolute_links all_links = r.html.absolute_links | self.relative2absolute( relative_links, r.html.base_url) logging.info('links {} in url {}'.format(all_links, r.url)) return all_links def relative2absolute(self, relative_links, base_url): """ 将相对链接拼接成绝对链接 :param relative_links: :param base_url: :return: """ return {base_url + x for x in relative_links} def has_finished(self): """ 查询当前爬虫任务是否已经完成 :return: """ f = self._thread_pool.is_over() if not f: self.persist2db() return f return f def persist2db(self): """ 将 url 持久化到 sqlite db :return: """ with self.rlock: self.rlock.acquire() t = 
self.url_keyword self.url_keyword = {} for url in t: content = self._session.get(url).content.decode("utf8", "ignore") self.db.insert(self.keyword, url, content) def progress(self) -> tuple: """ 反馈进度消息 :return: """ return self._thread_pool.progress() def add_url_with_keyword(self, url, level): """ 发现页面包含关键字的 url :param url: :param level: :return: """ with self.rlock: self.url_keyword[url] = level logging.info('url {} with keyword={}'.format(url, self.keyword))
# (start of file truncated: the lines above close a banner print() with ANSI colour codes)
urllist = [
    'https://www.nicovideo.jp/tag/VOCAL_Character%E3%83%A9%E3%83%B3%E3%82%AD%E3%83%B3%E3%82%B0?sort=f&order=d'
]
# for i in range(1, 13):
#     urllist.append('https://www.nicovideo.jp/tag/週刊VOCALOIDとUTAUランキング?sort=f&order=d&page=' + str(i))

mylist = {}
with open('../json/episodelist.json') as json_file:
    mylist = json.load(json_file)

for url in urllist:
    print(url)
    r = session.get(url)
    # seltit = 'body > div.BaseLayout > div.container.columns.column700-300 > div > div.column.main > div.contentBody.video.uad.videoList.videoList01 > ul:nth-child(2) > li > div.itemContent > p > a'
    # seldes = 'body > div.BaseLayout > div.container.columns.column700-300 > div > div.column.main > div.contentBody.video.uad.videoList.videoList01 > ul:nth-child(2) > li > div.itemContent > div.wrap > p.itemDescription'
    sel = 'body > div.BaseLayout > div.container.columns.column700-300 > div > div.column.main > div.contentBody.video.uad.videoList.videoList01 > ul:nth-child(2) > li > div.itemContent'
    vresults = r.html.find(sel)
    for vresult in vresults:
        if len(vresult.find('a')) <= 0 or len(
                vresult.find('p.itemDescription')) <= 0:
            continue
        vtit = list(vresult.find('a'))[0]
        vdes = list(vresult.find('p.itemDescription'))[0]
        if len(vtit.absolute_links) <= 0:
            continue
def scrap(given_name: str, given_url, given_model_no=None):
    """
    :param given_model_no:
    :param given_name:
    :param given_url:
    :return: List of Scraped data, Data error count and Keyword
    """
    if given_model_no is not None:
        links = get_links(given_name, given_url, given_model_no)
    else:
        links = get_links(given_name, given_url)
    if len(links) < 1:
        return []
    data_list = []
    number = 1
    for link in links:
        print(f'Getting data from link {number} of {len(links)}...')
        url = link.find('.name.fn.l_mgn-tb-sm.l_dsp-blc')[0].attrs['href']
        session = HTMLSession()
        r = session.get(url)
        number += 1
        try:
            t1 = datetime.now()
            try:
                title = clean_text(r.html.find('.product-name')[0].text)
                sku = r.html.find('.product-id.meta.quiet.p_txt-sm')[-1].text
            except IndexError:
                continue
            except Exception as e:
                n = e
                continue
            try:
                prd_price = clean_price(
                    r.html.find('.price-device>script')[0].text)
            except Exception as e:
                n = e
                # print(f'\n{e} price\n{title}\n\n')
                prd_price = '0'
            try:
                merchant = clean_text(r.html.find('#sellerProfile')[0].text)
            except Exception as e:
                n = e
                merchant = 'NA'
            timestamp = datetime.now()
            main = {
                'name': title,
                'price': prd_price,
                'timestamp': timestamp,
                'merchant': merchant,
                'time': (datetime.now() - t1).total_seconds(),
                'url': url,
                'sku': sku,
            }
            data_list.append(main)
        except AttributeError:
            pass
    return data_list
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from requests_html import HTMLSession
from urllib.request import urlopen as uReq

session = HTMLSession()
r = session.get("https://www.wma.net/publications/wma-annual-report/")
html = r.html.html
pageSoup = soup(html, "html.parser")
pdf_soup = pageSoup.find_all("a", {"target": "_blank"})
for i in range(len(pdf_soup)):
    print(
        "-----------------------------------------------------------------------------------------------------------"
    )
    print(pdf_soup[i].get('href'))
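# Follow-up sketch (not part of the original script): download each report linked
# above. Assumes the hrefs are absolute URLs pointing at PDF files; the filename is
# derived from the URL path purely for illustration.
import os
import requests

for a in pdf_soup:
    href = a.get('href')
    if href and href.lower().endswith('.pdf'):
        response = requests.get(href)
        if response.status_code == 200:
            with open(os.path.basename(href), 'wb') as f:
                f.write(response.content)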
def runScraper(): storeCities() conn = connect() #the cities table contains around 480 cities, all of the craigslist pages in north america curs = conn.cursor() curs.execute("SELECT * FROM cities") citiesList = [] for city in curs.fetchall(): citiesList.append(city) curs.execute( '''CREATE TABLE IF NOT EXISTS vehicles(id BIGINT PRIMARY KEY, url TEXT, region TEXT, region_url TEXT, price BIGINT, year BIGINT, manufacturer TEXT, model TEXT, condition TEXT, cylinders TEXT, fuel TEXT, odometer BIGINT, title_status TEXT, transmission TEXT, VIN TEXT, drive TEXT, size TEXT, type TEXT, paint_color TEXT, image_url TEXT, description TEXT, county TEXT, state TEXT, lat REAL, long REAL)''') session = HTMLSession() #scraped counts all entries gathered scraped = 0 #carBrands dictate what qualifies as a brand so we can snatch that data from the 'model' tag carBrands = [ "ford", "toyota", "chevrolet", "chev", "chevy", "honda", "jeep", "hyundai", "subaru", "kia", "gmc", "ram", "dodge", "mercedes-benz", "mercedes", "mercedesbenz", "volkswagen", "vw", "bmw", "saturn", "land rover", "landrover", "pontiac", "mitsubishi", "lincoln", "volvo", "mercury", "harley-davidson", "harley", "rover", "buick", "cadillac", "infiniti", "infinity", "audi", "mazda", "chrysler", "acura", "lexus", "nissan", "datsun", "jaguar", "alfa", "alfa-romeo", "aston", "aston-martin", "ferrari", "fiat", "hennessey", "porche", "noble", "morgan", "mini", "tesla" ] #if the car year is beyond next year, we toss it out. this variable is used later nextYear = datetime.now().year + 1 #simple txt file mechanism to track scraping progress fileName = os.path.dirname( os.path.abspath(__file__)) + "/static/trackVehicleScraping.txt" exists = os.path.isfile(fileName) if not exists: tracker = open(fileName, "w") tracker.write("0") tracker.close() with open(fileName, "r") as tracker: cities = int(tracker.readlines()[0]) citiesCount = len(citiesList) citiesList = citiesList[cities:] for city in citiesList: scrapedInCity = 0 cities += 1 print( f"Scraping vehicles from {city[2]}, {citiesCount - cities} cities remain" ) empty = False #scrapedIds is used to store each individual vehicle id from a city, therefore we can delete vehicle records from the database #if their id is no longer in scrapedIds under the assumption that the entry has been removed from craigslist scrapedIds = set([]) #track items skipped that are already in the database skipped = 0 #this loop executes until we are out of search results, craigslist sets this limit at 3000 and cities often contain the full 3000 records (but not always) while not empty: print( f"Gathering entries {scrapedInCity} through {scrapedInCity + 120}" ) #now we scrape try: searchUrl = f"{city[1]}/d/cars-trucks/search/cta?s={scrapedInCity}" page = session.get(searchUrl) except Exception as e: #catch any excpetion and continue the loop if we cannot access a site for whatever reason print( f"Failed to reach {searchUrl}, entries have been dropped: {e}" ) scrapedInCity += 120 continue #each search page contains 120 entries scrapedInCity += 120 tree = html.fromstring(page.content) #the following line returns a list of urls for different vehicles vehicles = tree.xpath('//a[@class="result-image gallery"]') if len(vehicles) == 0: #if we no longer have entries, continue to the next city empty = True continue vehiclesList = [] for item in vehicles: vehicleDetails = [] vehicleDetails.append(item.attrib["href"]) try: #this code attempts to grab the price of the vehicle. 
some vehicles dont have prices (which throws an exception) #and we dont want those which is why we toss them vehicleDetails.append(item[0].text) except: continue vehiclesList.append(vehicleDetails) #loop through each vehicle for item in vehiclesList: url = item[0] try: idpk = int(url.split("/")[-1].strip(".html")) except ValueError as e: print("{} does not have a valid id: {}".format(url, e)) #add the id to scrapedIds for database cleaning purposes scrapedIds.add(idpk) #vehicle id is a primary key in this database so we cant have repeats. if a record with the same url is found, we continue #the loop as the vehicle has already been stored curs.execute(f"SELECT 1 FROM vehicles WHERE id = {idpk}") if len(curs.fetchall()) != 0: skipped += 1 continue vehicleDict = {} vehicleDict["price"] = int(item[1].strip("$")) try: #grab each individual vehicle page page = session.get(url) tree = html.fromstring(page.content) except: print(f"Failed to reach {url}, entry has been dropped") continue attrs = tree.xpath('//span//b') #this fetches a list of attributes about a given vehicle. each vehicle does not have every specific attribute listed on craigslist #so this code gets a little messy as we need to handle errors if a car does not have the attribute we're looking for for item in attrs: try: #model is the only attribute without a specific tag on craigslist, so if this code fails it means that we've grabbed the model of the vehicle k = item.getparent().text.strip() k = k.strip(":") except: k = "model" try: #this code fails if item=None so we have to handle it appropriately vehicleDict[k] = item.text.strip() except: continue #we will assume that each of these variables are None until we hear otherwise #that way, try/except clauses can simply pass and leave these values as None price = None year = None manufacturer = None model = None condition = None cylinders = None fuel = None odometer = None title_status = None transmission = None VIN = None drive = None size = None vehicle_type = None paint_color = None image_url = None lat = None long = None description = None #now this code gets redundant. if we picked up a specific attr in the vehicleDict then we can change the variable from None. 
#integer attributes (price/odometer) are handled in case the int() is unsuccessful, but i have never seen that be the case if "price" in vehicleDict: try: price = int(vehicleDict["price"]) except Exception as e: print(f"Could not parse price: {e}") if "odomoter" in vehicleDict: try: odometer = int(vehicleDict["odometer"]) except Exception as e: print(f"Could not parse odometer: {e}") if "condition" in vehicleDict: condition = vehicleDict["condition"] if "model" in vehicleDict: #model actually contains 3 variables that we'd like: year, manufacturer, and model (which we call model) try: year = int(vehicleDict["model"][:4]) if year > nextYear: year = None except: year = None model = vehicleDict["model"][5:] foundManufacturer = False #we parse through each word in the description and search for a match with carBrands (at the top of the program) #if a match is found then we have our manufacturer, otherwise we set model to the entire string and leave manu blank for word in model.split(): if word.lower() in carBrands: foundManufacturer = True model = "" #resolve conflicting manufacturer titles manufacturer = word.lower() if manufacturer == "chev" or manufacturer == "chevy": manufacturer = "chevrolet" if manufacturer == "mercedes" or manufacturer == "mercedesbenz": manufacturer = "mercedes-benz" if manufacturer == "vw": manufacturer = "volkswagen" if manufacturer == "landrover": manufacturer = "land rover" if manufacturer == "harley": manufacturer = "harley-davidson" if manufacturer == "infinity": manufacturer = "infiniti" if manufacturer == "alfa": manufacturer = "alfa-romeo" if manufacturer == "aston": manufacturer = "aston-martin" continue if foundManufacturer: model = model + word.lower() + " " model = model.strip() if "cylinders" in vehicleDict: cylinders = vehicleDict["cylinders"] if "fuel" in vehicleDict: fuel = vehicleDict["fuel"] if "odometer" in vehicleDict: odometer = vehicleDict["odometer"] if "title status" in vehicleDict: title_status = vehicleDict["title status"] if "transmission" in vehicleDict: transmission = vehicleDict["transmission"] if "VIN" in vehicleDict: VIN = vehicleDict["VIN"] if "drive" in vehicleDict: drive = vehicleDict["drive"] if "size" in vehicleDict: size = vehicleDict["size"] if "type" in vehicleDict: vehicle_type = vehicleDict["type"] if "paint color" in vehicleDict: paint_color = vehicleDict["paint color"] #now lets fetch the image url if exists try: img = tree.xpath( '//div[@class="slide first visible"]//img') image_url = img[0].attrib["src"] except: pass #try to fetch lat/long and city/state, remain as None if they do not exist try: location = tree.xpath("//div[@id='map']") lat = float(location[0].attrib["data-latitude"]) long = float(location[0].attrib["data-longitude"]) except Exception as e: pass #try to fetch a vehicle description, remain as None if it does not exist try: location = tree.xpath("//section[@id='postingbody']") description = location[0].text_content().replace( "\n", " ").replace("QR Code Link to This Post", "").strip() except: pass #finally we get to insert the entry into the database curs.execute( '''INSERT INTO vehicles(id, url, region, region_url, price, year, manufacturer, model, condition, cylinders, fuel,odometer, title_status, transmission, VIN, drive, size, type, paint_color, image_url, description, lat, long, state) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)''', (idpk, url, city[2], city[1], price, year, manufacturer, model, condition, cylinders, fuel, odometer, title_status, transmission, VIN, 
drive, size, vehicle_type, paint_color, image_url, description, lat, long, city[3])) scraped += 1 #these lines will execute every time we grab a new page (after 120 entries) print("{} vehicles scraped".format(scraped)) #now to clean the database we grab all urls from the city that are already logged curs.execute("SELECT id FROM vehicles WHERE region_url = '{}'".format( city[1])) deleted = 0 #if a given id is not in scrapedIds (the ids that we just scraped) then the entry no longer exists and we remove it for oldId in curs.fetchall(): if int(oldId[0]) not in scrapedIds: curs.execute("DELETE FROM vehicles WHERE id = '{}'".format( oldId[0])) deleted += 1 print( "Deleted {} old records, {} records skipped as they are already stored" .format(deleted, skipped)) conn.commit() #update progress file with open(fileName, "w") as tracker: tracker.write(str(cities)) #delete tracker file os.remove(fileName) count = curs.execute("SELECT Count(*) FROM vehicles") print("Table vehicles successfully updated, {} entries exist".format(\ curs.fetchall()[0][0])) conn.close()
def update_socsec(url, baseline, text_args):
    """
    Function that will read the table with OASI Social Security Projections

    Parameters
    ----------
    url: URL linking to IRS website with projections of federal tax filings
    baseline: CBO baseline we're updating
    text_args: Dictionary containing the arguments that will be passed to
        the documentation template

    Returns
    -------
    baseline: Updated baseline numbers
    text_args: Updated dictionary with text arguments to fill in the template
    """
    print("Updating Social Security Projections")
    session = HTMLSession()
    r = session.get(url)
    # we can determine the latest year by looking at all of the years available
    # in the first drop down and adding one.
    selector = r.html.find("select#yh1")[0]
    latest_yr = max([int(yr) for yr in selector.text.split()]) + 1
    report = f"{latest_yr} Report"
    if report == text_args["socsec_cur_report"]:
        print("\tNo new data since last update")
        return baseline, text_args
    socsec_url = f"https://www.ssa.gov/oact/TR/{latest_yr}/VI_C_SRfyproj.html"
    match_txt = "Operations of the OASI Trust Fund, Fiscal Years"
    html = pd.read_html(socsec_url, match=match_txt)[0]
    # merge the columns with years and data into one
    sub_data = pd.concat(
        [
            html["Fiscal year", "Fiscal year.1"],
            html["Cost", "Sched-uled benefits"]
        ],
        axis=1,
    )
    sub_data.columns = ["year", "cost"]
    # further slim down data so that we have the intermediate costs only
    start = sub_data.index[sub_data["year"] == "Intermediate:"][0]
    end = sub_data.index[sub_data["year"] == "Low-cost:"][0]
    cost_data = sub_data.iloc[start + 1:end].dropna()
    cost_data["cost"] = cost_data["cost"].astype(float)
    # rate we'll use to extrapolate costs to final year we'll need
    pct_change = cost_data["cost"].pct_change() + 1
    cost_data.set_index("year", inplace=True)
    cost_data = cost_data.transpose()
    cost_data.index = ["SOCSEC"]
    # create values for years not included in the report
    factor = pct_change.iloc[-1]
    last_year = int(max(cost_data.columns))
    cbo_last_year = int(max(baseline.columns))
    for year in range(last_year + 1, cbo_last_year + 1):
        cost_data[str(year)] = cost_data[str(year - 1)] * factor
    cost_data = cost_data.round(1)
    # finally update CBO projections
    baseline.update(cost_data)
    text_args["socsec_prev_report"] = text_args["socsec_cur_report"]
    text_args["socsec_prev_url"] = text_args["socsec_cur_url"]
    text_args["socsec_cur_report"] = report
    text_args["socsec_cur_url"] = socsec_url
    return baseline, text_args
# -*- coding: gbk -*-
# @Date   : '2018/3/27 0027'
# @Author : Terry feng ([email protected])

from requests_html import HTMLSession

client = HTMLSession()
r = client.get("https://www.qiushibaike.com/text/")
a = r.html.find(".content")
for i in a:
    print(i.text)
    print("--------------------------")
class LostFilmParser:
    source_url = 'https://www.lostfilm.tv/'
    tv_shows_list_part_url = 'https://www.lostfilm.tv/ajaxik.php'
    part_step = 10

    def __init__(self):
        self.session = HTMLSession()
        self.news_data = self.session.get(self.source_url)

    def get_links(self):
        return self.news_data.html.links

    def get_title_en(self, href):
        try:
            result = search(r'/series/([^/]+)/', href)
            title_en = result.group(1)
            tv_show_link = self.source_url.rstrip('/') + result.group()
        except AttributeError:
            title_en = None
            tv_show_link = None
        return title_en, tv_show_link

    def get_new_shows_episodes(self):
        clear_data = []
        news_block = self.news_data.html.find('.new-movies-block', first=True)
        movies = news_block.find('a.new-movie')
        for movie in movies:
            title_en, show_link = self.get_title_en(movie.attrs['href'])
            clear_data.append(
                {
                    'title_ru': movie.attrs['title'],
                    'title_en': title_en,
                    'jpg': 'http:' + movie.find('img', first=True).attrs['src'],
                    'season': movie.find('.title', first=True).text,
                    'date': movie.find('.date', first=True).text,
                    'episode_link': self.source_url.rstrip('/') + movie.attrs['href'],
                    'tv_show_link': show_link,
                }
            )
        return clear_data

    def load_part_list(self, step):
        url = self.source_url + 'ajaxik.php'
        request_data = self.session.post(
            url=url,
            data={'act': 'serial', 'o': step, 's': 3, 't': 0, 'type': 'search'}
        )
        return json.loads(request_data.content)['data']

    def get_tv_shows_list(self):
        """Page through the list 10 -> 20 -> 30 -> ... until an empty list is returned."""
        step = 0
        shows_list = []
        request_result = self.load_part_list(step)
        while request_result:
            for result in request_result:
                shows_list.append(result)
            step += self.part_step
            sleep(1)
            request_result = self.load_part_list(step)
        return shows_list
# input & read
logger.debug("# Read the CIK and acc_no from console")
cik = input("Please input the CIK:")
acc_no_test = input("Please input the document accession number:")
print('The CIK and Acc_no you entered is:', cik, acc_no_test)
# CIK = '51143'
# acc_no = '000005114313000007/0000051143-13-000007'

logger.debug("# Get the HTML page")
CIK = cik
acc_no = acc_no_test
html_tail = '-index.html'
url_company = "http://www.sec.gov/Archives/edgar/data/" + CIK + "/" + acc_no + html_tail
r1 = session.get(url_company)
url_10q = ""

# match 10q page
logger.debug("# Get the 10Q page")
for url in r1.html.absolute_links:
    if re.match(r'[a-zA-z]+://[^\s]*.10q.htm', url) != None:
        url_10q = url
        break

# open 10q page
r2 = session.get(url_10q)
# find html element through css selector
# r2.html.find('table')
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://prettyprinted.com')

# print(r.html.links)
print(r.html.absolute_links)
print('\n')
print(r.html.find('.headline', first=True))
headline = r.html.find('.headline', first=True)
print('\n')
print(headline.text)
print('\n')

r = session.get('https://prettyprinted.com/p/the-flask-extensions-course')
print(r)
print(r.html.find('.course-section', first=True))
print('\n')
flask_wtf_section = r.html.find('.course-section', first=True)
print(flask_wtf_section)
print('\n')
print(flask_wtf_section.find('.item'))
items = flask_wtf_section.find('.item')
print('\n')
for item in items:
    print(item.text)
from requests_html import HTML, HTMLSession

session = HTMLSession()
r = session.get('https://coreyms.com')

article = r.html.find('article', first=True)  # first element with the "article" tag
headline = article.find('.entry-title-link', first=True).text
print(headline)
summary = article.find('.entry-content p', first=True).text  # first <p> inside the entry-content element
print(summary)

vid_src = article.find('iframe', first=True).attrs['src']
# print(vid_src.attrs['src'])
vid_id = vid_src.split('/')[4]
vid_id = vid_id.split('?')[0]
vid_id = f'https://youtube.com/watch?v={vid_id}'
print(vid_id)

# Repeat for every article on the page
articles = r.html.find('article')
for article in articles:
    headline = article.find('.entry-title-link', first=True).text
    print(headline)
    summary = article.find('.entry-content p', first=True).text
    print(summary)
    try:
        vid_src = article.find('iframe', first=True).attrs['src']
        # print(vid_src.attrs['src'])
        vid_id = vid_src.split('/')[4]
async def on_message(message): print( f"{message.channel}: {message.author}: {message.author.name}: {message.content}" ) sentdex_guild = client.get_guild(405403391410438165) author_roles = message.author.roles #print(author_roles) #author_role_ids = [r.id for r in author_roles] if random.choice(range(500)) == 30: matches = [r for r in author_roles if r.id in vanity_role_ids] #print(matches) if len(matches) == 0: try: role_id_choice = random.choice(vanity_role_ids) actual_role_choice = sentdex_guild.get_role(role_id_choice) #print(type(message.author)) author_roles.append(actual_role_choice) await message.author.edit(roles=author_roles) except Exception as e: print('EDITING ROLES ISSUE:', str(e)) with open(f"{path}/msgs.csv", "a") as f: if message.author.id not in chatbots: f.write( f"{int(time.time())},{message.author.id},{message.channel}\n") with open(f"{path}/log.csv", "a") as f: if message.author.id not in chatbots: try: f.write( f"{int(time.time())},{message.author.id},{message.channel},{message.content}\n" ) except Exception as e: f.write(f"{str(e)}\n") if "sentdebot.member_count()" == message.content.lower(): await message.channel.send(f"```py\n{sentdex_guild.member_count}```") elif "sentdebot.community_report()" == message.content.lower( ) and message.channel.id in image_chan_ids: online, idle, offline = community_report(sentdex_guild) file = discord.File(f"{path}/online.png", filename=f"{path}/online.png") await message.channel.send("", file=file) await message.channel.send( f'```py\n{{\n\t"Online": {online},\n\t"Idle/busy/dnd": {idle},\n\t"Offline": {offline}\n}}```' ) elif "sentdebot.p6()" == message.content.lower(): await message.channel.send( f"```\nThe Neural Networks from Scratch videos will resume one day. https://nnfs.io```" ) elif "sentdebot.user_activity()" == message.content.lower( ) and message.channel.id in image_chan_ids: # and len([r for r in author_roles if r.id in admins_mods_ids]) > 0: file = discord.File(f"{path}/activity.png", filename=f"{path}/activity.png") await message.channel.send("", file=file) #await message.channel.send(f'```py\n{{\n\t"Online": {online},\n\t"Idle/busy/dnd": {idle},\n\t"Offline": {offline}\n}}```') elif "help(sentdebot)" == message.content.lower( ) or "sentdebot.commands()" == message.content.lower(): await message.channel.send(commands_available) # if it doesnt work later. 
#elif "sentdebot.logout()" == message.content.lower() and message.author.id == 324953561416859658: elif "sentdebot.logout()" == message.content.lower() and str( message.author).lower() == "sentdex#7777": await client.close() elif "sentdebot.gtfo()" == message.content.lower() and str( message.author).lower() == "sentdex#7777": await client.close() elif "sentdebot.get_history()" == message.content.lower() and str( message.author).lower() == "sentdex#7777": channel = sentdex_guild.get_channel(channel_ids[0]) async for message in channel.history(limit=999999999999999): if message.author.id == 324953561416859658: with open(f"{path}/history_out.csv", "a") as f: f.write(f"{message.created_at},1\n") else: query = search_term(message.content) if query: #query = match.group(1) print(query) qsearch = query.replace(" ", "%20") full_link = f"https://pythonprogramming.net/search/?q={qsearch}" session = HTMLSession() r = session.get(full_link) specific_tutorials = [(tut.text, list(tut.links)[0]) for tut in r.html.find("a") if "collection-item" in tut.html] if len(specific_tutorials) > 0: return_str = "\n---------------------------------------\n".join( f'{tut[0]}: <https://pythonprogramming.net{tut[1]}>' for tut in specific_tutorials[:3]) return_str = f"```Searching for '{query}'```\n" + return_str + f"\n----\n...More results: <{full_link}>" await message.channel.send(return_str) else: await message.channel.send(f"""```py Traceback (most recent call last): File "<stdin>", line 1, in <module> NotFoundError: {query} not found```""")
from requests_html import HTMLSession

session = HTMLSession()
r = session.get('https://reddit.com')
# iterating over r.html pages through the site by following its "next" links
for html in r.html:
    print(html)
    pass
from bs4 import BeautifulSoup
import requests
from requests_html import HTMLSession

session = HTMLSession()
resp = session.get(
    "https://www.amazon.com/Sceptre-E248W-19203R-Monitor-Speakers-Metallic/dp/B0773ZY26F/ref=sr_1_2?crid=1861TM8A5NDPX&dchild=1&keywords=monitors&qid=1597071906&sprefix=monitors%2Caps%2C364&sr=8-2"
)
resp.html.render(sleep=1, keep_page=True, timeout=20)

soup = BeautifulSoup(resp.html.html, "lxml")
title = soup.find(id="productTitle").get_text().strip()
print(title)
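# The same title can also be extracted without BeautifulSoup, using requests_html's
# own selector API on the rendered page (sketch; assumes #productTitle is present).
title_element = resp.html.find('#productTitle', first=True)
if title_element is not None:
    print(title_element.text.strip())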
class XHSpider(Process):

    def __init__(self, url):
        # call the parent class __init__ first
        super(XHSpider, self).__init__()
        self.url = url
        self.session = HTMLSession()
        self.headers = {
            'Host': 'news.daxues.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        self.path = "D:/Photo/"
        self.check_file_path(self.path)

    def check_file_path(self, path):
        '''check whether the target directory exists'''
        if not os.path.exists(path):
            os.makedirs(path)

    def run(self):
        self.parse_page()

    def send_request(self, url):
        '''send a request; retry up to 3 times on errors'''
        i = 0
        while i < 3:
            try:
                print('requesting url:', url)
                # the page is utf-8 encoded
                return self.session.get(url, headers=self.headers).html
            except Exception as e:
                print('send_request error:', str(e))
                i += 1

    def parse_page(self):
        '''parse the page source and extract images with requests-html'''
        html = self.send_request(self.url)
        imgs = html.find('dl a.p img')
        for img in imgs:
            href = img.attrs['src']
            alt = img.attrs['alt']
            self.save_image('http://news.daxues.cn' + href, alt)

    def save_image(self, url, name):
        '''save image'''
        content = self.session.get(url, headers=self.headers).content
        with open(self.path + name + '.jpg', 'wb') as f:
            f.write(content)
            f.close()

    def parse(self, url):
        self.url = url
        self.parse_page()
def retrieve(self, job_state):
    from requests_html import HTMLSession
    session = HTMLSession()
    response = session.get(self.navigate)
    return response.html.html
def start_request(self, url):
    headers = {'user-agent': random.choice(self.USER_AGENT_LIST)}
    session = HTMLSession()
    response = session.get(url, headers=headers)
    return response
def fetch_pmo_charts(dst_dir, agency, base_url, subpages, datatype_prefix):
    """
    Fetch graphs and html tables from pmo (Povodi Moravy) water board

    fetch_pmo_charts(dst_dir='/home/jiri/meteodata',
                     base_url='http://www.pmo.cz/portal/srazky/en/',
                     subpages=['prehled_tab_1_chp.htm', 'prehled_tab_2_chp.htm', 'prehled_tab_3_chp.htm'],
                     datatype_prefix='precip',
                     agency='pmo')

    :param dst_dir: destination directory where to save the data (subdirs are created automatically)
    :param base_url: the base url [for example http://www.pvl.cz/portal/SaP/pc/? for streamflow,
                     http://www.pvl.cz/portal/srazky/pc/? for precipitation]
    :param subpages: the list of sub-pages (for example ['oid=1', 'oid=2', 'oid=3'])
    :param datatype_prefix: the data type. use 'streamflow' or 'precip'
    :param agency: the short name of the operating agency. use pla, poh, pod, pvl or pmo
    :return: number of charts and html pages downloaded
    """
    agency = "pmo"
    session = HTMLSession()
    n_charts = 0
    for subpage in subpages:
        url = base_url + subpage
        print('-----------------------------')
        print(url)
        print('-----------------------------')
        r = session.get(url)
        anchors = r.html.find('a')
        a_hrefs = [a for a in r.html.find('a') if "DoMereni" in a.attrs["href"]]
        for a in a_hrefs:
            id = a.attrs["href"].split("'")[1]
            url_html = '{:s}/en/mereni_{:s}.htm'.format(base_url, id)
            print(url_html)
            if datatype_prefix == 'precip':
                url_img = '{:s}/grafy/sr{:s}_en.gif'.format(base_url, id)
            else:
                url_img = '{:s}/grafy/{:s}.gif'.format(base_url, id)
            print(url_img)
            img_response = get(url_img)
            if img_response.status_code == 200:
                img_dir = os.path.join(dst_dir, datatype_prefix, agency,
                                       os.path.splitext(os.path.basename(url_img))[0])
                if not os.path.exists(img_dir):
                    os.makedirs(img_dir)
                utc_timestamp_text = datetime.utcnow().strftime('_%Y-%m-%dT%H0000z.gif')
                img_filename = os.path.basename(url_img).replace('.gif', utc_timestamp_text)
                img_path = os.path.join(img_dir, img_filename)
                print(img_path)
                with open(img_path, 'wb') as f:
                    f.write(img_response.content)
                n_charts += 1
                # also save the HTML
                html_path = img_path.replace('.gif', '.htm')
                html_response = get(url_html)
                if html_response.status_code == 200:
                    print(html_path)
                    with open(html_path, 'wb') as f:
                        f.write(html_response.content)
    return n_charts
    author = line2.text
    # print('author:', author)

    file_data = OrderedDict()
    file_data['author'] = author
    file_data['post_create_datetime'] = date + " 00:00:00"  # 2015-01-01 12:10:00
    file_data['title'] = news_title
    file_data['content'] = content
    file_data['url'] = url
    file_data['publisher'] = publisher

    return file_data


if __name__ == '__main__':
    session = HTMLSession()
    ahnlab_url = 'https://www.ahnlab.com/kr/site/securityinfo/secunews/secuNewsList.do?curPage=1&menu_dist=1&seq=&key=&dir_group_dist=&dir_code=&searchDate='
    r = session.get(ahnlab_url)
    r.html.render()

    for line in r.html.find('input.secuNewsSeq'):
        value = line.attrs['value']
        news_url = 'https://www.ahnlab.com/kr/site/securityinfo/secunews/secuNewsView.do?curPage=1&menu_dist=1&seq=' + value + '&key=&dir_group_dist=&dir_code=&searchDate='
        # print('news_url:', news_url)

        # Check in SQL whether the URL is a duplicate
        sql = "select EXISTS (select * from raw_table WHERE url=%s) as success"
        val = (news_url,)
        is_exists = select_mydb(sql, val)[0][0]  # returns 1 if true / 0 if false
        if is_exists:
            # Skip URLs that are already stored
            continue
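# select_mydb() is called above but not defined in this excerpt. A minimal
# sketch of what such a helper might look like, assuming a MySQL backend via
# pymysql; the connection settings below are placeholders, not from the source.
import pymysql


def select_mydb(sql, val):
    conn = pymysql.connect(host='localhost', user='crawler', password='secret',
                           db='newsdb', charset='utf8mb4')
    try:
        with conn.cursor() as cursor:
            # run the parameterized SELECT and return all rows
            cursor.execute(sql, val)
            return cursor.fetchall()
    finally:
        conn.close()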
def get(spell_id):
    session = HTMLSession()
    r = session.get("https://cn.wowhead.com/spell={}".format(spell_id))
    en_url = r.html.find('link[hreflang="en"]', first=True).attrs['href']
    zh_url = r.url
    return Spell(spell_id, parse(en_url), parse(zh_url))
import time

from requests_html import HTMLSession

starttime = time.time()

# Loop that collects the data
while True:
    print(time.strftime('%H:%M:%S', time.localtime()))

    # Set up the session for the web scraping
    url = 'https://br.investing.com/equities/magaz-luiza-on-nm-historical-data'
    session = HTMLSession()
    r = session.get(url).html
    data = r.find('#results_box', first=True).text.split()

    # Create the lists that store the data
    dados_diario = []
    dados_tam = len(data)
    linha = []

    # Collect the data from the first row of the table only
    i = 7
    while i < 14:
        Data_dh = data[i]
        linha.append(Data_dh)
        i = i + 1

    Abertura = data[i]
    linha.append(Abertura)
    i = i + 1
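    # `starttime` is recorded above but never used in this excerpt. A common
    # way such polling loops are paced (an assumption, not taken from the
    # source) is to sleep until the next 60-second boundary relative to
    # `starttime`, so each iteration starts on a fixed schedule.
    time.sleep(60.0 - ((time.time() - starttime) % 60.0))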
def scrap(self, country_from, country_to, link, file, time):
    """
    :type country_from: object
    """
    session = HTMLSession()
    r = session.get(link)
    tp = r.html.find('.tp')
    skad = r.html.find('.from')
    cena = r.html.find('.legPrice')
    # list_of_variables = [".date", ".to", ".from", ".legPrice", ".time," ,".tp", "p"]
    data = r.html.find('.date')
    data = data[3:]
    to = r.html.find('.to')
    change = r.html.find('.durcha')
    przewoznik = r.html.find('.airline')
    scrap_date = time.strftime("%Y-%m-%d")
    scrap_time = time.strftime("%H:%M")
    # with open(file, "a") as f:
    #     myfile.write("appended text")
    #     self.write_data(myfile)
    licznik = 0
    ii = 0
    licz_ceny = 0
    i = 0
    id_podrozy = self.flight_id
    ThereBack = ['There', "Back"]
    # print(len(change))
    print('starting to save for.. ' + country_from + " - " + country_to)

    while i + ii < len(change):
        przes = 0
        print(change[i + ii].full_text[9:11])
        print(i + ii)
        print(len(change))
        try:
            while change[i + ii].full_text[9:11] != 'no':
                # print('if 1 for.. ' + country)
                przes = 1
                file.write(scrap_date + ";" + scrap_time + ";" + country_from + ";" + country_to + ";" +
                           str(id_podrozy) + ";" +
                           str(ThereBack[(i + ii) % 2]) + ";" + str(data[i + ii].text[4:]) + ";" +
                           str(przewoznik[licz_ceny].text) + ";" + str(przes) + ";" +
                           str(cena[licz_ceny].text[1:]) + ";" +
                           str(skad[licznik + 1].text.replace('\xa0', ' ')[3:8]) + ";" +
                           str(skad[licznik + 1].text.replace('\xa0', ' ')[9:]) + ";" +
                           str(to[licznik + 1].text[:5]) + ";" + str(to[licznik + 1].text[6:]) + "\n")
                licz_ceny += 1
                licznik += 1
                file.write(scrap_date + ";" + scrap_time + ";" + country_from + ";" + country_to + ";" +
                           str(id_podrozy) + ";" +
                           str(ThereBack[(i + ii) % 2]) + ";" + str(data[i + ii].text[4:]) + ";" +
                           str(przewoznik[licz_ceny].text) + ";" + str(przes) + ";" +
                           str(cena[licz_ceny].text[1:]) + ";" +
                           str(skad[licznik + 1].text.replace('\xa0', ' ')[3:8]) + ";" +
                           str(skad[licznik + 1].text.replace('\xa0', ' ')[9:]) + ";" +
                           str(to[licznik - 1].text[:5]) + ";" + str(to[licznik - 1].text[6:]) + "\n")
                licz_ceny += 1
                licznik += 2
                ii += 1
                if (i + ii) % 2 == 0:
                    id_podrozy += 1
        except IndexError:
            print("Something went wrong")

        # if ii%3==0:
        #     id_podrozy += 1
        # ?? added these ifs so it doesn't crash, but now some rows are never written at all
        if len(to) > licznik:
            # if len(to[licznik].text)<7 and len(skad[licznik + 1].text.replace('\xa0', ' '))<10:
            file.write(scrap_date + ";" + scrap_time + ";" + country_from + ";" + country_to + ";" +
                       str(id_podrozy) + ";" +
                       str(ThereBack[(i + ii) % 2]) + ";" + str(data[i + ii].text[4:]) + ";" +
                       str(przewoznik[licz_ceny].text) + ";" + str(przes) + ";" +
                       str(cena[licz_ceny].text[1:]) + ";" +
                       str(skad[licznik + 1].text.replace('\xa0', ' ')[3:8]) + ";" +
                       str(skad[licznik + 1].text.replace('\xa0', ' ')[9:]) + ";" +
                       str(to[licznik].text[:5]) + ";" + str(to[licznik].text[6:]) + "\n")
            # id_podrozy += 1
            licz_ceny += 1
            licznik += 2

        i += 1
        if (i + ii) % 2 == 0:
            id_podrozy += 1
        print('-----')
        # print("----2------")
        # print(licz_ceny)
        # print(licznik)
        # print(i)

    self.flight_id = id_podrozy
    file.flush()
    print(country_from + " - " + country_to + " finished!")
from requests_html import HTMLSession

root = 'https://www.wiki-wiki.top/baike-%E8%A5%BF%E6%B8%B8%E8%AE%B0%E8%A7%92%E8%89%B2%E5%88%97%E8%A1%A8'
all_entities = []

print('visiting...')
url = root
session = HTMLSession()
response = session.get(url)
output = response.html.find('div.mw-parser-output', first=True)
print(output.text)
a_list = response.html.find('dt')

# cur_ents = []
# for a in a_list:
#     if a.attrs.get('class', '') == ('category-page__member-link', ):
#         cur_ents.append(a.attrs['title'])
# for t in cur_ents:
#     if 'Template' in t: continue
#     if 'Category' in t:
#         if t not in have_seen_categories:
#             current_category.append(t)
#             have_seen_categories.add(t)
#     else:
#         all_entities.append(t)

all_entities = sorted(list(set(all_entities)))
with open('entities_wiki1.txt', 'w', encoding='utf-8') as f:
    for t in all_entities:
        f.write(t.strip() + '\n')
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 13 13:00:05 2020

@author: srira
"""

from requests_html import HTMLSession

session = HTMLSession()
r = session.get("https://en.wikipedia.org/wiki/Association_football")
r.status_code
# print(r.html)

urls = r.html.links
# print(urls)

absoluteurls = r.html.absolute_links
# print(absoluteurls)
type(absoluteurls)

links = r.html.find('a')
# print(links)

onlywikipedialinks = r.html.find('a', containing="wikipedia")
# print(onlywikipedialinks)

textinwikipedialink = [
    wikipedialink.text for wikipedialink in onlywikipedialinks
]
import pandas as pd
from requests_html import HTMLSession

session = HTMLSession()

major_need_crawl = [
    'B1', 'B2', 'B3', 'B5', 'C1', 'C2', 'C3', 'C4', 'F8', 'E1', 'E3', 'E4',
    'E5', 'E6', 'E8', 'E9', 'F0', 'F1', 'F2', 'F4', 'F5', 'F6', 'F9', 'H1',
    'H2', 'H3', 'H4', 'H5', 'I2', 'I3', 'I5', 'I6', 'I7', 'I8', 'D2', 'D4',
    'D5', 'D8', 'E2', 'F7', 'E7', 'F2', 'F3', 'C5', 'C6'
]

all_major = {}
major = []
course = []

for k in major_need_crawl:
    r = session.get(
        'http://course-query.acad.ncku.edu.tw/qry/qry001.php?dept_no={}'.format(k))
    r.encoding = 'utf-8'
    res = r.html.find('thead tr th')
    for i in range(1, 5):
        resp = r.html.find('.course_y{} td'.format(i))
        for j in range(len(resp)):
            if (j + 1) % len(res) == 0 and j != 0:
                course.append(str(resp[j].text))
                major.append(course.copy())
                course.clear()
            else:
                course.append(str(resp[j].text))
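# The pandas import above is unused in this excerpt; presumably the rows
# collected in `major` are meant to end up in a DataFrame. A minimal sketch
# under that assumption (not from the original source), using the header cells
# found in `res` as column names:
columns = [th.text for th in res]
df = pd.DataFrame(major, columns=columns)
print(df.head())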
# get the target url from the command line arguments
url = str(sys.argv[1])

# for any given fanfiction.net link only the first 5 parts matter for locating a story
# https://www.fanfiction.net/s/8897431/1/Child-of-the-Storm --> Original
# https: | | www.fanfiction.net | s | 8897431 | 1 | Child-of-the-Storm --> Post Split (| marks the places where the split occurred)
# https://www.fanfiction.net/s/8897431 --> rebuilt URL
split = url.split('/')
main = split[0] + "/" + split[1] + '/' + split[2] + '/' + split[3] + '/' + split[4]

# Create the session object
session = HTMLSession()
# send a GET HTTP request
pageHTML = session.get(main)
# render() loads the html into a headless puppeteer browser and executes any javascript on the page
# (this is important because parts of the page are dynamically loaded using jQuery)
pageHTML.html.render()
# build the BeautifulSoup object based on the now rendered HTML
pageSoup = soup(pageHTML.html.html, "html.parser")

# create the directory where the story will live based on the name of the story
dirName = pageSoup.find('b', {'class': 'xcontrast_txt'}).text
os.mkdir(dirName)

# create a link to a local copy of the stylesheet that will be retrieved later
styleSheets = '<link rel="stylesheet" href="./xss26.css">'

# find the length of the story based on the existence of a known select tag. If the tag does not exist the story has 1 chapter only
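# Worked example of the URL split described in the comments above (purely
# illustrative; the script rebuilds `main` the same way via concatenation).
example = 'https://www.fanfiction.net/s/8897431/1/Child-of-the-Storm'
parts = example.split('/')
# parts == ['https:', '', 'www.fanfiction.net', 's', '8897431', '1', 'Child-of-the-Storm']
rebuilt = '/'.join(parts[:5])
# rebuilt == 'https://www.fanfiction.net/s/8897431'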
def UploadImageAsset(client, url, image_ref_on_file, image_name, width, height, mode):
    """Uploads the image from the specified url.

    Args:
      client: An AdWordsClient instance.
      url: The image URL.
      image_ref_on_file: local file path where the resized image is written.
      image_name: file name used to build the local fallback URL.
      width: target image width in pixels.
      height: target image height in pixels.
      mode: tinify resize method.

    Returns:
      The ID of the uploaded image.
    """
    # Initialize appropriate service.
    asset_service = client.GetService('AssetService', version='v201809')

    # Download the image.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'
    }
    session__ = HTMLSession()
    """
    image_request = session__.get(url, headers=headers, verify=True)
    # print("URL: " + url)
    print(image_request.content)
    print(image_request.html)
    """
    print(url)
    tab = url.split('&')
    # print(type(url))
    image_request = session__.get(tab[0], headers=headers, verify=True)
    # print(tab[0])
    # image_asset = BytesIO(urlopen(tab[0]).read())
    image_asset = image_request.content
    # print(image_asset)

    # Create the image asset.
    try:
        source = tinify.tinify.tinify.from_url(url)
        # print(source)
        resized_image = source.resize(method=mode, width=int(width), height=int(height))
        data = resized_image.to_file(image_ref_on_file)
        # print(sys.getsizeof(data))
        # print(data)
    except:
        try:
            source = tinify.tinify.tinify.from_url(url)
            print(source)
            resized_image = source.resize(method=mode, width=int(width), height=int(height))
            data = resized_image.to_file(image_ref_on_file)
            print(sys.getsizeof(data))
            # print(data)
        except Exception as e:
            print(e)
            print(image_name)
            file_url = url_for('uploaded_file', filename=image_name, _external=True)
            image_asset = {
                'xsi_type': 'ImageAsset',
                'imageData': urlopen(file_url).read(),
                # This field is optional, and if provided should be unique.
                # 'assetName': 'Image asset ' + str(uuid.uuid4()),
            }

    # Create the operation.
    operation = {
        'operator': 'ADD',
        'operand': image_asset
    }

    # Create the asset and return the ID.
    result = asset_service.mutate([operation])
    return result['value'][0]['assetId']
#!/usr/bin/env python
# coding=utf-8
# author: zengyuetian
# Get the district information from Lianjia


from requests_html import HTMLSession


if __name__ == '__main__':
    session = HTMLSession()
    # Fetch the Shanghai Lianjia residential-community listing page
    r = session.get('https://sh.lianjia.com/xiaoqu/')

    # Get the list of Shanghai districts from Lianjia
    elements = r.html.xpath('/html/body/div[3]/div[1]/dl[2]/dd/div/div/a')

    # Lists of district names in pinyin and Chinese
    en_names = list()
    ch_names = list()

    # Each element's html looks like <a href="/xiaoqu/pudong/" title="上海浦东小区二手房 ">浦东</a>
    for element in elements:
        for link in element.absolute_links:  # iterate over the set of links
            en_names.append(link.split('/')[-2])
            ch_names.append(element.text)

    # Print the pinyin and Chinese district name lists
    for index, name in enumerate(en_names):
        print(name, ch_names[index])

    """
    pudong 浦东
    minhang 闵行
from requests_html import HTMLSession

session = HTMLSession()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
}
r = session.get(
    'https://tembiapo.mopc.gov.py/obras/42-pavimentacion-de-tramos-alimentadores-de-la-red-vial-nacional-paquete-n-1-lote-2',
    verify=False, headers=headers)

table = r.html.find('#info', first=True)

columns = []
for e in table.find('label'):
    columns.append(e.text)

cells = []
# NOTE: `urls` is not defined in this excerpt; presumably it holds the value
# elements of the #info table that pair with the labels collected above.
for e in urls:
    cells.append(e.text)

print(columns)
print(cells)