def get_html(url):
    """Fetch *url* with the module-global selenium driver and return a
    requests_html.HTML for the page body.

    Loops until Google's CAPTCHA interstitial is gone: while the CAPTCHA
    element is present it beeps and waits; once absent it extracts
    ``#gs_top``'s innerHTML. If neither the CAPTCHA nor ``#gs_top`` is
    found the browser is assumed to be blocked, so it is replaced with a
    fresh Chrome instance and the fetch is retried recursively.
    """
    global driver
    # Random pre-fetch delay — presumably rate-limit evasion; TODO confirm.
    time.sleep(random.randint(1,5))
    driver.get(url)
    while True:
        try:
            # NOTE(review): find_element_by_css_selector was removed in
            # Selenium 4; this code assumes Selenium 3 — confirm the pinned
            # version before upgrading.
            recap = driver.find_element_by_css_selector(
                '#gs_captcha_ccl,#recaptcha')
        except NoSuchElementException:
            # No CAPTCHA on screen: either we have the page, or we're blocked.
            try:
                html = driver.find_element_by_css_selector('#gs_top').\
                    get_attribute('innerHTML')
                return requests_html.HTML(html=html)
            except NoSuchElementException:
                # Neither CAPTCHA nor content — start over with a new browser.
                # NOTE(review): recursion here is unbounded if Google keeps
                # blocking fresh browsers.
                print("google has blocked this browser, reopening")
                driver.close()
                driver = webdriver.Chrome()
                return get_html(url)
        # CAPTCHA is up: alert the operator (\a rings the terminal bell)
        # and poll again in 5 seconds.
        print("... it's CAPTCHA time!\a ...")
        time.sleep(5)
def parse_election_site_results(content):
    """Parse a KPU C1 HTML page for scanned pages of election results.

    Arguments:
        content: HTML string

    Output:
        jpg_urls: List of URLs which are the scanned pages of election results
    """
    # On Kelurahan/Desa "PARTALI TORUAN" pages the scans are linked as:
    #   <a href="javascript:read_jpg('000744000101')" class="image1_aktif">
    # and clicking one downloads:
    #   http://scanc1.kpu.go.id/viewp.php?f=000744000101.jpg
    jpg_urls = []
    html = requests_html.HTML(html=content)
    # Compile once instead of re-matching a literal pattern per anchor.
    read_jpg_re = re.compile(r'^javascript:read_jpg\(\'(\d+)\'\)$')
    for anchor in html.find(selector='a.image1_aktif'):
        # The CSS selector already guarantees the image1_aktif class; the
        # original `assert attrs['class'] == ('image1_aktif',)` was dropped
        # because it is stripped under -O and breaks on multi-class anchors.
        matches = read_jpg_re.match(anchor.attrs['href'])
        if not matches:
            continue
        # Example of jpg_id: '000000400101'
        jpg_id = matches.group(1)
        # We're only interested in page 04
        if jpg_id.endswith('04'):
            jpg_urls.append('http://scanc1.kpu.go.id/viewp.php?f=' +
                            jpg_id + '.jpg')
    return jpg_urls
def get_request(request, headers, route_parameters, re_render_time):
    """GET *request* (resolved via route_parameters), render its JavaScript,
    and return the formatted HTML content.

    Arguments:
        request: route identifier passed to get_address
        headers: dict of HTTP headers; mutated to carry the auth token
        route_parameters: parameters for get_address
        re_render_time: extra delay in milliseconds before a second render
                        pass (0 disables the second pass)

    Raises:
        HTTPError: if the server answers with an error status.
    """
    # Attach the current token, if one has been issued.
    user_token = test_state.get_authentication()
    if user_token is not None:
        headers["X-authorization"] = user_token
    address = get_address(request, route_parameters)
    session = requests_html.HTMLSession()
    response = session.get(address, headers=headers)
    try:
        response.raise_for_status()
    except HTTPError as err:
        # Chain the original error so the status code and traceback survive.
        raise HTTPError("Internal server error. Content of response was \n" +
                        response.text) from err
    # format once before rendering to remove ignored elements
    content_unrendered = format_html(response.html.html)
    content_html = requests_html.HTML(html=content_unrendered)
    content_html.render()
    if re_render_time != 0:
        time.sleep(re_render_time / 1000)
        content_html.render()
    # format again after rendering
    content = format_html(content_html.html)
    return content
def get_page_source(self):
    """Fetch ``self.url`` with PhantomJS and return it parsed as a
    requests_html.HTML object.

    On Windows the PhantomJS binary is taken from PATH; elsewhere the
    pinned /opt/model path is used.
    """
    sysstr = platform.system()
    if sysstr == "Windows":
        browser = webdriver.PhantomJS()
    else:
        browser = webdriver.PhantomJS(
            executable_path="/opt/model/phantomjs/bin/phantomjs")
    try:
        # NOTE(review): PhantomJS support is deprecated in Selenium;
        # consider headless Chrome/Firefox when upgrading.
        browser.get(self.url)
        dom = browser.page_source
        return requests_html.HTML(html=dom)
    finally:
        # Always shut the browser down, even if the fetch raises — the
        # original leaked one PhantomJS process per call.
        browser.quit()
def parse(self, response):
    """Yield one dict per row of the page's pme-main table, keyed by
    ``self.column_names``; returns None when the page lacks a
    .pagecurrent marker."""
    body = requests_html.HTML(html=response.css("body").extract_first())
    if body.find(".pagecurrent", first=True) is None:
        return None
    for row in body.find('table.pme-main tr'):
        cells = row.find("td")
        record = {self.column_names[idx]: self._text(cell)
                  for idx, cell in enumerate(cells)}
        # Header/empty rows produce no <td> cells and are skipped.
        if record:
            yield record
def assert_puzzle_cached(cls, puzzle_url_suffix):
    """Ensure an HTML snapshot of the puzzle exists on disk, then attach
    it to the class, parsed, under the attribute name derived from the
    URL suffix."""
    snapshots_dir = Path(__file__).parent / 'snapshots'
    snapshots_dir.mkdir(exist_ok=True)
    snapshot_path = snapshots_dir / f'{puzzle_url_suffix}.html'
    if not snapshot_path.exists():
        # First run for this puzzle: capture the page now.
        print(
            f'Taking HTML snapshot of puzzle "{puzzle_url_suffix}" and storing in {snapshots_dir}...'
        )
        take_snapshot(puzzle_url_suffix, snapshots_dir)
    parsed = requests_html.HTML(html=snapshot_path.read_text())
    setattr(cls, cls.suffix_as_attr(puzzle_url_suffix), parsed)
def scrap_works(self, html, url, works_sink):
    """Scrape citation counts for all works on a Scholar-style profile.

    Reads the counts visible on the first page from *html*, then keeps
    POSTing paginated requests to *url* until a page returns fewer rows
    than requested. Each batch of rows is also forwarded to
    ``self.feed_works_sink``.

    Returns:
        dict with 'count' (number of works seen) and 'citation-counts'
        (list of ints, one per work).
    """
    TRS_SELECTOR = 'tr td.gsc_a_c a.gsc_a_ac'
    SELECTOR = '#gsc_a_t tbody ' + TRS_SELECTOR
    values = []
    # window = [start, size]: first page shows 20 rows; subsequent POSTs
    # request 60 at a time. NOTE(review): the loop condition compares
    # len(values) to window[1] (the requested page size), so pagination
    # stops as soon as a page comes back short — confirm this matches the
    # endpoint's contract.
    window = [0,20]
    values += _get_html_ints(html, SELECTOR, empty=0)
    self.feed_works_sink(html.find('#gsc_a_t tbody', first=True), works_sink)
    while len(values) >= window[1]:
        window = [window[1], window[1]+60]
        # NOTE(review): local `json` shadows any imported json module for
        # the rest of this function — consider renaming.
        json = self.session.post(url + f'&cstart={window[0]}&pagesize' +
                                 f'={window[1]}', data='json=1')
        json = json.json()
        # 'B' carries the HTML fragment with the next batch of table rows;
        # an empty payload means no more works. TODO confirm key semantics.
        payload = json['B'].strip()
        if payload != '':
            trs = requests_html.HTML(html=payload)
            values += _get_html_ints(trs, TRS_SELECTOR, empty=0)
            self.feed_works_sink(trs, works_sink)
    return {'count': len(values), 'citation-counts': values}
def iter_dir(dir_path): nonlocal c for pathname in os.listdir(dir_path): abs_path = os.path.join(dir_path, pathname) if pathname.endswith('html'): rel_path = os.path.relpath(abs_path, SAVE_DIR) with open(abs_path, 'r') as f: html = requests_html.HTML(html=f.read()) title = html.find('title', first=True).text insert_table(c, title, "Guide", rel_path) print("Add guide: ", title, rel_path) anchors = html.find('h2') for i in anchors: name = i.text[2:] p = rel_path + '#' + urllib.parse.quote_plus(i.attrs['id']) print("Add anchor: ", name, p) insert_table(c, name, 'Section', p) elif os.path.isdir(abs_path): iter_dir(abs_path)
def process(process_data: str) -> Dict:
    """Parse a court-process HTML page into its general data, parties and
    movements; returns the empty skeleton when the page reports that no
    information was found."""
    page = requests_html.HTML(html=process_data)
    result = {
        'Dados do processo': {},
        'Partes do processo': [],
        'Movimentações': [],
    }
    not_found = ('Não existem informações disponíveis para os '
                 'parâmetros informados')
    # Early-out on the "nothing found" page.
    if not_found in page.text:
        return result
    general_table = page.xpath(
        "//table[contains(@class, 'secaoFormBody')]")[1]
    result['Dados do processo'] = general_data(general_table)
    parts_table = page.find(
        '#tableTodasPartes,#tablePartesPrincipais', first=True)
    result['Partes do processo'] = parts(parts_table)
    movements_table = page.find('#tabelaUltimasMovimentacoes', first=True)
    result['Movimentações'] = movements(movements_table)
    return result
def get_hot_boards():
    """Return the hot boards listed on PTT's index page as a list of dicts
    with name, user count, class and title."""
    resp = fetch("https://www.ptt.cc/bbs/index.html")
    page = requests_html.HTML(html=resp.text)

    def _field(entry, css):
        # First matching element's text within one board entry.
        return entry.find(css, first=True).text

    return [
        {
            "name": _field(entry, 'div.board-name'),
            "nuser": _field(entry, 'div.board-nuser'),
            "class": _field(entry, 'div.board-class'),
            "title": _field(entry, 'div.board-title'),
        }
        for entry in page.find('div.b-ent')
    ]
async def job(url, find_queue, visited, semaphone):
    """Download *url*, extract its markdown article body, save it as a
    standalone HTML page, enqueue newly discovered links, and release the
    concurrency semaphore.

    NOTE(review): the parameter name `semaphone` looks like a typo for
    `semaphore`, but renaming would break callers using keyword args.
    """
    html = await get(url)
    print('Downloaded: ', url)
    html = requests_html.HTML(url=url, html=html)
    markdown = html.find('article.markdown-body', first=True).html
    # Strip the site's branded title suffix.
    title = html.find('title', first=True).text.replace(' - 为企业级框架和应用而生', '')
    # Wrap the extracted article in a minimal page; the stylesheet path is
    # relative, so pages one directory deep prefix it with '../'.
    await save( url, f""" <html> <head> <title>{ title }</title> <meta charset="UTF-8"> <link rel="stylesheet" href="{ '../' if url[len(BASE_URL):].find('/') > 0 else '' }index.css"> </head> <body> { markdown } </body> </html>""")
    # Enqueue unseen links; marking them visited here prevents duplicate
    # scheduling by concurrent jobs.
    links = filter_url(html.absolute_links, visited)
    for link in links:
        visited.add(link)
        find_queue.put(link)
    semaphone.release()
def parse_c1_html(content):
    """Parse a KPU C1 drill-down HTML page.

    Arguments:
        content: HTML string

    Output:
        administrative_type: Type of Administrative Division, i.e.
            'Provinsi', 'Kabupaten/Kota', 'Kecamatan', 'Kelurahan/Desa',
            or None on a leaf page.
        administratives: [{'name', 'id', 'parent_id', 'url'}, ...]

    Raises:
        Exception: if the page's <select> does not match the expected
            wilayah_id/selectCat structure.
    """
    html = requests_html.HTML(html=content)
    # Pages drill down Provinsi -> Kabupaten/Kota -> Kecamatan ->
    # Kelurahan/Desa. Each non-leaf level carries:
    #   <span class="label">Provinsi :</span>
    #   <select class="formfield" name="wilayah_id"
    #           onChange="selectCat(this,'6728')">
    #     <option value="">pilih</option><option value="1">ACEH</option>...
    # Selecting an option downloads:
    #   https://pilpres2014.kpu.go.id/c1.php?cmd=select&grandparent=<id>&parent=<option>
    # The leaf level (Kelurahan/Desa) has neither the <span> nor the <select>.
    html_span_label = html.find(selector='span.label', first=True)
    if html_span_label is None:
        # Leaf page: no further subdivisions.
        return None, []
    # e.g. 'Provinsi :' -> 'Provinsi'
    administrative_type = html_span_label.text.replace(' :', '')
    html_select = html.find(selector='select.formfield', first=True)
    # Validate the structure with explicit raises instead of assert, which
    # is stripped under -O; matches the existing raise style below.
    if html_select.attrs['name'] != 'wilayah_id':
        raise Exception('Unexpected select name="{}"'.format(
            html_select.attrs['name']))
    if html_select.attrs['class'] != ('formfield',):
        raise Exception('Unexpected select class="{}"'.format(
            html_select.attrs['class']))
    # requests_html lower-cases attribute names, hence 'onchange'.
    html_select_onchange = html_select.attrs['onchange']
    matches = re.match(r'^selectCat\(this,\'(\d+)\'\)$', html_select_onchange)
    if not matches:
        raise Exception('Cannot parse onChange="{}"'.format(html_select_onchange))
    parent_id = int(matches.group(1))
    administratives = []
    for option in html_select.find(selector='option'):
        if option.attrs['value'] == '':
            # Ignore the 'pilih' placeholder entry.
            continue
        option_value = int(option.attrs['value'])
        url = ('https://pilpres2014.kpu.go.id/c1.php?cmd=select'
               '&grandparent={}&parent={}'.format(parent_id, option_value))
        administratives.append({
            'name': option.text,
            'id': option_value,
            'parent_id': parent_id,
            'url': url,
        })
    return administrative_type, administratives
def get_table(self):
    """Parse the current browser page and return its #ip_list element."""
    page_source = self.__browser.page_source
    document = HTML.HTML(html=page_source)
    return document.find('#ip_list', first=True)
import zipfile import pyquery import re import json import os import pandas as pd import requests_html items = [] with zipfile.ZipFile('us_words.zip', 'r') as zf: for name in zf.namelist()[:]: html = zf.open(name, 'r').read().decode('utf-8') html = re.sub(r'<b\s+class="b".*?>(.+?)</b>', r'`\1`', html, flags=re.DOTALL) doc = requests_html.HTML(html=html) #doc = pyquery.PyQuery(html) #print(repr(doc.find('.head h1'))) key = word = doc.find('.head h1')[0].text pron = ' '.join(list(x.text for x in doc.find('.head .pron'))) #assert '!!' not in pron, word #print(word) #item = {'word':word} print(word) for posblock in doc.find('.posblock'): poses = [x.text for x in posblock.find('.posblock > .posgram') ] # phrasal verb pos = ' '.join({x: 1 for x in poses}.keys()) pos = re.sub(r"\s+", ' ', pos)
def _update_html(self, qt_webpage_to_html: str):
    """Store the freshly rendered HTML, rebuild the parser, re-expose its
    find/xpath hooks, and let the Qt event loop exit."""
    self.html_str = qt_webpage_to_html
    parser = requests_html.HTML(url=self.url, html=self.html_str)
    self.html_parser = parser
    self.find = parser.find
    self.xpath = parser.xpath
    # Rendering is complete; unblock the caller waiting on the Qt loop.
    self._qt_app.quit()
def eval_js(script):
    """Run *script* in requests_html's headless browser and return the
    result of the render() call.

    NOTE(review): HTML() is constructed here without an ``html=`` argument;
    confirm the installed requests_html version tolerates an empty
    document, otherwise seed it with a minimal page.
    """
    return requests_html.HTML().render(script=script, reload=False)
def get_tbody(self):
    """Parse the current browser page and return the second tbody of its
    .table element."""
    page_source = self.__browser.page_source
    document = HTML.HTML(html=page_source)
    return document.find('.table > tbody:nth-child(2)', first=True)