Exemplo n.º 1
0
def get_html(url):
    """
    Drive the module-global selenium browser to *url* and return a
    requests_html.HTML object built from the page's ``#gs_top`` element.

    If a CAPTCHA challenge is detected, alert the user (terminal bell) and
    poll every 5 seconds until it has been solved.  If the page has neither
    a CAPTCHA nor ``#gs_top``, assume the browser has been blocked, reopen a
    fresh Chrome instance (rebinding the global ``driver``) and retry.

    Arguments:
        url: address to fetch
    Returns:
        requests_html.HTML wrapping the inner HTML of ``#gs_top``
    """
    global driver

    # Short random delay between fetches to look less like a bot.
    time.sleep(random.randint(1, 5))
    driver.get(url)
    while True:
        try:
            # The call matters only for its NoSuchElementException: presence
            # of either element means a CAPTCHA is being shown.  (The former
            # unused `recap =` binding has been dropped.)
            driver.find_element_by_css_selector('#gs_captcha_ccl,#recaptcha')
        except NoSuchElementException:

            try:
                html = driver.find_element_by_css_selector('#gs_top').\
                        get_attribute('innerHTML')
                return requests_html.HTML(html=html)
            except NoSuchElementException:
                # Neither CAPTCHA nor results page: the browser is blocked.
                print("google has blocked this browser, reopening")
                driver.close()
                driver = webdriver.Chrome()
                return get_html(url)

        print("... it's CAPTCHA time!\a ...")
        time.sleep(5)
Exemplo n.º 2
0
    def parse_election_site_results(content):
        """
        Extract scanned C1-form image URLs from a KPU election-results page.

        Arguments:
            content: HTML string
        Output:
            jpg_urls: List of URLs which are the scanned pages of election results
        """
        # On Kelurahan/Desa "PARTALI TORUAN": There is links for scanned pages such as: <a href="javascript:read_jpg('000744000101')" class="image1_aktif" >
        # When clicking this image, it will download: http://scanc1.kpu.go.id/viewp.php?f=000744000101.jpg
        page = requests_html.HTML(html=content)
        href_pattern = re.compile(r'^javascript:read_jpg\(\'(\d+)\'\)$')

        jpg_urls = []
        for anchor in page.find(selector='a.image1_aktif'):
            assert anchor.attrs['class'] == ('image1_aktif',)
            match = href_pattern.match(anchor.attrs['href'])
            if not match:
                continue
            # Example of jpg_id: '000000400101'
            jpg_id = match.group(1)
            # We're only interested in page 04
            if jpg_id.endswith('04'):
                jpg_urls.append('http://scanc1.kpu.go.id/viewp.php?f=' + jpg_id + '.jpg')
        return jpg_urls
Exemplo n.º 3
0
def get_request(request, headers, route_parameters, re_render_time):
    """
    GET the address resolved from `request`, render its JavaScript, and
    return the formatted HTML content.

    Arguments:
        request: identifier resolved to a URL via get_address
        headers: dict of HTTP headers; mutated in place to add
                 "X-authorization" when an auth token is available
        route_parameters: parameters forwarded to get_address
        re_render_time: extra delay in milliseconds before a second render
                        (0 skips the re-render)
    Returns:
        formatted HTML string (via format_html)
    Raises:
        HTTPError: if the server responds with an error status; the response
                   body is included in the message and the original error is
                   chained as the cause.
    """
    # Attach the current auth token, if one has been established.
    user_token = test_state.get_authentication()
    if user_token is not None:  # identity check, not `!= None`
        headers["X-authorization"] = user_token

    address = get_address(request, route_parameters)

    session = requests_html.HTMLSession()

    response = session.get(address, headers=headers)
    try:
        response.raise_for_status()
    except HTTPError as err:
        # Chain the original error so the status code and traceback survive.
        raise HTTPError("Internal server error. Content of response was \n" +
                        response.text) from err

    # format once before rendering to remove ignored elements
    content_unrendered = format_html(response.html.html)

    content_html = requests_html.HTML(html=content_unrendered)

    content_html.render()
    if re_render_time != 0:
        time.sleep(re_render_time / 1000)
        content_html.render()

    # format again after rendering
    content = format_html(content_html.html)

    return content
Exemplo n.º 4
0
    def get_page_source(self):
        """
        Fetch self.url with a headless PhantomJS browser and return the page
        parsed as a requests_html.HTML object.

        On Windows PhantomJS is expected on PATH; elsewhere a fixed install
        path is used.  The browser is always quit, even on error (the
        original leaked the browser process).
        """
        sysstr = platform.system()
        if sysstr == "Windows":
            browser = webdriver.PhantomJS()
        else:
            browser = webdriver.PhantomJS(
                executable_path="/opt/model/phantomjs/bin/phantomjs")
        try:
            browser.get(self.url)
            dom = browser.page_source
        finally:
            # Ensure the PhantomJS process is terminated.
            browser.quit()

        return requests_html.HTML(html=dom)
Exemplo n.º 5
0
    def parse(self, response):
        """
        Yield one dict per row of the `table.pme-main` table, mapping
        self.column_names to cell text.  Pages without a `.pagecurrent`
        marker yield nothing.
        """
        body = requests_html.HTML(html=response.css("body").extract_first())
        if body.find(".pagecurrent", first=True) is None:
            return None

        for row in body.find('table.pme-main tr'):
            timeline = {
                self.column_names[idx]: self._text(cell)
                for idx, cell in enumerate(row.find("td"))
            }
            # Header rows produce no <td> cells and are skipped.
            if timeline:
                yield timeline
Exemplo n.º 6
0
    def assert_puzzle_cached(cls, puzzle_url_suffix):
        """
        Ensure an HTML snapshot of the puzzle exists on disk (taking one if
        missing), parse it, and attach the parsed HTML to the class under the
        attribute name derived by cls.suffix_as_attr.
        """
        dir_snapshots = Path(__file__).parent / 'snapshots'
        dir_snapshots.mkdir(exist_ok=True)
        path_snapshot = dir_snapshots / f'{puzzle_url_suffix}.html'

        if not path_snapshot.exists():
            print(
                f'Taking HTML snapshot of puzzle "{puzzle_url_suffix}" and storing in {dir_snapshots}...'
            )
            take_snapshot(puzzle_url_suffix, dir_snapshots)

        parsed = requests_html.HTML(html=path_snapshot.read_text())
        setattr(cls, cls.suffix_as_attr(puzzle_url_suffix), parsed)
Exemplo n.º 7
0
 def scrap_works(self, html, url, works_sink):
     """
     Collect per-work citation counts from a Google Scholar profile page,
     paging through the site's AJAX endpoint, feeding each batch of table
     rows to works_sink.

     Arguments:
         html: requests_html.HTML of the first (already fetched) profile page
         url: base profile URL; paging params are appended to it
         works_sink: consumer passed to self.feed_works_sink per batch
     Returns:
         {'count': number of works seen, 'citation-counts': list of ints}
     """
     # Selector for the citation-count links inside the works table.
     TRS_SELECTOR = 'tr td.gsc_a_c a.gsc_a_ac'
     SELECTOR = '#gsc_a_t tbody ' + TRS_SELECTOR
     values = []
     # [start, end] paging window; the first 20 rows come from `html` itself.
     window = [0,20]
     values += _get_html_ints(html, SELECTOR, empty=0)
     self.feed_works_sink(html.find('#gsc_a_t tbody', first=True), works_sink)
     # A full window suggests more rows exist; request the next page.
     while len(values) >= window[1]:
         window = [window[1], window[1]+60]
         # NOTE: `json` is first a Response, then rebound to its decoded dict.
         json = self.session.post(url + f'&cstart={window[0]}&pagesize'
                                  + f'={window[1]}', data='json=1')
         json = json.json()
         # 'B' field carries the next batch of table rows; parsed as HTML below.
         payload = json['B'].strip()
         if payload != '':
             trs = requests_html.HTML(html=payload)
             values += _get_html_ints(trs, TRS_SELECTOR, empty=0)
             self.feed_works_sink(trs, works_sink)
     return {'count': len(values), 'citation-counts': values}
Exemplo n.º 8
0
    def iter_dir(dir_path):
        nonlocal c
        for pathname in os.listdir(dir_path):
            abs_path = os.path.join(dir_path, pathname)
            if pathname.endswith('html'):
                rel_path = os.path.relpath(abs_path, SAVE_DIR)
                with open(abs_path, 'r') as f:
                    html = requests_html.HTML(html=f.read())
                    title = html.find('title', first=True).text
                    insert_table(c, title, "Guide", rel_path)
                    print("Add guide: ", title, rel_path)

                    anchors = html.find('h2')
                    for i in anchors:
                        name = i.text[2:]
                        p = rel_path + '#' + urllib.parse.quote_plus(i.attrs['id'])
                        print("Add anchor: ", name, p)
                        insert_table(c, name, 'Section', p)

            elif os.path.isdir(abs_path):
                iter_dir(abs_path)
Exemplo n.º 9
0
def process(process_data: str) -> Dict:
    """
    Parse a court-process HTML page into its three sections: general data,
    parties, and movements.  Returns the (empty) skeleton dict unchanged when
    the page reports no information for the given parameters.
    """
    page = requests_html.HTML(html=process_data)
    result = {
        'Dados do processo': {},
        'Partes do processo': [],
        'Movimentações': []
    }
    not_found = ('Não existem informações disponíveis para os '
                 'parâmetros informados')
    if not_found in page.text:
        return result

    # Second secaoFormBody table holds the general process data.
    general_table = page.xpath(
        "//table[contains(@class, 'secaoFormBody')]")[1]
    result['Dados do processo'] = general_data(general_table)

    parts_table = page.find(
        '#tableTodasPartes,#tablePartesPrincipais', first=True)
    result['Partes do processo'] = parts(parts_table)

    movements_table = page.find('#tabelaUltimasMovimentacoes', first=True)
    result['Movimentações'] = movements(movements_table)
    return result
Exemplo n.º 10
0
def get_hot_boards():
    """
    Scrape the PTT board index and return a list of dicts, one per board,
    with keys "name", "nuser", "class", and "title".
    """
    url = "https://www.ptt.cc/bbs/index.html"
    resp = fetch(url)
    html = requests_html.HTML(html=resp.text)

    def _text_of(board, selector):
        # First matching element's text within one board entry.
        return board.find(selector, first=True).text

    return [
        {
            "name": _text_of(board, 'div.board-name'),
            "nuser": _text_of(board, 'div.board-nuser'),
            "class": _text_of(board, 'div.board-class'),
            "title": _text_of(board, 'div.board-title'),
        }
        for board in html.find('div.b-ent')
    ]
Exemplo n.º 11
0
async def job(url, find_queue, visited, semaphone):
    """
    Download one documentation page, wrap its article body in a standalone
    HTML shell, save it, enqueue newly discovered links, and release the
    semaphore when done.
    """
    raw = await get(url)
    print('Downloaded: ', url)
    page = requests_html.HTML(url=url, html=raw)
    markdown = page.find('article.markdown-body', first=True).html
    # Strip the site's suffix from the page title.
    title = page.find('title', first=True).text.replace(' - 为企业级框架和应用而生', '')
    await save(
        url, f"""
    <html>
        <head>
            <title>{ title }</title>
            <meta charset="UTF-8">
            <link rel="stylesheet" href="{ '../' if url[len(BASE_URL):].find('/') > 0 else ''  }index.css">
        </head>
        <body>
        { markdown }
        </body>
    </html>""")
    # Mark and enqueue only links not seen before.
    for link in filter_url(page.absolute_links, visited):
        visited.add(link)
        find_queue.put(link)
    semaphone.release()
Exemplo n.º 12
0
    def parse_c1_html(content):
        """
        Parse KPU C1 HTML page
        Arguments:
            content: HTML string
        Output:
            administrative_type: Type of Administrative Division, i.e. 'Provinsi', 'Kabupaten/Kota', 'Kecamatan', 'Kelurahan/Desa'
            administratives: [{
               'name',
               'id',
               'parent_id',
               'url',
            }, ...]

        Returns (None, []) for a leaf page (no 'span.label' element, as on a
        Kelurahan/Desa page).  Raises AssertionError on an unexpected
        <select> (note: asserts vanish under `python -O`) and Exception when
        the onChange attribute cannot be parsed.
        """
        html = requests_html.HTML(html=content)

        # On main page: <select class="formfield" name="wilayah_id" onChange="selectCat(this,'0')"><option value="">pilih</option><option  value="1">ACEH</option><option  value="6728">SUMATERA UTARA</option>
        # When selecting Provinsi "SUMATERA UTARA", it will download: https://pilpres2014.kpu.go.id/c1.php?cmd=select&grandparent=0&parent=6728
        # On Provinsi "SUMATERA UTARA": <select class="formfield" name="wilayah_id" onChange="selectCat(this,'6728')"><option value="">pilih</option><option  value="7240">TAPANULI TENGAH</option><option  value="7438">TAPANULI UTARA</option>
        # When selecting Kabupaten/Kota "TAPANULI UTARA", it will download: https://pilpres2014.kpu.go.id/c1.php?cmd=select&grandparent=6728&parent=7438
        # On Kabupaten/Kota "TAPANULI UTARA": <select class="formfield" name="wilayah_id" onChange="selectCat(this,'7438')"><option value="">pilih</option><option  value="7439">TARUTUNG</option><option  value="7668">GAROGA</option>
        # When selecting Kecamatan "TARUTUNG", it will download: https://pilpres2014.kpu.go.id/c1.php?cmd=select&grandparent=7438&parent=7439
        # On Kecamatan "TARUTUNG": <select class="formfield" name="wilayah_id" onChange="selectCat(this,'7439')"><option value="">pilih</option><option  value="7440">PARTALI TORUAN</option><option  value="7457">HUTATORUAN IV</option>
        # When selecting Kelurahan/Desa "PARTALI TORUAN", it will download: https://pilpres2014.kpu.go.id/c1.php?cmd=select&grandparent=7439&parent=7440
        # On Kelurahan/Desa "PARTALI TORUAN": There is no <span> and no <select>

        # Parse: <span class="label">Provinsi :</span>
        html_span_label = html.find(selector='span.label', first=True)
        if html_span_label is None:
            # Leaf page: no division label, hence no subdivisions to list.
            return None, []
        html_span_label_text = html_span_label.text
        # e.g. 'Provinsi :' -> 'Provinsi'
        administrative_type = html_span_label_text.replace(' :', '')

        # Parse: <select class="formfield" name="wilayah_id" onChange="selectCat(this,'6728')">
        html_select = html.find(selector='select.formfield', first=True)
        # Layout sanity checks: the first formfield select must be wilayah_id.
        assert html_select.attrs['name'] == 'wilayah_id'
        assert html_select.attrs['class'] == ('formfield',)
        html_select_onchange = html_select.attrs['onchange']
        # The onChange argument is this division's own id; it becomes the
        # `parent` query parameter of every child URL below.
        matches = re.match(r'^selectCat\(this,\'(\d+)\'\)$', html_select_onchange)
        if not matches:
            raise Exception('Cannot parse onChange="{}"'.format(html_select_onchange))
        parent_id = int(matches.group(1))

        # Parse: <option  value="1">ACEH</option><option  value="6728">SUMATERA UTARA</option>
        administratives = []
        html_select_options = html_select.find(selector='option')
        for html_select_option in html_select_options:
            if html_select_option.attrs['value'] == '':
                # Ignore 'pilih'
                continue
            option_value = int(html_select_option.attrs['value'])
            option_name = html_select_option.text
            url = 'https://pilpres2014.kpu.go.id/c1.php?cmd=select&grandparent={}&parent={}'.format(parent_id,
                                                                                                    option_value)
            administrative = {
                'name': option_name,
                'id': option_value,
                'parent_id': parent_id,
                'url': url
            }
            administratives.append(administrative)
        return administrative_type, administratives
Exemplo n.º 13
0
 def get_table(self):
     """Parse the browser's current page and return the #ip_list table element."""
     page = HTML.HTML(html=self.__browser.page_source)
     return page.find('#ip_list', first=True)
Exemplo n.º 14
0
import zipfile
import pyquery
import re
import json
import os
import pandas as pd
import requests_html
# Extract words, pronunciations and part-of-speech info from dictionary HTML
# pages stored in us_words.zip.
items = []
with zipfile.ZipFile('us_words.zip', 'r') as zf:
    for name in zf.namelist()[:]:
        html = zf.open(name, 'r').read().decode('utf-8')
        # Rewrite <b class="b">…</b> runs as `backtick` markers before parsing.
        html = re.sub(r'<b\s+class="b".*?>(.+?)</b>',
                      r'`\1`',
                      html,
                      flags=re.DOTALL)
        doc = requests_html.HTML(html=html)
        #doc = pyquery.PyQuery(html)
        #print(repr(doc.find('.head h1')))
        # Headword: first <h1> in the .head block.
        key = word = doc.find('.head h1')[0].text

        # All pronunciations joined with spaces.
        pron = ' '.join(list(x.text for x in doc.find('.head .pron')))
        #assert '!!' not in pron, word
        #print(word)
        #item = {'word':word}
        print(word)
        for posblock in doc.find('.posblock'):
            poses = [x.text for x in posblock.find('.posblock > .posgram')
                     ]  # phrasal verb
            # Dedupe while keeping order, then collapse whitespace.
            pos = ' '.join({x: 1 for x in poses}.keys())
            pos = re.sub(r"\s+", ' ', pos)
            # NOTE(review): the snippet appears truncated here — `pos`, `key`
            # and `pron` are computed but unused in the visible code.
Exemplo n.º 15
0
 def _update_html(self, qt_webpage_to_html: str):
     """
     Store the HTML rendered by Qt, rebuild the requests_html parser (and the
     find/xpath shortcuts bound from it), then stop the Qt event loop.
     """
     self.html_str = qt_webpage_to_html
     parser = requests_html.HTML(url=self.url, html=self.html_str)
     self.html_parser = parser
     self.find = parser.find
     self.xpath = parser.xpath
     self._qt_app.quit()
Exemplo n.º 16
0
def eval_js(script):
    """
    Evaluate *script* in a headless Chromium page via requests_html's
    render() and return the script's result.

    Fix: ``requests_html.HTML()`` with no arguments raises TypeError because
    ``html`` is a required keyword-only parameter of HTML.__init__; an empty
    document is supplied so the script runs against a blank page.
    """
    return requests_html.HTML(html='<html></html>').render(script=script,
                                                           reload=False)
Exemplo n.º 17
0
 def get_tbody(self):
     """Parse the browser's current page and return the table's second tbody child."""
     page = HTML.HTML(html=self.__browser.page_source)
     return page.find('.table > tbody:nth-child(2)', first=True)