Example #1
0
File: html.py  Project: bdunford/waabi
    def Reflected(res, value, tryRender=True):
        """Check whether *value* is reflected in a response body.

        Args:
            res: response-like object exposing a ``.text`` attribute.
            value: string to search for in the (optionally rendered) HTML.
            tryRender: when True, attempt a JS render first (best-effort).

        Returns:
            Tuple ``(found, details)`` where *found* is True when at least
            one occurrence exists and *details* carries the match contexts
            and the rendered HTML.
        """
        if not res.text:
            # Empty body: nothing to scan, report an empty result set.
            return (False, {
                "type": "match",
                "matches": [],
                "count": 0,
                "rendered": ""
            })

        hl = HTML(html=res.text)
        if tryRender:
            # Best-effort JavaScript render; a slow or failed render is
            # not fatal — we fall back to the un-rendered markup.
            try:
                hl.render(timeout=0.5)
            except Exception:
                pass

        # Capture up to 15 characters of context on either side of each hit.
        matches = re.findall(".{1,15}" + re.escape(value) + ".{1,15}",
                             hl.html, re.DOTALL)

        ret = {
            "type": "match",
            "count": len(matches),
            "matches": list(set(matches)),
            "rendered": hl.html
        }
        return ((len(matches) > 0), ret)
示例#2
0
def getByCountry(country, proxy_type='ALL'):
    """Scrape spys.one for proxies located in *country*.

    Returns a list of ``http://ip:port`` strings, ``[]`` when the POST
    fails, or ``None`` for an unrecognized *proxy_type*.
    """
    proxy_type = proxy_type.upper()
    if proxy_type not in all_types:
        return None

    URL = 'http://spys.one/free-proxy-list/{}/'.format(country.upper())

    # First request: pull the hidden anti-bot token off the listing page.
    token_page = requests.get(URL, headers={'User-Agent': getIndexUa(10)})
    token = Selector(text=token_page.text).xpath(
        "//input[@type='hidden'and@name='xx0']/@value").extract_first()

    form = {
        'xx0': token,
        'xpp': 5,
        'xf1': 0,
        'xf2': 0,
        'xf4': 0,
        'xf5': all_types[proxy_type],
    }
    post_headers = {
        'User-Agent': getIndexUa(10),
        'Host': 'spys.one',
        'Content-Length': '66',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'Origin': 'http://spys.one',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'image/webp,image/apng,*/*;q=0.8,'
                   'application/signed-exchange;v=b3;q=0.9'),
        'Referer': 'http://spys.one/free-proxy-list/PE/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Connection': 'close',
    }

    response = requests.post(URL, headers=post_headers, data=form)
    if not response.ok:
        return []

    # Render the page JavaScript, then walk the result rows.
    rendered = HTML(html=response.text)
    rendered.render()
    rows = Selector(text=rendered.html).css('tr')

    proxies = []
    for row in rows:
        cells = row.css("td")
        parts = cells[0].css('font::text').extract()
        # A proxy row yields exactly three text pieces: ip, separator, port.
        if len(parts) == 3:
            proxies.append('http://{}:{}'.format(parts[0], parts[2]))
    return proxies
示例#3
0
 def render_html_page(self, page_content):
     """Render *page_content* with requests-html and return its text.

     On any rendering failure the error is logged with a traceback and
     the original *page_content* is returned unchanged.
     """
     try:
         html = HTML(html=page_content)
         html.render(reload=False)
         return html.text
     except Exception:
         # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
         # are no longer swallowed; render failures still fall back.
         logging.error('Cannot render the page', exc_info=True)
         return page_content
示例#4
0
 def getPage(self, url):
     """Fetch *url* (resolved to a full URL first) and return its HTML,
     executing page JavaScript when ``self.runJavascript`` is enabled."""
     url = self.getFullURL(url)
     self._updateBaseURL(url)
     print("getting", url)
     response = requests.get(url, headers={"User-agent": self.userAgent})
     text = response.text
     print("got page from URL", url, "with length", len(text))
     if self.runJavascript:
         print("\trunning javascript...")
         rendered = HTML(html=text)
         rendered.render(timeout=self.timeout)
         text = rendered.html
         print("\tdone.")
     return text
示例#5
0
 def render_html_page(self, page_content):
     """Render *page_content* with requests-html and return its text.

     Falls back to returning *page_content* unchanged when the import or
     the render fails; failures are logged with a traceback, not raised.

     :param page_content: HTML content of the page to render.
     """
     try:
         from requests_html import HTML
         html = HTML(html=page_content)
         html.render(reload=False)
         return html.text
     except Exception:
         # Narrowed from a bare `except:` to stop swallowing
         # SystemExit/KeyboardInterrupt while keeping the safe fallback.
         logging.error('Cannot render the page', exc_info=True)
         return page_content
def test_bare_js_eval():
    """Rendering must let the inline <script> rewrite #replace's text."""
    markup = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """

    page = HTML(html=markup)
    page.render()

    target = page.find('#replace', first=True)
    assert target.text == 'yolo'
示例#7
0
def test_bare_js_eval():
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """

    html = HTML(html=doc)
    html.render()

    assert html.find('#replace', first=True).text == 'yolo'
示例#8
0
File: html.py  Project: bdunford/waabi
    def FindExecJs(res, code, attributes, tryRender=True):
        """Search a response for executable occurrences of *code*.

        Looks for *code* inside <script> bodies (preceded by a token
        boundary such as ``=``, ``+``, ``(``, ``{`` or a quoted ``;``)
        and inside the given element *attributes* (event handlers,
        ``javascript:`` URLs, ``data:`` script URLs for ``src``).

        Args:
            res: response-like object exposing a ``.text`` attribute.
            code: JavaScript snippet to look for.
            attributes: attribute name or list of attribute names to scan.
            tryRender: when True, attempt a JS render first (best-effort).

        Returns:
            Tuple ``(found, details)``.
        """
        if not res.text:
            # Fixed: include "type" so both branches share one schema
            # (matches the success path below and Reflected's shape).
            return (False, {
                "type": "match",
                "matches": [],
                "count": 0,
                "rendered": ""
            })

        hl = HTML(html=res.text)
        attributes = attributes if isinstance(attributes,
                                              list) else [attributes]

        if tryRender:
            # Best-effort JavaScript render; failure is not fatal.
            try:
                hl.render(timeout=0.5)
            except Exception:
                pass

        # Raw string fixes the invalid "\;" escape sequences the non-raw
        # pattern produced (a SyntaxWarning on Python 3.12+); the regex
        # itself matches the same text as before.
        expr = re.compile(r"(^|\"\;\s*|\'\;\s*|\=\s*|\+\s*|\(\s*|\{\s*)" +
                          re.escape(code))
        scripts = [
            "script: " + el.text
            for el in hl.find("script")
            if expr.search(el.text) is not None
        ]

        attrs = []
        for a in attributes:
            prfxs = ["javascript:"]
            if a[0:2] == "on":
                # on* event-handler attributes execute bare JS directly.
                prfxs.append("")
            if a == "src":
                prfxs.append("data:text/javascript,")
            for pfx in prfxs:
                attrs += [
                    "attr: " + el.attrs[a]
                    for el in hl.find("[{0}^='{1}']".format(a, pfx + code))
                ]

        ret = {
            "type": "match",
            "matches": list(set(scripts)) + list(set(attrs)),
            "count": len(scripts) + len(attrs),
            "rendered": hl.html
        }
        return ((len(scripts) + len(attrs) > 0), ret)
示例#9
0
def crawl_js(js_text):
    """Render *js_text* with requests-html and return the resulting markup.

    The viewport-probing script is passed to ``render`` for execution;
    its return value is intentionally discarded — rendering is done only
    for its side effect of running the page's JavaScript.
    """
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    html = HTML(html=js_text)
    # Fixed: the render result was assigned to an unused local `val`.
    html.render(script=script, reload=False)
    return html.html
示例#10
0
def test_bare_render():
    """render(script=...) should evaluate the probe and expose links."""
    doc = """<a href='https://httpbin.org'>"""
    page = HTML(html=doc)
    probe = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    result = page.render(script=probe, reload=False)
    for key in ('width', 'height', 'deviceScaleFactor'):
        assert key in result

    assert page.find('html')
    assert 'https://httpbin.org' in page.links
def test_bare_render():
    """A bare render must run the probe script and populate the links."""
    doc = """<a href='https://httpbin.org'>"""
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    page = HTML(html=doc)
    val = page.render(script=script, reload=False)
    assert all(k in val for k in ('width', 'height', 'deviceScaleFactor'))

    assert page.find('html')
    assert 'https://httpbin.org' in page.links
示例#12
0
def get_manoto_video():
    """Fetch the Manoto live page and return its rendered HTML.

    Retries transient 5xx responses up to five times with a short
    backoff; on any requests error a human-readable error string is
    returned instead of raising.
    """
    try:
        # base url of all channels in telewebion
        base_url = "https://www.manototv.com/live"

        session = requests.Session()

        # Retry transient server errors before giving up.
        retries = Retry(total=5,
                        backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])

        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))

        response = session.get(base_url,
                               headers={
                                   'user-agent': 'my-app',
                                   'referer': 'https://www.manototv.com/',
                                   'origin': 'https://www.manototv.com/',
                                   'cache-control': 'no-cache',
                                   'Content-Type': 'application/json'
                               })

        # throw exception if request does not return 2xx
        response.raise_for_status()

        # Fixed: removed the unused BeautifulSoup parse of the body; the
        # commented-out source lookup it served is preserved below.
        # source = 'http:' + soup.find_all("source")[0]['src']
        html = HTML(html=response.content, url=base_url)
        return html.render()

    except requests.exceptions.HTTPError as e:
        return "HTTP Error: " + str(e)
    except requests.exceptions.ConnectionError as e:
        return "Connection Error: " + str(e)
    except requests.exceptions.Timeout as e:
        return "Timeout Error: " + str(e)
    except requests.exceptions.RequestException as e:
        return "Whoops! Something went wrong: " + str(e)
示例#13
0
from requests_html import HTML, HTMLSession
import csv

# Read the sample page, then render it so embedded JavaScript executes.
with open('simple.html') as html_file:
    source = html_file.read()

html = HTML(html=source)
html.render()

# Query the footer only after the render has mutated the DOM.
match = html.find('#footer', first=True)
print(match.html)

# print(html.full_text)
# articles = html.find('div.article')
# for article in articles:
#     headline = article.find('h2', first=True).text
#     summary = article.find('p', first=True).text

#     print(headline)
#     print(summary)

# csv_file = open('scraped.csv','w')
# csv_writer = csv.writer(csv_file)
# csv_writer.writerow(['HEADLINE', 'SUMMARY', 'VIDEO'])

# session = HTMLSession()
# response = session.get('https://coreyms.com/')
# html = response.html

# articles = html.find('article')

# for article in articles:
示例#14
0
from requests_html import HTMLSession, HTML
# Probe the rendered document's dimensions via an injected JS function.
doc = """<a href='https://httpbin.org'>"""
script = """
    () => {
        return {
            width: document.documentElement.clientWidth,
            height: document.documentElement.clientHeight,
            deviceScaleFactor: window.devicePixelRatio,
        }
    }
"""
html = HTML(html=doc)
val = html.render(script=script, reload=False)

print(html.html)
print(val)
示例#15
0
# NOTE(review): `session` is defined outside this fragment — presumably a
# requests/HTMLSession instance; confirm against the full script.
base_url = 'https://economictimes.indiatimes.com/archive/year-2001,month-1.cms'

r = session.get(base_url)

# Parse the raw (pre-JavaScript) response body.
html1 = HTML(html=r.content)

# Element counts before rendering, for comparison with the post-render ones.
print('before', len(html1.find('a')))
print('before', len(html1.find('td')))
divs = html1.find('div')
links = html1.find('a')
urls = html1.absolute_links
td = html1.find('td')
tr = html1.find('#calenderdiv')

# Execute the page's JavaScript; subsequent queries see the mutated DOM.
html1.render()

print('after', len(html1.find('a')))
new_divs = html1.find('div')
new_links = html1.find('a')
new_urls = html1.absolute_links
new_td = html1.find('td')
new_tr = html1.find('#calenderdiv')

# Side-by-side before/after counts.
print(len(divs), len(new_divs), len(links), len(new_links), len(urls), len(new_urls),len(tr),len(new_tr))

new_td = html1.find('td')
print('after', len(html1.find('td')))

# j=0
date_link = {}
function escramble_758(){
    var a,b,c
    a='+1 '
    b='84-'
    a+='425-'
    b+='7450'
    c='9'
    return a+c+b;
}
"""

#jsInput = jsInput + "let content = `" + str(decoded_html) + "`;"

jsInput = "function Test() { return packer.Pack('function DoPack()', true, true );}"

val = html.render(script=jsInput, reload=False)
print(val)

jsInput = jsInput + """
function Processing(input) {
    return input.replace(/e/g, '');
}
function DoPack() {
    //let output = packer.Pack(content, true, true);
    let output = Processing(content, true, true);
    return output;
}
"""

#ctx = py_mini_racer.MiniRacer()
#ctx.eval( jsInput)
)  #print [<Element 'div' class=('article',)>, <Element 'div' class=('article',)>]
# NOTE(review): `allarticleclasses` comes from a truncated part of this
# example — presumably an `html.find("div.article")` result; confirm upstream.
for eachallarticleclasses in allarticleclasses:
    print(eachallarticleclasses)  #print <Element 'div' class=('article',)>
    #articleclassheadline = eachallarticleclasses.find("h2")
    #print(articleclassheadline.text) #print AttributeError: 'list' object has no attribute 'text'
    # first=True returns a single Element (not a list), so .text is valid.
    articleclassheadline = eachallarticleclasses.find("h2", first=True)
    print(articleclassheadline.text)  #print Article 1 Headline
    #articleclasssummary = eachallarticleclasses.find("p").text
    #print(articleclasssummary) #print AttributeError: 'list' object has no attribute 'text'
    articleclasssummary = eachallarticleclasses.find("p", first=True).text
    print(articleclasssummary)  #print This is a summary of article 1
#Display HTML from JavaScript code.
# Load the sample page and execute its embedded JavaScript before querying.
with open("simple.html") as htmlfile:
    source = htmlfile.read()

htmlcode = HTML(html=source)
htmlcode.render()

match = htmlcode.find("#footer", first=True)
print(match.html)
'''
Before htmlcode.render()
<div id="footer">
<p>Footer Information</p>
</div>
'''
'''
htmlcode.render()
<div id="footer">
<p>Footer Information</p>
<p>This is text generated by JavaScript.</p></div>
'''