import re

from requests_html import HTML


def Reflected(res, value, tryRender=True):
    """Return (found, details) for reflections of `value` in the response body."""
    if len(res.text) > 0:
        hl = HTML(html=res.text)
        if tryRender:
            try:
                hl.render(timeout=0.5)
            except Exception:
                # Fall back to the unrendered HTML if Chromium fails or times out.
                pass
        # Capture up to 15 characters of context on each side of the reflection.
        matches = re.findall(".{1,15}" + re.escape(value) + ".{1,15}", hl.html, re.DOTALL)
        ret = {
            "type": "match",
            "count": len(matches),
            "matches": list(set(matches)),
            "rendered": hl.html,
        }
        return (len(matches) > 0), ret
    else:
        return False, {"type": "match", "matches": [], "count": 0, "rendered": ""}
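# A minimal usage sketch for Reflected(), assuming you already hold a
# requests.Response; the URL and probe string below are hypothetical.
import requests

res = requests.get("https://example.com/search?q=PROBE123")
found, details = Reflected(res, "PROBE123", tryRender=False)
if found:
    print("reflected {} time(s):".format(details["count"]), details["matches"])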
import requests
from parsel import Selector  # scrapy.selector.Selector exposes the same API
from requests_html import HTML

# `all_types` (proxy-type -> form value) and `getIndexUa` (User-Agent picker)
# are assumed to be defined elsewhere in the module.


def getByCountry(country, proxy_type='ALL'):
    proxy_type = proxy_type.upper()
    if proxy_type not in all_types:
        return None
    headers = {'User-Agent': getIndexUa(10)}
    URL = 'http://spys.one/free-proxy-list/{}/'.format(country.upper())
    # Fetch the hidden anti-bot token that the site expects back in the POST.
    token_data = requests.get(URL, headers=headers)
    token_selc = Selector(text=token_data.text)
    token = token_selc.xpath(
        "//input[@type='hidden' and @name='xx0']/@value").extract_first()
    data = {
        'xx0': token,
        'xpp': 5,
        'xf1': 0,
        'xf2': 0,
        'xf4': 0,
        'xf5': all_types[proxy_type],
    }
    headers = {
        'User-Agent': getIndexUa(10),
        'Host': 'spys.one',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'Origin': 'http://spys.one',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': ('text/html,application/xhtml+xml,application/xml;q=0.9,'
                   'image/webp,image/apng,*/*;q=0.8,'
                   'application/signed-exchange;v=b3;q=0.9'),
        'Referer': 'http://spys.one/free-proxy-list/PE/',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'es-ES,es;q=0.9',
        'Connection': 'close',
    }
    r = requests.post(URL, headers=headers, data=data)
    proxies = []
    if r.ok:
        html = HTML(html=r.text)
        html.render()  # the port column is generated by JavaScript, hence the render
        page = Selector(text=html.html)
        for tr in page.css('tr'):
            tds = tr.css('td')
            if not tds:
                continue  # skip rows without cells
            ip = tds[0].css('font::text').extract()
            if len(ip) == 3:
                proxies.append('http://{}:{}'.format(ip[0], ip[2]))
        return proxies
    else:
        return []
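# Hypothetical call, assuming `all_types` contains an 'ALL' entry; render()
# shells out to headless Chromium, so the first run downloads a browser.
if __name__ == '__main__':
    proxies = getByCountry('PE')  # two-letter country code
    print(len(proxies or []), 'proxies found')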
def render_html_page(self, page_content):
    """Render an HTML page and return its text.

    :param page_content: content of the page to render
    """
    try:
        from requests_html import HTML

        html = HTML(html=page_content)
        html.render(reload=False)
        return html.text
    except Exception:
        # `logging` is assumed to be imported at module level.
        logging.error('Cannot render the page', exc_info=True)
        return page_content
def getPage(self, url):
    """Fetch a page, optionally executing its JavaScript before returning the markup."""
    url = self.getFullURL(url)
    self._updateBaseURL(url)
    print("getting", url)
    text = requests.get(url, headers={"User-agent": self.userAgent}).text
    print("got page from URL", url, "with length", len(text))
    if self.runJavascript:
        print("\trunning javascript...")
        html = HTML(html=text)
        html.render(timeout=self.timeout)
        text = html.html
        print("\tdone.")
    return text
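# Sketch of driving getPage(), assuming the enclosing class (hypothetical
# name `Scraper`) sets the attributes the method reads:
#
#   scraper = Scraper(userAgent="my-bot/1.0", runJavascript=True, timeout=10)
#   markup = scraper.getPage("/index.html")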
def test_bare_js_eval():
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """

    html = HTML(html=doc)
    html.render()

    assert html.find('#replace', first=True).text == 'yolo'
import re

from requests_html import HTML


def FindExecJs(res, code, attributes, tryRender=True):
    """Look for `code` executed from <script> bodies or from JS-bearing attributes."""
    if len(res.text) > 0:
        hl = HTML(html=res.text)
        attributes = attributes if isinstance(attributes, list) else [attributes]
        if tryRender:
            try:
                hl.render(timeout=0.5)
            except Exception:
                # Keep the unrendered HTML if rendering fails.
                pass
        # Match `code` at the start of a statement or expression.
        expr = re.compile(r"""(^|";\s*|';\s*|=\s*|\+\s*|\(\s*|\{\s*)""" + re.escape(code))
        scripts = [
            "script: " + el.text
            for el in hl.find("script")
            if expr.search(el.text) is not None
        ]
        attrs = []
        for a in attributes:
            prfxs = ["javascript:"]
            if a.startswith("on"):
                prfxs.append("")
            if a == "src":
                prfxs.append("data:text/javascript,")
            for pfx in prfxs:
                attrs += [
                    "attr: " + el.attrs[a]
                    for el in hl.find("[{0}^='{1}']".format(a, pfx + code))
                ]
        ret = {
            "type": "match",
            "matches": list(set(scripts)) + list(set(attrs)),
            "count": len(scripts) + len(attrs),
            "rendered": hl.html,
        }
        return (len(scripts) + len(attrs) > 0), ret
    else:
        return False, {"type": "match", "matches": [], "count": 0, "rendered": ""}
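# Usage sketch mirroring Reflected() above; the target URL and payload are
# hypothetical, and tryRender is disabled to avoid the Chromium dependency here.
import requests

res = requests.get("https://example.com/page?cb=alert(1)")
hit, info = FindExecJs(res, "alert(1)", ["onclick", "href", "src"], tryRender=False)
print(hit, info["count"])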
from requests_html import HTML


def crawl_js(js_text):
    """Render `js_text` in headless Chromium and return the resulting markup."""
    # The injected script returns the viewport metrics; render() hands that
    # value back in `val`, while html.html is updated with the rendered DOM.
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    html = HTML(html=js_text)
    val = html.render(script=script, reload=False)
    return html.html
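# Quick check with an inline document; no network is needed, but the first
# render() call downloads Chromium via pyppeteer.
print(crawl_js("<html><body><p>hello</p></body></html>"))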
def test_bare_render():
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = html.render(script=script, reload=False)
    for value in ('width', 'height', 'deviceScaleFactor'):
        assert value in val

    assert html.find('html')
    assert 'https://httpbin.org' in html.links
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests_html import HTML
from urllib3.util.retry import Retry


def get_manoto_video():
    try:
        # Live-stream page on manototv.com.
        base_url = "https://www.manototv.com/live"
        session = requests.Session()
        # Retry transient 5xx errors with a short backoff.
        retries = Retry(total=5, backoff_factor=0.1,
                        status_forcelist=[500, 502, 503, 504])
        session.mount('http://', HTTPAdapter(max_retries=retries))
        session.mount('https://', HTTPAdapter(max_retries=retries))
        response = session.get(base_url, headers={
            'user-agent': 'my-app',
            'referer': 'https://www.manototv.com/',
            'origin': 'https://www.manototv.com/',
            'cache-control': 'no-cache',
            'Content-Type': 'application/json'
        })
        # Throw an exception if the request does not return 2xx.
        response.raise_for_status()
        content = response.content

        soup = BeautifulSoup(content, features="html.parser")
        html = HTML(html=content, url=base_url)
        # source = 'http:' + soup.find_all("source")[0]['src']
        # render() executes the page's JavaScript; with no `script` argument it
        # returns None, and the rendered DOM is left in html.html.
        return html.render()
    except requests.exceptions.HTTPError as e:
        return "HTTP Error: " + str(e)
    except requests.exceptions.ConnectionError as e:
        return "Connection Error: " + str(e)
    except requests.exceptions.Timeout as e:
        return "Timeout Error: " + str(e)
    except requests.exceptions.RequestException as e:
        return "Whoops! Something went wrong: " + str(e)
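# Smoke test; note the function returns render()'s result (None on success),
# while any extracted stream URL would come from the commented-out line above.
if __name__ == '__main__':
    print(get_manoto_video())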
from requests_html import HTML, HTMLSession
import csv

with open('simple.html') as html_file:
    source = html_file.read()

html = HTML(html=source)
html.render()

match = html.find('#footer', first=True)
print(match.html)

# print(html.full_text)

# articles = html.find('div.article')
# for article in articles:
#     headline = article.find('h2', first=True).text
#     summary = article.find('p', first=True).text
#     print(headline)
#     print(summary)

# csv_file = open('scraped.csv', 'w')
# csv_writer = csv.writer(csv_file)
# csv_writer.writerow(['HEADLINE', 'SUMMARY', 'VIDEO'])

# session = HTMLSession()
# response = session.get('https://coreyms.com/')
# html = response.html
# articles = html.find('article')
# for article in articles:
from requests_html import HTML

doc = """<a href='https://httpbin.org'>"""
html = HTML(html=doc)

script = """
    () => {
        return {
            width: document.documentElement.clientWidth,
            height: document.documentElement.clientHeight,
            deviceScaleFactor: window.devicePixelRatio,
        }
    }
"""

val = html.render(script=script, reload=False)
print(html.html)
print(val)
from requests_html import HTMLSession, HTML

# `session` is assumed by this snippet; a plain HTMLSession works for the fetch.
session = HTMLSession()

base_url = 'https://economictimes.indiatimes.com/archive/year-2001,month-1.cms'
r = session.get(base_url)
html1 = HTML(html=r.content)

# Element counts before JavaScript rendering.
print('before', len(html1.find('a')))
print('before', len(html1.find('td')))
divs = html1.find('div')
links = html1.find('a')
urls = html1.absolute_links
td = html1.find('td')
tr = html1.find('#calenderdiv')

html1.render()

# Element counts after rendering, for comparison.
print('after', len(html1.find('a')))
new_divs = html1.find('div')
new_links = html1.find('a')
new_urls = html1.absolute_links
new_td = html1.find('td')
new_tr = html1.find('#calenderdiv')

print(len(divs), len(new_divs), len(links), len(new_links),
      len(urls), len(new_urls), len(tr), len(new_tr))
new_td = html1.find('td')
print('after', len(html1.find('td')))

# j = 0
date_link = {}
# (The opening of this snippet is cut off; the JavaScript below is presumably
# captured into a string such as `decoded_html`, which the commented-out line
# underneath refers to. `html` is assumed to be an existing requests_html.HTML
# object created earlier in the original file.)
decoded_html = """
function escramble_758(){
  var a,b,c
  a='+1 '
  b='84-'
  a+='425-'
  b+='7450'
  c='9'
  return a+c+b;
}
"""

# jsInput = jsInput + "let content = `" + str(decoded_html) + "`;"
jsInput = "function Test() { return packer.Pack('function DoPack()', true, true );}"
val = html.render(script=jsInput, reload=False)
print(val)

jsInput = jsInput + """
function Processing(input) {
    return input.replace(/e/g, '');
}

function DoPack() {
    //let output = packer.Pack(content, true, true);
    let output = Processing(content, true, true);
    return output;
}
"""

# ctx = py_mini_racer.MiniRacer()
# ctx.eval(jsInput)
# (Snippet truncated above; `allarticleclasses` is assumed to come from a call
# like htmlcode.find("div.article") on the rendered page.)
# print: [<Element 'div' class=('article',)>, <Element 'div' class=('article',)>]
for eachallarticleclasses in allarticleclasses:
    print(eachallarticleclasses)
    # print: <Element 'div' class=('article',)>

    # articleclassheadline = eachallarticleclasses.find("h2")
    # print(articleclassheadline.text)
    # print: AttributeError: 'list' object has no attribute 'text'
    articleclassheadline = eachallarticleclasses.find("h2", first=True)
    print(articleclassheadline.text)
    # print: Article 1 Headline

    # articleclasssummary = eachallarticleclasses.find("p").text
    # print(articleclasssummary)
    # print: AttributeError: 'list' object has no attribute 'text'
    articleclasssummary = eachallarticleclasses.find("p", first=True).text
    print(articleclasssummary)
    # print: This is a summary of article 1

# Display HTML generated by JavaScript code.
with open("simple.html") as htmlfile:
    source = htmlfile.read()

htmlcode = HTML(html=source)
htmlcode.render()
match = htmlcode.find("#footer", first=True)
print(match.html)

'''
Before htmlcode.render():
<div id="footer">
<p>Footer Information</p>
</div>
'''

'''
After htmlcode.render():
<div id="footer">
<p>Footer Information</p>
<p>This is text generated by JavaScript.</p></div>
'''