def __getUrlsInPage(url, url_xpath, next_xpath):
    """Download ``url`` and return ``(result, nextpage)``.

    ``result`` is whatever evaluating ``url_xpath`` against the parsed page
    yields. ``nextpage`` is derived from ``url`` itself rather than from the
    page content; ``next_xpath`` is accepted but not used by this
    implementation.

    Next-page rule, applied to the ``&``-separated pieces of the URL's query
    string:

    * a single piece: ``url + "&p=2"``;
    * otherwise: the value of the second piece is read as an integer,
      incremented, and appended as ``&p=<n>`` to the part of ``url`` that
      precedes the first ``&``.

    If anything in that computation raises, ``nextpage`` is ``None``.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    }
    # The request follows redirects and skips certificate verification.
    response = requests.get(url, headers=request_headers, allow_redirects=True, verify=False)
    tree = etree.HTML(remove_control_characters(response.text))
    result = tree.xpath(url_xpath)

    try:
        query_parts = urlparse.urlparse(url).query.split('&')
        if len(query_parts) == 1:
            nextpage = url + "&p=2"
        else:
            # The second query component is taken to be the page counter.
            following_page = int(query_parts[1].split('=')[1]) + 1
            nextpage = url.split('&')[0] + "&p=" + str(following_page)
    except Exception:
        # Best effort: a URL that cannot be paged means there is no next page.
        nextpage = None

    return result, nextpage
def __getUrlsInPage(self, url, wantedXpath, nextUrlXpath):
    """Download ``url`` and return ``(result, nextpage)``.

    ``result`` is whatever evaluating ``wantedXpath`` against the parsed page
    yields. ``nextpage`` is the first item produced by ``nextUrlXpath``, or
    ``None`` when that lookup raises (for example because nothing matched).
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    }
    # The request follows redirects and skips certificate verification.
    response = requests.get(url, headers=request_headers, allow_redirects=True, verify=False)
    tree = etree.HTML(remove_control_characters(response.text))
    result = tree.xpath(wantedXpath)

    try:
        next_matches = tree.xpath(nextUrlXpath)
        nextpage = next_matches[0]
    except Exception:
        # Best effort: no usable match means there is no next page.
        nextpage = None

    return result, nextpage
def __getUrlsInPage(url, url_xpath, next_xpath):
    """Render ``url`` via ``getpagewithchrome`` and return ``(result, nextpage)``.

    The page source is obtained with ``getpagewithchrome(url,
    CHROME_DRIVER_PATH)``, cleaned with ``remove_control_characters`` and
    parsed with ``etree.HTML``. ``result`` holds the ``url_xpath`` matches,
    where any entry containing ``'ip:'`` is reduced to the text following the
    last ``'ip:'``. ``nextpage`` is the first item produced by ``next_xpath``,
    or ``None`` when that lookup raises (for example because nothing matched).
    """
    page_source = getpagewithchrome(url, CHROME_DRIVER_PATH)
    tree = etree.HTML(remove_control_characters(page_source))
    matches = tree.xpath(url_xpath)

    try:
        nextpage = tree.xpath(next_xpath)[0]
    except Exception:
        # Best effort: no usable match means there is no next page.
        nextpage = None

    # NOTE(review): the original formatting was lost, so it is unclear whether
    # entries WITHOUT 'ip:' were kept or dropped. This keeps them unchanged --
    # confirm against the intended behavior.
    result = [
        entry.split('ip:')[-1] if 'ip:' in entry else entry
        for entry in matches
    ]
    return result, nextpage