def reverseip(url):
    """Query the yougetsignal reverse-IP service for the host of *url*.

    The host is taken from the network location of *url*; when *url* has no
    scheme (and therefore no netloc) the first path segment is used instead.
    The host is POSTed as ``remoteAddress`` to
    ``http://domains.yougetsignal.com/domains.php`` with a User-Agent taken
    from ``useragents.get()``.

    On ``urllib2.HTTPError`` the status code is written to stderr.

    NOTE(review): the response body is read into a local and nothing is
    returned by the code visible here -- confirm whether callers expect a
    parsed domain list.
    """
    # get only domain name; parse once instead of re-parsing for every test
    parsed = urlparse(url)
    url = parsed.netloc or parsed.path.split("/")[0]

    source = "http://domains.yougetsignal.com/domains.php"
    useragent = useragents.get()
    contenttype = "application/x-www-form-urlencoded; charset=UTF-8"

    # POST method: passing a data payload makes urllib2 send a POST.
    # (The unused build_opener() result from the earlier version was dropped;
    # urlopen() below never went through it.)
    data = urllib.urlencode([('remoteAddress', url), ('key', '')])
    request = urllib2.Request(source, data)
    request.add_header("Content-type", contenttype)
    request.add_header("User-Agent", useragent)

    try:
        result = urllib2.urlopen(request).read()
    except urllib2.HTTPError as e:
        # Same text that "print >> sys.stderr" produced, newline included.
        sys.stderr.write("[{}] HTTP error\n".format(e.code))
def __init_phantom_js_driver():
    """Build and return a PhantomJS WebDriver with a custom user agent.

    The default PhantomJS capabilities are copied and the page user agent is
    set to ``str(get())``.  The driver service logs at debug level to a
    timestamped file under ``/tmp``.
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = str(get())

    timestamp = datetime.now().isoformat()
    # attention: every call writes a new debug-level log file, which can fill
    # the whole /tmp directory over time.
    log_path = '/tmp/phant-' + timestamp + '.log'
    phantom_flags = [
        '--debug=true',
        '--webdriver-loglevel=DEBUG',
        '--local-url-access=false',
    ]

    return webdriver.PhantomJS(
        desired_capabilities=capabilities,
        service_log_path=log_path,
        service_args=phantom_flags,
    )
def getHTML(url, lastURL=False):
    """Open *url* with a 10 second timeout, defaulting to ``http://``.

    The value of ``useragents.get()`` is handed to ``urllib2.Request`` as its
    headers argument.  An ``urllib2.HTTPError`` is ignored.

    NOTE(review): the code visible here neither reads ``lastURL`` nor returns
    the reply -- confirm against the rest of the function.
    """
    has_scheme = url.startswith(("http://", "https://"))
    if not has_scheme:
        url = "http://" + url

    req = urllib2.Request(url, None, useragents.get())

    try:
        reply = urllib2.urlopen(req, timeout=10)
    except urllib2.HTTPError:
        # HTTP errors are deliberately ignored; reporting to stderr is disabled.
        pass
def gethtml(url, lastURL=False):
    """Open *url* with a 10 second timeout, defaulting to ``http://``.

    The value of ``useragents.get()`` is handed to ``urllib2.Request`` as its
    headers argument.  When the server answers with HTTP 500 the error body is
    still read into ``html``; any other ``urllib2.HTTPError`` is ignored.

    NOTE(review): the code visible here neither reads ``lastURL`` nor returns
    ``html`` -- confirm against the rest of the function.
    """
    if not url.startswith(("http://", "https://")):
        url = "http://" + url

    req = urllib2.Request(url, None, useragents.get())
    html = None

    try:
        reply = urllib2.urlopen(req, timeout=10)
    except urllib2.HTTPError as err:
        # read html content anyway for reply with HTTP500
        if err.getcode() == 500:
            html = err.read()
def __init_phantom_js_driver():
    """Build and return a headless Chrome WebDriver.

    Despite the name, a Chrome driver is created; the PhantomJS driver used
    previously is deprecated.  Chrome is started with ``--headless``,
    ``--no-sandbox`` and ``--disable-dev-shm-usage`` and logs to a
    timestamped file under ``/tmp``.

    NOTE(review): the capabilities are still copied from the PhantomJS
    defaults and the user agent is stored under the
    ``phantomjs.page.settings.userAgent`` key -- confirm that Chrome honours
    it.
    """
    capabilities = dict(DesiredCapabilities.PHANTOMJS)
    capabilities["phantomjs.page.settings.userAgent"] = str(get())
    timestamp = datetime.now().isoformat()

    chrome_opts = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_opts.add_argument(flag)

    return webdriver.Chrome(
        chrome_options=chrome_opts,
        desired_capabilities=capabilities,
        service_log_path='/tmp/phant-' + timestamp + '.log',
        service_args=[],
    )
def create_request(self, method, url, body):
    """Describe an HTTP request as a dict of keyword arguments.

    The target is ``"http://" + self.url + url`` and the ``User-Agent``
    header comes from ``get()``.  For ``"GET"`` the *body* is stored under
    ``params``; for ``"POST"`` it is stored under ``data``.  Any other
    *method* (the comparison is case-sensitive) yields ``None``.
    """
    if method == "GET":
        payload_key = "params"
    elif method == "POST":
        payload_key = "data"
    else:
        # Unsupported verbs fall through without building anything.
        return None

    spec = dict(
        url="http://" + self.url + url,
        method=method,
        headers={"User-Agent": get()},
    )
    spec[payload_key] = body
    return spec
def create_request(self, method: str, url: str, body: bytes):
    """Build a ``Request`` for ``"http://" + self.url + url``.

    The request carries the given *method* and a ``User-Agent`` header taken
    from ``get()``.  Its ``data`` is always *body* with a trailing newline
    byte appended, whatever the method.

    NOTE(review): ``Request`` is presumably ``urllib.request.Request`` --
    confirm against the file's imports.
    """
    target = "http://" + self.url + url
    ua_header = {"User-Agent": get()}
    req = Request(target, method=method, headers=ua_header)
    req.data = body + b"\n"
    return req
import time
import requests
from selenium import webdriver
import useragents
import wools
import xpaths
import proxys

# Script: for each of the first 200 proxy entries, start a Chrome session and
# load Baidu once per wool entry, pausing three seconds after each load.
# NOTE(review): confirm the loop nesting below matches the intended flow.

# Rebinds the name `wools` from the imported module to the result of
# its get_all() call; the module itself is no longer reachable afterwards.
wools = wools.get_all()

# Fetch the proxy list as JSON.  Each entry is indexed as [0] and [1] below,
# where the two values are used as host and port of the proxy URL.
all_proxy = requests.get(
    "http://www.qingool.com:8000/?protocol=1&country=%E5%9B%BD%E5%86%85").json(
    )

# NOTE(review): assumes the service returns at least 200 entries; fewer would
# raise IndexError -- confirm.
for i in range(200):
    proxy = '--proxy-server=http://{}:{}'.format(all_proxy[i][0],
                                                 all_proxy[i][1])
    # NOTE(review): xpath is not used in the visible part of the script.
    xpath = xpaths.get()
    useragent = useragents.get()
    print(useragent)
    options = webdriver.ChromeOptions()
    # The user-agent value is passed verbatim as a Chrome argument --
    # presumably it already has the form Chrome expects; verify against
    # useragents.get().
    options.add_argument(useragent)
    # The proxy argument is currently disabled and only printed.
    # options.add_argument(proxy)
    print(proxy)
    # NOTE(review): no driver.quit() is visible here, so every iteration
    # appears to leave a browser running -- confirm.
    driver = webdriver.Chrome(executable_path="./chromedriver",
                              chrome_options=options)
    for wool in wools:
        driver.get("http://www.baidu.com")
        time.sleep(3)
def getHtml(url):
    """Print the value returned by ``useragents.get()``.

    NOTE(review): *url* is not used by the code visible here -- confirm
    against the rest of the function.
    """
    print(useragents.get())