def UT_crawl_html():
    """Manual smoke test for Crawler: html fetch, save, file download, login.

    Pure I/O demo — exercises each public method against live sites and
    writes the results to local files for eyeballing.
    """
    spider = Crawler()

    # test html download
    url = "https://www.python.org/"
    print(spider.html(url))

    # test save html to .html
    url = "http://www.archives.com/"
    spider.save_html(url, "www.archives.com.html")

    # test file download
    img_url = "https://www.python.org/static/img/python-logo.png"
    spider.download(img_url, "python-logo.png")

    # test download html to .html file
    target_url = "http://www.archives.com/"  # www.archives.com is utf-8 encoded
    with open("before_login.html", "wb") as f:
        # spider.html() returns bytes here, so the file mode has to be "wb"
        html = spider.html(target_url)
        # show the object type requests gives back on Python 2 vs Python 3
        print(type(html), chardet.detect(html))
        f.write(html)

    # test html after login
    spider._login(url="http://www.archives.com/member/",
                  payload={"__uid": "*****@*****.**", "__pwd": "efa2014"})
    with open("after_login.html", "wb") as f:
        f.write(spider.html(target_url))
def html_WITH_proxy():
    """Test the random-proxy mechanism by fetching one URL ten times."""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    spider.enable_proxy()
    for attempt in range(10):
        html = spider.html(url)
        print(attempt, spider.pm.current_proxy)
        if not html:
            print("\tFAILED")
            continue
        print("\tSUCCESS")
        spider.pm.update_health(1)
    # summary of the crawler (and its proxy manager) state after the run
    print(spider)
def zillow_property_detail(address, zipcode):
    """Fetch a property page and extract its details.

    Parameters: address, zipcode — used both to build the query url and
    echoed back in the return value.
    Returns: (address, city, state, zipcode, info) tuple.
    Raises: HttpError when the page cannot be fetched, ExtractorError when
    the expected elements are missing from the html.
    """
    url = gen_url(address, zipcode)  # generate query's http url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if not html:  # guard clause: bad html -> HttpError
        raise HttpError(address, zipcode, url)
    try:
        soup = BS4(html)
        dt = soup.find("dt", class_="property-data")
        info = dt.text.strip()
        span = soup.find("span", itemprop="addressLocality")
        city = span.text.strip()
        span = soup.find("span", itemprop="addressRegion")
        state = span.text.strip()
        return address, city, state, zipcode, info
    # was a bare `except:`, which would also swallow SystemExit and
    # KeyboardInterrupt; Exception still covers the AttributeError raised
    # when a find() comes back None, and any parser failure.
    except Exception:
        raise ExtractorError(address, zipcode, url)
def property_info(address, zipcode):
    """Fetch a property page and extract its details, logging failures.

    Parameters: address, zipcode — used both to build the query url and
    echoed back in the return value.
    Returns: (address, city, state, zipcode, info) tuple on success;
    None after logging when extraction or the http request fails.
    """
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        # was a bare `except:`, which would also swallow SystemExit and
        # KeyboardInterrupt; narrow to Exception so Ctrl-C still propagates.
        except Exception:
            log.write(
                "Failed to analysis address = %s, zipcode = %s" % (address, zipcode),
                "Failed Extraction")
            return None
    else:
        log.write("%s Failed to get http request" % url, "Http Error")
def html_WITHOUt_proxy():
    """Test a plain http request with no proxy enabled."""
    spider = Crawler()
    page = spider.html("http://docs.python-requests.org/")
    print(BS4(page).prettify())
def enable_proxy():
    """Print a banner, enable the proxy on a fresh Crawler, and show its state."""
    print("{:=^100}".format("enable_proxy"))
    crawler = Crawler()
    crawler.enable_proxy()
    print(crawler)
def set_referer():
    """Show Crawler state before and after setting a Referer header."""
    print("{:=^100}".format("set_referer"))
    crawler = Crawler()
    print(crawler)  # state before
    crawler.set_referer("https://www.python.org/")
    print(crawler)  # state after