Example #1
import chardet  # used below to inspect what spider.html() returns

def UT_crawl_html():
    spider = Crawler()

    # test html download
    url = "https://www.python.org/"
    print(spider.html(url))

    # test saving html to a .html file
    url = "http://www.archives.com/"
    spider.save_html(url, "www.archives.com.html")

    # test file download
    img_url = "https://www.python.org/static/img/python-logo.png"
    spider.download(img_url, "python-logo.png")

    # test downloading html to a .html file
    target_url = "http://www.archives.com/"  # www.archives.com is UTF-8 encoded
    # spider.html() returns bytes, so the file has to be opened in "wb" mode
    with open("before_login.html", "wb") as f:
        html = spider.html(target_url)
        # show the type spider.html() returns under Python 2 vs Python 3,
        # plus the encoding chardet detects
        print(type(html), chardet.detect(html))
        f.write(html)

    # test html after login
    spider._login(url="http://www.archives.com/member/",
                  payload={"__uid": "*****@*****.**", "__pwd": "efa2014"})
    with open("after_login.html", "wb") as f:
        f.write(spider.html(target_url))
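For context, here is a minimal sketch of the Crawler interface this test exercises, built on requests. It is an assumption inferred from the calls above (html() returning bytes, save_html(), download(), _login()), not the project's actual implementation.

import requests

class MinimalCrawler(object):  # hypothetical stand-in for Crawler
    """Sketch of the interface UT_crawl_html() relies on."""

    def __init__(self, timeout=10):
        self.session = requests.Session()
        self.timeout = timeout

    def html(self, url):
        """Return the raw page body as bytes, or None on failure."""
        try:
            response = self.session.get(url, timeout=self.timeout)
            return response.content  # bytes, which is why chardet.detect() accepts it
        except requests.RequestException:
            return None

    def save_html(self, url, filename):
        """Fetch url and write the body to filename."""
        html = self.html(url)
        if html:
            with open(filename, "wb") as f:
                f.write(html)

    def download(self, url, filename):
        """Download a binary file to filename; same mechanics in this sketch."""
        self.save_html(url, filename)

    def _login(self, url, payload):
        """POST credentials; the session then carries the login cookies."""
        self.session.post(url, data=payload, timeout=self.timeout)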
def html_WITH_proxy():
    """test random proxy mechanism"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    spider.enable_proxy()
    for i in range(10):
        html = spider.html(url)
        print(i, spider.pm.current_proxy)
        if html:
            print("\tSUCCESS")
            spider.pm.update_health(1)
        else:
            print("\tFAILED")
    print(spider)
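The test drives spider.pm, a proxy pool with a per-proxy health score. Only current_proxy and update_health() appear above; everything else in this sketch is an assumption about how such a random-proxy pool could work.

import random

class ProxyManager(object):  # hypothetical; only current_proxy and update_health() are given
    """Random proxy pool with a naive per-proxy health score."""

    def __init__(self, proxies):
        self.health = {proxy: 0 for proxy in proxies}  # proxy -> score
        self.current_proxy = None

    def get_proxy(self):
        """Pick a random proxy, preferring those with a non-negative score."""
        healthy = [p for p, score in self.health.items() if score >= 0]
        self.current_proxy = random.choice(healthy or list(self.health))
        return self.current_proxy

    def update_health(self, delta):
        """Adjust the score of the proxy used for the last request."""
        if self.current_proxy is not None:
            self.health[self.current_proxy] += delta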
Example #3
def zillow_property_detail(address, zipcode):
    url = gen_url(address, zipcode)  # build the query url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if html:  # if the html was fetched, parse it
        try:
            soup = BS4(html, "html.parser")
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:  # if anything goes wrong during parsing, raise ExtractorError
            raise ExtractorError(address, zipcode, url)
    else:  # if the fetch failed, raise HttpError
        raise HttpError(address, zipcode, url)
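ExtractorError and HttpError are project-specific exceptions. A plausible minimal definition, assumed here for illustration, carries the failed query's context so the caller can tell a transient fetch failure apart from a page-layout change:

class CrawlTaskError(Exception):  # hypothetical base class
    """Carries the query that failed."""
    def __init__(self, address, zipcode, url):
        self.address, self.zipcode, self.url = address, zipcode, url
        super(CrawlTaskError, self).__init__(
            "address=%r, zipcode=%r, url=%s" % (address, zipcode, url))

class HttpError(CrawlTaskError):
    """The page could not be fetched; usually worth a retry."""

class ExtractorError(CrawlTaskError):
    """The page was fetched but could not be parsed; likely a layout change."""

try:
    record = zillow_property_detail("123 Main St", "10001")  # placeholder query
except HttpError:
    pass  # transient; retry later
except ExtractorError as e:
    print("layout changed?", e.url)  # keep the url for inspection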
Example #4
def property_info(address, zipcode):
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html, "html.parser")
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:
            log.write(
                "Failed to parse address = %s, zipcode = %s" %
                (address, zipcode), "Failed Extraction")
            return None
    else:
        log.write("Failed to fetch %s" % url, "Http Error")
        return None
def html_WITHOUT_proxy():
    """test normal http request"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    html = spider.html(url)
    print(BS4(html, "html.parser").prettify())

def enable_proxy():
    print("{:=^100}".format("enable_proxy"))
    spider = Crawler()
    spider.enable_proxy()
    print(spider)

def set_referer():
    print("{:=^100}".format("set_referer"))
    spider = Crawler()
    print(spider)
    spider.set_referer("https://www.python.org/")
    print(spider)
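set_referer() presumably pins a Referer header on the underlying session so subsequent requests look like they were navigated from that page. A one-method sketch under that assumption, continuing the MinimalCrawler sketch from Example #1:

import requests

class MinimalCrawler(object):  # hypothetical, as in the Example #1 sketch
    def __init__(self):
        self.session = requests.Session()

    def set_referer(self, url):
        """Send url as the Referer header on every subsequent request."""
        self.session.headers["Referer"] = url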