Example No. 1
def html_WITH_proxy():
    """test random proxy mechanism"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    spider.enable_proxy()
    for i in range(10):
        html = spider.html(url)
        print(i, spider.pm.current_proxy)
        if html:
            print("\tSUCCESS")
            spider.pm.update_health(1)
        else:
            print("\tFAILED")
    print(spider)
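The proxy manager behind spider.pm is not included in this listing. Below is a minimal sketch of what a health-scored proxy pool could look like; the class name, fields, and scoring logic are assumptions for illustration, not the library's actual API.

import random

class ProxyManager:
    """Hypothetical proxy pool that tracks a health score per proxy.

    Illustrative sketch only; the real ``spider.pm`` object may differ
    in both naming and behavior.
    """

    def __init__(self, proxies):
        # map each proxy URL to an integer health score
        self.health = {proxy: 0 for proxy in proxies}
        self.current_proxy = None

    def pick(self):
        # prefer the proxies with the best health score so far
        best = max(self.health.values())
        candidates = [p for p, h in self.health.items() if h == best]
        self.current_proxy = random.choice(candidates)
        return self.current_proxy

    def update_health(self, delta):
        # reward a working proxy (+1) or penalize a failing one (-1)
        if self.current_proxy is not None:
            self.health[self.current_proxy] += delta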
def get_disease_category_url():
    """Task plan level 2: get disease subcategory URLs."""
    base_url = 'http://www.mayoclinic.org'
    task = load_jt('task.json')
    spider = Crawler()

    for url in task:
        html = spider.html(url)
        if html:
            soup = BS4(html)
            # the tertiary navigation block lists the subcategory links
            div = soup.find_all('div', id='main_0_left1_0_tertiarynav')[0]
            for a in div.find_all('a'):
                task[url].setdefault(base_url + a['href'], {'data': {'category': a.text.strip()}})
        # persist progress after every page so the task plan survives interruptions
        dump_jt(task, 'task.json', replace=True)
def get_disease_url():
    """Task plan level 1: get disease homepage URLs."""
    base_url = 'http://www.mayoclinic.org'
    task = load_jt('task.json')
    spider = Crawler()

    for entrance_url in gen_entranceURL():
        html = spider.html(entrance_url)
        if html:
            soup = BS4(html)
            # the second <ol> on the page holds the disease index
            ol = soup.find_all('ol')[1]
            for li in ol.find_all('li'):
                url = base_url + li.a['href']
                task.setdefault(url, {'data': {'disease_name': li.text.strip()}})
        # persist progress after every entrance page
        dump_jt(task, 'task.json', replace=True)
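load_jt and dump_jt are not defined anywhere in this listing. Judging from their call sites they appear to be thin JSON read/write helpers; the following is a minimal sketch under that assumption, with the replace and fastmode keywords interpreted loosely.

import json
import os

def load_jt(path):
    """Load a JSON task/data file, returning an empty dict if it does not exist yet."""
    if not os.path.exists(path):
        return {}
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def dump_jt(obj, path, replace=False, fastmode=False):
    """Write ``obj`` back to ``path`` as JSON.

    ``replace`` and ``fastmode`` mirror the keywords used in the examples;
    here they simply control overwriting and pretty-printing.
    """
    if os.path.exists(path) and not replace:
        raise OSError("%s already exists" % path)
    indent = None if fastmode else 4
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, indent=indent)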
Example No. 4
def zillow_property_detail(address, zipcode):
    url = gen_url(address, zipcode)  # generate the query's http url
    spider = Crawler()
    html = spider.html(url)  # fetch html
    if html:  # if html was fetched, parse it
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:  # if anything goes wrong during extraction, raise ExtractorError
            raise ExtractorError(address, zipcode, url)
    else:  # if html could not be fetched, raise HttpError
        raise HttpError(address, zipcode, url)
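gen_url, ExtractorError, and HttpError are referenced but not defined in the listing. A plausible sketch follows, assuming gen_url builds a Zillow-style query URL (the exact URL scheme here is a guess) and the two exceptions simply carry the failing query's context.

class HttpError(Exception):
    """Raised when the page could not be fetched at all."""
    def __init__(self, address, zipcode, url):
        super().__init__("HTTP failure for %s, %s (%s)" % (address, zipcode, url))

class ExtractorError(Exception):
    """Raised when the page was fetched but the expected fields were missing."""
    def __init__(self, address, zipcode, url):
        super().__init__("extraction failure for %s, %s (%s)" % (address, zipcode, url))

def gen_url(address, zipcode):
    # assumed URL scheme for illustration; the real site layout may differ
    slug = address.strip().replace(" ", "-")
    return "http://www.zillow.com/homes/%s-%s_rb/" % (slug, zipcode)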
def download_all():
    """Crawl everything: disease_name -> subcategory html."""
    task = load_jt('task.json')
    data = load_jt('data.json')
    spider = Crawler()

    for _, v in task.items():
        disease_name = v['data']['disease_name']
        data.setdefault(disease_name, {})
        for url, v1 in ignore_iteritems(v, ignore=['data']):
            print(url)
            html = spider.html(url)
            if html:
                soup = BS4(html)
                div = soup.find('div', id='main-content')
                data[disease_name].setdefault(v1['data']['category'], str(div))
        # save the collected html after each disease
        dump_jt(data, 'data.json', fastmode=True, replace=True)
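ignore_iteritems is another undefined helper; from its call site it looks like a dict-items iterator that skips a given set of keys. A minimal sketch under that assumption:

def ignore_iteritems(d, ignore=None):
    """Yield (key, value) pairs of ``d``, skipping any key listed in ``ignore``."""
    ignore = set(ignore or [])
    for key, value in d.items():
        if key not in ignore:
            yield key, value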
Example No. 7
def property_info(address, zipcode):
    url = gen_url(address, zipcode)
    spider = Crawler()
    html = spider.html(url)
    if html:
        try:
            soup = BS4(html)
            dt = soup.find("dt", class_="property-data")
            info = dt.text.strip()
            span = soup.find("span", itemprop="addressLocality")
            city = span.text.strip()
            span = soup.find("span", itemprop="addressRegion")
            state = span.text.strip()
            return address, city, state, zipcode, info
        except Exception:
            log.write(
                "Failed to analyze address = %s, zipcode = %s" %
                (address, zipcode), "Failed Extraction")
            return None
    else:
        log.write("%s Failed to get http request" % url, "Http Error")
        return None
Example No. 8
def html_WITHOUT_proxy():
    """test normal http request"""
    url = "http://docs.python-requests.org/"
    spider = Crawler()
    html = spider.html(url)
    print(BS4(html).prettify())
Example No. 9
def enable_proxy():
    print("{:=^100}".format("enable_proxy"))
    spider = Crawler()
    spider.enable_proxy()
    print(spider)
Example No. 10
def set_referer():
    print("{:=^100}".format("set_referer"))
    spider = Crawler()
    print(spider)
    spider.set_referer("https://www.python.org/")
    print(spider)
Example No. 11
def UT_crawl_html():
    spider = Crawler()
    # test html download
    url = "https://www.python.org/"
    print(spider.html(url))

    # test saving html to a .html file
    url = "http://www.archives.com/"
    spider.save_html(url, "www.archives.com.html")

    # test file download
    img_url = "https://www.python.org/static/img/python-logo.png"
    spider.download(img_url, "python-logo.png")

    # test downloading html to a .html file
    target_url = "http://www.archives.com/"  # www.archives.com is utf-8 encoded
    with open("before_login.html", "wb") as f:  # spider.html() returns bytes, so the file must be opened in "wb" mode
        html = spider.html(target_url)
        print(type(html), chardet.detect(html))  # show the object type returned in Python 2 vs Python 3
        f.write(html)

    spider._login(url="http://www.archives.com/member/",  # test html after login
                  payload={"__uid": "*****@*****.**", "__pwd": "efa2014"})
    with open("after_login.html", "wb") as f:
        f.write(spider.html(target_url))
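For comparison, the same fetch-and-save flow can be written directly against requests. This is a minimal sketch with no proxy or login handling; it writes the raw response bytes (response.content) so the file keeps the site's original encoding instead of the decoding that response.text would apply.

import requests

def save_page(url, filename, timeout=10):
    # fetch the page and persist the raw bytes
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    with open(filename, "wb") as f:
        f.write(response.content)

save_page("http://www.archives.com/", "archives_home.html")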