Example #1
def test_scrape_url():
    zpid = "24743857"
    response = scrape_url(None, zpid, 5)
    for k, v in zpid_1_expected.items():
        eq_(response[k], v)
    assert_greater(int(response['price'].replace(',', '')), 0)
    ok_(response['description'])
    ok_(response['price_history'])
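
These tests rely on nose-style assertion helpers and a module-level zpid_1_expected fixture that none of the examples show. A minimal sketch of that harness, assuming nose.tools and a hypothetical module name for the scraper; the fixture key and value below are placeholders, not real listing data:

# a minimal sketch of the assumed test harness, not taken from the source
from nose.tools import assert_greater, eq_, ok_

from zillow_scraper import scrape_url  # hypothetical module name

# placeholder fixture: the real one pins fields that scrape_url is expected
# to return verbatim for the listing with the zpid under test
zpid_1_expected = {
    "some_field": "some expected value",  # placeholder key and value
}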
Example #2
from argparse import ArgumentParser
from pprint import pprint

def main():
    parser = ArgumentParser()
    # exactly one of --zpid / --url must be supplied
    mutex = parser.add_mutually_exclusive_group(required=True)
    mutex.add_argument("--zpid")
    mutex.add_argument("--url")
    parser.add_argument("-t", "--request-timeout", type=int)
    args = parser.parse_args()
    # --request-timeout is stored as args.request_timeout, so the parsed
    # namespace maps directly onto scrape_url's keyword arguments
    pprint(scrape_url(**vars(args)))
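
Assuming the entry point above lives in a script such as scraper.py (the file name is not given in the source), it would be run along the lines of python scraper.py --zpid 24743857 -t 5 or python scraper.py --url <listing-url> -t 5; the required mutually exclusive group makes argparse reject invocations that supply both or neither of --zpid and --url.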
Example #3
def test_scrape_url():
    zpid = "2101920883"
    response = scrape_url(None, zpid, 5)
    for k, v in zpid_1_expected.items():
        eq_(response[k], v)
    assert_greater(int(response['price'].replace(',', '')), 0)
    ok_(response['description'])
    ok_(response['price_history'])
Example #4
def test_scrape_url():
    zpid = "24743857"
    response = scrape_url(None, zpid, 5)
    for k, v in zpid_1_expected.items():
        eq_(response[k], v)
    if not response['price']:
        eq_(response['status'], 'Off Market')
    else:
        assert_greater(int(response['price'].replace(',', '')), 0)
    ok_(response['description'])
    ok_(response['price_history'])
Example #5
def test_scrape_url():
    zpid = "24743857"
    response = scrape_url(None, zpid, 5)
    for k, v in zpid_1_expected.items():
        eq_(response[k], v)
    ok_(response["sales_info"])
    if not response["sales_info"]["price"]:
        eq_(response["sales_info"]["status"], "Off Market")
    else:
        assert_greater(int(response["sales_info"]["price"].replace(",", "")), 0)
    ok_(response["description"])
    ok_(response["price_history"])
Example #6
def test_scrape_url():
    zpid = "24743857"
    response = scrape_url(None, zpid, 5)
    for k, v in zpid_1_expected.items():
        eq_(response[k], v)
    ok_(response['sales_info'])
    if not response['sales_info']['price']:
        eq_(response['sales_info']['status'], 'Off Market')
    else:
        assert_greater(int(response['sales_info']['price'].replace(',', '')),
                       0)
    ok_(response['description'])
    ok_(response['price_history'])
Example #7
def get_result(zpid):
    result = scrape_url(None, zpid, 5)
    result["zpid"] = zpid
    return result
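
Tagging each record with its zpid keeps results traceable when get_result is mapped over a batch of ids. A minimal sketch of that use, not taken from the source; the pool size and the ids are placeholders:

# a minimal sketch, assuming get_result (and scrape_url behind it) is importable;
# the ids are placeholders reused from the tests above
from multiprocessing.dummy import Pool  # thread-backed pool, no pickling needed

zpids = ["24743857", "2101920883"]
pool = Pool(4)
records = pool.map(get_result, zpids)
pool.close()
pool.join()
# each record carries its own "zpid" key, so results stay identifiable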
Example #8
import json
import re
import time
import urllib2

# initialize
numPages = 2887
url_prefix = "http://www.zillow.com/search/GetResults.htm?spt=homes&status=100000&lt=111001&ht=100000&pr=,&mp=,&bd=0%2C&ba=0%2C&sf=,&lot=,&yr=,&pho=0&pets=0&parking=0&laundry=0&pnd=0&red=0&zso=0&days=any&ds=all&pmf=0&pf=0&zoom=4&rect=-135087891,31334871,-103491211,43052833&p="
url_postfix = "&sort=featured&search=maplist&disp=1&rid=9&rt=2&listright=true&isMapSearch=1&zoom=4"
request_timeout = 0.3
out_file_pre = "results"
out_file_post = ".json"
out_file = out_file_pre + out_file_post

results = []

for pageNum in range(1, numPages + 1):

    url = url_prefix + str(pageNum) + url_postfix

    # each search-results page embeds listing ids as id="zpid_XXXXXXXX"
    zpidSource = json.load(urllib2.urlopen(url))['list']['listHTML']
    zpids = re.findall(r'id="zpid_(\d{8})"', zpidSource)

    for i, zpid in enumerate(zpids):
        try:
            results.append(scrape_url(None, int(zpid), request_timeout))
            time.sleep(0.5)  # throttle requests between listings
            print i, 'out of', len(zpids)
        except Exception:
            # skip listings that fail to scrape
            pass

    print 'finished scraping page %d, %d houses found!' % (pageNum, len(zpids))

with open(out_file, 'w') as f:
    json.dump(results, f)
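
The out_file_pre / out_file_post split hints that the output was probably meant to be written as per-page files rather than a single dump at the end of the run. A minimal sketch of that variant, meant to sit at the end of the page loop above in place of the final json.dump; it assumes the same names (pageNum, zpids, scrape_url, request_timeout) as the example:

    # a minimal sketch, not from the source: write each page to its own file
    # (results1.json, results2.json, ...) so a crash loses at most one page
    page_results = [scrape_url(None, int(z), request_timeout) for z in zpids]
    with open(out_file_pre + str(pageNum) + out_file_post, 'w') as f:
        json.dump(page_results, f)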