import os
import re

from glob import glob
from json import loads

# BeautifulSoup 3 is assumed here; with bs4, "from bs4 import
# BeautifulSoup" also works (findAll is kept as a backward-compatible
# alias).
from BeautifulSoup import BeautifulSoup

# makedir, safe_write, download_url, and iterview, along with the
# SEARCH_URL, SEARCH_RESULTS_DIR, LISTING_URLS_FILE, LISTING_PAGES_DIR,
# and CSV_FILE constants, are defined elsewhere in this module.


def get_listing_urls(br):
    """
    Searches StreetEasy for all rental apartment listings in
    Williamsburg, caches each page of search results to the directory
    whose name is stored in the variable SEARCH_RESULTS_DIR, and caches
    the URLs for the listings (one per line) to the file whose name is
    stored in the variable LISTING_URLS_FILE.

    Arguments:

    br -- Browser object
    """

    if os.path.exists(LISTING_URLS_FILE):
        return

    makedir(os.path.dirname(LISTING_URLS_FILE))

    # Submit the search form, restricting results to Williamsburg
    # (StreetEasy area code 302).
    br.open(SEARCH_URL)
    br.select_form(nr=1)
    br.form['area[]'] = ['302']

    response = br.submit()
    results_url = response.geturl()

    with safe_write(LISTING_URLS_FILE) as f:
        while True:

            # Cache this page of search results, then pull each
            # listing's URL out of its details_title block.
            filename = download_url(br, results_url, SEARCH_RESULTS_DIR)
            soup = BeautifulSoup(open(filename).read())

            urls = ['http://streeteasy.com' + div.find('h5').find('a').get('href')
                    for div in soup.findAll('div',
                                            attrs={'class': 'details_title'})]

            f.write('\n'.join(urls))
            f.write('\n')
            f.flush()

            # Follow the "next page" link; on the last page there is no
            # such link, so nav is None and nav.get raises
            # AttributeError.
            nav = soup.find('a', attrs={'class': 'next_page'})
            try:
                results_url = 'http://www.streeteasy.com' + nav.get('href')
            except AttributeError:
                break
def get_listing_data():
    """
    Extracts the price and number of bedrooms from each cached listing
    page in the directory whose name is stored in the variable
    LISTING_PAGES_DIR and writes them, tab-separated and one listing
    per line, to the file whose name is stored in the variable
    CSV_FILE.
    """

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            contents = open(filename).read()

            # Each listing page embeds its metadata as a JSON object in
            # a JavaScript "dataLayer" array. The [obj] unpacking
            # raises ValueError unless exactly one match is found, and
            # loads() raises ValueError on malformed JSON.
            try:
                [obj] = re.findall(r'dataLayer\s*=\s*\[(.*)\];', contents)
                obj = loads(obj)
            except ValueError:
                continue  # skip pages without a parseable dataLayer

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']),
                                  str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()
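
# A minimal end-to-end driver, sketched under assumptions and not part
# of the original script: it presumes the mechanize package supplies
# the Browser object and that download_url caches a page into the given
# directory (as it is used above). The loop that fills
# LISTING_PAGES_DIR between the two functions is hypothetical.

import mechanize


def main():

    br = mechanize.Browser()
    br.set_handle_robots(False)  # don't let robots.txt abort the crawl

    # Step 1: collect listing URLs from the search results.
    get_listing_urls(br)

    # Step 2 (hypothetical): cache each listing page locally.
    for url in open(LISTING_URLS_FILE):
        download_url(br, url.strip(), LISTING_PAGES_DIR)

    # Step 3: parse prices and bedroom counts into CSV_FILE.
    get_listing_data()


if __name__ == '__main__':
    main()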