예제 #1
0
def get_listing_urls(br):
    """
    Searches StreetEasy for all rental apartment listings in
    Williamsburg, caches each page of search results to the directory
    whose name is stored in the variable SEARCH_RESULTS_DIR, and
    caches the URLs for the listings (one per line) to the file whose
    name is stored in the variable LISTING_URLS_FILE.

    If LISTING_URLS_FILE already exists, nothing is done.

    Result entries that lack a link are skipped, and a page with no
    usable results contributes no lines to the output file.

    Arguments:

    br -- Browser object

    Returns None.
    """

    # An existing URL file is treated as "already done".
    if os.path.exists(LISTING_URLS_FILE):
        return

    makedir(os.path.dirname(LISTING_URLS_FILE))

    br.open(SEARCH_URL)

    # NOTE(review): nr=1 presumably picks the second form on the page
    # and '302' presumably is the site's area code for Williamsburg --
    # confirm against the live search form.
    br.select_form(nr=1)
    br.form['area[]'] = ['302']
    response = br.submit()
    results_url = response.geturl()

    with safe_write(LISTING_URLS_FILE) as f:
        while True:

            filename = download_url(br, results_url, SEARCH_RESULTS_DIR)

            # Close the cached page as soon as it has been read instead
            # of leaking one file handle per results page.
            with open(filename) as page:
                soup = BeautifulSoup(page.read())

            urls = []

            for r in soup.findAll('div', attrs={'class': 'details_title'}):

                # find() returns None when the tag is absent; skip the
                # entry rather than crash on a malformed result.
                heading = r.find('h5')
                link = heading.find('a') if heading is not None else None
                href = link.get('href') if link is not None else None

                if href is None:
                    continue

                urls.append('http://streeteasy.com' + href)

            # Only write when there is something to write, so an empty
            # page does not add a blank line to the one-URL-per-line file.
            if urls:
                f.write('\n'.join(urls))
                f.write('\n')
                f.flush()

            nav = soup.find('a', attrs={'class': 'next_page'})
            next_href = nav.get('href') if nav is not None else None

            # No "next page" link (or one without a target) means the
            # last page of results has been reached.
            if next_href is None:
                break

            results_url = 'http://www.streeteasy.com' + next_href
예제 #2
0
def get_listing_urls(br):
    """
    Searches StreetEasy for all rental apartment listings in
    Williamsburg, caches each page of search results to the directory
    whose name is stored in the variable SEARCH_RESULTS_DIR, and
    caches the URLs for the listings (one per line) to the file whose
    name is stored in the variable LISTING_URLS_FILE.

    If LISTING_URLS_FILE already exists, nothing is done.

    Result entries that lack a link are skipped, and a page with no
    usable results contributes no lines to the output file.

    Arguments:

    br -- Browser object

    Returns None.
    """

    # An existing URL file is treated as "already done".
    if os.path.exists(LISTING_URLS_FILE):
        return

    makedir(os.path.dirname(LISTING_URLS_FILE))

    br.open(SEARCH_URL)

    # NOTE(review): nr=1 presumably picks the second form on the page
    # and '302' presumably is the site's area code for Williamsburg --
    # confirm against the live search form.
    br.select_form(nr=1)
    br.form['area[]'] = ['302']
    response = br.submit()
    results_url = response.geturl()

    with safe_write(LISTING_URLS_FILE) as f:
        while True:

            filename = download_url(br, results_url, SEARCH_RESULTS_DIR)

            # Close the cached page as soon as it has been read instead
            # of leaking one file handle per results page.
            with open(filename) as page:
                soup = BeautifulSoup(page.read())

            urls = []

            for r in soup.findAll('div', attrs={'class': 'details_title'}):

                # find() returns None when the tag is absent; skip the
                # entry rather than crash on a malformed result.
                heading = r.find('h5')
                link = heading.find('a') if heading is not None else None
                href = link.get('href') if link is not None else None

                if href is None:
                    continue

                urls.append('http://streeteasy.com' + href)

            # Only write when there is something to write, so an empty
            # page does not add a blank line to the one-URL-per-line file.
            if urls:
                f.write('\n'.join(urls))
                f.write('\n')
                f.flush()

            nav = soup.find('a', attrs={'class': 'next_page'})
            next_href = nav.get('href') if nav is not None else None

            # No "next page" link (or one without a target) means the
            # last page of results has been reached.
            if next_href is None:
                break

            results_url = 'http://www.streeteasy.com' + next_href
예제 #3
0
def get_listing_data():
    """
    Extracts the price and number of bedrooms from every cached
    listing page and writes them to the file whose name is stored in
    the variable CSV_FILE.

    Every file in the directory named by LISTING_PAGES_DIR is searched
    for a ``dataLayer = [...];`` assignment, and the text between the
    brackets is decoded with loads().  For each page whose decoded
    object contains both 'listPrice' and 'listBed', one tab-separated
    line is written: the file's base name, the price, and the bedroom
    count.

    A page that does not contain exactly one such assignment, or whose
    contents loads() rejects with ValueError, is skipped; the
    remaining pages are still processed.

    Returns None.
    """

    # Raw string so the regex escapes (\s, \[) are not interpreted as
    # string escapes; compiled once rather than once per page.
    pattern = re.compile(r'dataLayer\s*=\s*\[(.*)\];')

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            # Close each page after reading instead of leaking a file
            # handle per listing.
            with open(filename) as page:
                contents = page.read()

            try:
                # The single-element unpacking raises ValueError unless
                # there is exactly one match.
                [obj] = pattern.findall(contents)
                # NOTE(review): assumes loads() signals malformed input
                # with ValueError (as json.loads does) -- confirm which
                # loads is imported.
                obj = loads(obj)
            except ValueError:
                # Skip just this page; returning here would silently
                # drop every page that has not been visited yet.
                continue

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']), str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()
예제 #4
0
def get_listing_data():
    """
    Extracts the price and number of bedrooms from every cached
    listing page and writes them to the file whose name is stored in
    the variable CSV_FILE.

    Every file in the directory named by LISTING_PAGES_DIR is searched
    for a ``dataLayer = [...];`` assignment, and the text between the
    brackets is decoded with loads().  For each page whose decoded
    object contains both 'listPrice' and 'listBed', one tab-separated
    line is written: the file's base name, the price, and the bedroom
    count.

    A page that does not contain exactly one such assignment, or whose
    contents loads() rejects with ValueError, is skipped; the
    remaining pages are still processed.

    Returns None.
    """

    # Raw string so the regex escapes (\s, \[) are not interpreted as
    # string escapes; compiled once rather than once per page.
    pattern = re.compile(r'dataLayer\s*=\s*\[(.*)\];')

    with safe_write(CSV_FILE) as f:
        for filename in iterview(glob(LISTING_PAGES_DIR + '/*')):

            # Close each page after reading instead of leaking a file
            # handle per listing.
            with open(filename) as page:
                contents = page.read()

            try:
                # The single-element unpacking raises ValueError unless
                # there is exactly one match.
                [obj] = pattern.findall(contents)
                # NOTE(review): assumes loads() signals malformed input
                # with ValueError (as json.loads does) -- confirm which
                # loads is imported.
                obj = loads(obj)
            except ValueError:
                # Skip just this page; returning here would silently
                # drop every page that has not been visited yet.
                continue

            if 'listPrice' in obj and 'listBed' in obj:
                text = '\t'.join((os.path.basename(filename),
                                  str(obj['listPrice']), str(obj['listBed'])))
                f.write(text)
                f.write('\n')
                f.flush()