def scrape_auction(url):
    """Scrape one flippa.com auction page and store its numeric data points.

    The listing URL is read from the text of an ``<a>`` inside an ``<h2>``,
    the price from an ``<h2>`` carrying the ``price`` class, and further
    key/value pairs from the ``<tr>`` rows of every ``<div>`` whose class
    list contains ``datatable`` or ``editable``.  Each value accepted by
    ``is_int`` is stored through ``db_access.insert_data_point`` under the
    source name ``'flippa.com'``.

    Args:
        url: Address of the auction page to download.

    Returns:
        0 when the page has no ``<h1>`` with a single text string (treated
        as "not an auction page"); otherwise None.
    """
    print('Now scraping auction site: ' + url)
    grab = urllib.urlopen(url)
    try:
        soup = BeautifulSoup(grab)
    finally:
        # The response has been fully parsed; release the connection.
        grab.close()

    if soup.h1 is None or soup.h1.string is None:
        return 0

    # Kept separate from the ``url`` parameter so the page address is not
    # clobbered by the scraped value.
    listing_url = ''
    price = ''
    for tag in soup.find_all('h2'):
        # The URL is in an <a> tag within an <h2>
        if tag.a:
            if tag.a.string is None:
                # Preserved from the original: stop scanning headings at the
                # first link that has no single text string.
                break
            listing_url = tag.a.string.strip()
        if 'price' in (tag.get('class') or []) and tag.string is not None:
            price = re.sub('[$,]', '', tag.string.strip())

    if is_int(price):
        db_access.insert_data_point('flippa.com', listing_url, 'Price',
                                    int(price))

    for item in soup.find_all('div'):
        classes = item.get('class') or []
        # There are 2 divs that contain data: datatable and editable.
        # Both names must be tested explicitly: ('datatable' or 'editable')
        # evaluates to just 'datatable'.
        if 'datatable' not in classes and 'editable' not in classes:
            continue
        for tr in item.find_all('tr'):
            # Rows without a usable header/value pair carry no data point.
            if tr.th is None or tr.th.string is None or tr.td is None:
                continue
            key = tr.th.string.strip()
            if tr.td.string is None:
                # Preserved from the original: a value cell without a single
                # text string ends processing of this div's rows.
                break
            value = re.sub('[,-]', '', tr.td.string.strip())
            if key and value and is_int(value):
                db_access.insert_data_point('flippa.com', listing_url, key,
                                            int(value))
def scrape_site(url):
    """Scrape one websitebroker.com listing page and store its data points.

    The values are read by position from the ``<td>`` cells of the fifth
    ``<table>`` on the page (index 4).  Cells 9, 11, 13, 15 and 17 (1-based)
    are converted with ``souphelper.string_to_int`` and stored as Visitors,
    Page Views, Income, Expenses and Price; cell 19 holds the URL of the
    listed site, which is used as the key for every data point.  If cell 19
    is not reached, the page URL passed in is used as the key instead.

    Nothing is stored unless all five numeric values were found.  Any
    failure is reported with a printed message rather than raised.

    Args:
        url: Address of the listing page to download.

    Returns:
        0 when ``souphelper.get_soup`` yields nothing; otherwise None.
    """
    print("Now scraping: " + url)
    soup = souphelper.get_soup(url)
    if not soup:
        return 0

    # 1-based position of each <td> in the data table -> data point name.
    numeric_cells = {
        9: 'Visitors',
        11: 'Page Views',
        13: 'Income',
        15: 'Expenses',
        17: 'Price',
    }
    site_url_cell = 19
    insert_order = ('Price', 'Visitors', 'Page Views', 'Income', 'Expenses')

    tables = soup.find_all('table')
    # Kept separate from ``url`` so error messages always name the page that
    # was being scraped.
    site_url = url
    values = {}
    try:
        for position, td in enumerate(tables[4].find_all('td'), 1):
            if td.string is None:
                # A cell without a single text string ends the scan; whatever
                # was collected so far is validated below.
                break
            if position in numeric_cells:
                name = numeric_cells[position]
                values[name] = souphelper.string_to_int(td.string)
            elif position == site_url_cell:
                site_url = td.string

        # Check explicitly instead of relying on a NameError from an
        # unassigned local.
        if any(name not in values for name in insert_order):
            print("Error scraping site: " + url)
            return

        for name in insert_order:
            db_access.insert_data_point('websitebroker.com', site_url, name,
                                        values[name])
    except Exception:
        # Best-effort scraper: report and move on, but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except:`` would swallow them).
        print("Error scraping site: " + url)
def scrape_site(url):
    """Scrape one websiteproperties.com listing page and store its data points.

    The listing title is the stripped text of the second ``<h1>`` on the
    page.  The first three ``<td>`` cells of the fifth ``<table>`` (index 4)
    are converted with ``souphelper.string_to_int`` and stored, in order, as
    Price, Gross Income and Cash Flow, keyed by the title.

    All three values are parsed before anything is stored, so a short or
    malformed table never leaves a partial set of data points behind.  Any
    failure is reported with a printed message rather than raised.

    Args:
        url: Address of the listing page to download.

    Returns:
        0 when ``souphelper.get_soup`` returns None; otherwise None.
    """
    print("Now scraping: " + url)
    soup = souphelper.get_soup(url)
    if soup is None:
        return 0

    # Data point names, in the order their cells appear in the table.
    labels = ('Price', 'Gross Income', 'Cash Flow')
    try:
        title = soup.find_all('h1')[1].string.strip()
        cells = soup.find_all('table')[4].find_all('td')[:len(labels)]
        if len(cells) < len(labels):
            print("Error scraping page: " + url)
            return

        values = [souphelper.string_to_int(td.string) for td in cells]
        for label, value in zip(labels, values):
            db_access.insert_data_point('websiteproperties.com', title,
                                        label, value)
    except Exception:
        # Best-effort scraper: report and move on, but let KeyboardInterrupt
        # and SystemExit propagate (a bare ``except:`` would swallow them).
        print("Error scraping page: " + url)