Example #1
import re
import urllib

from bs4 import BeautifulSoup

import db_access  # project-local module; a stub sketch appears at the end
# is_int is another project-local helper; a sketch follows this example.

def scrape_auction(url):
	print 'Now scraping auction site: ' + url
	grab = urllib.urlopen(url)
	soup = BeautifulSoup(grab, 'html.parser')

	# Bail out if the page has no usable <h1> title.
	if soup.h1 is None or soup.h1.string is None:
		return 0
	title = soup.h1.string.strip()
	url = ''
	price = ''

	for tag in soup.find_all('h2'):
		# The URL is in an <a> tag within an <h2>
		if tag.a:
			if tag.a.string is None:
				# Skip links with no text rather than abandoning the whole page.
				continue
			url = tag.a.string.strip()
		# tag.string is None when the tag holds nested markup, so guard before strip().
		if 'class' in tag.attrs and 'price' in tag['class'] and tag.string:
			price = re.sub('[$,]', '', tag.string.strip())
			if is_int(price):
				db_access.insert_data_point('flippa.com', url, 'Price', int(price))
	divs = soup.find_all('div')
	for item in divs:
		if 'class' in item.attrs:
			# There are 2 divs that contain data:  datatable and editable
			if 'datatable' in item['class'] or 'editable' in item['class']:
				dataset_title = item.h3.string.strip()
				for tr in item.find_all('tr'):
					# Skip rows that are missing a header cell or a value.
					if tr.th is None or tr.th.string is None or tr.td is None or tr.td.string is None:
						continue
					key = tr.th.string.strip()
					value = re.sub('[,-]', '', tr.td.string.strip())
					if key and value:
						if is_int(value):
							db_access.insert_data_point('flippa.com', url, key, int(value))
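
Example #1 calls an is_int helper that is not defined in the snippet. A minimal sketch of what it presumably does, with the name and behavior inferred from the call sites above:

def is_int(s):
	# Hypothetical helper: True if s parses cleanly as an integer.
	try:
		int(s)
		return True
	except (TypeError, ValueError):
		return False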
Example #2
import db_access
import souphelper

def scrape_site(url):
	print "Now scraping: " + url
	soup = souphelper.get_soup(url)
	if not soup:
		return 0
	# doc = open('websitebroker_page.htm').read()
	# soup = BeautifulSoup(doc)

	# The listing data lives in the fifth <table> on the page (index 4).
	tables = soup.find_all('table')
	try:
		# Each field sits at a fixed <td> offset within the listing table.
		for counter, td in enumerate(tables[4].find_all('td'), start=1):
			if td.string is None:
				break
			if counter == 9:
				visitors = souphelper.string_to_int(td.string)
			elif counter == 11:
				page_views = souphelper.string_to_int(td.string)
			elif counter == 13:
				income = souphelper.string_to_int(td.string)
			elif counter == 15:
				expenses = souphelper.string_to_int(td.string)
			elif counter == 17:
				price = souphelper.string_to_int(td.string)
			elif counter == 19:
				url = td.string

		db_access.insert_data_point('websitebroker.com', url, 'Price', price)
		db_access.insert_data_point('websitebroker.com', url, 'Visitors', visitors)
		db_access.insert_data_point('websitebroker.com', url, 'Page Views', page_views)
		db_access.insert_data_point('websitebroker.com', url, 'Income', income)
		db_access.insert_data_point('websitebroker.com', url, 'Expenses', expenses)
	except Exception:
		print "Error scraping site: " + url
Example #3
import db_access
import souphelper

def scrape_site(url):
  print "Now scraping: " + url
  soup = souphelper.get_soup(url)
  if soup is None:
    return 0

  try:
    h1s = soup.find_all('h1')
    title = h1s[1].string.strip()
    tables = soup.find_all('table')
    # Price, gross income, and cash flow are the first three <td> cells.
    for counter, td in enumerate(tables[4].find_all('td'), start=1):
      if counter > 3:
        break
      value = souphelper.string_to_int(td.string)
      if counter == 1:
        price = value
      elif counter == 2:
        income = value
      elif counter == 3:
        cash_flow = value
    db_access.insert_data_point('websiteproperties.com', title, 'Price', price)
    db_access.insert_data_point('websiteproperties.com', title, 'Gross Income', income)
    db_access.insert_data_point('websiteproperties.com', title, 'Cash Flow', cash_flow)
  except Exception:
    print "Error scraping page: " + url