Example #1
def scrape_site(url):
    print "Now scraping: " + url
    soup = souphelper.get_soup("http://www.siteworthchecker.com/" + url)
    if soup:

        try:
            # The first span on the page holds the site's estimated worth.
            worth = souphelper.string_to_int(soup.span.string)
            # Coerce every span to an int; non-numeric spans become 0.
            spans = soup.find_all('span')
            for i in range(len(spans)):
                try:
                    spans[i] = souphelper.string_to_int(str(spans[i]))
                except Exception:
                    spans[i] = 0
            # Debug aid: dump each span with its index to find new offsets
            # when the page layout changes.
            # for i in range(len(spans)):
            #     print str(i) + " : " + str(spans[i])

            # Positional offsets into the span list; brittle, but they match
            # siteworthchecker.com's current layout.
            daily_visits = spans[3]
            daily_revenue = spans[4]
            alexa_rank = spans[5]
            google_page_rank = spans[6]
            google_indexed_pages = spans[7]
            google_backlinks = spans[9]
            domain_age = spans[17]
        except Exception:
            print "Error scraping: " + url
            return 0

        try:
            db_query = 'REPLACE INTO siteworthchecker SET url="%s", worth="%s", daily_visits="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s", google_indexed_pages="%s", google_backlinks="%s", domain_age="%s"' % (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank, google_indexed_pages, google_backlinks, domain_age)
            con.query(db_query)
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
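
Every example on this page leans on the same helper module, which isn't shown. A minimal sketch of what souphelper.get_soup, souphelper.string_to_int, and souphelper.string_to_float might look like, assuming Python 2 with urllib2 and BeautifulSoup 4 (the examples disagree on whether string_to_int raises or returns None on a miss; this sketch returns None):

# Hypothetical reconstruction of the souphelper module the examples import.
import re
import urllib2
from bs4 import BeautifulSoup

def get_soup(url):
    # Fetch a page and parse it; return None on any network failure.
    try:
        html = urllib2.urlopen(url, timeout=10).read()
    except (urllib2.URLError, IOError):
        return None
    return BeautifulSoup(html, 'html.parser')

def string_to_int(text):
    # Pull the first integer out of text like '$1,234 USD'.
    match = re.search(r'\d[\d,]*', text)
    return int(match.group(0).replace(',', '')) if match else None

def string_to_float(text):
    # Pull the first decimal number out of text like '42.7%'.
    match = re.search(r'\d+(?:\.\d+)?', text)
    return float(match.group(0)) if match else None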
Example #2
def scrape_site(url):
    print "Now scraping: " + url
    soup = souphelper.get_soup("http://webstatsart.com/" + url)
    if soup:
        try:
            # The <h2> headline holds the worth, sometimes quoted in millions.
            worth_string = soup.h2.string
            worth = souphelper.string_to_int(worth_string)
            if "Million" in worth_string:
                worth = worth * 1000000

            tr = soup.find_all('tr')
            for item in tr:
                scan = str(item)
                # print "--------------------------"
                # print item
                if "Pagerank" in scan:
                    google_page_rank = souphelper.string_to_int(str(item.td.next_sibling))
                if "Alexa Rank" in scan:
                    alexa_rank = souphelper.string_to_int(str(item.td.next_sibling))
                if "Daily Unique Visitors" in scan:
                    daily_visits = souphelper.string_to_int(str(item.td.next_sibling))
                if "Daily Ad Revenue" in scan:
                    daily_revenue = souphelper.string_to_int(str(item.td.next_sibling))
        except Exception:
            return 0

        try:
            # NOTE: values are interpolated straight into the SQL string;
            # a parameterized variant is sketched after this example.
            db_query = 'REPLACE INTO webstatsart SET url="%s", worth="%s", daily_visits="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s"' % (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank)
            con.query(db_query)
            return 1
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            return 0
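
The examples interpolate scraped strings straight into their REPLACE statements, so a stray quote in any value breaks the query (or worse, injects SQL). A safer variant of the query above, assuming the connection were opened with the higher-level MySQLdb API instead of the raw _mysql module:

import MySQLdb

def save_webstatsart(db, url, worth, daily_visits, daily_revenue,
                     alexa_rank, google_page_rank):
    # Parameterized query: the driver escapes every value itself.
    cur = db.cursor()
    cur.execute(
        'REPLACE INTO webstatsart SET url=%s, worth=%s, daily_visits=%s, '
        'daily_revenue=%s, alexa_rank=%s, google_page_rank=%s',
        (url, worth, daily_visits, daily_revenue, alexa_rank, google_page_rank))
    db.commit()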
Example #3
def scrape_site(url):
    print "Now Scraping: " + url
    soup = souphelper.get_soup("http://www.statsmogul.com/" + url)
    if soup:
        # Coerce numeric cells to ints, leaving non-numeric cells alone.
        td = soup.find_all('td')
        for i in range(len(td)):
            new_value = souphelper.string_to_int(str(td[i]))
            if new_value is not None:
                td[i] = new_value

        try:
            # Positional offsets into the <td> and <em> lists; brittle but
            # they match statsmogul.com's current layout.
            em = soup.find_all('em')
            worth = souphelper.string_to_int(str(em[4]))
            alexa_rank = td[0]
            compete_rank = td[1]
            quantcast_rank = td[2]
            google_page_rank = td[7]
            domain_characters = td[3]
            if "No" in td[4]:
                domain_dictionary = 0
            else:
                domain_dictionary = 1
            domain_age = td[5]
            if "No" in td[6]:
                domain_dashes_or_numbers = 0
            else:
                domain_dashes_or_numbers = 1
            yahoo_backlinks = td[8]
            if "No" in td[10]:
                meta_tags = 0
            else:
                meta_tags = 1
            if "No" in td[11]:
                dmoz_directory = 0
            else:
                dmoz_directory = 1
        except Exception:
            print "     Error scraping: " + url
            return 0

        try:
            db_query = 'REPLACE INTO statsmogul SET url="%s", worth="%s", alexa_rank="%s", compete_rank="%s", quantcast_rank="%s", google_page_rank="%s", domain_characters="%s", domain_dictionary="%s", domain_age="%s", domain_dashes_or_numbers="%s", yahoo_backlinks="%s", meta_tags="%s", dmoz_directory="%s"' % (url, worth, alexa_rank, compete_rank, quantcast_rank, google_page_rank, domain_characters, domain_dictionary, domain_age, domain_dashes_or_numbers, yahoo_backlinks, meta_tags, dmoz_directory)
            con.query(db_query)
            return 1
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            return 0
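
Example #3 repeats the same No-means-0 conversion four times. A small helper would tighten it; the name is illustrative, and str() makes the check work whether the cell is a Tag or already a string:

def yes_no_flag(cell):
    # Mirror the logic above: any cell not containing "No" counts as yes.
    return 0 if "No" in str(cell) else 1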
Example #4
def scrape_site(url):
  print "Now scraping: " + url
  soup = souphelper.get_soup(url)
  if soup is None:
    return 0

  try:
    h1s = soup.find_all('h1')
    title = h1s[1].string.strip()
    tables = soup.find_all('table')
    # The first three cells of the fifth table hold Price, Gross Income,
    # and Cash Flow, in that order.
    counter = 1
    for td in tables[4].find_all('td'):
      if counter < 4:
        value = souphelper.string_to_int(td.string)
      if counter == 1:
        price = value
      elif counter == 2:
        income = value
      elif counter == 3:
        cash_flow = value
      else:
        break
      counter += 1
    db_access.insert_data_point('websiteproperties.com', title, 'Price', price)
    db_access.insert_data_point('websiteproperties.com', title, 'Gross Income', income)
    db_access.insert_data_point('websiteproperties.com', title, 'Cash Flow', cash_flow)
  except Exception:
    print "Error scraping page: " + url
    return 0
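
Examples #4 and #5 write through db_access.insert_data_point(source, site, metric, value) rather than a per-site table. That module isn't shown either; one plausible shape, assuming a generic data_points table and placeholder credentials:

import MySQLdb

_db = MySQLdb.connect(host='localhost', user='scraper',
                      passwd='scraper', db='scraping')  # placeholder credentials

def insert_data_point(source, site, metric, value):
    # One row per (source, site, metric) observation; the schema is assumed.
    cur = _db.cursor()
    cur.execute(
        'INSERT INTO data_points (source, site, metric, value) '
        'VALUES (%s, %s, %s, %s)',
        (source, site, metric, value))
    _db.commit()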
Example #5
def scrape_site(url):
	print "Now scraping: " + url
	soup = souphelper.get_soup(url)
	if not soup:
		return 0
	# Debug aid: parse a saved copy of the page instead of fetching it.
	# doc = open('websitebroker_page.htm').read()
	# soup = BeautifulSoup(doc)

	# The listing data lives in the fifth <table> on the page.
	tables = soup.find_all('table')
	counter = 1
	try:
		for td in tables[4].find_all('td'):
			# It's wasteful to run re.search on each of these items, but for
			# some reason hoisting it into a variable caused problems.
			# Fix me if you have time!
			if td.string is None:
				break
			if counter == 9:
				visitors = souphelper.string_to_int(td.string)
				# print visitors
			elif counter == 11:
				page_views = souphelper.string_to_int(td.string)
				# print page_views
			elif counter == 13:
				income = souphelper.string_to_int(td.string)
				# print income
			elif counter == 15:
				expenses = souphelper.string_to_int(td.string)
				# print expenses
			elif counter == 17:
				price = souphelper.string_to_int(td.string)
				# print price
			elif counter == 19:
				url = td.string  # the listing's target site replaces the page URL
				# print url
			counter += 1

		db_access.insert_data_point('websitebroker.com', url, 'Price', price)
		db_access.insert_data_point('websitebroker.com', url, 'Visitors', visitors)
		db_access.insert_data_point('websitebroker.com', url, 'Page Views', page_views)
		db_access.insert_data_point('websitebroker.com', url, 'Income', income)
		db_access.insert_data_point('websitebroker.com', url, 'Expenses', expenses)
	except Exception:
		print "Error scraping site: " + url
		return 0
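
The hand-rolled counter in Examples #4 and #5 is what enumerate is for. An equivalent extraction for the loop above, sketched against the same assumed table layout (offsets are 0-based here where the original counts from 1, and souphelper is the same assumed helper):

import souphelper  # the assumed helper module

def scrape_listing_cells(soup):
    # Cell positions inside tables[4], taken from Example #5's counter values.
    fields = {8: 'Visitors', 10: 'Page Views', 12: 'Income',
              14: 'Expenses', 16: 'Price'}
    values = {}
    for i, td in enumerate(soup.find_all('table')[4].find_all('td')):
        if td.string is None:
            break
        if i in fields:
            values[fields[i]] = souphelper.string_to_int(td.string)
    return values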
Example #6
def scrape_site(url):
    print "Now scraping: " + url
    soup = souphelper.get_soup("http://www.cubestat.com/www." + url)
    # Defensive defaults, currently disabled: without them the assignments
    # below were sometimes failing even inside the try block.
    # worth = 0
    # daily_pageviews = 0
    # daily_revenue = 0
    # alexa_rank = 0
    # quantcast_rank = 0
    # compete_rank = 0
    # google_page_rank = 0
    # live_indexed_pages = 0
    # domain_age = 0
    if soup:
        # for span in soup.find_all('span'):
        #     print span
        spans = soup.find_all('span')
        for i in range(len(spans)):
            try:
                # print str(i) + ' : ' + str(souphelper.string_to_int(str(spans[i].string)))
                spans[i] = souphelper.string_to_int(str(spans[i].string))
            except Exception:
                spans[i] = 0

        try:
            # Positional offsets into cubestat.com's span list; brittle.
            worth = spans[0]
            daily_pageviews = spans[1]
            daily_revenue = spans[2]
            alexa_rank = spans[9]
            quantcast_rank = spans[10]
            compete_rank = spans[11]
            google_page_rank = spans[12]
            live_indexed_pages = spans[17]
            domain_age = spans[26]
        except IndexError:
            print "list index out of range, cannot scrape: " + url
            return 0

        try:
            db_query = 'REPLACE INTO cubestat SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s", quantcast_rank="%s", compete_rank="%s", google_page_rank="%s", live_indexed_pages="%s", domain_age="%s"' % (url, worth, daily_pageviews, daily_revenue, alexa_rank, quantcast_rank, compete_rank, google_page_rank, live_indexed_pages, domain_age)
            con.query(db_query)
        except _mysql.Error, e:
            print "Error %d: %s" % (e.args[0], e.args[1])
            return 0
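
The hard-coded indices above (spans[9], spans[26], and so on) fail silently the moment cubestat.com shuffles its markup. Example #2's label matching generalizes better; a sketch of a label-keyed extractor, assuming each statistic's label sits in the same row as its value:

import souphelper  # the assumed helper module

def extract_labeled_stats(soup, labels):
    # Map each label ('Alexa Rank', ...) to the integer in the cell next
    # to it, following Example #2's row scan.
    stats = {}
    for row in soup.find_all('tr'):
        text = str(row)
        for label in labels:
            if label in text and row.td is not None:
                stats[label] = souphelper.string_to_int(str(row.td.next_sibling))
    return stats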
Example #7
def scrape_site(url):
    print 'Now scraping: ' + url
    soup = souphelper.get_soup('http://www.yandalo.com/www.' + url)
    if not soup:
        return 0

    # Each metric's <h1> label is followed by an <h1> holding its value,
    # so pair item i with item i+1; [4:] drops the leading '<h1>' tag.
    h1 = soup.find_all('h1')
    for i in range(len(h1)):
        item = str(h1[i])
        if i < len(h1) - 1:
            value = souphelper.string_to_int(str(h1[i+1])[4:])
        # print item
        if 'Worth' in item:
            worth = value
        if 'Pageviews' in item:
            daily_pageviews = value
        if 'Earnings' in item:
            daily_revenue = value

    h3 = soup.find_all('h3')
    for i in range(len(h3)):
        h3[i] = str(h3[i])[4:]  # drop the leading '<h3>' tag
        # print h3[i]
        # print str(i) + ' : ' + item
    try:
        alexa_rank = souphelper.string_to_int(str(h3[1]))
        google_page_rank = str(h3[11])[80:81]  # FIX ME: fragile fixed offset; see the regex sketch below
        bounce_rate = souphelper.string_to_float(str(h3[15]))  # FIX: this is a percentage
        pageviews_per_user = souphelper.string_to_int(str(h3[5]))  # FIX: this is a decimal
        daily_time_on_site = souphelper.string_to_int(str(h3[6]))  # decimal
        load_time = souphelper.string_to_float(str(h3[9]))  # decimal
        bing_backlinks = souphelper.string_to_int(str(h3[3]))
        inbound_links = souphelper.string_to_int(str(h3[4]))
        visits_from_se = souphelper.string_to_float(str(h3[16]))  # decimal
        if 'Yes' in h3[0]:
            dmoz_directory = 1
        else:
            dmoz_directory = 0
    except Exception:
        print 'Error grabbing data for: ' + url
        return 0

    try:
        db_query = 'REPLACE INTO yandalo SET url="%s", worth="%s", daily_pageviews="%s", daily_revenue="%s", alexa_rank="%s", google_page_rank="%s", bounce_rate="%s", pageviews_per_user="%s", daily_time_on_site="%s", load_time="%s", bing_backlinks="%s", inbound_links="%s", visits_from_se="%s", dmoz_directory="%s"' % (url, worth, daily_pageviews, daily_revenue, alexa_rank, google_page_rank, bounce_rate, pageviews_per_user, daily_time_on_site, load_time, bing_backlinks, inbound_links, visits_from_se, dmoz_directory)
        con.query(db_query)
    except _mysql.Error, e:
        print "Error %d: %s" % (e.args[0], e.args[1])
        return 0
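
The [80:81] slice flagged FIX ME above pulls PageRank from a fixed character offset, which breaks on any markup change and truncates a two-digit rank like 10. A regex over the heading text is sturdier; a sketch, assuming the heading reads something like 'Google Pagerank: 4/10':

import re

def extract_pagerank(heading_text):
    # Match 'N/10' anywhere in the heading; the exact wording on
    # yandalo.com is an assumption, so adjust the pattern to the markup.
    match = re.search(r'(\d{1,2})\s*/\s*10', heading_text)
    return int(match.group(1)) if match else None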