Example #1
0
def calc_slope(asin, category_url):
	"""Calculate the least-squares slope of rank over time for an asin.

	1. Build a list of [x, y] points for the asin: x is days since the
	   first scrape (starting at 0); y is the integer rank.
	2. Accumulate num_of_points, sum_of_x, sum_of_y, sum_of_xy and
	   sum_of_x_squared over those points.
	3. Fit the line: slope = ((n * sum_of_xy) - (sum_of_x * sum_of_y)) /
	   ((n * sum_of_x_squared) - sum_of_x^2)

	Returns the slope as a float, or None when fewer than two data
	points exist for the asin.

	Raises ZeroDivisionError when every point falls on the same day
	(denominator is zero); the accumulated state is printed first to aid
	debugging, then the exception is re-raised.
	"""
	results = dbdo.get_scrape_date_rank(category_url, asin)
	# Need at least two points to fit a line.
	if len(results) < 2:
		return None
	zero_date = results[0][0]
	# x: days since the first scrape; y: rank as an integer.
	datapoints = [[helpers.day_delta(line[0], zero_date), int(line[1])]
				for line in results]
	num_of_points = len(datapoints)
	sum_of_x = 0.0
	sum_of_y = 0.0
	sum_of_xy = 0.0
	sum_of_x_squared = 0.0
	for x, y in datapoints:
		sum_of_x += x
		sum_of_y += y
		sum_of_xy += x * y
		sum_of_x_squared += x * x
	try:
		slope = ((num_of_points * sum_of_xy) - (sum_of_x * sum_of_y)) / \
			((num_of_points * sum_of_x_squared) - (sum_of_x * sum_of_x))
	except ZeroDivisionError:
		# All points share the same x: dump the accumulators so the bad
		# input can be diagnosed, then fail loudly as before.
		print(datapoints)
		print(num_of_points)
		print(sum_of_xy)
		print(sum_of_x)
		print(sum_of_y)
		print(sum_of_x_squared)
		raise
	# BUG FIX: the original computed the slope but never returned it,
	# so every successful call yielded None.
	return slope
Example #2
0
def calc_categories_to_scrape(days=5):
	"""Return [category_url, start_rank] pairs that are due for a scrape.

	A category is queued when its previous scrape attempt did not finish
	(start_rank != 1), when it has never been scraped at all, or when its
	most recent scrape is more than *days* days old.
	"""
	due = []
	results = [['http://www.amazon.com/s/ref=sr_hi_1?rh=n%3A3760901&ie=UTF8']]
	#Scraping only Health & Personal category_results... uncomment line below to scrape all
	#results = dbdo.get_all_category_urls()
	for row in results:
		url = row[0]
		rank = last_rank_scraped(url)
		# An unfinished scrape resumes from the rank it stopped at.
		if rank != 1:
			due.append([url, rank])
			continue
		date_rows = dbdo.get_last_scrape_date(url)
		# No recorded scrape date: never been scraped, so queue it.
		if date_rows == ():
			due.append([url, rank])
			continue
		# Stale: the last scrape happened more than `days` days ago.
		if helpers.day_delta(date_rows[0][0]) < -days:
			due.append([url, rank])
	return due