Exemplo n.º 1
0
def crawl_one_keyword(keyword):
	'''Download and parse the search-results page (SERP) for one keyword.

	The URL built by get_keyword_url(keyword) is fetched, the raw HTML is
	checked with is_blocked() and then handed to parse(). Each parsed link
	dictionary is updated in place with the scraping date, the keyword and
	the result of encode() applied to its link.

	Args:
		keyword: the search term to scrape.

	Returns:
		list of links, a link is a dictionary with keys:
			link, rank, snippet, title, visible_link, date, keyword
		('date' and 'keyword' are added here and 'link' is re-encoded; the
		remaining keys are expected to come from parse().)

	Raises:
		SERPError: if the download raises a requests RequestException, or
			if is_blocked() returns a true value for the downloaded HTML.
	'''
	url = get_keyword_url(keyword)
	logging.debug("trying to download SERP %s", url)
	try:
		# the helper also returns the response headers; they are not used here
		rawhtml, _headers = urlrequest.get_raw_html(url)
	except requests.exceptions.RequestException as e:
		raise SERPError(e)

	# the timestamp is taken right after the download, before any parsing
	date = _date()

	if is_blocked(rawhtml):
		# carry the URL so a block can be diagnosed from the error alone
		raise SERPError("blocked while downloading SERP {}".format(url))

	links = parse(rawhtml)

	# adding scraping information to links
	for link in links:
		link['date'] = date
		link['keyword'] = keyword
		link['link'] = encode(link['link'])

	return links
Exemplo n.º 2
0
	def test_parsing(self):
		'''Fetch a live Bing results page and check that parse() extracts links.

		NOTE: this test performs a real HTTP request, so it needs network
		access and depends on the markup the search engine currently serves.
		'''
		link = "http://www.bing.com/search?q=ahoj&qs=ds&form=QBLH&scope=web"
		# a timeout keeps the test from hanging forever if the server stalls
		html = requests.get(link, timeout=30).text

		links = parse(html)

		# at least one result must have been extracted
		self.assertGreater(len(links), 0)

		first = links[0]
		# checking keys one by one makes a failure name the missing key
		for key in ('link', 'snippet', 'title'):
			self.assertIn(key, first)

		self.assertIn('http', first['link'])