示例#1
0
def crawl_one_keyword(keyword):
	'''Scrape the search engine results page (SERP) for one keyword.

	Args:
		keyword: the search term; it is turned into a URL by
			get_keyword_url() and copied into every returned link.

	Returns:
		list of links as produced by parse(); a link is a dictionary
		with keys:
			link, rank, snippet, title, visible_link, date, keyword
		'date' and 'keyword' are filled in here, and 'link' is replaced
		by encode(link).

	Raises:
		SERPError: if the download fails with a requests exception, or
			if is_blocked() reports the downloaded page as blocked.
	'''
	url = get_keyword_url(keyword)
	# Lazy %-style arguments: the message is only built when DEBUG is enabled.
	logging.debug("trying to download SERP %s", url)
	try:
		# The second element of the returned pair is not needed here.
		rawhtml, _ = urlrequest.get_raw_html(url)
	except requests.exceptions.RequestException as e:
		raise SERPError(e)

	# Taken right after the download, before the blocked check
	# (presumably the scrape timestamp -- confirm against _date()).
	date = _date()

	if is_blocked(rawhtml):
		# Carry the URL so the failure can be diagnosed from the traceback.
		raise SERPError("SERP request was blocked: {}".format(url))

	links = parse(rawhtml)

	# Add scraping information to every link.
	for link in links:
		link['date'] = date
		link['keyword'] = keyword
		link['link'] = encode(link['link'])

	return links
	def _generate(self):
		'''Endlessly yield the words of random Wikipedia page titles.

		Each iteration downloads Special:Random, reads the page <title>,
		removes the trailing site suffix if one is present and yields the
		remaining title one whitespace-separated word at a time.

		Yields:
			one word of a page title per step; the generator never finishes
			on its own.
		'''
		# Site suffixes appended to the page <title>. Only a suffix that is
		# actually present is removed, so a change of the suffix no longer
		# cuts characters out of the real title.
		suffixes = (" - Wikipedia, the free encyclopedia", " - Wikipedia")
		while True:
			x,_ = urlrequest.get_raw_html('https://en.wikipedia.org/wiki/Special:Random')
			pagetitle = BeautifulSoup(x,"lxml").html.head.title.string
			if not pagetitle:
				# No usable title text (None or empty): try another page.
				continue
			title = pagetitle
			for suffix in suffixes:
				if title.endswith(suffix):
					title = title[:-len(suffix)]
					break
			for word in title.split():
				yield word
	def _generate(self):
		'''Endlessly yield the words of random Wikipedia articles.

		Each iteration downloads Special:Random, looks up the div elements
		with id "mw-content-text" and yields the text of the first one, one
		whitespace-separated word at a time. Pages without such a div are
		skipped.

		Yields:
			one word of article text per step; the generator never finishes
			on its own.
		'''
		random_page = "https://en.wikipedia.org/wiki/Special:Random"
		while True:
			page, _headers = urlrequest.get_raw_html(random_page)
			root = BeautifulSoup(page, "lxml").html
			content = root.find_all('div', {"id": "mw-content-text"})
			# An empty result means the article is probably empty: fetch another.
			if content:
				# Text of the article body.
				body_text = content[0].text
				for word in body_text.split():
					yield word