Example #1
def search(keyword):
    ''' 
        search content for keyword
        @author: douzifly
    '''
    if not keyword:
        return
    url = "http://www.hakuzy.com/search.asp"
    keyword = Utils.to_unicode(keyword)
    params = {"searchword": keyword.encode("gbk")}
    html = WebTool.request(url, params, "post")  # TODO: replace with another lib
    if not html:
        Log.write_stderr("ERROR: can't get html")
        return
    Log.write_stdout(html)  # this html only contains the search results, no hash
    parser = HakuzyVideoParser()  # TODO: do not create a parser every time

    # find video links
    cache = VideoCache()
    for url in parser.parse_search_page(html):
        html = WebTool.request(url)
        soup = BeautifulSoup(html)
        video_info = Video_Info()
        video_info.ref_url = url
        parser.parse(soup, video_info)
        Log.write_stdout("###################")
        Log.write_stdout(video_info)
        cache.add(video_info)
        time.sleep(Config.NETWORK_REQUST_INTERVAL)
    cache.flush()  # write left items to persistence
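
WebTool.request is a project-internal helper, and the comment above already suggests swapping it out. Below is a minimal sketch of the same gbk-encoded POST done with the requests library instead; the URL and form field are taken from the code above, while fetch_search_page, the timeout value, and the assumption that the response page is also gbk-encoded are hypothetical:

import requests

def fetch_search_page(keyword):
    # Same endpoint and form field as search() above; the site expects gbk.
    url = "http://www.hakuzy.com/search.asp"
    params = {"searchword": keyword.encode("gbk")}
    resp = requests.post(url, data=params, timeout=10)  # timeout is an assumption
    resp.encoding = "gbk"  # assumed: the result page is gbk-encoded as well
    return resp.text if resp.ok else None
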
Example #2
File: Koala.py  Project: ifels/pySpider
	def add_next_entry(self, nextEntries):
		'''
		Add NextEntry records to the database, ignoring duplicates.

		@param nextEntries: list of NextEntry records
		@type nextEntries: python sequence type

		@return: None
		'''
		for entryURL in nextEntries:
			doc = dict()
			doc[KoalaStatus.DOC_FIELD_HASH] = Utils.hash(entryURL)
			doc[KoalaStatus.DOC_FIELD_URL] = entryURL
			try:
				self.collNextEntry.insert(doc, safe=True)
			except pymongo.errors.DuplicateKeyError as error:
				Log.write_stderr(repr(error))
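
The DuplicateKeyError handling above only de-duplicates if the collection rejects repeated hashes, which implies a unique index on the hash field; that index setup is an assumption here, not shown in the project code. A minimal sketch of the pattern, using the modern insert_one call rather than the older insert(..., safe=True) above, with hypothetical collection and field names standing in for collNextEntry and DOC_FIELD_HASH:

import pymongo

client = pymongo.MongoClient("localhost", 27017)
coll = client["koala"]["next_entry"]
coll.create_index("hash", unique=True)  # assumed: duplicates are rejected by this index

for entryURL in ["http://example.com/a", "http://example.com/a"]:
    try:
        # The builtin hash() stands in for the project's Utils.hash()
        coll.insert_one({"hash": hash(entryURL), "url": entryURL})
    except pymongo.errors.DuplicateKeyError as error:
        print(repr(error))  # the second, duplicate insert is ignored
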
Example #3
    url = "http://www.hakuzy.com/search.asp"
    keyword = Utils.to_unicode(keyword)
    params = {"searchword": keyword.encode("gbk")}
    html = WebTool.request(url, params, "post") # replace with other lib
    if not html:
        Log.write_stderr('ERROR:cant get html')
        return
    Log.write_stdout(html) # this html only contain search result, no hash
    parser = HakuzyVideoParser() # do not create parse every time

    # find video links
    cache = VideoCache()
    for url in parser.parse_search_page(html):
        html = WebTool.request(url)
        soup = BeautifulSoup(html)
        video_info = Video_Info()
        video_info.ref_url = url
        parser.parse(soup, video_info)
        Log.write_stdout("###################")
        Log.write_stdout(video_info)
        cache.add(video_info)
        time.sleep(Config.NETWORK_REQUST_INTERVAL)
    cache.flush()  # write left items to persistence


if __name__ == "__main__":
    #start_crawl()
    search("青春")
    Log.write_stderr('finished')

Example #4
File: Koala.py  Project: ifels/pySpider
	def __crawl_proc(self, entryURL, maxDepth):
		'''
		The crawl execution procedure.

		@param entryURL: entry url of the crawler
		@type entryURL: string
		@param maxDepth: maximum crawl depth
		@type maxDepth: integer

		@return: urls that satisfy the filter conditions
		@rtype: string
		'''
		# Return once the maximum depth has been reached
		if maxDepth <= 0:
			return

		# Parse all of the links out of the page
		try:
			source = Utils.get_url_html(entryURL)
			soup = BeautifulSoup(source, Config.DEFAULT_HTML_PARSER)
			if self.__yield_filter(entryURL):
				self.__parse(soup)
		except Exception as error:
			Log.write_stderr(repr(error))
			return
		links = list()
		for elemA in soup.find_all('a'):
			try:
				links.append(elemA['href'])
			except KeyError as error:
				#Log.write_stderr(repr(error))
				pass

		# Yield the links that match the rules, and record the matching sub-pages
		nextEntries = list()
		for link in links:
			url = urlparse.urljoin(entryURL, link)
			if self.__global_filter(entryURL, url):
				if self.__yield_filter(url):
					yield url
				if self.__entry_filter(url):
					nextEntries.append(url)

		# Reaching this point means one (sub-)page (EntryURL) has been fully processed

		# Record it in the set of visited pages. For performance, store the url's hash rather than the url itself
		self.visitedEntriesHash.add(Utils.hash(entryURL))

		# If status support is enabled, also remove the corresponding NextEntry records from the database (if any)
		if self.koalaStatus:
			self.koalaStatus.remove_next_entry([entryURL])

		# If the maximum depth is about to be reached, skip the sub-pages for performance
		if maxDepth - 1 <= 0:
			return
		else:
			# Before descending into the sub-pages, update the status accordingly
			if self.koalaStatus:
				self.koalaStatus.add_next_entry(nextEntries)

			# Breadth-first crawling
			for nextEntryURL in nextEntries:
				if Utils.hash(nextEntryURL) not in self.visitedEntriesHash:
					for url in self.__crawl_proc(nextEntryURL, maxDepth - 1):
						yield url
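
__crawl_proc is a generator: each yield hands back one url that passed __yield_filter, and the recursive call re-yields urls found in the sub-pages, so a caller only needs to iterate over it. The crawl wrapper below is a hypothetical sketch of such a driver (defined on the same class, since the name is private), not pySpider's actual public API:

	# Hypothetical public wrapper around the private generator above.
	def crawl(self, entryURL, maxDepth=3):
		for url in self.__crawl_proc(entryURL, maxDepth):
			yield url

	# Hypothetical usage, given an instance named spider:
	# for url in spider.crawl("http://www.example.com/", 2):
	#     Log.write_stdout(url)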