def search(keyword):
    '''
    search content for keyword
    @author: douzifly
    '''
    if not keyword:
        return
    url = "http://www.hakuzy.com/search.asp"
    keyword = Utils.to_unicode(keyword)
    params = {"searchword": keyword.encode("gbk")}
    html = WebTool.request(url, params, "post")  # replace with another lib if needed
    if not html:
        Log.write_stderr("ERROR: can't get html")
        return
    Log.write_stdout(html)  # this html only contains the search results, no hash

    parser = HakuzyVideoParser()  # TODO: do not create a parser every time
    cache = VideoCache()
    # find video links in the search results and parse each detail page
    for video_url in parser.parse_search_page(html):
        page_html = WebTool.request(video_url)
        soup = BeautifulSoup(page_html)
        video_info = Video_Info()
        video_info.ref_url = video_url
        parser.parse(soup, video_info)
        Log.write_stdout("###################")
        Log.write_stdout(video_info)
        cache.add(video_info)
        time.sleep(Config.NETWORK_REQUST_INTERVAL)
    cache.flush()  # write the remaining items to persistence
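# The "replace with another lib" note above refers to the HTTP layer behind
# WebTool.request. Below is a minimal, hypothetical sketch of what such a
# wrapper could look like using the `requests` package; the (url, params,
# method) shape mirrors the call sites above, everything else is an assumption
# and not the project's actual WebTool implementation.
import requests

def _request_sketch(url, params=None, method="get", timeout=10):
    '''Fetch a page and return its body as text, or None on failure.'''
    try:
        if method.lower() == "post":
            resp = requests.post(url, data=params, timeout=timeout)
        else:
            resp = requests.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        return resp.text
    except requests.RequestException:
        return None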
def add_next_entry(self, nextEntries):
    '''
    Add NextEntry records to the database, ignoring duplicates.
    @param nextEntries: the NextEntry list
    @type nextEntries: a Python sequence
    @return: None
    '''
    for entryURL in nextEntries:
        doc = dict()
        doc[KoalaStatus.DOC_FIELD_HASH] = Utils.hash(entryURL)
        doc[KoalaStatus.DOC_FIELD_URL] = entryURL
        try:
            self.collNextEntry.insert(doc, safe=True)
        except pymongo.errors.DuplicateKeyError as error:
            Log.write_stderr(repr(error))
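# The DuplicateKeyError branch above only fires if the NextEntry collection
# enforces uniqueness on the hash field. The snippet below is a hypothetical
# setup sketch (where it would run, e.g. in KoalaStatus's constructor, is an
# assumption); it uses the pymongo 2.x-style API to match insert(doc, safe=True).
def _ensure_next_entry_index(self):
    '''Create a unique index on the hash field so duplicate entry URLs are
    rejected by the database instead of being checked in application code.'''
    self.collNextEntry.ensure_index(
        [(KoalaStatus.DOC_FIELD_HASH, pymongo.ASCENDING)],
        unique=True)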
url = "http://www.hakuzy.com/search.asp" keyword = Utils.to_unicode(keyword) params = {"searchword": keyword.encode("gbk")} html = WebTool.request(url, params, "post") # replace with other lib if not html: Log.write_stderr('ERROR:cant get html') return Log.write_stdout(html) # this html only contain search result, no hash parser = HakuzyVideoParser() # do not create parse every time # find video links cache = VideoCache() for url in parser.parse_search_page(html): html = WebTool.request(url) soup = BeautifulSoup(html) video_info = Video_Info() video_info.ref_url = url parser.parse(soup, video_info) Log.write_stdout("###################") Log.write_stdout(video_info) cache.add(video_info) time.sleep(Config.NETWORK_REQUST_INTERVAL) cache.flush() # write left items to persistence if __name__ == "__main__": #start_crawl() search("青春") Log.write_stderr('finished')
def __crawl_proc(self, entryURL, maxDepth):
    '''
    Execute one crawl pass.
    @param entryURL: entry url for the crawler
    @type entryURL: string
    @param maxDepth: maximum crawl depth
    @type maxDepth: integer
    @return: urls that satisfy the filter conditions
    @rtype: string
    '''
    # Stop once the maximum depth is reached
    if maxDepth <= 0:
        return

    # Parse all links out of the page
    try:
        source = Utils.get_url_html(entryURL)
        soup = BeautifulSoup(source, Config.DEFAULT_HTML_PARSER)
        if self.__yield_filter(entryURL):
            self.__parse(soup)
    except Exception as error:
        Log.write_stderr(repr(error))
        return

    links = list()
    for elemA in soup.find_all('a'):
        try:
            links.append(elemA['href'])
        except KeyError as error:
            # skip <a> tags without an href attribute
            # Log.write_stderr(repr(error))
            pass

    # Yield the links that match the rules, and record qualifying sub-pages
    nextEntries = list()
    for link in links:
        url = urlparse.urljoin(entryURL, link)
        if self.__global_filter(entryURL, url):
            if self.__yield_filter(url):
                yield url
            if self.__entry_filter(url):
                nextEntries.append(url)

    # Reaching this point means one (sub-)page (entryURL) has been fully
    # processed, so record it in the visited set. For performance, store the
    # url's hash instead of the url itself.
    self.visitedEntriesHash.add(Utils.hash(entryURL))
    # If status support is enabled, also remove the matching NextEntry
    # record from the database (if there is one)
    if self.koalaStatus:
        self.koalaStatus.remove_next_entry([entryURL])

    # If the maximum depth is about to be reached, skip the sub-pages
    # for performance reasons
    if maxDepth - 1 <= 0:
        return
    else:
        # Synchronize status before descending into sub-pages
        if self.koalaStatus:
            self.koalaStatus.add_next_entry(nextEntries)
        # Breadth-first crawl
        for nextEntryURL in nextEntries:
            if Utils.hash(nextEntryURL) not in self.visitedEntriesHash:
                for url in self.__crawl_proc(nextEntryURL, maxDepth - 1):
                    yield url
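# __crawl_proc is a generator: it yields urls that pass __yield_filter,
# breadth-first, down to maxDepth levels. Below is a hypothetical public
# wrapper and usage sketch; the crawl() name, its default depth, and the
# caller code are assumptions, only __crawl_proc and its parameters come
# from the code above.
def crawl(self, entryURL, maxDepth=3):
    '''Yield every url that satisfies the filters within maxDepth levels.'''
    for url in self.__crawl_proc(entryURL, maxDepth):
        yield url

# Assumed caller code:
# for url in crawler.crawl("http://www.hakuzy.com/", maxDepth=2):
#     Log.write_stdout(url)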