def getPOI2(q, region):
    q = q.encode('utf-8')
    region = region.encode('utf-8')
    L = []
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    #ak = u'skS8wg9wP1VVFk2iuDuQATzoWKMb8FuY'
    #ak = u'AKsr88dgGDK8d74q7wTRbhiSb567HVmA'
    q = urllib2.quote(q)
    region = urllib2.quote(region)
    #hd = HtmlDownloader.HtmlDownloader()
    # Example request (the original comment had "&reg" mangled into "®"):
    # http://api.map.baidu.com/place/v2/search?query=购物中心&region=天津&city_limit=true&output=json&ak=BjZFyCBFktfZmdj7SVP98fEFx78KzFn4&page_num=0
    baseUrl = ('http://api.map.baidu.com/place/v2/search?query=%s&region=%s'
               '&city_limit=true&output=json&ak=%s&page_num=') % (q, region, ak)
    page = 0
    total = 1
    while page * 10 < total:
        url = baseUrl + unicode(str(page), 'utf-8')
        print url
        res = HtmlDownloader.download(url)
        while res is None:
            res = HtmlDownloader.download(url)
            print "retrying...", url
        res = unicode(res, 'utf-8')
        data = JsonUtils.readStr(res)
        status = data[u'status']
        message = data[u'message']
        if status == 0 and message == 'ok':  # request succeeded
            L.extend(data[u'results'])
            total = data[u'total']
            page = page + 1
        else:  # request failed
            print u"query failed", message
            return L
    return L
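# Hedged usage sketch for getPOI2 above: a minimal driver, assuming the
# HtmlDownloader and JsonUtils modules are importable as used in the function.
# The query and region values are illustrative, taken from the example URL comment.
if __name__ == '__main__':
    pois = getPOI2(u'购物中心', u'天津')  # "shopping mall" in "Tianjin"
    print len(pois)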
class SpiderMan:
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        urls = self.parser.parser_url(root_url, content)
        # build the URL that fetches each movie's rating and box-office data
        for url in urls:
            try:
                t = time.strftime('%Y%m%d%H%M%S3282', time.localtime())
                rank_url = ('http://service.library.mtime.com/Movie.api'
                            '?Ajax_CallBack=true'
                            '&Ajax_CallBackType=Mtime.Library.Service'
                            '&Ajax_CallBackMethod=GetMovieOverviewRating'
                            '&Ajax_CrossDomain=1'
                            '&Ajax_RequestUrl=%s'
                            '&t=%s'
                            '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1]))
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print(e, 'Crawl failed')
        self.output.output_end()
        print('Crawl finish')
def _getWikiBooks(enWikiUrls):
    wikiBooks = []
    for enWikiUrl in enWikiUrls:
        print('Getting wiki books ', enWikiUrl)
        wikiBook = _WikiBook()
        wikiBook.Pages = []
        wikiBookPage = _WikiBookPage()
        wikiBookPage.Language = 'EN'
        wikiBookPage.WikiUrl = enWikiUrl
        wikiBook.Pages.append(wikiBookPage)
        html = HtmlDownloader.DownloadHtml('en.wikipedia.org', enWikiUrl)
        soup = BeautifulSoup(html)
        ruLinkElement = soup.find('li', attrs={'class': 'interlanguage-link interwiki-ru'})
        if ruLinkElement is not None:
            wikiBookPage = _WikiBookPage()
            wikiBookPage.Language = 'RU'
            wikiBookPage.WikiUrl = ruLinkElement.find('a')['href'].replace('//ru.wikipedia.org', '')
            wikiBook.Pages.append(wikiBookPage)
        deLinkElement = soup.find('li', attrs={'class': 'interlanguage-link interwiki-de'})
        if deLinkElement is not None:
            wikiBookPage = _WikiBookPage()
            wikiBookPage.Language = 'DE'
            wikiBookPage.WikiUrl = deLinkElement.find('a')['href'].replace('//de.wikipedia.org', '')
            wikiBook.Pages.append(wikiBookPage)
        wikiBooks.append(wikiBook)
    return wikiBooks
def getPOI(key, radius, lat, lng):
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    #ak = u'skS8wg9wP1VVFk2iuDuQATzoWKMb8FuY'
    #ak = u'AKsr88dgGDK8d74q7wTRbhiSb567HVmA'
    key = urllib2.quote(key)
    hd = HtmlDownloader.HtmlDownloader()
    ju = JsonUtils.JsonUtils()
    url = ('http://api.map.baidu.com/place/v2/search?scope=2&query=%s'
           '&location=%s,%s&radius=%s&output=json&ak=%s&page_size=20'
           % (key, lat, lng, radius, ak))
    #print url
    res = hd.download(url)
    while res is None:
        res = hd.download(url)
        print "retrying...", url
    res = unicode(res, 'utf-8')
    data = ju.readStr(res)
    status = data[u'status']
    total = u'0'
    L = []
    sss = u""
    index = 1
    if status == 0:  # request succeeded
        for poi in data[u'results']:
            name = poi[u'name']
            location = poi[u'location']
            address = poi[u'address']
            location = unicode(ju.writeJson2Str(location), 'utf-8')
            sss = sss + unicode(str(index), 'utf-8') + u":" + name + u"——" + address + u"——" + location
            index = index + 1
        return data, sss
    else:
        return None
def getPos(place, city):
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    #ak = u'skS8wg9wP1VVFk2iuDuQATzoWKMb8FuY'
    #ak = u'AKsr88dgGDK8d74q7wTRbhiSb567HVmA'
    d = {}
    place = urllib2.quote(place)
    city = urllib2.quote(city)
    hd = HtmlDownloader.HtmlDownloader()
    ju = JsonUtils.JsonUtils()
    url = u'http://api.map.baidu.com/geocoder/v2/?address=%s&output=json&ak=%s&city=%s' % (place, ak, city)
    print url
    res = hd.download(url)
    res = unicode(res, 'utf-8')
    data = ju.readStr(res)
    status = data[u'status']
    if status == 0:
        lng = data[u'result'][u'location'][u'lng']
        lat = data[u'result'][u'location'][u'lat']
        d[u"lng"] = unicode(str(lng), "utf-8")
        d[u"lat"] = unicode(str(lat), 'utf-8')
        return d
    else:
        print url
        return None
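# Hedged sketch chaining the two Baidu APIs above: geocode a place with getPos,
# then search POIs around that point with getPOI. The place names and radius are
# illustrative, and the helper modules are assumed importable as used above.
def _demoPosThenPoi():
    pos = getPos(u'天安门', u'北京')  # "Tiananmen", "Beijing"
    if pos is not None:
        data, summary = getPOI(u'餐厅', u'1000', pos[u'lat'], pos[u'lng'])  # "restaurant", 1000 m
        print summary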
class SpiderMan(object):
    """docstring for SpiderMan"""
    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        # seed the URL manager with the entry URL
        self.manager.add_new_url(root_url)
        # loop while the URL manager has new URLs and fewer than 100 have been crawled
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                # fetch a new URL from the URL manager
                new_url = self.manager.get_new_url()
                # download the page with the HTML downloader
                html = self.downloader.download(new_url)
                # extract data and links with the HTML parser
                new_urls, data = self.parser.parser(new_url, html)
                # feed the extracted URLs back into the URL manager
                self.manager.add_new_urls(new_urls)
                # persist the extracted data
                self.output.store_data(data)
                print("crawled %s links so far" % self.manager.old_url_size())
            except Exception as e:
                print("crawl failed:", e)
        # write the stored data out in the target format
        self.output.output_html()
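# Hedged usage sketch for the SpiderMan class above; the root URL is an assumed
# Baidu Baike entry page (which is what this UrlManager/HtmlParser pipeline
# appears to target), not taken from the original source.
if __name__ == '__main__':
    spider = SpiderMan()
    spider.crawl('http://baike.baidu.com/view/284853.htm')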
def __init__(self):
    '''Initialize each component module.'''
    self.DA = DataArranger.DataArranger()
    self.HD = HtmlDownloader.HtmlDownloader()
    self.HP = HtmlParser.HtmlParser()
    self.UM = UrlManager.UrlManager()
def __init__(self):
    # Set up this worker node's connection in the distributed crawler.
    # Step 1: register the names of the methods that expose the Queues
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    # Step 2: connect to the server
    server_addr = '127.0.0.1'
    print('Connect to server %s...' % server_addr)
    # the port and authkey must exactly match the server process settings
    # (authkey must be bytes in Python 3, hence b'baike')
    self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
    # connect over the network
    self.m.connect()
    # Step 3: obtain the Queue objects
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    # initialize the page downloader and parser
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    print('init finish.')
def __init__(self):
    # crawl depth (number of pages)
    self.maxPageDeep = 1
    # URL manager
    self.UrlsManager = UrlManager.UrlManager()
    # downloader
    self.Downloader = HtmlDownloader.HtmlDownloader()
    # parser
    self.Parser = HtmlParser.HtmlParser()
    # outputer
    self.Outputer = HtmlOutputer.HtmlOutputer()
def _getLowereadBookPages(bookId):
    pages = []
    pageNum = 1
    while True:
        html = HtmlDownloader.DownloadHtml(
            'loveread.ws', '/read_book.php?id=' + bookId + '&p=' + str(pageNum))
        if str(pageNum) != _getLowereadPageNumber(html):
            break
        pages.append(html)
        pageNum += 1
    return pages
def getSongs(self):
    L = []
    urls = self.getUrls()
    for url in urls:
        hd = HtmlDownloader.HtmlDownloader()
        html_cont = hd.download(url)
        while html_cont is None:
            print u"retrying...", url  # the original printed the builtin `id` instead of the URL
            html_cont = hd.download(url)
        res = self.parseJson(html_cont)
        L.extend(res)
    return L
def _getGutenbergSpiegelPages(bookId):
    pages = []
    pageNum = 1
    while True:
        pageHtml = HtmlDownloader.DownloadHtml(
            'gutenberg.spiegel.de', '/buch/' + bookId + '/' + str(pageNum))
        if pageNum > 1 and _isGutenbergSpiegelPageEmpty(pageHtml):
            break
        pages.append(pageHtml)
        pageNum += 1
    return pages
def crawl():
    try:
        global count, mutex
        if mutex.acquire():
            count += 1
            new_url = url.get_new_url()
            print('crawling item ' + str(count) + ': ' + new_url)
            mutex.release()
        html = downloader.download(new_url)
        url_list = parser.parser(html)
        url.add_new_urls(url_list)
    except:
        print('unknown exception')
def __init__(self):
    #self.login = login()
    self.urls = UrlManager.url_manager()
    self.downloader = HtmlDownloader.htmldownloader()
    self.parser = HtmlParser.htmlparser()
    self.imgdownloader = ImgDownloader.imgdownloader()
    self.url_list = self.get_url_list()
    self.url_list_num = len(self.url_list)
    self.url_list_cnt = 0
    self.img_list = None
    self.img_list_num = 0
    self.img_list_cnt = 0
def _getReadCentraChapterPaths(path):
    chapterPaths = []
    html = HtmlDownloader.DownloadHtml('www.readcentral.com', path)
    soup = BeautifulSoup(html)
    tdElements = soup.find_all('td', attrs={'class': 'bookindex'})
    for tdElement in tdElements:
        linkElement = tdElement.find('a')
        if linkElement is not None:
            path = linkElement['href']
            chapterPaths.append(path)
    chapterPaths.sort()
    return chapterPaths
def _getGutenbergSpiegelBookId(title):
    bookId = None
    title = title.lower()
    indexPageHtml = HtmlDownloader.DownloadHtml('gutenberg.spiegel.de', '/buch')
    indexPageSoup = BeautifulSoup(indexPageHtml)
    booksElements = indexPageSoup.find('div', attrs={'id': 'spTeaserColumn'}).find_all('a')
    for bookElement in booksElements:
        if bookElement.string.strip().lower() == title:  # title is already lowercased above
            bookId = bookElement['href'].split('/')[2]
    return bookId
def _getPathToReadcentralBook(title):
    path = None
    titleFirstLetter = title[0]
    html = HtmlDownloader.DownloadHtml(
        'www.readcentral.com', '/read-online-books/' + titleFirstLetter)
    soup = BeautifulSoup(html)
    tdElements = soup.find_all('td', attrs={'class': 'bookindex'})
    for tdElement in tdElements:
        linkElement = tdElement.find('a')
        linkText = linkElement.string.strip()
        if linkText.lower() == title.lower():
            path = linkElement['href']
    return path
def __init__(self):
    # Set up this worker node's connection in the distributed crawler.
    # Step 1: register the names of the methods that expose the Queues
    BaseManager.register('get_task_queue')
    BaseManager.register('get_result_queue')
    # Step 2: connect to the server
    server_addr = '127.0.0.1'
    print('Connect to server %s...' % server_addr)
    # the port and authkey must exactly match the server process settings
    self.m = BaseManager(address=(server_addr, 8001), authkey='baike'.encode('utf-8'))
    # connect over the network
    self.m.connect()
    # Step 3: obtain the Queue objects
    self.task = self.m.get_task_queue()
    self.result = self.m.get_result_queue()
    # initialize the page downloader and parser
    self.downloader = HtmlDownloader.HtmlDownloader()
    self.parser = HtmlParser.HtmlParser()
    print('init finish')
def _getLowereadBookId(title):
    bookId = None
    title = title.lower()
    titleFirstLetter = title[0]
    # letter_nav.php indexes books by the 1-based position of the title's first
    # letter in the Cyrillic alphabet ('а' below is Cyrillic U+0430)
    html = HtmlDownloader.DownloadHtml(
        'loveread.ws',
        '/letter_nav.php?let=' + str((ord(titleFirstLetter) - ord('а')) + 1))
    soup = BeautifulSoup(html)
    booksElement = soup.find('ul', attrs={'class': 'let_ul'})
    if booksElement is not None:
        for bookElement in booksElement.find_all('li'):
            if bookElement.a.string.strip().lower() == title:  # title is already lowercased above
                bookId = bookElement.a['href'].split('id=')[1]
    return bookId
class SpiderWork:
    def __init__(self):
        # Set up this worker node's connection in the distributed crawler.
        # Step 1: register the names of the methods that expose the Queues
        BaseManager.register('get_task_queue')
        BaseManager.register('get_result_queue')
        # Step 2: connect to the server
        server_addr = '127.0.0.1'
        print('Connect to server %s...' % server_addr)
        # the port and authkey must exactly match the server process settings
        # (authkey must be bytes in Python 3, hence b'baike')
        self.m = BaseManager(address=(server_addr, 8001), authkey=b'baike')
        # connect over the network
        self.m.connect()
        # Step 3: obtain the Queue objects
        self.task = self.m.get_task_queue()
        self.result = self.m.get_result_queue()
        # initialize the page downloader and parser
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        print('init finish.')

    def crawl(self):
        while True:
            try:
                if not self.task.empty():  # the original had a typo: emtpy()
                    url = self.task.get()
                    if url == 'end':
                        print('control node told this worker to stop...')
                        # relay the stop signal to the other nodes
                        self.result.put({'new_urls': 'end', 'data': 'end'})
                        return
                    print('worker node is parsing: %s' % url.encode('utf-8'))
                    content = self.downloader.download(url)
                    new_urls, data = self.parser.parser(url, content)
                    self.result.put({'new_urls': new_urls, 'data': data})
            except EOFError:
                print('failed to connect to the worker queue')
                return
            except Exception as e:
                print(e)
                print('Crawl fail.')
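# Hedged sketch of the control node that SpiderWork connects to: a minimal
# BaseManager server exposing the two queues on 127.0.0.1:8001 with authkey
# b'baike', mirroring the registration names used above. The queue wiring here
# is an assumption for illustration, not the original control-node code.
import queue
from multiprocessing.managers import BaseManager

task_q = queue.Queue()
result_q = queue.Queue()

class QueueManager(BaseManager):
    pass

QueueManager.register('get_task_queue', callable=lambda: task_q)
QueueManager.register('get_result_queue', callable=lambda: result_q)

if __name__ == '__main__':
    manager = QueueManager(address=('127.0.0.1', 8001), authkey=b'baike')
    manager.start()                      # serve the queues from a child process
    manager.get_task_queue().put('end')  # e.g. tell one worker to stop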
class SpiderMan(object):
    def __init__(self):
        self.manager = URLManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        while (self.manager.has_new_url() and
               self.manager.old_url_size() < 100):
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                # the original called self.parser(new_url), which is not callable
                new_urls, data = self.parser.parser(new_url, html)
                # the original passed the URL set to add_new_url (singular)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('crawled {} links so far'.format(self.manager.old_url_size()))
            except Exception:
                print('crawl failed')
        self.output.output_html()
def getPOI(key, radius, lat, lng):
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    key = urllib2.quote(key)
    hd = HtmlDownloader.HtmlDownloader()
    ju = JsonUtils.JsonUtils()
    url = ('http://api.map.baidu.com/place/v2/search?query=%s&location=%s,%s'
           '&radius=%s&output=json&ak=%s&page_size=20'
           % (key, lat, lng, radius, ak))
    #print url
    res = hd.download(url)
    res = unicode(res, 'utf-8')
    data = ju.readStr(res)
    status = data[u'status']
    total = u'0'
    L = []
    ress = []  # initialized so the return below cannot raise NameError on failure
    if status == 0:
        total = data[u'total']
        # the original line `total, unicode(str(total), 'utf-8')` was a no-op;
        # the assignment below is presumably what was intended
        total = unicode(str(total), 'utf-8')
        ress = data[u'results']
        for res in ress:
            L.append(res['name'])
    return total, len(ress), L
def _getReadcentralBookChapter(path):
    chapter = Library.BookChapter()
    chapter.Paragraphs = []
    html = HtmlDownloader.DownloadHtml('www.readcentral.com', path)
    soup = BeautifulSoup(html)
    pageheadElem = soup.find('div', attrs={'id': 'pagehead'})
    chapter.Title = pageheadElem.div.string.strip()
    contentElement = soup.find(
        'div', attrs={'id': 'ctl00_contents_book_chapter_content_area'})
    for paragraphElement in contentElement.find_all('p'):
        paragraphString = ''.join(paragraphElement.strings)
        #paragraphString = re.sub('<[^>]+>', '', paragraphString)
        if paragraphString:  # ''.join() never returns None, so test for non-empty instead
            paragraph = Library.BookParagraph()
            paragraph.Sentences = getEnSentencesFromParagraphString(paragraphString)
            if len(paragraph.Sentences) != 0:
                chapter.Paragraphs.append(paragraph)
    return chapter
def getPos(place, city):
    ak = u'BjZFyCBFktfZmdj7SVP98fEFx78KzFn4'
    d = {}
    place = urllib2.quote(place)
    city = urllib2.quote(city)
    hd = HtmlDownloader.HtmlDownloader()
    ju = JsonUtils.JsonUtils()
    url = u'http://api.map.baidu.com/geocoder/v2/?address=%s&output=json&ak=%s&city=%s' % (place, ak, city)
    #print url
    res = hd.download(url)
    res = unicode(res, 'utf-8')
    data = ju.readStr(res)
    status = data[u'status']
    if status == 0:
        lng = data[u'result'][u'location'][u'lng']
        lat = data[u'result'][u'location'][u'lat']
        d[u"经度"] = unicode(str(lng), "utf-8")  # longitude (the original key 精度, "precision", was a typo)
        d[u"纬度"] = unicode(str(lat), 'utf-8')  # latitude
        return d
    else:
        return None
def _getBooks(wikiBooks):
    parallelBooks = []
    print(len(wikiBooks))
    for i, wikiBook in enumerate(wikiBooks):
        print(str(i))
        parallelBook = ParallelBook()
        parallelBook.Books = []
        for wikiBookPage in wikiBook.Pages:
            print(wikiBookPage.WikiUrl)
            book = Book()
            book.Language = wikiBookPage.Language
            html = HtmlDownloader.DownloadHtml(
                wikiBookPage.Language.lower() + '.wikipedia.org', wikiBookPage.WikiUrl)
            soup = BeautifulSoup(html)
            # the original passed attrs={'id', 'firstHeading'} (a set); a dict is needed
            headingElement = soup.find('h1', attrs={'id': 'firstHeading'})
            book.Title = headingElement.text
            book.Title = re.sub(r'\(.+\)', '', book.Title)
            parallelBook.Books.append(book)
        parallelBooks.append(parallelBook)
    return parallelBooks
def _getEnWikiUrls():
    wikiUrls = []
    # Earlier attempt using http.client + ElementTree, kept for reference:
    # conn = http.client.HTTPConnection("en.wikipedia.org")
    # conn.request("GET", "/wiki/The_100_Best_Books_of_All_Time")
    # response = conn.getresponse()
    # data = response.read()
    # root = ET.fromstring(data.decode('utf-8'))
    # table = root.find('.//table[@class="wikitable sortable"]')
    # rows = table.findall('tr')
    # for row in rows:
    #     bookElem = row.find('td/i/a')
    #     if bookElem != None:
    #         wikiUrl = bookElem.get('href')
    #         wikiUrls.append(wikiUrl)
    html = HtmlDownloader.DownloadHtml('en.wikipedia.org', '/wiki/100_Classic_Book_Collection')
    with codecs.open('1.html', 'w', 'utf-8') as target:
        target.write(html)
    soup = BeautifulSoup(html)
    tableElements = soup.find_all('table', attrs={'class': 'wikitable sortable'})
    for tableElement in tableElements:
        rowElements = tableElement.find_all('tr')
        for rowElement in rowElements:
            dataElement = rowElement.find('td')
            if dataElement is not None:
                bookElement = dataElement.find('i').find('a')
                if bookElement is not None:
                    wikiUrl = bookElement['href']
                    wikiUrls.append(wikiUrl)
    return list(set(wikiUrls))
def __init__(self):
    self.urls = UrlManager.UrlManager()
    self.downloader = HtmlDownloader.HtmlDownloader()
    self.parser = HtmlParser.HtmlParser()
    self.outputer = HtmlOutputer.HtmlOutputer()
def __init__(self):
    self.UM = UM.UrlManager()
    self.HD = HD.HtmlDownloader()
    self.HP = HP.HtmlParser()
    self.DA = DA.DataArrange()
def __init__(self):
    self.urlManager = UrlManager.UrlManager()
    self.htmlParse = HtmlParse()
    self.htmlDownload = HtmlDownloader.HtmlDownload()
    self.dataSave = DataSave.DataSave()
def __init__(self):
    self.manager = UrlManager.UrlManager()
    self.downloader = HtmlDownloader.HtmlDownloader()
    self.parser = HtmlParser.HtmlParser()
    self.output = DataOutput.DataOutput()
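# All of the snippets above assume an HtmlDownloader with either an instance
# method download(url) or a module-level DownloadHtml(host, path). A minimal
# sketch of that assumed interface using requests; the header, timeout, and
# encoding details are guesses, not the original implementation:
import requests

class HtmlDownloader(object):
    def download(self, url):
        # return the page body as text, or None on failure (callers above retry on None)
        try:
            r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
            if r.status_code == 200:
                r.encoding = 'utf-8'
                return r.text
        except requests.RequestException:
            pass
        return None

def DownloadHtml(host, path):
    # module-level variant used by the wiki/loveread/readcentral snippets
    return HtmlDownloader().download('http://' + host + path)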