def _taskHandler(self, url):
    """ Fetch the page at the given URL and apply the corresponding access control. """
    print "Visiting : " + url
    webPage = WebPage(url)
    # fetch the page content
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        # extract the sticky topics on the group homepage
        match_obj = REGroup.match(url)
        if match_obj is not None:
            group_id = match_obj.group(1)
            # add the list of sticky topics
            self._addStickTopic(webPage)
            return True
        # extract the regular discussion pages
        match_obj = REDiscussion.match(url)
        if match_obj is not None:
            group_id = match_obj.group(1)
            start = int(match_obj.group(2))
            self._addTopicLink(webPage, start)
            return True
        log.error("Unexpected URL format while crawling the group discussion list. Group ID: %s, URL: %s" % (self.group_id, url))
    # if page reading fails
    self.failed_href.add(url)
    return False

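# The handler above relies on two module-level patterns, REGroup and REDiscussion,
# that are defined elsewhere in the project. A minimal sketch of plausible
# definitions, inferred from how group(1) and group(2) are used above; the actual
# patterns may differ:
import re

# group homepage, e.g. http://www.douban.com/group/python/
REGroup = re.compile(r'^http://www\.douban\.com/group/([^/]+)/?$')
# paged discussion list, e.g. http://www.douban.com/group/python/discussion?start=25
REDiscussion = re.compile(r'^http://www\.douban\.com/group/([^/]+)/discussion\?start=(\d+)')
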
def task_handler(topic_id, seg_list):
    f = codecs.open('tables/TopicInfo-title.txt', 'a', 'utf-8')
    url = 'http://www.douban.com/group/topic/' + topic_id + '/'
    print 'Visiting: ', url
    webPage = WebPage(url)
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        # pageSource is already unicode
        page = etree.HTML(pageSource)
        content = page.xpath(u"/html/body/div[@id='wrapper']/div[@id='content']")[0]
        tmp = page.xpath(u"//table[@class='infobox']//td[@class='tablecc']")
        if len(tmp) == 0:
            # the title was not truncated
            titlenode = content.xpath("h1")[0]
            title = titlenode.text.strip()
        else:
            # the truncated title's full text sits in the infobox cell
            titlenode = tmp[0]
            title = etree.tostring(titlenode, method='text', encoding='utf-8').strip()
        if not isinstance(title, unicode):
            title = title.decode("utf-8")
        seg_list.insert(4, title)
        f.write('[=]'.join(seg_list) + '\n')
    else:
        failed_set.add(topic_id)
    f.close()

def testurl():
    url = "http://cy.5156edu.com/cymore.html"
    w = WebPage(url)
    hrefstart = "http://cy.5156edu.com"
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=re.compile(r'/[a-z]\.html'))
        print len(results)
        for a in results:
            print urljoin(hrefstart, a.get('href'))

def testdb():
    d = Database('testdb.sql')
    url = "http://cy.5156edu.com/html4/31232.html"
    w = WebPage(url)
    if w.fetch():
        url, pageSource = w.getDatas()
        soup = BeautifulSoup(pageSource)
        temp = soup.find('table', bgcolor='#C0C0C0')
        infos = temp.find_all('td')
        info = getInfo(url, infos)
        d.saveData(info)

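# testdb() assumes a project-internal Database class with a saveData(info) method;
# its real implementation is not shown here. A hypothetical sqlite3-based sketch of
# the assumed interface (table layout and field handling are guesses):
import sqlite3

class Database(object):
    def __init__(self, path):
        self.conn = sqlite3.connect(path)
        self.conn.execute('CREATE TABLE IF NOT EXISTS idiom (url TEXT, fields TEXT)')

    def saveData(self, info):
        # info is assumed to be the list built by getInfo(): the page URL followed by cell texts
        self.conn.execute('INSERT INTO idiom VALUES (?, ?)',
                          (info[0], '[=]'.join(info[1:])))
        self.conn.commit()
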
def selfTesting():
    # walk the per-letter index pages (a.html .. z.html) and print every page of each index
    for i in range(97, 123):
        url = 'http://cy.5156edu.com/html2/%s.html' % chr(i)
        w = WebPage(url)
        if w.fetch():
            url, pageSource = w.getDatas()
            soup = BeautifulSoup(pageSource)
            temp = soup.find('td', bgcolor='#E8F3FF')
            # the highlighted cell holds the pagination text; its second number is the page count
            page = int(re.findall(r'\d+', temp.text)[1])
            print url
            for j in range(2, page + 1):
                href = "%s_%d.html" % (url[:-5], j)
                print href

def _taskHandler(self, url):
    """ Fetch the page at the given URL and apply the corresponding access control. """
    print "Visiting : " + url
    webPage = WebPage(url)
    # fetch the page content
    flag = webPage.fetch()
    if flag:
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        # collect the valid links
        post_list = []
        next_page_url = None
        for href in hrefs:
            # only links matching the discussion-post URL pattern are processed
            m = regex_post_first.match(href)
            if self._isHttpOrHttpsProtocol(href) and m is not None:
                post_list.append(m.group('post_id'))
            # look for the "next page" link on the current page
            m = regex_next_page.match(href)
            if m is not None and m.group() not in self.visited_href:
                url = m.group()
                print 'Add next page link: ', url
                self.thread_pool.putTask(self._taskHandler, url)
                self.visited_href.add(url)
        for post_id in post_list:
            #print "Add thread link: ", post_id
            self.post_list.append(post_id)
        # persist the list of topics crawled so far
        self.save_thread.putTask(self._saveTopicHandler, post_list)
    else:
        log.error(u"Unexpected URL format while crawling the post list. URL: %s" % url)
        # if page reading fails
        self.failed_href.add(url)
        return False

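# This handler depends on regex_post_first (which must expose a named group
# 'post_id') and regex_next_page (which must match the full URL of the "next page"
# link). The real patterns live elsewhere in the project; a hedged sketch, assuming
# the same Douban topic/discussion URL shapes used in the other snippets:
import re

regex_post_first = re.compile(r'^https?://www\.douban\.com/group/topic/(?P<post_id>\d+)/?')
regex_next_page = re.compile(r'^https?://www\.douban\.com/group/[^/]+/discussion\?start=\d+$')
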
        # (remainder of getInfo(url, infos))
        try:
            info.append(infos[i].text)
        except Exception, e:
            print e
    return info

def extractUrls(filename):
    # re-crawl every detail-page URL recorded in the given log file
    try:
        f = open(filename, "rb")
        s = f.read()
        f.close()
    except Exception, e:
        print e
    else:
        urls = re.findall(r'http://cy.5156edu.com/html4/\d+.html', s)
        for url in urls:
            w = WebPage(url)
            d = Database("data.sql")
            if w.fetch():
                try:
                    href, pageSource = w.getDatas()
                    soup = BeautifulSoup(pageSource)
                    temp = soup.find('table', bgcolor='#C0C0C0')
                    infos = temp.find_all("td")
                    info = getInfo(href, infos)
                    d.saveData(info)
                except Exception, e:
                    print e

if __name__ == '__main__':
    extractUrls("spider.log")
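
# All of the snippets above share a WebPage helper with a fetch()/getDatas() pair;
# its implementation is not included here. A minimal urllib2-based sketch of the
# assumed contract (fetch() returns a success flag, getDatas() returns the final
# URL plus the decoded page source), for reference only:
import urllib2

class WebPage(object):
    def __init__(self, url):
        self.url = url
        self.pageSource = None

    def fetch(self):
        # return True on success, False on any network/HTTP error
        try:
            response = urllib2.urlopen(self.url, timeout=10)
            self.url = response.geturl()  # keep the URL after redirects
            self.pageSource = response.read().decode('utf-8', 'ignore')
            return True
        except Exception, e:
            print e
            return False

    def getDatas(self):
        return self.url, self.pageSource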