Example #1
def crawl(self, urls, depth=2):
    # Walk the given URL list, download each page, and collect its outgoing links.
    # Requires `from bs4 import BeautifulSoup`; getPage() is a download helper (see the sketch below).
    for url in urls:
        html = getPage(url)
        if html is None or html == '':
            continue                               # skip pages that failed to download

        soup = BeautifulSoup(html, 'html.parser')
        url_list = []
        for link in soup('a'):                     # every <a> tag on the page
            url_str = link.get('href')
            if url_str is None or url_str == '':
                continue
            elif url_str[0:4] == 'http':           # keep only absolute links
                url_list.append(url_str)
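
Both examples call getPage(), which is not shown in the excerpt. Below is a minimal sketch of such a helper, assuming it only needs to return the page source as a string; the name getPage and the error handling are assumptions, not the original implementation.

import urllib.request

def getPage(url):
    # Download url and return its source, or '' on any failure,
    # so callers can keep using the `html is None or html == ''` check.
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            return resp.read().decode('utf-8', errors='ignore')
    except Exception:
        return ''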
Example #2
# Movie ratings (0.5 - 5.0) for each reviewer, keyed by reviewer name then movie title.
critics = {'李楠': {'一代宗师': 2.5, '小时代': 3.5, '钢铁侠': 2.0, '蜘蛛侠': 3.5, '重返地球': 2.5, '007': 3.0},
           '宋瑶': {'一代宗师': 1.0, '小时代': 3.5, '超人': 1.0, '蜘蛛侠': 2.5, '星际穿越': 2.5, '007': 2.0},
           '吴琼': {'一代宗师': 1.5, '小时代': 2.5, '钢铁侠': 3.0, '蝙蝠侠': 2.5, '星际穿越': 1.5, '007': 1.0},
           '卞雪达': {'一代宗师': 2.0, '小时代': 1.5, '钢铁侠': 3.0, '蜘蛛侠': 2.5, '星际穿越': 1.5, '007': 4.0, '生化危机': 5.0},
           '卞冬至': {'一代宗师': 2.5, '小时代': 4.0, '超人': 3.0, '蜘蛛侠': 1.5, '星际穿越': 1.5, '007': 3.0},
           '吴会来': {'一代宗师': 3.0, '小时代': 1.0, '钢铁侠': 3.5, '蜘蛛侠': 4.0, '星际穿越': 4.0, '007': 2.0},
           '张薇': {'一代宗师': 4.0, '小时代': 1.0, '超人': 3.5, '蜘蛛侠': 2.0, '星际穿越': 3.0, '007': 3.5, '生化危机': 4.0},
           '尼培伦': {'一代宗师': 3.5, '小时代': 2.5, '钢铁侠': 4.0, '蜘蛛侠': 3.0, '重返地球': 2.0, '007': 2.5},
           '石相扬': {'一代宗师': 4.5, '小时代': 0.5, '钢铁侠': 4.5, '蜘蛛侠': 3.5, '星际穿越': 2.5, '007': 1.5},
           '王瑞元': {'一代宗师': 0.5, '小时代': 4.5, '超人': 1.0, '蝙蝠侠': 3.5, '星际穿越': 3.0, '007': 3.0},
           '大宝': {'我爱你': 0.5, '小时代3': 4.5, '超人2': 1.0, '蝙蝠侠2': 3.5}}
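
The critics dictionary maps each reviewer to the movies they rated. A typical use of this structure is scoring how similar two reviewers' tastes are; the function below is a minimal sketch of that using a Euclidean-distance similarity, not code from the original.

from math import sqrt

def sim_distance(prefs, person1, person2):
    # Movies rated by both reviewers.
    shared = [movie for movie in prefs[person1] if movie in prefs[person2]]
    if not shared:
        return 0.0
    # Sum of squared rating differences over the shared movies;
    # float() keeps this working even if ratings are stored as strings.
    sum_sq = sum((float(prefs[person1][m]) - float(prefs[person2][m])) ** 2 for m in shared)
    return 1.0 / (1.0 + sqrt(sum_sq))

print(sim_distance(critics, '李楠', '宋瑶'))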

from bs4 import BeautifulSoup

url_base = 'http://zhidao.baidu.com'
url_resource = '/question/1432695535674275539.html'
url_para = '?push=asking&entry=qb_home_new'
html = getPage(url_base + url_resource + url_para)

if html is None or html == '':
    raise SystemExit('failed to fetch html')

# Keep a local copy of the downloaded page.
with open('a.html', 'w+') as f:
    f.write(html)

soup = BeautifulSoup(html, 'html.parser')
url_list = []
for link in soup('a'):                       # every <a> tag on the page
    url = link.get('href')
    if url is None or url == '':
        continue
    if url[0] == '/':                        # site-relative link: prepend the site root
        url_list.append(url_base + url)
    elif url[0:4] == 'http':                 # absolute link: keep as-is
        url_list.append(url)

for url in url_list:
    print(url)
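
The links gathered here are the same kind of list that Example #1's crawl() builds for each page. A short sketch of feeding them back in for the next level of the crawl; the Crawler class name is an assumption, since the excerpt only shows its crawl() method.

# De-duplicate while keeping first-seen order, then crawl one level deeper.
unique_urls = list(dict.fromkeys(url_list))
crawler = Crawler()                 # hypothetical class wrapping the crawl() method from Example #1
crawler.crawl(unique_urls, depth=1)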