예제 #1
0
파일: parsehtml.py 프로젝트: cmdbdu/little
 def get_label_a(self):
     """Queue the ``href`` of every ``<a>`` tag that contains ``http://``.

     Parses ``self.data`` with BeautifulSoup's ``lxml`` parser, builds a
     ``Mange`` from ``self.new_urls`` and ``self.old_urls``, and hands each
     matching ``href`` to ``Mange.add_new_urls``.

     Anchors without an ``href`` attribute are skipped one by one, so a
     single attribute-less ``<a>`` no longer discards the rest of the page.

     Returns:
         None. The method never raises; an unexpected failure (parser or
         manager error) is logged with its traceback instead.
     """
     # Local import: the file's top-level import block is not visible here.
     import logging

     try:
         soup = BeautifulSoup(self.data, 'lxml')
         anchors = soup.find_all('a')
         mange = Mange(self.new_urls, self.old_urls)
         for anchor in anchors:
             # Tag.get() yields None for a missing attribute, where
             # anchor['href'] would raise KeyError.
             href = anchor.get('href')
             # NOTE(review): substring test, so 'https://' links are
             # ignored -- confirm whether that is intended.
             if href and 'http://' in href:
                 mange.add_new_urls(href)
     except Exception:
         # Keep the original best-effort contract, but leave a trace.
         logging.getLogger(__name__).exception(
             'get_label_a: failed to extract links')
예제 #2
0
파일: main.py 프로젝트: cmdbdu/little
def main():
    """Seed the URL manager with ``args.url`` and start a thread per URL.

    Returns immediately when ``args.url`` is empty. Otherwise builds a
    ``Mange`` over the module-level ``new_urls`` / ``old_urls``, adds
    ``args.url`` and loops while the manager reports new URLs and
    ``len(old_urls)`` is below ``args.page``. Each pass pops one URL, starts
    a ``SpiderThread`` for it, records the URL via ``add_old_urls`` and then
    sleeps for one second.

    Any exception raised while popping, starting the thread or recording the
    URL is printed and the loop carries on (best-effort crawl).
    """
    # Nothing to crawl without a starting URL.
    if not args.url:
        return

    mange = Mange(new_urls, old_urls)
    mange.add_new_urls(args.url)

    # Same evaluation order as the original: the queue check runs first,
    # then the len(old_urls) limit.
    while mange.has_new_urls() and len(old_urls) < args.page:
        try:
            url = mange.pop()
            worker = SpiderThread(url, args.page, args.thread,
                                  new_urls, old_urls)
            worker.start()
            mange.add_old_urls(url)
        except Exception as e:
            # 'as' and print() run on both Python 2.6+ and Python 3.
            print(e)
        # The one-second pause happens after every pass, including failures.
        time.sleep(1)