import time

# HtmlDownloader, HtmlParser and DataOutput are defined in the project's own modules.


class SpiderManager(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        # with open("content.html", 'wb') as f:
        #     f.write(content.encode('utf-8'))
        urls = self.parser.parser_url(root_url, content)
        print(urls)
        for url in urls:
            try:
                # The trailing "3282" mimics the extra digits Mtime appends to its t parameter.
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                           '?Ajax_CallBack=true' \
                           '&Ajax_CallBackType=Mtime.Library.Services' \
                           '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                           '&Ajax_CrossDomain=1' \
                           '&Ajax_RequestUrl=%s' \
                           '&t=%s' \
                           '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl failed!!!', e)
        self.output.output_end()
        print('Crawl finished!')
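# Driver sketch (an assumption, not from the original): runs SpiderManager against a
# listing page. The root URL below is a hypothetical placeholder for whatever
# mtime.com page parser_url() is written to handle.
if __name__ == '__main__':
    spider = SpiderManager()
    spider.crawl('http://theater.mtime.com/China_Beijing/')  # hypothetical root URL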
def store_proc(self, store_q):
    """Consume crawled data items from store_q and persist them until the 'end' sentinel arrives."""
    output = DataOutput()
    while True:
        if not store_q.empty():
            data = store_q.get()
            if data == 'end':
                print('Store process received the ending notification')
                return
            output.store_data(data)
        else:
            time.sleep(0.1)
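# Hedged usage sketch (not part of the original code): store_proc consumes items
# from a queue until it sees the 'end' sentinel, so it can run in its own process.
# NodeManager as the owning class and the queue wiring below are assumptions for
# illustration.
from multiprocessing import Process, Queue

if __name__ == '__main__':
    store_q = Queue()
    node = NodeManager()                                       # assumed owner of store_proc
    store_process = Process(target=node.store_proc, args=(store_q,))
    store_process.start()
    store_q.put({'title': 'example movie', 'rating': '8.0'})   # any crawled data item
    store_q.put('end')                                         # sentinel telling the store process to exit
    store_process.join()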
class Spiderman(object):
    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = Htmldownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        # Stop once there are no new URLs left or 100 pages have been crawled.
        while self.manage.has_new_url() and self.manage.old_url_size() < 100:
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('Crawled %s links so far' % self.manage.old_url_size())
            except Exception as e:
                print('Crawl failed', e)
        self.output.output_html()
class SpiderManager(object):
    def __init__(self):
        self.urlmanager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.urlmanager.add_new_url(root_url)
        while (self.urlmanager.has_new_url() and
               self.urlmanager.old_url_size() < 100):
            try:
                new_url = self.urlmanager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.urlmanager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Crawled %s links' % self.urlmanager.old_url_size())
            except Exception as e:
                print('Crawl failed')
                print(e)
        self.output.output_html()
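# Minimal sketch of the interface both crawl loops above expect from UrlManager
# (an assumption for illustration, not necessarily the project's implementation):
# two sets deduplicate URLs that are still pending against URLs already crawled.
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def add_new_url(self, url):
        if url and url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        for url in urls or []:
            self.add_new_url(url)

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def old_url_size(self):
        return len(self.old_urls)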
url_to_file.close_file()

# Read the URLs back from the file.
urls = open('url.txt', 'r').readlines()
# print(urls)

# Extract the hrefs from each URL and write them to a txt file.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}
for url in urls:
    url = url.strip('\n')
    url_soup = UrlParser(url, header).get_url_soup()
    s = UrlParser(url, header).get_url_href(url_soup)
    for item in s:
        href_to_txt = DataOutput(item).data_to_txt('href.txt')

# Read the hrefs from href.txt and parse each one.
f = open('href.txt', 'r').readlines()
for i, detail_href in enumerate(f):
    print('Processing href #{}'.format(i))
    detail_url = detail_href.strip('\n')
    try:
        global detail
        detail = UrlParser(detail_url, header)
        detail_soup = detail.get_url_soup()
    except Exception:
        pass
    if i == 0: