Example #1
import time

# HtmlDownloader, HtmlParser and DataOutput are the project's own helper classes.
class SpiderManager(object):
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        content = self.downloader.download(root_url)
        #with open("content.html", 'wb') as f:
        #    f.write(content.encode('utf-8'))
        urls = self.parser.parser_url(root_url, content)
        print(urls)
        # build the Ajax rating URL for each movie and fetch its JSON rating data
        for url in urls:
            try:
                # timestamp string used as the t parameter of the rating request
                t = time.strftime("%Y%m%d%H%M%S3282", time.localtime())
                rank_url = 'http://service.library.mtime.com/Movie.api' \
                    '?Ajax_CallBack=true' \
                    '&Ajax_CallBackType=Mtime.Library.Services' \
                    '&Ajax_CallBackMethod=GetMovieOverviewRating' \
                    '&Ajax_CrossDomain=1' \
                    '&Ajax_RequestUrl=%s' \
                    '&t=%s' \
                    '&Ajax_CallBackArgument0=%s' % (url[0], t, url[1])
                rank_content = self.downloader.download(rank_url)
                data = self.parser.parser_json(rank_url, rank_content)
                self.output.store_data(data)
            except Exception as e:
                print('Crawl Failed!!!')
                print(e)
        self.output.output_end()
        print('Crawl Finish!')
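Example #1 calls into three helper classes that are not shown in the snippet. A minimal sketch of the interfaces it relies on, inferred only from the calls made in crawl() above; the requests-based download body and the method stubs are illustrative placeholders, not the original implementations:

import requests

class HtmlDownloader(object):
    def download(self, url):
        # fetch the page and return its text (placeholder body)
        r = requests.get(url, timeout=10)
        r.encoding = 'utf-8'
        return r.text

class HtmlParser(object):
    def parser_url(self, page_url, content):
        # expected to return (movie_url, movie_id) pairs extracted from the page
        raise NotImplementedError

    def parser_json(self, page_url, content):
        # expected to strip the JSONP wrapper and return the rating fields
        raise NotImplementedError

class DataOutput(object):
    def store_data(self, data):
        # expected to buffer or persist a single record
        raise NotImplementedError

    def output_end(self):
        # expected to flush and close the output
        raise NotImplementedError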
Example #2
    def store_proc(self, store_q):
        # store process: drain the shared queue and persist each item until the
        # 'end' sentinel arrives (needs import time at module level)
        output = DataOutput()
        while True:
            if not store_q.empty():
                data = store_q.get()
                if data == 'end':
                    print('Store process got the end notification')
                    return
                output.store_data(data)
            else:
                time.sleep(0.1)
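store_proc is written to run in its own process and drain a shared queue until the 'end' sentinel arrives. A minimal sketch of how it might be wired up with multiprocessing; SpiderWork and the queue name are assumptions used only for illustration, not part of the original project:

import multiprocessing

if __name__ == '__main__':
    spider = SpiderWork()                   # hypothetical class that defines store_proc
    store_q = multiprocessing.Queue()
    store = multiprocessing.Process(target=spider.store_proc, args=(store_q,))
    store.start()

    store_q.put({'title': 'example item'})  # worker processes would put parsed data here
    store_q.put('end')                      # sentinel that makes store_proc return
    store.join()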
Example #3
class Spiderman(object):

    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = Htmldownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        # keep crawling until there are no pending URLs or 100 pages have been fetched
        while self.manage.has_new_url() and self.manage.old_url_size() < 100:
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('Crawled %s links so far' % self.manage.old_url_size())
            except Exception as e:
                print('Crawl failed')
                print(e)
        self.output.output_html()
Example #4
class SpiderManager(object):
    def __init__(self):
        self.urlmanager = UrlManager()
        self.downloader = HtmlDownLoader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        self.urlmanager.add_new_url(root_url)

        while (self.urlmanager.has_new_url()
               and self.urlmanager.old_url_size() < 100):
            try:
                new_url = self.urlmanager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.urlmanager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('Has crawl %s links' % self.urlmanager.old_url_size())
            except Exception as e:
                print('crawl failed')
                print(e)
        self.output.output_html()
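Examples #3 and #4 both drive the crawl through a UrlManager that deduplicates links and tracks how many pages have been processed. A minimal sketch of the interface they rely on, inferred from the calls above; the set-based storage is an assumption, not the original implementation:

class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs that have already been crawled

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # move one URL from the pending set to the crawled set and return it
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if not urls:
            return
        for url in urls:
            self.add_new_url(url)

    def old_url_size(self):
        return len(self.old_urls)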
Example #5
    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = Htmldownloader()
        self.output = DataOutput()
Example #6
    def __init__(self):
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()
Example #7
url_to_file.close_file()   # url_to_file comes from an earlier step that is not shown here

# read the collected URLs back from the file
with open('url.txt', 'r') as url_file:
    urls = url_file.readlines()
#print(urls)

# fetch the href links from every URL and append them to a txt file
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0;) Gecko/20100101 Firefox/65.0'
}
for url in urls:
    url = url.strip('\n')
    # reuse one parser instance per URL instead of constructing it twice
    parser = UrlParser(url, header)
    url_soup = parser.get_url_soup()
    s = parser.get_url_href(url_soup)
    for item in s:
        DataOutput(item).data_to_txt('href.txt')

# read the hrefs back from href.txt and process each detail page
with open('href.txt', 'r') as href_file:
    f = href_file.readlines()

for i, detail_href in enumerate(f):
    print('Processing href #{}'.format(i))
    detail_url = detail_href.strip('\n')
    try:
        detail = UrlParser(detail_url, header)
        detail_soup = detail.get_url_soup()
    except Exception as e:
        # skip this href if the page cannot be fetched or parsed
        print(e)
        continue
    if i == 0:
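The snippet above is cut off after the final if i == 0: line and depends on UrlParser and DataOutput classes that are not shown. A minimal sketch of the interfaces it calls, inferred only from the usage above; the BeautifulSoup-based bodies are illustrative placeholders, not the original implementations:

import requests
from bs4 import BeautifulSoup

class UrlParser(object):
    def __init__(self, url, header):
        self.url = url
        self.header = header

    def get_url_soup(self):
        # download the page and return it as a BeautifulSoup document
        r = requests.get(self.url, headers=self.header, timeout=10)
        return BeautifulSoup(r.text, 'html.parser')

    def get_url_href(self, soup):
        # collect the href attribute of every <a> tag on the page
        return [a.get('href') for a in soup.find_all('a') if a.get('href')]

class DataOutput(object):
    def __init__(self, data):
        self.data = data

    def data_to_txt(self, filename):
        # append one record per line to the given text file
        with open(filename, 'a', encoding='utf-8') as out:
            out.write(str(self.data) + '\n')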