Exemplo n.º 1
0
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)
        res_json = json.loads(response.body_as_unicode())

        depth = 0
        try:
            while depth < len(Max_Page['index']):
                res_json = res_json.get(Max_Page['index'][depth])
                depth += 1
        except Exception, e:
            print Exception, ":", e
        urls = get_HeadUrl(Index_Url, self.name)

        max_pages = Total_page_circulate(self.name, int(res_json))
        print "最大页数是:%d" % max_pages
        if All_Detail_Page is None:
            for i in range(1, max_pages + 1):
                i = Turn_True_Page(i, self.name)
                url = urls.format(page=str(i))
                request = Request(url,
                                  callback=self.parse_final,
                                  dont_filter=True,
                                  meta={
                                      'splash': {
                                          'endpoint': 'render.html',
                                          'args': {
                                              'wait': 0.5,
Exemplo n.º 2
0
class movieSpider(scrapy.Spider):
    name = 'youku_movie'
    allowed_domain = []

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then reset the per-run attributes.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        super(movieSpider, self).__init__(*args, **kwargs)
        # Overwritten by start_requests() with the configured index URLs.
        self.Index_Url = ""
        # Filled by start_requests() with the config.json entries that match
        # this spider's name.
        self.config = []
        # Wall-clock timestamp taken when the spider object is created.
        self.now = time.time()

    def start_requests(self):
        """Yield the first request for every index URL configured for this spider.

        ``config.json`` (opened relative to the current working directory)
        maps spider names to a list of two to five stage dicts:

        * element 0 holds ``Index_Url`` (an iterable of URLs), ``Is_Json``
          and ``Max_Page``;
        * the optional middle elements hold, in this order,
          ``All_Detail_Page``, ``Signal_Detail_Page`` and
          ``Target_Detail_Page``;
        * the last element holds ``Final_Xpath``.

        When ``Is_Json`` equals 1 a plain request handled by ``parse_json``
        is produced; otherwise the request carries a ``splash`` meta entry
        and is handled by ``parse_splash``.  Every request also carries
        ``Index_Url`` (its own URL), ``Max_Page``, the optional stage values
        that are present and ``Final_Xpath`` in ``meta``.

        Entries with any other number of stages produce no requests.
        """
        with open('config.json', 'r') as f:
            data = json.load(f)
        # Store (name, stages) pairs so self.config keeps its original shape.
        for entry in data.items():
            if entry[0].encode('utf-8') == self.name:
                self.config.append(entry)

        # Optional stages in the order they appear between the head element
        # and the trailing Final_Xpath element.
        optional_keys = ('All_Detail_Page', 'Signal_Detail_Page',
                         'Target_Detail_Page')
        for _, stages in self.config:
            stage_count = len(stages)
            if stage_count < 2 or stage_count > 5:
                continue

            head = stages[0]
            self.Index_Url = head['Index_Url']
            is_json = head['Is_Json'] == 1

            # (meta key, value) pairs forwarded to the callback, in order.
            forwarded = [('Max_Page', head['Max_Page'])]
            for position, key in enumerate(optional_keys[:stage_count - 2], 1):
                forwarded.append((key, stages[position][key]))
            forwarded.append(('Final_Xpath', stages[-1]['Final_Xpath']))

            for url in self.Index_Url:
                if is_json:
                    request = Request(url, callback=self.parse_json)
                else:
                    # NOTE(review): only the 4-stage rendered variant turned
                    # off duplicate filtering in the previous code; that is
                    # preserved here -- confirm whether it was intentional.
                    request = Request(url,
                                      callback=self.parse_splash,
                                      dont_filter=(stage_count == 4),
                                      meta=self._splash_meta())
                request.meta['Index_Url'] = url
                for key, value in forwarded:
                    request.meta[key] = value
                yield request

    @staticmethod
    def _splash_meta():
        """Return a new request ``meta`` dict holding the render settings.

        A new dict is built on every call so that requests never share the
        nested ``splash`` mapping.
        """
        return {
            'splash': {
                'endpoint': 'render.html',
                'args': {
                    'wait': 0.5,
                    'images': 0,
                    'render_all': 1
                }
            }
        }

    def parse_splash(self, response):
        """Derive the page count and URL template from an index response.

        The stage settings are read back from ``response.meta``.  The page
        count is taken from the text selected by ``Max_Page['xpath']``,
        searched with the ``Max_Page['re']`` pattern; if that fails the error
        is printed and the default of 2 is kept.  The value is then passed
        through ``Total_page_circulate``; a failure there is also only
        printed, leaving the previous value in place.
        """
        # Every stage setting is read whether or not it was supplied; it is
        # checked at the point of use, and a missing one means the crawl
        # should go straight to final_parse.
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)
        # Fallback page count used when extraction below fails.
        max_pages = 2
        try:
            max_pages = re.search(
                Max_Page['re'],
                ''.join(response.xpath(Max_Page['xpath']).extract())).group()
        except Exception, e:
            print Exception, ":", e
        # This replaces the trailing \d+ of the URL; when other URL shapes
        # turn up, extend get_HeadUrl() to cover them.
        urls = get_HeadUrl(Index_Url, self.name)
        try:
            # int() fails on a non-numeric match; the error is only printed.
            max_pages = Total_page_circulate(self.name, int(max_pages))
        except Exception, e:
            print Exception, ":", e
Exemplo n.º 3
0
class movieSpider(scrapy.Spider):
    name = 'letv_movie'
    allowed_domain = []

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then reset the per-run attributes.

        All positional and keyword arguments are forwarded unchanged to the
        parent constructor.
        """
        super(movieSpider, self).__init__(*args, **kwargs)
        # Wall-clock timestamp taken when the spider object is created.
        self.now = time.time()
        # Filled by start_requests() with the config.json entries that match
        # this spider's name.
        self.config = []
        # Overwritten by start_requests() with the configured index URLs.
        self.Index_Url = ""
        # Read by parse_splash(); start_requests() replaces it with the
        # configured 'Splash' value.  Defined here so the attribute always
        # exists on the instance.
        self.Splash = None

    def start_requests(self):
        """Yield the first request for every index URL configured for this spider.

        ``config.json`` (opened relative to the current working directory)
        maps spider names to a list of two to five stage dicts:

        * element 0 holds ``Splash``, ``Index_Url`` (an iterable of URLs),
          ``Is_Json`` and ``Max_Page``;
        * the optional middle elements hold, in this order,
          ``All_Detail_Page``, ``Signal_Detail_Page`` and
          ``Target_Detail_Page``;
        * the last element holds ``Final_Xpath``.

        ``Splash`` is stored on the spider as ``self.Splash``.  When
        ``Is_Json`` equals 1 a plain request handled by ``parse_json`` is
        produced; otherwise the request carries a ``splash`` meta entry and
        is handled by ``parse_splash``.  Every request also carries
        ``Index_Url`` (its own URL), ``Max_Page``, the optional stage values
        that are present and ``Final_Xpath`` in ``meta``.

        Entries with any other number of stages produce no requests.
        """
        with open('config.json', 'r') as f:
            data = json.load(f)
        # Store (name, stages) pairs so self.config keeps its original shape.
        for entry in data.items():
            if entry[0].encode('utf-8') == self.name:
                self.config.append(entry)

        # Optional stages in the order they appear between the head element
        # and the trailing Final_Xpath element.
        optional_keys = ('All_Detail_Page', 'Signal_Detail_Page',
                         'Target_Detail_Page')
        for _, stages in self.config:
            stage_count = len(stages)
            if stage_count < 2 or stage_count > 5:
                continue

            head = stages[0]
            self.Splash = head['Splash']
            self.Index_Url = head['Index_Url']
            is_json = head['Is_Json'] == 1

            # (meta key, value) pairs forwarded to the callback, in order.
            forwarded = [('Max_Page', head['Max_Page'])]
            for position, key in enumerate(optional_keys[:stage_count - 2], 1):
                forwarded.append((key, stages[position][key]))
            forwarded.append(('Final_Xpath', stages[-1]['Final_Xpath']))

            for url in self.Index_Url:
                if is_json:
                    request = Request(url, callback=self.parse_json)
                else:
                    # NOTE(review): only the 4-stage rendered variant turned
                    # off duplicate filtering in the previous code; that is
                    # preserved here -- confirm whether it was intentional.
                    request = Request(url,
                                      callback=self.parse_splash,
                                      dont_filter=(stage_count == 4),
                                      meta=self._splash_meta())
                request.meta['Index_Url'] = url
                for key, value in forwarded:
                    request.meta[key] = value
                yield request

    @staticmethod
    def _splash_meta():
        """Return a new request ``meta`` dict holding the render settings.

        A new dict is built on every call so that requests never share the
        nested ``splash`` mapping.
        """
        return {
            'splash': {
                'endpoint': 'render.html',
                'args': {
                    'wait': 0.5,
                    'images': 0,
                    'render_all': 1
                }
            }
        }

    def parse_splash(self, response):
        """Schedule one request per listing page found from an index response.

        The page count is taken from the text selected by
        ``Max_Page['xpath']``, searched with the ``Max_Page['re']`` pattern:

        * a purely numeric match is passed through ``Total_page_circulate``;
        * an empty match falls back to ``Total_Page_Byyourself``;
        * anything else raises ``CloseSpider``.

        If the extraction itself fails, the error is printed and a default of
        2 pages is used.

        Without ``All_Detail_Page`` each page is sent straight to
        ``parse_final`` (with the ``splash`` meta only when ``self.Splash``
        is truthy).  Otherwise each page is sent to ``parse_first`` with the
        ``splash`` meta and the remaining stage settings forwarded in
        ``meta``; a page whose URL cannot be built is reported and skipped.

        Raises:
            CloseSpider: the extracted page count is neither numeric nor
                empty.
        """
        # Every stage setting is read whether or not it was supplied; it is
        # checked at the point of use, and a missing one means the crawl
        # should go straight to parse_final.
        Index_Url = response.meta.get('Index_Url', None)
        Max_Page = response.meta.get('Max_Page', None)
        All_Detail_Page = response.meta.get('All_Detail_Page', None)
        Signal_Detail_Page = response.meta.get('Signal_Detail_Page', None)
        Target_Detail_Page = response.meta.get('Target_Detail_Page', None)
        Final_Xpath = response.meta.get('Final_Xpath', None)

        # Kept as text so the isdigit() check below also works when the
        # extraction fails; an int default raised AttributeError there.
        max_pages = '2'
        try:
            max_pages = re.search(
                Max_Page['re'],
                ''.join(response.xpath(Max_Page['xpath']).extract())).group()
        except Exception as e:
            print("%s : %s" % (Exception, e))

        if max_pages.isdigit():
            max_pages = Total_page_circulate(self.name, int(max_pages))
        elif max_pages == '':
            max_pages = Total_Page_Byyourself(self.name)
        else:
            raise CloseSpider("渲染页面中,找不到Max_Page ,请重新确认 ,爬虫关闭!!!")

        urls = get_HeadUrl(Index_Url, self.name)
        print("最大页数是:%d" % max_pages)

        def page_request(url, callback, rendered):
            # Build the request for one listing page; the render settings are
            # a new dict each time so requests never share nested state.
            if not rendered:
                return Request(url, callback=callback, dont_filter=True)
            return Request(url,
                           callback=callback,
                           dont_filter=True,
                           meta={
                               'splash': {
                                   'endpoint': 'render.html',
                                   'args': {
                                       'wait': 0.5,
                                       'images': 0,
                                       'render_all': 1
                                   }
                               }
                           })

        if All_Detail_Page is None:
            for page in range(1, max_pages + 1):
                url = urls.format(page=str(Turn_True_Page(page, self.name)))
                request = page_request(url, self.parse_final, self.Splash)
                request.meta['Final_Xpath'] = Final_Xpath
                yield request
        else:
            for page in range(1, int(max_pages) + 1):
                try:
                    url = urls.format(
                        page=str(Turn_True_Page(page, self.name)))
                except Exception as e:
                    print("%s : %s" % (Exception, e))
                    # Without a URL there is nothing to request; previously
                    # this fell through with an undefined or stale url.
                    continue
                # NOTE(review): this branch always renders, regardless of
                # self.Splash -- kept as in the previous code.
                request = page_request(url, self.parse_first, True)
                request.meta['Index_Url'] = Index_Url
                request.meta['All_Detail_Page'] = All_Detail_Page
                request.meta['Signal_Detail_Page'] = Signal_Detail_Page
                request.meta['Target_Detail_Page'] = Target_Detail_Page
                request.meta['Final_Xpath'] = Final_Xpath
                yield request