示例#1
0
 def video_null(self, response=None, children=None, url=None):
     """Extract video fields from a crawled page using the configured rules.

     Iterates ``self.settings.video_children`` as ``(key, value)`` pairs and
     applies only the first rule (``value[0]``) of each pair:

     * ``way == 'html'``: look up ``rule['location']`` in the page via
       ``get_from_html`` and store the first match's ``.text`` under ``key``
       (for every key other than ``'title'`` this happens only when the rule
       has no ``'get'`` entry).  If nothing matches, the error is logged,
       ``url`` is put back on ``self.urls`` and the method stops.
     * ``way == 'json'``: build a follow-up request with ``build_request``,
       crawl it, optionally trim the body with the ``'split'`` delimiters,
       parse it as JSON and store the value found for ``rule['key']``.

     :param response: crawl result exposing the page body as ``.response``;
         a falsy value makes the method return immediately.
     :param children: unused in the visible code; kept for the callers.
     :param url: queue entry for the page; its ``.url`` is logged and the
         entry is re-queued when an html rule matches nothing.
     :return: ``None`` on every visible path.

     NOTE(review): ``result`` is filled but neither returned nor stored in
     the visible code -- the tail of this method may be missing; confirm
     against the full source before relying on it.
     """
     if not response:
         return None
     result = dict()
     for key, value in self.settings.video_children:
         if not len(value):
             continue
         rule = value[0]
         way = rule.get('way')
         if way == 'html':
             found = get_from_html(response.response, rule.get('location'))
             if not len(found):
                 # Nothing matched: report it and hand the url back to the queue.
                 self.logger.error('Error response, url:' + url.url)
                 self.urls.put(url)
                 return None
             if key == 'title' or 'get' not in rule:
                 result[key] = found[0].text
         elif way == 'json':
             m_split = rule.get('split', '')
             json_key = rule.get('key')
             request = build_request(template=rule.get('request'),
                                     param=rule.get('param'),
                                     isre=rule.get('isre', False),
                                     html=response.response)
             if not len(request):
                 continue
             json_response = Crawler(request[0]).crawling()
             try:
                 if m_split:
                     # 'split' is "<start>:<end>"; keep the text between the
                     # two markers.  The ``+ 1`` presumably assumes a
                     # one-character start marker -- TODO confirm.
                     markers = m_split.split(':')
                     body = json_response.response
                     m_start = body.index(markers[0])
                     m_end = body.index(markers[1])
                     json_response.response = body[m_start + 1:m_end]
                 d = json.loads(json_response.response)
             except ValueError:
                 # A missing delimiter (str.index) or invalid JSON
                 # (json.loads) only means this field is unavailable;
                 # skip it and keep processing the remaining rules.
                 continue
             if isinstance(d, dict):
                 res = find_dict(d, json_key)
                 if len(res):
                     result[key] = res[0].get(json_key)
             elif isinstance(d, int):
                 result[key] = d
示例#2
0
    def run(self):
        """Worker loop: pull urls from ``self.urls``, crawl and dispatch them.

        The loop ends when ``self.stat.ncompare(status.STATUS_RUNNING)`` is
        truthy, or when the queue yields nothing and 30 empty polls (one
        second apart) have already been counted; the count is never reset.
        Every fetched url gets a fresh ``Settings`` built from
        ``url.url_tip``; the crawled response is then passed to the host or
        video handler selected by ``url.url_type`` and the settings flags.
        """
        import re
        import time

        idle_limit = 30      # empty polls tolerated before giving up
        report_every = 10    # log a waiting message every N empty polls
        charset_decl = re.compile(r'charset=(\w*)')

        idle_polls = 0
        while not self.stat.ncompare(status.STATUS_RUNNING):
            url = self.urls.get()
            if not url:
                if idle_polls >= idle_limit:
                    break
                idle_polls += 1
                if idle_polls % report_every == 0:
                    self.logger.info('Job(%d) waiting for url...(%d)' % (self.thread_num, idle_polls))
                time.sleep(1)
                continue

            self.settings = Settings(url.url_tip)
            crawler = Crawler(m_url=url.url,
                              snapshot=self.settings.snapshot and url.url_type == 'host')
            response = crawler.crawling()
            if len(response.response) == 0:
                self.logger.error('None response')
                continue

            if url.url_type == 'host':
                host_children = self.settings.host.get('children')
                # Rewrite whatever charset the page declares to UTF-8.
                response.response = charset_decl.sub('charset=UTF-8', response.response)
                if not self.settings.host.get('onlyurl'):
                    self.host_not_only_url(children=host_children)
                else:
                    self.host_only_url(response=response, children=host_children)
            elif url.url_type == 'video':
                video_children = self.settings.video.get('children')
                if not self.settings.video.get('null'):
                    self.video_not_null(response=response, children=video_children, url=url)
                else:
                    self.video_null(response=response, children=video_children, url=url)