Пример #1
0
 def _parse_list(self, response):
     """Decode a JSON list response whose payload embeds an HTML <ul> of articles.

     Yields a detail-page Request for every entry that survives the spider's
     title filter.
     """
     conf = response.meta.get('conf')
     now = datetime.datetime.now()
     payload = demjson.decode(response.body_as_unicode())
     data = payload.get('data', None)
     info = None if data is None else data.get('article_info', None)
     if info is None:
         return
     for node in Selector(text=info).css('ul>li'):
         ctime = node.css('span.t-time::text').extract_first()
         href = node.css('a::attr(href)').extract_first()
         item = FinNewsItem()
         item['seed'] = conf['name']
         item['nid'] = "%s%s" % (conf['name'], href)
         item['title'] = node.css('a::text').extract_first()
         item['url'] = href
         # The feed gives only month-day; a month ahead of "now" means last year.
         year = now.year if now.month >= int(ctime[:2]) else now.year - 1
         item['time'] = "%s-%s" % (year, ctime)
         item = self.spider._filter_by_title(item)
         if item:
             yield scrapy.Request(url=item['url'],
                                  meta={'item': item},
                                  callback=self._parse_detail)
Пример #2
0
 def _parse_list(self, response):
     """Walk a paginated xueqiu JSON feed and yield detail Requests.

     A ``curr_page`` of -1 marks the bootstrap request: it is re-issued with
     max_id=-1 and page counting starts at 0.  Otherwise the next page is
     scheduled (while under ``max_page`` and the feed reports another max id)
     and then the current page's items are emitted.
     """
     conf = response.meta.get('conf')
     curr_page = response.meta.get('curr_page')
     max_page = response.meta.get('max_page')
     if curr_page == -1:
         yield scrapy.Request(
             url=conf['url'] % (conf['page_num'], -1),
             meta={'conf': conf, 'max_page': max_page, 'curr_page': 0},
             callback=self._parse_list)
         return
     rsp = demjson.decode(response.body_as_unicode())
     # Schedule the following page first, before emitting this page's items.
     if curr_page < max_page and rsp.get('next_max_id', 1) > 1:
         yield scrapy.Request(
             url=conf['url'] % (conf['page_num'], rsp.get('next_max_id', 1)),
             meta={'conf': conf, 'max_page': max_page, 'curr_page': curr_page + 1},
             callback=self._parse_list)
     entries = rsp.get('list', None)
     if entries is None:
         return
     for raw in entries:
         # Each list entry carries its payload as a JSON string in 'data'.
         data = demjson.decode(raw.get('data'))
         fni = FinNewsItem()
         fni['seed'] = conf['name']
         fni['nid'] = "%s%s" % (conf['name'], data.get('id'))
         fni['title'] = data.get('title')
         fni['url'] = "https://xueqiu.com%s" % data.get('target')
         # created_at is treated as epoch milliseconds.
         seconds = float(data.get('created_at')) / 1000
         fni['time'] = datetime.datetime.fromtimestamp(seconds).strftime('%Y-%m-%d %H:%M:%S')
         fni = self.spider._filter_by_title(fni)
         if fni:
             yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                  callback=self._parse_detail)
Пример #3
0
 def _parse_list(self, response):
     """Scrape the HTML news list and yield a detail Request per kept item."""
     conf = response.meta.get('conf')
     seed = conf['name']
     for node in response.css('div.main-content ul.new-list>li:not(.line)'):
         entry = FinNewsItem()
         entry['seed'] = seed
         entry['title'] = node.css('a::text').extract_first().strip()
         href = node.css('a::attr(href)').extract_first().strip()
         entry['url'] = href
         # The timestamp is rendered in square brackets; strip them off.
         raw_time = node.css('span.time::text').extract_first().strip()
         entry['time'] = raw_time.replace('[', '').replace(']', '')
         entry['nid'] = "%s%s" % (seed, href)
         entry = self.spider._filter_by_title(entry)
         if entry:
             yield scrapy.Request(url=entry['url'], meta={'item': entry},
                                  callback=self._parse_detail)
Пример #4
0
 def _parse_list(self, response):
     """Parse the box-list HTML page and yield detail Requests for kept items."""
     conf = response.meta.get('conf')
     for box in response.css('div.box_mid div.box_list_word'):
         news = FinNewsItem()
         news['seed'] = conf['name']
         news['title'] = box.css('h2>a::text').extract_first().strip()
         link = box.css('h2>a::attr(href)').extract_first().strip()
         news['url'] = link
         # The visible "keywords" anchor text carries the publication time.
         news['time'] = box.css('div.keywords>a::text').extract_first().strip()
         news['nid'] = "%s%s" % (conf['name'], link)
         news = self.spider._filter_by_title(news)
         if news:
             yield scrapy.Request(url=news['url'],
                                  meta={'item': news},
                                  callback=self._parse_detail)
Пример #5
0
 def _parse_list(self, response):
     """Unwrap the JSONP feed and yield detail Requests, oldest first."""
     conf = response.meta.get('conf')
     decoded = demjson.decode(self.spider._jsonp(response.body_as_unicode()))
     articles = decoded.get('BA8EE5GMwangning', None)
     if articles is None:
         return
     # Emit in ptime order so older stories are scheduled before newer ones.
     for art in sorted(articles, key=lambda a: a['ptime']):
         fni = FinNewsItem()
         fni['seed'] = conf['name']
         fni['nid'] = "%s%s" % (conf['name'], art.get('docid'))
         fni['title'] = art.get('title')
         fni['url'] = art.get('url')
         fni['time'] = art.get('ptime')
         fni = self.spider._filter_by_title(fni)
         if fni:
             yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                  callback=self._parse_detail)
Пример #6
0
 def _parse_list(self, response):
     """Decode the JSON article list and yield one detail Request per kept item."""
     conf = response.meta.get('conf')
     entries = demjson.decode(response.body_as_unicode()).get('list', None)
     if entries is None:
         return
     for entry in entries:
         fni = FinNewsItem()
         fni['seed'] = conf['name']
         fni['nid'] = "%s%s" % (conf['name'], entry.get('ContId'))
         fni['title'] = entry.get('Title')
         fni['url'] = entry.get('Url')
         # CreatedTime3g is treated as a unix timestamp in seconds.
         stamp = float(entry.get('CreatedTime3g'))
         fni['time'] = datetime.datetime.fromtimestamp(stamp).strftime('%Y-%m-%d %H:%M:%S')
         fni = self.spider._filter_by_title(fni)
         if fni:
             yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                  callback=self._parse_detail)
Пример #7
0
 def _parse_list(self, response):
     """Parse the sohu JSON array and yield a detail Request per kept article."""
     conf = response.meta.get('conf')
     articles = demjson.decode(response.body_as_unicode())
     if articles is None:
         return
     for art in articles:
         fni = FinNewsItem()
         fni['seed'] = conf['name']
         fni['nid'] = "%s%s" % (conf['name'], art.get('id'))
         fni['title'] = art.get('title')
         # Article page URL is reconstructed from the article and author ids.
         fni['url'] = "http://www.sohu.com/a/%s_%s" % (art.get('id'), art.get('authorId'))
         # publicTime is treated as epoch milliseconds.
         millis = float(art.get('publicTime'))
         fni['time'] = datetime.datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d %H:%M:%S')
         fni = self.spider._filter_by_title(fni)
         if fni:
             yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                  callback=self._parse_detail)
Пример #8
0
 def _parse_list(self, response):
     """Scrape the HTML news list, normalising month-day dates to full dates."""
     conf = response.meta.get('conf')
     now = datetime.datetime.now()
     for block in response.css('ul#newsListContent>li>div.text'):
         item = FinNewsItem()
         item['seed'] = conf['name']
         item['title'] = block.css('p.title>a::text').extract_first().strip()
         link = block.css('p.title>a::attr(href)').extract_first().strip()
         item['url'] = link
         # Rewrite "MM月DD日" as "MM-DD".
         stamp = block.css('p.time::text').extract_first().strip()
         stamp = stamp.replace('月', '-').replace('日', '')
         # Month-day only: a month ahead of the current one belongs to last year.
         year = now.year if now.month >= int(stamp[:2]) else now.year - 1
         item['time'] = "%s-%s" % (year, stamp)
         item['nid'] = "%s%s" % (conf['name'], link)
         item = self.spider._filter_by_title(item)
         if item:
             yield scrapy.Request(url=item['url'],
                                  meta={'item': item},
                                  callback=self._parse_detail)
Пример #9
0
 def _parse_list(self, response):
     """Decode the JSON list feed and yield detail Requests in time order."""
     conf = response.meta.get('conf')
     entries = demjson.decode(response.body_as_unicode()).get('list', None)
     if entries is None:
         return
     now = datetime.datetime.now()
     # Oldest first, so earlier stories are scheduled before later ones.
     for entry in sorted(entries, key=lambda e: e['time']):
         fni = FinNewsItem()
         fni['seed'] = conf['name']
         fni['nid'] = "%s%s" % (conf['name'], entry.get('id'))
         fni['title'] = entry.get('title')
         fni['url'] = entry.get('titleLink')
         # Month-day only timestamps; months ahead of "now" roll back a year.
         year = now.year if now.month >= int(entry.get('time')[:2]) else now.year - 1
         fni['time'] = "%s-%s" % (year, entry.get('time'))
         fni = self.spider._filter_by_title(fni)
         if fni:
             yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                  callback=self._parse_detail)
Пример #10
0
 def _parse_list(self, response):
     """Strip the JSONP wrapper, decode, and yield detail Requests oldest-first."""
     conf = response.meta.get('conf')
     decoded = demjson.decode(self.spider._jsonp(response.body_as_unicode(), end=-7))
     result = decoded.get('result', None)
     entries = None if result is None else result.get('data', None)
     if entries is None:
         return
     # Emit in ctime order so older stories are fetched first.
     for entry in sorted(entries, key=lambda e: e['ctime']):
         fni = FinNewsItem()
         fni['seed'] = conf['name']
         fni['nid'] = "%s%s" % (conf['name'], entry.get('docid'))
         fni['title'] = entry.get('title')
         fni['url'] = entry.get('url')
         # ctime is treated as a unix timestamp in seconds.
         fni['time'] = datetime.datetime.fromtimestamp(
             float(entry.get('ctime'))).strftime('%Y-%m-%d %H:%M:%S')
         fni = self.spider._filter_by_title(fni)
         if fni:
             yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                  callback=self._parse_detail)