# Module-level imports these list parsers rely on (defined once at the top of the file):
#   import datetime
#   import demjson
#   import scrapy
#   from scrapy.selector import Selector
#   from ..items import FinNewsItem   # item class import path assumed

def _parse_list(self, response):
    conf = response.meta.get('conf')
    current = datetime.datetime.now()
    rsp = demjson.decode(response.body_as_unicode())
    data = rsp.get('data', None)
    info = data.get('article_info', None) if data is not None else None
    # The article list arrives as an HTML fragment embedded in the JSON payload.
    lis = Selector(text=info).css('ul>li') if info is not None else None
    if lis is not None:
        for it in lis:
            ctime = it.css('span.t-time::text').extract_first()
            title = it.css('a::text').extract_first()
            href = it.css('a::attr(href)').extract_first()
            fni = FinNewsItem()
            fni['seed'] = conf['name']
            fni['nid'] = "%s%s" % (conf['name'], href)
            fni['title'] = title
            fni['url'] = href
            # ctime only carries month-day; roll back one year if the listed month lies in the future.
            year = current.year if current.month >= int(ctime[:2]) else current.year - 1
            fni['time'] = "%s-%s" % (year, ctime)
            fni = self.spider._filter_by_title(fni)
            if fni:
                yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                     callback=self._parse_detail)
            else:
                continue
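# --- Sketch (not part of the original source): year inference for month-day timestamps ---
# This parser, and two others further down, rebuild the year for sources that only
# publish "MM-DD hh:mm" timestamps: if the listed month is later than the current
# month, the article is assumed to be from last year. A standalone illustration of
# that rule; the helper name and the sample date are hypothetical.
def _infer_year(month_day, now=None):
    # month_day is the scraped "MM-DD ..." string.
    now = now or datetime.datetime.now()
    year = now.year if now.month >= int(month_day[:2]) else now.year - 1
    return "%s-%s" % (year, month_day)

# e.g. seen on 2024-01-05, "12-30 18:05" -> "2023-12-30 18:05"
# _infer_year("12-30 18:05", datetime.datetime(2024, 1, 5))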
def _parse_list(self, response):
    conf = response.meta.get('conf')
    if response.meta.get('curr_page') == -1:
        # First call: issue the real page-one request (max_id == -1) before paging.
        url = conf['url'] % (conf['page_num'], -1)
        yield scrapy.Request(url=url,
                             meta={'conf': conf,
                                   'max_page': response.meta.get('max_page'),
                                   'curr_page': 0},
                             callback=self._parse_list)
    else:
        rsp = demjson.decode(response.body_as_unicode())
        # Keep paging while an older batch exists and the page cap has not been reached.
        if response.meta.get('curr_page') < response.meta.get('max_page'):
            if rsp.get('next_max_id', 1) > 1:
                url = conf['url'] % (conf['page_num'], rsp.get('next_max_id', 1))
                yield scrapy.Request(url=url,
                                     meta={'conf': conf,
                                           'max_page': response.meta.get('max_page'),
                                           'curr_page': response.meta.get('curr_page') + 1},
                                     callback=self._parse_list)
        items = rsp.get('list', None)
        if items is not None:
            for it in items:
                fni = FinNewsItem()
                fni['seed'] = conf['name']
                # Each list entry wraps its payload in a JSON string under 'data'.
                data = demjson.decode(it.get('data'))
                fni['nid'] = "%s%s" % (conf['name'], data.get('id'))
                fni['title'] = data.get('title')
                fni['url'] = "https://xueqiu.com%s" % data.get('target')
                # created_at is a millisecond epoch timestamp.
                fni['time'] = datetime.datetime.fromtimestamp(
                    float(data.get('created_at')) / 1000).strftime('%Y-%m-%d %H:%M:%S')
                fni = self.spider._filter_by_title(fni)
                if fni:
                    yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                         callback=self._parse_detail)
                else:
                    continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    # Skip the decorative separator rows (li.line) in the news list.
    for text in response.css('div.main-content ul.new-list>li:not(.line)'):
        fni = FinNewsItem()
        fni['seed'] = conf['name']
        fni['title'] = text.css('a::text').extract_first().strip()
        fni['url'] = text.css('a::attr(href)').extract_first().strip()
        # The time text is wrapped in square brackets; strip them.
        fni['time'] = text.css('span.time::text').extract_first().strip().replace('[', '').replace(']', '')
        fni['nid'] = "%s%s" % (conf['name'], fni['url'])
        fni = self.spider._filter_by_title(fni)
        if fni:
            yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                 callback=self._parse_detail)
        else:
            continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    for text in response.css('div.box_mid div.box_list_word'):
        fni = FinNewsItem()
        fni['seed'] = conf['name']
        fni['title'] = text.css('h2>a::text').extract_first().strip()
        fni['url'] = text.css('h2>a::attr(href)').extract_first().strip()
        fni['time'] = text.css('div.keywords>a::text').extract_first().strip()
        fni['nid'] = "%s%s" % (conf['name'], fni['url'])
        fni = self.spider._filter_by_title(fni)
        if fni:
            yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                 callback=self._parse_detail)
        else:
            continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    # The feed is JSONP; strip the callback wrapper before decoding.
    rsp = self.spider._jsonp(response.body_as_unicode())
    rsp = demjson.decode(rsp)
    items = rsp.get('BA8EE5GMwangning', None)
    if items is not None:
        # Process articles in chronological order (oldest first).
        for it in sorted(items, key=lambda x: x['ptime']):
            fni = FinNewsItem()
            fni['seed'] = conf['name']
            fni['nid'] = "%s%s" % (conf['name'], it.get('docid'))
            fni['title'] = it.get('title')
            fni['url'] = it.get('url')
            fni['time'] = it.get('ptime')
            fni = self.spider._filter_by_title(fni)
            if fni:
                yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                     callback=self._parse_detail)
            else:
                continue
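# --- Sketch (not part of the original source): the _jsonp helper used above ---
# This parser calls self.spider._jsonp(body), and a later one calls it with end=-7;
# the helper itself is defined elsewhere in the project. The signature and behaviour
# below are assumptions for illustration only: it simply slices off a JSONP callback
# wrapper such as "callback({...});" and returns the bare JSON text.
import re

def _jsonp(self, body, end=None):
    if end is not None:
        # Caller supplies an explicit slice end when the suffix length is fixed.
        return body[body.index('(') + 1:end]
    match = re.search(r'\((.*)\)\s*;?\s*$', body, re.S)
    return match.group(1) if match else body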
def _parse_list(self, response):
    conf = response.meta.get('conf')
    rsp = demjson.decode(response.body_as_unicode())
    items = rsp.get('list', None)
    if items is not None:
        for it in items:
            fni = FinNewsItem()
            fni['seed'] = conf['name']
            fni['nid'] = "%s%s" % (conf['name'], it.get('ContId'))
            fni['title'] = it.get('Title')
            fni['url'] = it.get('Url')
            # CreatedTime3g is a second-resolution epoch timestamp.
            fni['time'] = datetime.datetime.fromtimestamp(
                float(it.get('CreatedTime3g'))).strftime('%Y-%m-%d %H:%M:%S')
            fni = self.spider._filter_by_title(fni)
            if fni:
                yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                     callback=self._parse_detail)
            else:
                continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    items = demjson.decode(response.body_as_unicode())
    if items is not None:
        for it in items:
            fni = FinNewsItem()
            fni['seed'] = conf['name']
            fni['nid'] = "%s%s" % (conf['name'], it.get('id'))
            fni['title'] = it.get('title')
            # Article pages follow the /a/<id>_<authorId> URL pattern.
            fni['url'] = "http://www.sohu.com/a/%s_%s" % (it.get('id'), it.get('authorId'))
            # publicTime is a millisecond epoch timestamp.
            fni['time'] = datetime.datetime.fromtimestamp(
                float(it.get('publicTime')) / 1000).strftime('%Y-%m-%d %H:%M:%S')
            fni = self.spider._filter_by_title(fni)
            if fni:
                yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                     callback=self._parse_detail)
            else:
                continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    current = datetime.datetime.now()
    for text in response.css('ul#newsListContent>li>div.text'):
        fni = FinNewsItem()
        fni['seed'] = conf['name']
        fni['title'] = text.css('p.title>a::text').extract_first().strip()
        fni['url'] = text.css('p.title>a::attr(href)').extract_first().strip()
        fni['time'] = text.css('p.time::text').extract_first().strip()
        # Normalize "MM月DD日 ..." to "MM-DD ...", then infer the year:
        # roll back one year if the listed month lies in the future.
        fni['time'] = fni['time'].replace('月', '-').replace('日', '')
        year = current.year if current.month >= int(fni['time'][:2]) else current.year - 1
        fni['time'] = "%s-%s" % (year, fni['time'])
        fni['nid'] = "%s%s" % (conf['name'], fni['url'])
        fni = self.spider._filter_by_title(fni)
        if fni:
            yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                 callback=self._parse_detail)
        else:
            continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    rsp = demjson.decode(response.body_as_unicode())
    items = rsp.get('list', None)
    if items is not None:
        current = datetime.datetime.now()
        # Process articles in chronological order (oldest first).
        for it in sorted(items, key=lambda x: x['time']):
            fni = FinNewsItem()
            fni['seed'] = conf['name']
            fni['nid'] = "%s%s" % (conf['name'], it.get('id'))
            fni['title'] = it.get('title')
            fni['url'] = it.get('titleLink')
            # 'time' only carries month-day; roll back one year if the listed month lies in the future.
            year = current.year if current.month >= int(it.get('time')[:2]) else current.year - 1
            fni['time'] = "%s-%s" % (year, it.get('time'))
            fni = self.spider._filter_by_title(fni)
            if fni:
                yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                     callback=self._parse_detail)
            else:
                continue
def _parse_list(self, response):
    conf = response.meta.get('conf')
    # JSONP response; strip the callback wrapper before decoding.
    rsp = self.spider._jsonp(response.body_as_unicode(), end=-7)
    rsp = demjson.decode(rsp)
    items = rsp.get('result', None)
    items = items.get('data', None) if items is not None else None
    if items is not None:
        # Process articles in chronological order (oldest first).
        for it in sorted(items, key=lambda x: x['ctime']):
            fni = FinNewsItem()
            fni['seed'] = conf['name']
            fni['nid'] = "%s%s" % (conf['name'], it.get('docid'))
            fni['title'] = it.get('title')
            fni['url'] = it.get('url')
            # ctime is a second-resolution epoch timestamp.
            fni['time'] = datetime.datetime.fromtimestamp(
                float(it.get('ctime'))).strftime('%Y-%m-%d %H:%M:%S')
            fni = self.spider._filter_by_title(fni)
            if fni:
                yield scrapy.Request(url=fni['url'], meta={'item': fni},
                                     callback=self._parse_detail)
            else:
                continue
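# --- Sketch (not part of the original source): the _filter_by_title contract ---
# Every parser above funnels items through self.spider._filter_by_title(fni) before
# scheduling the detail request. The real helper is defined elsewhere; the only
# contract these parsers rely on is "return the item to keep it, return None/falsy
# to drop it". The keyword whitelist and seen-id cache below are purely hypothetical
# illustrations of that contract, not the project's actual logic.
def _filter_by_title(self, fni):
    if fni['nid'] in self._seen_nids:          # hypothetical dedup cache
        return None
    if self._keywords and not any(k in (fni['title'] or '') for k in self._keywords):
        return None                            # hypothetical keyword whitelist
    self._seen_nids.add(fni['nid'])
    return fni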