def parse(self, response):
    """Parse one recommendation-listing page.

    Yields an AuthorIdItem per author link found on the page, then a
    Request for the next listing page, and finally (if available) the
    next queued author request. All failures are logged and swallowed
    so the spider keeps running.
    """
    try:
        logging.info('<JS_progress> parse url:' + response.url)
        # One node per author card on the listing page.
        items = response.xpath('//div[@class="col-xs-8"]/div[@class="wrap"]')
        if items:  # idiomatic truthiness check instead of len(...) > 0
            for item in items:
                author_url = item.xpath('a/@href').extract()[0]
                # The author id is the last path segment of the profile URL.
                author_id = author_url.split('/').pop()
                parsingItem = AuthorIdItem()
                parsingItem['author_id'] = author_id
                logging.info('<JS><parsingitem> author_id: %s' % author_id)
                yield parsingItem
        # Open the DB session lazily, the first time we paginate.
        if self.recommend_page_index == 1:
            self.session = get_db_session()
        # NOTE(review): pagination advances even when a page yielded no
        # items — confirm empty pages should still schedule the next one.
        self.recommend_page_index += 1
        url = self.recommend_base_url % self.recommend_page_index
        yield Request(url, headers=self.headers, callback=self.parse,
                      dont_filter=True)
        request = self.start_request_author()
        if request:
            logging.info('<js_scrapy> start next author....4')
            yield request
    except Exception as ex:
        # Top-level boundary: log and swallow so one bad page does not
        # stop the crawl.
        logging.error('<JS><Author_Commit>parse error:\n' + repr(ex))
        logging.error(traceback.format_exc())
def __init__(self):
    """Reset crawl-progress flags left over from a previous run.

    Opens a DB session, clears is_article_complete /
    is_follower_complete on all users that have either flag set, and
    clears is_parsed on all ParsingItem rows, so every record is
    re-crawled from scratch.
    """
    super(jianshu_spider, self).__init__()
    # Local import: the file uses SQLAlchemy throughout but `or_` may not
    # be in scope at module level.
    from sqlalchemy import or_

    init_mysql()
    self.session = get_db_session()
    # BUG FIX: the original combined the two column expressions with the
    # Python `or` operator, which does not build a SQL OR — it just
    # evaluates the truthiness of the first expression, so the second
    # condition was never part of the query. sqlalchemy.or_ builds the
    # intended disjunction. Also renamed `list` (shadowed the builtin).
    users = self.session.query(User).filter(
        or_(User.is_article_complete == 1,
            User.is_follower_complete == 1)
    ).all()
    if users:
        for user in users:
            if user.is_article_complete == 1:
                user.is_article_complete = 0
            if user.is_follower_complete == 1:
                user.is_follower_complete = 0
        self.session.flush()
        self.session.commit()
    pending = self.session.query(ParsingItem).filter(
        ParsingItem.is_parsed == 1
    ).all()
    if pending:
        for item in pending:
            item.is_parsed = 0
        self.session.flush()
        self.session.commit()
def open_spider(self, spider):
    """Scrapy lifecycle hook: acquire a fresh DB session at spider start."""
    db_session = get_db_session()
    self.session = db_session