class PersistentPipeline(object):
    """Pipeline that persists job items through a ``Session()`` object.

    One session is created when the spider opens; job items are added to it
    as they arrive and the whole batch is committed when the spider closes.
    NOTE(review): ``Session`` is defined outside this class - presumably a
    database session factory; confirm against its definition.
    """

    # Both patterns are applied with ``match`` (anchored at the start of the
    # text): leading non-digit characters are skipped, then a
    # "<lower>-<upper>" pair is captured. Salary bounds carry a lowercase "k".
    _SALARY_RE = re.compile(r'[^\d]*(\d+)k-(\d+)k')
    _EXPERIENCE_RE = re.compile(r'[^\d]*(\d+)-(\d+)')

    def open_spider(self, spider):
        """Create the session that is used for the whole spider run."""
        self.session = Session()

    def close_spider(self, spider):
        """Commit everything added so far, then close the session.

        The close sits in ``finally`` so the session is released even when
        the commit raises; the commit error still propagates to the caller.
        """
        try:
            self.session.commit()
        finally:
            self.session.close()

    def process_item(self, item, spider):
        """Store ``JobItem`` instances and pass every other item through.

        Returns the item that was passed in.
        """
        if isinstance(item, JobItem):
            return self._process_job_item(item)
        return item

    @staticmethod
    def _parse_bounds(pattern, text):
        """Return the two ints captured by *pattern* at the start of *text*.

        Falls back to ``(0, 0)`` when the pattern does not match.
        """
        found = pattern.match(text)
        if found is None:
            return 0, 0
        return int(found.group(1)), int(found.group(2))

    def _process_job_item(self, item):
        """Normalise the fields of a job item and add a ``JobModel`` row.

        The row is only added to the session here; the commit happens in
        ``close_spider``. Returns the unmodified item.
        """
        # Only the part before the first '·' separator is kept.
        city = item['city'].split('·')[0]

        salary_lower, salary_upper = self._parse_bounds(
            self._SALARY_RE, item['salary'])
        experience_lower, experience_upper = self._parse_bounds(
            self._EXPERIENCE_RE, item['experience'])

        # Tags are stored as one space-separated string.
        tags = ' '.join(item['tags'])

        model = JobModel(
            title=item['title'],
            city=city,
            salary_lower=salary_lower,
            salary_upper=salary_upper,
            experience_lower=experience_lower,
            experience_upper=experience_upper,
            education=item['education'],
            tags=tags,
            company=item['company'],
        )
        self.session.add(model)
        return item
class SeiyaPipeline(object):
    """Pipeline that stores job items through a ``Session()`` object.

    The session is created when the spider opens and committed once when it
    closes; items are only added to the session in between.
    NOTE(review): ``Session`` is defined outside this class - presumably a
    database session factory; confirm against its definition.
    """

    def open_spider(self, spider):
        """Create the session that is used for the whole spider run."""
        self.session = Session()

    def close_spider(self, spider):
        """Commit the pending rows and always close the session.

        ``close()`` runs in ``finally`` so a failing commit cannot leave the
        session open; the commit error is still raised to the caller.
        """
        try:
            self.session.commit()
        finally:
            self.session.close()

    def process_item(self, item, spider):
        """Store ``JobItem`` instances and pass every other item through.

        Returns the item that was passed in.
        """
        if isinstance(item, JobItem):
            return self._process_job_item(item)
        return item

    def _process_job_item(self, item):
        """Normalise the fields of a job item and add a ``JobModel`` row.

        Salary and experience bounds default to 0 when the text contains no
        "<lower>-<upper>" pair. Returns the unmodified item.
        """
        # Only the part before the first '·' separator is kept.
        city = item['city'].split('·')[0]

        # ``\d+`` rather than ``\d*``: with ``\d*`` a text such as "k-k" or
        # "15k-k" matched with an empty group and ``int('')`` raised
        # ValueError. Now such text falls through to the 0/0 default.
        salary = re.search(r'(\d+)k-(\d+)k', item['salary'])
        if salary:
            salary_lower = int(salary.group(1))
            salary_upper = int(salary.group(2))
        else:
            salary_lower, salary_upper = 0, 0

        experience = re.search(r'(\d+)-(\d+)', item['experience'])
        if experience:
            experience_lower = int(experience.group(1))
            experience_upper = int(experience.group(2))
        else:
            experience_lower, experience_upper = 0, 0

        # Tags are stored as one space-separated string.
        tags = ' '.join(item['tags'])

        jobdata = JobModel(
            title=item['title'],
            city=city,
            salary_lower=salary_lower,
            salary_upper=salary_upper,
            experience_lower=experience_lower,
            experience_upper=experience_upper,
            education=item['education'],
            tags=tags,
            company=item['company'],
        )
        self.session.add(jobdata)
        return item
class PersistentPipeline(object):
    """Pipeline that persists job, food and house items.

    One ``Session()`` object is created when the spider opens; each supported
    item is converted to its model and added to that session, and the whole
    batch is committed when the spider closes.
    NOTE(review): ``Session`` is defined outside this class - presumably a
    database session factory; confirm against its definition.
    """

    def open_spider(self, spider):
        """Create the session that is used for the whole spider run."""
        self.session = Session()

    def close_spider(self, spider):
        """Commit everything added so far, then close the session.

        The close sits in ``finally`` so the session is released even when
        the commit raises; the commit error still propagates to the caller.
        """
        try:
            self.session.commit()
        finally:
            self.session.close()

    def process_item(self, item, spider):
        """Route the item to the handler for its type.

        ``JobItem``, ``FoodItem`` and ``HouseItem`` are stored; any other
        item is passed through untouched. Returns the item in every case.
        """
        if isinstance(item, JobItem):
            return self._process_job_item(item)
        if isinstance(item, FoodItem):
            return self._process_food_item(item)
        if isinstance(item, HouseItem):
            return self._process_house_item(item)
        return item

    def _process_job_item(self, item):
        """Normalise the fields of a job item and add a ``JobModel`` row.

        Salary and experience bounds stay 0 when the text does not start
        with (optional non-digits and) a "<lower>-<upper>" pair. Returns the
        unmodified item.
        """
        # Only the part before the first '·' separator is kept.
        city = item['city'].split('·')[0]

        salary_lower, salary_upper = 0, 0
        salary = re.match(r'[^\d]*(\d+)k-(\d+)k', item['salary'])
        if salary is not None:
            salary_lower = int(salary.group(1))
            salary_upper = int(salary.group(2))

        experience_lower, experience_upper = 0, 0
        experience = re.match(r'[^\d]*(\d+)-(\d+)', item['experience'])
        if experience is not None:
            experience_lower = int(experience.group(1))
            experience_upper = int(experience.group(2))

        # Tags are stored as one space-separated string.
        tags = ' '.join(item['tags'])

        model = JobModel(
            title=item['title'],
            city=city,
            salary_lower=salary_lower,
            salary_upper=salary_upper,
            experience_lower=experience_lower,
            experience_upper=experience_upper,
            education=item['education'],
            tags=tags,
            company=item['company'],
        )
        self.session.add(model)
        return item

    def _process_food_item(self, item):
        """Convert the numeric fields of a food item and add a ``FoodModel``.

        ``agvExp`` and ``score`` fall back to 0 when their text does not
        match; ``reviewNum``, ``kouwei``, ``huanjin`` and ``fuwu`` are
        converted with ``int``/``float`` and raise ``ValueError`` on
        non-numeric text. Returns the unmodified item.
        """
        agv_exp, score = 0, 0

        # The amount is the integer right after a leading '¥'.
        price = re.match(r'¥(\d+)', item['agvExp'])
        if price is not None:
            agv_exp = int(price.group(1))

        # The greedy ``.*`` (which does not cross newlines) leaves the last
        # two consecutive digits it can reach in group 2; they are scaled
        # by 1/10, e.g. "...45" -> 4.5.
        rating = re.match(r'(.*)(\d{2})', item['score'])
        if rating is not None:
            score = int(rating.group(2)) / 10.0

        model = FoodModel(
            label=item['label'],
            name=item['name'],
            score=score,
            reviewNum=int(item['reviewNum']),
            agvExp=agv_exp,
            fenlei=item['fenlei'],
            quan=item['quan'],
            addr=item['addr'],
            kouwei=float(item['kouwei']),
            huanjin=float(item['huanjin']),
            fuwu=float(item['fuwu']),
        )
        self.session.add(model)
        return item

    def _process_house_item(self, item):
        """Clean the fields of a house item and add a ``HouseModel`` row.

        ``item['other']`` must hold at least two entries (used as
        ``louceng`` and ``years``); fewer raise ``IndexError``. Returns the
        unmodified item.
        """
        # Keep the text before the '平米' (square metre) suffix.
        mianji = item['mianji'].split('平米')[0]
        louceng = item['other'][0]
        years = item['other'][1]

        model = HouseModel(
            area=item['area'],
            name=item['name'],
            # Drop the doubled non-breaking spaces from both text fields.
            xiaoqu=item['xiaoqu'].replace("\xa0\xa0", ""),
            huxing=item['huxing'].replace("\xa0\xa0", ""),
            mianji=float(mianji),
            chaoxiang=item['chaoxiang'],
            quan=item['quan'],
            louceng=louceng,
            years=years,
            labels=' '.join(item['labels']),
            price=int(item['price']),
        )
        self.session.add(model)
        return item