def process(self, data):
    """Drop listings that duplicate one already seen; pass the rest through.

    Business rule (adjusted after discussion with Zhou Ye and the
    engineering team): listings whose title, car model, mileage, region,
    contact details, price and car age are all identical are duplicates.
    Duplicates are hidden on the front end but are not deleted from the
    database, so that they remain available for later data analysis.

    How this method applies the rule:

    * An MD5 signature is computed over the listing's ``car_title``,
      ``car_type``, ``car_mileage``, ``car_price``, ``car_birth``,
      ``source_province`` and ``source_zone`` values, in that order.
    * ``self.rd7`` holds, under each signature, the set of contact values
      seen so far.  If *either* ``contact_mobile`` or ``contact_phone`` of
      the incoming listing is already a member of that set, the listing is
      treated as a duplicate.

    :param data: mapping describing one listing; must contain the seven
        signature keys above plus ``contact_mobile`` and ``contact_phone``
        (and ``domain`` / ``url``, which are read when a duplicate is
        logged).
    :returns: ``data`` unchanged when the listing is not a duplicate,
        ``None`` when it is dropped.
    :raises KeyError: if ``data`` lacks one of the keys that is read.
    """
    keys = ("car_title", "car_type", "car_mileage", "car_price",
            "car_birth", "source_province", "source_zone")

    # NOTE(review): the values are fed to the hash back to back with no
    # separator, so two listings whose fields split differently could share
    # a signature.  Left unchanged here because altering the hash input
    # would no longer match signatures already written to self.rd7.
    digest = hashlib.md5()
    for key in keys:
        value = data[key]
        # isinstance, not an exact type comparison: a unicode subclass must
        # also be UTF-8 encoded, otherwise str() fails on non-ASCII text.
        if isinstance(value, unicode):
            chunk = value.encode('utf-8')
        else:
            chunk = str(value)
        digest.update(chunk)
    signature = digest.hexdigest()

    contacts = (data["contact_mobile"], data["contact_phone"])

    # NOTE(review): an empty/placeholder contact value is stored and matched
    # like any other - confirm upstream guarantees real contact values.
    if self.rd7.exists(signature):
        for contact in contacts:
            if self.rd7.sismember(signature, contact):
                # Hand the duplicate to dr.insert_data before dropping it
                # (presumably the store of rejected items - confirm).
                dr.insert_data(data)
                self.logger.debug("(%s) has been droped by process77 %s." % (data['domain'], data['url']))
                return None

    # First sighting of this signature/contact combination: remember both
    # contact values so later listings can be matched against them.
    self.rd7.sadd(signature, *contacts)
    return data
def process(self, data):
    """Filter out items for which any ``Process66Main._is_*_lose`` check fires.

    The checks run in a fixed order (title, brand/series, price, contact,
    province/zone).  On the first check that returns a truthy value the
    item is handed to ``dr.insert_data``, a debug line is logged and
    ``None`` is returned.  When every check passes, the method sleeps for
    0.08 seconds and returns ``data`` unchanged.

    :param data: mapping describing one item; the keys read are listed in
        the checks below, plus ``domain`` for the log message.
    :returns: ``data`` when the item passes every check, otherwise ``None``.
    """
    # Each entry pairs a lazily evaluated check with its log message.  The
    # lambdas defer both the helper lookup and the data[...] reads until the
    # check is actually reached, exactly as in a sequential if-chain.
    checks = (
        (lambda: Process66Main._is_car_title_lose(data['car_title']),
         "(%s) Item ignore, lose car_title."),
        (lambda: Process66Main._is_car_brand_car_series_lose(
            data['car_brand'], data['car_series']),
         "(%s) Item ignore, lose car_series."),
        (lambda: Process66Main._is_car_price_lose(data['car_price']),
         "(%s) Item ignore, lose car_price."),
        (lambda: Process66Main._is_contact_phone_contact_mobile_contact_mail_contact_qq_lose(
            data['contact_phone'], data['contact_mobile'],
            data['contact_mail'], data['contact_qq']),
         "(%s) Item ignore, lose contact."),
        (lambda: Process66Main._is_source_province_source_zone_lose(
            data['source_province'], data['source_zone']),
         "(%s) Item ignore, lose source."),
    )

    for field_is_lost, message in checks:
        if field_is_lost():
            dr.insert_data(data)
            self.logger.debug(message % data['domain'])
            return None

    # Pause before handing the item on; the reason is not visible in this
    # block (presumably throttling - confirm).
    time.sleep(0.08)
    return data
def process(self, data):
    """Reject an item as soon as one of the ``Process66Main`` "lose" checks fires.

    Checks are applied in this order: car title, car brand/series, car
    price, contact fields, source province/zone.  A rejected item is passed
    to ``dr.insert_data``, a debug message is logged, and ``None`` is
    returned.  An item that passes every check is returned unchanged after
    a 0.08 second sleep.

    :param data: mapping describing one item.
    :returns: ``data`` if nothing was flagged, otherwise ``None``.
    """

    def discard(message):
        # Shared rejection path: hand the item to dr, log, signal "dropped".
        dr.insert_data(data)
        self.logger.debug(message % data['domain'])
        return None

    if Process66Main._is_car_title_lose(data['car_title']):
        return discard("(%s) Item ignore, lose car_title.")

    if Process66Main._is_car_brand_car_series_lose(
            data['car_brand'], data['car_series']):
        return discard("(%s) Item ignore, lose car_series.")

    if Process66Main._is_car_price_lose(data['car_price']):
        return discard("(%s) Item ignore, lose car_price.")

    if Process66Main._is_contact_phone_contact_mobile_contact_mail_contact_qq_lose(
            data['contact_phone'], data['contact_mobile'],
            data['contact_mail'], data['contact_qq']):
        return discard("(%s) Item ignore, lose contact.")

    if Process66Main._is_source_province_source_zone_lose(
            data['source_province'], data['source_zone']):
        return discard("(%s) Item ignore, lose source.")

    # Fixed delay before returning an accepted item; its purpose is not shown
    # here (presumably rate limiting - confirm).
    time.sleep(0.08)
    return data