bloom = Bloomfilter("state.txt") else: print("文件不存在设置大小为100000") bloom = Bloomfilter(100000) bloom =Bloomfilter(100000) # 如果程序是第一次使用填数字 如果不是第一次 用文件路径 # bloom =Bloomfilter("state.txt") while True: key = input("请输入数据") if bloom.test(key): #测试 print("数据存在",key) else: print("数据不存在!",key) bloom.add(key) #添加 bloom.save("state.txt") #状态保存 # 1 列表去重 #优点:逻辑/代码简单易懂 # 缺点 1)如果数据量巨大电脑吃不消(内存) # 2)只对当前运行有效 ,不能停 result = ['zhangsan','lisi'] keyword = 'wangwu' if 'zhangsan' in result: print("数据已存在") else: #数据存储 print("新数据,开始存储") result.append(keyword)
class SaveToMysqlPipeline(object):
    """Scrapy item pipeline that stores job items in the MySQL ``job`` table.

    A Bloom filter keyed on ``item['job_detail_href']`` is used as a cheap
    "have we seen this URL before?" check.  An MD5 digest of the whole item
    is stored in the ``job_state`` column so that a changed item can be
    detected and the existing row updated instead of inserted again.
    """

    # File used to persist the Bloom filter state between runs.
    BLOOM_STATE_FILE = "job.state"
    # Size handed to Bloomfilter when no saved state file exists yet.
    BLOOM_SIZE = 1000000

    # Item fields written to the table, in the column order used by the SQL
    # statements below (job_detail_href and job_state are appended
    # separately).
    _ITEM_FIELDS = (
        'job_name', 'job_money', 'max_money', 'min_money', 'job_date',
        'company_name', 'job_place', 'job_city', 'job_area',
        'job_education', 'job_fuli', 'job_from', 'job_type',
    )

    # job_id is an auto-incrementing primary key.  Backticks are advisable
    # around identifiers that contain spaces or collide with SQL keywords.
    _CREATE_TABLE_SQL = """
        CREATE TABLE IF NOT EXISTS `job` (
            job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
            job_name text COMMENT '工作名称',
            job_money text COMMENT '工作薪资',
            max_money FLOAT COMMENT '最大薪资',
            min_money FLOAT COMMENT '最少薪资',
            job_date text COMMENT '工作发布时间',
            company_name text COMMENT '公司名称',
            job_place text COMMENT '工作地点',
            job_city text COMMENT '工作城市',
            job_area text COMMENT '工作地区',
            job_education text COMMENT '工作学历',
            job_fuli text COMMENT '公司福利',
            job_from text COMMENT '工作所属网站',
            job_type text COMMENT '工作类型',
            job_detail_href text COMMENT '详情地址',
            job_state text COMMENT '工作数据的加密信息'
        )
    """

    _INSERT_SQL = """
        INSERT INTO job (
            job_name, job_money, max_money, min_money, job_date,
            company_name, job_place, job_city, job_area, job_education,
            job_fuli, job_from, job_type, job_detail_href, job_state
        ) VALUES (
            %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
        )
    """

    _SELECT_STATE_SQL = (
        """SELECT job_state from job WHERE job_detail_href=%s"""
    )

    # job_state is written as well; otherwise the stored digest stays stale
    # and the same row is updated again on every later sighting.
    _UPDATE_SQL = """
        UPDATE job set
            job_name=%s, job_money=%s, max_money=%s, min_money=%s,
            job_date=%s, company_name=%s, job_place=%s, job_city=%s,
            job_area=%s, job_education=%s, job_fuli=%s, job_from=%s,
            job_type=%s, job_state=%s
        WHERE job_detail_href=%s
    """

    def open_spider(self, spider):
        """Open the MySQL connection, load the Bloom filter, create the table.

        ``settings['MYSQL_SETTINGS']`` is unpacked directly into
        ``pymysql.connect`` (presumably host, user, password, database,
        port and charset -- confirm against the project settings).
        """
        connection = pymysql.connect(**settings['MYSQL_SETTINGS'])
        cursor = connection.cursor()

        # Resume from the saved filter state if there is one; otherwise
        # start with a fresh filter of the default size.
        if os.path.exists(self.BLOOM_STATE_FILE):
            self.bloom = Bloomfilter(self.BLOOM_STATE_FILE)
        else:
            self.bloom = Bloomfilter(self.BLOOM_SIZE)

        cursor.execute(self._CREATE_TABLE_SQL)

        self.connection = connection
        self.cursor = cursor

    @staticmethod
    def _digest(item):
        """Return the MD5 hex digest of the JSON-serialised item."""
        # Serialisation is kept exactly as before so digests already stored
        # in the job_state column remain comparable.
        payload = json.dumps(dict(item))
        return hashlib.md5(payload.encode(encoding='utf-8')).hexdigest()

    def _values(self, item):
        """Return the item's column values in ``_ITEM_FIELDS`` order."""
        return tuple(item[name] for name in self._ITEM_FIELDS)

    def _insert(self, item, job_state):
        """Insert ``item`` as a new row together with its digest."""
        print("添加数据========================")
        self.cursor.execute(
            self._INSERT_SQL,
            self._values(item) + (item['job_detail_href'], job_state))

    def process_item(self, item, spider):
        """Insert a new job, or update the stored row when the item changed.

        Returns the item unchanged so later pipelines can keep processing it.
        """
        href = item['job_detail_href']
        job_state = self._digest(item)

        is_new = not self.bloom.test(href)
        if is_new:
            self._insert(item, job_state)
        else:
            self.cursor.execute(self._SELECT_STATE_SQL, (href, ))
            row = self.cursor.fetchone()
            if row is None:
                # The filter reported a hit but the database has no such
                # row (Bloom filters can give false positives, and the
                # state file may be ahead of the database).  The database
                # is the source of truth, so store the item.
                self._insert(item, job_state)
            elif row[0] != job_state:
                print("更新数据=========================")
                self.cursor.execute(
                    self._UPDATE_SQL,
                    self._values(item) + (job_state, href))
            else:
                print("不用更新数据=========================")

        self.connection.commit()

        if is_new:
            # Record the URL only after the row is durably committed, so
            # the filter never claims a URL that the database lacks.
            self.bloom.add(href)
            self.bloom.save(self.BLOOM_STATE_FILE)
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection when the spider finishes."""
        try:
            self.cursor.close()
        finally:
            # Close the connection even if closing the cursor raised.
            self.connection.close()