class Splunk(object):
    """Tiny in-memory search index: a bloom filter fronting an inverted index.

    Events are stored in insertion order; each event's id is its list index.
    """

    def __init__(self):
        # Bloom filter used as a cheap O(1) pre-check before the term lookup.
        self.bf = Bloomfilter(64)
        # Inverted index: term -> set of event ids containing that term.
        self.terms = {}
        # All events, in insertion order (index == event id).
        self.events = []

    def add_event(self, event):
        """Adds an event to this object"""
        # The event's position in the list doubles as its unique id.
        event_id = len(self.events)
        self.events.append(event)
        # Register every term of the event in both structures.
        for token in segments(event):
            self.bf.add_value(token)
            self.terms.setdefault(token, set()).add(event_id)

    def search(self, term):
        """Search for a single term, and yield all the events that contain it"""
        # Bloom-filter miss is definitive: the term was never indexed.
        # In Splunk this runs in O(1), and is likely to be in filesystem cache (memory)
        if not self.bf.might_contain(term):
            return
        # Definitive lookup; in Splunk this probably runs in O(log N) where N
        # is the number of terms in the tsidx.
        hits = self.terms.get(term)
        if hits is None:
            return
        for event_id in sorted(hits):
            yield self.events[event_id]
def __init__(self, dbpool):
    """Keep the db pool, restore (or create) the dedup bloom filter, and
    schedule creation of the database table."""
    self.dbpool = dbpool
    # Resume dedup state saved by a previous run when the state file exists;
    # otherwise start a fresh 1,000,000-slot filter.
    if os.path.exists("job.state"):
        self.bloom = Bloomfilter("job.state")
    else:
        self.bloom = Bloomfilter(1000000)
    # Run table creation on the pool; failures are routed to db_create_err.
    query = self.dbpool.runInteraction(self.db_create)
    query.addErrback(self.db_create_err)
def open_spider(self, spider):
    """Open the MySQL connection, restore dedup state, and ensure the job table exists."""
    # Connection parameters (host, user, password, database, port, charset)
    # are bundled in the MYSQL_SETTINGS dict of the scrapy settings, instead
    # of being passed individually to pymysql.connect.
    connection = pymysql.connect(**settings['MYSQL_SETTINGS'])
    cursor = connection.cursor()
    # Resume the bloom filter from a previous run if its state file exists,
    # otherwise start with a fresh 1,000,000-slot filter.
    if os.path.exists("job.state"):
        self.bloom = Bloomfilter("job.state")
    else:
        self.bloom = Bloomfilter(1000000)
    # job_id is an auto-incrementing PRIMARY KEY (must be unique); job_state
    # stores a digest of the scraped item, used later for change detection.
    # Backtick-quote column names that contain spaces or are SQL keywords.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS `job` (
            job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
            job_name text COMMENT '工作名称',
            job_money text COMMENT '工作薪资',
            max_money FLOAT COMMENT '最大薪资',
            min_money FLOAT COMMENT '最少薪资',
            job_date text COMMENT '工作发布时间',
            company_name text COMMENT '公司名称',
            job_place text COMMENT '工作地点',
            job_city text COMMENT '工作城市',
            job_area text COMMENT '工作地区',
            job_education text COMMENT '工作学历',
            job_fuli text COMMENT '公司福利',
            job_from text COMMENT '工作所属网站',
            job_type text COMMENT '工作类型',
            job_detail_href text COMMENT '详情地址',
            job_state text COMMENT '工作数据的加密信息'
        )
    """)
    self.connection = connection
    self.cursor = cursor
# 3. Bloom-filter dedup (recommended)
# A bloom filter stores only per-slot 0/1 state.
# Pros: very low memory usage, and the state can be persisted to disk.
from bloomfilter import Bloomfilter
import os

# Bloomfilter's argument is either a bit count (first run) or a path to a
# previously saved state file (subsequent runs).
if os.path.exists("state.txt"):
    print("文件存在直接加载状态")
    bloom = Bloomfilter("state.txt")
else:
    print("文件不存在设置大小为100000")
    bloom = Bloomfilter(100000)
# BUG FIX: the original unconditionally re-assigned `bloom = Bloomfilter(100000)`
# here, which discarded any state just loaded from state.txt and made the
# if/else above pointless. The re-assignment has been removed.

while True:
    key = input("请输入数据")
    if bloom.test(key):  # membership test
        print("数据存在",key)
    else:
        print("数据不存在!",key)
        bloom.add(key)  # record the new key
        bloom.save("state.txt")  # persist state so dedup survives restarts

# 1. List-based dedup
# Pros: simple, easy-to-read logic.
# Cons: 1) a huge dataset exhausts memory;
#       2) state only lasts for the current run — it cannot be persisted.
class SaveToMysqlPipeline(object):
    """Scrapy pipeline that persists job items to MySQL.

    A bloom filter (persisted in job.state) deduplicates by detail URL, and an
    MD5 digest of the serialized item (stored in the job_state column) detects
    changed rows so they can be updated in place.
    """

    def open_spider(self, spider):
        """Open the MySQL connection, restore dedup state, and ensure the job table exists."""
        # Connection parameters (host, user, password, database, port, charset)
        # are bundled in the MYSQL_SETTINGS dict of the scrapy settings.
        connection = pymysql.connect(**settings['MYSQL_SETTINGS'])
        cursor = connection.cursor()
        # Resume the bloom filter from a previous run when its state file
        # exists, otherwise start a fresh 1,000,000-slot filter.
        if os.path.exists("job.state"):
            self.bloom = Bloomfilter("job.state")
        else:
            self.bloom = Bloomfilter(1000000)
        # job_id is an auto-incrementing PRIMARY KEY; job_state stores the
        # item digest used for change detection.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `job` (
                job_id INTEGER PRIMARY KEY AUTO_INCREMENT,
                job_name text COMMENT '工作名称',
                job_money text COMMENT '工作薪资',
                max_money FLOAT COMMENT '最大薪资',
                min_money FLOAT COMMENT '最少薪资',
                job_date text COMMENT '工作发布时间',
                company_name text COMMENT '公司名称',
                job_place text COMMENT '工作地点',
                job_city text COMMENT '工作城市',
                job_area text COMMENT '工作地区',
                job_education text COMMENT '工作学历',
                job_fuli text COMMENT '公司福利',
                job_from text COMMENT '工作所属网站',
                job_type text COMMENT '工作类型',
                job_detail_href text COMMENT '详情地址',
                job_state text COMMENT '工作数据的加密信息'
            )
        """)
        self.connection = connection
        self.cursor = cursor

    def process_item(self, item, spider):
        """Insert unseen items; update existing rows whose digest changed."""
        # MD5 digest of the JSON-serialized item: a fixed-length fingerprint
        # used as a cheap change marker.
        job_state = json.dumps(dict(item))
        hl = hashlib.md5()
        hl.update(job_state.encode(encoding='utf-8'))
        job_state = hl.hexdigest()
        if not self.bloom.test(item['job_detail_href']):
            # Unseen URL: insert the row and remember the URL.
            print("添加数据========================")
            self.cursor.execute(
                """
                INSERT INTO job (
                    job_name, job_money, max_money, min_money, job_date,
                    company_name, job_place, job_city, job_area, job_education,
                    job_fuli, job_from, job_type, job_detail_href, job_state
                ) VALUES (
                    %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s
                )
                """,
                (item['job_name'], item['job_money'], item['max_money'],
                 item['min_money'], item['job_date'], item['company_name'],
                 item['job_place'], item['job_city'], item['job_area'],
                 item['job_education'], item['job_fuli'], item['job_from'],
                 item['job_type'], item['job_detail_href'], job_state))
            self.bloom.add(item['job_detail_href'])
            self.bloom.save("job.state")
        else:
            # Seen URL (restored the `else` the original left commented out,
            # so freshly inserted rows are not immediately re-queried):
            # compare the stored digest and update only when it changed.
            self.cursor.execute(
                """SELECT job_state from job WHERE job_detail_href=%s""",
                (item['job_detail_href'], ))
            result = self.cursor.fetchone()
            if result and result[0] != job_state:
                print("更新数据=========================")
                # BUG FIX: the original UPDATE never wrote job_state, so the
                # stored digest could never match again and the row was
                # rewritten on every subsequent run. job_state is now updated
                # together with the data columns.
                self.cursor.execute(
                    """
                    UPDATE job set
                        job_name=%s,
                        job_money=%s,
                        max_money=%s,
                        min_money=%s,
                        job_date=%s,
                        company_name=%s,
                        job_place=%s,
                        job_city=%s,
                        job_area=%s,
                        job_education=%s,
                        job_fuli=%s,
                        job_from=%s,
                        job_type=%s,
                        job_state=%s
                    WHERE job_detail_href=%s
                    """,
                    (item['job_name'], item['job_money'], item['max_money'],
                     item['min_money'], item['job_date'], item['company_name'],
                     item['job_place'], item['job_city'], item['job_area'],
                     item['job_education'], item['job_fuli'], item['job_from'],
                     item['job_type'], job_state, item['job_detail_href']))
            else:
                print("不用更新数据=========================")
        self.connection.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider shuts down."""
        self.cursor.close()
        self.connection.close()
def __init__(self):
    """Initialize an empty index: bloom filter, term map, and event list."""
    # Bloom filter used as a cheap membership pre-check before term lookup.
    self.bf = Bloomfilter(64)
    self.terms = {}  # Dictionary of term to set of events
    self.events = []  # Events in insertion order; list index serves as the event id.