import logging
import os

from pybloom import ScalableBloomFilter

logger = logging.getLogger(__name__)


class BloomPipeline(object):

    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name
        # load items crawled before
        logger.info("loading previously crawled items...")
        if os.path.isfile(self.bloomfile):
            with open(self.bloomfile, 'rb') as f:  # fromfile expects binary mode
                self.item_crawled = ScalableBloomFilter.fromfile(f)
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        logger.info("pipeline read %d crawled items" % self.item_crawled.count)

    def __del__(self):
        # persist the filter on teardown; a close_spider hook would be more reliable
        with open(self.bloomfile, 'wb') as f:  # tofile expects binary mode
            self.item_crawled.tofile(f)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            bloomfile=crawler.settings.get('BLOOM_FILE'),
            spider_name=crawler.spidercls.name)

    def process_item(self, item, spider):
        if self.spider_name == 'weibotv':
            item_id = item['mid']
        elif self.spider_name == 'toutiao':
            item_id = item['Url']
        elif self.spider_name == 'anyvspider':
            item_id = item['pid']
        else:
            item_id = ''
        # add() returns True if the element was already present
        if not self.item_crawled.add(item_id):
            logger.info("item: %s wrote to bloomfile %s" % (item_id, self.bloomfile))
            return item
        logger.info("item dropped %s" % item_id)
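# Usage sketch: enabling the pipeline in a Scrapy project (the module path
# 'myproject.pipelines' and the priority value are hypothetical):
#
#   # settings.py
#   BLOOM_FILE = '/path/to/bloomfile'
#   ITEM_PIPELINES = {'myproject.pipelines.BloomPipeline': 300}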
import os

from pybloom import ScalableBloomFilter


class FilterHandler(object):

    def __init__(self, logger):
        self.logger_ = logger
        self._load_from_file()

    def url_seen(self, url):
        # add() returns True if the url was already in the filter
        if self.deduper_.add(url):
            self.logger_.info('url duplicated: %s', url)
            return True
        return False

    def _load_from_file(self):
        self.logger_.info('loading data from cache file...')
        if not os.path.isfile('data/bloom.data'):
            self.logger_.error('bloom cache file not found, creating a new one instead.')
            # the original passed the positional mode argument 4, i.e. LARGE_SET_GROWTH
            self.deduper_ = ScalableBloomFilter(
                100000, 0.0001, ScalableBloomFilter.LARGE_SET_GROWTH)
        else:
            with open('data/bloom.data', 'rb') as f:  # fromfile expects binary mode
                self.deduper_ = ScalableBloomFilter.fromfile(f)

    def _dump_to_file(self):
        self.logger_.info('dumping data...')
        if not os.path.isdir('data'):
            os.mkdir('data')
        with open('data/bloom.data', 'wb') as f:  # tofile expects binary mode
            self.deduper_.tofile(f)
        self.logger_.info('dump data finished.')

    def close(self):
        self._dump_to_file()
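# Minimal usage sketch for FilterHandler, assuming a standard logging setup
# (the logger name and URLs are hypothetical):
import logging

logging.basicConfig(level=logging.INFO)
handler = FilterHandler(logging.getLogger('dedup'))
for url in ['http://example.com/a', 'http://example.com/a']:
    if not handler.url_seen(url):
        pass  # fetch and process the new URL here
handler.close()  # persists the filter to data/bloom.data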
# Assumes module-level URL_DEQUE, the path_* settings and parser4me are
# defined elsewhere in this module.
def ParseQueue():
    # Load the checked-URLs file
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
        print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000, error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")

    # Drain the deque item by item; None marks the end
    # (an earlier variant drained URL_QUEUE with get_nowait the same way)
    i = 1
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]
        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
                        + " Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1]
                          + item[0][0:128] + ".txt", 'w',
                          encoding='utf-8') as resf:
                    resf.write(buffer)
                print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1

    with open(path_checked_url_file, 'wb') as wf:
        checked_url_pool.tofile(wf)
import hashlib
import os

from pybloom import ScalableBloomFilter
from scrapy.exceptions import DropItem


class DuplicateItemFilterPipeline(Pipeline):  # Pipeline base as in the original
    # file holding the serialized bloom filter
    fileName = "DuplicateItemFilter.dat"

    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(
                mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def close_spider(self, spider):
        with open(self.fileName, 'wb') as f:
            self.sbf.tofile(f)  # tofile() returns None, so don't reassign self.sbf

    def process_item(self, item, spider):
        # fingerprint the item, skipping crawl time and source URL
        fp = hashlib.sha1()
        for key in item.keys():
            if key not in ['curlDate', 'reference'] and item[key] is not None:
                fp.update(str(item[key]).encode('utf-8'))  # update() needs bytes
        fpValue = fp.hexdigest()
        if not self.sbf.add(fpValue):
            return item
        raise DropItem("duplicate item:\n %s" % item)
import redis
from pybloom import ScalableBloomFilter

red = redis.StrictRedis(host='localhost', port=6379, db=0)
# originally sized around capacity=200000000, error_rate=0.0001
bloomreiqual = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

red.set('linesread3', 0)
with open('wikidata-raw-2018.08.01_reifiedqualifiers.ttl') as infile:
    for line in infile:
        try:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            red.incr('linesread3')
            tokens = line.split(' ')
            # strip the angle brackets from each URI
            s = tokens[0][1:-1]
            p = tokens[1][1:-1]
            o = tokens[2][1:-1]
            # drop the URI prefixes to get the bare identifiers
            _s, _p, _o = s[37:].split('_')
            _qualrel = p[31:]
            _qualent = p[37:]
            bloomreiqual.add(_s + ':' + _qualrel + '_' + _qualent)
            bloomreiqual.add(_o + ':' + _qualrel + '_' + _qualent)
        except Exception as e:
            print(e)

with open('bloom/bloomreifiedqualifiers.pickle', 'wb') as f:
    bloomreiqual.tofile(f)
def manager(initUrlList, max_deep=MAX_DEEP, max_pageNum=MAX_PAGENUM,
            crawl_type=CRAWL_TYPE, proxies=PROXIES):
    redis.set('success', 0)
    page_num = 0  # number of pages crawled
    htmlQueue = Queue()
    if isinstance(initUrlList, list):
        initUrl = initUrlList[0]
        for url in initUrlList:
            htmlQueue.put(url)
    elif isinstance(initUrlList, str):
        initUrl = initUrlList
        htmlQueue.put(initUrl)
    if max_pageNum == 0:
        max_pageNum = -1
    if max_deep == 0:
        max_deep = 9999

    try:
        with open('urlBloomfilter.bloom', 'rb') as f:
            sbf = ScalableBloomFilter.fromfile(f)  # classmethod; no instance needed
        print('bloomfilter loaded successfully!')
    except (IOError, OSError):
        sbf = ScalableBloomFilter(initial_capacity=10000, error_rate=0.00001,
                                  mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    for deep in range(max_deep):
        gList = []
        while not htmlQueue.empty():
            url = htmlQueue.get()
            if url not in sbf or deep == 0:
                gList.append(
                    gevent.spawn(htmlrun, url, crawl_type, 'localhost:8087'))
                max_pageNum -= 1
                page_num += 1
                if max_pageNum == 0:
                    continue
        gevent.joinall(gList)
        while redis.scard('STATUS') > 0:
            url = redis.spop('STATUS').decode()
            sbf.add(url)
        if max_pageNum == 0:
            break
        while redis.scard('HTML') > 0:
            url = redis.spop('HTML').decode()
            htmlQueue.put(url)
        # no more URLs to crawl
        if htmlQueue.empty():
            break

    # download CSS files
    while redis.scard('CSS') > 0:
        url = redis.spop('CSS').decode()
        url = parse.urljoin(initUrl, url)
        gList.append(gevent.spawn(cssrun, url))
    gevent.joinall(gList)

    # finally, persist the bloom filter
    with open('urlBloomfilter.bloom', 'wb') as f:
        sbf.tofile(f)
    return page_num
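# Usage sketch for manager(), assuming the module-level redis client, the
# MAX_DEEP/MAX_PAGENUM/CRAWL_TYPE/PROXIES defaults and the htmlrun/cssrun
# workers are all configured as the function expects (the seed URL is
# hypothetical):
pages = manager(['http://example.com/'], max_deep=2, max_pageNum=100)
print('crawled %d pages' % pages)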
import os
import re
from collections import Counter

from pybloom import ScalableBloomFilter


class BloomAutoYara:

    def __init__(self, filterfile):
        self.filterfile = filterfile
        # if filterfile is present, load the bloom filter from it, else create a new one
        if os.path.exists(filterfile):
            with open(filterfile, "rb") as f:
                self.bf = ScalableBloomFilter.fromfile(f)
            print("available signatures = %d" % len(self.bf))
        else:
            self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def save_filter(self):
        print("saving filter to file %s" % self.filterfile)
        with open(self.filterfile, "wb") as f:
            self.bf.tofile(f)

    def add_string(self, s):
        self.bf.add(s)

    def search_string(self, s):
        return s in self.bf

    def extractlines(self, filename, min_len=4):
        # find printable runs of at least min_len characters
        # (the original hardcoded shortest_run = 4 and ignored min_len)
        chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
        pattern = re.compile('[%s]{%d,}' % (chars, min_len))
        with open(filename, "rb") as fp:
            data = fp.read().decode('latin-1')  # map raw bytes 1:1 to text
        return list(set(pattern.findall(data)))

    def build_filter(self, dirname, extensions=[]):
        print(extensions)
        total = 0
        for (dir, _, files) in os.walk(dirname):
            for f in files:
                ext = f.split(".")[-1]
                if len(extensions) != 0 and ext not in extensions:
                    continue
                print("processing file %s" % f)
                total += 1
                path = os.path.join(dir, f)
                for line in self.extractlines(path):
                    self.add_string(line)
        print("creating bloom filter done. Total files = %d (Total entries = %d). "
              "Overwriting bloom filter output file %s"
              % (total, len(self.bf), self.filterfile))
        self.save_filter()

    def find_file_topn(self, filename, topn=10):
        # strings NOT already in the filter are candidate signatures
        lines = self.extractlines(filename)
        print("total unique strings in file %s = %d" % (filename, len(lines)))
        tmp = [line for line in lines if not self.search_string(line)]
        tmp.sort(key=len)
        print("total strings which can be used for signature = %d" % len(tmp))
        tmp = tmp[-topn:]
        tmp.reverse()
        return tmp

    def find_dir_topn(self, dirname, topn=10):
        tmplist = []
        for (dir, _, files) in os.walk(dirname):
            for f in files:
                path = os.path.join(dir, f)
                for line in self.extractlines(path):
                    if not self.search_string(line):
                        tmplist.append(line)
        return Counter(tmplist).most_common(topn)

    def escapechars(self, s):
        for c in "\\/.^$*+-?()[]{}|":
            s = s.replace(c, "\\" + c)
        return s

    def list_to_rule(self, strings, rulename, threshold=0.5):
        tmp = "rule " + rulename + "{\n"
        tmp += "    strings:\n"
        for i in range(len(strings)):
            esc = self.escapechars(strings[i])
            tmp += ("$str%d = " % i
                    + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc
                    + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/" + "\n")
        tmp += "condition:\n"
        tmp += str(int(len(strings) * threshold))
        tmp += " of ("
        for i in range(len(strings)):
            tmp += "$str" + str(i)
            if i != (len(strings) - 1):
                tmp += ","
        tmp += ")\n}"
        print("rule = %s.yara is written to disk" % rulename)
        with open(rulename + ".yara", "w") as fp:
            fp.write(tmp)
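# Usage sketch for BloomAutoYara, assuming a directory of known-clean files
# ('goodware/') and a sample to fingerprint ('sample.bin'); all paths are
# hypothetical:
bay = BloomAutoYara('whitelist.bloom')
bay.build_filter('goodware/', extensions=['dll', 'exe'])
top_strings = bay.find_file_topn('sample.bin', topn=10)
bay.list_to_rule(top_strings, 'sample_rule', threshold=0.5)  # writes sample_rule.yara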
with open("wikidata-raw-2018.08.01.ttl") as infile: for line in infile: try: line = line.strip() if line[0] == '#': continue red.incr('linesread5') tokens = line.split(' ') url1 = tokens[0][1:-1] if 'resource' not in url1: continue sid = url1[37:] url2 = tokens[1][1:-1] if 'entity' not in url2: continue pid = url2[31:] bloom1.add(sid + ':' + pid) url3 = tokens[2][1:-1] if 'resource' not in url3: continue oid = url3[37:] bloom2.add(sid + ':' + oid) except Exception as e: print(e) f = open('bloom/wikidatabloom1hoppredicate.pickle', 'wb') bloom1.tofile(f) f.close() f = open('bloom/wikidatabloom1hopentity.pickle', 'wb') bloom2.tofile(f) f.close()
import redis
from pybloom import ScalableBloomFilter

red = redis.StrictRedis(host='localhost', port=6379, db=0)
# originally sized around capacity=200000000, error_rate=0.0001
bloom1hop = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

red.set('linesread7', 0)
with open('wikidata-instance-types-2018.08.01.ttl') as infile:
    for line in infile:
        try:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            red.incr('linesread7')
            tokens = line.split(' ')
            # strip angle brackets and URI prefixes to get the bare identifiers
            s = tokens[0][1:-1][37:]
            o = tokens[2][1:-1][28:]
            bloom1hop.add(s + ':' + o)
        except Exception as e:
            print(e)

with open('bloom/bloom1hoptypeofentity.pickle', 'wb') as f:
    bloom1hop.tofile(f)
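# Round-trip sketch: reload the saved filter and test membership. The key
# format 'entity:type' matches what the loop above inserts; the Q-ids are
# hypothetical.
with open('bloom/bloom1hoptypeofentity.pickle', 'rb') as f:
    restored = ScalableBloomFilter.fromfile(f)
print('Q42:Q5' in restored)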
import os

from pybloom import ScalableBloomFilter

# file_size() is assumed to be a helper defined elsewhere, e.g. a wrapper
# around os.path.getsize() that returns the size in bytes.


class BloomFilter:

    def __init__(self, datafile, filterfile):
        # https://github.com/jaybaird/python-bloomfilter/blob/master/pybloom/pybloom.py
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.datafile = datafile
        self.filterfile = filterfile
        self.itemcount = 0
        self.datafilesize = None
        self.filterfilesize = None
        self.change = None

    def add_to_filter(self, update=False):
        # https://github.com/bigsnarfdude/Malware-Probabilistic-Data-Structres/blob/master/Mandiant_MD5_BloomFilter.py
        def load_file(filename):
            with open(filename) as f:
                return [line.strip() for line in f]

        itemlist = load_file(self.datafile)
        self.itemcount = len(itemlist)
        if not update:
            # reinitialize the filter before adding a new set of items
            self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        for item in itemlist:
            _ = self.filter.add(item)

    def update_filter(self):
        # simulate an update via add_to_filter (the original called self.add,
        # which does not exist)
        self.add_to_filter(update=True)

    def save_to_file(self):
        if self.filter:
            with open(self.filterfile, 'wb') as f:
                self.filter.tofile(f)

    def load_from_file(self):
        del self.filter
        with open(self.filterfile, 'rb') as f:
            self.filter = ScalableBloomFilter.fromfile(f)

    def search_filter(self, item):
        return item in self.filter

    def get_stats(self):
        if self.filter:  # the original tested the builtin `filter` by mistake
            self.datafilesize = file_size(self.datafile)
            self.filterfilesize = file_size(self.filterfile)
            self.change = 100 * (self.filterfilesize - self.datafilesize) / self.datafilesize
            return {
                "initial_capacity": self.filter.initial_capacity,
                "capacity": self.filter.capacity,
                "count": self.filter.count,
                "ratio": self.filter.ratio,
                "scale": self.filter.scale,
                "datafile": self.datafile,
                "filterfile": self.filterfile,
                "datafilesize": self.datafilesize,
                "filterfilesize": self.filterfilesize,
                "change": self.change,
            }
        return None
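# Usage sketch, assuming 'md5s.txt' holds one item per line (both file names
# are hypothetical):
bf = BloomFilter('md5s.txt', 'md5s.bloom')
bf.add_to_filter()
bf.save_to_file()
print(bf.search_filter('d41d8cd98f00b204e9800998ecf8427e'))  # True if listed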
# Tail of the first pass over the dump, which builds d1: a dict mapping each
# entity URI to the set of predicates it appears with. The enclosing loop is
# elided in the source; `red` and `bloom` are the redis client and
# ScalableBloomFilter created earlier.
        d1[s].add(p)
        if o not in d1:
            d1[o] = set()
        d1[o].add(p)

red.set('linesread7', 0)
with open("wikidata-raw-2018.08.01.ttl") as infile:
    for line in infile:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        red.incr('linesread7')
        tokens = line.split(' ')
        s = tokens[0][1:-1]
        p = tokens[1][1:-1]
        o = tokens[2][1:-1]
        if 'wikidata.dbpedia.org/resource' not in o:
            continue
        # second hop: propagate predicates across the shared neighbour
        if o in d1:
            for pred in d1[o]:
                bloom.add(s + ':' + pred)
        if s in d1:
            for pred in d1[s]:
                bloom.add(o + ':' + pred)

with open('bloom/bloom2hoppredicate.pickle', 'wb') as f:
    bloom.tofile(f)
import csv

from pybloom import ScalableBloomFilter

# Tail of the CVX importer; the other import_* helpers and the constants
# (CVX, CVX_PATH, BF_PATH, INITIAL_CAPACITY, ERROR_RATE, fieldnames) are
# defined elsewhere in this module.
def import_cvx(bf):
    with open(CVX_PATH, encoding='utf-16') as handle:
        reader = csv.DictReader(handle, delimiter='|', fieldnames=fieldnames)
        for row in reader:
            bf.add(CVX + '|' + row['cvx code'].strip())

try:
    # If the bloom filter already exists, we're probably just appending to it
    with open(BF_PATH, 'rb') as handle:
        bf = ScalableBloomFilter.fromfile(handle)
except FileNotFoundError:
    # If it doesn't, we need to make one
    bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH,
                             initial_capacity=INITIAL_CAPACITY,
                             error_rate=ERROR_RATE)

import_loinc(bf)
import_snomed(bf)
import_rxnorm(bf)
import_icd9(bf)
import_icd10(bf)
import_cpt(bf)
import_fhir(bf)
import_daf(bf)
import_argo(bf)
import_cvx(bf)

if __name__ == '__main__':
    with open(BF_PATH, 'wb') as handle:
        bf.tofile(handle)