def second_new_warn_entity():
    minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE, ILLEGAL_TYPE,
                           TABLE_REPORT_ILLEGAL)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        # age of the entity's first warning, in whole days (floor division keeps
        # the original integer semantics under Python 3)
        dValue = int((row_monitor_date - dateTime).total_seconds()) // 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectFromBigTable(90, TABLE_REPORT_ILLEGAL, RISK_LEVEL,
                                        ILLEGAL_SCORE, 'all', 0, 0, 'all', 'all',
                                        TABLE_LOGS, 'all')
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if each['entity_id'] not in resultIds:
            resultIds.append(each['entity_id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
class URLFilter(object):
    lock = RLock()

    def __init__(self):
        self.forbidden_keys = ['video', 'facebook', 'youtube', 'twitter', 'instagram',
                               'tv', 'amazon', 'ebay', 'photo', 'image', 'game',
                               'shop', 'foursquare']
        self.seen = ScalableBloomFilter(initial_capacity=10000,
                                        mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def forbidden_key_word(self, url):
        for key_word in self.forbidden_keys:
            if key_word in url:
                log.debug('## FORBIDDEN: {}'.format(url))
                return False
        return True

    @staticmethod
    def is_english(url):
        try:
            url.decode('ascii')
        except UnicodeDecodeError:
            log.debug('## NON-ENGLISH PAGE DETECTED: {}'.format(url))
            return False
        else:
            return True

    def pass_check(self, url):
        with URLFilter.lock:
            if url in self.seen:
                log.debug('## SEEN: {}'.format(url))
                return False
            self.seen.add(url)
            return self.forbidden_key_word(url) and self.is_english(url)
class FileBloom(object):
    def __init__(self):
        self.file_path = "bloom/bloom_weibo.txt"
        self.bloom_filter = ScalableBloomFilter(initial_capacity=10000, error_rate=0.001)

    def read_bloom(self):
        if os.path.exists(self.file_path):
            f = open(self.file_path, "r")
            ids = f.readlines()
            for id in ids:
                id_s = id.strip()
                self.bloom_filter.add(id_s)
            f.close()
        else:
            f = open(self.file_path, "w")
            f.close()

    def to_file(self):
        pass

    def update_bloom_file(self, m_id):
        f = open(self.file_path, "a")
        f.write(str(m_id) + "\n")
        f.close()

    def update_bloom(self, m_id):
        self.bloom_filter.add(m_id)

    def has_id(self, m_id):
        return m_id in self.bloom_filter
def vacuum_all(self, limit=None):
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    self.session.begin(subtransactions=True)
    ts = self.term_stat('SupplierCatalogItemVersion Vacuum', len(self.plugins))
    # a ScalableBloomFilter stands in for a plain set() of catalog ids
    s = ScalableBloomFilter()
    query = self.session.query(SupplierCatalogModel.id)
    for (supplier_catalog_id, ) in query.yield_per(100):
        s.add(supplier_catalog_id)
    for plug in self.plugins.itervalues():
        supplier_catalog_filter_id = plug.supplier_catalog_filter_id()
        model_name = plug.version_model() + 'Model'
        VersionModel = getattr(model, model_name)
        query = self.session.query(VersionModel)
        if limit:
            query = query.order_by(VersionModel.vacuumed.nullsfirst())
            query = query.limit(limit)
        ts['sub_done'] = 0
        ts['sub_total'] = query.count()
        for supplier_catalog_item_version in query.yield_per(10):
            if supplier_catalog_item_version.supplier_catalog_id not in s:
                logger.debug("Deleting %s %s", model_name,
                             supplier_catalog_item_version.id)
                self.session.delete(supplier_catalog_item_version)
            ts['sub_done'] += 1
        ts['done'] += 1
    self.session.commit()
    ts.finish()
    logger.debug('End vacuum_all()')
class kmer_store:
    def __init__(self):
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=1000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.kmers = {}

    def update(self, item):
        # The bloom filter remembers singletons; an exact count is only kept
        # once a k-mer has been seen at least twice.
        if item in self.bloom_filter:
            if item in self.kmers:
                self.kmers[item] += 1
            else:
                self.kmers[item] = 2
        else:
            self.bloom_filter.add(item)

    def __iter__(self):
        for key in self.kmers:
            yield key

    def __getitem__(self, key):
        return self.kmers[key]

    def __repr__(self):
        return str(self.kmers)

    def __str__(self):
        return str(self.kmers)
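# A minimal usage sketch for kmer_store above (assumes pybloom is installed and
# kmer_store is importable; note that a bloom false positive on a k-mer's first
# sighting would start its count at 2, so counts are approximate by design):
def demo_kmer_store():
    store = kmer_store()
    seq = "ACGTACGTAC"
    k = 4
    for i in range(len(seq) - k + 1):
        store.update(seq[i:i + k])
    for kmer in store:
        # only k-mers seen at least twice are reported
        print(kmer, store[kmer])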
def dedup_lines_bloom(text, just_words=True, zero_digits=True,
                      capacity=100000, error=0.00001):
    sbf = ScalableBloomFilter(initial_capacity=capacity, error_rate=error,
                              mode=ScalableBloomFilter.LARGE_SET_GROWTH)
    for line in text:
        if not isinstance(line, str):
            raise TypeError(
                'Expected "text" to contain strings, found: {}'.format(type(line)))
        key = line.strip()
        if not key:
            # blank lines are passed through untouched
            yield line
            continue
        key = normalize('NFKD', key)
        if just_words:
            key = ' '.join(re.findall(r'\w+', key))
        if zero_digits:
            key = re.sub(r'\d', '0', key)
        if key in sbf:
            line = ''
        else:
            sbf.add(key)
        yield line
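# A small demonstration of dedup_lines_bloom (input values are illustrative;
# the defining module is assumed to import `re` and unicodedata's `normalize`):
lines = ["Order 123 shipped\n", "Order 456 shipped\n", "\n", "unique line\n"]
for out in dedup_lines_bloom(lines):
    # the second "Order ... shipped" collapses to '' because zero_digits maps
    # both keys to "Order 000 shipped"
    print(repr(out))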
def against_detect_data_from_bigtable():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = againstDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL, RISK_LEVEL,
                                           ILLEGAL_SCORE, operation_mode, illegal_type,
                                           entity_type, warn_distribute, problem,
                                           TABLE_LOGS, fund_mode)
    # merge rows that describe the same entity
    doubleId = []
    for row in result:
        if row['entity_id'] not in b:
            b.add(row['entity_id'])
        else:
            doubleId.append(row['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        # iterate over a copy so removal does not skip elements
        for row in list(result):
            if row['entity_id'] == id:
                num += 1
                illegalTypeList.append(row['illegal_type'])
                row.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(row)
    # keep only newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE, ILLEGAL_TYPE,
                               TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) // 86400
            if 0 <= dValue < date:
                bb.add(i)
        for row in result:
            if row['entity_id'] in bb:
                newResult.append(row)
        # the frontend sends 'id', so mirror entity_id into it to avoid errors
        for row in result:
            row.update({'id': row['entity_id']})
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except Exception:
        pass
    # the frontend sends 'id', so mirror entity_id into it to avoid errors
    for row in result:
        row.update({'id': row['entity_id']})
    return json.dumps(result, ensure_ascii=False)
class BloomMembership(GenericMembership):
    def __init__(self, max_size: int, error_rate: float):
        self.bloom_filter = ScalableBloomFilter(max_size, error_rate)

    def add(self, key: str):
        self.bloom_filter.add(key)

    def __contains__(self, key: str) -> bool:
        return key in self.bloom_filter
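# A minimal usage sketch for BloomMembership (GenericMembership is assumed to
# be an abstract base defined elsewhere in this codebase; like any bloom
# filter, membership answers may include false positives, never false negatives):
members = BloomMembership(max_size=1000, error_rate=0.001)
members.add("user:42")
assert "user:42" in members
print("user:99" in members)  # almost always False at this error rate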
class WishPipeline(object):
    def __init__(self):
        self.urls = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if item is None or item['url'] is None or item['url'] in self.urls:
            raise DropItem("Duplicate item found.")
        else:
            self.urls.add(item['url'])
            return item
class URLBloomFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode("utf8"))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
def ParseQueue():
    # Load the checked-urls file
    if os.path.isfile(path_checked_url_file):
        with open(path_checked_url_file, 'rb') as rf:
            checked_url_pool = ScalableBloomFilter.fromfile(rf)
        print("bf: Read pybloom from %s.\n" % path_checked_url_file)
    else:
        checked_url_pool = ScalableBloomFilter(
            initial_capacity=1000, error_rate=0.001,
            mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        print("bf: Create pybloom")
    # Get each item from the queue
    i = 1
    # URL_QUEUE.put_nowait(None)  # sign the end of Queue
    # for item in iter(URL_QUEUE.get_nowait, None):
    #     cur_url = item[2]
    URL_DEQUE.appendleft(None)
    for item in iter(URL_DEQUE.pop, None):
        cur_url = item[2]
        if cur_url not in checked_url_pool:  # cur_url never checked
            try:
                time.sleep(0.3)
                page_html_raw = requests.get(cur_url, timeout=3)
            except requests.RequestException as e:
                print(e)
                # URL_DEQUE.appendleft(cur_url)
                with open(path_requestErr_log, 'a') as f_requestErr:
                    f_requestErr.write(
                        time.strftime('%Y-%m-%d %H:%M:%S',
                                      time.localtime(time.time()))
                        + "Timeout " + cur_url + '\n')
            else:
                page_html = page_html_raw.content.decode('utf-8', 'ignore')
                buffer = parser4me.parser_4_1(item, page_html)
                with open(path_output_folder + os.path.sep + item[1] + item[0][0:128] + ".txt",
                          'w', encoding='utf-8') as resf:
                    resf.write(buffer)
                print("%s OK! to file %s" % (i, item[0]))
                checked_url_pool.add(cur_url)
                i += 1
        else:
            print("Skip %s" % i)
            i += 1
    with open(path_checked_url_file, 'wb') as wf:
        checked_url_pool.tofile(wf)
def second_detect_data():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')
    problem = request.args.get('problem', '')
    newEntity = int(request.args.get('newEntity', ''))
    result = secondDetectData(date, TABLE_ENTITY_LIST, TABLE_MONITOR, TABLE_GONGSHANG,
                              RISK_LEVEL, ILLEGAL_SCORE, operation_mode, illegal_type,
                              entity_type, warn_distribute, problem,
                              TABLE_INDEX_QUANTILE, TABLE_GUARANTEE_PROMISE, TABLE_LOGS)
    # merge rows that describe the same entity
    doubleId = []
    for row in result:
        if row['id'] not in b:
            b.add(row['id'])
        else:
            doubleId.append(row['id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        # iterate over a copy so removal does not skip elements
        for row in list(result):
            if row['id'] == id:
                num += 1
                illegalTypeList.append(row['illegal_type'])
                row.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(row)
    # keep only newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE, ILLEGAL_TYPE,
                               TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) // 86400
            if dValue < date:
                bb.add(i)
        for row in result:
            if row['id'] in bb:
                newResult.append(row)
        return json.dumps(newResult, ensure_ascii=False)
    try:
        result.sort(key=lambda x: x['datetime'], reverse=True)
    except Exception:
        pass
    return json.dumps(result, ensure_ascii=False)
class RequestFilter(object):
    """RequestFilter"""

    def __init__(self):
        self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def request_seen(self, request):
        """Return True if this request's fingerprint was seen before."""
        finger = request_fingerprint(request)
        if finger in self.sbf:
            return True
        self.sbf.add(finger)
        return False
class URLBloomFilter(RFPDupeFilter):
    """Deduplicate requests by url hash, using a bloom filter."""

    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
def total_detect_data_test():
    b = ScalableBloomFilter(1000000, 0.001)
    date = int(request.args.get('date', ''))
    operation_mode = request.args.get('operation_mode', '')    # multi-select
    illegal_type = int(request.args.get('illegal_type', ''))
    entity_type = int(request.args.get('entity_type', ''))
    warn_distribute = request.args.get('warn_distribute', '')  # multi-select
    problem = request.args.get('problem', '')                  # multi-select
    newEntity = int(request.args.get('newEntity', ''))
    checked = int(request.args.get('checked', ''))
    fund_mode = request.args.get('fund_mode', '')
    result = totalDetectDataFromBigTable(date, TABLE_REPORT_ILLEGAL, operation_mode,
                                         illegal_type, entity_type, warn_distribute,
                                         problem, checked, fund_mode)
    # merge two records that share an entity but differ in illegal_type
    doubleId = []
    for row in result:
        if row['entity_id'] not in b:
            b.add(row['entity_id'])
        else:
            doubleId.append(row['entity_id'])
    for id in doubleId:
        num = 0
        illegalTypeList = []
        # iterate over a copy so removal does not skip elements
        for row in list(result):
            if row['entity_id'] == id:
                num += 1
                illegalTypeList.append(row['illegal_type'])
                row.update({'illegal_type': illegalTypeList})
                if num > 1:
                    result.remove(row)
    # keep only newly added entities
    if newEntity:
        bb = ScalableBloomFilter(1000000, 0.001)
        newResult = []
        minDates = getMinDate1(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE, ILLEGAL_TYPE,
                               TABLE_REPORT_ILLEGAL)
        row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
        for i, k in minDates.items():
            dateTime = datetime.strptime(k, '%Y-%m-%d')
            dValue = int((row_monitor_date - dateTime).total_seconds()) // 86400
            if 0 <= dValue < date:
                bb.add(i)
        for row in result:
            if row['entity_id'] in bb:
                newResult.append(row)
        return json.dumps(newResult, ensure_ascii=False)
    return json.dumps(result, ensure_ascii=False)
def to_bloomfilter(iterable, init_cap=200, err_rate=0.001):
    """
    Convert the iterable into a ScalableBloomFilter.

    :rtype: pybloom.ScalableBloomFilter
    :param iterable: elements to add to the filter
    :param init_cap: initial capacity of the filter
    :param err_rate: target false-positive rate
    """
    bloom = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        bloom.add(element)
    return bloom
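# A quick usage sketch for to_bloomfilter (values are illustrative):
seen = to_bloomfilter(["a", "b", "c"])
print("a" in seen)  # True
print("z" in seen)  # False, up to the configured false-positive rate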
class DuplicateItemFilterPipeline(Pipeline):
    # the bloom filter is serialized to this file between runs
    fileName = "DuplicateItemFilter.dat"

    def open_spider(self, spider):
        self.fileName = spider.name + self.fileName
        if os.path.exists(self.fileName):
            with open(self.fileName, 'rb') as f:
                self.sbf = ScalableBloomFilter.fromfile(f)
        else:
            self.sbf = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def close_spider(self, spider):
        with open(self.fileName, 'wb') as f:
            # tofile() returns None, so don't assign its result back to self.sbf
            self.sbf.tofile(f)

    def process_item(self, item, spider):
        fp = hashlib.sha1()
        for key in item.keys():
            # skip crawl time and source url when fingerprinting
            if key not in ['curlDate', 'reference'] and item[key] is not None:
                fp.update(item[key])
        fpValue = fp.hexdigest()
        if not self.sbf.add(fpValue):  # add() returns True if already present
            return item
        else:
            raise DropItem("duplicate item :/n %s" % item)
class BloomPipeline(object):
    def __init__(self, bloomfile, spider_name):
        self.bloomfile = bloomfile
        self.spider_name = spider_name
        # items crawled on previous runs
        logger.info("loading crawled items before...")
        if os.path.isfile(self.bloomfile):
            f = open(self.bloomfile, 'rb')  # tofile()/fromfile() need binary mode
            self.item_crawled = ScalableBloomFilter.fromfile(f)
            f.close()
        else:
            self.item_crawled = ScalableBloomFilter(
                100000000, 0.001, mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        cnt = self.item_crawled.count
        logger.info("pipeline read %d crawled items" % cnt)

    def __del__(self):
        f = open(self.bloomfile, 'wb')
        self.item_crawled.tofile(f)
        f.close()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            # mongo_uri=crawler.settings.get('MONGODB_ADDRESS'),
            # e.g. bloomfile = "/root/dev/SocialSpider/data/weibotv/bloomfile"
            bloomfile=crawler.settings.get('BLOOM_FILE'),
            spider_name=crawler.spidercls.name)

    def process_item(self, item, spider):
        # if not item['md5']:
        #     md5 = hashlib.md5("%s%s%s" % (item['title'].encode('utf-8'),
        #                                   item['url'].encode('utf-8'))).hexdigest()
        #     item['md5'] = md5
        item_id = ''
        if self.spider_name == 'weibotv':
            item_id = item['mid']
        elif self.spider_name == 'toutiao':
            item_id = item['Url']
            # item_id = hashlib.md5("%s" % (item['Url'].encode('utf-8'))).hexdigest()
        elif self.spider_name == 'anyvspider':
            item_id = item['pid']
        # add() returns True if the id was already present
        valid = not self.item_crawled.add(item_id)
        if valid:
            logger.info("item: %s wrote to bloomfile %s"
                        % (item_id.encode('utf-8'), self.bloomfile))
            return item
        else:
            logger.info("item dropped %s " % item_id.encode('utf-8'))
class UrlFilter(RFPDupeFilter):
    def __init__(self, path=None, debug=False):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path, debug)

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url).encode('utf-8'))
        url_sha1 = fp.hexdigest()
        if url_sha1 not in self.urls_sbf and not mysqldb.queryItem(request.url):
            self.urls_sbf.add(url_sha1)
        else:
            return True
class URLBloomFilter(RFPDupeFilter):
    # deduplicate requests by url hash, using a bloom filter
    def __init__(self, path=None):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        RFPDupeFilter.__init__(self, path)

    def request_seen(self, request):
        # create a sha1 hash instance
        fp = hashlib.sha1()
        # hash the canonicalized url (the same page can be requested
        # under differently formatted urls)
        fp.update(canonicalize_url(request.url))
        # sha1 digest of the url
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
def add_sbf(self, query=None):
    '''
    :param query: mysql query whose results seed the filter, used to
                  skip tasks that were already processed
    '''
    if query is None:
        return None
    sbf = ScalableBloomFilter()
    table = Table(logger=self.logger)
    result_dict = table.execute(query=query)
    data = result_dict.get('data')
    for each in data:
        id = each.get('id')
        sbf.add(int(id))
    table.close()
    return sbf
def count_distinct_approx(iterable, init_cap=200, err_rate=0.001):
    """
    Count the number of distinct elements in an iterable.

    This implementation uses a bloom filter to approximate the number
    of distinct values found in the iterable.

    :param iterable: elements to count
    :param init_cap: initial capacity of the filter
    :param err_rate: target false-positive rate
    """
    counter = 0
    set_of_distinct_values = ScalableBloomFilter(init_cap, err_rate)
    for element in iterable:
        if element not in set_of_distinct_values:
            set_of_distinct_values.add(element)
            counter += 1
    return counter
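# Usage sketch (illustrative values): the estimate can undercount when the
# filter reports a false positive for an element that was never added.
print(count_distinct_approx([1, 2, 2, 3, 3, 3]))  # 3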
def get_city_rank(table, table4, field, province_name, risk_level):
    cur = defaultDatabase()
    city_list = []
    ranks = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start_time = start_time.strftime("%Y-%m-%d")
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = ('select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd '
            'inner join %s as gs on pd.entity_id=gs.entity_id '
            'where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" '
            'and illegal_type>0 and risk_level>%d group by province,city'
            % (table, table4, table4, start_time, end_time, risk_level))
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = ('select pd.illegal_type,gs.province,gs.city,count(*) from %s as pd '
            'inner join %s as gs on pd.entity_id=gs.entity_id '
            'where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" '
            'and illegal_type>0 and risk_level>%d group by province,city'
            % (table, table4, table4, start_time1, end_time, risk_level))
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['city'] not in b:
            b.add(p['city'])
            city_list.append({'province': p['province'], 'city': p['city']})
    for d in city_list:
        if d['province'] not in province_list:
            province_list.append(d['province'])
    if province_name:
        for d in city_list:
            if d['province'] == province_name and d['city']:
                pro_dict = {"province": d['province'], "city": d['city']}
                for row in result1:
                    if row['city'] == d['city']:
                        pro_dict.update({'count7': row['count']})
                for row in result2:
                    if row['city'] == d['city']:
                        pro_dict.update({'count30': row['count']})
                ranks.append(pro_dict)
    if not province_name:
        for p in province_list:
            if p:
                pro_dict = {"province": p}
                count = 0
                for row in result1:
                    if row['province'] == p:
                        count += row['count']
                pro_dict.update({"count": count})
                ranks.append(pro_dict)
    return ranks
class BloomSet(object):
    def __init__(self, initial_capacity=1000, error_rate=0.0001):
        self._set = ScalableBloomFilter(initial_capacity=initial_capacity,
                                        error_rate=error_rate,
                                        mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        # False positives in the Bloom filter will cause us to fail to
        # garbage-collect an object. Salt the Bloom filter to ensure
        # that we get a different set of false positives on every run.
        self._bloom_salt = os.urandom(2)

    def add(self, name):
        self._set.add(self._bloom_key(name))

    def __contains__(self, name):
        # May return false positives.
        return self._bloom_key(name) in self._set

    def _bloom_key(self, name):
        if isinstance(name, unicode):
            name = name.encode('utf-8')
        return self._bloom_salt + name
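# Sketch of why the salt matters, in a garbage-collection loop like the one the
# comment above describes (Python 2 to match the `unicode` check; the iterables
# and delete_object() hook below are hypothetical stand-ins). A name that
# falsely appears "live" this run will almost certainly hash differently under
# the next run's fresh salt, so the missed collection is not repeated forever.
survivors = BloomSet()
for name in live_object_names:   # hypothetical: names still reachable
    survivors.add(name)
for name in all_object_names:    # hypothetical: every stored name
    if name not in survivors:
        delete_object(name)      # hypothetical cleanup hook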
def main(args):
    seenUrlSet = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for ln in sys.stdin:
        if not ln:
            continue
        fetchedUrl = json.loads(ln)
        # continue if we've seen this url already
        if fetchedUrl["url"] in seenUrlSet or fetchedUrl["effective_url"] in seenUrlSet:
            continue
        # add unseen urls to the url set
        seenUrlSet.add(fetchedUrl["url"])
        seenUrlSet.add(fetchedUrl["effective_url"])
        # extract links and filter out some urls by url filter
        outlinks = url_filter(extract_links(fetchedUrl))
        # analyze
        print "[postproc]%s" % fetchedUrl["url"]
class LibsPoiPipeline(object):
    filter_prefix = 'POI_'

    def __init__(self):
        self.files = {}
        self.file_path = './data/libs_poi.%d.csv' % int(time.time())
        self.filter = ScalableBloomFilter(initial_capacity=1024, error_rate=0.001,
                                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open(self.file_path, 'a+b')
        self.files[spider] = file
        kwargs = {
            'fields_to_export': [
                'lid', 'name', 'tag', 'ltype', 'typecode', 'biz_type', 'address',
                'lng', 'lat', 'tel', 'postcode', 'website', 'email', 'pcode',
                'pname', 'citycode', 'cityname', 'adcode', 'adname', 'importance',
                'shopid', 'shopinfo', 'poiweight', 'gridcode', 'distance',
                'navi_poiid', 'entr_lng', 'entr_lat', 'business_area',
                'exit_location', 'match', 'recommend', 'timestamp', 'alias',
                'indoor_map', 'cpid', 'floor', 'truefloor', 'groupbuy_num',
                'discount_num', 'rating', 'cost', 'event', 'children']}
        self.exporter = CsvItemExporter(file, include_headers_line=False, **kwargs)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        print("spider closed!")

    def process_item(self, item, spider):
        if isinstance(item, LibsPoiItem):
            # skip items whose lid was already exported
            if self.filter_prefix + item.get('lid') in self.filter:
                return
            self.exporter.export_item(item)
            self.filter.add(self.filter_prefix + item.get('lid'))
        return item
def vacuum_all(self, limit=None):
    logger.debug('Begin vacuum_all(limit=%s)', limit)
    self.plugins = self.load_plugins()
    ts = self.term_stat('SupplierSpecialItemVersion Vacuum', len(self.plugins))
    tx = transaction.get()
    try:
        # a ScalableBloomFilter stands in for a plain set() of special ids
        s = ScalableBloomFilter()
        query = DBSession.query(SupplierSpecialModel.id)
        for (supplier_special_id, ) in query.yield_per(100):
            s.add(supplier_special_id)
        for plug in self.plugins.itervalues():
            supplier_special_filter_id = plug.supplier_special_filter_id()
            model_name = plug.version_model() + 'Model'
            VersionModel = getattr(model, model_name)
            query = DBSession.query(VersionModel)
            if limit:
                query = query.order_by(VersionModel.vacuumed.nullsfirst())
                query = query.limit(limit)
            ts['sub_done'] = 0
            ts['sub_total'] = query.count()
            for supplier_special_item_version in query.yield_per(10):
                if supplier_special_item_version.supplier_special_id not in s:
                    logger.debug("Deleting %s %s", model_name,
                                 supplier_special_item_version.id)
                    DBSession.delete(supplier_special_item_version)
                ts['sub_done'] += 1
                if ts['sub_done'] % 1000 == 0:
                    DBSession.flush()
            DBSession.flush()
            ts['done'] += 1
    except Exception:
        logger.exception('Caught Exception: ')
        tx.abort()
    finally:
        ts.finish()
    transaction.commit()
    logger.debug('End vacuum_all()')
class URLBloomFilter(BaseDupeFilter):
    def __init__(self, host, port):
        self.urls_sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.host = host
        self.port = port
        self.client = redis.Redis(self.host, self.port)

    @classmethod
    def from_settings(cls, settings):
        return cls(host=settings.get('FILTER_HOST'),
                   port=settings.get('FILTER_PORT'))

    def request_seen(self, request):
        fp = hashlib.sha1()
        fp.update(canonicalize_url(request.url))
        url_sha1 = fp.hexdigest()
        if url_sha1 in self.urls_sbf:
            return True
        else:
            self.urls_sbf.add(url_sha1)
def test_bloom_string(self):
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in xrange(0, 10000):
        rnd = ''.join(random.choice(string.letters) for i in xrange(40))
        _ = f.add(rnd)
        self.assertEqual(rnd in f, True)
    for i in string.letters:
        self.assertEqual(i in f, False)
    self.assertEqual(rnd in f, True)
def main():
    bloom = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    random = SystemRandom()
    print('Sample hashes:')
    for i in range(0, 10000):
        random_hash = hex(random.getrandbits(256))
        bloom.add(random_hash)
        if i % 1000 == 0:
            print(random_hash)
    print(f'~{len(bloom)} hashes added to bloom filter.')
    print()
    try:
        while True:
            user_hash = input('Enter hash to check: ')
            if not user_hash:
                break
            print(user_hash in bloom)
    except (EOFError, KeyboardInterrupt):
        pass
def second_new_warn_entity():
    minDates = getMinDate(TABLE_MONITOR, RISK_LEVEL, ILLEGAL_SCORE)
    row_monitor_date = datetime.strptime(monitor_date, '%Y-%m-%d')
    b7 = ScalableBloomFilter(100000, 0.001)
    b30 = ScalableBloomFilter(100000, 0.001)
    b90 = ScalableBloomFilter(100000, 0.001)
    for i, k in minDates.items():
        dateTime = datetime.strptime(k, '%Y-%m-%d')
        # age of the entity's first warning, in whole days
        dValue = int((row_monitor_date - dateTime).total_seconds()) // 86400
        if 0 <= dValue < 7:
            b7.add(i)
        if 0 <= dValue < 30:
            b30.add(i)
        if 0 <= dValue < 90:
            b90.add(i)
    result90 = secondDetectData(90, TABLE_ENTITY_LIST, TABLE_MONITOR, TABLE_GONGSHANG,
                                RISK_LEVEL, ILLEGAL_SCORE, 'all', 0, 0, 'all', 'all',
                                TABLE_INDEX_QUANTILE, TABLE_GUARANTEE_PROMISE)
    count7 = 0
    count30 = 0
    count90 = 0
    resultIds = []
    for each in result90:
        if each['id'] not in resultIds:
            resultIds.append(each['id'])
    for id in resultIds:
        if id in b7:
            count7 += 1
        if id in b30:
            count30 += 1
        if id in b90:
            count90 += 1
    result = {'count7': count7, 'count30': count30, 'count90': count90}
    return json.dumps(result, ensure_ascii=False)
class DuplicatePipeline(object):
    def __init__(self):
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def process_item(self, item, spider):
        if isinstance(item, UrlItem):
            uid = '{}{}{}'.format(spider.prefix, spider.name, item['url'])
        else:
            uid = '{}{}{}'.format(spider.prefix, spider.name, item['_id'])
        if self.filter.add(uid):  # add() returns True if uid was already present
            raise DropItem('duplicate item found')
        else:
            return item
class StockTileExclusions(object):
    """
    Object that keeps track of which stock tiles have already been used.
    """

    def __init__(self, source_image):
        self.source_image = source_image
        self.bloom_filter = ScalableBloomFilter(
            initial_capacity=source_image.tiles.count(),
            error_rate=0.0001,  # 1 in 10,000
        )
        existing_matches = source_image.tiles.values_list('pk', 'stock_tile_match')
        for tile_id, existing_match_id in existing_matches:
            self.bloom_filter.add((tile_id, existing_match_id))

    def __contains__(self, key):
        if key in self.bloom_filter:
            return True
        elif self.source_image.tiles.filter(stock_tile_match_id=key[1]).exists():
            self.add(key)
            return True
        return False

    def add(self, key):
        self.bloom_filter.add(key)
def test_bloom_int(self):
    f = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for i in xrange(0, 10000):
        _ = f.add(i)
    for i in xrange(0, 10000):
        self.assertEqual(i in f, True)
    for i in xrange(0, 10000 / 2):
        r = random.randint(0, 10000 - 1)
        self.assertEqual(r in f, True)
    for i in xrange(0, 10000 / 2):
        r = random.randint(10000, 10000 * 2)
        # can flake on a bloom false positive, but only with low probability
        self.assertEqual(r in f, False)
def get_province_rank(table, table4, field, risk_level):
    cur = defaultDatabase()
    ranks = []
    province_list = []
    sql = "select max(date) from %s" % table
    cur.execute(sql)
    end_time = cur.fetchall()[0][0]
    start0_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=7)
    start1_time = datetime.strptime(end_time, "%Y-%m-%d") - timedelta(days=30)
    start_time0 = start0_time.strftime("%Y-%m-%d")
    start_time1 = start1_time.strftime("%Y-%m-%d")
    sql1 = ('select gs.province,count(*) from %s as pd '
            'inner join %s as gs on pd.entity_id=gs.entity_id '
            'where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" '
            'and illegal_type>0 and risk_level>%d group by province'
            % (table, table4, table4, start_time0, end_time, risk_level))
    cur.execute(sql1)
    res1 = cur.fetchall()
    result1 = [{k: row[i] for i, k in enumerate(field)} for row in res1]
    sql2 = ('select gs.province,count(*) from %s as pd '
            'inner join %s as gs on pd.entity_id=gs.entity_id '
            'where gs.date=(select max(date) from %s) and pd.date>"%s" and pd.date<="%s" '
            'and illegal_type>0 and risk_level>%d group by province'
            % (table, table4, table4, start_time1, end_time, risk_level))
    cur.execute(sql2)
    res2 = cur.fetchall()
    result2 = [{k: row[i] for i, k in enumerate(field)} for row in res2]
    result = result1 + result2
    b = ScalableBloomFilter(1000000, 0.001)
    for p in result:
        if p['province'] not in b:
            b.add(p['province'])
            province_list.append(p['province'])
    for d in province_list:
        if d:
            pro_dict = {"province": d}
            for row in result1:
                if row['province'] == d:
                    pro_dict.update({'count7': row['count']})
            for row in result2:
                if row['province'] == d:
                    pro_dict.update({'count30': row['count']})
            ranks.append(pro_dict)
    for li in ranks:
        li.setdefault('count7', 0)
    return ranks
def getHotSpot(entity_list):
    type = 'type1'
    results = []
    number = 0
    for entity in entity_list:
        indexB = ScalableBloomFilter(1000, 0.001)
        for index_name in ['bbs', 'forum', 'webo']:
            query_body = {
                "sort": {"publish_time": {"order": "desc"}},
                "query": {
                    "bool": {
                        "must": [
                            {"match": {"content": entity['name']}},
                            {"match": {"em1": 1}}
                        ]
                    }
                }
            }
            res = es.search(index=index_name, doc_type=type, body=query_body,
                            request_timeout=100)
            hits = res['hits']['hits']
            for item in hits:
                if entity['name'] in item['_source']['content']:
                    # take at most one hit per index, and at most 10 overall
                    if index_name not in indexB:
                        if number < 10:
                            id = entity['id']
                            entity_name = entity['name']
                            entity_type = entity['entity_type']
                            content = item['_source']['content']
                            results.append({'id': id, 'name': entity_name,
                                            'content': content,
                                            'entity_type': entity_type})
                            indexB.add(index_name)
                            number += 1
        if not number < 10:
            break
    return results
class OrderSpider(RedisSpider):
    name = "order"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )
    prefix = ''
    ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def __init__(self):
        self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.orders = dict()
        self.redis_queue = None

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        OrderSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:order'.format(OrderSpider.prefix)
        self.redis_queue = self.get_queue()
        db = MongoClient().aliexpress
        for order in db['{}order'.format(OrderSpider.prefix)].find():
            OrderSpider.ids.add(order['_id'])
        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None
            # stop when the queue is exhausted or the product id is unseen
            if not (url and OrderSpider.ids.add(
                    urlparse.parse_qs(urlparse.urlparse(url).query)['productId'][0])):
                break
        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def make_requests_from_url(self, url):
        self.log('request order page: {}'.format(url), logging.INFO)
        parsed = urlparse.urlparse(url)
        product_id = urlparse.parse_qs(parsed.query)['productId'][0]
        return self.request(product_id, url)

    def request(self, product_id, base_url, page=1):
        order_url = '{}&page={}'.format(base_url, page)
        self.log('request order page: {}'.format(order_url), logging.INFO)
        return scrapy.Request(url=order_url,
                              meta={'product_id': product_id,
                                    'base_url': base_url,
                                    'page': page},
                              callback=self.parse)

    def parse(self, response):
        self.log('parse order page: {}'.format(response.url), logging.INFO)
        orders = json.loads(response.body.replace('\\', ''))
        records = [record for record in orders['records']
                   if not self.filter.add(record['id'])]
        if len(records) > 0:
            for record in records:
                date = datetime.strptime(record['date'], '%d %b %Y %H:%M')
                quantity = record['quantity']
                buyer_level = record['buyerAccountPointLeval']
                self.order(response.meta['product_id']).append_order(
                    **{'date': date, 'quantity': quantity, 'buyer_level': buyer_level})
            return self.request(response.meta['product_id'],
                                response.meta['base_url'],
                                int(response.meta['page']) + 1)
        else:
            self.order(response.meta['product_id']).finish_order = True
            return self.pop_order(response.meta['product_id'])

    def order(self, id):
        if id not in self.orders:
            self.orders[id] = Order(id)
        return self.orders[id]

    def pop_order(self, id):
        if self.order(id).is_finish():
            order = self.orders.pop(id)
            self.log('crawl order: {}'.format(order), logging.INFO)
            item = OrderItem()
            item['prefix'] = OrderSpider.prefix
            item['_id'] = order.id
            item['orders'] = order.orders
            return item
class ProductSpider(RedisSpider):
    name = "product"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )
    prefix = ''

    def __init__(self):
        self.products = dict()
        self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)
        self.redis_queue = None

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        ProductSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:product'.format(ProductSpider.prefix)
        self.redis_queue = self.get_queue()
        db = MongoClient().aliexpress
        for product in db['{}product'.format(ProductSpider.prefix)].find():
            self.ids.add(product['url'][product['url'].rfind('/') + 1:product['url'].rfind('.')])
        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None
            # stop when the queue is exhausted or the product id is unseen
            if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])):
                break
        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def parse(self, response):
        self.log('product url: {}'.format(response.url), logging.INFO)
        try:
            store_url = response.css('.shop-name').xpath('a/@href').extract()[0]
            self.log('crawl store url: {}'.format(store_url), logging.INFO)
            store_item = UrlItem()
            store_item['prefix'] = ProductSpider.prefix
            store_item['type'] = 'store'
            store_item['url'] = store_url
            yield store_item
            feedback_base_url = response.xpath('//div[@id="feedback"]/iframe/@thesrc').extract()[0]
            parsed = urlparse.urlparse(feedback_base_url)
            product_id = urlparse.parse_qs(parsed.query)['productId'][0]
            try:
                percent_num = response.css('.percent-num').xpath('text()').extract()[0]
                rantings_text = response.css('.rantings-num').xpath('text()').extract()[0]
                rantings_num = rantings_text[1:rantings_text.index(' ')]
                order_text = response.css('.order-num').xpath('text()').extract()[0]
                order_num = order_text[:order_text.index(' ')]
            except:
                percent_num = 0
                rantings_num = 0
                order_num = 0
            product_item = ProductItem()
            product_item['prefix'] = ProductSpider.prefix
            product_item['_id'] = product_id
            product_item['store'] = store_url
            product_item['url'] = response.url
            product_item['percent_num'] = percent_num
            product_item['rantings_num'] = rantings_num
            product_item['order_num'] = order_num
            yield product_item
            feedback_item = UrlItem()
            feedback_item['prefix'] = ProductSpider.prefix
            feedback_item['type'] = 'feedback'
            feedback_item['url'] = feedback_base_url
            yield feedback_item
            order_item = UrlItem()
            order_item['prefix'] = ProductSpider.prefix
            order_item['type'] = 'order'
            order_item['url'] = ('http://feedback.aliexpress.com/display/'
                                 'evaluationProductDetailAjaxService.htm'
                                 '?productId={}&type=default'.format(product_id))
            yield order_item
        except:
            try:
                product_url = response.meta['redirect_urls'][0]
            except:
                product_url = response.url
                self.log('strange product url: {}'.format(product_url), logging.ERROR)
            finally:
                self.log('meet anti-spider, back product: {}'.format(product_url), logging.INFO)
                url_item = UrlItem()
                url_item['prefix'] = ProductSpider.prefix
                url_item['type'] = 'product'
                url_item['url'] = product_url
                yield url_item
__author__ = 'ztj'

from pybloom import BloomFilter

f = BloomFilter(capacity=10000, error_rate=0.0001)
arr = [f.add(x) for x in range(10)]
print arr
print all([(x in f) for x in range(10)])
print 10 in f
print 5 in f

f = BloomFilter(capacity=1000, error_rate=0.001)
for i in xrange(0, f.capacity):
    _ = f.add(i)
print (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18

from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
count = 10000
for i in xrange(0, count):
    _ = sbf.add(i)
print (1.0 - (len(sbf) / float(count))) <= sbf.error_rate
def run(args):
    """ read FASTQ or SAM and tabulate basic metrics """
    time_start = time.time()
    if args.input.name != '<stdin>':
        bsize = os.path.getsize(args.input.name)
    est_counter = int()
    sample_lengths = list()
    sample_binsizes = list()
    act_nlines = int()
    name, ext = os.path.splitext(args.input.name)
    if (args.leftlimit > 0) and (args.rightlimit > 0):
        if args.rightlimit < args.leftlimit:
            sys.exit("Left limit must be less than right limit.\n")
    if args.type:
        ext = '.' + args.type
    if ext not in ['.fq', '.fastq', '.sam', '.bam', '.gz'] and args.input.name != '<stdin>':
        sys.exit("Input file must end in either .sam, .bam, .fastq, or .fastq.gz\n")

    if args.name:
        sample_name = args.name
    else:
        sample_name = args.input.name

    # estimate the number of lines in args.input if we can
    if ext in ['.fastq', '.fq']:
        with FastqReader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                                 "we estimate {est:,} reads in input file.\n".format(
                                     bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.sam':
        with Reader(open(args.input.name)) as fh:
            for read in fh:
                sample_lengths.append(len(read))
                sample_binsizes.append(len(str(read)))
                est_counter += 1
                if est_counter == 10000:
                    break
            mean_bentry = mean(sample_binsizes)
            mean_len = mean(sample_lengths)
            est_nlines = int(bsize / mean_bentry)
            if not args.quiet:
                sys.stderr.write("At {bytes:.0f} bytes per read of {len:.0f} length "
                                 "we estimate {est:,} reads in input file.\n".format(
                                     bytes=mean_bentry, len=mean_len, est=est_nlines))
    elif ext == '.bam':
        est_nlines = sum(bam_read_count(args.input.name))
        if not args.quiet:
            sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif ext == '.gz':
        if args.binsize:
            n = args.binsize
            est_nlines = None
            if not args.quiet:
                sys.stderr.write("Reading from gzipped file, bin size (-s) "
                                 "set to {binsize:n}.\n".format(binsize=n))
        else:
            sys.stderr.write("Gzipped file detected. Reading file to determine "
                             "bin size (-s).\n")
            p1 = Popen(shlex.split('gzip -dc %s' % args.input.name), stdout=PIPE)
            p2 = Popen(shlex.split('wc -l'), stdin=p1.stdout, stdout=PIPE)
            est_nlines, _ = p2.communicate()
            est_nlines = int(est_nlines) // 4
            if not args.quiet:
                sys.stderr.write("{est:,} reads in input file.\n".format(est=est_nlines))
    elif name == '<stdin>':
        if args.binsize:
            n = args.binsize
        else:
            n = 1
        if not args.quiet:
            sys.stderr.write("Reading from <stdin>, bin size (-s) "
                             "set to {binsize:n}.\n".format(binsize=n))
        est_nlines = None
    if est_nlines is not None:
        # set up factor for sampling bin size
        if args.binsize:
            n = args.binsize
        else:
            nf = math.floor(est_nlines / args.nreads)
            if nf >= 1:
                n = int(nf)
            else:
                n = 1
        if not args.quiet:
            sys.stderr.write("Bin size (-s) set to {binsize:n}.\n".format(binsize=n))

    if ext in ['.sam', '.bam']:
        infile = Reader(args.input)
    else:
        infile = FastqReader(args.input, ext=ext)

    read_len = defaultdict(int)
    cycle_nuc = defaultdict(lambda: defaultdict(int))
    cycle_qual = defaultdict(lambda: defaultdict(int))
    cycle_gc = defaultdict(int)
    cycle_kmers = defaultdict(lambda: defaultdict(int))
    cycle_mismatch = {'C': defaultdict(lambda: defaultdict(int)),
                      'G': defaultdict(lambda: defaultdict(int)),
                      'A': defaultdict(lambda: defaultdict(int)),
                      'T': defaultdict(lambda: defaultdict(int))}

    if args.count_duplicates:
        try:
            from pybloom import ScalableBloomFilter
            bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        except ImportError:
            sys.exit("--count-duplicates option requires 'pybloom' package.\n")

    duplicates = 0
    percent_complete = 10
    reads = infile.subsample(n)

    for read in reads:
        if isinstance(read, Sam):
            if args.aligned_only and not read.mapped:
                continue
            elif args.unaligned_only and read.mapped:
                continue
            if read.reverse:
                seq = read.seq[::-1]
                qual = read.qual[::-1]
            else:
                seq = read.seq
                qual = read.qual
        else:
            seq = read.seq
            qual = read.qual

        # Set up limits
        if (args.leftlimit == 1) and (args.rightlimit < 0):
            pass
        elif (args.leftlimit >= 1) and (args.rightlimit > 0):
            try:
                seq = seq[args.leftlimit - 1:args.rightlimit]
                qual = qual[args.leftlimit - 1:args.rightlimit]
            except IndexError:
                act_nlines += n
                continue
        elif (args.leftlimit > 1) and (args.rightlimit < 0):
            try:
                seq = seq[args.leftlimit - 1:]
                qual = qual[args.leftlimit - 1:]
            except IndexError:
                act_nlines += n
                continue
        if len(seq) == 0:
            act_nlines += n
            continue
        cycle_gc[gc(seq)] += 1

        if args.count_duplicates:
            if seq in bloom_filter:
                duplicates += 1
            else:
                bloom_filter.add(seq)

        for i, (s, q) in enumerate(zip(seq, qual)):
            cycle_nuc[args.leftlimit + i][s] += 1
            cycle_qual[args.leftlimit + i][q] += 1
        read_len[len(qual)] += 1

        for i, kmer in enumerate(window(seq, n=args.kmer)):
            cycle_kmers[args.leftlimit + i][kmer] += 1

        if isinstance(read, Sam) and read.mapped:
            try:
                ref = read.parse_md()
                for i, (s, r) in enumerate(zip(seq, ref)):
                    if s != r:
                        try:
                            cycle_mismatch[r][args.leftlimit + i][s] += 1
                        except KeyError:
                            pass
            except KeyError:
                pass

        if est_nlines is not None:
            if (act_nlines / est_nlines) * 100 >= percent_complete:
                sys.stderr.write("Approximately {0:n}% complete at "
                                 "read {1:,} in {2}\n".format(
                                     percent_complete, act_nlines,
                                     time.strftime('%H:%M:%S',
                                                   time.gmtime(time.time() - time_start))))
                percent_complete += 10
        act_nlines += n

    positions = [k for k in sorted(cycle_qual.keys())]
    depths = [read_len[k] for k in sorted(read_len.keys())]
    basecalls = [cycle_nuc[k].keys() for k in sorted(cycle_nuc.keys())]
    bases = set(list(itertools.chain.from_iterable(basecalls)))
    # nbasecalls = ['\t'.join([str(cycle_nuc[p].get(k, 0)) for k in bases])
    #               for p in sorted(cycle_nuc.keys())]
    map(padbases(bases), cycle_nuc.values())
    quantile_values = [0.05, 0.25, 0.5, 0.75, 0.95]
    quantiles = []
    # replace ASCII quality with integer
    for _, v in sorted(cycle_qual.items()):
        for q in tuple(v.keys()):  # py3 keys are iterators, so build a tuple to avoid recursion
            v[ord(str(q)) - 33] = v.pop(q)
        line = [percentile(v, p) for p in quantile_values]
        quantiles.append(line)

    # build kmer set of known adapter sequences
    adapter_kmers = set()
    for adapter in all_adapter_sequences:
        for kmer in window(adapter, n=args.kmer):
            adapter_kmers.add(kmer)

    # test for nonuniform kmer profiles and calculate obs/exp
    observed_expected = dict()
    all_kmers = [cycle_kmers[k].keys() for k in sorted(cycle_kmers.keys())]
    kmers = set(list(itertools.chain.from_iterable(all_kmers)))
    bad_kmers = []
    sequenced_bases = sum((l * n for l, n in read_len.items()))
    priors = tuple(map(float, args.base_probs.split(',')))
    for kmer in kmers:
        kmer_counts = [(i, cycle_kmers[i][kmer]) for i in sorted(cycle_kmers.keys())]
        expected_fraction = reduce(mul, (p ** kmer.count(b) for b, p in
                                         zip(('A', 'T', 'C', 'G', 'N'), priors)), 1)
        expected = expected_fraction * sequenced_bases
        observed_expected[kmer] = sum((n for _, n in kmer_counts)) / expected
        slope, _, _, p_value, _ = stats.linregress(*zip(*kmer_counts))
        if abs(slope) > 2 and p_value < 0.05:
            bad_kmers.append((kmer, slope, p_value))
    bad_kmers = sorted(bad_kmers, key=lambda x: x[2])[:10]

    pos_gc = [sum([cycle_nuc[i]['C'], cycle_nuc[i]['G']]) /
              sum([cycle_nuc[i]['C'], cycle_nuc[i]['G'],
                   cycle_nuc[i]['A'], cycle_nuc[i]['T']]) * 100
              for i in positions]

    # see http://vita.had.co.nz/papers/tidy-data.pdf
    sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(
        row=sample_name, column='reads', pos='None', value=act_nlines))
    for cycle, count in read_len.items():
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='read_len', pos=cycle, value=count))
    for i, position in enumerate(positions):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q05', pos=position, value=quantiles[i][0]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q25', pos=position, value=quantiles[i][1]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q50', pos=position, value=quantiles[i][2]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q75', pos=position, value=quantiles[i][3]))
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='q95', pos=position, value=quantiles[i][4]))
    for base in bases:
        for position in positions:
            sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
                row=sample_name, column=base, pos=position,
                value=cycle_nuc[position][base]))
    for position in positions:
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='cycle_gc', pos=position,
            value=cycle_gc[position]))
    for i in range(101):
        sys.stdout.write("{row}\t{column}\t{pos:n}\t{value:n}\n".format(
            row=sample_name, column='read_gc', pos=i, value=cycle_gc[i]))
    for kmer, obs_exp in sorted(observed_expected.items(), key=lambda x: x[1]):
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(
            row=sample_name, column=kmer, pos='None', value=obs_exp))
    if args.count_duplicates:
        sys.stdout.write("{row}\t{column}\t{pos}\t{value:n}\n".format(
            row=sample_name, column='duplicate', pos='None',
            value=duplicates / act_nlines))

    from zipfile import ZipFile
    with ZipFile(args.output + '.zip', mode='w') as zip_archive:
        fig_kw = {'figsize': (8, 6)}
        qualplot(positions, quantiles, zip_archive, fig_kw)
        median_qual = qualdist(cycle_qual.values(), zip_archive, fig_kw)
        qualmap(cycle_qual, zip_archive, fig_kw)
        depthplot(read_len, zip_archive, fig_kw)
        gcplot(positions, pos_gc, zip_archive, fig_kw)
        gcdist(cycle_gc, zip_archive, fig_kw)
        nucplot(positions, bases, cycle_nuc, zip_archive, fig_kw)
        kmerplot(positions, cycle_kmers, zip_archive,
                 [fields[0] for fields in bad_kmers], fig_kw)
        adaptermerplot(positions, cycle_kmers, adapter_kmers, zip_archive, fig_kw)
        if isinstance(infile, Reader):
            mismatchplot(positions, cycle_mismatch, zip_archive, fig_kw)

    time_finish = time.time()
    elapsed = time_finish - time_start
    if not args.quiet:
        sys.stderr.write("There were {counts:,} reads in the file. "
                         "Analysis finished in {sec}.\n".format(
                             counts=act_nlines,
                             sec=time.strftime('%H:%M:%S', time.gmtime(elapsed))))
        if len(bad_kmers) > 0:
            for kmer in bad_kmers:
                sys.stderr.write("KmerWarning: kmer %s has a non-uniform profile "
                                 "(slope = %s, p = %s).\n" % (kmer))
        if median_qual < args.median_qual:
            sys.stderr.write("QualityWarning: median base quality score is %s.\n"
                             % median_qual)
class Fetcher(metaclass=Singleton):
    def __init__(self, ioloop=None, start_url=None, max_depth=5):
        super().__init__()
        self.ioloop = ioloop or tornado.ioloop.IOLoop.instance()
        self.start_url = start_url or {}
        self.fetch_queue = Queue()
        self.fetched = []
        self.fetched_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        self.fetch_finished = []
        for u in self.start_url:
            self.fetch_queue.put(u)
        self.fetching = 0
        self.max_depth = max_depth

    def add_url(self, url):
        if not isValidScheme(url):
            logger.warning("not valid scheme")
            return
        logger.debug("get url: %s" % url)
        self.fetch_queue.put(url)

    @tornado.gen.coroutine
    def fetch(self, url):
        """Fetch one url."""
        http_client = AsyncHTTPClient()
        request = HTTPRequest(url=url.encode("utf-8"),
                              connect_timeout=options.timeout,
                              request_timeout=options.timeout)
        response = yield http_client.fetch(request)
        logger.debug("fetched url: %s" % url)
        return response

    def parse(self, response):
        """Parse the page, save the result, and pass on the new urls."""
        # self.save_tofile(response)
        url_gen = HtmlAnalyzer.extract_links(response.body, response.effective_url, [])
        return url_gen

    def save_tofile(self, response):
        """
        Use blocking f.write instead of a db for now;
        the io here is fast enough that it hardly matters.
        """
        path = response.effective_url.split("/")[-1]
        if not path:
            path = response.effective_url.split("/")[-2]
        try:
            with open(os.path.join("tmp", path), "a") as f:
                f.write(response.effective_url + "\n")
                f.write(str(response.body) + "\n")
        except Exception:
            logger.error("path %s" % path)

    @tornado.gen.coroutine
    def do_work(self, url):
        if not isValidScheme(url):
            logger.warning("not valid scheme")
            return None
        try:
            response = yield self.fetch(url)
        except tornado.httpclient.HTTPError as e:
            # TODO: some bug here, too many open files:
            # with open('httperror.txt', "a") as f:
            #     f.write("Url: %s HTTPError: %s \n" % (url, e.code))
            logger.error("Url: %s HTTPError: %s " % (url, e.code))
        except Exception:
            import traceback
            traceback.print_exc()
            logger.error("Unknown error with url: %s" % url)
        else:
            url_gen = self.parse(response)
            self.fetch_finished.append(url)
            sender = Sender()
            for u in url_gen:
                sender.add_url(u)
            logging.info("fetched %s" % url)
        self.fetching -= 1

    def run(self):
        """Take urls from fetch_queue and fetch them."""
        logging.error("fetching: %s " % self.fetching)
        while not self.fetch_queue.empty() and self.fetching <= options.max_fetch_clients:
            url = self.fetch_queue.get()
            if url in self.fetched_filter:
                logging.info("url in fetched_filter")
                continue
            else:
                self.fetched_filter.add(url)
                self.fetched.append(url)
            self.fetching += 1
            self.ioloop.add_callback(self.do_work, url)
        self.ioloop.add_timeout(datetime.timedelta(seconds=1), self.run)
# load context from file
conn = sqlite3.connect('record.db')
curs = conn.cursor()
curs.execute('''create table if not exists downloaded_image_url
                (id INTEGER PRIMARY KEY autoincrement, tiebar_url text,
                 image_url text, md5 text)''')
curs.execute('''create table if not exists parsed_url
                (id INTEGER PRIMARY KEY autoincrement, url text, title text,
                 parsed_time date)''')
curs.execute('''create table if not exists wait_parse_url
                (id INTEGER PRIMARY KEY autoincrement, url text)''')
print 'finish create table'

# load downloaded image urls
curs.execute('select image_url from downloaded_image_url')
while True:
    url = curs.fetchone()
    if url is not None:
        print 'downloaded image url:', url[0]
        downloaded_image_urls.add(url[0])
    else:
        break

# load parsed urls
curs.execute('select url from parsed_url')
while True:
    url = curs.fetchone()
    if url is not None:
        print 'parsed url:', url[0]
        parsed_urls.add(url[0])
    else:
        break

# load wait-parse queue urls
curs.execute('select url from wait_parse_url')
while True:
    url = curs.fetchone()
class BloomAutoYara:
    def __init__(self, filterfile):
        self.filterfile = filterfile
        # if filterfile is present, load the bloom filter from it, else create a new one
        if os.path.exists(filterfile):
            self.bf = ScalableBloomFilter.fromfile(open(filterfile, "rb"))
            print "available signatures = %d" % len(self.bf)
        else:
            self.bf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)

    def save_filter(self):
        print "saving filter to file %s " % self.filterfile
        self.bf.tofile(open(self.filterfile, "wb"))

    def add_string(self, s):
        self.bf.add(s)

    def search_string(self, s):
        return s in self.bf

    def extractlines(self, filename, min_len=4):
        chars = r"A-Za-z0-9/\-:.,_$%'()[\]<> "
        shortest_run = 4
        regexp = '[%s]{%d,}' % (chars, shortest_run)
        pattern = re.compile(regexp)
        fp = open(filename, "rb")
        data = fp.read()
        lines = pattern.findall(data)
        s = set(lines)
        fp.close()
        return list(s)

    def build_filter(self, dirname, extensions=[]):
        print extensions
        total = 0
        for (dir, _, files) in os.walk(dirname):
            for f in files:
                ext = f.split(".")[-1]
                if len(extensions) != 0 and ext not in extensions:
                    continue
                print "processing file %s" % f
                total += 1
                path = os.path.join(dir, f)
                lines = self.extractlines(path)
                for line in lines:
                    self.add_string(line)
        print ("creating bloom filter done. Total files = %d (Total entries = %d). "
               "Overwriting to bloom filter output file %s"
               % (total, len(self.bf), self.filterfile))
        self.save_filter()

    def find_file_topn(self, filename, topn=10):
        tmp = []
        lines = self.extractlines(filename)
        print "total unique strings in file %s = %d" % (filename, len(lines))
        for line in lines:
            if self.search_string(line) == False:
                tmp.append(line)
        tmp.sort(key=len)
        print "total strings which can be used for signature = %d" % len(tmp)
        tmp = tmp[-topn:]
        tmp.reverse()
        return tmp

    def find_dir_topn(self, dirname, topn=10):
        tmplist = []
        for (dir, _, files) in os.walk(dirname):
            for f in files:
                path = os.path.join(dir, f)
                lines = self.extractlines(path)
                for line in lines:
                    if self.search_string(line) == False:
                        tmplist.append(line)
        counts = Counter(list(tmplist))
        return counts.most_common(topn)

    def escapechars(self, s):
        for c in "\/.^$*+-?()[]{}|":
            s = s.replace(c, "\\" + c)
        return s

    def list_to_rule(self, strings, rulename, threshold=0.5):
        tmp = "rule " + rulename + "{\n"
        tmp += " strings:\n"
        for i in xrange(0, len(strings)):
            esc = self.escapechars(strings[i])
            tmp = (tmp + "$str%d = " % i
                   + r"/[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]" + esc
                   + r"[^A-Za-z0-9\/\-:.,_$%'()\[\]<> ]/")
            tmp += "\n"
        tmp += "condition:\n"
        tmp += str(int(len(strings) * threshold))
        tmp += " of ("
        for i in xrange(0, len(strings)):
            tmp += "$str" + str(i)
            if i != (len(strings) - 1):
                tmp += ","
        tmp += ")\n}"
        print "rule = %s.yara is written to disk " % rulename
        fp = open(rulename + ".yara", "w")
        fp.write(tmp)
        fp.close()
import redis
import logging, sys
import config
from pybloom import ScalableBloomFilter

sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
logger = Logger.getStdOutDebugLogger('master')
redis_conn = redis.StrictRedis(config.REDIS_HOST, config.REDIS_PORT)

while True:
    url = redis_conn.blpop(config.RawQueue)[1]
    # only forward unseen urls whose host part contains '2epub'
    if (url not in sbf) and (url[0:url.find('/', 7)].find('2epub') != -1):
        sbf.add(url)
        redis_conn.rpush(config.PendingCrawlingQueue, url)
class StoreFeedbackSpider(RedisSpider):
    name = "store"
    allowed_domains = ["aliexpress.com"]
    start_urls = (
        'http://www.aliexpress.com/',
    )
    prefix = ''

    def __init__(self):
        self.feedbacks = dict()
        self.redis_queue = None
        self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def get_queue(self):
        for value in set(self.server.smembers(self.redis_key)):
            yield value

    def start_requests(self):
        StoreFeedbackSpider.prefix = self.settings['prefix']
        self.redis_key = '{}:storefeedback'.format(StoreFeedbackSpider.prefix)
        self.redis_queue = self.get_queue()
        db = MongoClient().aliexpress
        for store_feedback in db['{}storefeedback'.format(StoreFeedbackSpider.prefix)].find():
            self.ids.add(store_feedback['id'])
        yield self.next_request()

    def next_request(self):
        while True:
            try:
                url = next(self.redis_queue)
            except StopIteration:
                url = None
            # stop when the queue is exhausted or the store id is unseen
            if not (url and self.ids.add(
                    urlparse.parse_qs(urlparse.urlparse(url).query)['storeId'][0])):
                break
        if url:
            return self.make_requests_from_url(url)
        else:
            raise CloseSpider('redis queue has no url to request')

    def make_requests_from_url(self, url):
        self.log('request store feedback url: {}'.format(url), logging.INFO)
        parsed = urlparse.urlparse(url)
        store_id = urlparse.parse_qs(parsed.query)['storeId'][0]
        return self.request(store_id, url)

    def request(self, store_id, base_url, page=1):
        feedback_url = '{}&page={}'.format(base_url, page)
        self.log('request store feedback page: {}'.format(feedback_url), logging.INFO)
        return scrapy.Request(url=feedback_url,
                              meta={'store_id': store_id,
                                    'base_url': base_url,
                                    'page': page},
                              callback=self.parse)

    def parse(self, response):
        self.log('parse store feedback page: {}'.format(response.url), logging.INFO)
        trs = response.xpath('//tbody/tr')
        if len(trs) > 0:
            for tr in trs:
                product = tr.css('.product-name').xpath('a/@href').extract()[0].replace('//', '/')
                time = datetime.strptime(tr.css('.feedback-date').xpath('text()').extract()[0],
                                         '%d %b %Y %H:%M')
                star_width = tr.css('.star').xpath('span/@style').extract()[0]
                star = int(star_width[star_width.index(':') + 1:-2]) / 20
                self.store(response.meta['store_id']).append_feedback(time=time,
                                                                      product=product,
                                                                      star=star)
            return self.request(response.meta['store_id'],
                                response.meta['base_url'],
                                int(response.meta['page']) + 1)
        else:
            self.store(response.meta['store_id']).finish_feedback = True
            return self.pop_feedback(response.meta['store_id'])

    def store(self, id):
        if id not in self.feedbacks:
            self.feedbacks[id] = StoreFeedback(id)
        return self.feedbacks[id]

    def pop_feedback(self, id):
        if self.store(id).is_finish():
            feedback = self.feedbacks.pop(id)
            self.log('crawl store feedback: {}'.format(feedback), logging.INFO)
            item = StoreFeedbackItem()
            item['prefix'] = StoreFeedbackSpider.prefix
            item['_id'] = feedback.id
            item['feedbacks'] = feedback.feedbacks
            return item
for key, value in res_dict.items(): print("NULL\t{0}\t{1}".format(key, value), file=f) i += 1 filename = (field_content + "pickles/parse_sentences/parse_sentences_" + str(i) + ".pickle") sbf = ScalableBloomFilter(initial_capacity=150000000, mode=ScalableBloomFilter.LARGE_SET_GROWTH) out = open(field_content + "lm/data.new", "w", encoding="utf8") with open(field_content + "lm/data.txt", "r", encoding="utf8") as f: for line in f: w1, w2, w3 = line.strip().split('\t') if w2 not in sbf: sbf.add(w2) print("{0}\t{1}\t{2}".format(w1, w2, w3), file=out) out.close() # print("insert db") # path = '"/home/zhi/Project/sentiment_relation_extraction_new_data/data/domains/{0}/lm/data.new"'.format(content) # sql = "load data local infile "+path+ " into table lm_db fields escaped by ''" # print(sql) # execute(connection, sql) # print("insert end") # print("create index") # sql = "alter lm_db add index on (content)" # print(sql) # execute(connection, sql) # connection.close()
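# The pass above keeps one line per middle word w2; at the expected 150M keys
# an exact set would be costly, while the scalable filter holds the default
# 0.1% false-positive rate (a false positive here just drops one first
# occurrence) in bounded memory. The same first-seen pattern as a reusable
# generator, a sketch under the same pybloom assumptions:
from pybloom import ScalableBloomFilter

def first_seen(items, capacity=1000, error_rate=0.001):
    seen = ScalableBloomFilter(initial_capacity=capacity, error_rate=error_rate)
    for item in items:
        if item not in seen:
            seen.add(item)
            yield item

assert list(first_seen(['a', 'b', 'a', 'c', 'b'])) == ['a', 'b', 'c']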
class SupplierCatalogItemTask(BaseSupplierCatalogTask): field_names = { 'advanced':'advanced', #'availability_indefinite':'availability_indefinite', 'available':'available', 'category_identifier':'category_identifier', 'cost':'quantity_cost', #'effective':'effective', 'manufacturer_identifier':'manufacturer_identifier', 'name':'name', 'phased_out':'phased_out', 'product_identifier':'product_identifier', 'retail':'quantity_retail', 'scale_identifier':'scale_identifier', 'special_cost':'quantity_special_cost', 'stock':'in_stock', #'to_be_announced':'to_be_announced' } defaults = { 'advanced': False, #'availability_indefinite': False, 'available': None, 'category_identifier': None, 'cost': Decimal(0), 'name': None, 'phased_out': False, 'retail': Decimal(0), 'scale_identifier': None, 'special_cost': Decimal(0), 'stock': False, #'to_be_announced': False, } latest_supplier_catalog_id_cache = dict() category_conversion_filter = None manufacturer_conversion_filter = None price_control_filter = None scale_conversion_filter = None def __init__(self): BaseSupplierCatalogTask.__init__(self) self.plugins = self.load_plugins() def load(self): """Load""" logger.debug("Begin load()") self.load_all() logger.debug("End load()") def load_all(self, supplier_id=None): logger.debug("Begin load_all()") self.ts = ttystatus.TerminalStatus(period=1) self.ts.add(ttystatus.Literal('SupplierCatalogItem Load Elapsed: ')) self.ts.add(ttystatus.ElapsedTime()) self.ts.add(ttystatus.Literal(' Supplier: ')) self.ts.add(ttystatus.PercentDone('supplier_done', 'supplier_total', decimals=2)) self.ts.add(ttystatus.Literal(' Manufacturer: ')) self.ts.add(ttystatus.PercentDone('manufacturer_done', 'manufacturer_total', decimals=2)) self.ts.add(ttystatus.Literal(' Product: ')) self.ts.add(ttystatus.PercentDone('product_done', 'product_total', decimals=2)) self.ts.add(ttystatus.Literal(' ')) self.ts.add(ttystatus.String('manufacturer')) self.ts.add(ttystatus.Literal('-')) self.ts.add(ttystatus.String('product')) self.ts['supplier_total'] = 1 self.ts['supplier_done'] = 0 self.ts['manufacturer_total'] = 1 self.ts['manufacturer_done'] = 0 self.ts['product_total'] = 1 self.ts['product_done'] = 0 self.ts['supplier_total'] = len(self.plugins) self.ts['supplier_done'] = 0 filter_supplier_id = supplier_id try: #self.session.begin(subtransactions=True) for plug in self.plugins.itervalues(): supplier_id = plug.supplier_id() if ( filter_supplier_id is not None and supplier_id != filter_supplier_id ): continue #latest_supplier_catalog = self.load_latest_supplier_catalog(supplier_id) #if supplier_catalog is not None: #self.supplier_catalog_id = supplier_catalog.id self.load_supplier(plug, supplier_id) #else: #logger.error("No Latest SupplierCatalog Found for Supplier.id %s", supplier_id) self.session.flush() self.session.expunge_all() self.ts['supplier_done'] += 1 #self.session.commit() except Exception: logger.exception("Caught Exception: ") if self.session.transaction is not None: self.session.rollback() finally: self.ts.finish() logger.debug("End load_all()") def load_supplier(self, plug, supplier_id): self.session.begin(subtransactions=True) logger.debug("load_supplier %s", supplier_id) query = self.session.query(SupplierCatalogItemFieldModel.manufacturer_identifier) query = query.filter(SupplierCatalogItemFieldModel.supplier_id == supplier_id) query = query.filter(SupplierCatalogItemFieldModel.manufacturer_identifier != None) query = query.group_by(SupplierCatalogItemFieldModel.manufacturer_identifier) self.ts['manufacturer_total'] = 
query.count() self.ts['manufacturer_done'] = 0 for (manufacturer_identifier, ) in query.yield_per(1000): self.ts['manufacturer'] = manufacturer_identifier self.load_manufacturer(plug, supplier_id, manufacturer_identifier) self.session.flush() self.session.expunge_all() self.ts['manufacturer_done'] += 1 self.session.commit() def load_manufacturer(self, plug, supplier_id, manufacturer_identifier): logger.debug("Manufacturer %s", manufacturer_identifier) query = self.session.query(SupplierCatalogItemFieldModel.product_identifier) query = query.filter(SupplierCatalogItemFieldModel.supplier_id == supplier_id) query = query.filter(SupplierCatalogItemFieldModel.manufacturer_identifier == manufacturer_identifier) query = query.filter(SupplierCatalogItemFieldModel.product_identifier != None) query = query.group_by(SupplierCatalogItemFieldModel.product_identifier) self.ts['product_total'] = query.count() self.ts['product_done'] = 0 for (product_identifier, ) in query.yield_per(1000): self.ts['product'] = product_identifier self.load_one(supplier_id, manufacturer_identifier, product_identifier) self.ts['product_done'] += 1 def load_one(self, supplier_id, manufacturer_identifier, product_identifier): #for (key, value) in self.defaults.iteritems(): #if key not in data or data[key] is None: #data[key] = value query = self.session.query(SupplierCatalogItemModel) query = query.filter(SupplierCatalogItemModel.supplier_id == supplier_id) query = query.filter(SupplierCatalogItemModel.manufacturer_identifier == manufacturer_identifier) query = query.filter(SupplierCatalogItemModel.product_identifier == product_identifier) try: supplier_catalog_item = query.one() except NoResultFound: supplier_catalog_item = SupplierCatalogItemModel() supplier_catalog_item.supplier_id = supplier_id supplier_catalog_item.manufacturer_identifier = manufacturer_identifier supplier_catalog_item.product_identifier = product_identifier self.session.add(supplier_catalog_item) #for (field_name, item_name) in self.field_names.iteritems(): #setattr(supplier_catalog_item, item_name, data[field_name]) #supplier_catalog_item.effective = data['effective'] self.session.flush() def update(self): """Update""" logger.debug("Begin update()") self.update_all(limit=10000, time_limit=timedelta(hours=1)) logger.debug("End update()") def update_all(self, modified_since=None, limit=None, time_limit=None): """Update All""" logger.debug("Begin update_all()") result = None ts = self.term_stat('SupplierCatalogItem Update') start_time = datetime.now() try: #s = ScalableBloomFilter() #query = self.session.query( # SupplierCatalogItemFieldModel.supplier_id, # SupplierCatalogItemFieldModel.manufacturer_identifier, # SupplierCatalogItemFieldModel.product_identifier, #) #for row in query.yield_per(1000): # s.add(row) query = self.session.query(SupplierCatalogItemModel) if modified_since: query = query.filter(SupplierCatalogItemModel.modified >= modified_since) if limit: query = query.order_by(SupplierCatalogItemModel.updated.nullsfirst()) query = query.limit(limit) ts['total'] = query.count() self.session.begin(subtransactions=True) for supplier_catalog_item in query.yield_per(10000): row = ( supplier_catalog_item.supplier_id, supplier_catalog_item.manufacturer_identifier, supplier_catalog_item.product_identifier, ) #if row not in s: # logger.info( # "Not found in SupplierCatalogItemFields %s %s-%s", # supplier_catalog_item.supplier_id, # supplier_catalog_item.manufacturer_identifier, # supplier_catalog_item.product_identifier # ) ## TODO Maybe only not do load 
from SCIV? # continue self.update_one(supplier_catalog_item) ts['done'] += 1 if time_limit is not None: if datetime.now() > start_time + time_limit: logger.info("Reached Time Limit at %i of %i", ts['done'], ts['total']) break self.session.commit() result = True except Exception: logger.exception("Caught Exception: ") if self.session.transaction is not None: self.session.rollback() finally: ts.finish() logger.debug("End update_all()") return result def update_one(self, supplier_catalog_item): """ Update One Using ManufacturerConversion, convert manufacturer_identifier to manufacturer_id Using ProductConversion, convert product_identifier to product_id and quantity quantity_cost from quantity, cost quantity_retail from quantity, retail Using CategoryConversion, convert category_identifier to category_id Using ScaleConversion, convert scale_identifier to scale_id Using PriceControl, get price_control_id using sale, quantity generate quantity_sale """ self.session.begin(subtransactions=True) self.update_supplier_catalog_item_version(supplier_catalog_item) self.update_manufacturer(supplier_catalog_item) self.update_product(supplier_catalog_item) self.update_category(supplier_catalog_item) self.update_scale(supplier_catalog_item) self.update_price_control(supplier_catalog_item) supplier_catalog_item.updated = datetime.now() self.session.commit() def load_latest_supplier_catalog_id(self, supplier_id): if supplier_id in self.latest_supplier_catalog_id_cache: return self.latest_supplier_catalog_id_cache[supplier_id] query = self.session.query(SupplierCatalogModel) query = query.filter(SupplierCatalogModel.supplier_id == supplier_id) supplier_catalog = query.order_by(desc(SupplierCatalogModel.issue_date)).first() logger.debug("Latest Supplier %s, %s", supplier_id, supplier_catalog) if supplier_catalog is None: self.latest_supplier_catalog_id_cache[supplier_id] = None return None self.latest_supplier_catalog_id_cache[supplier_id] = supplier_catalog.id return supplier_catalog.id def update_supplier_catalog_item_version(self, supplier_catalog_item): if supplier_catalog_item.supplier_id not in self.plugins: ## Not an ETL tracked Supplier. return plug = self.plugins[supplier_catalog_item.supplier_id] model_name = plug.version_model() + 'Model' VersionModel = getattr(model, model_name) ## TODO: Don't overwrite manual entries self.latest_supplier_catalog_id = self.load_latest_supplier_catalog_id(supplier_catalog_item.supplier_id) if self.latest_supplier_catalog_id is None: logger.error("No Latest SupplierCatalog Found for Supplier.id %s", supplier_catalog_item.supplier_id) ## TODO: What should we be doing here? setting some sort of defaults? 
supplier_catalog_item.legacy_flag = 20 return query = self.session.query(SupplierCatalogItemFieldModel.id) query = query.filter(SupplierCatalogItemFieldModel.supplier_id == supplier_catalog_item.supplier_id) query = query.filter(SupplierCatalogItemFieldModel.manufacturer_identifier == supplier_catalog_item.manufacturer_identifier) query = query.filter(SupplierCatalogItemFieldModel.product_identifier == supplier_catalog_item.product_identifier) data = None if query.count() > 0: s = set() for (supplier_catalog_item_field_id, ) in query.yield_per(1000): s.add(supplier_catalog_item_field_id) del query if plug.opaque() is True: if plug.ghost() is True: data = self.coalesce_opaque_ghost(VersionModel, s, plug) else: data = self.coalesce_opaque_noghost(VersionModel, s) else: if plug.ghost() is True: data = self.coalesce_translucent_ghost(VersionModel, s, plug) else: data = self.coalesce_translucent_noghost(VersionModel, s) #print "DATA IN", data if data is None: logger.warning( "Got None from coalesce %s %s-%s", supplier_catalog_item.supplier_id, supplier_catalog_item.manufacturer_identifier, supplier_catalog_item.product_identifier, ) ## TODO What should we do here? supplier_catalog_item.legacy_flag = 30 return for (key, value) in self.defaults.iteritems(): if key not in data or data[key] is None: data[key] = value #print "DATA OUT", data f = { 'advanced':'advanced', #'availability_indefinite':'availability_indefinite', 'available':'available', 'category_identifier':'category_identifier', 'cost':'quantity_cost', #'effective':'effective', ##'manufacturer_identifier':'manufacturer_identifier', 'name':'name', 'phased_out':'phased_out', ##'product_identifier':'product_identifier', 'retail':'quantity_retail', 'scale_identifier':'scale_identifier', 'special_cost':'quantity_special_cost', 'stock':'in_stock', #'to_be_announced':'to_be_announced' } for (field_name, item_name) in f.iteritems(): setattr(supplier_catalog_item, item_name, data[field_name]) supplier_catalog_item.legacy_flag = 40 def coalesce_opaque_noghost(self, VersionModel, s, get_effective=False): query = self.session.query(VersionModel) query = query.filter(VersionModel.supplier_catalog_item_field_id.in_(s)) query = query.order_by(desc(VersionModel.effective)) try: supplier_catalog_item_version = query.first() except NoResultFound: logger.debug('No %s Found', VersionModel.__name__) return None if supplier_catalog_item_version is None: logger.debug('No %s Found', VersionModel.__name__) return None data = dict() for field_name in self.field_names.iterkeys(): data[field_name] = getattr(supplier_catalog_item_version.supplier_catalog_item_field, field_name) data['supplier_catalog_id'] = supplier_catalog_item_version.supplier_catalog_id supplier_catalog_item_field_id = supplier_catalog_item_version.supplier_catalog_item_field_id effective = supplier_catalog_item_version.effective if get_effective: for supplier_catalog_item_version in query.yield_per(5): if supplier_catalog_item_version.supplier_catalog_item_field_id == supplier_catalog_item_field_id: effective = supplier_catalog_item_version.effective else: break data['effective'] = effective return data def coalesce_opaque_ghost(self, VersionModel, s, plug, get_effective=False): data = self.coalesce_opaque_noghost(VersionModel, s, get_effective) if data is None: return None if data['supplier_catalog_id'] != self.latest_supplier_catalog_id: if plug.ghost_stock(): data['stock'] = False if plug.ghost_phased_out(): data['phased_out'] = False if plug.ghost_advanced(): data['advanced'] = False return data 
def coalesce_translucent_noghost(self, VersionModel, s): query = self.session.query(VersionModel) query = query.filter(VersionModel.supplier_catalog_item_field_id.in_(s)) query = query.order_by(desc(VersionModel.effective)) count = query.count() if count == 0: logger.error('No %s Found. Run SupplierCatalogItemVersionTask.vacuum() !', VersionModel.__name__) return None data = dict() first = True done = 0 for supplier_catalog_item_version in query.all(): done += 1 if first: data['supplier_catalog_id'] = supplier_catalog_item_version.supplier_catalog_id data['effective'] = supplier_catalog_item_version.effective complete = True for field_name in self.field_names.iterkeys(): field = getattr(supplier_catalog_item_version.supplier_catalog_item_field, field_name) if not field_name in data or data[field_name] is None: if field is None: complete = False else: data[field_name] = field if complete: break #logger.info("Complete SupplierCatalogItem was found in %i of %i Versions", done, count) return data def coalesce_translucent_ghost(self, VersionModel, s, plug): data = self.coalesce_translucent_noghost(VersionModel, s) if data is None: return None if data['supplier_catalog_id'] != self.latest_supplier_catalog_id: if plug.ghost_stock(): data['stock'] = False if plug.ghost_phased_out(): data['phased_out'] = False if plug.ghost_advanced(): data['advanced'] = False return data def update_manufacturer(self, supplier_catalog_item): #self.session.begin(subtransactions=True) """Update Manufacturer""" #print ( # "Update Manufacturer", # "sid", supplier_catalog_item.supplier_id, # "mident", supplier_catalog_item.manufacturer_identifier, # "mid", supplier_catalog_item.manufacturer_id #) manufacturer_conversion = self.get_manufacturer_conversion( supplier_catalog_item.supplier_id, supplier_catalog_item.manufacturer_identifier ) if manufacturer_conversion is not None: supplier_catalog_item.manufacturer_id = manufacturer_conversion.manufacturer_id else: supplier_catalog_item.manufacturer_id = None #self.session.commit() def update_product(self, supplier_catalog_item): #self.session.begin(subtransactions=True) """Product Conversion""" if ( supplier_catalog_item.supplier_id is not None and supplier_catalog_item.manufacturer_id is not None and supplier_catalog_item.product_identifier is not None ): product_conversion = self.get_product_conversion( supplier_catalog_item.supplier_id, supplier_catalog_item.manufacturer_id, supplier_catalog_item.product_identifier ) if product_conversion is not None: supplier_catalog_item.product_id = product_conversion.product_id supplier_catalog_item.quantity = product_conversion.get_quantity() else: supplier_catalog_item.product_id = None supplier_catalog_item.quantity = Decimal(1) else: supplier_catalog_item.product_id = None supplier_catalog_item.quantity = Decimal(1) if supplier_catalog_item.quantity_cost > 0: supplier_catalog_item.cost = decimal_round(supplier_catalog_item.quantity_cost / supplier_catalog_item.quantity, cfg.cost_decimals) else: supplier_catalog_item.cost = Decimal(0) if supplier_catalog_item.quantity_special_cost > 0: supplier_catalog_item.special_cost = decimal_round(supplier_catalog_item.quantity_special_cost / supplier_catalog_item.quantity, cfg.cost_decimals) else: supplier_catalog_item.special_cost = Decimal(0) if supplier_catalog_item.quantity_retail > 0: supplier_catalog_item.retail = decimal_round(supplier_catalog_item.quantity_retail / supplier_catalog_item.quantity, cfg.cost_decimals) else: supplier_catalog_item.retail = Decimal(0) 
#self.session.commit() def update_category(self, supplier_catalog_item): """Category Conversion""" #self.session.begin(subtransactions=True) if ( supplier_catalog_item.supplier_id is not None and supplier_catalog_item.manufacturer_id is not None and supplier_catalog_item.category_identifier is not None ): category_conversion = self.get_category_conversion( supplier_catalog_item.supplier_id, supplier_catalog_item.manufacturer_id, supplier_catalog_item.category_identifier ) if category_conversion is not None: supplier_catalog_item.category_id = category_conversion.category_id else: supplier_catalog_item.category_id = None else: supplier_catalog_item.category_id = None #self.session.commit() def update_scale(self, supplier_catalog_item): """Scale Conversion""" #self.session.begin(subtransactions=True) if ( supplier_catalog_item.supplier_id is not None and supplier_catalog_item.scale_identifier is not None ): scale_conversion = self.get_scale_conversion( supplier_catalog_item.supplier_id, supplier_catalog_item.scale_identifier ) if scale_conversion is not None: supplier_catalog_item.scale_id = scale_conversion.scale_id else: supplier_catalog_item.scale_id = None else: supplier_catalog_item.scale_id = None #self.session.commit() def update_price_control(self, supplier_catalog_item): """Price Control""" #self.session.begin(subtransactions=True) #*** TODO handle price_control.allow_advanced if ( supplier_catalog_item.supplier_id is not None and supplier_catalog_item.manufacturer_id is not None and supplier_catalog_item.retail > 0 ): price_control = self.get_price_control( supplier_catalog_item.supplier_id, supplier_catalog_item.manufacturer_id, supplier_catalog_item.retail, supplier_catalog_item.advanced, supplier_catalog_item.special ) if price_control is not None: supplier_catalog_item.price_control_id = price_control.id supplier_catalog_item.rank = price_control.rank if supplier_catalog_item.special: if supplier_catalog_item.cost > 0: ratio = supplier_catalog_item.special_cost / supplier_catalog_item.cost else: ratio = 1 special_retail = supplier_catalog_item.retail * ratio supplier_catalog_item.sale = price_control.sale( supplier_catalog_item.special_cost, special_retail ) else: supplier_catalog_item.sale = price_control.sale( supplier_catalog_item.cost, supplier_catalog_item.retail ) else: supplier_catalog_item.sale = 0 supplier_catalog_item.price_control_id = None supplier_catalog_item.rank = 0 else: supplier_catalog_item.sale = 0 supplier_catalog_item.price_control_id = None supplier_catalog_item.rank = 0 #self.session.commit() def get_category_conversion(self, supplier_id, manufacturer_id, category_identifier): """Category Conversion""" if self.category_conversion_filter is None: self.category_conversion_filter = ScalableBloomFilter() query = self.session.query( CategoryConversionModel.supplier_id, CategoryConversionModel.manufacturer_id, CategoryConversionModel.needle ) for row in query.yield_per(100): self.category_conversion_filter.add(row) row = (supplier_id, manufacturer_id, category_identifier) if row in self.category_conversion_filter: query = self.session.query(CategoryConversionModel) query = query.filter(CategoryConversionModel.supplier_id == supplier_id) query = query.filter(CategoryConversionModel.manufacturer_id == manufacturer_id) query = query.filter(CategoryConversionModel.needle == category_identifier) try: category_conversion = query.one() return category_conversion except NoResultFound: pass category_conversion = CategoryConversionModel() 
category_conversion.manufacturer_id = manufacturer_id category_conversion.supplier_id = supplier_id category_conversion.needle = category_identifier self.session.add(category_conversion) self.category_conversion_filter.add(row) return category_conversion def get_manufacturer_conversion(self, supplier_id, manufacturer_identifier): """Manufacturer Conversion""" if self.manufacturer_conversion_filter is None: self.manufacturer_conversion_filter = ScalableBloomFilter() query = self.session.query( ManufacturerConversionModel.supplier_id, ManufacturerConversionModel.manufacturer_identifier ) for row in query.yield_per(100): self.manufacturer_conversion_filter.add(row) row = (supplier_id, manufacturer_identifier) if row in self.manufacturer_conversion_filter: query = self.session.query(ManufacturerConversionModel) query = query.filter(ManufacturerConversionModel.supplier_id == supplier_id) query = query.filter(ManufacturerConversionModel.manufacturer_identifier == manufacturer_identifier) try: manufacturer_conversion = query.one() return manufacturer_conversion except NoResultFound: pass query = self.session.query(ManufacturerModel) query = query.filter(ManufacturerModel.identifier == manufacturer_identifier) try: manufacturer = query.one() except NoResultFound: logger.warning("No ManufacturerConversion found for supplier_id '%s' manufacturer_identifier '%s'", supplier_id, manufacturer_identifier) return None manufacturer_conversion = ManufacturerConversionModel() manufacturer_conversion.manufacturer_id = manufacturer.id manufacturer_conversion.supplier_id = supplier_id manufacturer_conversion.manufacturer_identifier = manufacturer_identifier #self.session.add(manufacturer_conversion) return manufacturer_conversion def get_price_control(self, supplier_id, manufacturer_id, retail, preorder, special): """Price Control""" if self.price_control_filter is None: self.price_control_filter = ScalableBloomFilter() query = self.session.query( PriceControlModel.supplier_id, PriceControlModel.manufacturer_id ) for row in query.yield_per(100): self.price_control_filter.add(row) row = (supplier_id, manufacturer_id) if row in self.price_control_filter: query = self.session.query(PriceControlModel) query = query.filter(PriceControlModel.supplier_id == supplier_id) query = query.filter(PriceControlModel.manufacturer_id == manufacturer_id) if preorder: query = query.filter(PriceControlModel.preorder == True) if special: query = query.filter(PriceControlModel.special == True) if (not preorder) and (not special): query = query.filter(PriceControlModel.normal == True) query = query.filter(PriceControlModel.retail_low <= retail) query = query.filter(PriceControlModel.retail_high >= retail) query = query.filter(PriceControlModel.enable == True) try: price_control = query.one() return price_control except NoResultFound: #logger.warning( # "No PriceControl found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", # supplier_id, # manufacturer_id, # retail, # preorder, # special #) return None except MultipleResultsFound: logger.warning( "Duplicate PriceControls found for supplier_id '%s' manufacturer_id '%s' retail '%s', preorder '%s', special '%s'", supplier_id, manufacturer_id, retail, preorder, special ) return None def get_product_conversion(self, supplier_id, manufacturer_id, product_identifier): """Product Conversion""" query = self.session.query(ProductConversionModel) query = query.filter(ProductConversionModel.supplier_id == supplier_id) query = 
query.filter(ProductConversionModel.manufacturer_id == manufacturer_id) query = query.filter(ProductConversionModel.product_identifier == product_identifier) try: product_conversion = query.one() return product_conversion except NoResultFound: pass query = self.session.query(ProductModel) query = query.filter(ProductModel.manufacturer_id == manufacturer_id) query = query.filter(ProductModel.identifier == product_identifier) try: product = query.one() except NoResultFound: #logger.warning( # "No ProductConversion found for supplier_id '%s' manufacturer_id '%s' product_identifier '%s'", # supplier_id, # manufacturer_id, # product_identifier, #) return None product_conversion = ProductConversionModel() product_conversion.product_id = product.id product_conversion.manufacturer_id = manufacturer_id product_conversion.supplier_id = supplier_id product_conversion.source_quantity = 1 product_conversion.target_quantity = 1 return product_conversion def get_scale_conversion(self, supplier_id, scale_identifier): """Scale Conversion""" if scale_identifier is None: return None if supplier_id is None: return None if self.scale_conversion_filter is None: self.scale_conversion_filter = ScalableBloomFilter() query = self.session.query( ScaleConversionModel.supplier_id, ScaleConversionModel.scale_identifier ) for row in query.yield_per(100): self.scale_conversion_filter.add(row) row = (supplier_id, scale_identifier) if row in self.scale_conversion_filter: query = self.session.query(ScaleConversionModel) query = query.filter(ScaleConversionModel.supplier_id == supplier_id) query = query.filter(ScaleConversionModel.scale_identifier == scale_identifier) try: scale_conversion = query.one() return scale_conversion except NoResultFound: pass query = self.session.query(ScaleModel) query = query.filter(ScaleModel.name == scale_identifier) try: scale = query.one() except NoResultFound: scale = None if scale is not None: scale_conversion = ScaleConversionModel() scale_conversion.scale_id = scale.id return scale_conversion else: scale_conversion = ScaleConversionModel() scale_conversion.scale_id = None scale_conversion.supplier_id = supplier_id scale_conversion.scale_identifier = scale_identifier self.session.add(scale_conversion) self.scale_conversion_filter.add(row) self.session.flush() return scale_conversion
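# Pattern used by get_category_conversion(), get_manufacturer_conversion(),
# get_price_control() and get_scale_conversion() above: the key columns are
# preloaded into a ScalableBloomFilter that acts as a negative cache in front
# of the database. A miss in the filter is definitive and skips the query
# entirely; a false positive merely costs one query that finds no row. A
# hedged, generic sketch of that shape (load_keys and fetch_row stand in for
# the SQLAlchemy queries and are assumptions of this example):
from pybloom import ScalableBloomFilter

class NegativeCache(object):
    def __init__(self, load_keys):
        self.filter = ScalableBloomFilter()
        for key in load_keys():          # seed once from the existing rows
            self.filter.add(key)

    def lookup(self, key, fetch_row):
        if key not in self.filter:
            return None                  # definite miss: no round-trip issued
        return fetch_row(key)            # probable hit; rarely a false positive

    def insert(self, key):
        self.filter.add(key)             # keep the cache in step with new rows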
def addNewUrl(): conn = database.getConn() cursor = conn.cursor() # check if empty cursor.execute('SELECT outlinks FROM webpage WHERE status = 2') num_outlinks = cursor.rowcount rows_outlinks = cursor.fetchall() cursor.execute("SELECT error FROM webpage WHERE status = 11") num_redirect = cursor.rowcount rows_redirect = cursor.fetchall() num_all = num_redirect + num_outlinks if num_all == 0: cursor.close() conn.close() return {'exist':0 , 'insert':0 , 'all':0} #bloom start ..input the urls into bloom from pybloom import ScalableBloomFilter sql = "SELECT url FROM webpage WHERE 1" cursor.execute(sql) num_exist = cursor.rowcount rows = cursor.fetchall() sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) for row in rows: sbf.add(row[0]) #bloom end sbf insert_arr = [] num_insert = 0 for row in rows_outlinks: outlinks_arr = row[0].split(',') proper_links = filterOutLinks(outlinks_arr) for link in proper_links: if link not in sbf: num_insert += 1 sbf.add(link) insert_arr.append((link,0)) # for the redirect urls for row in rows_redirect: link = filterLink(row[0]) if link == '': continue if link not in sbf: num_insert += 1 sbf.add(link) insert_arr.append((link,0)) sql = "INSERT INTO webpage (url,status) VALUES (%s,%s)" cursor.executemany(sql,insert_arr) cursor.execute("UPDATE webpage SET status = 3 WHERE status = 2 OR status = 11") conn.commit() cursor.close() conn.close() return {'exist':num_exist , 'insert':num_insert , 'all':num_all}
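# addNewUrl() seeds the filter with every URL already stored and then tests
# each candidate outlink in memory, replacing a per-link SELECT round-trip;
# the filter also dedups links within the batch itself. The same
# seed-then-test pattern in isolation (a sketch; the input lists stand in for
# the query results):
from pybloom import ScalableBloomFilter

def new_links(existing_urls, candidate_urls):
    seen = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
    for url in existing_urls:
        seen.add(url)
    fresh = []
    for url in candidate_urls:
        if url not in seen:
            seen.add(url)                # dedup within the candidate batch too
            fresh.append(url)
    return fresh

assert new_links(['a'], ['a', 'b', 'b', 'c']) == ['b', 'c']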
class StreamingTriangles(threading.Thread): daemon = True # Constructor sets up Redis connection and algorithm vars def __init__(self): super(StreamingTriangles, self).__init__() # Set up connection to Redis server self.redis_server = 'localhost' self.redis_db = redis.StrictRedis(host=self.redis_server, port=6379, db=0) # Initialize reservoir sizes self.edge_res_size = 40000 self.wedge_res_size = 40000 # Set Scalable Bloom Filter for ignoring repeated edges self.bloom_filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) # Init counters and arrays for Streaming-Triangles algorithm self.edge_count = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0} self.total_wedges = {RED: 0, BLUE: 0, YELLOW: 0, GREEN: 0} self.edge_res = { RED: [[0, 0] for _ in xrange(self.edge_res_size)], BLUE: [[0, 0] for _ in xrange(self.edge_res_size)], YELLOW: [[0, 0] for _ in xrange(self.edge_res_size)], GREEN: [[0, 0] for _ in xrange(self.edge_res_size)] } self.wedge_res = { RED: [[0, 0, 0] for _ in xrange(self.wedge_res_size)], BLUE: [[0, 0, 0] for _ in xrange(self.wedge_res_size)], YELLOW: [[0, 0, 0] for _ in xrange(self.wedge_res_size)], GREEN: [[0, 0, 0] for _ in xrange(self.wedge_res_size)] } self.is_closed = { RED: [False for _ in xrange(self.wedge_res_size)], BLUE: [False for _ in xrange(self.wedge_res_size)], YELLOW: [False for _ in xrange(self.wedge_res_size)], GREEN: [False for _ in xrange(self.wedge_res_size)] } # Track percent of uncategorized transactions self.num_missed = 0 self.num_colored = 0 # Thread sets up consumer and consumes kafka messages def run(self): consumer = KafkaConsumer(bootstrap_servers='52.35.109.64:9092') consumer.subscribe(['venmo-transactions']) for message in consumer: msg = str(message.value) new_edge = self.__extract_edge__(msg) colors = self.__analyze_message__(msg) self.redis_db.set('percent_caught', self.num_colored / float(self.num_colored + self.num_missed)) for color in colors: colored_edge = tuple((color, new_edge)) if colored_edge not in self.bloom_filter and -1 not in new_edge: self.__streaming_triangles__(self.redis_db, new_edge, color) self.bloom_filter.add(colored_edge) # Assign colors to message based on emoji/text content def __analyze_message__(self, json_obj): json_data = json.loads(json_obj) message = json_data['message'] # message data moji = PyMoji() message = moji.encode(message) if isinstance(message, str): message = unicode(message, "utf-8") message = message.encode('utf-8').lower() print(message) # Define categorization rules foods = [ "pizza", "hamburger", "food", "burrito", "chinese", "indian", "fries", "ramen", "taco", "dinner", "lunch", "spaghetti", "poultry_leg", "breakfast", "sushi" ] drinks = [ "wine", "cocktail", "drink", " bar", "beer", "[:tada]", "club", "vegas" ] transportation = [ "taxi", "[:car]", "[:oncoming_automobile]", "uber", "lyft", "ride", "drive", "driving" ] bills = [ "bulb", "[:moneybag]", "water", "[:house_with_garden]", "[:house]", " bill", "rent", "internet", "utilities", "pg&e", "dues", "cable" ] colors = set() # Check for food-related content if any(food in message for food in foods): colors.add(RED) # Check for drink-related content if any(drink in message for drink in drinks): colors.add(BLUE) # Check for transportation-related content if any(transport in message for transport in transportation): colors.add(YELLOW) # Check for bill-related content if any(bill in message for bill in bills): 
colors.add(GREEN) if (len(colors) == 0): self.num_missed += 1 else: self.num_colored += 1 return colors # Streaming triangles algorithm as described in Jha et al. 2013 def __streaming_triangles__(self, redis_db, new_edge, color): k = self.__update__(new_edge, color) transitivity = 3 * k redis_db.set(str(color + '_transitivity'), transitivity) # store calculated transitivity in Redis # Update function as described in Jha et al. 2013 def __update__(self, new_edge, color): self.edge_count[color] += 1 # increment edge counter updated_edge_res = False # Check if new edge closes any of the wedges to form a triangle for i in range(len(self.wedge_res[color])): if self.__is_closed_by__(self.wedge_res[color][i], new_edge): self.is_closed[color][i] = True # Use reservoir sampling method to maintain random sample of edges, including new edges from stream for i in range(len(self.edge_res[color])): x = random.uniform(0, 1) if x < (1 / float(self.edge_count[color])): self.edge_res[color][i] = new_edge updated_edge_res = True if updated_edge_res: new_wedges = [] # stores all new wedges created by the new edge in the edge reservoir # Generate list of new wedges created by the newest edge added to the edge reservoir for i in range(len(self.edge_res[color])): if self.__creates_wedge__(self.edge_res[color][i], new_edge): new_wedges.append(self.__get_wedge__(self.edge_res[color][i], new_edge)) self.total_wedges[color] += len(new_wedges) # Update ratio for total number of wedges # Use reservoir sampling method to maintain random sample of wedges, including newly formed wedges from stream for i in range(len(self.wedge_res[color])): x = random.uniform(0, 1) if self.total_wedges[color] > 0 and x < (len(new_wedges) / float(self.total_wedges[color])): w = random.choice(new_wedges) self.wedge_res[color][i] = w self.is_closed[color][i] = False # Return ratio of closed wedges (triangles) in wedge reservoir return np.sum(self.is_closed[color]) / float(len(self.is_closed[color])) # Extract relevant data from json body def __extract_edge__(self, json_obj): json_data = json.loads(json_obj) try: from_id = int(json_data['actor']['id']) # Sender data to_id = int(json_data['transactions'][0]['target']['id']) # Receiver data except (KeyError, IndexError, TypeError, ValueError): from_id = -1 # Values of -1 are filtered out later to_id = -1 edge = sorted(tuple((from_id, to_id))) # Sort to treat edges as undirected (i.e. (132, 452) = (452, 132)) return edge # Extract wedge from adjacent edges def __get_wedge__(self, edge1, edge2): if edge1[0] == edge2[0]: return tuple((edge2[1], edge1[0], edge1[1])) if edge1[0] == edge2[1]: return tuple((edge2[0], edge1[0], edge1[1])) if edge1[1] == edge2[0]: return tuple((edge2[1], edge1[1], edge1[0])) if edge1[1] == edge2[1]: return tuple((edge2[0], edge1[1], edge1[0])) return None # Check if input edge closes input wedge def __is_closed_by__(self, wedge, edge): if (wedge[0] == edge[0] and wedge[2] == edge[1]) or (wedge[0] == edge[1] and wedge[2] == edge[0]): return True return False # Check if input edges create a wedge def __creates_wedge__(self, edge1, edge2): if edge1[0] == edge2[0] and edge1[1] != edge2[1]: return True if edge1[0] == edge2[1] and edge1[1] != edge2[0]: return True if edge1[1] == edge2[1] and edge1[0] != edge2[0]: return True if edge1[1] == edge2[0] and edge1[0] != edge2[1]: return True return False
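# __update__() above maintains its reservoirs with a per-slot rule: after t
# edges, each slot is replaced by the new item with probability 1/t, which
# keeps every slot an independent uniform sample of the stream (duplicates
# across slots are allowed, as in Jha et al. 2013). The rule in isolation:
import random

def per_slot_reservoir(stream, size):
    reservoir = [None] * size
    for t, item in enumerate(stream, start=1):
        for i in range(size):
            if random.uniform(0, 1) < 1.0 / t:   # same test as __update__()
                reservoir[i] = item
    return reservoir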
class StoreSpider(RedisSpider): name = "store" allowed_domains = ["aliexpress.com"] start_urls = ( 'http://www.aliexpress.com/', ) prefix = '' def __init__(self): self.redis_queue = None self.ids = ScalableBloomFilter(mode=ScalableBloomFilter.LARGE_SET_GROWTH) def get_queue(self): for value in set(self.server.smembers(self.redis_key)): yield value def start_requests(self): StoreSpider.prefix = self.settings['prefix'] self.redis_key = '{}:store'.format(StoreSpider.prefix) self.redis_queue = self.get_queue() db = MongoClient().aliexpress for store in db['{}store'.format(StoreSpider.prefix)].find(): self.ids.add(store['url'][store['url'].rfind('/') + 1:]) yield self.next_request() def next_request(self): while True: try: url = next(self.redis_queue) except StopIteration: url = None if not (url and self.ids.add(url[url.rfind('/') + 1:])): break if url: return self.make_requests_from_url(url) else: raise CloseSpider('redis queue has no url to request') def parse(self, response): try: self.log('request store: {}'.format(response.url), logging.INFO) owner_member_id = response.css('.s-alitalk').xpath('a/@data-id1').extract()[0] evaluation_detail_url = 'http://feedback.aliexpress.com/display/evaluationDetail.htm?ownerMemberId={}'.format(owner_member_id) store_feedback_item = UrlItem() store_feedback_item['prefix'] = StoreSpider.prefix store_feedback_item['type'] = 'storefeedback' store_feedback_item[ 'url'] = 'http://feedback.aliexpress.com/display/evaluationList.htm?ownerMemberId={}&refreshPage=received'.format( owner_member_id) yield scrapy.Request(url=evaluation_detail_url, callback=self.parse_evaluation_detail, meta={'store_feedback_item': store_feedback_item}) except: try: store_url = response.meta['redirect_urls'][0] except: store_url = response.url self.log('strange store url: {}'.format(store_url), logging.ERROR) finally: self.log('meet anti-spider, back store: {}'.format(store_url), logging.INFO) url_item = UrlItem() url_item['prefix'] = StoreSpider.prefix url_item['type'] = 'store' url_item['url'] = store_url yield url_item def parse_evaluation_detail(self, response): self.log('parse evaluation detail: {}'.format(response.url), logging.INFO) summary_tb_tds = response.xpath('//div[@id="feedback-summary"]/div/table/tbody/tr/td') store_name = summary_tb_tds[0].xpath('a/text()').extract()[0] store_url = summary_tb_tds[0].xpath('a/@href').extract()[0] store_positive_feedback = summary_tb_tds[1].xpath('span/text()').extract()[0] store_positive_score = int(summary_tb_tds[2].xpath('span/text()').extract()[0].replace(',', '')) store_since_time = datetime.strptime(summary_tb_tds[3].xpath('text()').extract()[0].strip(), '%d %b %Y') history_tds = response.xpath('//div[@id="feedback-history"]/div/table/tbody/tr/td/a/text()').extract() one_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[::5]] three_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[1::5]] six_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[2::5]] twelve_month_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[3::5]] overall_feedback = [int(td.strip().replace(',', '').replace('-', '0')) for td in history_tds[4::5]] store_id = store_url.split('/')[-1] # store_feedback_item = response.meta['store_feedback_item'] # store_feedback_item['url'] += '&storeId={}'.format(store_id) # yield store_feedback_item item = StoreItem() item['prefix'] = StoreSpider.prefix item['_id'] = store_id 
item['url'] = store_url item['name'] = store_name item['positive_feedback'] = store_positive_feedback item['positive_score'] = store_positive_score item['since_time'] = store_since_time item['one_month_feedback'] = one_month_feedback item['three_month_feedback'] = three_month_feedback item['six_month_feedback'] = six_month_feedback item['twelve_month_feedback'] = twelve_month_feedback item['overall_feedback'] = overall_feedback all_product_url = 'http://www.aliexpress.com/store/all-wholesale-products/{}.html'.format(store_id) self.log('request product store: {}'.format(response.url), logging.INFO) return scrapy.Request(all_product_url, meta={'item': item}, callback=self.parse_product_num) def parse_product_num(self, response): self.log('parse product num: {}'.format(response.url), logging.INFO) item = response.meta['item'] product_num = int(response.xpath('//div[@id="result-info"]/strong/text()').extract()[0].replace(',', '')) item['product'] = product_num return item
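# parse_evaluation_detail() above receives the five feedback-history columns
# (1/3/6/12 months and overall) flattened row-by-row into one list of cell
# texts, and the [n::5] strides recover the columns. A tiny illustration with
# made-up numbers:
cells = ['1', '2', '3', '4', '5',        # row 1: positive counts
         '10', '20', '30', '40', '50']   # row 2: neutral counts (illustrative)
one_month = [int(c) for c in cells[0::5]]
overall = [int(c) for c in cells[4::5]]
assert one_month == [1, 10] and overall == [5, 50]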
class ProductSpider(RedisSpider): name = "product" allowed_domains = ["aliexpress.com"] start_urls = ('http://www.aliexpress.com/', ) prefix = '' def __init__(self): self.products = dict() self.ids = ScalableBloomFilter( mode=ScalableBloomFilter.LARGE_SET_GROWTH) self.redis_queue = None def get_queue(self): for value in set(self.server.smembers(self.redis_key)): yield value def start_requests(self): ProductSpider.prefix = self.settings['prefix'] self.redis_key = '{}:product'.format(ProductSpider.prefix) self.redis_queue = self.get_queue() db = MongoClient().aliexpress for product in db['{}product'.format(ProductSpider.prefix)].find(): self.ids.add(product['url'][product['url'].rfind('/') + 1:product['url'].rfind('.')]) yield self.next_request() def next_request(self): while True: try: url = next(self.redis_queue) except StopIteration: url = None if not (url and self.ids.add(url[url.rfind('/') + 1:url.rfind('.')])): break if url: return self.make_requests_from_url(url) else: raise CloseSpider('redis queue has no url to request') def parse(self, response): self.log('product url: {}'.format(response.url), logging.INFO) try: store_url = response.css('.shop-name').xpath( 'a/@href').extract()[0] self.log('crawl store url: {}'.format(store_url), logging.INFO) store_item = UrlItem() store_item['prefix'] = ProductSpider.prefix store_item['type'] = 'store' store_item['url'] = store_url yield store_item feedback_base_url = response.xpath( '//div[@id="feedback"]/iframe/@thesrc').extract()[0] parsed = urlparse.urlparse(feedback_base_url) product_id = urlparse.parse_qs(parsed.query)['productId'][0] try: percent_num = response.css('.percent-num').xpath( 'text()').extract()[0] rantings_text = response.css('.rantings-num').xpath( 'text()').extract()[0] rantings_num = rantings_text[1:rantings_text.index(' ')] order_text = response.css('.order-num').xpath( 'text()').extract()[0] order_num = order_text[:order_text.index(' ')] except: percent_num = 0 rantings_num = 0 order_num = 0 product_item = ProductItem() product_item['prefix'] = ProductSpider.prefix product_item['_id'] = product_id product_item['store'] = store_url product_item['url'] = response.url product_item['percent_num'] = percent_num product_item['rantings_num'] = rantings_num product_item['order_num'] = order_num yield product_item feedback_item = UrlItem() feedback_item['prefix'] = ProductSpider.prefix feedback_item['type'] = 'feedback' feedback_item['url'] = feedback_base_url yield feedback_item order_item = UrlItem() order_item['prefix'] = ProductSpider.prefix order_item['type'] = 'order' order_item[ 'url'] = 'http://feedback.aliexpress.com/display/evaluationProductDetailAjaxService.htm?productId={}&type=default'.format( product_id) yield order_item except: try: product_url = response.meta['redirect_urls'][0] except: product_url = response.url self.log('strange product url: {}'.format(product_url), logging.ERROR) finally: self.log( 'meet anti-spider, back product: {}'.format(product_url), logging.INFO) url_item = UrlItem() url_item['prefix'] = ProductSpider.prefix url_item['type'] = 'product' url_item['url'] = product_url yield url_item
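# The dedup key used by ProductSpider above is the substring between the last
# '/' and the last '.' of the URL, i.e. the numeric product id of an
# aliexpress-style ".../12345.html" address. Illustration on a made-up URL:
url = 'http://www.aliexpress.com/item/some-product/12345.html'
product_id = url[url.rfind('/') + 1:url.rfind('.')]
assert product_id == '12345'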
class BFSFrontier(Frontier): def __init__(self, spider): super(BFSFrontier, self).__init__(spider) self._spider = spider self.args = {'rules': [], 'order': 'bfs'} self.redis = RediSugar.getConnection() self.filter = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) self.todo = spider.name + '-todo' self.visited = spider.name + '-visited' self._feedfilter() def setargs(self, args): if not isinstance(args, dict): raise FrontierException('Args must be a dict') for key, value in args.iteritems(): self.args[key] = value if self.args['rules']: for each in self.args['rules']: try: re.compile(each) except re.error: raise FrontierException('Wrong regular expression: \'{0}\''.format(each)) def __len__(self): return self.redis.llen(self.todo) def __contains__(self, item): temp = self.redis.lrange(self.todo, 0, self.__len__()) return item in temp def visitednum(self): return self.redis.llen(self.visited) def add(self, item): if isinstance(item, list): for each in iter(item): self._addone(each) elif isinstance(item, str): self._addone(item) else: raise FrontierException('Unsupported type: {0}'.format(type(item))) def _addone(self, item): if not self.isVisited(item) and self.validate(item): self.redis.rpush(self.todo, item) def next(self, num=1): if num == 1: return self._nextone() elif num == 0 or num >= self.__len__(): return self._nextall() elif num > 1: result = [] while len(result) < num: item = self._nextone() if item: result.append(item) return result else: raise FrontierException('Num should be greater than 0') def _nextone(self): item = self.redis.lpop(self.todo) while item: if item in self.filter: item = self.redis.lpop(self.todo) else: self.filter.add(item) self.redis.rpush(self.visited, item) break return item def _nextall(self): temp = self.redis.lrange(self.todo, 0, self.__len__()) result = [x for x in temp if x not in self.filter] self.redis.ltrim(self.todo, len(temp), self.__len__()) for each in iter(result): self.filter.add(each) self.redis.rpush(self.visited, each) return result def hasnext(self): return self.__len__() != 0 def isVisited(self, item): return item in self.filter def validate(self, item): if self.args['rules']: for each in self.args['rules']: if not re.match(each, item): return False return True def clean(self, *args): if 'visited' in args: self.redis.delete(self.visited) if 'todo' in args: self.redis.delete(self.todo) def _feedfilter(self): length = self.redis.llen(self.visited) if length != 0: map(self.filter.add, self.redis.lrange(self.visited, 0, length)) def save(self): try: self.redis.bgsave() except ResponseError: pass
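# Hedged usage sketch for BFSFrontier above: the spider object and the
# RediSugar connection come from the surrounding project and are assumed here;
# the rule keeps the crawl on one host. Visited URLs survive restarts because
# _feedfilter() replays the Redis "visited" list into the bloom filter.
frontier = BFSFrontier(spider)   # spider: project-defined, provides .name
frontier.setargs({'rules': [r'^https?://example\.com/']})
frontier.add('http://example.com/')
while frontier.hasnext():
    url = frontier.next()        # pops one unvisited URL, marks it visited
    # ... fetch url, then feed extracted links back in:
    frontier.add(['http://example.com/a', 'http://example.com/b'])
frontier.save()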