def _get_extensions(self):
    """
    Build a mapping of file extensions to the spidered links that use them.

    Paths are read from the ``spider_urls`` collection (grouped by ``path``).
    A path contributes an extension when its last dot is at most five
    characters from the end of the path, dot included. For every distinct
    extension the collection is then queried for paths ending with it.

    :return: dict ``{extension: [link, ...]}``. The extension keeps its
        leading dot; each link is ``path?query`` when the stored query is
        non-empty, otherwise just ``path``.
    """
    coll = Registry().get('mongo').spider_urls

    # Group by 'path' with a no-op reduce function to get the stored paths.
    rows = mongo_result_to_list(
        coll.group({'path': True}, '', {}, 'function () {}')
    )

    extensions = set()
    for row in rows:
        path = row['path']
        dot_pos = path.rfind('.')
        # Keep only short suffixes (".php", ".html", ...), dot included.
        if dot_pos > -1 and len(path) - dot_pos <= 5:
            extensions.add(path[dot_pos:])

    result = {}
    for ext in extensions:
        # Escape the whole extension so that characters such as '+' or '('
        # are matched literally instead of being read as regex syntax.
        pattern = re.compile(re.escape(ext) + '$')
        matched = mongo_result_to_list(coll.find({'path': pattern}))
        result[ext] = [
            link['path'] + '?' + link['query'] if link['query'] else link['path']
            for link in matched
        ]

    return result
def _get_codes_stat(self):
    """
    Collect the spidered links for every HTTP response code.

    Codes are read from the ``spider_urls`` collection (grouped by ``code``);
    for each of them the matching documents are fetched with only the
    ``path`` and ``query`` fields requested.

    :return: dict ``{int(code): [link, ...]}``. Each link is ``path?query``
        when the stored query is non-empty, otherwise just ``path``.
    """
    coll = Registry().get('mongo').spider_urls
    stat = {}

    grouped = coll.group({'code': True}, '', {}, 'function () {}')
    for row in grouped:
        http_code = row['code']
        cursor = coll.find({'code': http_code}, {'path': 1, 'query': 1})

        urls = []
        for doc in mongo_result_to_list(cursor):
            if doc['query']:
                urls.append(doc['path'] + '?' + doc['query'])
            else:
                urls.append(doc['path'])

        stat[int(http_code)] = urls

    return stat
def _get_extensions(self):
    """
    Build a mapping of file extensions to the spidered links that use them.

    Paths are read from the ``spider_urls`` collection (grouped by ``path``).
    A path contributes an extension when its last dot is at most five
    characters from the end of the path, dot included. For every distinct
    extension the collection is then queried for paths ending with it.

    :return: dict ``{extension: [link, ...]}``. The extension keeps its
        leading dot; each link is ``path?query`` when the stored query is
        non-empty, otherwise just ``path``.
    """
    coll = Registry().get('mongo').spider_urls

    # Group by 'path' with a no-op reduce function to get the stored paths.
    rows = mongo_result_to_list(
        coll.group({'path': True}, '', {}, 'function () {}')
    )

    extensions = set()
    for row in rows:
        path = row['path']
        dot_pos = path.rfind('.')
        # Keep only short suffixes (".php", ".html", ...), dot included.
        if dot_pos > -1 and len(path) - dot_pos <= 5:
            extensions.add(path[dot_pos:])

    result = {}
    for ext in extensions:
        # Escape the whole extension so that characters such as '+' or '('
        # are matched literally instead of being read as regex syntax.
        pattern = re.compile(re.escape(ext) + '$')
        matched = mongo_result_to_list(coll.find({'path': pattern}))
        result[ext] = [
            link['path'] + '?' + link['query'] if link['query'] else link['path']
            for link in matched
        ]

    return result
class MongoJob(WSJob):
    """
    Common class for jobs whose queue is fed from a MongoDB collection.

    Rows are stored as ``{"name": ..., "checked": 0, "getted": 0}``.
    ``getted`` is set to 1 when a row has been put into the in-memory queue
    (see :meth:`load_data`) and ``checked`` is set to 1 by :meth:`task_done`.
    """
    # When True, _put() drops items it has already seen and the 'name'
    # index is created as unique.
    unique = True
    # MongoDB collection object; assigned in __init__.
    collection = None
    # Not referenced inside this class.
    select_limit = 50
    # When True, load_file() ignores lines that are blank after strip().
    skip_blank_rows = True
    # Number of task_done() calls so far.
    counter = 0
    # Key used to pick the collection from the 'mongo' registry entry;
    # presumably set by subclasses -- TODO confirm.
    collection_name = None

    def __init__(self, maxsize=0):
        WSJob.__init__(self, maxsize)
        self.collection = Registry().get('mongo')[self.collection_name]

    def build_row(self, _str):
        """ Common build row method for MongoDB """
        return {
            "name": _str.strip(),
            "checked": 0,
            "getted": 0
        }

    def qsize(self):
        """ Size of queue: number of rows in the collection with checked == 0 """
        return self.collection.find({"checked": 0}).count()

    def set_unique(self, unique=True):
        """ Enable remove dups in queue """
        self.unique = unique

    def set_skip_blank_rows(self, value=True):
        """ If True - we will skip blank rows when filling the queue from a file """
        self.skip_blank_rows = value

    def task_done(self, name):
        """ Mark current row as done (checked = 1) and bump the counter """
        self.counter += 1
        self.collection.update(
            {'name': str(unicode(name)), "getted": 1},
            {"$set": {"checked": 1}}
        )
        WSJob.task_done(self)

    def get(self, block=False, timeout=None):
        """
        Get next item from queue.

        More rows are loaded from MongoDB first when the in-memory queue is
        empty or fewer than 50 unchecked rows remain in the collection.

        :raises Queue.Empty: when the queue is still empty after loading.
        """
        if self.empty() or self.qsize() < 50:
            self.load_data()

        if self.empty():
            raise Queue.Empty

        return WSJob.get(self, block, timeout)

    def load_data(self):
        """
        Load data into queue from MongoDB.

        Takes rows that are neither checked nor getted (limited by the
        ``main.mongo_data_load_per_once`` config value), puts their names
        into the queue and marks them as getted.
        """
        data = self.collection.find(
            {"checked": 0, "getted": 0},
            limit=int(Registry().get('config')['main']['mongo_data_load_per_once'])
        )

        for row in data:
            self.put(row['name'])
            self.collection.update({"name": row['name']}, {"$set": {"getted": 1}})

        return True

    def _log_unicode_error(self, source, word, last):
        """ Report a word skipped because it could not be decoded """
        _str = " UNICODE ERROR: In file '{0}' skip word '{1}', after word '{2}' !".format(source, word, last)
        if Registry().isset('logger'):
            Registry().get('logger').log(_str)
        else:
            # Single parenthesised argument: same output on Python 2 and 3.
            print(_str)

    def load_dict(self, dict_for_load, drop=True):
        """
        Fill collection from dict.

        :param dict_for_load: iterable of words to insert.
        :param drop: drop the collection before filling it.
        :return: number of words inserted.
        """
        if drop:
            self.collection.drop()

        counter = 0
        last = "START OF FILE"

        for line in dict_for_load:
            try:
                line = line.strip()
                unicode(line)
                self.collection.insert(self.build_row(line))
            except UnicodeDecodeError:
                # There is no file name here; the original formatted the
                # builtin `file` type into the message by mistake.
                self._log_unicode_error('<dict>', line, last)
                continue

            counter += 1
            last = line

        self.load_data()

        return counter

    def load_dom(self, dom):
        """
        Fill queue from DictOfMask.

        Words are read with ``dom.get()`` until it returns None.

        :return: number of rows in the collection.
        """
        self.collection.drop()
        while True:
            word = dom.get()
            if word is None:
                break
            self.collection.insert(self.build_row(word))

        self.collection.create_index('name', drop_dups=True, unique=self.unique)

        self.load_data()

        return self.collection.count()

    def load_file(self, _file):
        """
        Fill queue from text file.

        :param _file: path of the file to read, one word per line.
        :return: number of rows in the collection.
        """
        self.collection.drop()

        last = "START OF FILE"
        # `with` closes the handle even if an insert raises midway.
        with open(_file) as fh:
            for line in fh:
                if not line.strip() and self.skip_blank_rows:
                    continue

                try:
                    line = line.strip()
                    unicode(line)
                    self.collection.insert(self.build_row(line))
                except UnicodeDecodeError:
                    self._log_unicode_error(_file, line, last)
                    continue

                last = line

        self.collection.create_index('name', drop_dups=True, unique=self.unique)

        self.load_data()

        return self.collection.count()

    # The two methods below are taken from
    # http://stackoverflow.com/questions/1581895/how-check-if-a-task-is-already-in-python-queue
    # A recipe for unique tasks in a queue.
    def _init(self, maxsize):
        WSJob._init(self, maxsize)
        if self.unique:
            self.all_items = set()

    def _put(self, item):
        if not self.unique:
            WSJob._put(self, item)
        elif item not in self.all_items:
            WSJob._put(self, item)
            self.all_items.add(item)
        # Duplicates are dropped silently. The original built an unused
        # warning string here, which could itself fail on non-ASCII items.