def test_OpenCloseSingle(self):
    """Write 1 item, close, reopen checking if same item is there"""
    q = Queue(self.path)
    q.put('var1')
    del q
    q = Queue(self.path)
    self.assertEqual(1, q.qsize())
    self.assertEqual('var1', q.get())
    q.task_done()
def test_PartialWrite(self):
    """Test recovery from previous crash w/ partial write"""
    q = Queue(self.path)
    for i in range(100):
        q.put('var%d' % i)
    del q
    with open(os.path.join(self.path, 'q00000'), 'ab') as f:
        pickle.dump('文字化け', f)
    q = Queue(self.path)
    self.assertEqual(100, q.qsize())
    for i in range(100):
        self.assertEqual('var%d' % i, q.get())
        q.task_done()
    with self.assertRaises(Empty):
        q.get_nowait()
def test_ClearOldFile(self):
    """put until reaching chunksize, then get without calling task_done"""
    q = Queue(self.path, chunksize=10)
    for i in range(15):
        q.put('var1')

    for i in range(11):
        q.get()

    q = Queue(self.path, chunksize=10)
    self.assertEqual(q.qsize(), 15)

    for i in range(11):
        q.get()
    q.task_done()
    self.assertEqual(q.qsize(), 4)
def test_RandomReadWrite(self):
    """Test random read/write"""
    q = Queue(self.path)
    n = 0
    for i in range(1000):
        if random.random() < 0.5:
            if n > 0:
                q.get_nowait()
                q.task_done()
                n -= 1
            else:
                with self.assertRaises(Empty):
                    q.get_nowait()
        else:
            q.put('var%d' % random.getrandbits(16))
            n += 1
def test_GarbageOnHead(self):
    """Adds garbage to the queue head and lets the internal integrity checks fix it"""
    q = Queue(self.path)
    q.put('var1')
    del q
    with open(os.path.join(self.path, 'q00001'), 'a') as fd:
        fd.write('garbage')
    q = Queue(self.path)
    q.put('var2')
    self.assertEqual(2, q.qsize())
    self.assertEqual('var1', q.get())
    q.task_done()
def test_OpenCloseOneHundred(self):
    """Write 1000 items, close, reopen checking if all items are there"""
    q = Queue(self.path)
    for i in range(1000):
        q.put('var%d' % i)
    del q
    q = Queue(self.path)
    self.assertEqual(1000, q.qsize())
    for i in range(1000):
        data = q.get()
        self.assertEqual('var%d' % i, data)
        q.task_done()
    with self.assertRaises(Empty):
        q.get_nowait()
    # assert adding another one still works
    q.put('foobar')
    data = q.get()
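# The test methods above assume a unittest.TestCase harness that provides
# ``self.path`` plus ``Queue``/``Empty`` from a persistent-queue library.
# A minimal sketch of that scaffolding follows; the ``persistqueue`` import is an
# assumption based on the put/get_nowait/task_done API used above, and the class
# name is illustrative only.
import os
import pickle
import random
import shutil
import tempfile
import unittest

from persistqueue import Queue, Empty  # assumed; swap in the project's actual queue module


class PersistentQueueTestCase(unittest.TestCase):  # illustrative harness name
    """Scaffolding the tests above rely on: a fresh on-disk queue dir per test."""

    def setUp(self):
        self.path = tempfile.mkdtemp(suffix='-queue')

    def tearDown(self):
        shutil.rmtree(self.path, ignore_errors=True)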
# NOTE: the imports below are a best-guess reconstruction of this module's
# dependencies, inferred from how the names are used; the third-party packages
# (pybloom fork, persist-queue, slugify, urlnorm) and the project-local paths
# (common, ExplorerArticle) are assumptions, not confirmed by the snippet.
import logging
import os
import re
from urlparse import urlparse, urljoin, urlunparse  # Python 2 stdlib

import urlnorm                                # url normalization helper
from pybloom import ScalableBloomFilter       # joseph-fox fork of pybloom
from persistqueue import Queue, Empty         # persistent on-disk queue
from slugify import slugify

import common                                 # project-local config helper (assumed)
from article import ExplorerArticle           # project-local article wrapper (assumed path)


class Crawler(object):

    def __init__(self, site):
        '''
        (Crawler, str) -> Crawler

        creates a Crawler with a given origin_url
        '''
        self.site = site
        self.filters = site.referringsitefilter_set.all()
        self.domain = urlparse(site.url).netloc

        # http://alexeyvishnevsky.com/2013/11/tips-on-optimizing-scrapy-for-a-high-performance/
        # fork of pybloom: https://github.com/joseph-fox/python-bloomfilter
        self.ignore_filter = ScalableBloomFilter(
            initial_capacity=10000000,
            error_rate=0.00001)

        ignore_filter_dir = '../ignore_filter/'
        if not os.path.exists(ignore_filter_dir):
            os.makedirs(ignore_filter_dir)

            self.ignore_filter = ScalableBloomFilter(
                initial_capacity=10000000,
                error_rate=0.00001)
            try:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'r+')
                # 'r+' raises IOError when the file does not exist yet (the usual
                # case right after the directory is created), so the handler below
                # ends up creating an empty ignore file instead.
                f.write(self.ignore_filter)
            except IOError:
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()
        else:
            if not os.path.exists('../ignore_filter/' + self.site.name + '_ignore_file.txt'):
                f = open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'w+')
                f.close()

            # reload previously seen urls into the bloom filter
            with open('../ignore_filter/' + self.site.name + '_ignore_file.txt',
                      'r+', buffering=False) as ignore_filter_file:
                try:
                    for line in ignore_filter_file:
                        self.ignore_filter.add(line.decode('utf8').rstrip())
                except Exception as e:
                    logging.info(str(e))
            ignore_filter_file.close()

        self.visited_count = 0

        tmpqueuetmp_dir = '../tmpqueue/tmp/'
        if not os.path.exists(tmpqueuetmp_dir):
            os.makedirs(tmpqueuetmp_dir)

        slugified_name = slugify(unicode(site.name))
        tmpqueue_dir = '../tmpqueue/{}'.format(slugified_name)
        if not os.path.exists(tmpqueue_dir):
            os.makedirs(tmpqueue_dir)

        self.to_visit = Queue(tmpqueue_dir, tempdir=tmpqueuetmp_dir)

        # Initial url
        if not self.site.is_shallow:
            self.to_visit.put(site.url)
        else:
            self.to_visit.put((site.url, str(0)))

        # Limit
        self.limit = common.get_config()["crawler"]["limit"]
        # Specifies how deep the shallow crawler should go; "1" is the lowest option for this
        self.level = common.get_config()["crawler"]["level"]

        """
        self.probabilistic_n = common.get_config()["crawler"]["n"]
        self.probabilistic_k = common.get_config()["crawler"]["k"]

        self.db = psycopg2.connect(host='localhost',
                                   database=common.get_config()["crawler"]["postgresql"]["name"],
                                   user=common.get_config()["crawler"]["postgresql"]["user"],
                                   password=common.get_config()["crawler"]["postgresql"]["password"])
        self.cursor = self.db.cursor()
        self.already_added_urls = set()
        self.visited_table = "visited_" + str(site.id)
        self.tovisit_table = "tovisit_" + str(site.id)

        #self.cursor.execute("DROP TABLE IF EXISTS " + self.visited_table)
        #self.cursor.execute("CREATE TABLE " + self.visited_table + " (url VARCHAR(1024) PRIMARY KEY)")
        self.cursor.execute("DROP TABLE IF EXISTS " + self.tovisit_table)
        self.cursor.execute(u"CREATE TABLE " + self.tovisit_table + " (id SERIAL PRIMARY KEY, url VARCHAR(1024))")
        #self.cursor.execute(u"INSERT INTO " + self.visited_table + " VALUES (%s)", (site.url,))
        self.cursor.execute(u"INSERT INTO " + self.tovisit_table + " VALUES (DEFAULT, %s)", (site.url,))

        self.db.commit()
        """

    def __iter__(self):
        return self

    def next(self):
        '''
        (Crawler) -> newspaper.Article

        returns the next article in the sequence
        '''
        # standard non-recursive tree iteration
        with open('../ignore_filter/' + self.site.name + '_ignore_file.txt', 'a') as ignore_filter_file:
            try:
                current_level = 0
                while True:
                    if self.limit > 0 and self.visited_count > self.limit:
                        raise StopIteration('Limit reached: {:d}'.format(self.limit))
                    # if(self.pages_visited > self.probabilistic_n):
                    #     raise StopIteration
                    # self.cursor.execute("SELECT * FROM " + self.tovisit_table + " ORDER BY id LIMIT 1")
                    # row = self.cursor.fetchone()
                    # if(row):
                    #     row_id = row[0]
                    #     current_url = row[1]
                    #     self.cursor.execute("DELETE FROM " + self.tovisit_table + " WHERE id=%s", (row_id,))
                    # else:
                    #     raise StopIteration

                    # if(self._should_skip()):
                    #     logging.info(u"skipping {0} randomly".format(current_url))
                    #     continue

                    try:
                        if self.site.is_shallow:
                            current = self.to_visit.get_nowait()
                            current_url = current[0]
                            current_level = current[1]
                            logging.info(u"Shallow on level {0} {1}".format(current_level, current_url))
                        else:
                            current_url = self.to_visit.get_nowait()
                    except Empty:
                        self.site.is_shallow = True  # On line 26 the site gets set TO DELETE
                        self.to_visit.put((self.site.url, str(0)))
                        self.ignore_filter = ScalableBloomFilter(
                            initial_capacity=10000000,
                            error_rate=0.00001)
                        ignore_filter_file.close()
                        os.remove('../ignore_filter/' + self.site.name + '_ignore_file.txt')
                        logging.info("stopped iteration")
                        logging.info(u"{0}".format(self.site.url))
                        raise ZeroDivisionError

                    logging.info(u"visiting {0}".format(current_url))
                    self.visited_count += 1

                    # use newspaper to download and parse the article
                    article = ExplorerArticle(current_url)
                    article.download()

                    if self.site.is_shallow:
                        if int(current_level) > self.level:
                            continue

                    # get urls from the article
                    for link in article.get_links():
                        url = urljoin(current_url, link.href, False)
                        if self.url_in_filter(url, self.filters):
                            logging.info(u"skipping url \"{0}\" because it matches filter".format(url))
                            continue
                        try:
                            parsed_url = urlparse(url)
                            parsed_as_list = list(parsed_url)
                            if parsed_url.scheme != u"http" and parsed_url.scheme != u"https":
                                logging.info(u"skipping url with invalid scheme: {0}".format(url))
                                continue
                            parsed_as_list[5] = ''
                            url = urlunparse(urlnorm.norm_tuple(*parsed_as_list))
                        except Exception as e:
                            logging.info(u"skipping malformed url {0}. Error: {1}".format(url, str(e)))
                            continue
                        if not parsed_url.netloc.endswith(self.domain):
                            continue
                        # If the url has already been added to the ignore list, skip it
                        if url in self.ignore_filter:
                            continue
                        # Ignore the subscribe links for many domains, except
                        # "-subscribe"/"subscribe-" style urls
                        if (u"subscribe" in url
                                and not (u"-subscribe" in url or u"subscribe-" in url)):
                            continue

                        # Append the url to the to_visit queue
                        if self.site.is_shallow:
                            self.to_visit.put((url, str(int(current_level) + 1)))
                            logging.info(u"added {0} to the to_visit as well as the level {1}".format(
                                url, str(int(current_level) + 1)))
                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")
                        else:
                            self.to_visit.put(url)
                            logging.info(u"added {0} to the to_visit".format(url))
                            # Append the url to visited to remove duplicates
                            self.ignore_filter.add(url)
                            ignore_filter_file.write(url.encode('utf8') + "\n")

                    # Update the Queue
                    self.to_visit.task_done()

                    return article

            except StopIteration as e:
                raise e
            except ValueError as e:
                raise ValueError
            except Exception as e:
                raise e

    def url_in_filter(self, url, filters):
        """
        Checks if any of the filters matches the url.
        Filters can be in regex search or normal string comparison.
        """
        for filt in filters:
            if ((filt.regex and re.search(filt.pattern, url, re.IGNORECASE)) or
                    (not filt.regex and filt.pattern in url)):
                return True
        return False
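# A minimal sketch of how the Crawler iterator appears intended to be driven.
# The ``site`` argument is assumed to be a Django-style model instance exposing
# ``url``, ``name``, ``is_shallow`` and a ``referringsitefilter_set`` relation;
# the function name below is illustrative, not part of the original module.
def crawl_site(site):
    articles = []
    try:
        # Crawler implements the Python 2 iterator protocol, so a plain for-loop
        # keeps calling next() until StopIteration (limit reached) ends the loop.
        for article in Crawler(site):
            articles.append(article)
    except ZeroDivisionError:
        # next() raises ZeroDivisionError once the to_visit queue drains and the
        # site is flipped to shallow mode; treat it as "this pass is finished".
        pass
    return articles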
class Rotator(object):

    def __init__(self, config):
        self.config = config
        self.dateformat = config['dateformat']
        self.keep_files = int(config['rotate'])
        self.now = datetime.datetime.now()
        self.dateext = self.now.strftime(self.dateformat)
        self.mode = config['mode']
        self.compress = config['compress']
        self.user = config['user']
        self.group = config['group']
        self.sharedscripts = config['sharedscripts']
        self.destext = config['destext']
        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']
        self.hdfs_config = config['hdfs']
        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)
        self.client = None
        if self.hdfs_config:
            self.client = hdfs.InsecureClient(**self.hdfs_config)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_rotated_time(self, dest_path):
        dateext = dest_path.rsplit('-', 1)[-1]
        # remove gz ext
        dateext = dateext.split('.')[0]
        return datetime.datetime.strptime('-{}'.format(dateext), self.dateformat)

    def is_rotated_file(self, dest_path):
        try:
            t = self.get_rotated_time(dest_path)
            return bool(t)
        except:
            return False

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        dest_path = os.path.join(rotated_dir, '{}{}'.format(filename, self.dateext))
        return dest_path

    def remove_old_files(self, path):
        rotated_dir = self.get_rotated_dir(path)
        filename = os.path.split(path)[-1]
        path = os.path.join(rotated_dir, filename)
        glob_path = '{}-*'.format(path)
        files = [f for f in glob.glob(glob_path) if self.is_rotated_file(f)]
        files.sort(key=self.get_rotated_time, reverse=True)
        for f in files[self.keep_files:]:
            os.remove(f)

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)
        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)
        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)
        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]
        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.client, path, from_, to)

    def secure_copy(self):
        to_be_clean = set()
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue
                if self.compress:
                    rotated_path = self.compress_file(rotated_path)
                if self.copy:
                    self.copy_file(rotated_path)
                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)
                self.queue.task_done()
                if self.compress:
                    os.remove(rotated_path_before)
                to_be_clean.add(path)
            except Empty:
                break
            except Exception as e:
                print e
        for path in to_be_clean:
            self.remove_old_files(path)

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()
        for f in iterate_log_paths(self.config['paths']):
            if not self.sharedscripts:
                self.prerotate()
            self.rename_file(f)
            if not self.sharedscripts:
                self.postrotate()
        if self.sharedscripts:
            self.postrotate()
        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
# NOTE: assumed imports for the revised Rotator below, inferred from usage;
# makedirs/chown/gzip/run/iterate_log_paths/is_empty_file look like project-local
# helpers, and the queue is assumed to be a persist-queue style on-disk FIFO.
import datetime
import os
import shutil
import socket

import hdfs                                    # HdfsCLI client library
from persistqueue import Queue, Empty          # assumed persistent queue module

from .utils import (makedirs, chown, gzip, run,              # assumed helper module path
                    iterate_log_paths, is_empty_file)


class Rotator(object):

    def __init__(self, config):
        self.paths = config['paths']
        self.mode = int(config['mode'], 8)
        self.user = config['user']
        self.group = config['group']
        # FIXME: Handle rotated files keeping correctly
        # self.keep_files = int(config['rotate'])
        self.compress = config['compress']
        self.copy = config['copy']
        self.copytohdfs = config['copytohdfs']
        self.hdfs_config = config['hdfs']
        self.hdfs_client = None
        if self.hdfs_config:
            self.hdfs_client = hdfs.InsecureClient(**self.hdfs_config)
        self.dateformat = config['dateformat']
        self.now = datetime.datetime.now()
        self.timestamp = self.now.strftime(self.dateformat)
        self.destext = config['destext']
        self.fnformat = config['fnformat']
        if not self.fnformat:
            raise ValueError("'fnformat' cannot be empty")
        self.sharedscripts = config['sharedscripts']
        self.prerotates = config['prerotate']
        self.postrotates = config['postrotate']
        self.queuepath = config['queuepath']
        self.queue_chunksize = 1000
        self.queue_block_timeout = 30
        self.queue = Queue(self.queuepath, self.queue_chunksize)

    def get_rotated_dir(self, path):
        destext = self.now.strftime(self.destext)
        dest_dir = '{}-{}'.format(path, destext)
        return dest_dir

    def get_dest_path(self, path):
        rotated_dir = self.get_rotated_dir(path)
        logname = os.path.basename(path)
        dest_path = os.path.join(
            rotated_dir,
            self.fnformat.format(logname=logname,
                                 timestamp=self.timestamp,
                                 hostname=socket.gethostname()))
        return dest_path

    def create_rotated_dir(self, path):
        rotated_dir = self.get_rotated_dir(path)
        makedirs(rotated_dir, 0o755)
        chown(rotated_dir, self.user, self.group)

    def rename_file(self, path):
        self.create_rotated_dir(path)
        dest_path = self.get_dest_path(path)
        shutil.move(path, dest_path)
        self.queue.put((path, dest_path), timeout=self.queue_block_timeout)
        os.chmod(dest_path, self.mode)
        chown(dest_path, self.user, self.group)
        return dest_path

    def compress_file(self, dest_path):
        gzip(dest_path)
        return '{}.gz'.format(dest_path)

    def _copy_file(self, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        dest_dir = os.path.dirname(dest)
        if not os.path.exists(dest_dir):
            makedirs(dest_dir, 0o755)
            chown(dest_dir, self.user, self.group)
        if path.startswith(from_):
            shutil.copy2(path, dest)

    def copy_file(self, dest_path):
        if isinstance(self.copy, dict):
            self.copy = [self.copy]
        for item in self.copy:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_file(dest_path, from_, to)

    def _copy_to_hdfs(self, client, path, from_, to):
        if not to:
            return
        dest = os.path.normpath(path.replace(from_, to))
        if path.startswith(from_):
            client.upload(dest, path, overwrite=True, cleanup=True)

    def copy_to_hdfs(self, path):
        if not (self.copytohdfs and self.hdfs_config):
            return
        for item in self.copytohdfs:
            to = item.get('to')
            from_ = item.get('from', '')
            self._copy_to_hdfs(self.hdfs_client, path, from_, to)

    def secure_copy(self):
        while True:
            try:
                path, rotated_path = self.queue.get_nowait()
                rotated_path_before = rotated_path
                if not os.path.exists(rotated_path):
                    self.queue.task_done()
                    continue
                if self.compress:
                    rotated_path = self.compress_file(rotated_path)
                if self.copy:
                    self.copy_file(rotated_path)
                if self.copytohdfs:
                    self.copy_to_hdfs(rotated_path)
                if self.compress:
                    os.remove(rotated_path_before)
                self.queue.task_done()
            except Empty:
                break
            except Exception as e:
                print(e)
                raise

    def rotate(self):
        if self.sharedscripts:
            self.prerotate()
        for f in iterate_log_paths(self.paths):
            if is_empty_file(f):
                continue
            if not self.sharedscripts:
                self.prerotate()
            self.rename_file(f)
            if not self.sharedscripts:
                self.postrotate()
        if self.sharedscripts:
            self.postrotate()
        self.secure_copy()

    def prerotate(self):
        for cmd in self.prerotates:
            run(cmd)

    def postrotate(self):
        for cmd in self.postrotates:
            run(cmd)
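# A minimal driving sketch for the revised Rotator above. The config keys mirror
# the ones read in __init__; the concrete values (paths, formats, queue location,
# hdfs settings) are illustrative assumptions, not project defaults.
if __name__ == '__main__':
    example_config = {
        'paths': ['/var/log/myapp/*.log'],   # glob(s) consumed by iterate_log_paths()
        'mode': '0644',                      # parsed with int(mode, 8)
        'user': 'root',
        'group': 'root',
        'compress': True,
        'copy': [],                          # list of {'from': ..., 'to': ...} dicts
        'copytohdfs': [],
        'hdfs': None,                        # e.g. {'url': 'http://namenode:50070'}
        'dateformat': '-%Y%m%d-%H%M%S',
        'destext': 'rotated',
        'fnformat': '{logname}{timestamp}.{hostname}',
        'sharedscripts': True,
        'prerotate': [],
        'postrotate': [],
        'queuepath': '/var/lib/myapp/rotate-queue',
    }
    Rotator(example_config).rotate()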