def test_save_task(self):
    """Saving a task returns its id and keeps the task's directory id."""
    storage = LocalStorage(dir_name + "/test_database.db")
    directory = Directory("/some/dir", True, [], "my dir")
    dir_id = storage.save_directory(directory)
    new_task = Task(0, dir_id)
    task_id = storage.save_task(new_task)

    self.assertEqual(storage.tasks()[task_id].dir_id, dir_id)
    # Expects the first saved task to get id 1 (presumably a fresh database).
    self.assertEqual(task_id, 1)
def test_save_and_retrieve_user(self):
    """A saved user is retrievable by username with its admin flag set."""
    storage = LocalStorage(dir_name + "/test_database.db")
    storage.save_user(User("bob", b"anHashedPassword", True))

    # Each assertion re-reads users() so the check goes through storage.
    for attr, expected in (("username", "bob"), ("admin", True)):
        self.assertEqual(getattr(storage.users()["bob"], attr), expected)
def test_del_option(self):
    """Deleting option id 1 leaves only the second option on the directory."""
    storage = LocalStorage(dir_name + "/test_database.db")
    options = [Option("key1", "val1"), Option("key2", "val2")]
    directory = Directory("/some/directory", True, options, "An excellent name")
    dir_id = storage.save_directory(directory)

    # NOTE(review): the literal ids (option 1, dir_id 1) presumably rely on
    # the database being empty when the test starts - confirm against setUp.
    storage.del_option(1)

    self.assertEqual(len(storage.dirs()[dir_id].options), 1)
    remaining_expected = (("key", "key2"), ("value", "val2"), ("dir_id", 1))
    for attr, expected in remaining_expected:
        self.assertEqual(
            getattr(storage.dirs()[dir_id].options[0], attr), expected
        )
def test_update_directory(self):
    """Updated directory fields are visible through a second storage object."""
    db_path = dir_name + "/test_database.db"
    writer = LocalStorage(db_path)
    directory = Directory(
        "/some/directory",
        True,
        [Option("key1", "val1"), Option("key2", "val2")],
        "An excellent name",
    )
    dir_id = writer.save_directory(directory)

    # Mutate the in-memory object, then push the changes to storage.
    directory.name = "A modified name"
    directory.enabled = False
    directory.path = "/another/directory"
    directory.id = dir_id
    writer.update_directory(directory)

    # Read back through a new LocalStorage on the same file.
    reader = LocalStorage(db_path)
    self.assertEqual(reader.dirs()[dir_id].name, "A modified name")
    self.assertEqual(len(reader.dirs()[dir_id].options), 2)
    self.assertEqual(reader.dirs()[dir_id].path, "/another/directory")
    self.assertEqual(reader.dirs()[dir_id].enabled, 0)  # enabled = false
def test_save_and_retrieve_dir_persistent(self):
    """A directory saved by one storage object is readable from another."""
    db_path = dir_name + "/test_database.db"
    writer = LocalStorage(db_path)
    directory = Directory(
        "/some/directory",
        True,
        [Option("key1", "val1"), Option("key2", "val2")],
        "An excellent name",
    )
    dir_id = writer.save_directory(directory)

    reader = LocalStorage(db_path)
    self.assertEqual(reader.dirs()[dir_id].enabled, True)
    first_option_expected = (("key", "key1"), ("value", "val1"), ("dir_id", 1))
    for attr, expected in first_option_expected:
        self.assertEqual(
            getattr(reader.dirs()[dir_id].options[0], attr), expected
        )
def test_remove_user(self):
    """Looking up a user after remove_user raises KeyError."""
    storage = LocalStorage(dir_name + "/test_database.db")
    user = User("martin", b"anHashedPassword", True)
    storage.save_user(user)
    storage.remove_user(user.username)

    # Callable form of assertRaises; the lookup runs inside the assertion.
    self.assertRaises(KeyError, lambda: storage.users()["martin"])
def test_update_user(self):
    """Clearing the admin flag and calling update_user is reflected in users()."""
    storage = LocalStorage(dir_name + "/test_database.db")
    user = User("neil", b"anHashedPassword", True)
    storage.save_user(user)

    user.admin = False
    storage.update_user(user)

    stored_admin_flag = storage.users()["neil"].admin
    self.assertFalse(stored_admin_flag)
def test_remove_dir(self):
    """Looking up a directory after remove_directory raises KeyError."""
    storage = LocalStorage(dir_name + "/test_database.db")
    options = [Option("key1", "val1"), Option("key2", "val3")]
    directory = Directory("/some/directory", True, options, "An excellent name")
    dir_id = storage.save_directory(directory)

    storage.remove_directory(dir_id)

    # Callable form of assertRaises; the lookup runs inside the assertion.
    self.assertRaises(KeyError, lambda: storage.dirs()[dir_id])
def test_reject_duplicate_path(self):
    """Saving a second directory with the same path raises
    DuplicateDirectoryException.
    """
    s = LocalStorage(dir_name + "/test_database.db")

    d1 = Directory(
        "/some/directory",
        True,
        [Option("key1", "val1"), Option("key2", "val2")],
        "An excellent name",
    )
    d2 = Directory(
        "/some/directory",
        True,
        [Option("key1", "val1"), Option("key2", "val2")],
        "An excellent name",
    )

    s.save_directory(d1)

    # The exception context was never inspected, so it is not bound to a name.
    with self.assertRaises(DuplicateDirectoryException):
        s.save_directory(d2)
def test_reject_duplicate_user(self):
    """Saving a second user with an existing username raises
    DuplicateUserException, even when the password hash differs.
    """
    s = LocalStorage(dir_name + "/test_database.db")

    u1 = User("user1", b"anHashedPassword", True)
    u2 = User("user1", b"anotherHashedPassword", True)

    s.save_user(u1)

    # The exception context was never inspected, so it is not bound to a name.
    with self.assertRaises(DuplicateUserException):
        s.save_user(u2)
def test_auth_user(self):
    """auth_user accepts the matching password and rejects a wrong one."""
    s = LocalStorage(dir_name + "/test_database.db")

    # Stored value is a pre-computed hash; the test expects it to match the
    # plaintext "test" (presumably a bcrypt hash, judging by the $2b$ prefix).
    u = User(
        "bob",
        b'$2b$10$RakMb.3n/tl76sK7iVahJuklNYkR7f2Y4dsf73tPANwYBkp4VuJ7.',
        True,
    )
    s.save_user(u)

    self.assertTrue(s.auth_user("bob", "test"))
    self.assertFalse(s.auth_user("bob", "wrong"))
def test_save_option(self):
    """An option saved on its own becomes the directory's third option."""
    storage = LocalStorage(dir_name + "/test_database.db")
    directory = Directory(
        "/some/directory",
        True,
        [Option("key1", "val1"), Option("key2", "val2")],
        "An excellent name",
    )
    dir_id = storage.save_directory(directory)

    extra_option = Option("key3", "val3", dir_id)
    opt_id = storage.save_option(extra_option)

    third_option_expected = (("key", "key3"), ("value", "val3"), ("dir_id", dir_id))
    for attr, expected in third_option_expected:
        self.assertEqual(
            getattr(storage.dirs()[dir_id].options[2], attr), expected
        )
    # Two options were saved with the directory, so the new one is expected
    # to receive id 3.
    self.assertEqual(opt_id, 3)
def test_return_none_with_unknown_user(self):
    """Looking up a username that was never saved raises KeyError.

    Despite the test name, the expected outcome is a KeyError from the
    users() mapping, not a None return value.
    """
    s = LocalStorage(dir_name + "/test_database.db")

    # The exception context was never inspected, so it is not bound to a name.
    with self.assertRaises(KeyError):
        _ = s.users()["unknown_user"]
def main():
    """Crawl the domains supplied by DomainQueue and store fetched pages.

    For each domain a CrawlQueue seeded with the domain's root Page is put on
    a priority queue keyed by the earliest time it may be fetched again.
    Entries are popped, one page is processed, and the queue is re-inserted
    with a new fetch time until the page queue is empty.

    A page whose processing raises is counted in ``url_fails`` and re-queued
    while its failure count is below 3.

    NOTE(review): the block structure below was reconstructed from source
    whose line breaks and indentation were lost. The nesting of the two
    ``while`` loops, the extent of the ``if url_fails[...] < 3`` branch and
    the position of ``task_done()`` must be confirmed against the original
    file.

    Returns:
        0 once every domain has been processed.
    """
    domains_to_crawl = DomainQueue()
    # Alternative backend, currently disabled:
    # content_storage = GDriveStorage.create_storage()
    content_storage = LocalStorage.create_storage()
    # Store pairs <next_possible_fetch_time, CrawlQueue_for_domain>
    # NOTE(review): if two entries ever share the same fetch time, tuple
    # comparison falls through to the CrawlQueue objects - confirm they are
    # orderable or that ties cannot occur.
    time_domains_to_crawl = PriorityQueue()
    robots_info = RobotsProvider()
    # Failure count per URL; missing keys start at 0.
    url_fails = defaultdict(lambda: 0)
    while domains_to_crawl.has_next():
        # Fetch one more domain from the queue
        new_domain = domains_to_crawl.get_next_domain()
        main_crawler_logger.info("Received new domain to crawl: {}".format(new_domain))
        # Create new CrawlQueue for it
        crawl_queue_for_new_domain = CrawlQueue()
        crawl_queue_for_new_domain.add_pages([Page(new_domain)])
        # Put new queue, with next fetch time equal to current time.
        time_domains_to_crawl.put_nowait((time.time(), crawl_queue_for_new_domain))
        # NOTE(review): assumed to be nested inside the per-domain loop, which
        # is what makes `new_domain` below refer to the domain being crawled.
        while not time_domains_to_crawl.empty():
            fetch_time, page_queue = time_domains_to_crawl.get_nowait()
            main_crawler_logger.debug('Fetched {}, {} from priority queue'.format(fetch_time, page_queue))
            # An empty page queue is not re-inserted, so it drops out here.
            if not page_queue.is_empty():
                # Wait out the remaining time before this queue may be hit again.
                safe_sleep(fetch_time - time.time())
                cur_page = page_queue.pop()
                if robots_info.can_be_crawled(cur_page):
                    try:
                        cur_page.fetch()
                        if cur_page.can_be_stored():
                            content_storage.put_page(cur_page.url(), cur_page.get_cleaned_response())
                        page_queue.add_pages(cur_page.children())
                        # Page is fetched: next access should be delayed.
                        required_delay = robots_info.get_robots_delay(new_domain)
                        time_domains_to_crawl.put_nowait((fetch_time + required_delay, page_queue))
                    except Exception as e:
                        main_crawler_logger.error(e)
                        # Seen errors:
                        # apiclient.errors.HttpError: <HttpError 403 when requesting https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart&alt=json returned "User rate limit exceeded.">
                        # requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
                        # NOTE(review): `restart_sesison` looks misspelled -
                        # confirm the method name on URLGetter.
                        URLGetter.restart_sesison()
                        print('sleeping')
                        # Fixed 10-second back-off before continuing.
                        time.sleep(10)
                        url_fails[cur_page.url()] += 1
                        # Re-queue the page while it has failed fewer than 3 times.
                        if url_fails[cur_page.url()] < 3:
                            page_queue.add_page(cur_page)
                        # NOTE(review): assumed to sit outside the `if` above so
                        # the domain's queue survives a page that is given up on.
                        time_domains_to_crawl.put_nowait((fetch_time, page_queue))
                else:
                    # Page should not be fetched: we can ignore delay.
                    time_domains_to_crawl.put_nowait((fetch_time, page_queue))
            # Called once per entry taken with get_nowait() above.
            time_domains_to_crawl.task_done()
    return 0
def test_del_task(self):
    """A task deleted through a second storage object is gone from tasks()."""
    db_path = dir_name + "/test_database.db"
    writer = LocalStorage(db_path)
    directory = Directory("/some/dir", True, [], "my dir")
    dir_id = writer.save_directory(directory)
    task_id = writer.save_task(Task(0, dir_id))

    other = LocalStorage(db_path)
    # Result intentionally discarded; the call is made before del_task
    # (presumably so the second instance has read the tasks first).
    other.tasks()
    other.del_task(task_id)

    self.assertEqual(len(other.tasks()), 0)
    self.assertRaises(KeyError, lambda: other.tasks()[task_id])
def setUp(self):
    """Delete any existing test database file, then call init_db on a new
    LocalStorage with the path to ``../database.sql``.
    """
    db_path = dir_name + "/test_database.db"
    if os.path.exists(db_path):
        os.remove(db_path)

    storage = LocalStorage(db_path)
    sql_path = dir_name + "/../database.sql"
    storage.init_db(sql_path)
def test_set_access(self):
    """get_access lists only the directories whose access is still granted."""
    storage = LocalStorage(dir_name + "/test_database.db")

    directory_specs = (
        ("/some/dir", "my dir"),
        ("/some/dir2", "my dir2"),
        ("/some/dir3", "my dir3"),
    )
    dir_ids = [
        storage.save_directory(Directory(path, True, [], name))
        for path, name in directory_specs
    ]

    storage.save_user(User("bob", b"", False))

    # Grant access to all three directories, then revoke the third.
    for granted_id in dir_ids:
        storage.set_access("bob", granted_id, True)
    storage.set_access("bob", dir_ids[2], False)

    self.assertEqual(storage.get_access("bob"), [dir_ids[0], dir_ids[1]])
def get_storage(config, verify=True):
    """Return the storage backend selected by ``config.USE_S3``.

    Args:
        config: Object exposing a ``USE_S3`` attribute; it is also passed to
            ``S3Storage`` when that backend is chosen.
        verify: Forwarded to ``S3Storage`` as the ``verify`` keyword; unused
            for ``LocalStorage``.

    Returns:
        An ``S3Storage`` when ``config.USE_S3`` is truthy, otherwise a
        ``LocalStorage`` built with no arguments.
    """
    if config.USE_S3:
        return S3Storage(config, verify=verify)
    return LocalStorage()
import bcrypt import humanfriendly from PIL import Image from flask import Flask, render_template, request, redirect, flash, session, abort, send_file import config from crawler import TaskManager from search import Search from storage import Directory, Option, Task, User from storage import LocalStorage, DuplicateDirectoryException, DuplicateUserException from thumbnail import ThumbnailGenerator app = Flask(__name__) app.secret_key = "A very secret key" storage = LocalStorage(config.db_path) # Disable flask logging flaskLogger = logging.getLogger('werkzeug') flaskLogger.setLevel(logging.ERROR) tm = TaskManager(storage) search = Search(config.elasticsearch_index) def get_dir_size(path): size = 0 for root, dirs, files in os.walk(path): for filename in files: full_path = os.path.join(root, filename)