예제 #1
0
    def test_save_task(self):
        """A task saved against a directory is listed under its id, which is 1."""
        db_path = dir_name + "/test_database.db"
        storage = LocalStorage(db_path)

        parent_dir = Directory("/some/dir", True, [], "my dir")
        dir_id = storage.save_directory(parent_dir)

        new_task = Task(0, dir_id)
        task_id = storage.save_task(new_task)

        stored_task = storage.tasks()[task_id]
        self.assertEqual(stored_task.dir_id, dir_id)
        # The test expects the first saved task to receive id 1.
        self.assertEqual(task_id, 1)
예제 #2
0
    def test_save_and_retrieve_user(self):
        """A saved user is returned by ``users()`` keyed by its username."""
        storage = LocalStorage(dir_name + "/test_database.db")
        storage.save_user(User("bob", b"anHashedPassword", True))

        # Look the user up afresh for each assertion.
        self.assertEqual(storage.users()["bob"].username, "bob")
        self.assertEqual(storage.users()["bob"].admin, True)
예제 #3
0
    def test_del_option(self):
        """Deleting option id 1 leaves only the ``key2`` option on the directory."""
        storage = LocalStorage(dir_name + "/test_database.db")

        initial_options = [Option("key1", "val1"), Option("key2", "val2")]
        directory = Directory("/some/directory", True, initial_options, "An excellent name")
        dir_id = storage.save_directory(directory)

        # NOTE(review): presumably id 1 is the "key1" option in a fresh
        # database -- the assertions below rely on that.
        storage.del_option(1)

        self.assertEqual(len(storage.dirs()[dir_id].options), 1)
        self.assertEqual(storage.dirs()[dir_id].options[0].key, "key2")
        self.assertEqual(storage.dirs()[dir_id].options[0].value, "val2")
        self.assertEqual(storage.dirs()[dir_id].options[0].dir_id, 1)
예제 #4
0
    def test_update_directory(self):
        """Fields changed via ``update_directory`` are seen by a second storage instance."""
        db_path = dir_name + "/test_database.db"
        writer = LocalStorage(db_path)

        directory = Directory(
            "/some/directory",
            True,
            [Option("key1", "val1"), Option("key2", "val2")],
            "An excellent name",
        )
        dir_id = writer.save_directory(directory)

        # Mutate the in-memory object, then persist it under the saved id.
        directory.name = "A modified name"
        directory.enabled = False
        directory.path = "/another/directory"
        directory.id = dir_id
        writer.update_directory(directory)

        # Read back through a separate instance opened on the same file.
        reader = LocalStorage(db_path)

        self.assertEqual(reader.dirs()[dir_id].name, "A modified name")
        self.assertEqual(len(reader.dirs()[dir_id].options), 2)
        self.assertEqual(reader.dirs()[dir_id].path, "/another/directory")
        # The disabled flag is compared against 0 rather than False.
        self.assertEqual(reader.dirs()[dir_id].enabled, 0)
예제 #5
0
    def test_save_and_retrieve_dir_persistent(self):
        """A directory saved by one storage instance is readable from another."""
        db_path = dir_name + "/test_database.db"
        writer = LocalStorage(db_path)

        options = [Option("key1", "val1"), Option("key2", "val2")]
        directory = Directory("/some/directory", True, options, "An excellent name")
        dir_id = writer.save_directory(directory)

        # A second instance on the same file must see the saved row.
        reader = LocalStorage(db_path)
        self.assertEqual(reader.dirs()[dir_id].enabled, True)

        self.assertEqual(reader.dirs()[dir_id].options[0].key, "key1")
        self.assertEqual(reader.dirs()[dir_id].options[0].value, "val1")
        self.assertEqual(reader.dirs()[dir_id].options[0].dir_id, 1)
예제 #6
0
    def test_remove_user(self):
        """After ``remove_user`` the username is no longer a key of ``users()``."""
        storage = LocalStorage(dir_name + "/test_database.db")

        user = User("martin", b"anHashedPassword", True)
        storage.save_user(user)
        storage.remove_user(user.username)

        with self.assertRaises(KeyError):
            storage.users()["martin"]
예제 #7
0
    def test_update_user(self):
        """Clearing ``admin`` and calling ``update_user`` is reflected in ``users()``."""
        storage = LocalStorage(dir_name + "/test_database.db")

        user = User("neil", b"anHashedPassword", True)
        storage.save_user(user)

        # Demote the user and persist the change.
        user.admin = False
        storage.update_user(user)

        updated = storage.users()["neil"]
        self.assertFalse(updated.admin)
예제 #8
0
    def test_remove_dir(self):
        """After ``remove_directory`` the id is no longer a key of ``dirs()``."""
        storage = LocalStorage(dir_name + "/test_database.db")

        options = [Option("key1", "val1"), Option("key2", "val3")]
        directory = Directory("/some/directory", True, options, "An excellent name")
        dir_id = storage.save_directory(directory)

        storage.remove_directory(dir_id)

        with self.assertRaises(KeyError):
            storage.dirs()[dir_id]
예제 #9
0
    def test_reject_duplicate_path(self):
        """Saving a second directory with the same path raises ``DuplicateDirectoryException``."""
        storage = LocalStorage(dir_name + "/test_database.db")

        # Two separate objects built from identical arguments.
        original = Directory(
            "/some/directory",
            True,
            [Option("key1", "val1"), Option("key2", "val2")],
            "An excellent name",
        )
        duplicate = Directory(
            "/some/directory",
            True,
            [Option("key1", "val1"), Option("key2", "val2")],
            "An excellent name",
        )

        storage.save_directory(original)

        with self.assertRaises(DuplicateDirectoryException):
            storage.save_directory(duplicate)
예제 #10
0
    def test_reject_duplicate_user(self):
        """Saving a second user with an existing username raises ``DuplicateUserException``."""
        storage = LocalStorage(dir_name + "/test_database.db")

        # Same username, different password hash.
        first = User("user1", b"anHashedPassword", True)
        clash = User("user1", b"anotherHashedPassword", True)

        storage.save_user(first)

        with self.assertRaises(DuplicateUserException):
            storage.save_user(clash)
예제 #11
0
    def test_auth_user(self):
        """``auth_user`` accepts the password "test" for the stored hash and rejects "wrong"."""
        storage = LocalStorage(dir_name + "/test_database.db")

        password_hash = b'$2b$10$RakMb.3n/tl76sK7iVahJuklNYkR7f2Y4dsf73tPANwYBkp4VuJ7.'
        storage.save_user(User("bob", password_hash, True))

        accepted = storage.auth_user("bob", "test")
        self.assertTrue(accepted)

        rejected = storage.auth_user("bob", "wrong")
        self.assertFalse(rejected)
예제 #12
0
    def test_save_option(self):
        """An option saved after the directory becomes its third option with id 3."""
        storage = LocalStorage(dir_name + "/test_database.db")

        initial_options = [Option("key1", "val1"), Option("key2", "val2")]
        directory = Directory("/some/directory", True, initial_options, "An excellent name")
        dir_id = storage.save_directory(directory)

        extra_option = Option("key3", "val3", dir_id)
        opt_id = storage.save_option(extra_option)

        # The new option is expected at index 2, after the two initial ones.
        self.assertEqual(storage.dirs()[dir_id].options[2].key, "key3")
        self.assertEqual(storage.dirs()[dir_id].options[2].value, "val3")
        self.assertEqual(storage.dirs()[dir_id].options[2].dir_id, dir_id)
        self.assertEqual(opt_id, 3)
예제 #13
0
    def test_return_none_with_unknown_user(self):
        """Looking up a username that was never saved raises ``KeyError``.

        Despite the test's name, no ``None`` return is checked here; the
        assertion is on the ``KeyError`` from indexing ``users()``.
        """
        storage = LocalStorage(dir_name + "/test_database.db")

        with self.assertRaises(KeyError):
            storage.users()["unknown_user"]
예제 #14
0
def main():
    """Crawl every domain handed out by ``DomainQueue`` and store the pages.

    For each domain a ``CrawlQueue`` seeded with ``Page(new_domain)`` is put
    on a priority queue keyed by its next allowed fetch time. Pages are
    popped one at a time, checked with ``RobotsProvider``, fetched, stored
    through ``LocalStorage`` when ``can_be_stored()`` says so, and their
    children are appended to the same crawl queue.

    A page whose processing raises is re-queued until it has failed three
    times; after that it is dropped.

    Returns:
        0, once ``domains_to_crawl`` has no further domains.
    """
    domains_to_crawl = DomainQueue()
    # content_storage = GDriveStorage.create_storage()
    content_storage = LocalStorage.create_storage()

    # Store pairs <next_possible_fetch_time, CrawlQueue_for_domain>
    time_domains_to_crawl = PriorityQueue()

    robots_info = RobotsProvider()

    # Failure count per URL; a missing URL counts as 0.
    url_fails = defaultdict(lambda: 0)

    while domains_to_crawl.has_next():
        # Fetch one more domain from the queue
        new_domain = domains_to_crawl.get_next_domain()
        main_crawler_logger.info("Received new domain to crawl: {}".format(new_domain))

        # Create new CrawlQueue for it
        crawl_queue_for_new_domain = CrawlQueue()
        crawl_queue_for_new_domain.add_pages([Page(new_domain)])

        # Put new queue, with next fetch time equal to current time.
        time_domains_to_crawl.put_nowait((time.time(),
                                          crawl_queue_for_new_domain))

        # Drain the priority queue before asking for the next domain. An
        # entry whose crawl queue is empty is not put back, which is what
        # eventually ends this loop.
        while not time_domains_to_crawl.empty():
            fetch_time, page_queue = time_domains_to_crawl.get_nowait()
            main_crawler_logger.debug('Fetched {}, {} from priority queue'.format(fetch_time, page_queue))

            if not page_queue.is_empty():
                # Wait until the scheduled fetch time for this queue.
                safe_sleep(fetch_time - time.time())

                cur_page = page_queue.pop()

                if robots_info.can_be_crawled(cur_page):
                    try:
                        cur_page.fetch()

                        if cur_page.can_be_stored():
                            content_storage.put_page(cur_page.url(),
                                                     cur_page.get_cleaned_response())

                        page_queue.add_pages(cur_page.children())

                        # Page is fetched: next access should be delayed.
                        # NOTE(review): the delay is added to the scheduled
                        # fetch_time, not to the time the fetch finished --
                        # confirm that is intended.
                        required_delay = robots_info.get_robots_delay(new_domain)
                        time_domains_to_crawl.put_nowait((fetch_time + required_delay, page_queue))
                    except Exception as e:
                        main_crawler_logger.error(e)
                        # Seen errors:
                        # apiclient.errors.HttpError: <HttpError 403 when requesting https://www.googleapis.com/upload/drive/v3/files?uploadType=multipart&alt=json returned "User rate limit exceeded.">
                        # requests.exceptions.ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response',))
                        # NOTE(review): "restart_sesison" looks like a typo of
                        # "restart_session" -- verify against URLGetter.
                        URLGetter.restart_sesison()
                        # NOTE(review): goes to stdout rather than through
                        # main_crawler_logger.
                        print('sleeping')
                        time.sleep(10)
                        # Retry the page until it has failed three times.
                        url_fails[cur_page.url()] += 1
                        if url_fails[cur_page.url()] < 3:
                            page_queue.add_page(cur_page)

                        # Re-schedule with the original fetch_time, i.e. no
                        # extra delay beyond the 10 s sleep above.
                        time_domains_to_crawl.put_nowait((fetch_time, page_queue))
                else:
                    # Page should not be fetched: we can ignore delay.
                    time_domains_to_crawl.put_nowait((fetch_time, page_queue))

            # One task_done() per get_nowait() above.
            time_domains_to_crawl.task_done()

    return 0
예제 #15
0
    def test_del_task(self):
        """A task deleted through a second storage instance is gone from ``tasks()``."""
        db_path = dir_name + "/test_database.db"
        writer = LocalStorage(db_path)

        parent_dir = Directory("/some/dir", True, [], "my dir")
        dir_id = writer.save_directory(parent_dir)
        task_id = writer.save_task(Task(0, dir_id))

        other = LocalStorage(db_path)
        # Result deliberately discarded; presumably this loads the tasks
        # into the second instance before deleting -- TODO confirm.
        other.tasks()
        other.del_task(task_id)

        remaining = other.tasks()
        self.assertEqual(len(remaining), 0)

        with self.assertRaises(KeyError):
            other.tasks()[task_id]
예제 #16
0
    def setUp(self):
        """Start every test from a freshly initialised database file."""
        db_path = dir_name + "/test_database.db"
        schema_path = dir_name + "/../database.sql"

        # Drop whatever a previous test left behind.
        if os.path.exists(db_path):
            os.remove(db_path)

        LocalStorage(db_path).init_db(schema_path)
예제 #17
0
    def test_set_access(self):
        """Access granted and then revoked is absent from ``get_access``."""
        storage = LocalStorage(dir_name + "/test_database.db")

        dir_specs = (
            ("/some/dir", "my dir"),
            ("/some/dir2", "my dir2"),
            ("/some/dir3", "my dir3"),
        )
        dir_ids = [
            storage.save_directory(Directory(path, True, [], label))
            for path, label in dir_specs
        ]
        storage.save_user(User("bob", b"", False))

        # Grant access to all three directories, then revoke the last one.
        for granted_id in dir_ids:
            storage.set_access("bob", granted_id, True)
        storage.set_access("bob", dir_ids[2], False)

        self.assertEqual(storage.get_access("bob"), [dir_ids[0], dir_ids[1]])
예제 #18
0
def get_storage(config, verify=True):
    """Build the storage backend selected by ``config.USE_S3``.

    Args:
        config: Object exposing a ``USE_S3`` attribute; it is also passed to
            ``S3Storage`` when that backend is chosen.
        verify: Forwarded to ``S3Storage`` as the ``verify`` keyword; unused
            for local storage.

    Returns:
        An ``S3Storage`` when ``config.USE_S3`` is truthy, otherwise a
        ``LocalStorage`` created with no arguments.
    """
    if config.USE_S3:
        return S3Storage(config, verify=verify)
    return LocalStorage()
import bcrypt
import humanfriendly
from PIL import Image
from flask import Flask, render_template, request, redirect, flash, session, abort, send_file

import config
from crawler import TaskManager
from search import Search
from storage import Directory, Option, Task, User
from storage import LocalStorage, DuplicateDirectoryException, DuplicateUserException
from thumbnail import ThumbnailGenerator

# Application-wide singletons, created at import time.
app = Flask(__name__)
# NOTE(review): the secret key is hard-coded in source. Flask uses it to sign
# session cookies, so consider loading it from configuration or the
# environment instead.
app.secret_key = "A very secret key"
# Storage backed by the database file named in config.db_path.
storage = LocalStorage(config.db_path)

# Quiet Flask's request logging: only ERROR and above pass the 'werkzeug' logger
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)

# Task manager shares the storage instance created above.
tm = TaskManager(storage)
# Search is constructed with the index name from config.elasticsearch_index.
search = Search(config.elasticsearch_index)


def get_dir_size(path):
    size = 0

    for root, dirs, files in os.walk(path):
        for filename in files:
            full_path = os.path.join(root, filename)