Example #1
import csv
import os

from database import Database
from search.search import ElasticSearchEngine


def export(outfile="out.csv"):

    print("Export started, connecting to databases...")
    es = ElasticSearchEngine("od-database")
    db = Database("db.sqlite3")
    docs = es.stream_all_docs()
    docs_with_website = db.join_website_on_scan(docs)

    print("Connected, writing to csv")

    with open(outfile + ".temp", "w", newline="") as out:

        csv_writer = csv.writer(out)
        csv_writer.writerow([
            "website_id", "website_url", "path", "name", "ext", "size", "mtime"
        ])

        for doc in docs_with_website:
            csv_writer.writerow([
                doc["_source"]["website_id"],
                doc["_source"]["website_url"],
                doc["_source"]["path"] + "/" if doc["_source"]["path"] != "" else "",
                doc["_source"]["name"],
                "." + doc["_source"]["ext"] if doc["_source"]["ext"] != "" else "",
                doc["_source"]["size"],
                doc["_source"]["mtime"]
            ])
    print("Wrote to csv, compressing with xz")

    os.system("xz -0 " + outfile + ".temp")
    os.system("mv " + outfile + ".temp.xz " + outfile + ".xz")
    print("Compressed to " + str(os.path.getsize(outfile + ".xz")) + " bytes")
Example #2
class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database("db.sqlite3")

    def complete_task(self, file_list, task, task_result, crawler_name):

        self.search.delete_docs(task_result.website_id)

        if file_list:
            def iter_lines():
                with open(file_list, "r") as f:
                    yield from f

            self.search.import_json(iter_lines(), task.website_id)

        self.db.update_website_date_if_exists(task.website_id)

        task_result.server_id = crawler_name

        self.db.log_result(task_result)

    def queue_task(self, task: Task):
        self.db.put_task(task)
        print("Queued task and made it available to crawlers: " + str(task.website_id))

    def get_queued_tasks(self) -> list:
        return self.db.get_tasks()
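
A hedged driver for this TaskManager, using the Task(website_id, url) constructor shape that appears in the later examples; the id and URL below are made up:

tm = TaskManager()
tm.queue_task(Task(1, "http://example.com/files/"))  # hypothetical website
print(tm.get_queued_tasks())  # tasks still waiting for a crawler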
Example #3
    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)

        self._indexer_threads = list()
        logger.info("Starting %s indexer threads " %
                    (config.INDEXER_THREADS, ))
        for _ in range(config.INDEXER_THREADS):
            t = Thread(target=self._do_indexing)
            t.setDaemon(True)
            self._indexer_threads.append(t)
            t.start()

        self._recrawl_thread = Thread(target=self._do_recrawl)
        self._recrawl_thread.daemon = True
        self._recrawl_thread.start()
Example #4
    def __init__(self):
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_completed_tasks, "interval", seconds=10)
        scheduler.start()

        self.search = ElasticSearchEngine("od-database")

        # TODO load from config
        self.crawl_servers = [
            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
        ]
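
A sketch of how the "load from config" TODO above might be resolved; config.CRAWL_SERVERS is an assumed setting that does not exist in the original project:

        # Hypothetical: config.CRAWL_SERVERS = [("http://localhost:5001", "OVH_VPS_SSD2 #1")]
        self.crawl_servers = [CrawlServer(url, name) for url, name in config.CRAWL_SERVERS]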
Example #5
    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()
Example #6
    def __init__(self):
        self.search = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()

        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("$oddb_master")
            if not self.worker:
                print("Could not create worker: %s" % traceback.format_exc())
                return
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)
Example #7
    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database("db.sqlite3")
Example #8
    else:
        return string


outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
dldir = "static/downloads/"

print("Deleting existing dumps")
for file in os.listdir(dldir):
    if file.endswith("_dump.csv.lz4"):
        os.remove(os.path.join(dldir, file))

print("Export started, connecting to databases...")

db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)

docs_with_url = db.join_website_url(es.stream_all_docs())

print("Connected, writing to csv")

with lz4.frame.open(outfile + ".part",
                    mode='wb',
                    compression_level=9,
                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
    fp.write((",".join([
        "website_id", "website_url", "path", "name", "ext", "size", "mtime"
    ]) + "\n").encode())

    for doc in docs_with_url:
        try:
Example #9
    def setUp(self):
        self.search = ElasticSearchEngine("od-database-test")
        self.search.reset()
        time.sleep(0.5)
Example #10
class SearchTest(TestCase):
    def setUp(self):
        self.search = ElasticSearchEngine("od-database-test")
        self.search.reset()
        time.sleep(0.5)

    def test_ping(self):
        self.assertTrue(self.search.ping(), "Search engine not running")

    def test_import_and_search(self):

        files = [{
            "name": "PaNopTicon",
            "size": 1000000000000000000,
            "path": "c/d",
            "mtime": 1528765672
        }, {
            "name": "BLAckwAter.Park",
            "size": 123,
            "path": "",
            "mtime": None
        }, {
            "name": "10'000 days",
            "size": -1,
            "path": "c",
            "mtime": 12345
        }, {
            "name": "Dead Racer",
            "size": 1000,
            "path": "Speed Machine [FLAC]",
            "mtime": 12345
        }]

        in_str = ""
        for file in files:
            in_str += json.dumps(file) + "\n"

        self.search.import_json(in_str, 123)
        time.sleep(2)
        self.assertEqual(
            4,
            self.search.es.count(self.search.index_name, "file")["count"])

        # Search for 'pan' in PaNopTicon and expect 1 result, a scroll id, and a highlight
        page = self.search.search("pan")
        self.assertIsNotNone(page["_scroll_id"])
        self.assertEqual(1, page["hits"]["total"])
        self.assertIsNotNone(page["hits"]["hits"][0]["highlight"]["name"])

        # Search for 'park' and expect BLAckwAter.Park
        page = self.search.search("park")
        self.assertEqual(1, page["hits"]["total"])

        # Search for fla and expect Dead Racer
        page = self.search.search("fla")
        self.assertEqual(1, page["hits"]["total"])

        # Search for 10'000 and expect 10'000 days
        page = self.search.search("10'000")
        self.assertEqual(1, page["hits"]["total"])

    def test_scroll(self):

        files = [{
            "name": "PaNopTicon",
            "size": 1000000000000000000,
            "path": "c/d",
            "mtime": 1528765672
        }, {
            "name": "BLAckwAter.Park",
            "size": 123,
            "path": "",
            "mtime": None
        }, {
            "name": "10'000 days",
            "size": -1,
            "path": "c",
            "mtime": 12345
        }, {
            "name": "Dead Racer",
            "size": 1000,
            "path": "Speed Machine [FLAC]",
            "mtime": 12345
        }]

        in_str = ""
        for file in files:
            in_str += json.dumps(file) + "\n"

        self.search.import_json(in_str, 123)
        time.sleep(2)

        page = self.search.search("")
        scroll_id = page["_scroll_id"]

        # next page
        next_page = self.search.scroll(scroll_id)
        next_scroll_id = next_page["_scroll_id"]
        self.assertIsNotNone(next_scroll_id)

        # again
        next_page2 = self.search.scroll(next_scroll_id)
        self.assertIsNotNone(next_page2["_scroll_id"])

    def test_invalid_scroll(self):

        invalid_scroll = "blahblah"

        self.assertIsNone(self.search.scroll(invalid_scroll))
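
The newline-delimited JSON payload fed to import_json in both tests above can also be built in a single expression, equivalent to the in_str accumulation loops:

in_str = "".join(json.dumps(file) + "\n" for file in files)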
Example #11
import praw
from crawl_server.reddit_bot import RedditBot
from search.search import ElasticSearchEngine
from database import Database, Website
import od_util
import os
import re

chars_to_remove_from_comment = re.compile(r"[\[\]\\()]+")
reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
db = Database("db.sqlite3")
search = ElasticSearchEngine("od-database")
subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")
bot = RedditBot("crawled.txt", reddit)

submissions = []


def handle_exact_repost(website_id, reddit_obj):
    stats = search.get_stats(website_id)
    comment = bot.get_comment({"": stats}, website_id,
                              "I already scanned this website on " + website.last_modified + " UTC")
    print(comment)
    print("Exact repost!")
    bot.reply(reddit_obj, comment)


def handle_subdir_repost(website_id, reddit_obj):
Example #12
class TaskDispatcher:
    def __init__(self):
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_completed_tasks, "interval", seconds=10)
        scheduler.start()

        self.search = ElasticSearchEngine("od-database")

        # TODO load from config
        self.crawl_servers = [
            CrawlServer("http://localhost:5001", "OVH_VPS_SSD2 #1"),
        ]

    def check_completed_tasks(self):

        for server in self.crawl_servers:
            for task in server.fetch_completed_tasks():
                print("Completed task")
                file_list = server.fetch_website_files(task.website_id)
                if file_list:
                    self.search.import_json(file_list, task.website_id)

    def dispatch_task(self, task: Task):
        self._get_available_crawl_server().queue_task(task)

    def _get_available_crawl_server(self) -> CrawlServer:
        # TODO: Load balancing & health check for crawl servers
        return self.crawl_servers[0]

    def get_queued_tasks(self) -> list:

        queued_tasks = []

        for server in self.crawl_servers:
            queued_tasks.extend(server.fetch_queued_tasks())

        return queued_tasks

    def get_current_tasks(self) -> list:
        # TODO mem cache this

        current_tasks = []
        for server in self.crawl_servers:
            current_tasks.extend(server.fetch_current_tasks())

        return current_tasks

    def get_task_logs_by_server(self) -> dict:

        task_logs = dict()

        for server in self.crawl_servers:
            task_logs[server.name] = server.fetch_crawl_logs()

        return task_logs

    def get_stats_by_server(self) -> dict:

        stats = dict()

        for server in self.crawl_servers:
            server_stats = server.fetch_stats()
            if server_stats:
                stats[server.name] = server_stats

        return stats
Example #13
import logging
import sys
from logging import FileHandler, StreamHandler

import redis as r
from flask import session, abort

import config
from database import Database
from search.search import ElasticSearchEngine
from tasks import TaskManager

# Disable flask logging
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)

logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("oddb.log")
file_handler.setFormatter(formatter)
for h in list(logger.handlers):
    logger.removeHandler(h)
logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))

taskManager = TaskManager()
searchEngine = ElasticSearchEngine(config.ES_URL, config.ES_INDEX)
searchEngine.start_stats_scheduler()
db = Database(config.DB_CONN_STR)

redis = r.Redis(host=config.REDIS_HOST, port=config.REDIS_PORT)


def require_role(role: str):
    if db.get_user_role(session.get("username", None)) != role:
        abort(403)
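
require_role is meant to guard privileged views; a minimal illustration of how it might be used in a Flask route (the app object, route, and role name are assumptions, not part of the original):

@app.route("/admin")
def admin_page():
    require_role("admin")  # aborts with HTTP 403 unless the session user has this role
    return "admin dashboard"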
Example #14
class TaskManager:

    def __init__(self):
        self.search = ElasticSearchEngine("od-database")
        self.db = database.Database(config.DB_CONN_STR)
        self.tracker = TaskTrackerApi(config.TT_API)

        self.worker = Worker.from_file(self.tracker)
        if not self.worker:
            self.worker = self.tracker.make_worker("oddb_master")
            self.worker.dump_to_file()
            self.worker.request_access(config.TT_CRAWL_PROJECT, False, True)
            self.worker.request_access(config.TT_INDEX_PROJECT, True, False)

        self.bucket = WsBucketApi(config.WSB_API, config.WSB_SECRET)
        self._indexer_threads = list()

    def start_indexer_threads(self):
        logger.info("Starting %s indexer threads " % (config.INDEXER_THREADS, ))
        for _ in range(config.INDEXER_THREADS):
            t = Thread(target=self._do_indexing)
            t.setDaemon(True)
            self._indexer_threads.append(t)
            t.start()

    def _do_indexing(self):

        while True:
            task = self.worker.fetch_task(project_id=config.TT_INDEX_PROJECT)

            if task:
                try:
                    recipe = task.json_recipe()
                    logger.debug("Got indexing task: " + str(recipe))
                    filename = os.path.join(config.WSB_PATH,
                                            format_file_name(recipe["website_id"], recipe["upload_token"]))
                    self._complete_task(filename, Task(recipe["website_id"], recipe["url"]))
                except Exception as e:
                    self.worker.release_task(task_id=task.id, result=1, verification=0)
                finally:
                    try:
                        self.worker.release_task(task_id=task.id, result=0, verification=0)
                    except:
                        pass
            else:
                time.sleep(5)

    def _complete_task(self, file_list, task):

        self.search.delete_docs(task.website_id)

        if file_list:
            def iter_lines():
                with open(file_list, "r") as f:
                    yield from f

            self.search.import_json(iter_lines(), task.website_id)
            os.remove(file_list)

        self.db.update_website_date_if_exists(task.website_id)

    def do_recrawl(self):
        logger.debug("Creating re-crawl tasks")
        self._generate_crawling_tasks()

    def _generate_crawling_tasks(self):

        # TODO: Insert more in-depth re-crawl logic here
        websites_to_crawl = self.db.get_oldest_updated_websites(config.RECRAWL_POOL_SIZE, prefix="http")

        def recrawl(website: Website):
            crawl_task = Task(website.id, website.url,
                              priority=(int((time.time() - website.last_modified.timestamp()) / 3600)))
            self.queue_task(crawl_task)

        pool = ThreadPool(processes=30)
        pool.map(func=recrawl, iterable=websites_to_crawl)
        pool.close()

    def queue_task(self, task: Task):
        max_assign_time = 24 * 4 * 3600  # 4 days, in seconds
        upload_token = str(uuid4())

        task.upload_token = upload_token
        tracker_response = self.worker.submit_task(config.TT_CRAWL_PROJECT,
                                                   recipe=task.__str__(),
                                                   priority=task.priority,
                                                   max_assign_time=max_assign_time,
                                                   hash64=task.website_id,
                                                   verification_count=1,
                                                   max_retries=3
                                                   )
        print(tracker_response.text)
        logging.info("Queued task and made it available to crawlers: t=%s, r=%s" % (task, tracker_response.text))
        if not tracker_response.json()["ok"]:
            return

        bucket_response = self.bucket.allocate(upload_token,
                                               21474837499,  # ~20 GiB
                                               format_file_name(task.website_id, upload_token),
                                               to_dispose_date=int(time.time() + max_assign_time),
                                               upload_hook="")
        logging.info("Allocated upload bucket: %d, t=%s, r=%s" % (task.website_id, upload_token, bucket_response.text))
Example #15
from search.search import ElasticSearchEngine
import ujson

es = ElasticSearchEngine("od-database")
es.reset()

with open("dump.json", "r") as f:

    buffer = list()
    index_every = 10000

    for line in f:
        try:
            doc = ujson.loads(line)["_source"]
            buffer.append(doc)

            if len(buffer) >= index_every:
                es._index(buffer)
                buffer.clear()

        except Exception as e:
            print("ERROR: " + str(e))

    es._index(buffer)

Example #16
from search.search import ElasticSearchEngine


def index_file_list(path: str, website_id):

    es = ElasticSearchEngine("od-database")
    with open(path, "r") as f:
        es.import_json(f.read(), website_id)
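
A hedged usage example for this helper; the path and website id below are made up, and the file is expected to contain newline-delimited JSON documents:

index_file_list("website_42_files.json", 42)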
Example #17
import logging
import sys
from logging import FileHandler, StreamHandler

from flask import session, abort

from database import Database
from search.search import ElasticSearchEngine
from tasks import TaskManager

# Disable flask logging
flaskLogger = logging.getLogger('werkzeug')
flaskLogger.setLevel(logging.ERROR)

logger = logging.getLogger("default")
logger.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)-5s %(message)s')
file_handler = FileHandler("oddb.log")
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(StreamHandler(sys.stdout))

taskManager = TaskManager()
searchEngine = ElasticSearchEngine("od-database")
searchEngine.start_stats_scheduler()
db = Database("db.sqlite3")

# temporary hotfix...
sessionStore = dict()


def require_role(role: str):

    if db.get_user_role(session.get("username", None)) != role:
        abort(403)
Example #18
    else:
        return string


outfile = time.strftime("%Y-%m-%d_%H:%M:%S_dump.csv.lz4", time.gmtime())
dldir = "static/downloads/"

print("Deleting existing dumps")
for file in os.listdir(dldir):
    if file.endswith("_dump.csv.lz4"):
        os.remove(os.path.join(dldir, file))

print("Export started, connecting to databases...")

db = Database(config.DB_CONN_STR)
es = ElasticSearchEngine("od-database")

docs_with_url = db.join_website_url(es.stream_all_docs())

print("Connected, writing to csv")

with lz4.frame.open(outfile + ".part", mode='wb',
                    compression_level=9,
                    block_size=lz4.frame.BLOCKSIZE_MAX4MB) as fp:
    fp.write((",".join(
        ["website_id", "website_url", "path", "name", "ext", "size", "mtime"]
    ) + "\n").encode())

    for doc in docs_with_url:
        try:
            fp.write(