def main() -> None:
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description="Keep the database lean.")
    parser.add_argument(
        "--ursadb",
        help="URL of the ursadb instance.",
        default="tcp://localhost:9281",
    )
    args = parser.parse_args()

    ursa = UrsaDb(args.ursadb)
    stage = 0
    last_datasets = None
    while True:
        datasets = set(
            ursa.execute_command("topology;")["result"]["datasets"].keys()
        )
        if last_datasets:
            removed = list(last_datasets - datasets)
            created = list(datasets - last_datasets)
            logging.info("%s => %s", removed, created)
        logging.info("Stage %s: %s datasets left.", stage, len(datasets))
        if last_datasets and datasets == last_datasets:
            logging.info("Finally, a fixed point! Returning...")
            return

        start = time.time()
        ursa.execute_command("compact all;")
        end = time.time()
        logging.info("Compacting took %s seconds...", (end - start))
        stage += 1
        last_datasets = datasets
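# Example invocation of this compacting loop (the script name is an assumption
# for illustration; the --ursadb default matches the parser above):
#
#   python3 compactall.py --ursadb tcp://localhost:9281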
def backend_status() -> BackendStatusSchema:
    """Returns the current status of backend services. Intended to be used
    by the webpage.

    This endpoint is not stable and may be subject to change in the future.
    """
    agents = []
    components = {
        "mquery": mquery_version(),
    }
    for name, agent_spec in db.get_active_agents().items():
        try:
            ursa = UrsaDb(agent_spec.ursadb_url)
            status = ursa.status()
            tasks = status["result"]["tasks"]
            ursadb_version = status["result"]["ursadb_version"]
            agents.append(
                AgentSchema(name=name, alive=True, tasks=tasks, spec=agent_spec)
            )
            components[f"ursadb ({name})"] = ursadb_version
        except Again:
            agents.append(
                AgentSchema(name=name, alive=False, tasks=[], spec=agent_spec)
            )
            components[f"ursadb ({name})"] = "unknown"
    return BackendStatusSchema(agents=agents, components=components)
def all_indexed_files(ursa: UrsaDb) -> Set[str]:
    """Fetches the set of all file paths currently indexed by ursadb."""
    # An empty query selects every indexed file; pop results in chunks.
    iterator = ursa.query("{}")["iterator"]
    result: Set[str] = set()
    while True:
        pop_result = ursa.pop(iterator, 10000)
        if pop_result.iterator_empty:
            break
        for fpath in pop_result.files:
            result.add(fpath)
    return result
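def _print_indexed_file_count() -> None:
    # Hypothetical helper, for illustration only: shows how all_indexed_files
    # is meant to be called. Assumes a running ursadb instance at the default
    # address used by the CLI switches elsewhere in this repo.
    ursa = UrsaDb("tcp://localhost:9281")
    print(len(all_indexed_files(ursa)), "files already indexed")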
def test_query_with_taints(add_files_to_index):
    log = logging.getLogger()

    # A bit hacky, but this calls for a whole test framework otherwise.
    db = UrsaDb("tcp://ursadb:9281")
    random_taint = os.urandom(8).hex()
    for dataset_id in db.topology()["result"]["datasets"].keys():
        out = db.execute_command(
            f'dataset "{dataset_id}" taint "{random_taint}";'
        )
        log.info("taint result: %s", out)

    files_to_detect = add_files_to_index["files_to_detect"]
    clue_words = add_files_to_index["clue_words"]

    yara_tests = []
    without_single_clue_words = set(files_to_detect) - set(clue_words)
    for i in without_single_clue_words:
        test_yara = """
rule nymaim {{
    strings:
        $check = "{0}"
    condition:
        any of them
}}
""".format(i)
        yara_tests.append(test_yara)

    for i in yara_tests:
        res = request_query(log, i)
        m = res.json()["matches"]
        assert len(m) == 1
        with open(m[0]["file"], "r") as file:
            text = file.read()
        assert text in files_to_detect

        res = request_query(log, i, "anothertaint")
        m = res.json()["matches"]
        assert len(m) == 0

        res = request_query(log, i, random_taint)
        m = res.json()["matches"]
        assert len(m) == 1
def backend_status_datasets() -> BackendStatusDatasetsSchema:
    """Returns a combined list of datasets from all agents.

    Caveat: if dataset ids collide between multiple agents, this API will
    only return one dataset per colliding ID. Collisions are extremely
    unlikely though, so this shouldn't be a problem in practice.

    This endpoint is not stable and may be subject to change in the future.
    """
    datasets: Dict[str, int] = {}
    for agent_spec in db.get_active_agents().values():
        try:
            ursa = UrsaDb(agent_spec.ursadb_url)
            datasets.update(ursa.topology()["result"]["datasets"])
        except Again:
            pass
    return BackendStatusDatasetsSchema(datasets=datasets)
def index(
    ursadb: str,
    workdir: Path,
    types: List[str],
    tags: List[str],
    workers: int,
    working_datasets: Optional[int],
) -> None:
    logging.info("Index.1: Determine compacting threshold.")
    if working_datasets is None:
        working_datasets = workers * 20 + 40
    ursa = UrsaDb(ursadb)
    current_datasets = len(
        ursa.execute_command("topology;")["result"]["datasets"]
    )
    compact_threshold = current_datasets + working_datasets
    logging.info("Index.1: Compact threshold = %s.", compact_threshold)

    logging.info("Index.2: Find prepared batches.")
    indexing_jobs = []
    for batch in workdir.glob("*.txt"):
        indexing_jobs.append((ursadb, types, tags, batch, compact_threshold))
    logging.info("Index.2: Got %s batches to run.", len(indexing_jobs))

    logging.info("Index.3: Run index commands with %s workers.", workers)
    pool = Pool(processes=workers)
    done = 0
    total = len(indexing_jobs)
    for batchid in pool.imap_unordered(index_files, indexing_jobs, chunksize=1):
        done += 1
        logging.info("Index.4: Batch %s done [%s/%s].", batchid, done, total)

    if list(workdir.iterdir()):
        logging.info("Index.5: Workdir not removed, because it's not empty.")
    else:
        logging.info("Index.5: Unlinking the workdir.")
        workdir.rmdir()
def prepare(
    ursadb: str,
    workdir: Path,
    path: Path,
    batch: int,
    max_file_size: int,
    mounted_as: str,
) -> None:
    if not workdir.exists():
        workdir.mkdir()

    logging.info("Prepare.1: load all indexed files into memory.")
    ursa = UrsaDb(ursadb)
    fileset = all_indexed_files(ursa)

    logging.info("Prepare.2: find all new files.")
    tmpfile = None
    current_batch = 10**20  # As good as infinity.
    new_files = 0
    batch_id = 0
    for f in find_new_files(fileset, path, mounted_as, max_file_size):
        if current_batch > batch:
            if tmpfile is not None:
                tmpfile.close()
            current_batch = 0
            tmppath = workdir / f"batch_{batch_id:010}.txt"
            tmpfile = tmppath.open(mode="w")
            batch_id += 1
        assert tmpfile is not None  # Let mypy know the obvious.
        tmpfile.write(f"{f}\n")
        current_batch += 1
        new_files += 1

    if tmpfile is not None:
        tmpfile.close()

    logging.info(
        "Prepare.3: Got %s files in %s batches to index.", new_files, batch_id
    )
def index_files(
    proc_params: Tuple[str, List[str], List[str], Path, int]
) -> str:
    ursa_url, types, tags, batch, compact_threshold = proc_params
    ursa = UrsaDb(ursa_url)

    current_datasets = len(
        ursa.execute_command("topology;")["result"]["datasets"]
    )
    if current_datasets > compact_threshold:
        ursa.execute_command("compact smart;")

    type_list = ", ".join(types)
    mounted_names = []
    wipbatch = batch.with_suffix(".wip")
    batch.rename(wipbatch)
    with wipbatch.open() as batchfile:
        for fname in batchfile:
            fname = fname[:-1]  # Remove the trailing newline.
            fname = fname.replace('"', '\\"')
            mounted_names.append(fname)
    mounted_list = " ".join(f'"{fpath}"' for fpath in mounted_names)

    tag_mod = ""
    if tags:
        tag_list = ",".join(f'"{tag}"' for tag in tags)
        tag_mod = f" with taints [{tag_list}]"

    result = ursa.execute_command(
        f"index {mounted_list} with [{type_list}]{tag_mod} nocheck;"
    )
    if "error" in result:
        wipbatch.rename(batch.with_suffix(".errored"))
        batch.with_suffix(".message").write_text(json.dumps(result, indent=4))
        logging.error(
            "Batch %s errored, see %s for details",
            batch,
            batch.with_suffix(".message"),
        )
    else:
        wipbatch.unlink()
    return str(batch)
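def _example_index_command() -> str:
    # Hypothetical helper, for illustration only: shows the shape of the
    # command built by index_files above (the paths, types and tags are made
    # up). With these inputs the result is:
    #   index "/mnt/samples/a" "/mnt/samples/b" with [gram3, text4] with taints ["test"] nocheck;
    types = ["gram3", "text4"]
    tags = ["test"]
    files = ["/mnt/samples/a", "/mnt/samples/b"]
    mounted_list = " ".join(f'"{f}"' for f in files)
    type_list = ", ".join(types)
    tag_list = ",".join(f'"{t}"' for t in tags)
    return f"index {mounted_list} with [{type_list}] with taints [{tag_list}] nocheck;"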
class Agent:
    def __init__(self, group_id: str, ursa_url: str, db: Database) -> None:
        """Creates a new agent instance. Every agent belongs to some group
        (identified by `group_id`). There may be multiple agents in a single
        group, but they're all exchangeable (they read and write to the same
        queues, and they use the same ursadb instance).

        :param group_id: Identifier of the agent group this agent belongs to.
        :type group_id: str
        :param ursa_url: URL of the connected ursadb instance. Ideally this
            should be public, because this will allow mquery to collect
            measurements.
        :type ursa_url: str
        :param db: Reference to the main database/task queue.
        :type db: Database
        """
        self.group_id = group_id
        self.ursa_url = ursa_url
        self.db = db
        self.ursa = UrsaDb(self.ursa_url)
        self.active_plugins: List[MetadataPlugin] = []

    def __search_task(self, job_id: JobId) -> None:
        """Runs the ursadb query for the yara rule belonging to the provided
        job. If successful, creates new yara tasks to do further processing
        of the results.
        """
        logging.info("Parsing...")
        job = self.db.get_job(job_id)
        if job.status == "cancelled":
            logging.info("Job was cancelled, returning...")
            return

        if job.status == "new":
            # First search request - find datasets to query.
            logging.info("New job, generate subtasks...")
            result = self.ursa.topology()
            if "error" in result:
                raise RuntimeError(result["error"])
            self.db.init_job_datasets(
                self.group_id,
                job_id,
                list(result["result"]["datasets"].keys()),
            )

        logging.info("Get next dataset to query...")
        dataset = self.db.get_next_search_dataset(self.group_id, job_id)
        if dataset is None:
            logging.info("Nothing to query, returning...")
            return

        rules = parse_yara(job.raw_yara)
        parsed = combine_rules(rules)

        logging.info("Querying backend...")
        result = self.ursa.query(parsed.query, job.taints, dataset)
        if "error" in result:
            raise RuntimeError(result["error"])

        file_count = result["file_count"]
        iterator = result["iterator"]
        logging.info(f"Iterator {iterator} contains {file_count} files")

        self.db.update_job_files(job_id, file_count)
        self.db.agent_start_job(self.group_id, job_id, iterator)
        self.db.agent_continue_search(self.group_id, job_id)
        self.db.dataset_query_done(job_id)

    def __load_plugins(self) -> None:
        self.plugin_config_version: int = self.db.get_plugin_config_version()
        active_plugins = []
        for plugin_class in METADATA_PLUGINS:
            plugin_name = plugin_class.get_name()
            plugin_config = self.db.get_plugin_configuration(plugin_name)
            try:
                active_plugins.append(plugin_class(self.db, plugin_config))
                logging.info("Loaded %s plugin", plugin_name)
            except Exception:
                logging.exception("Failed to load %s plugin", plugin_name)
        self.active_plugins = active_plugins

    def __initialize_agent(self) -> None:
        self.__load_plugins()
        plugins_spec = {
            plugin_class.get_name(): plugin_class.config_fields
            for plugin_class in METADATA_PLUGINS
        }
        self.db.register_active_agent(
            self.group_id,
            self.ursa_url,
            plugins_spec,
            [
                active_plugin.get_name()
                for active_plugin in self.active_plugins
            ],
        )

    def __update_metadata(
        self, job: JobId, orig_name: str, path: str, matches: List[str]
    ) -> None:
        """Runs metadata plugins for the given file in a given job.

        :param job: Identifier of the job the matched file belongs to.
        :type job: JobId
        :param orig_name: Original name of the file, as returned by ursadb.
        :type orig_name: str
        :param path: Local path of the file (possibly rewritten by plugins).
        :type path: str
        :param matches: List of names of the matched yara rules.
        :type matches: List[str]
        """
        # Initialise default values in the metadata.
        metadata: Metadata = {
            "job": job.hash,
            "path": path,
            "sha256": make_sha256_tag(path),
        }

        # Run all the plugins in the configured order.
        for plugin in self.active_plugins:
            if not plugin.is_extractor:
                continue
            try:
                extracted_meta = plugin.run(orig_name, metadata)
                metadata.update(extracted_meta)
            except Exception:
                logging.exception(
                    "Failed to launch plugin %s for %s",
                    plugin.get_name(),
                    orig_name,
                )

        # Remove unnecessary keys from the metadata.
        del metadata["job"]
        del metadata["path"]

        # Update the database.
        match = MatchInfo(orig_name, metadata, matches)
        self.db.add_match(job, match)

    def __execute_yara(self, job: JobId, files: List[str]) -> None:
        rule = compile_yara(self.db, job)
        num_matches = 0
        num_errors = 0
        num_files = len(files)
        self.db.job_start_work(job, num_files)

        # Filenames returned from ursadb are usually paths, but may be
        # rewritten by plugins. Create a map {original_name: file_path}.
        filemap = {f: f for f in files}
        for plugin in self.active_plugins:
            if not plugin.is_filter:
                continue
            new_filemap = {}
            for orig_name, current_path in filemap.items():
                new_path = plugin.filter(orig_name, current_path)
                if new_path:
                    new_filemap[orig_name] = new_path
            filemap = new_filemap

        for orig_name, path in filemap.items():
            try:
                matches = rule.match(path)
                if matches:
                    self.__update_metadata(
                        job, orig_name, path, [r.rule for r in matches]
                    )
                    num_matches += 1
            except yara.Error:
                logging.error("Yara failed to check file %s", orig_name)
                num_errors += 1
            except FileNotFoundError:
                logging.error(
                    "Failed to open file for yara check: %s", orig_name
                )
                num_errors += 1

        for plugin in self.active_plugins:
            plugin.cleanup()

        if num_errors > 0:
            self.db.job_update_error(job, num_errors)
        self.db.job_update_work(job, num_files, num_matches)

    def __yara_task(self, job: JobId, iterator: str) -> None:
        """Gets the next batch of work from the db. If there are still files
        left in the iterator, push the task back to the same queue (so that
        other agents will be able to work on it in parallel). Later, process
        the obtained files.
        """
        final_statuses = ["cancelled", "failed", "done", "removed"]
        j = self.db.get_job(job)
        if j.status in final_statuses:
            return

        MIN_BATCH_SIZE = 10
        MAX_BATCH_SIZE = 500
        taken_files = j.files_processed + j.files_in_progress
        # Never do more than MAX_BATCH_SIZE files at once.
        batch_size = MAX_BATCH_SIZE
        # Take small batches of work at first, so the db appears to run faster.
        batch_size = min(batch_size, taken_files)
        # Don't take more than 1/4 of the files left at once (to speed up finishes).
        batch_size = min(batch_size, (j.total_files - taken_files) // 4)
        # Finally, always process at least MIN_BATCH_SIZE files.
        batch_size = max(batch_size, MIN_BATCH_SIZE)

        pop_result = self.ursa.pop(iterator, batch_size)
        if not pop_result.iterator_empty:
            # The job still has some files, put it back on the queue.
            self.db.agent_start_job(self.group_id, job, iterator)
        if pop_result.files:
            # If any files were popped from the iterator, work on them.
            self.__execute_yara(job, pop_result.files)

        j = self.db.get_job(job)
        if (
            j.status == "processing"
            and j.files_processed == j.total_files
            and j.datasets_left == 0
        ):
            # The job is over, the work of this agent is done.
            self.db.agent_finish_job(job)

    def __process_task(self, task: AgentTask) -> None:
        """Dispatches and executes the next incoming task.

        The high-level workflow looks like this: for every new `search` job,
        mquery creates a new `search` task for every agent group. One of the
        agents will pick it up, execute it, and create `yara` tasks. `yara`
        tasks will be executed by workers for every file in the iterator,
        until it's exhausted.

        :param task: Task to be executed.
        :type task: AgentTask
        :raises RuntimeError: Task with unsupported type given.
        """
        if task.type == TaskType.RELOAD:
            if (
                self.plugin_config_version
                == self.db.get_plugin_config_version()
            ):
                # This should never happen and suggests that there is a bug
                # somewhere and the version was not updated properly.
                logging.error(
                    "Critical error: Requested to reload configuration, but "
                    "configuration present in database is still the same (%s).",
                    self.plugin_config_version,
                )
                return
            logging.info("Configuration changed - reloading plugins.")
            # Request the next agent to reload the configuration.
            self.db.reload_configuration(self.plugin_config_version)
            # Reload the configuration. The version will be updated during
            # reinitialization, so we don't receive our own request.
            self.__initialize_agent()
        elif task.type == TaskType.COMMAND:
            logging.info("Executing raw command: %s", task.data)
            self.ursa.execute_command(task.data)
        elif task.type == TaskType.SEARCH:
            job = JobId(task.data)
            logging.info(f"search: {job.hash}")
            try:
                self.__search_task(job)
            except Exception as e:
                logging.exception("Failed to execute task.")
                self.db.agent_finish_job(job)
                self.db.fail_job(job, str(e))
        elif task.type == TaskType.YARA:
            data = json.loads(task.data)
            job = JobId(data["job"])
            iterator = data["iterator"]
            logging.info("yara: iterator %s", iterator)
            try:
                self.__yara_task(job, iterator)
            except Exception as e:
                logging.exception("Failed to execute task.")
                self.db.agent_finish_job(job)
                self.db.fail_job(job, str(e))
        else:
            raise RuntimeError("Unsupported queue")

    def main_loop(self) -> None:
        """Starts the main loop of the agent - this is the only intended
        public method of this class. This will register the agent in the db,
        then pop tasks from redis as they come, and execute them.
        """
        self.__initialize_agent()

        while True:
            task = self.db.agent_get_task(
                self.group_id, self.plugin_config_version
            )
            self.__process_task(task)
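# A minimal sketch of how an agent could be started (the group id and the way
# the Database handle is created are assumptions for illustration; only the
# Agent(...) signature and main_loop() come from the class above):
#
#   db = Database(...)  # however the main database/task queue is constructed
#   Agent(group_id="default", ursa_url="tcp://localhost:9281", db=db).main_loop()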
#!/usr/bin/env python
import json
import logging
import time
import random
from functools import lru_cache
from typing import Any, Dict, List, Optional, Tuple

import yara  # type: ignore
from yara import SyntaxError

import config
from lib.ursadb import UrsaDb
from lib.yaraparse import parse_yara, combine_rules
from util import make_redis, setup_logging

redis = make_redis()
db = UrsaDb(config.BACKEND)


@lru_cache(maxsize=32)
def compile_yara(job_hash: str) -> Any:
    yara_rule = redis.hget("job:" + job_hash, "raw_yara")

    logging.info("Compiling Yara")
    try:
        rule = yara.compile(source=yara_rule)
    except SyntaxError as e:
        logging.exception("Yara parse error")
        raise e

    return rule
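# Usage sketch for the cached compiler above (the job hash is made up; this
# assumes a "job:<hash>" entry with a "raw_yara" field already exists in
# redis, as compile_yara expects). Repeated calls with the same hash reuse
# the compiled rule thanks to lru_cache:
#
#   rule = compile_yara("0123456789abcdef")
#   matches = rule.match("/path/to/sample")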
def main() -> None:
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description="Reindex local files.")
    parser.add_argument("path", help="Path of samples to be indexed.")
    parser.add_argument(
        "--path-mount",
        help="Path to the samples to be indexed, as seen by ursadb (if different).",
        default=None,
    )
    parser.add_argument(
        "--ursadb",
        help="URL of the ursadb instance.",
        default="tcp://localhost:9281",
    )
    parser.add_argument(
        "--tmpdir", help="Path to used tmpdir.", default="/tmp"
    )
    parser.add_argument(
        "--tmpdir-mount",
        help="Path to used tmpdir, as seen by ursadb (if different).",
        default=None,
    )
    parser.add_argument(
        "--batch", help="Size of indexing batch.", type=int, default=1000
    )
    parser.add_argument(
        "--type",
        dest="types",
        help="Additional index types.",
        action="append",
        default=["gram3"],
        choices=["gram3", "text4", "hash4", "wide8"],
    )
    parser.add_argument(
        "--workers",
        help="Number of parallel indexing jobs.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "--dry-run",
        help="Don't index, only print filenames.",
        action="store_true",
    )

    args = parser.parse_args()
    tmpdir_mount = args.tmpdir_mount or args.tmpdir
    path_mount = args.path_mount or args.path

    logging.info("Stage 1: load all indexed files into memory.")
    ursa = UrsaDb(args.ursadb)
    fileset = all_indexed_files(ursa)

    logging.info("Stage 2: find all new files.")
    tmpfile = None
    tmpfiles = []
    current_batch = 10**20  # As good as infinity.
    new_files = 0
    for f in find_new_files(fileset, args.path, path_mount):
        if args.dry_run:
            print(f)
            continue
        if current_batch > args.batch:
            current_batch = 0
            if tmpfile:
                tmpfile.close()
            tmpfile = NamedTemporaryFile(mode="w", dir=args.tmpdir, delete=False)
            tmpfiles.append(tmpfile.name)
        assert tmpfile is not None  # Let mypy know the obvious.
        tmpfile.write(f"{f}\n")
        current_batch += 1
        new_files += 1

    # Make sure the last batch is flushed to disk before indexing starts.
    if tmpfile is not None:
        tmpfile.close()

    logging.info(
        "Got %s files in %s batches to index.", new_files, len(tmpfiles)
    )
    if args.dry_run:
        return

    del fileset

    indexing_jobs = []
    for ndx, tmppath in enumerate(tmpfiles):
        mounted_name = os.path.join(
            tmpdir_mount, os.path.relpath(tmppath, args.tmpdir)
        )
        indexing_jobs.append((args.ursadb, args.types, mounted_name, ndx))
        logging.info("Batch %s: %s", ndx, mounted_name)

    logging.info("Stage 3: Run index command in parallel.")
    pool = Pool(processes=args.workers)
    done = 0
    total = len(indexing_jobs)
    for batchid in pool.imap_unordered(index_files, indexing_jobs, chunksize=1):
        done += 1
        logging.info("Batch %s done [%s/%s].", batchid, done, total)

    logging.info("Stage 4: Cleanup.")
    for f in tmpfiles:
        os.unlink(f)
def index_files(proc_params: Tuple[str, List[str], str, int]) -> None:
    ursa_url, types, mounted_name, ndx = proc_params
    ursa = UrsaDb(ursa_url)
    with_ = ", ".join(types)
    ursa.execute_command(
        f'index from list "{mounted_name}" with [{with_}] nocheck;'
    )
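# For illustration (example values, not real output): with
# types=["gram3", "text4"] and mounted_name="/tmp/batches/batch0.txt",
# the command sent above is:
#   index from list "/tmp/batches/batch0.txt" with [gram3, text4] nocheck;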
def main() -> None:
    logging.basicConfig(level=logging.INFO)
    parser = argparse.ArgumentParser(description="Reindex local files.")
    parser.add_argument(
        "--mode",
        help="Mode of operation. Only prepare batches, index them, or both.",
        default="prepare-and-index",
        choices=["prepare", "index", "prepare-and-index"],
    )
    # Switches relevant for both "prepare" and "index" modes.
    parser.add_argument(
        "--ursadb",
        help="URL of the ursadb instance.",
        default="tcp://localhost:9281",
    )
    parser.add_argument(
        "--workdir", help="Path to a working directory.", default=None
    )
    # Switches relevant only for "prepare" mode.
    parser.add_argument(
        "--batch", help="Size of indexing batch.", type=int, default=1000
    )
    parser.add_argument(
        "--path", help="Path of samples to be indexed.", default=None
    )
    parser.add_argument(
        "--path-mount",
        help="Path to the samples to be indexed, as seen by ursadb (if different).",
        default=None,
    )
    parser.add_argument(
        "--max-file-size-mb",
        type=int,
        help="Maximum file size, in MB, to index. 128 by default.",
        default=128,
    )
    # Switches relevant only for "index" mode.
    parser.add_argument(
        "--type",
        dest="types",
        help="Index types. By default [gram3, text4, wide8, hash4].",
        action="append",
        default=[],
        choices=["gram3", "text4", "hash4", "wide8"],
    )
    parser.add_argument(
        "--tag",
        dest="tags",
        help="Additional tags for indexed datasets.",
        action="append",
        default=[],
    )
    parser.add_argument(
        "--workers",
        help="Number of parallel indexing jobs.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "--working-datasets",
        help="Number of working datasets (uses a sane value by default).",
        type=int,
        default=None,
    )

    args = parser.parse_args()
    types = list(set(args.types))

    if args.workdir is None:
        logging.error("--workdir is a required parameter")
        return

    try:
        ursa = UrsaDb(args.ursadb)
        ursa.status()
    except Exception:
        logging.error("Can't connect to ursadb instance at %s", args.ursadb)
        return

    if args.mode == "prepare" or args.mode == "prepare-and-index":
        # Path must exist.
        if args.path is None:
            logging.error("Path (--path) is a required parameter.")
            return

        if args.path_mount is not None:
            path_mount = args.path_mount
        else:
            path_mount = args.path

        path = Path(args.path)
        if not path.exists():
            logging.error("Path (--path) %s does not exist.", args.path)
            return

        # We're starting a new indexing operation. Workdir must not exist.
        workdir = Path(args.workdir)
        if workdir.exists() and list(workdir.iterdir()):
            logging.error(
                "Workdir %s already exists and is not empty. "
                "Remove it or choose another one.",
                args.workdir,
            )
            return

        max_file_size = args.max_file_size_mb * 1024 * 1024
        prepare(
            args.ursadb, workdir, path, args.batch, max_file_size, path_mount
        )

    if args.mode == "index" or args.mode == "prepare-and-index":
        # By default, use all index types.
        if not args.types:
            types = ["gram3", "text4", "wide8", "hash4"]

        # We're continuing an existing operation. Workdir must exist.
        workdir = Path(args.workdir)
        if not workdir.exists():
            logging.error(
                "Running with mode=index, but workdir %s doesn't exist",
                args.workdir,
            )
            return

        index(
            args.ursadb,
            workdir,
            types,
            args.tags,
            args.workers,
            args.working_datasets,
        )
        logging.info("Indexing finished. Consider compacting the database now.")
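# Example invocation (the script name and paths are assumptions for
# illustration; the flags match the parser above):
#
#   python3 index.py --mode prepare-and-index --workdir /tmp/mquery-workdir \
#       --path /mnt/samples --ursadb tcp://localhost:9281 \
#       --type gram3 --type text4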
def db_context(request):
    IPC = "ipc:///tmp/ursadb-test"
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(IPC)
    return UrsadbTestContext(socket, UrsaDb(IPC))