Example #1
def backend_status() -> BackendStatusSchema:
    """
    Returns the current status of backend services. Intended to be used by the
    webpage.

    This endpoint is not stable and may be subject to change in the future.
    """
    agents = []
    components = {
        "mquery": mquery_version(),
    }
    for name, agent_spec in db.get_active_agents().items():
        try:
            ursa = UrsaDb(agent_spec.ursadb_url)
            status = ursa.status()
            tasks = status["result"]["tasks"]
            ursadb_version = status["result"]["ursadb_version"]
            agents.append(
                AgentSchema(
                    name=name, alive=True, tasks=tasks, spec=agent_spec
                )
            )
            components[f"ursadb ({name})"] = ursadb_version
        except Again:
            agents.append(
                AgentSchema(name=name, alive=False, tasks=[], spec=agent_spec)
            )
            components[f"ursadb ({name})"] = "unknown"

    return BackendStatusSchema(agents=agents, components=components,)
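Note: the fields read above are the only parts of the status response this example relies on. A minimal standalone sketch of the same call, with a placeholder ursadb address; the response layout beyond these keys is an assumption:

from lib.ursadb import UrsaDb

ursa = UrsaDb("tcp://localhost:9281")  # placeholder address
status = ursa.status()
print(status["result"]["ursadb_version"])  # version string reported by ursadb
for task in status["result"]["tasks"]:     # tasks currently running on the server
    print(task)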
Example #2
def main() -> None:
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Keep the database lean.")
    parser.add_argument(
        "--ursadb",
        help="URL of the ursadb instance.",
        default="tcp://localhost:9281",
    )

    args = parser.parse_args()
    ursa = UrsaDb(args.ursadb)
    stage = 0
    last_datasets = None
    while True:
        datasets = set(
            ursa.execute_command("topology;")["result"]["datasets"].keys())
        if last_datasets:
            removed = list(last_datasets - datasets)
            created = list(datasets - last_datasets)
            logging.info("%s => %s", removed, created)
        logging.info("Stage %s: %s datasets left.", stage, len(datasets))
        if last_datasets and datasets == last_datasets:
            logging.info("Finally, a fixed point! Returning...")
            return

        start = time.time()
        ursa.execute_command("compact all;")
        end = time.time()
        logging.info("Compacting took %s seconds...", (end - start))
        stage += 1
        last_datasets = datasets
Example #3
def backend_status_datasets() -> BackendStatusDatasetsSchema:
    datasets = {}
    for agent_spec in db.get_active_agents().values():
        try:
            ursa = UrsaDb(agent_spec.ursadb_url)
            datasets.update(ursa.topology()["result"]["datasets"])
        except Again:
            pass

    return BackendStatusDatasetsSchema(datasets=datasets)
Example #4
def test_query_with_taints(add_files_to_index):
    log = logging.getLogger()

    # a bit hacky, but the alternative would be a whole test framework
    db = UrsaDb("tcp://ursadb:9281")

    random_taint = os.urandom(8).hex()
    for dataset_id in db.topology()["result"]["datasets"].keys():
        out = db.execute_command(
            f'dataset "{dataset_id}" taint "{random_taint}";')
        log.info("taint result: %s", out)

    files_to_detect = add_files_to_index["files_to_detect"]
    clue_words = add_files_to_index["clue_words"]

    yara_tests = []
    without_single_clue_words = set(files_to_detect) - set(clue_words)

    for i in without_single_clue_words:
        test_yara = """
    rule nymaim {{
        strings:
            $check = "{0}"
        condition:
            any of them
    }}
    """.format(i)
        yara_tests.append(test_yara)

    for i in yara_tests:
        res = request_query(log, i)
        m = res.json()["matches"]
        assert len(m) == 1
        with open(m[0]["file"], "r") as file:
            text = file.read()
        assert text in files_to_detect

        res = request_query(log, i, "anothertaint")
        m = res.json()["matches"]
        assert len(m) == 0

        res = request_query(log, i, random_taint)
        m = res.json()["matches"]
        assert len(m) == 1
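Note: the tainting done at the top of this test can be sketched as a small standalone helper. The function name and addresses below are hypothetical; the command syntax is taken verbatim from the test:

from lib.ursadb import UrsaDb

def taint_all_datasets(ursa: UrsaDb, taint: str) -> None:
    # Apply `taint` to every dataset reported by the topology command.
    for dataset_id in ursa.topology()["result"]["datasets"].keys():
        ursa.execute_command(f'dataset "{dataset_id}" taint "{taint}";')

# Example use (hypothetical address and taint):
# taint_all_datasets(UrsaDb("tcp://ursadb:9281"), "my-taint")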
Example #5
def backend_status_datasets() -> BackendStatusDatasetsSchema:
    """
    Returns a combined list of datasets from all agents.

    Caveat: if dataset IDs collide across multiple agents, this API will only
    return one dataset per colliding ID. Collisions are extremely unlikely,
    though, so this shouldn't be a problem in practice.

    This endpoint is not stable and may be subject to change in the future.
    """
    datasets: Dict[str, int] = {}
    for agent_spec in db.get_active_agents().values():
        try:
            ursa = UrsaDb(agent_spec.ursadb_url)
            datasets.update(ursa.topology()["result"]["datasets"])
        except Again:
            pass

    return BackendStatusDatasetsSchema(datasets=datasets)
Example #6
    def __init__(self, group_id: str, ursa_url: str, db: Database) -> None:
        """Creates a new agent instance. Every agents belongs to some group
        (identified by `group_id`). There may be multiple agents in a
        single group, but they're all exchangeable (they read and write to the
        same queues, and they use the same ursadb instance).

        :param group_id: Identifier of the agent group this agent belongs to.
        :type group_id: str
        :param ursa_url: URL of the connected ursadb instance. Ideally this
            should be public, because that allows mquery to collect measurements.
        :type ursa_url: str
        :param db: Reference to main database/task queue.
        :type db: Database
        """
        self.group_id = group_id
        self.ursa_url = ursa_url
        self.db = db
        self.ursa = UrsaDb(self.ursa_url)
        self.active_plugins: List[MetadataPlugin] = []
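Note: the enclosing class is not shown in this snippet. A rough instantiation sketch, assuming the class is called Agent and that a Database handle (db) already exists; both are assumptions, not part of the code above:

# Illustration only: the class name "Agent" and the existing `db` handle are
# assumptions.
agent_a = Agent(group_id="default", ursa_url="tcp://ursadb-1:9281", db=db)
agent_b = Agent(group_id="default", ursa_url="tcp://ursadb-1:9281", db=db)
# Both agents belong to the same group, so they share queues and must point at
# the same ursadb instance.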
Example #7
def prepare(
    ursadb: str,
    workdir: Path,
    path: Path,
    batch: int,
    max_file_size: int,
    mounted_as: str,
) -> None:
    if not workdir.exists():
        workdir.mkdir()

    logging.info("Prepare.1: load all indexed files into memory.")
    ursa = UrsaDb(ursadb)
    fileset = all_indexed_files(ursa)

    logging.info("Prepare.2: find all new files.")

    tmpfile = None
    current_batch = 10**20  # As good as infinity.
    new_files = 0
    batch_id = 0
    for f in find_new_files(fileset, path, mounted_as, max_file_size):
        if current_batch > batch:
            if tmpfile is not None:
                tmpfile.close()
            current_batch = 0
            tmppath = workdir / f"batch_{batch_id:010}.txt"
            tmpfile = tmppath.open(mode="w")
            batch_id += 1

        assert tmpfile is not None  # Let mypy know the obvious.
        tmpfile.write(f"{f}\n")
        current_batch += 1
        new_files += 1

    if tmpfile is not None:
        tmpfile.close()

    logging.info("Prepare.3: Got %s files in %s batches to index.", new_files,
                 batch_id)
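Note: the core of prepare() is the batching loop that opens a new numbered file whenever the current batch fills up. A minimal standalone sketch of that pattern (the helper name is hypothetical and only mirrors the loop above):

from pathlib import Path
from typing import Iterable

def write_batches(workdir: Path, files: Iterable[str], batch_size: int) -> int:
    # Split `files` into numbered batch_XXXXXXXXXX.txt files, one path per
    # line, and return the number of batches written.
    batch_id = 0
    tmpfile = None
    current = batch_size  # force opening a file for the first path
    for f in files:
        if current >= batch_size:
            if tmpfile is not None:
                tmpfile.close()
            tmpfile = (workdir / f"batch_{batch_id:010}.txt").open(mode="w")
            batch_id += 1
            current = 0
        tmpfile.write(f"{f}\n")
        current += 1
    if tmpfile is not None:
        tmpfile.close()
    return batch_id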
Example #8
def index(
    ursadb: str,
    workdir: Path,
    types: List[str],
    tags: List[str],
    workers: int,
    working_datasets: Optional[int],
) -> None:
    logging.info("Index.1: Determine compacting threshold.")
    if working_datasets is None:
        working_datasets = workers * 20 + 40

    ursa = UrsaDb(ursadb)
    current_datasets = len(
        ursa.execute_command("topology;")["result"]["datasets"])
    compact_threshold = current_datasets + working_datasets

    logging.info("Index.1: Compact threshold = %s.", compact_threshold)

    logging.info("Index.2: Find prepared batches.")
    indexing_jobs = []
    for batch in workdir.glob("*.txt"):
        indexing_jobs.append((ursadb, types, tags, batch, compact_threshold))

    logging.info("Index.2: Got %s batches to run.", len(indexing_jobs))

    logging.info("Index.3: Run index commands with %s workers.", workers)
    pool = Pool(processes=workers)
    done = 0
    total = len(indexing_jobs)
    for batchid in pool.imap_unordered(index_files, indexing_jobs,
                                       chunksize=1):
        done += 1
        logging.info(f"Index.4: Batch %s done [%s/%s].", batchid, done, total)

    if list(workdir.iterdir()):
        logging.info("Index.5: Workdir not removed, because it's not empty.")
    else:
        logging.info("Index.5: Unlinking the workdir.")
        workdir.rmdir()
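Note: the pool above uses imap_unordered with chunksize=1, so a worker picks up the next batch only after finishing one, and a single slow batch doesn't hold a queue of pre-assigned work. A minimal standalone sketch of that dispatch pattern (run_batch is a stand-in for index_files):

from multiprocessing import Pool

def run_batch(job):
    # Stand-in for index_files: just echoes the batch name back.
    ursadb_url, types, tags, batch, threshold = job
    return str(batch)

if __name__ == "__main__":
    jobs = [("tcp://localhost:9281", ["gram3"], [], f"batch_{i:010}.txt", 100)
            for i in range(4)]
    with Pool(processes=2) as pool:
        for done_batch in pool.imap_unordered(run_batch, jobs, chunksize=1):
            print("done:", done_batch)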
Example #9
def index_files(
        proc_params: Tuple[str, List[str], List[str], Path, int]) -> str:
    ursa_url, types, tags, batch, compact_threshold = proc_params
    ursa = UrsaDb(ursa_url)

    current_datasets = len(
        ursa.execute_command("topology;")["result"]["datasets"])
    if current_datasets > compact_threshold:
        ursa.execute_command("compact smart;")

    type_list = ", ".join(types)
    mounted_names = []
    wipbatch = batch.with_suffix(".wip")
    batch.rename(wipbatch)
    with wipbatch.open() as batchfile:
        for fname in batchfile:
            fname = fname[:-1]  # remove the trailing newline
            fname = fname.replace('"', '\\"')
            mounted_names.append(fname)
    mounted_list = " ".join(f'"{fpath}"' for fpath in mounted_names)
    tag_mod = ""
    if tags:
        tag_list = ",".join(f'"{tag}"' for tag in tags)
        tag_mod = f" with taints [{tag_list}]"
    result = ursa.execute_command(
        f"index {mounted_list} with [{type_list}]{tag_mod} nocheck;")
    if "error" in result:
        wipbatch.rename(batch.with_suffix(".errored"))
        batch.with_suffix(".message").write_text(json.dumps(result, indent=4))
        logging.error(
            "Batch %s errored, see %s for details",
            batch,
            batch.with_suffix(".message"),
        )
    else:
        wipbatch.unlink()
    return str(batch)
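Note: for reference, the command string built above for a hypothetical two-file batch with one tag looks like this (the paths and tag are made up; the f-string is the one from the function):

# Hypothetical inputs, mirroring the f-string above.
mounted_list = '"/mnt/samples/a.exe" "/mnt/samples/b.dll"'
type_list = "gram3, text4"
tag_mod = ' with taints ["malware"]'
command = f"index {mounted_list} with [{type_list}]{tag_mod} nocheck;"
# -> index "/mnt/samples/a.exe" "/mnt/samples/b.dll" with [gram3, text4] with taints ["malware"] nocheck;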
Example #10
def backend_status() -> BackendStatusSchema:
    agents = []
    components = {
        "mquery": mquery_version(),
    }
    for name, agent_spec in db.get_active_agents().items():
        try:
            ursa = UrsaDb(agent_spec.ursadb_url)
            status = ursa.status()
            tasks = status["result"]["tasks"]
            ursadb_version = status["result"]["ursadb_version"]
            agents.append(
                AgentSchema(
                    name=name, alive=True, tasks=tasks, spec=agent_spec
                )
            )
            components[f"ursadb ({name})"] = ursadb_version
        except Again:
            agents.append(
                AgentSchema(name=name, alive=False, tasks=[], spec=agent_spec)
            )
            components[f"ursadb ({name})"] = "unknown"

    return BackendStatusSchema(agents=agents, components=components,)
Example #11
#!/usr/bin/env python
import json
import logging
import time
import yara  # type: ignore
from functools import lru_cache
import random
from yara import SyntaxError
import config
from lib.ursadb import UrsaDb
from lib.yaraparse import parse_yara, combine_rules
from util import make_redis, setup_logging
from typing import Any, Dict, List, Optional, Tuple

redis = make_redis()
db = UrsaDb(config.BACKEND)


@lru_cache(maxsize=32)
def compile_yara(job_hash: str) -> Any:
    yara_rule = redis.hget("job:" + job_hash, "raw_yara")

    logging.info("Compiling Yara")
    try:
        rule = yara.compile(source=yara_rule)
    except SyntaxError as e:
        logging.exception("Yara parse error")
        raise e

    return rule
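Note: the object returned by compile_yara() is a standard yara-python Rules object. A minimal usage sketch, assuming the placeholder job hash has a raw_yara entry in redis:

rules = compile_yara("0123abcdef")                       # placeholder job hash
matches = rules.match(data=b"file contents under test")  # or rules.match(path)
for match in matches:
    print(match.rule)  # name of each matching rule
# Thanks to lru_cache, repeated calls with the same job hash reuse the already
# compiled rules instead of recompiling them.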
Example #12
def main() -> None:
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Reindex local files.")
    parser.add_argument(
        "--mode",
        help="Mode of operation. Only prepare batches, index them, or both.",
        default="prepare-and-index",
        choices=["prepare", "index", "prepare-and-index"],
    )
    # switches relevant for both "prepare" and "index" modes
    parser.add_argument(
        "--ursadb",
        help="URL of the ursadb instance.",
        default="tcp://localhost:9281",
    )
    parser.add_argument("--workdir",
                        help="Path to a working directory.",
                        default=None)
    # switches relevant only for "prepare" mode
    parser.add_argument("--batch",
                        help="Size of indexing batch.",
                        type=int,
                        default=1000)
    parser.add_argument("--path",
                        help="Path of samples to be indexed.",
                        default=None)
    parser.add_argument(
        "--path-mount",
        help=
        "Path to the samples to be indexed, as seen by ursadb (if different).",
        default=None,
    )
    parser.add_argument(
        "--max-file-size-mb",
        type=int,
        help="Maximum file size, in MB, to index. 128 By default.",
        default=128,
    )
    # switches relevant only for "index" mode
    parser.add_argument(
        "--type",
        dest="types",
        help="Index types. By default [gram3, text4, wide8, hash4]",
        action="append",
        default=[],
        choices=["gram3", "text4", "hash4", "wide8"],
    )
    parser.add_argument(
        "--tag",
        dest="tags",
        help="Additional tags for indexed datasets.",
        action="append",
        default=[],
    )
    parser.add_argument(
        "--workers",
        help="Number of parallel indexing jobs.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "--working-datasets",
        help="Numer of working datasets (uses sane value by default).",
        type=int,
        default=None,
    )

    args = parser.parse_args()
    types = list(set(args.types))

    if args.workdir is None:
        logging.error("--workdir is a required parameter")
        return

    try:
        ursa = UrsaDb(args.ursadb)
        ursa.status()
    except Exception:
        logging.error("Can't connect to ursadb instance at %s", args.ursadb)

    if args.mode == "prepare" or args.mode == "prepare-and-index":
        # Path must exist
        if args.path is None:
            logging.error("Path (--path) is a required parameter.")
            return

        if args.path_mount is not None:
            path_mount = args.path_mount
        else:
            path_mount = args.path

        path = Path(args.path)
        if not path.exists():
            logging.error("Path (--path) %s does not exist.", args.path)
            return

        # We're starting a new indexing operation. Workdir must not exist.
        workdir = Path(args.workdir)
        if workdir.exists() and list(workdir.iterdir()):
            logging.error(
                "Workdir %s already exists and is not empty. Remove it or choose another one.",
                args.workdir,
            )
            return

        max_file_size = args.max_file_size_mb * 1024 * 1024
        prepare(args.ursadb, workdir, path, args.batch, max_file_size,
                path_mount)

    if args.mode == "index" or args.mode == "prepare-and-index":
        # By default, use all index types.
        if not args.types:
            types = ["gram3", "text4", "wide8", "hash4"]

        # We're continuing an existing operation. Workdir must exist.
        workdir = Path(args.workdir)
        if not workdir.exists():
            logging.error(
                "Running with mode=index, but workdir %s doesn't exist",
                args.workdir,
            )
            return

        index(
            args.ursadb,
            workdir,
            types,
            args.tags,
            args.workers,
            args.working_datasets,
        )

        logging.info("Indexing finished. Consider compacting the database now")
Example #13
def main() -> None:
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(description="Reindex local files.")
    parser.add_argument("path", help="Path of samples to be indexed.")
    parser.add_argument(
        "--path-mount",
        help=
        "Path to the samples to be indexed, as seen by ursadb (if different).",
        default=None,
    )
    parser.add_argument(
        "--ursadb",
        help="URL of the ursadb instance.",
        default="tcp://localhost:9281",
    )
    parser.add_argument("--tmpdir",
                        help="Path to used tmpdir.",
                        default="/tmp")
    parser.add_argument(
        "--tmpdir-mount",
        help="Path to used tmpdir, as seen by ursadb (if different)",
        default=None,
    )
    parser.add_argument("--batch",
                        help="Size of indexing batch.",
                        type=int,
                        default=1000)
    parser.add_argument(
        "--type",
        dest="types",
        help="Additional index types.",
        action="append",
        default=["gram3"],
        choices=["gram3", "text4", "hash4", "wide8"],
    )
    parser.add_argument(
        "--workers",
        help="Number of parallel indexing jobs.",
        type=int,
        default=2,
    )
    parser.add_argument(
        "--dry-run",
        help="Don't index, only print filenames.",
        action="store_true",
    )

    args = parser.parse_args()

    tmpdir_mount = args.tmpdir_mount or args.tmpdir
    path_mount = args.path_mount or args.path

    logging.info("Stage 1: load all indexed files into memory.")
    ursa = UrsaDb(args.ursadb)
    fileset = all_indexed_files(ursa)

    logging.info("Stage 2: find all new files.")

    tmpfile = None
    tmpfiles = []
    current_batch = 10**20  # As good as infinity.
    new_files = 0
    for f in find_new_files(fileset, args.path, path_mount):
        if args.dry_run:
            print(f)
            continue
        if current_batch > args.batch:
            current_batch = 0
            if tmpfile:
                tmpfile.close()
            tmpfile = NamedTemporaryFile(mode="w",
                                         dir=args.tmpdir,
                                         delete=False)
            tmpfiles.append(tmpfile.name)

        assert tmpfile is not None  # Let mypy know the obvious.
        tmpfile.write(f"{f}\n")
        current_batch += 1
        new_files += 1

    logging.info("Got %s files in %s batches to index.", new_files,
                 len(tmpfiles))
    if args.dry_run:
        return
    del fileset

    indexing_jobs = []
    for ndx, tmppath in enumerate(tmpfiles):
        mounted_name = os.path.join(tmpdir_mount,
                                    os.path.relpath(tmppath, args.tmpdir))
        indexing_jobs.append((args.ursadb, args.types, mounted_name, ndx))
        logging.info(f"Batch %s: %s", ndx, mounted_name)

    logging.info("Stage 3: Run index command in parallel.")
    pool = Pool(processes=args.workers)
    done = 0
    total = len(indexing_jobs)
    for batchid in pool.imap_unordered(index_files, indexing_jobs,
                                       chunksize=1):
        done += 1
        logging.info(f"Batch %s done [%s/%s].", batchid, done, total)

    logging.info("Stage 4: Cleanup.")
    for f in tmpfiles:
        os.unlink(f)
Example #14
def index_files(proc_params: Tuple[str, List[str], str, int]) -> None:
    ursa_url, types, mounted_name, ndx = proc_params
    ursa = UrsaDb(ursa_url)
    with_ = ", ".join(types)
    ursa.execute_command(
        f'index from list "{mounted_name}" with [{with_}] nocheck;')
Example #15
def db_context(request):
    IPC = "ipc:///tmp/ursadb-test"
    context = zmq.Context()
    socket = context.socket(zmq.REP)
    socket.bind(IPC)
    return UrsadbTestContext(socket, UrsaDb(IPC))
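Note: a test using this context has to answer every command the UrsaDb client sends over the REP socket. A minimal sketch, under the assumption that ursadb replies are JSON objects with a "result" key (as the other examples suggest); recv_string/send_string are standard pyzmq calls:

import json

def reply_once(test_socket, payload: dict) -> str:
    # Receive one command sent by the UrsaDb client and answer it with a
    # canned JSON response; return the received command for assertions.
    command = test_socket.recv_string()
    test_socket.send_string(json.dumps({"result": payload}))
    return command

# Typical use: issue a query (e.g. topology()) from another thread, then call
# reply_once(socket, {"datasets": {}}) from the test.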