Example #1
    def _consumer_thread(self) -> None:
        processed = 0
        while True:
            event = self._queue.get()

            if event is _SENTINEL:
                logger.debug(
                    f"Consumer Thread {current_thread().name} finished after processing {processed} events"
                )
                return

            # Only count real events; the sentinel is not part of the workload.
            processed += 1

            try:
                nodes = self.transform(event)
            except Exception as e:
                logger.warning(
                    f"Error when parsing event, received exception {e}")
                logger.debug(event)
                self.errors[current_thread()].append(e)
                nodes = []

            if nodes:
                self.nodes += nodes

            self._queue.task_done()
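For context, here is a minimal, self-contained sketch of the sentinel shutdown pattern this consumer relies on. Everything here (the consumer function, q, the worker count) is illustrative rather than taken from the original class; only the _SENTINEL idiom itself mirrors the code above.

from queue import Queue
from threading import Thread

_SENTINEL = object()  # unique marker; no real event can be identical to it

def consumer(q: Queue) -> None:
    while True:
        item = q.get()
        if item is _SENTINEL:
            q.task_done()
            return
        # ... process item here ...
        q.task_done()

q: Queue = Queue()
workers = [Thread(target=consumer, args=(q,)) for _ in range(4)]
for w in workers:
    w.start()
for item in range(10):
    q.put(item)
for _ in workers:
    q.put(_SENTINEL)  # one sentinel per consumer so every thread exits
for w in workers:
    w.join()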
Example #2
def _setup_params(form: dict, schema: dict, is_external: bool) -> dict:
    logger.debug("Setting up parameters")

    params: Dict[str, Any] = {}

    if is_external:
        # External parameters are passed directly in the form
        for param in schema["params"]:
            if param["name"] in form:
                params[param["name"]] = form[param["name"]]

        logger.info(f"ExternalDataSource params received {params}")

    else:
        for param in schema["params"]:
            # Save the files, keep track of which parameter they represent
            if param["name"] in request.files:
                params[param["name"]] = tempfile.NamedTemporaryFile()
                request.files[param["name"]].save(params[param["name"]].name)
                params[param["name"]].seek(0)

        logger.info(f"Saved uploaded files {params}")

    logger.debug("Set up parameters")

    return params
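As a standalone illustration of the non-external branch, this is the NamedTemporaryFile save-and-rewind pattern in isolation; the bytes written stand in for werkzeug's FileStorage.save, and the explicit close() is only for the demo (in the real flow the caller closes the files later).

import tempfile

tmp = tempfile.NamedTemporaryFile()  # removed from disk when closed
tmp.write(b"uploaded bytes")         # stands in for request.files[...].save(tmp.name)
tmp.flush()
tmp.seek(0)                          # rewind so the datasource reads from the start
print(tmp.name)                      # the path later handed to the datasource
tmp.close()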
Example #3
    def _make_edges(self, source_graph: nx.MultiDiGraph) -> None:

        logger.info("Grouping Edges by type")

        sorted_edges = sorted(source_graph.edges(data=True, keys=True),
                              key=lambda edge: edge[3]["edge_name"])

        edges_by_type = itertools.groupby(
            sorted_edges, key=lambda edge: edge[3]["edge_name"])

        for edge_type, edges in edges_by_type:

            # Replace whitespace with underscores to form a valid Neo4J label
            edge_type = edge_type.replace(" ", "_")

            cypher_edges = list(map(self._edge_as_cypher, edges))

            logger.debug(
                f"Inserting {len(cypher_edges)} {edge_type} edges into Neo4J")

            for i in range(0, len(cypher_edges), self.batch_size):

                start = i
                end = i + self.batch_size

                cypher = f"UNWIND [{', '.join(cypher_edges[start: end])}] as row\n"
                cypher += "MATCH (src {_key: row.src}), (dst {_key: row.dst})"
                cypher += f" CREATE (src)-[:`{edge_type}`]->(dst)"

                with self.neo4j.session() as session:
                    session.write_transaction(lambda tx: tx.run(cypher))

                logger.debug(f"Finished batch {i+1} ({start} -> {end})")
Example #4
def _add_to_existing_graph(
    existing_backend: Backend,
    datasource_cls: Type[DataSource],
    transformer_cls: Type[Transformer],
    params: Dict[str, Any],
    is_external: bool,
) -> Tuple[dict, bool]:
    try:
        # Set up parameters for datasource class
        datasource_params = (
            # Use the on-disk file paths when the params hold temporary files
            {param_name: tmpfile.name for param_name, tmpfile in params.items()}
            if not is_external
            else params
        )
        # Create the datasource
        datasource = datasource_cls(**datasource_params)  # type: ignore
        # Create transformer
        transformer = datasource.to_transformer(transformer_cls)

        # Create the nodes
        nodes = transformer.run()

        # Create the backend
        G = existing_backend.add_nodes(nodes)

    except Exception as e:
        logger.critical(f"Failure to generate graph {e}")
        import traceback

        logger.debug(traceback.format_exc())

        if not is_external:
            # Clean up temporary files
            try:
                for _tempfile in params.values():
                    _tempfile.close()
            except Exception as cleanup_error:
                logger.critical(f"Failure to clean up temporary files after error {cleanup_error}")

        # G was never assigned on this path, so report the original error and bail out.
        return {"message": str(e)}, False

    logger.info("Cleaning up tempfiles")

    if not is_external:
        # Clean up temporary files
        for _tempfile in params.values():
            _tempfile.close()

    logger.info("Finished generating graph")

    # Check whether the graph actually contains any nodes.
    # For any class subclassing NetworkX, this is tracked on the G attribute.
    if existing_backend.is_empty():
        return {"message": f"Graph generation resulted in 0 nodes."}, False

    return {"graph": G, "backend": existing_backend}, True
Example #5
    def _producer_thread(self) -> None:
        i = 0
        for element in self.datasource.events():
            # block=True (the default) applies backpressure if the queue is bounded.
            self._queue.put(element, block=True)
            i += 1

        logger.debug(
            f"Producer Thread {current_thread().name} finished after {i} events"
        )
Example #6
    def setup_schema(self) -> None:
        """Sets up the DGraph schema based on the nodes. This inspect all attributes of all nodes,
        and generates a schema for them. Each schema entry has the format `{node_type}.{field}`. If a
        field is a string field, it has the `@index(exact)` predicate added to it.

        An example output schema::

            process.process_image string @index(exact)
            process.process_id int

        """

        all_node_types = inspect.getmembers(
            sys.modules["beagle.nodes"],
            lambda cls: inspect.isclass(cls)
            and not inspect.isabstract(cls)
            and issubclass(cls, Node)
            and cls != Node,
        )

        schema = ""

        for cls_name, node_class in all_node_types:
            for attr, attr_type in node_class.__annotations__.items():
                if attr == "key_fields":
                    continue

                # Unwrap Optional[X] / Union[X, ...] to its first argument; see
                # https://github.com/python/typing/issues/528#issuecomment-357751667
                if type(attr_type) == type(Union):
                    attr_type = attr_type.__args__[0]

                if attr_type == int:
                    attr_type = "int"
                elif type(attr_type) == type(DefaultDict) and issubclass(
                    attr_type.__args__[1], Edge
                ):
                    # Edge fields need no schema entry; they are built automatically
                    continue
                else:
                    attr_type = "string @index(exact)"

                # Lowercase the class name and replace spaces with underscores
                schema += f"{node_class.__name__.lower().replace(' ', '_')}.{attr}: {attr_type} .\n"

        schema += "<type>: string @index(exact) .\n"
        logger.debug(schema)
        self.dgraph.alter(pydgraph.Operation(schema=schema))
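The getmembers filter above is dense; this minimal sketch shows the same idea against the current module with stand-in classes (the real code additionally drops abstract classes via inspect.isabstract):

import inspect
import sys

class Node: ...
class Process(Node): ...  # concrete subclass: kept
class File(Node): ...     # concrete subclass: kept

members = inspect.getmembers(
    sys.modules[__name__],
    lambda cls: inspect.isclass(cls) and issubclass(cls, Node) and cls is not Node,
)
print([name for name, _ in members])  # ['File', 'Process']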
Example #7
    def run(self) -> List[Node]:
        """Generates the list of nodes from the datasource.

        This method kicks off a producer/consumer queue. The producer grabs events
        one by one from the datasource by iterating over the `events` generator.
        Each event is then sent to the :py:meth:`transform` method to be
        transformed into one or more `Node` objects.

        Returns
        -------
        List[Node]
            All Nodes created from the data source.
        """

        logger.debug("Launching transformer")

        threads: List[Thread] = []

        producer_thread = Thread(target=self._producer_thread)
        producer_thread.start()
        threads.append(producer_thread)
        self.errors[producer_thread] = []

        logger.debug("Started producer thread")

        # Leave one thread for the producer, but always run at least one consumer.
        consumer_count = max(1, _THREAD_COUNT - 1)

        for _ in range(consumer_count):
            t = Thread(target=self._consumer_thread)
            self.errors[t] = []
            t.start()
            threads.append(t)

        logger.debug(f"Started {consumer_count} consumer threads")

        # Wait for the producer to finish, then for the queue to fully drain.
        producer_thread.join()
        self._queue.join()

        # Stop the consumers: one sentinel per consumer thread.
        for _ in range(consumer_count):
            self._queue.put(_SENTINEL)

        for thread in threads:
            thread.join()

        logger.info(
            f"Finished processing of events, created {len(self.nodes)} nodes.")

        if any(len(x) > 0 for x in self.errors.values()):
            logger.warning("Parsing finished with errors.")
            logger.debug(self.errors)

        return self.nodes
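The ordering here matters: sentinels are only enqueued after self._queue.join() returns, and join() only returns once every put() has been matched by a task_done(). A minimal demonstration of that contract:

from queue import Queue

q: Queue = Queue()
q.put("event")
item = q.get()
# ... process item here ...
q.task_done()  # without this, q.join() below would block forever
q.join()       # returns immediately: every put() has been matched
print("queue drained")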
Example #8
    def _make_nodes(self, source_graph: nx.Graph) -> None:

        logger.info("Grouping Nodes by type")

        # Group nodes by class
        sorted_nodes = sorted(
            [node["data"] for _, node in source_graph.nodes(data=True)],
            key=lambda node: node.__name__,
            reverse=True,
        )

        nodes_by_type = itertools.groupby(sorted_nodes,
                                          key=lambda node: node.__name__)

        for node_type, nodes in nodes_by_type:

            # Replace whitespace with underscores to form a valid Neo4J label
            node_type = node_type.replace(" ", "_")

            self._create_constraint(node_type)

            cypher_nodes = list(map(self._node_as_cypher, nodes))

            logger.debug(
                f"Inserting {len(cypher_nodes)} {node_type} nodes into Neo4J")

            for i in range(0, len(cypher_nodes), self.batch_size):

                start = i
                end = i + self.batch_size

                cypher = f"UNWIND [{', '.join(cypher_nodes[start: end])}] as row\n"

                cypher += f"CREATE (node:{node_type} {{_key: row._key}}) SET node = row"

                with self.neo4j.session() as session:
                    session.write_transaction(lambda tx: tx.run(cypher))

                logger.debug(f"Finished batch {i+1} ({start} -> {end})")
Example #9
def _validate_params(form: dict, files: dict) -> Tuple[dict, bool]:
    """Validates that the passed in parameters are valid. Test for the following:
    1. Datasource, comment, and transformer all passed in (backend is optional).
    2. For the datasource requested, all of the parameters to the datasource are present.

    Parameters
    ----------
    form : dict
        The HTTP form sent
    files : dict
        The files sent along the form, if any

    Returns
    -------
    Tuple[dict, bool]
        Return (error message, False) if not valid, otherwise (config, True)
    """
    # Verify we have the basic parameters.
    missing_params = []
    for req_param in ["datasource", "transformer", "comment"]:
        if req_param not in form:
            missing_params.append(req_param)

    if len(missing_params) > 0:
        logger.debug(f"Request to /new missing parameters: {missing_params}")
        return ({"message": f"Missing parameters {missing_params}"}, False)

    # Pull out the requested datasource/transformer.
    requested_datasource = form["datasource"]
    requested_transformer = form["transformer"]
    # Backend is optional
    requested_backend = form.get("backend", "NetworkX")

    datasource_schema = next(
        filter(lambda entry: entry["id"] == requested_datasource, SCHEMA["datasources"]), None
    )

    if datasource_schema is None:
        logger.debug(f"User requested a non-existent data source {requested_datasource}")
        resp = {
            "message": f"Requested datasource '{requested_datasource}' is invalid, "
            + "please use /api/datasources to find a list of valid datasources"
        }
        return (resp, False)

    datasource_cls = DATASOURCES[requested_datasource]
    transformer_cls = TRANSFORMERS[requested_transformer]
    backend_cls = BACKENDS[requested_backend]
    required_params: List[Dict[str, Any]] = datasource_schema["params"]

    is_external = issubclass(datasource_cls, ExternalDataSource)

    # Make sure the user provided all required parameters for the datasource.
    datasource_missing_params = []
    for param in required_params:
        # Skip optional parameters
        if param["required"] is False:
            continue
        if is_external and param["name"] not in form:
            datasource_missing_params.append(param["name"])

        if not is_external and param["name"] not in files:
            datasource_missing_params.append(param["name"])

    if len(datasource_missing_params) > 0:
        logger.debug(
            f"Missing datasource {'form' if is_external else 'files'} params {datasource_missing_params}"
        )
        resp = {
            "message": f"Missing datasource {'form' if is_external else 'files'} params {datasource_missing_params}"
        }
        return (resp, False)

    return (
        {
            "datasource": datasource_cls,
            "transformer": transformer_cls,
            "backend": backend_cls,
            "schema": datasource_schema,
            "required_params": required_params,
        },
        True,
    )
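The next(filter(...), None) lookup used for the datasource schema is worth isolating; the schema entries here are invented:

SCHEMA = {"datasources": [{"id": "hx", "params": []}]}

entry = next(filter(lambda e: e["id"] == "hx", SCHEMA["datasources"]), None)
print(entry)    # {'id': 'hx', 'params': []}

missing = next(filter(lambda e: e["id"] == "nope", SCHEMA["datasources"]), None)
print(missing)  # None -- the default argument avoids StopIteration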
Example #10
def new():
    """Generate a new graph using the supplied DataSource, Transformer, and the parameters
    passed to the DataSource.

    At minimum, the user must supply the following form parameters:
        1. datasource
        2. transformer
        3. comment

    The backend parameter is optional and defaults to NetworkX.

    Outside of that, the user must supply at **minimum** the parameters marked by
    the datasource as required.
        * Use the /api/datasources endpoint to see which ones these are.
        * Programmatically, these are any parameters without a default value.

    Failure to supply either the minimum three or the required parameters for that datasource
    returns a 400 status code with the missing parameters in the 'message' field.

    If any part of the graph creation yields an error, a 500 HTTP code is returned with the
    Python exception as a string in the 'message' field.

    If the graph is successfully created, the user is returned a dictionary with the ID of
    the graph and the URI path for viewing it in the *beagle web interface*.

    For example:

    >>> {
        id: 1,
        self: /fireeye_hx/1
    }

    Returns
    -------
    dict
        {id: integer, self: string}
    """

    # Returns a tuple of (dict, bool).
    resp, success = _validate_params(form=request.form, files=request.files)

    # If false, return error message
    if not success:
        return make_response(jsonify(resp), 400)

    datasource_cls: Type[DataSource] = resp["datasource"]
    transformer_cls: Type[Transformer] = resp["transformer"]
    backend_cls: Type[Backend] = resp["backend"]
    datasource_schema = resp["schema"]
    # If this class extends the ExternalDataSource class, we know that the parameters
    # represent strings, and not files.

    is_external = issubclass(datasource_cls, ExternalDataSource)

    logger.info(
        f"Recieved upload request for datasource=<{datasource_cls.__name__}>, "
        + f"transformer=<{transformer_cls.__name__}>, backend=<{backend_cls.__name__}>"
    )

    logger.info("Transforming data to a graph.")

    params = _setup_params(form=request.form, schema=datasource_schema, is_external=is_external)

    resp, success = _create_graph(
        datasource_cls=datasource_cls,
        transformer_cls=transformer_cls,
        backend_cls=backend_cls,
        params=params,
        is_external=is_external,
    )

    if not success:
        return make_response(jsonify(resp), 400)

    G = resp["graph"]

    # If the backend is NetworkX, save the graph.
    # Otherwise, redirect the user to wherever they sent it (if possible)
    if backend_cls.__name__ == "NetworkX":
        response = _save_graph_to_db(backend=resp["backend"], category=datasource_cls.category)
        response = jsonify(response)
    else:
        logger.debug(G)
        response = jsonify({"resp": G})

    return response
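For a sense of how a client might drive this endpoint, here is a hypothetical call using the requests library; the host, route, datasource id, transformer name, and file parameter are all invented for illustration:

import requests

with open("hx.mans", "rb") as triage_file:  # hypothetical acquisition file
    resp = requests.post(
        "http://localhost:8000/new",  # host and route are assumptions
        data={
            "datasource": "fireeye_hx",             # hypothetical datasource id
            "transformer": "FireEyeHXTransformer",  # hypothetical transformer name
            "comment": "triage upload",
        },
        files={"triage": triage_file},  # hypothetical file parameter name
    )
print(resp.status_code, resp.json())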
Example #11
def new():
    """Generate a new graph using the supplied DataSource, Transformer, and the parameters
    passed to the DataSource.

    At minimum, the user must supply the following form parameters:
        1. datasource
        2. transformer
        3. comment

    The backend parameter is optional and defaults to NetworkX.

    Outside of that, the user must supply at **minimum** the parameters marked by
    the datasource as required.
        * Use the /api/datasources endpoint to see which ones these are.
        * Programmatically, these are any parameters without a default value.

    Failure to supply either the minimum three or the required parameters for that datasource
    returns a 400 status code with the missing parameters in the 'message' field.

    If any part of the graph creation yields an error, a 500 HTTP code is returned with the
    Python exception as a string in the 'message' field.

    If the graph is successfully created, the user is returned a dictionary with the ID of
    the graph and the URI path for viewing it in the *beagle web interface*.

    For example:

    >>> {
        id: 1,
        self: /fireeye_hx/1
    }

    Returns
    -------
    dict
        {id: integer, self: string}
    """

    # Verify we have the basic parameters.
    missing_params = []
    for param in ["datasource", "transformer", "comment"]:
        if param not in request.form:
            missing_params.append(param)

    if len(missing_params) > 0:
        logger.debug(f"Request to /new missing parameters: {missing_params}")
        return make_response(
            jsonify({"message": f"Missing parameters {missing_params}"}), 400)

    # Pull out the requested datasource/transformer; the backend is optional.
    requested_datasource = request.form["datasource"]
    requested_transformer = request.form["transformer"]
    requested_backend = request.form.get("backend", "NetworkX")

    datasource_schema = next(
        filter(lambda entry: entry["id"] == requested_datasource,
               SCHEMA["datasources"]), None)

    if datasource_schema is None:
        logger.debug(
            f"User requested a non-existent data source {requested_datasource}"
        )
        return make_response(
            jsonify({
                "message":
                f"Requested datasource '{requested_datasource}' is invalid, " +
                "please use /api/datasources to find a list of valid datasources"
            }),
            400,
        )

    logger.info(
        f"Recieved upload request for datasource=<{requested_datasource}>, " +
        f"transformer=<{requested_transformer}>, backend=<{requested_backend}>"
    )

    datasource_cls = DATASOURCES[requested_datasource]
    transformer_cls = TRANSFORMERS[requested_transformer]
    backend_class = BACKENDS[requested_backend]

    required_parameters = datasource_schema["params"]

    # If this class extends the ExternalDataSource class, we know that the parameters
    # represent strings, and not files.
    is_external = issubclass(datasource_cls, ExternalDataSource)

    # Make sure the user provided all required parameters for the datasource.
    datasource_missing_params = []
    for param in required_parameters:
        # Skip optional parameters
        if param["required"] is False:
            continue
        if is_external and param["name"] not in request.form:
            datasource_missing_params.append(param["name"])

        if not is_external and param["name"] not in request.files:
            datasource_missing_params.append(param["name"])

    if len(datasource_missing_params) > 0:
        logger.debug(
            f"Missing datasource {'form' if is_external else 'files'} params {datasource_missing_params}"
        )
        return make_response(
            jsonify({
                "message":
                f"Missing datasource {'form' if is_external else 'files'} params {datasource_missing_params}"
            }),
            400,
        )

    logger.info("Transforming data to a graph.")

    logger.debug("Setting up parameters")
    params = {}

    if is_external:
        # External parameters are passed directly in the form
        for param in datasource_schema["params"]:
        for param in datasource_schema["params"]:
            if param["name"] in request.form:
                params[param["name"]] = request.form[param["name"]]

        logger.info(f"ExternalDataSource params received {params}")

    else:
        for param in datasource_schema["params"]:
            # Save the files, keep track of which parameter they represent
            if param["name"] in request.files:
                params[param["name"]] = tempfile.NamedTemporaryFile()
                request.files[param["name"]].save(params[param["name"]].name)
                params[param["name"]].seek(0)

        logger.info(f"Saved uploaded files {params}")

    logger.debug("Set up parameters")

    try:
        # Create the datasource
        datasource = datasource_cls(
            # Give file paths instead of file-like objects when the source is not external.
            **({
                param_name: tmpfile.name
                for param_name, tmpfile in params.items()
            } if not is_external else params))
        transformer = datasource.to_transformer(transformer_cls)
        graph = backend_class(metadata=datasource.metadata(),
                              nodes=transformer.run(),
                              consolidate_edges=True)
        # Make the graph
        G = graph.graph()

    except Exception as e:
        logger.critical(f"Failure to generate graph {e}")
        import traceback

        logger.debug(f"{traceback.format_exc()}")

        if not is_external:
            # Clean up temporary files
            try:
                for _tempfile in params.values():
                    _tempfile.close()
            except Exception as cleanup_error:
                logger.critical(
                    f"Failure to clean up temporary files after error {cleanup_error}")

        response = make_response(jsonify({"message": str(e)}), 500)
        response.headers.add("Access-Control-Allow-Origin", "*")
        return response

    logger.info("Cleaning up tempfiles")

    if not is_external:
        # Clean up temporary files
        for _tempfile in params.values():
            _tempfile.close()

    logger.info("Finished generating graph")

    # Check whether the graph actually contains any nodes.
    # For any class subclassing NetworkX, this is tracked on the G attribute.
    if graph.is_empty():
        return make_response(
            jsonify({"message": f"Graph generation resulted in 0 nodes. "}),
            400)

    # If the backend is NetworkX, save the graph.
    # Otherwise, redirect the user to wherever they sent it (if possible)
    if backend_class.__name__ == "NetworkX":

        # Take the SHA256 of the contents of the graph.
        contents_hash = hashlib.sha256(
            json.dumps(graph.to_json(),
                       sort_keys=True).encode("utf-8")).hexdigest()

        # See if we have previously generated this *exact* graph.
        existing = Graph.query.filter_by(meta=graph.metadata,
                                         sha256=contents_hash).first()

        if existing:
            logger.info(f"Graph previously generated with id {existing.id}")
            response = jsonify({
                "id": existing.id,
                "self": f"/{existing.category}/{existing.id}"
            })
            response.headers.add("Access-Control-Allow-Origin", "*")
            return response

        dest_folder = datasource_cls.category.replace(" ", "_").lower()
        # Set up the storage directory.
        dest_path = f"{Config.get('storage', 'dir')}/{dest_folder}/{contents_hash}.json"
        os.makedirs(f"{Config.get('storage', 'dir')}/{dest_folder}",
                    exist_ok=True)

        db_entry = Graph(
            sha256=contents_hash,
            meta=graph.metadata,
            comment=request.form.get("comment", None),
            category=dest_folder,  # Categories use the lowercased name!
            file_path=f"{contents_hash}.json",
        )

        db.session.add(db_entry)
        db.session.commit()

        logger.info(f"Added graph to database with id={db_entry.id}")

        with open(dest_path, "w") as outfile:
            json.dump(graph.to_json(), outfile)

        logger.info(f"Saved graph to {dest_path}")

        response = jsonify({
            "id": db_entry.id,
            "self": f"/{dest_folder}/{db_entry.id}"
        })
    else:
        logger.debug(G)
        response = jsonify({"resp": G})

    response.headers.add("Access-Control-Allow-Origin", "*")
    return response
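The deduplication key above is a SHA256 over the canonical JSON of the graph: sort_keys=True makes the serialization deterministic, so re-generating the same graph hashes to the same database entry. In isolation (graph contents invented):

import hashlib
import json

graph_json = {"nodes": [1, 2], "edges": []}  # stand-in for graph.to_json()
digest = hashlib.sha256(
    json.dumps(graph_json, sort_keys=True).encode("utf-8")
).hexdigest()
print(digest)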
Example #12
    def _create_constraint(self, node_type: str) -> None:
        constraint_format = "CREATE CONSTRAINT ON (n:{name}) ASSERT n._key is UNIQUE"

        logger.debug(f"Creating _key constraint for {node_type}")
        with self.neo4j.session() as session:
            session.run(constraint_format.format(name=node_type))
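For reference, this renders the statement the session actually runs; the `process` label is invented, and the CREATE CONSTRAINT ON ... ASSERT form is the legacy Neo4j 3.x syntax the code targets:

constraint_format = "CREATE CONSTRAINT ON (n:{name}) ASSERT n._key is UNIQUE"
print(constraint_format.format(name="process"))
# CREATE CONSTRAINT ON (n:process) ASSERT n._key is UNIQUE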