Example #1
    def fileinfo(self, path: str) -> Dict:
        path = safe_str(path)
        data = get_digests_for_file(path, on_first_block=self.ident)
        data["ssdeep"] = ssdeep.hash_from_file(path)

        # Check if the file is empty
        if not int(data.get("size", -1)):
            data["type"] = "empty"

        # Further identify zip files based on their content
        elif data["type"] in [
                "archive/zip", "java/jar", "document/office/unknown"
        ]:
            data["type"] = zip_ident(path, data["type"])

        # Further check CaRT files, as they may have an explicit type set
        elif data["type"] == "archive/cart":
            data["type"] = cart_ident(path)

        # Further identify DOS executables, as this may be a PE that has been misidentified
        elif data["type"] == "executable/windows/dos":
            data["type"] = dos_ident(path)

        # If we have so far failed to identify the file, let's run the YARA rules
        elif "unknown" in data["type"] or data["type"] == "text/plain":
            data["type"] = self.yara_ident(path, data, fallback=data["type"])

        # Extra checks for office documents
        #  - Check for encryption
        if data["type"] in [
                "document/office/word",
                "document/office/excel",
                "document/office/powerpoint",
                "document/office/unknown",
        ]:
            try:
                with open(path, "rb") as office_fh:
                    msoffcrypto_obj = msoffcrypto.OfficeFile(office_fh)
                    if msoffcrypto_obj and msoffcrypto_obj.is_encrypted():
                        data["type"] = "document/office/passwordprotected"
            except Exception:
                # If msoffcrypto can't confirm whether the file is password protected,
                # then it isn't meant to be. Moving on!
                pass

        # Extra checks for PDF documents
        #  - Check for encryption
        #  - Check for PDF collection (portfolio)
        if data["type"] == "document/pdf":
            # Password-protected documents typically contain '/Encrypt'
            with open(path, "rb") as pdf_fh:
                pdf_content = pdf_fh.read()
            if re.search(b"/Encrypt", pdf_content):
                data["type"] = "document/pdf/passwordprotected"
            # Portfolios typically contain '/Type/Catalog/Collection'
            elif re.search(b"/Type/Catalog/Collection", pdf_content):
                data["type"] = "document/pdf/portfolio"

        return data
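
The PDF branch above can be lifted out on its own. Below is a minimal, self-contained sketch of the same heuristic; the pdf_subtype name is an assumption for illustration, while the byte markers come from the code above.

import re

def pdf_subtype(path: str) -> str:
    # Standalone re-implementation of the PDF checks above, for illustration only.
    with open(path, "rb") as fh:
        pdf_content = fh.read()
    # Password-protected documents typically contain '/Encrypt'
    if re.search(b"/Encrypt", pdf_content):
        return "document/pdf/passwordprotected"
    # Portfolios typically contain '/Type/Catalog/Collection'
    if re.search(b"/Type/Catalog/Collection", pdf_content):
        return "document/pdf/portfolio"
    return "document/pdf"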
Example #2
def fileinfo(path: str) -> Dict:
    path = safe_str(path)

    data = get_digests_for_file(path, on_first_block=ident)

    # This is a special case: when the mime type is set to one of these values we know the
    # input file is almost certainly an Office file, but based on only the first block magic
    # can't figure out any more than that. To handle that case we read the entire file
    # and identify it again.
    if data['mime'] is not None and data['mime'].lower() in [
            'application/cdfv2-corrupt', 'application/cdfv2-unknown'
    ]:
        with open(path, 'rb') as fh:
            buf = fh.read()
            buflen = len(buf)
            data.update(ident(buf, buflen))
    data['ssdeep'] = ssdeep_from_file(path) if ssdeep_from_file else ''

    # When data is parsed from a cart file we trust its metadata and can skip the recognition test later
    cart_metadata_set = False

    if not int(data.get('size', -1)):
        data['type'] = 'empty'
    elif data['type'] in ['archive/zip', 'java/jar']:
        # In addition to explicit zip files, we also want to run zip_ident when
        # a file is a jar, as there is a high rate of false positives (magic
        # matches Eclipse and other Java-related files as jars)
        data['type'] = zip_ident(path)
    elif data['type'] == 'document/office/unknown':
        # For unknown document files, try identifying them by unzipping,
        # but don't commit to it being a zip if it can't be extracted
        data['type'] = zip_ident(path, data['type'])
    elif data['type'] == 'unknown':
        data['type'], _ = guess_language(path)
    elif data['type'] == 'archive/cart':
        data['type'] = cart_ident(path)
        cart_metadata_set = True
    elif data['type'] == 'executable/windows/dos':
        # The default magic file misidentifies PE files with a munged DOS header
        data['type'] = dos_ident(path)
    elif data['type'] == 'code/html':
        # Magic detects .hta files as .html, guess_language detects .hta files as .js/.vbs
        # If both conditions are met, it's fair to say that the file is an .hta
        lang, _ = guess_language(path)
        if lang in ["code/javascript", "code/vbs"]:
            data['type'] = 'code/hta'

    if not recognized.get(data['type'], False) and not cart_metadata_set:
        data['type'] = 'unknown'

    return data
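
The cdfv2 special case above re-identifies the file from its full contents rather than from the first block only. A rough standalone sketch of that idea using the python-magic package (an assumed dependency here; the example itself relies on its own ident() helper):

import magic  # python-magic, assumed available; the example uses its own ident() helper

def full_buffer_mime(path: str) -> str:
    # Classify from the whole file instead of only the first block seen by get_digests_for_file.
    with open(path, "rb") as fh:
        buf = fh.read()
    return magic.from_buffer(buf, mime=True)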
Example #3
    def presubmit_local_files(self, file_paths, **kw):
        default_error = {'succeeded': False, 'error': 'Unknown Error'}
        presubmit_requests = {}
        presubmit_results = {}

        ignore_size = kw.get('ignore_size', False)
        max_size = config.submissions.max.size

        # Prepare the batch presubmit.
        rid_map = {}
        for rid, local_path in enumerate(file_paths):
            rid = str(rid)
            rid_map[rid] = local_path
            try:
                assert_valid_file(local_path)
                d = digests.get_digests_for_file(local_path,
                                                 calculate_entropy=False)
                if d['size'] > max_size and not ignore_size:
                    presubmit_results[rid] = {
                        'succeeded': False,
                        'error': 'file too large (%d > %d). Skipping' % (d['size'], max_size),
                    }
                    continue
                presubmit_requests[rid] = d
                # Set a default error. Overwritten on success.
                presubmit_results[rid] = default_error.copy()
            except Exception as ex:  # pylint: disable=W0703
                log.error("Exception processing local file: %s. Skipping", ex)
                presubmit_results[rid] = {
                    'succeeded': False,
                    'error': 'local failure before presubmit: {0}'.format(ex),
                }
                continue

        if self.is_unix:
            presubmit_results = self._presubmit_local_files_unix(presubmit_requests, presubmit_results)
        else:
            presubmit_results = self._presubmit_local_files_windows(presubmit_requests, presubmit_results)

        if len(presubmit_results) != len(file_paths):
            log.error('Problem submitting %s: %s',
                      pprint.pformat(file_paths),
                      pprint.pformat(presubmit_results))

        for rid, result in presubmit_results.items():
            result['path'] = rid_map[rid]

        return presubmit_results
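
Stripped of the batching and error bookkeeping, the per-file loop above amounts to "digest the file, then reject it if it is larger than the configured limit". A standalone sketch of that check, where MAX_SIZE is an assumption standing in for config.submissions.max.size and the digest is reduced to SHA-256:

import hashlib
import os

MAX_SIZE = 100 * 1024 * 1024  # assumed limit; the example reads config.submissions.max.size

def presubmit_check(path: str) -> dict:
    # Reject oversized files up front, otherwise return the digest used for presubmit.
    size = os.path.getsize(path)
    if size > MAX_SIZE:
        return {'succeeded': False, 'error': 'file too large (%d > %d). Skipping' % (size, MAX_SIZE)}
    with open(path, 'rb') as fh:
        digest = hashlib.sha256(fh.read()).hexdigest()
    return {'succeeded': True, 'sha256': digest, 'size': size}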
Example #4
    def add_extracted(
        self,
        path: str,
        name: str,
        description: str,
        classification: Optional[Classification] = None,
        safelist_interface: Optional[Union[ServiceAPI,
                                           PrivilegedServiceAPI]] = None
    ) -> bool:

        # Service-based safelisting of files has to be configured in the global configuration.
        # This allows the administrator to be selective about the types of hashes to look up in the safelist.
        if safelist_interface and self.safelist_config.enabled and not (
                self.deep_scan or self.ignore_filtering):
            # Ignore adding files that are known to the system to be safe
            digests = get_digests_for_file(path)
            for hash_type in self.safelist_config.hash_types:
                qhash = digests[hash_type]
                resp = safelist_interface.lookup_safelist(qhash)
                self.log.debug(
                    f'Checking system safelist for {hash_type}: {qhash}')
                if resp and resp['enabled'] and resp['type'] == 'file':
                    self.log.info(
                        f'Dropping safelisted extracted file ({hash_type}: {qhash})'
                    )
                    return False

        if self.max_extracted and len(self.extracted) >= int(
                self.max_extracted):
            raise MaxExtractedExceeded

        if not path:
            raise ValueError("Path cannot be empty")

        if not name:
            raise ValueError("Name cannot be empty")

        if not description:
            raise ValueError("Description cannot be empty")

        file = self._add_file(path, name, description, classification)

        if not file:
            return False

        self.extracted.append(file)
        return True
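
The safelist guard at the top of add_extracted boils down to "compute each configured digest and drop the file if any of them is known to be safe". A self-contained sketch of that idea, where hash_types and safe_hashes are assumptions standing in for safelist_config.hash_types and the safelist service lookup:

import hashlib

def is_safelisted(path: str, hash_types=('sha256', 'md5'), safe_hashes=frozenset()) -> bool:
    # Compute each configured digest of the file and check it against a known-safe set.
    with open(path, 'rb') as fh:
        content = fh.read()
    for hash_type in hash_types:
        if hashlib.new(hash_type, content).hexdigest() in safe_hashes:
            return True
    return False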
Example #5
def fileinfo(path):
    path = safe_str(path)

    data = get_digests_for_file(path, on_first_block=ident)
    if data['mime'] is not None and data['mime'].lower() == 'application/cdfv2-corrupt':
        with open(path, 'rb') as fh:
            buf = fh.read()
            buflen = len(buf)
            data.update(ident(buf, buflen))
    data['ssdeep'] = ssdeep_from_file(path) if ssdeep_from_file else ''

    if not int(data.get('size', -1)):
        data['tag'] = 'empty'
    elif data['tag'] == 'archive/zip' or data['tag'] == 'java/jar':
        data['tag'] = zip_ident(path)
    elif data['tag'] == 'unknown':
        data['tag'], _ = guess_language(path)
    elif data['tag'] == 'archive/cart':
        data['tag'] = cart_ident(path)
    elif data['tag'] == 'executable/windows/dos':
        # The default magic file misidentifies PE files with a munged DOS header
        data['tag'] = dos_ident(path)

    return data
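
Examples #2 and #5 only call ssdeep_from_file when it resolved to something, which suggests the fuzzy-hashing dependency is optional. A minimal sketch of that optional-import pattern; ssdeep.hash_from_file is the call used in Example #1, and treating the package as optional is the assumption here:

try:
    import ssdeep  # optional dependency; fuzzy hashing is skipped when it is missing
except ImportError:
    ssdeep = None

def fuzzy_hash(path: str) -> str:
    # Return an ssdeep hash when the library is available, otherwise an empty string.
    if ssdeep is None:
        return ''
    return ssdeep.hash_from_file(path)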