예제 #1
0
class IdentifyFileFormat(DBTask):
    """Identify the format of a file with the fido library."""

    queue = 'file_operation'

    def handle_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback installed on the Fido object; stores the matched format name

        Args:
            fullname: Name of the file the matches belong to
            matches: Sequence of (format element, signature name) pairs
            delta_t: Timing value supplied by the caller (unused here)
            matchtype: Kind of match supplied by the caller (unused here)

        Raises:
            ValueError: If ``matches`` is empty
        """
        if not matches:
            raise ValueError("No matches for %s" % fullname)

        # Only the last pair is used; its signature name is not needed.
        fmt, _sig_name = matches[-1]
        self.lastFmt = fmt.find('name').text

    def run(self, filename=None):
        """
        Identifies the format of the file using the fido library

        Args:
            filename: The filename to identify

        Returns:
            The format of the file

        Raises:
            ValueError: If no format was recorded for the file
        """

        # Clear any result left behind by an earlier run on this instance so
        # that it can never be returned for a different file.
        self.lastFmt = None

        self.fid = Fido()
        self.fid.handle_matches = self.handle_matches
        self.fid.identify_file(filename)

        if self.lastFmt is None:
            # The callback never stored a format name for this file.
            raise ValueError("No matches for %s" % filename)

        self.set_progress(100, total=100)

        return self.lastFmt

    def undo(self, filename=None):
        """Nothing to undo; this task only reads the file."""
        pass

    def event_outcome_success(self, filename=None, block_size=65536, algorithm='SHA-256'):
        """
        Build the message describing a successful identification

        Args:
            filename: The filename that was identified
            block_size: Unused
            algorithm: Unused

        Returns:
            The success message for the file
        """
        return "Identified format of %s" % filename
예제 #2
0
    def __init__(self, filename):
        """
        Set up the reader for one file.

        Fido is an old-style class (it does not inherit from object), so its
        initializer is called explicitly instead of through super().
        :filename: File path
        """
        self.version = None  # Identified file format version
        self.mimetype = None  # Identified mime type
        self.puid = None  # Identified pronom code
        self.filename = filename  # File path

        format_files = ["formats-v94.xml", "format_extensions.xml"]
        Fido.__init__(self, quiet=True, format_files=format_files)
예제 #3
0
def test_fido_cache_halting_file(fido_cache_halting_file):
    """Check that FidoDetector is about as fast as plain Fido on one file.

    Identification with a raw Fido object and detection through FidoDetector
    are timed separately on the same fixture file; the two timings must be
    within two seconds of each other.
    """
    format_files = ["formats-v95.xml", "format_extensions.xml"]
    plain_fido = Fido(quiet=True, format_files=format_files)

    # Only identify_file is timed for the plain Fido object.
    started = time.time()
    plain_fido.identify_file(fido_cache_halting_file)
    plain_seconds = time.time() - started

    # Construction of the detector is part of the timed section.
    started = time.time()
    detector = FidoDetector(fido_cache_halting_file)
    detector.detect()
    detector_seconds = time.time() - started

    # 2 second difference is acceptable with the given test file.
    assert abs(plain_seconds - detector_seconds) < 2
예제 #4
0
    def load_fido_xml(self, file):
        """Load format data, reusing the cached copy when it is enabled.

        When ``_FidoCachedFormats._use_cached`` is set, the formats and the
        two puid maps are taken from the values stored on
        ``_FidoCachedFormats``; otherwise the call is forwarded to
        ``Fido.load_fido_xml``.

        :param file: File that will be loaded.
        :returns: ``self.formats``
        """
        cache = _FidoCachedFormats
        if not cache._use_cached:
            Fido.load_fido_xml(self, file=file)
            return self.formats

        self.formats = cache._cached_formats
        self.puid_format_map = cache._cached_puid_format_map
        self.puid_has_priority_over_map = (
            cache._cached_puid_has_priority_over_map)
        return self.formats
예제 #5
0
    def run(self, filename=None):
        """
        Identifies the format of the file using the fido library

        Args:
            filename: The filename to identify

        Returns:
            The format of the file

        Raises:
            ValueError: If no format was recorded for the file
        """

        # Clear any result left behind by an earlier run on this instance so
        # that it can never be returned for a different file.
        self.lastFmt = None

        self.fid = Fido()
        self.fid.handle_matches = self.handle_matches
        self.fid.identify_file(filename)

        if self.lastFmt is None:
            # The callback never stored a format name for this file.
            raise ValueError("No matches for %s" % filename)

        self.set_progress(100, total=100)

        return self.lastFmt
예제 #6
0
class FormatIdentification():
    """
    File Format Identification
    """
    def __init__(self):
        """Create the Fido object and register print_matches as its callback."""
        self.fid = Fido()
        self.fid.handle_matches = self.print_matches
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is handed over.

        Returns the identifier stored by print_matches for this file, or None
        when nothing was stored.
        """
        # Reset first: the result arrives through print_matches, and without
        # the reset a file with no matches would report the identifier of the
        # previously identified file.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback for Fido: keep the puid of the last entry in ``matches``.

        ``matches`` is iterated as (format, signature) pairs; the other
        arguments are accepted but unused.
        """
        for fmt, _sig in matches:
            self.lastFmt = self.fid.get_puid(fmt)
예제 #7
0
class FormatIdentification():
    """
    File Format Identification
    """
    def __init__(self):
        """Create the Fido object and register print_matches as its callback."""
        self.fid = Fido()
        self.fid.handle_matches = self.print_matches
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is handed over.

        Returns the identifier stored by print_matches for this file, or None
        when nothing was stored.
        """
        # Reset first: the result arrives through print_matches, and without
        # the reset a file with no matches would report the identifier of the
        # previously identified file.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def get_mime_for_puid(self, puid):
        """
        Get mime type for a given puid

        @type       puid: string
        @param      puid: PRONOM Persistent Unique Identifier

        @rtype:     string
        @return:    mime type string (default: application/octet-stream)
        """
        fmtres = self.fid.puid_format_map[puid]
        mime_children = [
            child for child in fmtres if child.tag.endswith("mime")]
        # The default is used unless exactly one mime child is present.
        if len(mime_children) == 1:
            return mime_children[0].text.strip()
        return "application/octet-stream"

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback for Fido: keep the puid of the last entry in ``matches``.

        ``matches`` is iterated as (format, signature) pairs; the other
        arguments are accepted but unused.
        """
        for fmt, _sig in matches:
            self.lastFmt = self.fid.get_puid(fmt)
예제 #8
0
class FormatIdentification():
    """
    File Format Identification
    """

    def __init__(self):
        """
        Create the Fido object and register print_matches as its callback.

        Nothing is initialised when ``fido_disabled`` is set.
        """
        if not fido_disabled:
            self.fid = Fido()
            self.fid.handle_matches = self.print_matches
            self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is handed over.

        Returns the identifier stored by print_matches for this file, or None
        when nothing was stored.
        """
        # NOTE(review): assert statements are skipped under ``python -O``.
        assert not fido_disabled, "Fido module is not available!"
        # Reset first: the result arrives through print_matches, and without
        # the reset a file with no matches would report the identifier of the
        # previously identified file.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def get_mime_for_puid(self, puid):
        """
        Get mime type for a given puid

        :param puid: PRONOM Persistent Unique Identifier
        :return: mime type string (default: application/octet-stream)
        """
        assert not fido_disabled, "Fido module is not available!"
        fmtres = self.fid.puid_format_map[puid]
        mime_children = [
            child for child in fmtres if child.tag.endswith("mime")]
        # The default is used unless exactly one mime child is present.
        if len(mime_children) == 1:
            return mime_children[0].text.strip()
        return "application/octet-stream"

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback for Fido: keep the puid of the last entry in ``matches``.

        ``matches`` is iterated as (format, signature) pairs; the other
        arguments are accepted but unused.
        """
        assert not fido_disabled, "Fido module is not available!"
        for fmt, _sig in matches:
            self.lastFmt = self.fid.get_puid(fmt)
예제 #9
0
    def run(self, filename=None, fid=None):
        """
        Identifies the format of the file using the fido library

        Args:
            filename: The filename to identify
            fid: Fido instance to use. A new one is created when omitted;
                pass an existing instance to reuse it across calls.

        Returns:
            A tuple with the format name, version and registry key
        """

        # A ``Fido()`` default in the signature would be built once, when the
        # function is defined, and then shared (and have its callback
        # rebound) by every call. Create it on demand instead.
        if fid is None:
            fid = Fido()

        self.fid = fid
        self.fid.handle_matches = self.handle_matches
        self.fid.identify_file(filename)

        return (self.format_name, self.format_version,
                self.format_registry_key)
예제 #10
0
def test_fido_format_caching():
    """Check that cached readers hold as much format data as plain Fido.

    200 _FidoReader objects are created in a row. After each one the total
    elapsed time must stay below 30 seconds, and the sizes of its format
    containers must equal those of a reference Fido object.
    """
    reference = Fido(
        quiet=True, format_files=["formats-v95.xml", "format_extensions.xml"])
    compared_attributes = (
        "puid_format_map", "formats", "puid_has_priority_over_map")

    started = time.time()
    for _ in range(200):
        cached_reader = _FidoReader('non_existing_file.xml')
        # If caching works, the time spent to initialize the _FidoReader
        # should not take long so 30 seconds would be the absolute max.
        assert time.time() - started < 30

        # Only the lengths are compared, because these attributes contain a
        # large amount of lxml element-objects and comparing them directly
        # would be very slow.
        for attribute in compared_attributes:
            assert len(getattr(cached_reader, attribute)) == len(
                getattr(reference, attribute))
예제 #11
0
 def fido(self):
     """Return the Fido object, creating and storing it on first use."""
     if self._fido is not None:
         return self._fido

     # Nothing stored yet: build the object and log around the construction.
     logger.debug('Initiating fido')
     self._fido = Fido(handle_matches=self.handle_matches)
     logger.info('Initiated fido')
     return self._fido
예제 #12
0
 def __init__(self):
     """Create the Fido object and register print_matches as its callback.

     Nothing is initialised when ``fido_disabled`` is set.
     """
     if fido_disabled:
         return

     self.fid = Fido()
     self.fid.handle_matches = self.print_matches
     self.lastFmt = None
예제 #13
0
 def event_outcome_success(self, filename=None, fid=None):
     """
     Build the message describing a successful identification

     Args:
         filename: The filename that was identified
         fid: Unused. Defaults to None so that no Fido object is built
             when the function is defined.

     Returns:
         The success message for the file
     """
     return "Identified format of %s" % filename
예제 #14
0
 def undo(self, filename=None, fid=None):
     """
     Does nothing

     Args:
         filename: Unused
         fid: Unused. Defaults to None so that no Fido object is built
             when the function is defined.
     """
     pass
예제 #15
0
 def __init__(self):
     """Create the Fido object and register print_matches as its callback."""
     identifier = Fido()
     identifier.handle_matches = self.print_matches
     self.fid = identifier
     self.lastFmt = None
예제 #16
0
 def fido(self):
     """Return the Fido object, creating and storing it on first use."""
     if self._fido is not None:
         return self._fido

     self._fido = Fido(handle_matches=self.handle_matches)
     return self._fido
예제 #17
0
 def fido(self):
     """Return the Fido object, creating and storing it on first use.

     The object is built with ``FORMAT_FILES`` as its format files.
     """
     if self._fido is not None:
         return self._fido

     # Nothing stored yet: build the object and log around the construction.
     logger.debug('Initiating fido')
     self._fido = Fido(
         handle_matches=self.handle_matches, format_files=FORMAT_FILES)
     logger.info('Initiated fido')
     return self._fido