class IdentifyFileFormat(DBTask):
    """Task that identifies a file with Fido and returns the matched format name."""

    queue = 'file_operation'

    def handle_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Record the format name of the last entry in ``matches``.

        ``run`` installs this method as the ``handle_matches`` attribute of
        the Fido instance it creates.

        Args:
            fullname: Name of the identified file; only used in the error
                message
            matches: Sequence of two-item entries whose first item offers
                ``find('name')``
            delta_t: Not used by this implementation
            matchtype: Not used by this implementation

        Raises:
            ValueError: If ``matches`` is empty
        """

        if not len(matches):
            raise ValueError("No matches for %s" % fullname)

        # Only the final entry is considered; its second item is not needed.
        format_element, _signature_name = matches[-1]
        name_element = format_element.find('name')
        self.lastFmt = name_element.text

    def run(self, filename=None):
        """
        Run Fido on a single file and report the format it matched

        Args:
            filename: The file to hand to Fido

        Returns:
            The format name stored by ``handle_matches``
        """

        identifier = Fido()
        # Route Fido's match reporting into this task so the result is kept.
        identifier.handle_matches = self.handle_matches
        self.fid = identifier

        identifier.identify_file(filename)
        self.set_progress(100, total=100)

        return self.lastFmt

    def undo(self, filename=None):
        """
        Do nothing; this task has no undo step
        """

        return None

    def event_outcome_success(self, filename=None, block_size=65536, algorithm='SHA-256'):
        """
        Build the success message for this task

        Args:
            filename: The file that was identified
            block_size: Not used by this implementation
            algorithm: Not used by this implementation

        Returns:
            The success message naming ``filename``
        """

        message = "Identified format of %s" % filename
        return message
def test_fido_cache_halting_file(fido_cache_halting_file):
    """Check that FidoDetector and plain Fido need a similar amount of time.

    The same file is identified once with a directly constructed ``Fido``
    object and once through ``FidoDetector``; the two wall-clock durations
    must differ by less than two seconds.
    """
    raw_fido = Fido(
        quiet=True,
        format_files=["formats-v95.xml", "format_extensions.xml"])

    # Time only the identification call; constructing Fido is excluded.
    raw_began = time.time()
    raw_fido.identify_file(fido_cache_halting_file)
    raw_duration = time.time() - raw_began

    # For the detector, construction is part of the measured interval.
    detector_began = time.time()
    detector = FidoDetector(fido_cache_halting_file)
    detector.detect()
    detector_duration = time.time() - detector_began

    # 2 second difference is acceptable with the given test file.
    difference = abs(raw_duration - detector_duration)
    assert difference < 2
class FormatIdentification():
    """
    File Format Identification

    Wraps a Fido instance and keeps the identifier of the most recent match
    in ``lastFmt``.
    """

    def __init__(self):
        """
        Create the Fido instance and register ``print_matches`` as its
        ``handle_matches`` callback.
        """
        self.fid = Fido()
        self.fid.handle_matches = self.print_matches
        # Identifier of the last reported match; None until something matched.
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is handed over.

        @param entry: file to identify, passed on to ``Fido.identify_file``
        @return: value stored by ``print_matches`` for this file, or None when
                 no match was reported
        """
        # Forget the previous result first: without this a file that yields
        # no match would report the format of the file identified before it.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback registered as the Fido instance's ``handle_matches``.

        Stores ``fid.get_puid`` of each match's first item in ``lastFmt``, so
        the last entry of ``matches`` wins. An empty ``matches`` leaves
        ``lastFmt`` untouched.

        @param fullname: not used
        @param matches: iterable of two-item entries (format, signature)
        @param delta_t: not used
        @param matchtype: not used
        """
        for (fmt, _signature) in matches:
            self.lastFmt = self.fid.get_puid(fmt)
class FormatIdentification():
    """
    File Format Identification

    Wraps a Fido instance, keeps the identifier of the most recent match in
    ``lastFmt`` and resolves mime types from Fido's format map.
    """

    def __init__(self):
        """
        Create the Fido instance and register ``print_matches`` as its
        ``handle_matches`` callback.
        """
        self.fid = Fido()
        self.fid.handle_matches = self.print_matches
        # Identifier of the last reported match; None until something matched.
        self.lastFmt = None

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is handed over.

        @param entry: file to identify, passed on to ``Fido.identify_file``
        @return: value stored by ``print_matches`` for this file, or None when
                 no match was reported
        """
        # Forget the previous result first: without this a file that yields
        # no match would report the format of the file identified before it.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def get_mime_for_puid(self, puid):
        """
        Get mime type for a given puid

        The entry for ``puid`` is looked up in ``fid.puid_format_map``; a
        failed lookup is not caught here (``KeyError`` if the map is a dict).

        @type puid: string
        @param puid: PRONOM Persistent Unique Identifier
        @rtype: string
        @return: mime type string (default: application/octet-stream)
        """
        default_mime = "application/octet-stream"
        format_element = self.fid.puid_format_map[puid]
        # endswith() also accepts tags that carry a prefix before "mime".
        mime_children = [child for child in format_element
                         if child.tag.endswith("mime")]
        if len(mime_children) != 1:
            # No mime child, or more than one: fall back to the default.
            return default_mime
        mime_text = mime_children[0].text
        if mime_text is None:
            # A mime element without text would otherwise fail on .strip().
            return default_mime
        return mime_text.strip()

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback registered as the Fido instance's ``handle_matches``.

        Stores ``fid.get_puid`` of each match's first item in ``lastFmt``, so
        the last entry of ``matches`` wins. An empty ``matches`` leaves
        ``lastFmt`` untouched.

        @param fullname: not used
        @param matches: iterable of two-item entries (format, signature)
        @param delta_t: not used
        @param matchtype: not used
        """
        for (fmt, _signature) in matches:
            self.lastFmt = self.fid.get_puid(fmt)
class FormatIdentification():
    """
    File Format Identification

    Wraps a Fido instance, keeps the identifier of the most recent match in
    ``lastFmt`` and resolves mime types from Fido's format map. Every
    operation requires the module-level ``fido_disabled`` flag to be false.
    """

    def __init__(self):
        """
        Create the Fido instance and register ``print_matches`` as its
        ``handle_matches`` callback, unless ``fido_disabled`` is set.
        """
        if not fido_disabled:
            self.fid = Fido()
            self.fid.handle_matches = self.print_matches
        # Always defined, so the attribute exists even without Fido.
        self.lastFmt = None

    @staticmethod
    def _require_fido():
        """
        Fail when the module-level ``fido_disabled`` flag is set.

        :raises AssertionError: if ``fido_disabled`` is true
        """
        # Raised explicitly instead of using an ``assert`` statement, which
        # is removed under ``python -O``; the exception type is unchanged.
        if fido_disabled:
            raise AssertionError("Fido module is not available!")

    def identify_file(self, entry):
        """
        This function identifies the file format of every file that is handed over.

        :param entry: file to identify, passed on to ``Fido.identify_file``
        :return: value stored by ``print_matches`` for this file, or None when
            no match was reported
        :raises AssertionError: if ``fido_disabled`` is true
        """
        self._require_fido()
        # Forget the previous result first: without this a file that yields
        # no match would report the format of the file identified before it.
        self.lastFmt = None
        self.fid.identify_file(entry)
        return self.lastFmt

    def get_mime_for_puid(self, puid):
        """
        Get mime type for a given puid

        The entry for ``puid`` is looked up in ``fid.puid_format_map``; a
        failed lookup is not caught here (``KeyError`` if the map is a dict).

        :param puid: PRONOM Persistent Unique Identifier
        :return: mime type string (default: application/octet-stream)
        :raises AssertionError: if ``fido_disabled`` is true
        """
        self._require_fido()
        default_mime = "application/octet-stream"
        format_element = self.fid.puid_format_map[puid]
        # endswith() also accepts tags that carry a prefix before "mime".
        mime_children = [child for child in format_element
                         if child.tag.endswith("mime")]
        if len(mime_children) != 1:
            # No mime child, or more than one: fall back to the default.
            return default_mime
        mime_text = mime_children[0].text
        if mime_text is None:
            # A mime element without text would otherwise fail on .strip().
            return default_mime
        return mime_text.strip()

    def print_matches(self, fullname, matches, delta_t, matchtype=''):
        """
        Callback registered as the Fido instance's ``handle_matches``.

        Stores ``fid.get_puid`` of each match's first item in ``lastFmt``, so
        the last entry of ``matches`` wins. An empty ``matches`` leaves
        ``lastFmt`` untouched.

        :param fullname: not used
        :param matches: iterable of two-item entries (format, signature)
        :param delta_t: not used
        :param matchtype: not used
        :raises AssertionError: if ``fido_disabled`` is true
        """
        self._require_fido()
        for (fmt, _signature) in matches:
            self.lastFmt = self.fid.get_puid(fmt)