Пример #1
0
def test_upload_datatype_auto(instance, test_data, temp_file):
    """Upload ``test_data`` and verify the detected datatype and the stored bytes.

    The upload uses ``file_type='auto'`` when the datatype has a ``sniff``
    attribute or the path contains ``false``; otherwise the datatype's own
    extension is passed. Uploads that end in an accepted error state finish
    the test early. Successful uploads are downloaded into ``temp_file`` and
    compared with the source file by MD5.
    """
    populator = instance.dataset_populator
    history_id = instance.history_id
    is_false_sample = 'false' in test_data.path

    with open(test_data.path, 'rb') as content:
        use_auto = hasattr(test_data.datatype, 'sniff') or is_false_sample
        file_type = 'auto' if use_auto else test_data.datatype.file_ext
        dataset = populator.new_dataset(history_id, content=content, wait=False, file_type=file_type)
    dataset = populator.get_history_dataset_details(history_id, dataset=dataset, assert_ok=False)
    expected_file_ext = test_data.datatype.file_ext

    # State might be error if the datatype can't be uploaded
    if dataset['state'] == 'error':
        info = dataset['misc_info']
        if not test_data.uploadable:
            # Some things can't be uploaded, if attempting to upload these datasets we mention why
            assert 'invalid' in info or 'unsupported' in info
            return
        if 'empty' in info:
            return
    # Any state that reaches this point has to be OK
    assert dataset['state'] == 'ok'

    # Check that correct datatype has been detected
    sniffed_ext = PARENT_SNIFFER_MAP.get(expected_file_ext, expected_file_ext)
    if is_false_sample:
        # datasets with false in their name are not of a specific datatype
        assert dataset['file_ext'] != sniffed_ext
    else:
        assert dataset['file_ext'] == sniffed_ext

    # download file and verify it hasn't been manipulated
    downloaded = populator.get_history_dataset_content(
        history_id=history_id,
        dataset=dataset,
        type='bytes',
        assert_ok=False,
        raw=True,
    )
    temp_file.write(downloaded)
    temp_file.flush()
    assert md5_hash_file(test_data.path) == md5_hash_file(temp_file.name)
Пример #2
0
    def check(self):
        """Poll the watched files once per second and notify the event handler on change.

        Runs until ``self._active`` becomes false. For each path in
        ``self.paths`` the recorded modification time (a ``time.ctime``
        string) is compared with the current one. When they differ the file
        is re-hashed, and only a different MD5 counts as a change. If at
        least one file changed, ``self.event_handler.on_any_event`` is
        started in a daemon thread.
        """
        hashes = {key: None for key in self.paths.keys()}
        while self._active:
            do_reload = False
            with self._lock:
                paths = list(self.paths.keys())
            for path in paths:
                if not os.path.exists(path):
                    continue
                mod_time = self.paths[path]
                try:
                    if not hashes.get(path):
                        hashes[path] = md5_hash_file(path)
                    new_mod_time = time.ctime(os.path.getmtime(path))
                    if new_mod_time == mod_time:
                        continue
                    new_hash = md5_hash_file(path)
                except (IOError, OSError):
                    # The file can disappear between the existence check and
                    # the reads above. Skip it for this round so the watcher
                    # thread keeps running. Both names are listed because
                    # they are distinct classes on Python 2.
                    continue
                if hashes[path] != new_hash:
                    self.paths[path] = new_mod_time
                    hashes[path] = new_hash
                    log.debug("The file '%s' has changes.", path)
                    do_reload = True

            if do_reload:
                with self._lock:
                    t = threading.Thread(target=self.event_handler.on_any_event)
                    t.daemon = True
                    t.start()
            time.sleep(1)
Пример #3
0
 def check(self):
     """Poll ``self.paths`` and ``self.cache`` once per second and reload on change.

     Runs until ``self._active`` becomes false. A watched file counts as
     changed when its modification time is newer than the recorded one and
     its MD5 hash differs from the last one seen. When no file changed,
     ``self.cache.cleanup()`` is called (if a cache is set) and any removed
     ids also trigger ``self.reload_callback()``.
     """
     hashes = {key: None for key in self.paths.keys()}
     while self._active:
         do_reload = False
         with self._lock:
             paths = list(self.paths.keys())
         for path in paths:
             if not os.path.exists(path):
                 continue
             mod_time = self.paths[path]
             try:
                 if not hashes.get(path):
                     hashes[path] = md5_hash_file(path)
                 new_mod_time = os.path.getmtime(path)
                 new_hash = None
                 if new_mod_time > mod_time:
                     new_hash = md5_hash_file(path)
             except (IOError, OSError):
                 # The file can vanish between the existence check and the
                 # reads above. Skip it this round instead of comparing a
                 # missing mtime or killing the watcher thread. Both names
                 # are listed because they differ on Python 2.
                 continue
             if new_mod_time > mod_time and hashes[path] != new_hash:
                 self.paths[path] = new_mod_time
                 hashes[path] = new_hash
                 log.debug("The file '%s' has changes.", path)
                 do_reload = True
         if not do_reload and self.cache:
             removed_ids = self.cache.cleanup()
             if removed_ids:
                 do_reload = True
         if do_reload:
             self.reload_callback()
         time.sleep(1)
Пример #4
0
    def check(self):
        """Watch ``self.paths`` and start the event handler when a file's content changes.

        Loops once per second while ``self._active`` is true. The recorded
        modification time of each path is a ``time.ctime`` string. When the
        current value differs, the MD5 hash decides whether the content
        really changed. Any change starts
        ``self.event_handler.on_any_event`` in a daemon thread.
        """
        hashes = {key: None for key in self.paths.keys()}
        while self._active:
            do_reload = False
            with self._lock:
                paths = list(self.paths.keys())
            for path in paths:
                if not os.path.exists(path):
                    continue
                mod_time = self.paths[path]
                try:
                    if not hashes.get(path):
                        hashes[path] = md5_hash_file(path)
                    new_mod_time = time.ctime(os.path.getmtime(path))
                    changed = False
                    if new_mod_time != mod_time:
                        new_hash = md5_hash_file(path)
                        changed = hashes[path] != new_hash
                except (IOError, OSError):
                    # A file removed after the existence check must not
                    # crash the watcher thread; it is looked at again on the
                    # next pass. IOError and OSError are separate classes on
                    # Python 2, hence both are caught.
                    continue
                if changed:
                    self.paths[path] = new_mod_time
                    hashes[path] = new_hash
                    log.debug("The file '%s' has changes.", path)
                    do_reload = True

            if do_reload:
                with self._lock:
                    t = threading.Thread(
                        target=self.event_handler.on_any_event)
                    t.daemon = True
                    t.start()
            time.sleep(1)
Пример #5
0
 def check(self):
     """Check for changes in self.paths or self.cache and call the reload callback.

     Loops once per second while ``self._active`` is true. A file is
     reported as changed only if its modification time increased and its
     MD5 hash differs from the previously seen one. If no file changed and
     a cache is set, ``self.cache.cleanup()`` is called and a non-empty
     result also triggers ``self.reload_callback()``.
     """
     hashes = {key: None for key in self.paths.keys()}
     while self._active:
         do_reload = False
         with self._lock:
             paths = list(self.paths.keys())
         for path in paths:
             if not os.path.exists(path):
                 continue
             mod_time = self.paths[path]
             try:
                 if not hashes.get(path):
                     hashes[path] = md5_hash_file(path)
                 new_mod_time = os.path.getmtime(path)
                 if new_mod_time > mod_time:
                     new_hash = md5_hash_file(path)
                     if hashes[path] != new_hash:
                         self.paths[path] = new_mod_time
                         hashes[path] = new_hash
                         log.debug("The file '%s' has changes.", path)
                         do_reload = True
             except (IOError, OSError):
                 # The file may be deleted right after the existence check.
                 # Ignore it for this pass rather than comparing a missing
                 # mtime or letting the watcher thread die. Both exception
                 # names are needed on Python 2, where they are distinct.
                 continue
         if not do_reload and self.cache:
             removed_ids = self.cache.cleanup()
             if removed_ids:
                 do_reload = True
         if do_reload:
             self.reload_callback()
         time.sleep(1)
Пример #6
0
 def _handle(self, event):
     # modified events will only have src path, move events will
     # have dest_path and src_path but we only care about dest. So
     # look at dest if it exists else use src.
     path = getattr(event, 'dest_path', None) or event.src_path
     path = os.path.abspath(path)
     callback = self.watcher.file_callbacks.get(path, None)
     if os.path.basename(path).startswith('.'):
         return
     if callback:
         ext_ok = self._extension_check(path, path)
     else:
         # reversed sort for getting the most specific dir first
         for key in reversed(sorted(self.watcher.dir_callbacks.keys())):
             if os.path.commonprefix([path, key]) == key:
                 callback = self.watcher.dir_callbacks[key]
                 ext_ok = self._extension_check(key, path)
                 break
     if not callback or not ext_ok:
         return
     cur_hash = md5_hash_file(path)
     if cur_hash:
         if self.watcher.path_hash.get(path) == cur_hash:
             return
         else:
             time.sleep(0.5)
             if cur_hash != md5_hash_file(path):
                 # We're still modifying the file, it'll be picked up later
                 return
             self.watcher.path_hash[path] = cur_hash
             callback(path=path)
Пример #7
0
 def _handle(self, event):
     # modified events will only have src path, move events will
     # have dest_path and src_path but we only care about dest. So
     # look at dest if it exists else use src.
     path = getattr(event, 'dest_path', None) or event.src_path
     path = os.path.abspath(path)
     callback = self.watcher.file_callbacks.get(path)
     if os.path.basename(path).startswith('.'):
         return
     if callback:
         ext_ok = self._extension_check(path, path)
     else:
         # reversed sort for getting the most specific dir first
         for key in reversed(sorted(self.watcher.dir_callbacks.keys())):
             if os.path.commonprefix([path, key]) == key:
                 callback = self.watcher.dir_callbacks[key]
                 ext_ok = self._extension_check(key, path)
                 break
     if not callback or not ext_ok:
         return
     cur_hash = md5_hash_file(path)
     if cur_hash:
         if self.watcher.path_hash.get(path) == cur_hash:
             return
         else:
             time.sleep(0.5)
             if cur_hash != md5_hash_file(path):
                 # We're still modifying the file, it'll be picked up later
                 return
             self.watcher.path_hash[path] = cur_hash
             callback(path=path)
Пример #8
0
 def check(self):
     """Check for changes in self.paths or self.cache and call the reload callback.

     Loops until ``self._active`` is false or ``self.exit`` is set, waiting
     up to one second on ``self.exit`` between passes. A path that raises
     ``OSError`` on two consecutive passes is removed from ``self.paths``.
     Every read failure also runs ``self.cache.cleanup()`` (if a cache is
     set) and triggers ``self.reload_callback()``.
     """
     hashes = {}
     if self.cache:
         self.cache.assert_hashes_initialized()
     # Paths that failed on the previous pass. This has to live outside the
     # loop, otherwise it is emptied before it is ever consulted.
     drop_now = set()
     # NOTE(review): assumes self.exit is a threading.Event-like object that
     # provides is_set(); isSet() is its deprecated alias.
     while self._active and not self.exit.is_set():
         do_reload = False
         drop_on_next_loop = set()
         with self._lock:
             paths = list(self.paths.keys())
         for path in paths:
             try:
                 if not os.path.exists(path):
                     continue
                 mod_time = self.paths[path]
                 if not hashes.get(path, None):
                     initial_hash = md5_hash_file(path)
                     if not initial_hash:
                         continue
                     hashes[path] = initial_hash
                 new_mod_time = os.path.getmtime(path)
                 # mod_time can be None if a non-required config was just created
                 if not mod_time:
                     self.paths[path] = new_mod_time
                     log.debug("The file '%s' has been created.", path)
                     do_reload = True
                 elif new_mod_time > mod_time:
                     new_hash = md5_hash_file(path)
                     if hashes[path] != new_hash:
                         self.paths[path] = new_mod_time
                         hashes[path] = new_hash
                         log.debug("The file '%s' has changes.", path)
                         do_reload = True
             except OSError:
                 # in rare cases `path` may be deleted between `os.path.exists` calls
                 # and reading the file from the filesystem. We do not want the watcher
                 # thread to die in these cases.
                 if path in drop_now:
                     log.warning("'%s' could not be read, removing from watched files", path)
                     # `paths` is only a list snapshot; the watched set is
                     # self.paths, so the entry is removed there.
                     with self._lock:
                         try:
                             del self.paths[path]
                         except KeyError:
                             pass
                     hashes.pop(path, None)
                 else:
                     log.debug("'%s could not be read", path)
                     drop_on_next_loop.add(path)
                 if self.cache:
                     self.cache.cleanup()
                 do_reload = True
         if not do_reload and self.cache:
             removed_ids = self.cache.cleanup()
             if removed_ids:
                 do_reload = True
         if do_reload:
             self.reload_callback()
         drop_now = drop_on_next_loop
         self.exit.wait(1)
Пример #9
0
def get_input_files(*args):
    """Yield temporary copies of the named test files and check they stay unmodified.

    Each name in ``args`` is resolved with ``get_test_fname`` and copied into
    a fresh temporary directory. The list of copied paths is yielded once.
    After the consumer resumes the generator, every copy's MD5 must equal the
    one taken before the yield. The temporary directory is always removed.
    """
    temp_dir = tempfile.mkdtemp()
    test_files = []
    try:
        for filename in args:
            destination = os.path.join(temp_dir, filename)
            shutil.copy(get_test_fname(filename), temp_dir)
            test_files.append(destination)
        sums_before = [md5_hash_file(path) for path in test_files]
        yield test_files
        sums_after = [md5_hash_file(path) for path in test_files]
        for path, before, after in zip(test_files, sums_before, sums_after):
            assert before == after, 'Unexpected change of content for file %s' % path
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
Пример #10
0
 def check(self):
     """Check for changes in self.paths or self.cache and call the reload callback.

     Loops until ``self._active`` is false or ``self.exit`` is set, waiting
     up to one second on ``self.exit`` between passes. A file counts as
     changed when it has no recorded modification time yet, or when its
     modification time increased and its MD5 differs from the last one
     seen. A read failure forgets the file's hash, runs
     ``self.cache.cleanup()`` (if a cache is set) and triggers a reload.
     """
     hashes = {key: None for key in self.paths.keys()}
     # NOTE(review): assumes self.exit is a threading.Event-like object that
     # provides is_set(); isSet() is its deprecated alias.
     while self._active and not self.exit.is_set():
         do_reload = False
         with self._lock:
             paths = list(self.paths.keys())
         for path in paths:
             try:
                 if not os.path.exists(path):
                     continue
                 mod_time = self.paths[path]
                 if not hashes.get(path, None):
                     initial_hash = md5_hash_file(path)
                     if not initial_hash:
                         continue
                     hashes[path] = initial_hash
                 new_mod_time = os.path.getmtime(path)
                 # mod_time can be None if a non-required config was just created
                 if not mod_time:
                     self.paths[path] = new_mod_time
                     log.debug("The file '%s' has been created.", path)
                     do_reload = True
                 elif new_mod_time > mod_time:
                     new_hash = md5_hash_file(path)
                     if hashes[path] != new_hash:
                         self.paths[path] = new_mod_time
                         hashes[path] = new_hash
                         log.debug("The file '%s' has changes.", path)
                         do_reload = True
             except (IOError, OSError):
                 # in rare cases `path` may be deleted between `os.path.exists` calls
                 # and reading the file from the filesystem. We do not want the watcher
                 # thread to die in these cases.
                 # Only the hash is forgotten. `paths` is a list snapshot,
                 # so `del paths[path]` would raise TypeError and kill the
                 # thread after all.
                 hashes.pop(path, None)
                 if self.cache:
                     self.cache.cleanup()
                 do_reload = True
         if not do_reload and self.cache:
             removed_ids = self.cache.cleanup()
             if removed_ids:
                 do_reload = True
         if do_reload:
             self.reload_callback()
         self.exit.wait(1)
Пример #11
0
def get_input_files(*args):
    """Provide scratch copies of test files and assert they are left untouched.

    Every name in ``args`` is looked up through ``get_test_fname`` and copied
    into a new temporary directory. The generator yields the list of copies
    once. On resumption it re-hashes them and fails if any MD5 changed. The
    directory is deleted in all cases.
    """
    temp_dir = tempfile.mkdtemp()
    test_files = []
    try:
        for filename in args:
            source = get_test_fname(filename)
            shutil.copy(source, temp_dir)
            test_files.append(os.path.join(temp_dir, filename))
        md5_sums = [md5_hash_file(copy) for copy in test_files]
        yield test_files
        new_md5_sums = [md5_hash_file(copy) for copy in test_files]
        for position, copy in enumerate(test_files):
            old_hash = md5_sums[position]
            new_hash = new_md5_sums[position]
            assert old_hash == new_hash, 'Unexpected change of content for file %s' % copy
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
Пример #12
0
 def cache_tool(self, config_filename, tool):
     """Record ``tool`` and its config file in the cache's lookup tables.

     Stores the file's MD5 hash and modification time keyed by path, maps
     the tool id to the path and the path to the tool, and adds the id to
     ``self._new_tool_ids``.
     """
     path = config_filename
     digest = md5_hash_file(path)
     identifier = str(tool.id)
     self._hash_by_tool_paths[path] = digest
     self._mod_time_by_path[path] = os.path.getmtime(path)
     self._tools_by_path[path] = tool
     self._tool_paths_by_id[identifier] = path
     self._new_tool_ids.add(identifier)
Пример #13
0
 def check(self):
     """Check for changes in self.paths or self.cache and call the reload callback.

     Loops once per second while ``self._active`` is true. A file counts as
     changed when its modification time increased and its MD5 differs from
     the last one seen. A read failure forgets the file's hash, runs
     ``self.cache.cleanup()`` (if a cache is set) and triggers a reload.
     """
     hashes = {key: None for key in self.paths.keys()}
     while self._active:
         do_reload = False
         with self._lock:
             paths = list(self.paths.keys())
         for path in paths:
             try:
                 if not os.path.exists(path):
                     continue
                 mod_time = self.paths[path]
                 if not hashes.get(path, None):
                     initial_hash = md5_hash_file(path)
                     if not initial_hash:
                         continue
                     hashes[path] = initial_hash
                 new_mod_time = os.path.getmtime(path)
                 if new_mod_time > mod_time:
                     new_hash = md5_hash_file(path)
                     if hashes[path] != new_hash:
                         self.paths[path] = new_mod_time
                         hashes[path] = new_hash
                         log.debug("The file '%s' has changes.", path)
                         do_reload = True
             except (IOError, OSError):
                 # in rare cases `path` may be deleted between `os.path.exists` calls
                 # and reading the file from the filesystem. We do not want the watcher
                 # thread to die in these cases.
                 # `paths` is a list snapshot, so `del paths[path]` would
                 # raise TypeError here. Dropping the stale hash is enough.
                 hashes.pop(path, None)
                 if self.cache:
                     self.cache.cleanup()
                 do_reload = True
         if not do_reload and self.cache:
             removed_ids = self.cache.cleanup()
             if removed_ids:
                 do_reload = True
         if do_reload:
             self.reload_callback()
         time.sleep(1)
Пример #14
0
 def _should_cleanup(self, config_filename):
     """Return True if `config_filename` does not exist or if modtime and hash have changes, else return False."""
     if not os.path.exists(config_filename):
         return True
     new_mtime = os.path.getmtime(config_filename)
     if self._mod_time_by_path.get(config_filename) < new_mtime:
         if md5_hash_file(config_filename) != self._hash_by_tool_paths.get(config_filename):
             return True
     return False
Пример #15
0
 def _handle(self, event):
     # modified events will only have src path, move events will
     # have dest_path and src_path but we only care about dest. So
     # look at dest if it exists else use src.
     path = getattr(event, 'dest_path', None) or event.src_path
     path = os.path.abspath(path)
     if path.endswith(".loc"):
         cur_hash = md5_hash_file(path)
         if cur_hash:
             if self.loc_watcher.path_hash.get(path) == cur_hash:
                 return
             else:
                 time.sleep(0.5)
                 if cur_hash != md5_hash_file(path):
                     # We're still modifying the file, it'll be picked up later
                     return
                 self.loc_watcher.path_hash[path] = cur_hash
                 self.loc_watcher.tool_data_tables.reload_tables(path=path)
Пример #16
0
 def _should_cleanup(self, config_filename):
     """Return True if `config_filename` does not exist or if modtime and hash have changes, else return False."""
     if not os.path.exists(config_filename):
         return True
     new_mtime = os.path.getmtime(config_filename)
     if self._mod_time_by_path.get(config_filename) < new_mtime:
         if md5_hash_file(config_filename) != self._hash_by_tool_paths.get(config_filename):
             return True
     return False
Пример #17
0
 def _handle(self, event):
     # modified events will only have src path, move events will
     # have dest_path and src_path but we only care about dest. So
     # look at dest if it exists else use src.
     path = getattr( event, 'dest_path', None ) or event.src_path
     path = os.path.abspath( path )
     if path.endswith(".loc"):
         cur_hash = md5_hash_file(path)
         if self.loc_watcher.path_hash.get(path) == cur_hash:
             return
         else:
             self.loc_watcher.path_hash[path] = cur_hash
             self.loc_watcher.tool_data_tables.reload_tables(path=path)
Пример #18
0
 def _handle(self, event):
     # modified events will only have src path, move events will
     # have dest_path and src_path but we only care about dest. So
     # look at dest if it exists else use src.
     path = getattr(event, 'dest_path', None) or event.src_path
     path = os.path.abspath(path)
     if path.endswith(".loc"):
         cur_hash = md5_hash_file(path)
         if self.loc_watcher.path_hash.get(path) == cur_hash:
             return
         else:
             self.loc_watcher.path_hash[path] = cur_hash
             self.loc_watcher.tool_data_tables.reload_tables(path=path)
Пример #19
0
 def _should_cleanup(self, config_filename):
     """Return True if `config_filename` does not exist or if modtime and hash have changes, else return False."""
     if not os.path.exists(config_filename):
         return True
     new_mtime = os.path.getmtime(config_filename)
     tool_hash = self._hash_by_tool_paths.get(config_filename)
     if tool_hash.modtime < new_mtime:
         if md5_hash_file(config_filename) != tool_hash.hash:
             return True
     tool = self._tools_by_path[config_filename]
     for macro_path in tool._macro_paths:
         new_mtime = os.path.getmtime(macro_path)
         if self._hash_by_tool_paths.get(macro_path).modtime < new_mtime:
             return True
     return False
Пример #20
0
 def cache_tool(self, config_filename, tool):
     """Record ``tool``, its config file and its macro files in the cache tables.

     Nothing is stored when ``md5_hash_file`` returns ``None`` for the
     config file. All table updates happen while ``self._lock`` is held.
     """
     digest = md5_hash_file(config_filename)
     if digest is None:
         return
     tool_id = str(tool.id)
     with self._lock:
         self._hash_by_tool_paths[config_filename] = digest
         self._mod_time_by_path[config_filename] = os.path.getmtime(config_filename)
         self._tool_paths_by_id[tool_id] = config_filename
         self._tools_by_path[config_filename] = tool
         self._new_tool_ids.add(tool_id)
         for macro_path in tool._macro_paths:
             self._mod_time_by_path[macro_path] = os.path.getmtime(macro_path)
             # Create the per-tool set on first use, then extend it.
             self._macro_paths_by_id.setdefault(tool_id, set()).add(macro_path)
Пример #21
0
 def cache_tool(self, config_filename, tool):
     """Record ``tool``, its config file and its macro files in the cache tables.

     Nothing is stored when ``md5_hash_file`` returns ``None`` for the
     config file. For every path in ``tool._macro_paths`` the modification
     time is cached, and the macro is linked to the tool in both
     directions (``_macro_paths_by_id`` and ``_tool_ids_by_macro_paths``).
     """
     tool_hash = md5_hash_file(config_filename)
     if tool_hash is None:
         return
     tool_id = str(tool.id)
     self._hash_by_tool_paths[config_filename] = tool_hash
     self._mod_time_by_path[config_filename] = os.path.getmtime(config_filename)
     self._tool_paths_by_id[tool_id] = config_filename
     self._tools_by_path[config_filename] = tool
     self._new_tool_ids.add(tool_id)
     for macro_path in tool._macro_paths:
         self._mod_time_by_path[macro_path] = os.path.getmtime(macro_path)
         if tool_id not in self._macro_paths_by_id:
             self._macro_paths_by_id[tool_id] = {macro_path}
         else:
             self._macro_paths_by_id[tool_id].add(macro_path)
         # Membership must be tested on the table being updated. Testing
         # `_macro_paths_by_id` (keyed by tool id) replaced the set on every
         # call and dropped the other tools that share this macro.
         if macro_path not in self._tool_ids_by_macro_paths:
             self._tool_ids_by_macro_paths[macro_path] = {tool_id}
         else:
             self._tool_ids_by_macro_paths[macro_path].add(tool_id)
Пример #22
0
 def hash(self):
     """Return the MD5 hash of ``self.path``, computing it while ``self._tool_hash`` is None."""
     if self._tool_hash is not None:
         return self._tool_hash
     self._tool_hash = md5_hash_file(self.path)
     return self._tool_hash
Пример #23
0
def upload_datatype_helper(instance, test_data, temp_file, delete_cache_dir=False):
    """Upload ``test_data``, verify the detected datatype and, where applicable, the stored bytes.

    The upload uses ``file_type='auto'`` when the datatype has a ``sniff``
    attribute or the path contains ``false``; otherwise the datatype's own
    extension is passed. Uploads that end in an accepted error state finish
    early. The byte-for-byte check is skipped when the source file is
    bz2/gzip/zip compressed and the detected datatype is not flagged
    ``compressed``. With ``delete_cache_dir`` the object store cache
    directory is removed and recreated before the download.
    """
    compression_probes = (is_bz2, is_gzip, is_zip)
    is_compressed = any(probe(test_data.path) for probe in compression_probes)

    populator = instance.dataset_populator
    history_id = instance.history_id
    is_false_sample = 'false' in test_data.path

    with open(test_data.path, 'rb') as content:
        use_auto = hasattr(test_data.datatype, 'sniff') or is_false_sample
        file_type = 'auto' if use_auto else test_data.datatype.file_ext
        dataset = populator.new_dataset(history_id, content=content, wait=False, file_type=file_type)
    dataset = populator.get_history_dataset_details(history_id, dataset=dataset, assert_ok=False)
    expected_file_ext = test_data.datatype.file_ext

    # State might be error if the datatype can't be uploaded
    if dataset['state'] == 'error':
        info = dataset['misc_info']
        if not test_data.uploadable:
            # Some things can't be uploaded, if attempting to upload these datasets we mention why
            assert 'invalid' in info or 'unsupported' in info
            return
        if 'empty' in info:
            return
    # Any state that reaches this point has to be OK
    assert dataset['state'] == 'ok'

    # Check that correct datatype has been detected
    file_ext = dataset['file_ext']
    sniffed_ext = PARENT_SNIFFER_MAP.get(expected_file_ext, expected_file_ext)
    if is_false_sample:
        # datasets with false in their name are not of a specific datatype
        assert file_ext != sniffed_ext
    else:
        assert file_ext == sniffed_ext

    datatype = registry.datatypes_by_extension[file_ext]
    datatype_compressed = getattr(datatype, "compressed", False)
    if is_compressed and not datatype_compressed:
        # No byte comparison for compressed input stored under a datatype
        # that is not flagged as compressed.
        return

    if delete_cache_dir:
        # Delete cache directory and then re-create it. This way we confirm
        # that dataset is fetched from the object store, not from the cache
        temp_dir = instance.get_object_store_kwargs()['temp_directory']
        cache_dir = temp_dir + '/object_store_cache'
        shutil.rmtree(cache_dir)
        os.mkdir(cache_dir)

    # download file and verify it hasn't been manipulated
    downloaded = populator.get_history_dataset_content(
        history_id=history_id,
        dataset=dataset,
        type='bytes',
        assert_ok=False,
        raw=True,
    )
    temp_file.write(downloaded)
    temp_file.flush()
    assert md5_hash_file(test_data.path) == md5_hash_file(temp_file.name)