def test_upload_datatype_auto(instance, test_data, temp_file):
    with open(test_data.path, 'rb') as content:
        if hasattr(test_data.datatype, 'sniff') or 'false' in test_data.path:
            file_type = 'auto'
        else:
            file_type = test_data.datatype.file_ext
        dataset = instance.dataset_populator.new_dataset(instance.history_id, content=content, wait=False, file_type=file_type)
    dataset = instance.dataset_populator.get_history_dataset_details(instance.history_id, dataset=dataset, assert_ok=False)
    expected_file_ext = test_data.datatype.file_ext
    # State might be error if the datatype can't be uploaded
    if dataset['state'] == 'error' and not test_data.uploadable:
        # Some things can't be uploaded; if attempting to upload these datasets we explain why
        assert 'invalid' in dataset['misc_info'] or 'unsupported' in dataset['misc_info']
        return
    elif dataset['state'] == 'error' and 'empty' in dataset['misc_info']:
        return
    else:
        # state should be OK
        assert dataset['state'] == 'ok'
    # Check that the correct datatype has been detected
    if 'false' in test_data.path:
        # datasets with "false" in their name are not of a specific datatype
        assert dataset['file_ext'] != PARENT_SNIFFER_MAP.get(expected_file_ext, expected_file_ext)
    else:
        assert dataset['file_ext'] == PARENT_SNIFFER_MAP.get(expected_file_ext, expected_file_ext)
    # download the file and verify it hasn't been manipulated
    temp_file.write(instance.dataset_populator.get_history_dataset_content(history_id=instance.history_id, dataset=dataset, type='bytes', assert_ok=False, raw=True))
    temp_file.flush()
    assert md5_hash_file(test_data.path) == md5_hash_file(temp_file.name)
def check(self):
    hashes = {key: None for key in self.paths.keys()}
    while self._active:
        do_reload = False
        with self._lock:
            paths = list(self.paths.keys())
        for path in paths:
            if not os.path.exists(path):
                continue
            mod_time = self.paths[path]
            if not hashes.get(path, None):
                hashes[path] = md5_hash_file(path)
            new_mod_time = None
            if os.path.exists(path):
                new_mod_time = time.ctime(os.path.getmtime(path))
            if new_mod_time != mod_time:
                new_hash = md5_hash_file(path)
                if hashes[path] != new_hash:
                    self.paths[path] = new_mod_time
                    hashes[path] = new_hash
                    log.debug("The file '%s' has changes.", path)
                    do_reload = True
        if do_reload:
            with self._lock:
                t = threading.Thread(target=self.event_handler.on_any_event)
                t.daemon = True
                t.start()
        time.sleep(1)
def check(self):
    """Check for changes in self.paths or self.cache and call the event handler."""
    hashes = {key: None for key in self.paths.keys()}
    while self._active:
        do_reload = False
        with self._lock:
            paths = list(self.paths.keys())
        for path in paths:
            if not os.path.exists(path):
                continue
            mod_time = self.paths[path]
            if not hashes.get(path, None):
                hashes[path] = md5_hash_file(path)
            new_mod_time = None
            if os.path.exists(path):
                new_mod_time = os.path.getmtime(path)
            if new_mod_time > mod_time:
                new_hash = md5_hash_file(path)
                if hashes[path] != new_hash:
                    self.paths[path] = new_mod_time
                    hashes[path] = new_hash
                    log.debug("The file '%s' has changes.", path)
                    do_reload = True
        if not do_reload and self.cache:
            removed_ids = self.cache.cleanup()
            if removed_ids:
                do_reload = True
        if do_reload:
            self.reload_callback()
        time.sleep(1)
def _handle(self, event):
    # modified events will only have src path, move events will
    # have dest_path and src_path but we only care about dest. So
    # look at dest if it exists else use src.
    path = getattr(event, 'dest_path', None) or event.src_path
    path = os.path.abspath(path)
    callback = self.watcher.file_callbacks.get(path, None)
    if os.path.basename(path).startswith('.'):
        return
    if callback:
        ext_ok = self._extension_check(path, path)
    else:
        # reversed sort for getting the most specific dir first
        for key in reversed(sorted(self.watcher.dir_callbacks.keys())):
            if os.path.commonprefix([path, key]) == key:
                callback = self.watcher.dir_callbacks[key]
                ext_ok = self._extension_check(key, path)
                break
    if not callback or not ext_ok:
        return
    cur_hash = md5_hash_file(path)
    if cur_hash:
        if self.watcher.path_hash.get(path) == cur_hash:
            return
        else:
            time.sleep(0.5)
            if cur_hash != md5_hash_file(path):
                # We're still modifying the file, it'll be picked up later
                return
            self.watcher.path_hash[path] = cur_hash
            callback(path=path)
def check(self):
    """Check for changes in self.paths or self.cache and call the event handler."""
    hashes = {}
    if self.cache:
        self.cache.assert_hashes_initialized()
    # Paths that failed to read once are only dropped if they fail again on the next pass.
    drop_on_next_loop = set()
    drop_now = set()
    while self._active and not self.exit.isSet():
        do_reload = False
        with self._lock:
            paths = list(self.paths.keys())
        for path in paths:
            try:
                if not os.path.exists(path):
                    continue
                mod_time = self.paths[path]
                if not hashes.get(path, None):
                    hash = md5_hash_file(path)
                    if hash:
                        hashes[path] = hash
                    else:
                        continue
                new_mod_time = os.path.getmtime(path)
                # mod_time can be None if a non-required config was just created
                if not mod_time:
                    self.paths[path] = new_mod_time
                    log.debug("The file '%s' has been created.", path)
                    do_reload = True
                elif new_mod_time > mod_time:
                    new_hash = md5_hash_file(path)
                    if hashes[path] != new_hash:
                        self.paths[path] = new_mod_time
                        hashes[path] = new_hash
                        log.debug("The file '%s' has changes.", path)
                        do_reload = True
            except OSError:
                # in rare cases `path` may be deleted between `os.path.exists` calls
                # and reading the file from the filesystem. We do not want the watcher
                # thread to die in these cases.
                if path in drop_now:
                    log.warning("'%s' could not be read, removing from watched files", path)
                    self.paths.pop(path, None)
                    hashes.pop(path, None)
                else:
                    log.debug("'%s' could not be read", path)
                    drop_on_next_loop.add(path)
                if self.cache:
                    self.cache.cleanup()
                    do_reload = True
        if not do_reload and self.cache:
            removed_ids = self.cache.cleanup()
            if removed_ids:
                do_reload = True
        if do_reload:
            self.reload_callback()
        drop_now = drop_on_next_loop
        drop_on_next_loop = set()
        self.exit.wait(1)
def get_input_files(*args):
    temp_dir = tempfile.mkdtemp()
    test_files = []
    try:
        for filename in args:
            shutil.copy(get_test_fname(filename), temp_dir)
            test_files.append(os.path.join(temp_dir, filename))
        md5_sums = [md5_hash_file(f) for f in test_files]
        yield test_files
        new_md5_sums = [md5_hash_file(f) for f in test_files]
        for old_hash, new_hash, f in zip(md5_sums, new_md5_sums, test_files):
            assert old_hash == new_hash, 'Unexpected change of content for file %s' % f
    finally:
        shutil.rmtree(temp_dir, ignore_errors=True)
def check(self):
    """Check for changes in self.paths or self.cache and call the event handler."""
    hashes = {key: None for key in self.paths.keys()}
    while self._active and not self.exit.isSet():
        do_reload = False
        with self._lock:
            paths = list(self.paths.keys())
        for path in paths:
            try:
                if not os.path.exists(path):
                    continue
                mod_time = self.paths[path]
                if not hashes.get(path, None):
                    hash = md5_hash_file(path)
                    if hash:
                        hashes[path] = hash
                    else:
                        continue
                new_mod_time = os.path.getmtime(path)
                # mod_time can be None if a non-required config was just created
                if not mod_time:
                    self.paths[path] = new_mod_time
                    log.debug("The file '%s' has been created.", path)
                    do_reload = True
                elif new_mod_time > mod_time:
                    new_hash = md5_hash_file(path)
                    if hashes[path] != new_hash:
                        self.paths[path] = new_mod_time
                        hashes[path] = new_hash
                        log.debug("The file '%s' has changes.", path)
                        do_reload = True
            except IOError:
                # in rare cases `path` may be deleted between `os.path.exists` calls
                # and reading the file from the filesystem. We do not want the watcher
                # thread to die in these cases, so stop tracking the unreadable path.
                hashes.pop(path, None)
                self.paths.pop(path, None)
                if self.cache:
                    self.cache.cleanup()
                    do_reload = True
        if not do_reload and self.cache:
            removed_ids = self.cache.cleanup()
            if removed_ids:
                do_reload = True
        if do_reload:
            self.reload_callback()
        self.exit.wait(1)
def cache_tool(self, config_filename, tool):
    tool_hash = md5_hash_file(config_filename)
    tool_id = str(tool.id)
    self._hash_by_tool_paths[config_filename] = tool_hash
    self._mod_time_by_path[config_filename] = os.path.getmtime(config_filename)
    self._tool_paths_by_id[tool_id] = config_filename
    self._tools_by_path[config_filename] = tool
    self._new_tool_ids.add(tool_id)
def check(self):
    """Check for changes in self.paths or self.cache and call the event handler."""
    hashes = {key: None for key in self.paths.keys()}
    while self._active:
        do_reload = False
        with self._lock:
            paths = list(self.paths.keys())
        for path in paths:
            try:
                if not os.path.exists(path):
                    continue
                mod_time = self.paths[path]
                if not hashes.get(path, None):
                    hash = md5_hash_file(path)
                    if hash:
                        hashes[path] = hash
                    else:
                        continue
                new_mod_time = os.path.getmtime(path)
                if new_mod_time > mod_time:
                    new_hash = md5_hash_file(path)
                    if hashes[path] != new_hash:
                        self.paths[path] = new_mod_time
                        hashes[path] = new_hash
                        log.debug("The file '%s' has changes.", path)
                        do_reload = True
            except IOError:
                # in rare cases `path` may be deleted between `os.path.exists` calls
                # and reading the file from the filesystem. We do not want the watcher
                # thread to die in these cases, so stop tracking the unreadable path.
                hashes.pop(path, None)
                self.paths.pop(path, None)
                if self.cache:
                    self.cache.cleanup()
                    do_reload = True
        if not do_reload and self.cache:
            removed_ids = self.cache.cleanup()
            if removed_ids:
                do_reload = True
        if do_reload:
            self.reload_callback()
        time.sleep(1)
def _should_cleanup(self, config_filename):
    """Return True if `config_filename` no longer exists or if its modification time and hash have changed, else return False."""
    if not os.path.exists(config_filename):
        return True
    new_mtime = os.path.getmtime(config_filename)
    if self._mod_time_by_path.get(config_filename) < new_mtime:
        if md5_hash_file(config_filename) != self._hash_by_tool_paths.get(config_filename):
            return True
    return False
def _handle(self, event):
    # modified events will only have src path, move events will
    # have dest_path and src_path but we only care about dest. So
    # look at dest if it exists else use src.
    path = getattr(event, 'dest_path', None) or event.src_path
    path = os.path.abspath(path)
    if path.endswith(".loc"):
        cur_hash = md5_hash_file(path)
        if cur_hash:
            if self.loc_watcher.path_hash.get(path) == cur_hash:
                return
            else:
                time.sleep(0.5)
                if cur_hash != md5_hash_file(path):
                    # We're still modifying the file, it'll be picked up later
                    return
                self.loc_watcher.path_hash[path] = cur_hash
                self.loc_watcher.tool_data_tables.reload_tables(path=path)
def _handle(self, event):
    # modified events will only have src path, move events will
    # have dest_path and src_path but we only care about dest. So
    # look at dest if it exists else use src.
    path = getattr(event, 'dest_path', None) or event.src_path
    path = os.path.abspath(path)
    if path.endswith(".loc"):
        cur_hash = md5_hash_file(path)
        if self.loc_watcher.path_hash.get(path) == cur_hash:
            return
        else:
            self.loc_watcher.path_hash[path] = cur_hash
            self.loc_watcher.tool_data_tables.reload_tables(path=path)
def _should_cleanup(self, config_filename):
    """Return True if `config_filename` no longer exists or if its modification time and hash have changed, else return False."""
    if not os.path.exists(config_filename):
        return True
    new_mtime = os.path.getmtime(config_filename)
    tool_hash = self._hash_by_tool_paths.get(config_filename)
    if tool_hash.modtime < new_mtime:
        if md5_hash_file(config_filename) != tool_hash.hash:
            return True
    tool = self._tools_by_path[config_filename]
    for macro_path in tool._macro_paths:
        new_mtime = os.path.getmtime(macro_path)
        if self._hash_by_tool_paths.get(macro_path).modtime < new_mtime:
            return True
    return False
def cache_tool(self, config_filename, tool):
    tool_hash = md5_hash_file(config_filename)
    if tool_hash is None:
        return
    tool_id = str(tool.id)
    with self._lock:
        self._hash_by_tool_paths[config_filename] = tool_hash
        self._mod_time_by_path[config_filename] = os.path.getmtime(config_filename)
        self._tool_paths_by_id[tool_id] = config_filename
        self._tools_by_path[config_filename] = tool
        self._new_tool_ids.add(tool_id)
        for macro_path in tool._macro_paths:
            self._mod_time_by_path[macro_path] = os.path.getmtime(macro_path)
            if tool_id not in self._macro_paths_by_id:
                self._macro_paths_by_id[tool_id] = {macro_path}
            else:
                self._macro_paths_by_id[tool_id].add(macro_path)
def cache_tool(self, config_filename, tool):
    tool_hash = md5_hash_file(config_filename)
    if tool_hash is None:
        return
    tool_id = str(tool.id)
    self._hash_by_tool_paths[config_filename] = tool_hash
    self._mod_time_by_path[config_filename] = os.path.getmtime(config_filename)
    self._tool_paths_by_id[tool_id] = config_filename
    self._tools_by_path[config_filename] = tool
    self._new_tool_ids.add(tool_id)
    for macro_path in tool._macro_paths:
        self._mod_time_by_path[macro_path] = os.path.getmtime(macro_path)
        if tool_id not in self._macro_paths_by_id:
            self._macro_paths_by_id[tool_id] = {macro_path}
        else:
            self._macro_paths_by_id[tool_id].add(macro_path)
        if macro_path not in self._tool_ids_by_macro_paths:
            self._tool_ids_by_macro_paths[macro_path] = {tool_id}
        else:
            self._tool_ids_by_macro_paths[macro_path].add(tool_id)
def hash(self):
    if self._tool_hash is None:
        self._tool_hash = md5_hash_file(self.path)
    return self._tool_hash
def upload_datatype_helper(instance, test_data, temp_file, delete_cache_dir=False):
    is_compressed = False
    for is_method in (is_bz2, is_gzip, is_zip):
        is_compressed = is_method(test_data.path)
        if is_compressed:
            break
    with open(test_data.path, 'rb') as content:
        if hasattr(test_data.datatype, 'sniff') or 'false' in test_data.path:
            file_type = 'auto'
        else:
            file_type = test_data.datatype.file_ext
        dataset = instance.dataset_populator.new_dataset(instance.history_id, content=content, wait=False, file_type=file_type)
    dataset = instance.dataset_populator.get_history_dataset_details(instance.history_id, dataset=dataset, assert_ok=False)
    expected_file_ext = test_data.datatype.file_ext
    # State might be error if the datatype can't be uploaded
    if dataset['state'] == 'error' and not test_data.uploadable:
        # Some things can't be uploaded; if attempting to upload these datasets we explain why
        assert 'invalid' in dataset['misc_info'] or 'unsupported' in dataset['misc_info']
        return
    elif dataset['state'] == 'error' and 'empty' in dataset['misc_info']:
        return
    else:
        # state should be OK
        assert dataset['state'] == 'ok'
    # Check that the correct datatype has been detected
    file_ext = dataset['file_ext']
    if 'false' in test_data.path:
        # datasets with "false" in their name are not of a specific datatype
        assert file_ext != PARENT_SNIFFER_MAP.get(expected_file_ext, expected_file_ext)
    else:
        assert file_ext == PARENT_SNIFFER_MAP.get(expected_file_ext, expected_file_ext)
    datatype = registry.datatypes_by_extension[file_ext]
    datatype_compressed = getattr(datatype, "compressed", False)
    if not is_compressed or datatype_compressed:
        if delete_cache_dir:
            # Delete the cache directory and then re-create it. This way we confirm
            # that the dataset is fetched from the object store, not from the cache
            temp_dir = instance.get_object_store_kwargs()['temp_directory']
            cache_dir = temp_dir + '/object_store_cache'
            shutil.rmtree(cache_dir)
            os.mkdir(cache_dir)
        # download the file and verify it hasn't been manipulated
        temp_file.write(instance.dataset_populator.get_history_dataset_content(history_id=instance.history_id, dataset=dataset, type='bytes', assert_ok=False, raw=True))
        temp_file.flush()
        assert md5_hash_file(test_data.path) == md5_hash_file(temp_file.name)
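# All of the snippets above depend on a `md5_hash_file(path)` helper that is not defined in
# this section. The sketch below is a minimal, assumed implementation inferred from how the
# callers use it (a stable hex digest of the file's contents, or None when the file cannot be
# read or hashed); it illustrates the expected contract and is not the project's actual code.
import hashlib


def md5_hash_file(path):
    """Return the hex MD5 digest of the file at `path`, or None if the file cannot be read."""
    try:
        md5 = hashlib.md5()
        with open(path, 'rb') as handle:
            # Read in fixed-size chunks so large files do not have to fit in memory.
            for chunk in iter(lambda: handle.read(1024 * 1024), b''):
                md5.update(chunk)
        return md5.hexdigest()
    except (IOError, OSError):
        # Callers above treat a missing or unreadable file as "no hash available".
        return None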