def __init__(self):
    """Initialize the replay creator worker.

    Builds the output filestore, the working directory and the replay client
    (direct database access or API) from the replay configuration.

    NOTE(review): when neither alert nor submission input is enabled this
    returns early and leaves the worker otherwise unconfigured.
    """
    super().__init__("assemblyline.replay_creator.worker")

    # Nothing for this worker to do if both inputs are disabled
    if not self.replay_config.creator.alert_input.enabled and \
            not self.replay_config.creator.submission_input.enabled:
        return

    # Initialize filestore object
    self.filestore = FileStore(self.replay_config.creator.output_filestore)

    # Create cache directory
    os.makedirs(self.replay_config.creator.working_directory, exist_ok=True)

    # Load client
    client_config = dict(
        lookback_time=self.replay_config.creator.lookback_time,
        alert_fqs=self.replay_config.creator.alert_input.filter_queries,
        submission_fqs=self.replay_config.creator.submission_input.filter_queries)

    if self.replay_config.creator.client.type == 'direct':
        self.log.info("Using direct database access client")
        self.client = DirectClient(self.log, **client_config)
    elif self.replay_config.creator.client.type == 'api':
        self.log.info(f"Using API access client to ({self.replay_config.creator.client.options.host})")
        # The API client also needs the connection options (host, credentials, ...)
        client_config.update(self.replay_config.creator.client.options.as_primitives())
        self.client = APIClient(self.log, **client_config)
    else:
        raise ValueError(f'Invalid client type ({self.replay_config.creator.client.type}). '
                         'Must be either \'api\' or \'direct\'.')
def test_https():
    """
    Test HTTPS FileStore by fetching the assemblyline-base repository page
    from the CCCS (CybercentreCanada) organization on GitHub.

    (Docstring fixed: the previous text claimed this hit CSE's cyber centre
    page, but the store actually points at github.com.)
    """
    fs = FileStore('https://github.com/CybercentreCanada/')
    assert fs.exists('assemblyline-base') != []
    assert fs.get('assemblyline-base') is not None
def test_https():
    """
    Exercise the HTTPS FileStore against CSE's cyber centre site by looking
    up the 'assemblyline' page and downloading it.
    """
    store = FileStore('https://cyber.gc.ca/en/')

    assert store.exists('assemblyline') != []
    assert store.get('assemblyline') is not None
def test_s3():
    """
    Exercise the Amazon S3 FileStore by fetching a known test file from the
    assemblyline-support bucket.

    NOTE(review): the access/secret key pair embedded in the URL appears to be
    deliberately committed read-only test credentials — confirm they are not
    sensitive.
    """
    store = FileStore('s3://AKIAIIESFCKMSXUP6KWQ:Uud08qLQ48Cbo9RB7b+H+M97aA2wdR8OXaHXIKwL@'
                      's3.amazonaws.com/?s3_bucket=assemblyline-support&aws_region=us-east-1')
    key = 'al4_s3_pytest.txt'

    assert store.exists(key) != []
    assert store.get(key) is not None
def test_file():
    """
    Test the local FileStore by fetching this very test file from its own
    directory.

    (Docstring fixed: the previous text claimed README.md was fetched from the
    repo root, but the code looks up os.path.basename(__file__) in
    os.path.dirname(__file__), so it is directory-independent.)
    """
    fs = FileStore('file://%s' % os.path.dirname(__file__))
    assert fs.exists(os.path.basename(__file__)) != []
    assert fs.get(os.path.basename(__file__)) is not None
def __init__(self, component: str, config=None, datastore=None):
    """Create a cache store scoped to `component`.

    :param component: name used to namespace all cache keys (letters, numbers,
        underscores and dots only)
    :param config: optional pre-loaded configuration; loaded via forge if None
    :param datastore: optional datastore handle; created via forge if None
    :raises ValueError: on an empty or invalid component name
    """
    if not component:
        raise ValueError("Cannot instantiate a cachestore without providing a component name.")
    if not COMPONENT_VALIDATOR.match(component):
        raise ValueError("Invalid component name. (Only letters, numbers, underscores and dots allowed)")

    cfg = forge.get_config() if config is None else config

    self.component = component
    self.datastore = datastore or forge.get_datastore(config=cfg)
    self.filestore = FileStore(*cfg.filestore.cache)
def test_file():
    """
    Test the local FileStore in two ways: first by fetching this very test
    file from its own directory, then by running the shared common_actions
    suite against a store rooted in a temporary directory.

    (Docstring fixed: the previous text claimed README.md was fetched from the
    repo root, but the code looks up os.path.basename(__file__), so it is
    directory-independent.)
    """
    fs = FileStore('file://%s' % os.path.dirname(__file__))
    assert fs.exists(os.path.basename(__file__)) != []
    assert fs.get(os.path.basename(__file__)) is not None

    with tempfile.TemporaryDirectory() as temp_dir:
        with FileStore('file://' + temp_dir) as fs:
            common_actions(fs)
def __init__(self, force_ilm=False):
    """Set up datastores, filestores, metrics and APM for the expiry manager.

    :param force_ilm: when True, force-enable ILM archiving regardless of the
        loaded configuration.
    """
    self.config = forge.get_config()
    if force_ilm:
        self.config.datastore.ilm.enabled = True

    # Shutdown timeout leaves room for one full sleep cycle plus slack
    super().__init__('assemblyline.expiry', shutdown_timeout=self.config.core.expiry.sleep_time + 5)
    # Two datastore handles: one that can reach the archive, one hot-only
    self.datastore = forge.get_datastore(config=self.config, archive_access=True)
    self.hot_datastore = forge.get_datastore(config=self.config, archive_access=False)
    self.filestore = forge.get_filestore(config=self.config)
    self.cachestore = FileStore(*self.config.filestore.cache)
    self.expirable_collections = []
    self.archiveable_collections = []
    self.counter = MetricsFactory('expiry', Metrics)
    self.counter_archive = MetricsFactory('archive', Metrics)

    # Collection name -> file deletion callback; archive-aware variants are
    # installed when ILM is enabled so hot entries are not deleted early.
    if self.config.datastore.ilm.enabled:
        self.fs_hashmap = {
            'file': self.archive_filestore_delete,
            'cached_file': self.archive_cachestore_delete
        }
    else:
        self.fs_hashmap = {
            'file': self.filestore_delete,
            'cached_file': self.cachestore_delete
        }

    # Models with archive_ts can be archived; those with expiry_ts can expire
    for name, definition in self.datastore.ds.get_models().items():
        if hasattr(definition, 'archive_ts'):
            self.archiveable_collections.append(getattr(self.datastore, name))
        if hasattr(definition, 'expiry_ts'):
            self.expirable_collections.append(getattr(self.datastore, name))

    if self.config.core.metrics.apm_server.server_url is not None:
        self.log.info(f"Exporting application metrics to: {self.config.core.metrics.apm_server.server_url}")
        elasticapm.instrument()
        self.apm_client = elasticapm.Client(
            server_url=self.config.core.metrics.apm_server.server_url,
            service_name="expiry")
    else:
        self.apm_client = None
def test_ftps(temp_ftps_server):
    """Run the shared FileStore operations against an in-process FTPS server."""
    with FileStore(f'ftps://{temp_ftps_server}') as store:
        assert 'localhost' in str(store)
        common_actions(store)
def test_https():
    """Run the shared httpx checks against an HTTPS FileStore pointed at GitHub."""
    store = FileStore('https://github.com/CybercentreCanada/')

    assert 'github.com' in str(store)
    httpx_tests(store)
def test_minio():
    """
    Round-trip a small payload through a local Minio FileStore: clear any
    leftover object, push content, check existence, read it back, clean up.
    """
    content = b"THIS IS A MINIO TEST"
    key = 'al4_minio_pytest.txt'

    store = FileStore('s3://al_storage_key:Ch@ngeTh!sPa33w0rd@localhost:9000/?s3_bucket=test&use_ssl=False')
    assert store.delete(key) is None
    assert store.put(key, content) != []
    assert store.exists(key) != []
    assert store.get(key) == content
    assert store.delete(key) is None
def test_azure():
    """
    Exercise the Azure FileStore against our public storage blob: reads must
    succeed, while a write is expected to raise a TransportException since the
    blob is read-only to us.
    """
    store = FileStore("azure://alpytest.blob.core.windows.net/pytest/", connection_attempts=2)

    assert store.exists('test') != []
    assert store.get('test') is not None

    with pytest.raises(TransportException):
        store.put('bob', 'bob')
class CacheStore(object):
    """Expiring, component-scoped cache backed by a FileStore.

    Blob data is written to the cache FileStore while expiry metadata is
    tracked in the datastore's cached_file collection. All keys are prefixed
    with the component name so multiple components can share the store.
    """

    def __init__(self, component: str, config=None, datastore=None):
        """Create a cache store scoped to `component`.

        :raises ValueError: on an empty or invalid component name
        """
        if not component:
            # FIX: corrected typo "instanciate" -> "instantiate" (matches the
            # message used by the other CacheStore implementation).
            raise ValueError("Cannot instantiate a cachestore without providing a component name.")
        if not COMPONENT_VALIDATOR.match(component):
            raise ValueError("Invalid component name. (Only letters, numbers, underscores and dots allowed)")

        if config is None:
            config = forge.get_config()

        self.component = component
        # FIX: pass the resolved config to the datastore factory so the
        # datastore and the filestore are built from the same configuration
        # (previously the config argument was silently ignored here).
        self.datastore = datastore or forge.get_datastore(config=config)
        self.filestore = FileStore(*config.filestore.cache)

    def __enter__(self):
        return self

    def __exit__(self, ex_type, exc_val, exc_tb):
        # Only the filestore holds external connections that need closing
        self.filestore.close()

    def save(self, cache_key, data, ttl=DEFAULT_CACHE_LEN):
        """Store `data` under `cache_key` with a time-to-live of `ttl` seconds.

        :raises ValueError: if the cache key contains invalid characters
        """
        if not COMPONENT_VALIDATOR.match(cache_key):
            raise ValueError("Invalid cache_key for cache item. "
                             "(Only letters, numbers, underscores and dots allowed)")

        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        # Record the expiry entry first, then write the blob itself
        self.datastore.cached_file.save(new_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})
        self.filestore.put(new_key, data)

    def get(self, cache_key):
        """Return the cached bytes for `cache_key` (component-prefixed)."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key
        return self.filestore.get(new_key)

    def delete(self, cache_key, db_delete=True):
        """Remove the cached blob and, unless `db_delete` is False, its expiry entry."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key
        self.filestore.delete(new_key)
        if db_delete:
            self.datastore.cached_file.delete(new_key)
class CacheStore(object):
    """Expiring, component-scoped cache backed by a FileStore.

    Blob data is written to the cache FileStore while expiry metadata is
    tracked in the datastore's cached_file collection. All keys are prefixed
    with the component name so multiple components can share the store.
    """

    def __init__(self, component: str, config=None, datastore=None):
        """Create a cache store scoped to `component`.

        :param component: key namespace (letters, numbers, underscores, dots)
        :param config: optional pre-loaded configuration; loaded via forge if None
        :param datastore: optional datastore handle; created via forge if None
        :raises ValueError: on an empty or invalid component name
        """
        if not component:
            raise ValueError("Cannot instantiate a cachestore without providing a component name.")
        if not COMPONENT_VALIDATOR.match(component):
            raise ValueError("Invalid component name. (Only letters, numbers, underscores and dots allowed)")

        if config is None:
            config = forge.get_config()

        self.component = component
        self.datastore = datastore or forge.get_datastore(config=config)
        self.filestore = FileStore(*config.filestore.cache)

    def __enter__(self) -> 'CacheStore':
        return self

    def __exit__(self, ex_type, exc_val, exc_tb):
        # Only the filestore holds external connections that need closing
        self.filestore.close()

    def save(self, cache_key: str, data: AnyStr, ttl=DEFAULT_CACHE_LEN, force=False):
        """Store in-memory `data` under `cache_key` with a `ttl`-second expiry.

        :raises ValueError: if the cache key contains invalid characters
        """
        if not COMPONENT_VALIDATOR.match(cache_key):
            raise ValueError("Invalid cache_key for cache item. "
                             "(Only letters, numbers, underscores and dots allowed)")

        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        # Record the expiry entry first, then write the blob itself
        self.datastore.cached_file.save(new_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})
        self.filestore.put(new_key, data, force=force)

    def upload(self, cache_key: str, path: str, ttl=DEFAULT_CACHE_LEN):
        """Store the file at `path` under `cache_key` with a `ttl`-second expiry.

        :raises ValueError: if the cache key contains invalid characters
        """
        if not COMPONENT_VALIDATOR.match(cache_key):
            raise ValueError("Invalid cache_key for cache item. "
                             "(Only letters, numbers, underscores and dots allowed)")

        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        self.datastore.cached_file.save(new_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})
        # Uploads always overwrite any existing blob for this key
        self.filestore.upload(new_key, path, force=True)

    def touch(self, cache_key: str, ttl=DEFAULT_CACHE_LEN):
        """Reset the expiry of an existing cache entry to `ttl` seconds from now.

        :raises ValueError: if the cache key contains invalid characters
        :raises KeyError: if the entry does not exist in the filestore
        """
        if not COMPONENT_VALIDATOR.match(cache_key):
            raise ValueError("Invalid cache_key for cache item. "
                             "(Only letters, numbers, underscores and dots allowed)")
        if not self.exists(cache_key):
            raise KeyError(cache_key)

        new_key = f"{self.component}_{cache_key}" if self.component else cache_key
        # Only the metadata record is rewritten; the blob is untouched
        self.datastore.cached_file.save(new_key, {'expiry_ts': now_as_iso(ttl), 'component': self.component})

    def get(self, cache_key: str) -> Optional[bytes]:
        """Return the cached bytes for `cache_key`, or None if missing."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        return self.filestore.get(new_key)

    def download(self, cache_key: str, path: str):
        """Download the cached blob for `cache_key` to the local file `path`."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        return self.filestore.download(new_key, path)

    def exists(self, cache_key: str):
        """Check whether a blob exists in the filestore for `cache_key`."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key
        return self.filestore.exists(new_key)

    def delete(self, cache_key: str, db_delete=True):
        """Remove the cached blob and, unless `db_delete` is False, its expiry entry."""
        new_key = f"{self.component}_{cache_key}" if self.component else cache_key

        self.filestore.delete(new_key)
        if db_delete:
            self.datastore.cached_file.delete(new_key)
class ReplayCreatorWorker(ReplayBase):
    """Worker that turns completed alerts/submissions into replay bundles.

    Bundles are built in a local working directory and then uploaded to the
    configured output filestore for consumption by a replay loader.
    """

    def __init__(self):
        """Configure the output filestore, working directory and replay client.

        NOTE(review): when neither alert nor submission input is enabled this
        returns early and leaves the worker otherwise unconfigured.
        """
        super().__init__("assemblyline.replay_creator.worker")

        if not self.replay_config.creator.alert_input.enabled and \
                not self.replay_config.creator.submission_input.enabled:
            return

        # Initialize filestore object
        self.filestore = FileStore(self.replay_config.creator.output_filestore)

        # Create cache directory
        os.makedirs(self.replay_config.creator.working_directory, exist_ok=True)

        # Load client
        client_config = dict(
            lookback_time=self.replay_config.creator.lookback_time,
            alert_fqs=self.replay_config.creator.alert_input.filter_queries,
            submission_fqs=self.replay_config.creator.submission_input.filter_queries)

        if self.replay_config.creator.client.type == 'direct':
            self.log.info("Using direct database access client")
            self.client = DirectClient(self.log, **client_config)
        elif self.replay_config.creator.client.type == 'api':
            self.log.info(f"Using API access client to ({self.replay_config.creator.client.options.host})")
            # The API client also needs the connection options (host, credentials, ...)
            client_config.update(self.replay_config.creator.client.options.as_primitives())
            self.client = APIClient(self.log, **client_config)
        else:
            raise ValueError(f'Invalid client type ({self.replay_config.creator.client.type}). '
                             'Must be either \'api\' or \'direct\'.')

    def process_alerts(self, once=False):
        """Poll for ready alerts, bundle each one and upload it to the filestore.

        :param once: process at most one alert then return (used for testing)
        """
        while self.running:
            # Process alerts found
            alert = self.client.get_next_alert()
            if alert:
                self.log.info(f"Processing alert: {alert['alert_id']}")

                # Make sure directories exists
                os.makedirs(self.replay_config.creator.working_directory, exist_ok=True)

                # Create the bundle
                bundle_path = os.path.join(self.replay_config.creator.working_directory,
                                           f"alert_{alert['alert_id']}.al_bundle")
                self.client.create_alert_bundle(alert['alert_id'], bundle_path)

                # Move the bundle
                # NOTE(review): assumes FileStore.upload(src_path, dst_path) — confirm argument order
                self.filestore.upload(bundle_path, f"alert_{alert['alert_id']}.al_bundle")

                # Remove temp file
                if os.path.exists(bundle_path):
                    os.unlink(bundle_path)

                # Set alert state done
                self.client.set_single_alert_complete(alert['alert_id'])

            if once:
                break

    def process_submissions(self, once=False):
        """Poll for ready submissions, bundle each one and upload it to the filestore.

        :param once: process at most one submission then return (used for testing)
        """
        while self.running:
            # Process submissions found
            submission = self.client.get_next_submission()
            if submission:
                self.log.info(f"Processing submission: {submission['sid']}")

                # Make sure directories exists
                os.makedirs(self.replay_config.creator.working_directory, exist_ok=True)

                # Create the bundle
                bundle_path = os.path.join(self.replay_config.creator.working_directory,
                                           f"submission_{submission['sid']}.al_bundle")
                self.client.create_submission_bundle(submission['sid'], bundle_path)

                # Move the bundle
                self.filestore.upload(bundle_path, f"submission_{submission['sid']}.al_bundle")

                # Remove temp file
                if os.path.exists(bundle_path):
                    os.unlink(bundle_path)

                # Set submission state done
                self.client.set_single_submission_complete(submission['sid'])

            if once:
                break

    def try_run(self):
        """Spawn the configured number of alert/submission threads, or stop if none."""
        threads = {}
        if self.replay_config.creator.alert_input.enabled:
            for ii in range(self.replay_config.creator.alert_input.threads):
                threads[f'Alert process thread #{ii}'] = self.process_alerts

        if self.replay_config.creator.submission_input.enabled:
            for ii in range(self.replay_config.creator.submission_input.threads):
                threads[f'Submission process thread #{ii}'] = self.process_submissions

        if threads:
            self.maintain_threads(threads)
        else:
            # No inputs were enabled: nothing to maintain, shut down cleanly
            self.log.warning("There are no configured input, terminating")
            self.main_loop_exit.set()
            self.stop()
class ExpiryManager(ServerBase):
    """Server that deletes expired documents and archives aging ones.

    Runs two periodic passes: run_expiry_once() deletes documents (and
    optionally their associated files) whose expiry_ts has passed, and
    run_archive_once() moves documents whose archive_ts has passed into the
    archive when ILM is enabled.
    """

    def __init__(self, force_ilm=False):
        """Set up datastores, filestores, metrics and APM.

        :param force_ilm: when True, force-enable ILM archiving regardless of
            the loaded configuration.
        """
        self.config = forge.get_config()
        if force_ilm:
            self.config.datastore.ilm.enabled = True

        # Shutdown timeout leaves room for one full sleep cycle plus slack
        super().__init__('assemblyline.expiry', shutdown_timeout=self.config.core.expiry.sleep_time + 5)
        # Two datastore handles: one that can reach the archive, one hot-only
        self.datastore = forge.get_datastore(config=self.config, archive_access=True)
        self.hot_datastore = forge.get_datastore(config=self.config, archive_access=False)
        self.filestore = forge.get_filestore(config=self.config)
        self.cachestore = FileStore(*self.config.filestore.cache)
        self.expirable_collections = []
        self.archiveable_collections = []
        self.counter = MetricsFactory('expiry', Metrics)
        self.counter_archive = MetricsFactory('archive', Metrics)

        # Collection name -> file deletion callback; archive-aware variants
        # are installed when ILM is enabled so hot entries are not deleted early
        if self.config.datastore.ilm.enabled:
            self.fs_hashmap = {
                'file': self.archive_filestore_delete,
                'cached_file': self.archive_cachestore_delete
            }
        else:
            self.fs_hashmap = {
                'file': self.filestore_delete,
                'cached_file': self.cachestore_delete
            }

        # Models with archive_ts can be archived; those with expiry_ts can expire
        for name, definition in self.datastore.ds.get_models().items():
            if hasattr(definition, 'archive_ts'):
                self.archiveable_collections.append(getattr(self.datastore, name))
            if hasattr(definition, 'expiry_ts'):
                self.expirable_collections.append(getattr(self.datastore, name))

        if self.config.core.metrics.apm_server.server_url is not None:
            self.log.info(f"Exporting application metrics to: {self.config.core.metrics.apm_server.server_url}")
            elasticapm.instrument()
            self.apm_client = elasticapm.Client(
                server_url=self.config.core.metrics.apm_server.server_url,
                service_name="expiry")
        else:
            self.apm_client = None

    def stop(self):
        """Stop metrics export and APM instrumentation, then the base server."""
        if self.counter:
            self.counter.stop()

        if self.apm_client:
            elasticapm.uninstrument()
        super().stop()

    def filestore_delete(self, sha256, _):
        # Plain (non-ILM) deletion: second argument (expiry time) is unused
        self.filestore.delete(sha256)

    def archive_filestore_delete(self, sha256, expiry_time):
        # If we are working with an archive, there may be a hot entry that expires later.
        doc = self.hot_datastore.file.get_if_exists(sha256, as_obj=False)
        if doc and doc['expiry_ts'] > expiry_time:
            return
        self.filestore.delete(sha256)

    def cachestore_delete(self, sha256, _):
        # NOTE(review): this deletes from self.filestore while the archive
        # variant below uses self.cachestore — confirm this is intended
        self.filestore.delete(sha256)

    def archive_cachestore_delete(self, sha256, expiry_time):
        # Same hot-entry guard as archive_filestore_delete, for cached files
        doc = self.hot_datastore.cached_file.get_if_exists(sha256, as_obj=False)
        if doc and doc['expiry_ts'] > expiry_time:
            return
        self.cachestore.delete(sha256)

    def run_expiry_once(self):
        """Run one expiry pass over all expirable collections.

        :return: True when any collection hit the EXPIRY_SIZE cap, meaning more
            work is likely pending and the caller should not sleep.
        """
        now = now_as_iso()
        reached_max = False

        # Expire data
        for collection in self.expirable_collections:
            # Call heartbeat pre-dated by 5 minutes. If a collection takes more than
            # 5 minutes to expire, this container could be seen as unhealthy. The down
            # side is if it is stuck on something it will be more than 5 minutes before
            # the container is restarted.
            self.heartbeat(int(time.time() + 5 * 60))

            # Start of expiry transaction
            if self.apm_client:
                self.apm_client.begin_transaction("Delete expired documents")

            # Expiry cutoff is `now` minus the configured delay; batch mode
            # rounds down to the day boundary ("/d") to delete in larger chunks
            if self.config.core.expiry.batch_delete:
                computed_date = epoch_to_iso(dm(f"{now}||-{self.config.core.expiry.delay}h/d").float_timestamp)
            else:
                computed_date = epoch_to_iso(dm(f"{now}||-{self.config.core.expiry.delay}h").float_timestamp)

            delete_query = f"expiry_ts:[* TO {computed_date}]"

            # Only sort (needed for stable file deletion) when we also delete storage
            if self.config.core.expiry.delete_storage and collection.name in self.fs_hashmap:
                file_delete = True
                sort = ["expiry_ts asc", "id asc"]
            else:
                file_delete = False
                sort = None

            number_to_delete = collection.search(
                delete_query, rows=0, as_obj=False, use_archive=True,
                sort=sort, track_total_hits=EXPIRY_SIZE)['total']

            if self.apm_client:
                elasticapm.label(query=delete_query)
                elasticapm.label(number_to_delete=number_to_delete)

            self.log.info(f"Processing collection: {collection.name}")
            if number_to_delete != 0:
                if file_delete:
                    with elasticapm.capture_span(name='FILESTORE [ThreadPoolExecutor] :: delete()',
                                                 labels={"num_files": number_to_delete, "query": delete_query}):
                        # Delete associated files
                        with concurrent.futures.ThreadPoolExecutor(
                                self.config.core.expiry.workers,
                                thread_name_prefix="file_delete") as executor:
                            for item in collection.search(delete_query, fl='id', rows=number_to_delete,
                                                          sort=sort, use_archive=True, as_obj=False)['items']:
                                executor.submit(self.fs_hashmap[collection.name], item['id'], computed_date)

                        self.log.info(f' Deleted associated files from the '
                                      f'{"cachestore" if "cache" in collection.name else "filestore"}...')

                    # Proceed with deletion
                    collection.delete_by_query(delete_query, workers=self.config.core.expiry.workers,
                                               sort=sort, max_docs=number_to_delete)
                else:
                    # Proceed with deletion
                    collection.delete_by_query(delete_query, workers=self.config.core.expiry.workers)

                # Hitting the cap means there is probably more to delete
                if number_to_delete == EXPIRY_SIZE:
                    reached_max = True

                self.counter.increment(f'{collection.name}', increment_by=number_to_delete)

                self.log.info(f" Deleted {number_to_delete} items from the datastore...")
            else:
                self.log.debug(" Nothing to delete in this collection.")

            # End of expiry transaction
            if self.apm_client:
                self.apm_client.end_transaction(collection.name, 'deleted')

        return reached_max

    def run_archive_once(self):
        """Run one archive pass over all archiveable collections (ILM only).

        :return: True when any collection hit the ARCHIVE_SIZE cap, meaning
            more work is likely pending and the caller should not sleep.
        """
        reached_max = False
        if not self.config.datastore.ilm.enabled:
            return reached_max

        now = now_as_iso()
        # Archive data
        for collection in self.archiveable_collections:
            # Call heartbeat pre-dated by 5 minutes. If a collection takes more than
            # 5 minutes to expire, this container could be seen as unhealthy. The down
            # side is if it is stuck on something it will be more than 5 minutes before
            # the container is restarted.
            self.heartbeat(int(time.time() + 5 * 60))

            # Start of expiry transaction
            if self.apm_client:
                self.apm_client.begin_transaction("Archive older documents")

            archive_query = f"archive_ts:[* TO {now}]"
            sort = ["archive_ts asc", "id asc"]

            number_to_archive = collection.search(archive_query, rows=0, as_obj=False, use_archive=False,
                                                  sort=sort, track_total_hits=ARCHIVE_SIZE)['total']

            if number_to_archive == ARCHIVE_SIZE:
                reached_max = True

            if self.apm_client:
                elasticapm.label(query=archive_query)
                elasticapm.label(number_to_archive=number_to_archive)

            self.log.info(f"Processing collection: {collection.name}")
            if number_to_archive != 0:
                # Proceed with archiving
                if collection.archive(archive_query, max_docs=number_to_archive, sort=sort):
                    self.counter_archive.increment(f'{collection.name}', increment_by=number_to_archive)
                    self.log.info(f" Archived {number_to_archive} documents...")
                else:
                    self.log.warning(f" Failed to properly archive {number_to_archive} documents...")
            else:
                self.log.debug(" Nothing to archive in this collection.")

            # End of expiry transaction
            if self.apm_client:
                self.apm_client.end_transaction(collection.name, 'archived')

        return reached_max

    def try_run(self):
        """Main loop: alternate expiry and archive passes, sleeping when idle.

        Each pass is isolated so a failure in one does not stop the other; the
        loop only sleeps when neither pass hit its work cap.
        """
        while self.running:
            expiry_maxed_out = False
            archive_maxed_out = False

            try:
                expiry_maxed_out = self.run_expiry_once()
            except Exception as e:
                self.log.exception(str(e))

            try:
                archive_maxed_out = self.run_archive_once()
            except Exception as e:
                self.log.exception(str(e))

            if not expiry_maxed_out and not archive_maxed_out:
                self.sleep_with_heartbeat(self.config.core.expiry.sleep_time)
def get_filestore(config=None, connection_attempts=None):
    """Build the main FileStore from `config.filestore.storage`.

    Falls back to the globally loaded configuration when `config` is None.

    :param config: optional pre-loaded configuration object
    :param connection_attempts: passed through to the FileStore constructor
    :return: a connected FileStore instance
    """
    from assemblyline.filestore import FileStore

    cfg = get_config() if config is None else config
    return FileStore(*cfg.filestore.storage, connection_attempts=connection_attempts)