def get_multiple_results(self, keys, cl_engine=forge.get_classification(), as_obj=False):
    # Keys ending in ".e" denote empty results: synthesize them instead of fetching
    results = {k: self.create_empty_result_from_key(k, cl_engine, as_obj=as_obj)
               for k in keys if k.endswith(".e")}
    keys = [k for k in keys if not k.endswith(".e")]
    try:
        results.update(self.result.multiget(keys, as_dictionary=True, as_obj=as_obj))
    except MultiKeyError as e:
        log.warning(f"Trying to get multiple results but some are missing: {str(e.keys)}")
        results.update(e.partial_output)
    return results
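# Hypothetical standalone sketch (not part of this module) of the partial-fetch
# pattern relied on above: a multiget that raises with whatever it did find, so
# callers can degrade gracefully. The error shape mirrors the MultiKeyError caught
# above; the plain dict stands in for the datastore collection.
class _MultiKeyError(Exception):
    def __init__(self, keys, partial_output):
        super().__init__(f"Missing keys: {keys}")
        self.keys = keys
        self.partial_output = partial_output

def _mock_multiget(store, keys):
    missing = [k for k in keys if k not in store]
    found = {k: store[k] for k in keys if k in store}
    if missing:
        # Hand the partial results back along with the list of missing keys
        raise _MultiKeyError(missing, found)
    return found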
def _ensure_collection(self):
    # Create HOT index
    if not self.with_retries(self.datastore.client.indices.exists, self.name):
        log.debug(f"Index {self.name.upper()} does not exist. Creating it now...")
        try:
            self.with_retries(self.datastore.client.indices.create, self.name, self._get_index_definition())
        except elasticsearch.exceptions.RequestError as e:
            if "resource_already_exists_exception" not in str(e):
                raise
            log.warning(f"Tried to create an index that already exists: {self.name.upper()}")

    if self.ilm_config:
        # Create ILM policy
        while not self._ilm_policy_exists():
            try:
                self.with_retries(self._create_ilm_policy)
            except ILMException:
                time.sleep(0.1)

        # Create WARM index template
        if not self.with_retries(self.datastore.client.indices.exists_template, self.name):
            log.debug(f"Index template {self.name.upper()} does not exist. Creating it now...")

            index = self._get_index_definition()
            index["index_patterns"] = [f"{self.name}-*"]
            index["order"] = 1
            index["settings"]["index.lifecycle.name"] = f"{self.name}_policy"
            index["settings"]["index.lifecycle.rollover_alias"] = f"{self.name}-archive"

            try:
                self.with_retries(self.datastore.client.indices.put_template, self.name, index)
            except elasticsearch.exceptions.RequestError as e:
                if "resource_already_exists_exception" not in str(e):
                    raise
                log.warning(f"Tried to create an index template that already exists: {self.name.upper()}")

        if not self.with_retries(self.datastore.client.indices.exists_alias, f"{self.name}-archive"):
            log.debug(f"Index alias {self.name.upper()}-archive does not exist. Creating it now...")

            index = {"aliases": {f"{self.name}-archive": {"is_write_index": True}}}

            try:
                self.with_retries(self.datastore.client.indices.create, f"{self.name}-000001", index)
            except elasticsearch.exceptions.RequestError as e:
                if "resource_already_exists_exception" not in str(e):
                    raise
                log.warning(f"Tried to create an index that already exists: {self.name.upper()}-000001")

    self._check_fields()
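# A minimal sketch of the create-if-missing pattern used throughout _ensure_collection(),
# with the race tolerated: two workers may both see the resource as missing, so the
# loser of the race treats "already exists" as success. The client and exception below
# are illustrative stand-ins, not the elasticsearch-py API.
def _ensure_exists(client, name):
    if not client.exists(name):
        try:
            client.create(name)
        except RuntimeError as e:  # stand-in for elasticsearch RequestError
            if "resource_already_exists_exception" not in str(e):
                raise
            # Another worker created it first; nothing more to do.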
def multi_index_bulk(self, bulk_plans):
    max_retry_backoff = 10
    retries = 0
    while True:
        try:
            plan = "\n".join([p.get_plan_data() for p in bulk_plans])
            ret_val = self.ds.client.bulk(body=plan)
            return ret_val
        except (elasticsearch.exceptions.ConnectionError,
                elasticsearch.exceptions.ConnectionTimeout,
                elasticsearch.exceptions.AuthenticationException):
            log.warning(f"No connection to Elasticsearch server(s): "
                        f"{' | '.join(self.ds.get_hosts(safe=True))}"
                        f", retrying...")
            time.sleep(min(retries, max_retry_backoff))
            self.ds.connection_reset()
            retries += 1
        except elasticsearch.exceptions.TransportError as e:
            err_code, msg, cause = e.args
            if err_code in (503, '503'):
                log.warning("Looks like the index is not ready yet, retrying...")
                time.sleep(min(retries, max_retry_backoff))
                self.ds.connection_reset()
                retries += 1
            elif err_code in (429, '429'):
                log.warning("Elasticsearch is too busy to perform the requested task, "
                            "we will wait a bit and retry...")
                time.sleep(min(retries, max_retry_backoff))
                self.ds.connection_reset()
                retries += 1
            else:
                raise
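# Illustrative sketch (assumed, not taken from this module) of what each plan's
# get_plan_data() is expected to produce: Elasticsearch bulk bodies are
# newline-delimited JSON, one action line followed by an optional document line
# per operation. "\n".join(...) over such fragments, as done above, yields a
# valid bulk body.
import json

def _bulk_fragment(index, doc_id, doc):
    action = json.dumps({"index": {"_index": index, "_id": doc_id}})
    return f"{action}\n{json.dumps(doc)}"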
def get_summary_from_keys(self, keys, cl_engine=forge.get_classification(), user_classification=None):
    out = {
        "tags": [],
        "attack_matrix": [],
        "heuristics": {
            "info": [],
            "suspicious": [],
            "malicious": []
        },
        "classification": cl_engine.UNRESTRICTED,
        "filtered": False
    }
    done_map = {"heuristics": set(), "attack": set(), "tags": set()}

    if len(keys) == 0:
        return out

    keys = [x for x in list(keys) if not x.endswith(".e")]
    file_keys = list(set([x[:64] for x in keys]))

    try:
        items = self.result.multiget(keys, as_obj=False)
    except MultiKeyError as e:
        # Generate partial summaries even if results are missing
        log.warning(f"Trying to generate summary but we are missing result(s): {str(e.keys)}")
        items = e.partial_output
        out['missing_results'] = e.keys

    try:
        files = self.file.multiget(file_keys, as_obj=False)
    except MultiKeyError as e:
        # Generate partial summaries even if files are missing
        log.warning(f"Trying to generate summary but we are missing file(s): {str(e.keys)}")
        files = e.partial_output
        out['missing_files'] = e.keys

    for key, item in items.items():
        for section in item.get('result', {}).get('sections', []):
            file_classification = files.get(key[:64], {}).get('classification', section['classification'])
            if user_classification:
                if not cl_engine.is_accessible(user_classification, section['classification']):
                    out["filtered"] = True
                    continue
                if not cl_engine.is_accessible(user_classification, file_classification):
                    out["filtered"] = True
                    continue

            out["classification"] = cl_engine.max_classification(out["classification"],
                                                                 section['classification'])
            out["classification"] = cl_engine.max_classification(out["classification"],
                                                                 file_classification)

            h_type = "info"

            if section.get('heuristic', False):
                # Get the heuristics data
                if section['heuristic']['score'] < 100:
                    h_type = "info"
                elif section['heuristic']['score'] < 1000:
                    h_type = "suspicious"
                else:
                    h_type = "malicious"

                cache_key = f"{section['heuristic']['heur_id']}_{key}"
                if cache_key not in done_map['heuristics']:
                    out['heuristics'][h_type].append({
                        'heur_id': section['heuristic']['heur_id'],
                        'name': section['heuristic']['name'],
                        'key': key
                    })
                    done_map['heuristics'].add(cache_key)

                for attack in section['heuristic'].get('attack', []):
                    # Get attack matrix data
                    attack_id = attack['attack_id']

                    cache_key = f"{attack_id}_{key}"
                    if cache_key not in done_map['attack']:
                        out['attack_matrix'].append({
                            "key": key,
                            "attack_id": attack_id,
                            "h_type": h_type,
                            "name": attack['pattern'],
                            "categories": attack['categories']
                        })
                        done_map['attack'].add(cache_key)

            # Get tagging data
            for tag_type, tags in flatten(section.get('tags', {})).items():
                if tags is not None:
                    for tag in tags:
                        cache_key = f"{tag_type}_{tag}_{key}"
                        if cache_key not in done_map['tags']:
                            out['tags'].append({
                                'type': tag_type,
                                'h_type': h_type,
                                'short_type': tag_type.rsplit(".", 1)[-1],
                                'value': tag,
                                'key': key
                            })
                            done_map['tags'].add(cache_key)

    return out
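# A small standalone restatement of the verdict bucketing used above, assuming the
# same thresholds: heuristic scores under 100 are informational, under 1000
# suspicious, and anything at or above 1000 malicious.
def _heuristic_level(score):
    if score < 100:
        return "info"
    if score < 1000:
        return "suspicious"
    return "malicious"

assert _heuristic_level(50) == "info"
assert _heuristic_level(500) == "suspicious"
assert _heuristic_level(5000) == "malicious"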
def get_or_create_file_tree(self, submission, max_depth, cl_engine=forge.get_classification(),
                            user_classification=None):
    if user_classification is not None:
        user_classification = cl_engine.normalize_classification(user_classification, long_format=False)
        cache_key = f"{submission['sid']}_{user_classification}"
        for illegal_char in [" ", ":", "/"]:
            cache_key = cache_key.replace(illegal_char, "")
    else:
        cache_key = submission['sid']

    if isinstance(submission, Model):
        submission = submission.as_primitives()

    num_files = len(list(set([x[:64] for x in submission['results']])))
    max_score = submission['max_score']

    cached_tree = self.submission_tree.get_if_exists(cache_key, as_obj=False)
    if cached_tree:
        tree = json.loads(cached_tree['tree'])
        if self._is_valid_tree(tree, num_files, max_score):
            return {
                "tree": tree,
                "classification": cached_tree['classification'],
                "filtered": cached_tree['filtered'],
                "partial": False
            }

    partial = False
    files = {}
    scores = {}
    missing_files = []
    file_hashes = [x[:64] for x in submission['results']]
    file_hashes.extend([x[:64] for x in submission['errors']])
    file_hashes.extend([f['sha256'] for f in submission['files']])

    try:
        temp_file_data_map = self.file.multiget(list(set(file_hashes)), as_dictionary=True, as_obj=False)
    except MultiKeyError as e:
        log.warning(f"Trying to generate file tree but we are missing file(s): {str(e.keys)}")
        temp_file_data_map = e.partial_output
        missing_files = e.keys
        partial = True

    forbidden_files = set()
    max_classification = cl_engine.UNRESTRICTED
    file_data_map = {}
    for key, value in temp_file_data_map.items():
        if user_classification and not cl_engine.is_accessible(user_classification, value['classification']):
            partial = True
            forbidden_files.add(key)
            continue
        file_data_map[key] = value
        max_classification = cl_engine.max_classification(max_classification, value['classification'])

    try:
        results_data = self.result.multiget([x for x in submission['results'] if not x.endswith(".e")],
                                            as_obj=False)
    except MultiKeyError as e:
        log.warning(f"Trying to generate file tree but we are missing result(s): {str(e.keys)}")
        results_data = e.partial_output
        partial = True

    for key, item in results_data.items():
        sha256 = key[:64]

        # Get scores
        if sha256 not in scores:
            scores[sha256] = 0
        scores[sha256] += item["result"]["score"]

        # Get files
        extracted = item['response']['extracted']
        if len(extracted) == 0:
            continue
        if sha256 not in files:
            files[sha256] = []
        files[sha256].extend(extracted)

    tree_cache = []

    def recurse_tree(child_p, placeholder, parents_p, lvl=0):
        if lvl == max_depth + 1:
            # Enforce depth protection while building the tree
            return

        c_sha256 = child_p['sha256']
        c_name = child_p['name']
        if c_sha256 in placeholder:
            placeholder[c_sha256]['name'].append(c_name)
        else:
            children_list = {}
            truncated = False
            child_list = files.get(c_sha256, [])
            for new_child in child_list:
                if new_child['sha256'] in tree_cache:
                    truncated = True
                    continue
                # Track the child currently being expanded, not the outer loop variable
                tree_cache.append(new_child['sha256'])

                if new_child['sha256'] not in parents_p:
                    recurse_tree(new_child, children_list, parents_p + [c_sha256], lvl + 1)

            try:
                placeholder[c_sha256] = {
                    "name": [c_name],
                    "type": file_data_map[c_sha256]['type'],
                    "sha256": file_data_map[c_sha256]['sha256'],
                    "size": file_data_map[c_sha256]['size'],
                    "children": children_list,
                    "truncated": truncated,
                    "score": scores.get(c_sha256, 0),
                }
            except KeyError:
                if c_sha256 not in forbidden_files and c_sha256 not in missing_files:
                    file_data_map[c_sha256] = self.file.get(c_sha256, as_obj=False)
                    placeholder[c_sha256] = {
                        "name": [c_name],
                        "type": file_data_map[c_sha256]['type'],
                        "sha256": file_data_map[c_sha256]['sha256'],
                        "size": file_data_map[c_sha256]['size'],
                        "children": children_list,
                        "truncated": truncated,
                        "score": scores.get(c_sha256, 0),
                    }

    tree = {}
    for f in submission['files']:
        sha256 = f['sha256']
        name = f['name']

        if sha256 in tree:
            tree[sha256]['name'].append(name)
        else:
            parents = [sha256]
            children = {}
            c_list = files.get(sha256, [])
            for child in c_list:
                tree_cache.append(child['sha256'])
                recurse_tree(child, children, parents)

            try:
                tree[sha256] = {
                    "name": [name],
                    "children": children,
                    "type": file_data_map[sha256]['type'],
                    "sha256": file_data_map[sha256]['sha256'],
                    "size": file_data_map[sha256]['size'],
                    "truncated": False,
                    "score": scores.get(sha256, 0),
                }
            except KeyError:
                if sha256 not in forbidden_files and sha256 not in missing_files:
                    file_data_map[sha256] = self.file.get(sha256, as_obj=False)
                    tree[sha256] = {
                        "name": [name],
                        "children": children,
                        "type": file_data_map[sha256]['type'],
                        "sha256": file_data_map[sha256]['sha256'],
                        "size": file_data_map[sha256]['size'],
                        "truncated": False,
                        "score": scores.get(sha256, 0),
                    }

    if not partial:
        cached_tree = {
            'expiry_ts': now_as_iso(days_until_archive * 24 * 60 * 60),
            'tree': json.dumps(tree),
            'classification': max_classification,
            'filtered': len(forbidden_files) > 0
        }
        self.submission_tree.save(cache_key, cached_tree)

    return {
        'tree': tree,
        'classification': max_classification,
        'filtered': len(forbidden_files) > 0,
        'partial': partial
    }
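# Self-contained sketch of the two guards recurse_tree() relies on: a depth cap
# (max_depth) and a visited set that breaks cycles between parent and child hashes.
# The names and the edges structure are illustrative only, not part of this module.
def _walk(edges, node, max_depth, seen=None, lvl=0):
    seen = set() if seen is None else seen
    seen.add(node)
    if lvl == max_depth:
        return {}
    out = {}
    for child in edges.get(node, []):
        if child in seen:
            continue  # cycle detected: this hash was already expanded on this walk
        out[child] = _walk(edges, child, max_depth, seen, lvl + 1)
    return out

# An a -> b -> a cycle terminates instead of recursing forever
assert _walk({"a": ["b"], "b": ["a"]}, "a", max_depth=5) == {"b": {}}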
def with_retries(self, func, *args, **kwargs):
    retries = 0
    updated = 0
    deleted = 0
    while True:
        try:
            ret_val = func(*args, **kwargs)

            if retries:
                log.info('Reconnected to elasticsearch!')

            if updated:
                ret_val['updated'] += updated

            if deleted:
                ret_val['deleted'] += deleted

            return ret_val
        except elasticsearch.exceptions.NotFoundError as e:
            if "index_not_found_exception" in str(e):
                time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                log.debug("The index does not exist. Trying to recreate it...")
                self._ensure_collection()
                self.datastore.connection_reset()
                retries += 1
            else:
                raise
        except elasticsearch.exceptions.ConflictError as ce:
            # Carry forward partial update/delete counts across retries
            updated += ce.info.get('updated', 0)
            deleted += ce.info.get('deleted', 0)

            time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
            self.datastore.connection_reset()
            retries += 1
        except (SearchRetryException,
                elasticsearch.exceptions.ConnectionError,
                elasticsearch.exceptions.ConnectionTimeout,
                elasticsearch.exceptions.AuthenticationException) as e:
            if not isinstance(e, SearchRetryException):
                log.warning(f"No connection to Elasticsearch server(s): "
                            f"{' | '.join(self.datastore.get_hosts(safe=True))}"
                            f", retrying...")
            time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
            self.datastore.connection_reset()
            retries += 1
        except elasticsearch.exceptions.TransportError as e:
            err_code, msg, cause = e.args
            if err_code in (503, '503'):
                log.warning("Looks like the index is not ready yet, retrying...")
                time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                self.datastore.connection_reset()
                retries += 1
            elif err_code in (429, '429'):
                log.warning("Elasticsearch is too busy to perform the requested task, "
                            "we will wait a bit and retry...")
                time.sleep(min(retries, self.MAX_RETRY_BACKOFF))
                self.datastore.connection_reset()
                retries += 1
            else:
                raise
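# Minimal standalone sketch of the backoff scheme used by with_retries(): the sleep
# grows linearly with the attempt count and is capped, so the first retry is
# immediate (min(0, cap) == 0) and later ones never wait more than the cap seconds.
# The function name and exception type here are illustrative.
import time

def _retry_with_backoff(func, max_backoff=10):
    retries = 0
    while True:
        try:
            return func()
        except ConnectionError:
            time.sleep(min(retries, max_backoff))
            retries += 1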