def generateConfigReport(dataset, include_result=False, dump_path=None):
    mng = PathManager(dataset)
    report = {}
    warnings = []

    for doc_dir in tqdm(os.listdir(mng.DocBase())):
        config_path = mng.DocBase() + doc_dir + '/'
        try:
            cfg = loadJson(config_path + 'config.json')
            report[int(cfg['version'])] = {
                '__model': cfg['modelName'],    # leading underscores make these keys sort first
                '_k-n-qk': '-'.join([str(cfg['k']), str(cfg['n']), str(cfg['qk'])]),
                'desc': cfg['description']
            }
            if include_result:
                res = loadJson(config_path + 'testResult.json')
                report[int(cfg['version'])]['results'] = res['results']
        except Exception as e:
            warnings.append('Error occurred when processing %s: %s' % (doc_dir, str(e)))

    for w in warnings:
        logging.warning(w)

    dump_path = mng.DatasetBase() + 'summary.json' if dump_path is None else dump_path
    dumpJson(report, dump_path, sort=True)
def extractAPISeqOnLog(pre_log_path, dst_path, log_dump_path=None):
    logs = loadJson(pre_log_path)
    reporter = Reporter()

    for i, item in enumerate(logs['valid_files']):
        print('#', i + 1, end=' ')
        try:
            report_ = loadJson(item['rawPath'])

            new_report = {}
            new_report['sha1'] = report_['target']['file']['sha1']
            new_report['name'] = report_['target']['file']['name']
            new_report['sha256'] = report_['target']['file']['sha256']
            new_report['sha512'] = report_['target']['file']['sha512']
            md5 = new_report['md5'] = report_['target']['file']['md5']

            # flatten the per-process call lists into one API name sequence
            apis = []
            for process in report_['behavior']['processes']:
                for call in process['calls']:
                    apis.append(call['api'])
            new_report['apis'] = apis

            dumpJson(new_report, dst_path + md5 + '.json')
            reporter.logSuccess()
            print("Success")
        except Exception as e:
            reporter.logError(entity=item['rawPath'], msg=str(e))
            print("Error:", str(e))

    reporter.report()
    if log_dump_path is not None:
        reporter.dump(log_dump_path)
def __init__(self,
             exe_bin='python',                   # name of the Python executable
             relative_path_config='../config/',  # relative path from the working dir to the config dir
             relative_path_run='../run/',        # relative path from the working dir to the run dir
             check_verbose=True,                 # whether to enable the verbose check; silent by default
             flags={},                           # run flags: each k,v pair is rendered as "-(k) (v)"
             param_type_config_sep_symbol='|'):  # level separator for parameter names when adding tasks by parameter

    self.TimeFormatter = TimeFormatter()
    self.ExecuteTaskLines = []
    self.ConfigUpdateLines = []
    self.RelativePathToConfig = relative_path_config
    self.RelativePathToRun = relative_path_run
    self.Flags = flags
    self.ExecuteBin = exe_bin
    self.CheckVerbose = check_verbose
    self.ParamTypeConfigSepSymbol = param_type_config_sep_symbol
    self.ExecuteSuccessCount = 0
    self.ExecuteFailCount = 0

    try:
        # Cache the configs when the machine is initialized, so that running
        # multiple tasks later cannot mutate the config files and make the
        # default values inconsistent.
        self.ConfigCache = {
            'train': loadJson(self.RelativePathToConfig + 'train.json'),
            'test': loadJson(self.RelativePathToConfig + 'test.json'),
        }
    except FileNotFoundError as e:
        # a missing config raises FileNotFoundError (the original handler
        # caught FileExistsError, which never fires here)
        raise FileNotFoundError(f'[ExecuteMachine] Config file not found: {e}')
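# A minimal usage sketch (hypothetical): the task API shown here
# (addRedoTrainTask / execute) is assumed from the surrounding code base and
# may differ in the actual class.
#
#   machine = ExecuteMachine(exe_bin='python3',
#                            relative_path_config='../config/',
#                            flags={'u': ''})   # rendered as "-u"
#   machine.addRedoTrainTask(dataset='virushare-20', version=42,
#                            updated_configs={'training|epoch': 50})
#   machine.execute()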
def makeDataFile(json_path,
                 w2idx_path,
                 seq_length_save_path,
                 data_save_path,
                 num_per_class,
                 idx2cls_mapping_save_path=None,
                 max_seq_len=600):
    data_list = []
    folder_name_mapping = {}

    printState('Loading config data...')
    word2index = loadJson(w2idx_path)

    printState('Read main data...')
    for cls_idx, cls_dir in tqdm(enumerate(os.listdir(json_path))):
        class_path = json_path + cls_dir + '/'
        assert num_per_class == len(os.listdir(class_path)), \
            'Class %s has %d samples, which differs from the expected %d!' % \
            (cls_dir, len(os.listdir(class_path)), num_per_class)

        for item in os.listdir(class_path):
            report = loadJson(class_path + item)
            apis = report['apis']
            data_list.append(apis)                  # append the API sequence

        folder_name_mapping[cls_idx] = cls_dir
        # label_list += [cls_idx] * num_per_class   # append the labels of one class

    printState('Converting...')
    data_list = convertApiSeq2DataSeq(data_list, word2index, max_seq_len)   # convert to index sequences
    seq_length_list = {i: len(seq) for i, seq in enumerate(data_list)}      # per-sample sequence lengths
    data_list = pad_sequence(data_list, batch_first=True, padding_value=0)  # zero-pad to build the batch

    # pad_sequence pads to the longest sequence in the batch; if every sequence
    # is shorter than max_seq_len, the result must be padded up to max_seq_len
    # by hand to avoid length mismatches
    if data_list.size(1) < max_seq_len:
        padding_size = max_seq_len - data_list.size(1)
        zero_paddings = t.zeros((data_list.size(0), padding_size), dtype=data_list.dtype)
        data_list = t.cat((data_list, zero_paddings), dim=1)

    printState('Dumping...')
    dumpJson(seq_length_list, seq_length_save_path)     # save sequence lengths as JSON
    if idx2cls_mapping_save_path is not None:
        dumpJson(folder_name_mapping, idx2cls_mapping_save_path)
    t.save(data_list, data_save_path)                   # save the padded data tensor
    printState('Done')
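# A minimal standalone sketch (not part of the original module) of why the
# manual padding in makeDataFile is needed: pad_sequence only pads to the
# longest sequence in the batch, not to a fixed max_seq_len.
def _demoPadSequenceLength():
    import torch as t
    from torch.nn.utils.rnn import pad_sequence

    batch = [t.tensor([1, 2, 3]), t.tensor([4, 5])]
    padded = pad_sequence(batch, batch_first=True, padding_value=0)
    print(padded.shape)  # torch.Size([2, 3]) -- capped at the batch maximum, not max_seq_len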
def statSatifiedClasses(pe_path,
                        json_path,
                        report_path,
                        stat_stairs=[10, 15, 20],
                        count_dump_path=None):
    # map each sample name to its class
    cls_mapping = {}
    cls_cnt = {}

    warn_err_report = loadJson(report_path)

    for cls in os.listdir(pe_path):
        cls_cnt[cls] = 0
        for item in os.listdir(pe_path + cls + '/'):
            cls_mapping[item] = cls

    for json_item in os.listdir(json_path):
        if json_item not in warn_err_report['errors'] and \
                json_item not in warn_err_report['warnings']:
            cls_cnt[cls_mapping[json_item]] += 1

    stair_cls_cnt = {}
    for stair in stat_stairs:
        stair_cls_cnt[stair] = []
        for cls_name, cnt in cls_cnt.items():
            if cnt >= stair:
                stair_cls_cnt[stair].append(cls_name)

        printBulletin('At least %d items (%d classes in total)' %
                      (stair, len(stair_cls_cnt[stair])))

    if count_dump_path is not None:
        dumpJson(stair_cls_cnt, count_dump_path, indent=None)
def renameItemsByMD5(json_path,     # path of the report files
                     item_path,     # path of the data files
                     ext_name=''):
    reporter = Reporter()
    md5s = []

    for json_item in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path + json_item)
            md5 = report['md5']

            if md5 in md5s:
                # skip duplicates: renaming would overwrite the file that
                # already carries this MD5 name
                reporter.logWarning(entity=json_item, msg='duplicated MD5')
                continue
            md5s.append(md5)

            filename = '.'.join(json_item.split('.')[:-1])

            os.rename(json_path + json_item,
                      json_path + md5 + '.json')        # rename the json report
            os.rename(item_path + filename + ext_name,
                      item_path + md5 + ext_name)       # rename the data file

            reporter.logSuccess()
        except Exception as e:
            reporter.logError(entity=json_item, msg=str(e))

    reporter.report()
def mappingApiNormalize(json_path, mapping, dump_mapping_path=None, is_class_dir=False):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        items = os.listdir(json_path + folder + '/') if is_class_dir else [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)
                for i in range(len(report['apis'])):
                    if report['apis'][i] in mapping:
                        report['apis'][i] = mapping[report['apis'][i]]
                dumpJson(report, item_path)
                reporter.logSuccess()
            except Exception as e:
                reporter.logError(item, str(e))

    if dump_mapping_path is not None:
        dumpJson(mapping, dump_mapping_path)

    reporter.report()
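# A hypothetical usage sketch for mappingApiNormalize: collapsing ANSI ("A")
# and wide-char ("W") API variants onto one canonical name. The concrete pairs
# and path below are illustrative only, not the project's actual mapping.
def _exampleMappingApiNormalize():
    mappingApiNormalize(json_path='/path/to/api/jsons/',
                        mapping={'CreateFileA': 'CreateFile',
                                 'CreateFileW': 'CreateFile'},
                        is_class_dir=True)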
def parseAndSampleDataset(scale_report_path, base_path, dst_path, num_per_class, checked=True):
    scale_report = loadJson(scale_report_path)

    # sample from the classes that satisfy the size requirement
    for family_name in tqdm(scale_report):
        if len(scale_report[family_name]) >= num_per_class:
            random.seed(magicSeed())
            candidates = random.sample(scale_report[family_name], num_per_class)

            if os.path.exists(dst_path + family_name + '/'):
                raise RuntimeError("Folder of class %s already exists in the destination path!" % family_name)
            else:
                os.mkdir(dst_path + family_name + '/')

            for item in candidates:
                folder_name, item_name = item.split("/")
                full_item_name = item_name + '.' + folder_name
                shutil.copy(base_path + item,
                            dst_path + family_name + '/' + full_item_name)

    if checked:
        reporter = Reporter()
        for folder in os.listdir(dst_path):
            if len(os.listdir(dst_path + folder + '/')) != num_per_class:
                reporter.logError(entity=folder,
                                  msg="Fewer items than expected: %d/%d" %
                                      (len(os.listdir(dst_path + folder + '/')), num_per_class))
            else:
                reporter.logSuccess()
        reporter.report()
def dump(self, path, key='test_result', desc=[]):
    metric_names = self.TestStat.MetricNames
    metrics = self.TestStat.getAlltimeMetric()
    metric_intervals = self.TestStat.getAlltimeMetricInterval()
    loss = self.TestStat.getAlltimeLoss()
    loss_interval = self.TestStat.getAlltimeLossInterval()

    try:
        test_result = loadJson(path)
    except FileNotFoundError:
        test_result = {}

    if key not in test_result:
        test_result[key] = []

    result_obj = {
        'time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        'metrics': {
            n: '%.5f' % v + '±' + '%.5f' % itv
            for n, v, itv in zip(metric_names, metrics, metric_intervals)
        },
        'loss': '%.5f' % loss + '±' + '%.5f' % loss_interval,
        'desc': desc,
        'test_time_per_episode': self.Timer.getTotalTimeStat(stat_type='avg'),
        'gpu_mem_used': self.GPUManager.getGPUUsedMem(unit='M')
    }

    test_result[key].append(result_obj)
    dumpJson(test_result, path)
def multi_process_align(str_path, epoch=1000, log_path=None, verbose=False,
                        acc_dump_path=None, process_num=3):
    if acc_dump_path is not None:
        if not os.path.exists(acc_dump_path):
            dumpIterable([], "acc", acc_dump_path)
        acc_sum = loadJson(acc_dump_path)['acc']
    else:
        acc_sum = []

    matrix = loadJson(str_path)['strings']

    queue = Queue()
    tm = StepTimer()
    tm.begin()

    # NOTE: each worker runs epoch // process_num episodes, so if epoch is not
    # a multiple of process_num, fewer than `epoch` results are produced and
    # the collection loop below blocks forever
    process_pool = []
    for i in range(process_num):
        p = Process(target=scoreEpisodeAlignment,
                    args=(matrix, queue, i + 1, epoch // process_num))
        process_pool.append(p)
        p.start()

    count = 0
    while count < epoch:
        cur_acc = queue.get(block=True)
        count += 1
        print("#", count, "acc=", cur_acc)
        acc_sum.append(cur_acc)
        if acc_dump_path is not None:
            dumpIterable(acc_sum, "acc", acc_dump_path)

    for p in process_pool:
        p.join()

    print("\n*********************************************")
    # acc_sum may include results reloaded from acc_dump_path, so average over
    # its actual length rather than over this run's epoch count
    print("Avg acc: ", sum(acc_sum) / len(acc_sum))
    print("Total time:", tm.step(prt=False, end=True))
    print("95% belief interval:", calBeliefeInterval(acc_sum))
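# A minimal sketch (assumption) of the worker contract multi_process_align
# expects: scoreEpisodeAlignment is not shown in this module, but it must put
# one float accuracy onto the queue per episode. Signature and body here are
# illustrative only.
def _demoScoreEpisodeAlignment(matrix, queue, worker_id, episodes):
    import random
    for _ in range(episodes):
        # stand-in for the real per-episode alignment scoring
        queue.put(random.random())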
def renameItemFolder(json_path):
    for folder in tqdm(os.listdir(json_path)):
        report = loadJson(json_path + folder + '/report.json')
        name = report['target']['file']['name']

        os.rename(json_path + folder + '/report.json',
                  json_path + folder + '/%s.json' % name)   # rename the report file
        os.rename(json_path + folder + '/',
                  json_path + name + '/')                   # rename the folder itself
def _loadJsonConfig(file_name, err_msg):
    for rel_path in REL_CFG_PATHS:
        try:
            cfg = loadJson(rel_path + file_name)
            return cfg
        except FileNotFoundError:
            print(f"[ConfigInit] not found: {rel_path + file_name}")
            continue
    raise RuntimeError(f"[ConfigInit] pwd: {os.getcwd()}, {err_msg}")
def addRedoTrainTask(self, dataset, version, updated_configs=None, **kwargs):
    pm = PathManager(dataset=dataset, version=version)

    # directly reload the config of a finished experiment to run it again
    pre_config = loadJson(joinPath(pm.doc(), 'train.json'))

    # after loading the previous version's parameters, apply any updated
    # parameters on top of them
    if updated_configs is not None:
        self._setFields(pre_config, updated_configs)

    self.addTask('train', pre_config, **kwargs)
def convertApiCategory(clst_path, word_map_path, json_path, str_dump_path, max_len=300):
    word_map = loadJson(word_map_path)
    cluster_map = loadJson(clst_path)
    seqs = aggregateApiSequences(json_path, is_class_dir=True)

    str_mat = []
    for seq in seqs:
        seq = seq[:max_len]
        s = ""
        for idx in seq:
            api_idx = str(word_map[idx])
            s += chr(65 + cluster_map[api_idx])     # encode cluster id as a letter: 0->'A', 1->'B', ...
        str_mat.append(s)

    dumpIterable(str_mat, title="strings", path=str_dump_path)
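# A minimal standalone sketch of the encoding convertApiCategory uses: every
# API is first mapped to its cluster id, then the id is rendered as an
# uppercase letter so a whole sequence becomes an alignable string. The maps
# below are illustrative only.
def _demoClusterLetterEncoding():
    word_map = {'NtOpenFile': 0, 'NtReadFile': 1}   # api name -> word index
    cluster_map = {'0': 2, '1': 0}                  # word index (as str) -> cluster id
    seq = ['NtOpenFile', 'NtReadFile']
    s = ''.join(chr(65 + cluster_map[str(word_map[api])]) for api in seq)
    print(s)   # "CA": cluster 2 -> 'C', cluster 0 -> 'A'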
def saveRunVersionConfig(cur_v, dataset, model, cfg):
    cfg_pack = {
        '__version': cur_v,
        '_model': model,
        '_dataset': dataset,
        'config': cfg,
        '_time': time.asctime()
    }

    ver_cfg = loadJson('version.json')
    ver_cfg[str(cur_v)] = cfg_pack
    dumpJson(ver_cfg, 'version.json', sort=True)
def sampleClassWiseData(dst_path, log_file_path, num_per_class=20):
    family_report = loadJson(log_file_path)

    for fname, flist in tqdm(family_report.items()):
        if len(flist) >= num_per_class:
            os.mkdir(dst_path + fname)
            cans = sample(flist, num_per_class)
            for can in cans:
                can_fname = can.split('/')[-1]
                shutil.copy(can, dst_path + fname + '/' + can_fname)
def makeClusteredData(json_path, cluster_path, word_map_path, dump_path, max_len=1000):
    word_map = loadJson(word_map_path)
    cluster_map = loadJson(cluster_path)
    seqs = aggregateApiSequences(json_path, is_class_dir=True)

    mat = []
    for seq in seqs:
        seq = seq[:max_len]
        s = [cluster_map[str(word_map[idx])] for idx in seq]
        while len(s) < max_len:
            s.append(-1)            # pad short sequences with -1 up to max_len
        mat.append(s)

    np.save(dump_path, np.array(mat))
def extractApiFromJson(path):
    reporter = Reporter()

    for i, item_dir in enumerate(os.listdir(path)):
        print(i, item_dir)

        # assumes the json file shares its name with the folder
        cur_json_path = path + item_dir + '/%s.json' % item_dir

        new_report = {}
        new_report['apis'] = []

        try:
            report = loadJson(cur_json_path)

            # compatible with both processed and unprocessed reports
            if 'target' in report:
                new_report['name'] = report['target']['file']['name']
            else:
                new_report['name'] = report['name']

            # newer reports already contain an api field
            if 'apis' in report:
                new_report['apis'] = report['apis']
            # in a full report, the apis live at behavior -> processes -> calls -> api
            else:
                # collect api call names process by process, call by call
                api_call_seq = []
                for process in report['behavior']['processes']:
                    for call in process['calls']:
                        api_call_seq.append(call['api'])
                new_report['apis'] = api_call_seq

            # write the simplified report back over the original file
            dumpJson(new_report, cur_json_path)
            reporter.logSuccess()

        # a KeyError means the source file itself is flawed; leave the field empty
        except KeyError as e:
            # if the name field was already saved, leave apis empty
            if 'name' in new_report:
                new_report['apis'] = []
                dumpJson(new_report, cur_json_path)
            # otherwise do not touch the file at all
            reporter.logError(item_dir, str(e))
        # other errors are not handled
        except Exception as e:
            reporter.logError(item_dir, str(e))

    reporter.report()
def removeApiRedundance(json_path, selected_apis=None, class_dir=True):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        if class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                redun_flag = False
                redun_api_token = None
                new_api_seq = []

                for api_token in report['apis']:
                    # only keep the selected apis;
                    # selected_apis=None means no selection is applied
                    if selected_apis is None or api_token in selected_apis:
                        if api_token != redun_api_token:
                            # a new api: remember it and reset the flag
                            redun_api_token = api_token
                            redun_flag = False
                        else:
                            if not redun_flag:
                                # same api with the flag still clear: this is
                                # the second occurrence, so set the flag
                                redun_flag = True
                            else:
                                # same api with the flag set: it has already
                                # appeared twice, skip the redundant call
                                continue
                        new_api_seq.append(api_token)

                # overwrite the original api sequence with the new one
                report['apis'] = new_api_seq
                dumpJson(report, item_path)
                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()
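# A minimal standalone sketch of the rule removeApiRedundance implements:
# consecutive repeats of an API are truncated to at most two occurrences.
def _demoRemoveRedundance():
    seq = ['ReadFile', 'ReadFile', 'ReadFile', 'ReadFile', 'CloseHandle']
    out, last, seen_twice = [], None, False
    for api in seq:
        if api != last:
            last, seen_twice = api, False
        elif not seen_twice:
            seen_twice = True
        else:
            continue
        out.append(api)
    print(out)   # ['ReadFile', 'ReadFile', 'CloseHandle']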
def saveResult(path, desc, acc, los, acc_i, los_i):
    try:
        results = loadJson(path)
    except FileNotFoundError:
        results = {'results': []}

    results['results'].append({
        'acc': acc,
        'loss': los,
        'acc_interval': acc_i,
        'loss_interval': los_i,
        'desc': desc
    })

    dumpJson(results, path)
def collectPEwithAPI(api_dir_path,
                     pe_dir_path,
                     dst_path,
                     class_dir=True,
                     name_prefix=None,      # the suffix defaults to .json, since json files are read
                     log_dump_path=None):
    print("[CollectPEwithAPI] Preparing...")
    pe_folder_map = {
        folder: os.listdir(pe_dir_path + folder)
        for folder in os.listdir(pe_dir_path)
    }
    reporter = Reporter()

    print("[CollectPEwithAPI] Starting...")
    for folder in tqdm(os.listdir(api_dir_path)):
        folder_path = api_dir_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
            os.mkdir(dst_path + folder + '/')
            dst_folder = dst_path + folder + '/'
        else:
            items = [name_prefix + '.json']
            # for unclassified files, do not recreate folders when moving
            dst_folder = dst_path

        for item in items:
            try:
                report = loadJson(folder_path + item)
                name = report['name']

                found_flag = False
                for pe_folder in pe_folder_map:
                    if name in pe_folder_map[pe_folder]:
                        shutil.copy(pe_dir_path + pe_folder + '/' + name,
                                    dst_folder + name)
                        found_flag = True
                        break

                if not found_flag:
                    reporter.logError(entity=name, msg="File not found")
                else:
                    reporter.logSuccess()
            except Exception as e:
                reporter.logError(entity=folder_path + item, msg=str(e))

    reporter.report()
    if log_dump_path is not None:
        reporter.dump(log_dump_path)
def checkVersion(cur_v):
    ver_cfg = loadJson('version.json')
    last_ver = ver_cfg['lastRunVersion']

    if cur_v == last_ver:
        logging.warning(ver_check_warning_template % cur_v)
        opt = input('>>>')
        if opt == 'y' or opt == '1' or opt == '':
            return
        else:
            sys.exit(1)
    else:
        ver_cfg['lastRunVersion'] = cur_v
        dumpJson(ver_cfg, 'version.json')
def statValidJsonReport(dir_path, len_thresh=10, class_dir=False,
                        name_prefix=None, dump_valid_path=None):
    valid = invalid = too_short = total = 0
    valid_list = []

    for folder in os.listdir(dir_path):
        folder_path = dir_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [name_prefix + '.json']

        for item in items:
            total_length = 0
            total += 1
            print('#%d' % total, folder_path + item, end=': ')

            try:
                report = loadJson(folder_path + item)
                raw_file_name = report['target']['file']['name']

                for process in report['behavior']['processes']:
                    total_length += len(process['calls'])

                if total_length < len_thresh:
                    too_short += 1
                    print('too short:', total_length)
                else:
                    valid += 1
                    valid_list.append({'file': raw_file_name,
                                       'len': total_length,
                                       'rawPath': folder_path + item})
                    print('valid')
            except Exception as e:
                invalid += 1
                print('Error: ', str(e))

    print('Total:', total)
    print('Valid:', valid)
    print('Invalid:', invalid)
    print('Too Short:', too_short)

    if dump_valid_path is not None:
        # dump under the 'valid_files' key, which extractAPISeqOnLog reads
        dumpIterable(valid_list, title='valid_files', path=dump_valid_path)
def revertDatasetSplit(dataset, dump_path):
    man = PathManager(dataset)
    split_dump = loadJson(dump_path)

    deleteDatasetSplit(man.datasetBase())

    for typ in ['train', 'validate', 'test']:
        print(f"[revertDatasetSplit] {typ}")
        for folder in split_dump[typ]:
            shutil.copytree(src=man.datasetBase() + 'all/api/' + folder + '/',
                            dst=man.datasetBase() + typ + '/api/' + folder + '/')
            shutil.copytree(src=man.datasetBase() + 'all/img/' + folder + '/',
                            dst=man.datasetBase() + typ + '/img/' + folder + '/')

    print('-- Done --')
def revertDatasetSplit(dataset, dump_path):
    man = PathManager(dataset)
    split_dump = loadJson(dump_path)

    deleteDatasetSplit(man.DatasetBase())

    for typ in ['train', 'validate', 'test']:
        # delete the existing split
        # os.system('rm -rf {path}/*'.format(path=man.DatasetBase()+typ))
        print(typ)
        for folder in split_dump[typ]:
            shutil.copytree(src=man.DatasetBase() + 'all/' + folder + '/',
                            dst=man.DatasetBase() + typ + '/' + folder + '/')

    print('-- Done --')
def convertToNGramSeq(parent_path,
                      window=3,
                      ngram_dict=None,      # the NGram dict from the statistics step, already sorted
                      ngram_max_num=None,   # keep only the top n NGrams; obtainable from the
                                            # statistics function, or left unspecified
                      class_dir=False):
    reporter = Reporter()

    if ngram_dict is not None and ngram_max_num is not None:
        valid_ngrams = list(ngram_dict.keys())[:ngram_max_num]
    else:
        valid_ngrams = None

    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                ngram_seq = []
                report = loadJson(folder_path + item)
                api_seq = report['apis']

                # the range ends at len - window + 1 so the last full window is included
                for i in range(len(api_seq) - window + 1):
                    ngram = strlistToStr(api_seq[i:i + window])
                    # keep the ngram if no selection was given, or if it is
                    # among the ngrams to extract
                    if valid_ngrams is None or ngram in valid_ngrams:
                        ngram_seq.append(ngram)

                # write back to the original file
                report['apis'] = ngram_seq
                dumpJson(report, folder_path + item)
                reporter.logSuccess()
            except Exception as e:
                reporter.logError(entity=folder + '/' + item, msg=str(e))
                continue

    reporter.report()
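# A minimal standalone sketch of the sliding-window extraction above, assuming
# (for illustration) that strlistToStr simply joins the api names:
def _demoNGramWindow():
    api_seq = ['A', 'B', 'C', 'D']
    window = 3
    ngrams = [','.join(api_seq[i:i + window])
              for i in range(len(api_seq) - window + 1)]
    print(ngrams)   # ['A,B,C', 'B,C,D'] -- the "+ 1" keeps the last window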
def renameCuckooFolders(json_path):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path + folder + '/report.json')
            name = report['target']['file']['name']

            os.rename(json_path + folder + '/report.json',
                      json_path + folder + '/%s.json' % name)
            os.rename(json_path + folder, json_path + name)

            reporter.logSuccess()
        except Exception as e:
            reporter.logError(entity=folder, msg=str(e))
            continue

    reporter.report()
def statApiFrequency(json_path, is_class_dir=False, threshold=None):
    api_frequency = {}
    total = 0

    for dir_ in tqdm(os.listdir(json_path)):
        dir_path = json_path + dir_ + '/'

        if is_class_dir:
            items = os.listdir(dir_path)
        else:
            items = [dir_ + '.json']

        for item in items:
            apis = loadJson(dir_path + item)['apis']
            for api in apis:
                if api not in api_frequency:
                    api_frequency[api] = 0
                api_frequency[api] += 1
                total += 1

    printState('API frequency statistics')
    # sort by frequency in descending order
    api_frequency = sorted(api_frequency.items(), key=lambda x: x[1], reverse=True)

    below_threshold = []
    for i, (api, f) in enumerate(api_frequency):
        print('#%d' % i, api, f / total)
        if threshold is not None:
            # a threshold below 1 is interpreted as a frequency threshold
            if 1 > threshold > f / total:
                below_threshold.append(api)
            # a threshold of 1 or above is interpreted as a rank threshold
            elif i >= threshold >= 1:
                below_threshold.append(api)

    if threshold is not None:
        printState('APIs below %f (%d in total)' % (threshold, len(below_threshold)))
        print(below_threshold)
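# A hypothetical usage sketch of the dual threshold semantics above: the same
# parameter selects by frequency when below 1 and by rank when 1 or above.
# The path is illustrative only.
def _demoStatApiFrequencyThresholds():
    statApiFrequency('/path/to/api/jsons/', is_class_dir=True, threshold=0.001)  # apis rarer than 0.1%
    statApiFrequency('/path/to/api/jsons/', is_class_dir=True, threshold=100)    # apis ranked 100th or later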
def __init__(self, data_path, seq_path, N):
    self.Data = t.load(data_path)

    # lengths are no longer used for truncation: every sequence is stored at
    # the specified fixed length
    seqLength = loadJson(seq_path)
    self.SeqLength = [0] * len(self.Data)
    for i, l in seqLength.items():
        self.SeqLength[int(i)] = l          # store the sequence lengths

    self.Label = []
    self.ClassNum = len(self.Data) // N

    assert len(self.Data) % N == 0, \
        'Total data length %d is not a multiple of the per-class sample count %d!' % (len(self.Data), N)
    assert len(self.Data) == len(self.SeqLength), \
        'Total data length %d differs from the sequence-length list length %d' % \
        (len(self.Data), len(self.SeqLength))

    for i in range(len(self.Data) // N):
        self.Label += [i] * N
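# A minimal standalone sketch of the label layout this constructor produces:
# samples are assumed to be stored class by class, N per class.
def _demoClasswiseLabels():
    N, total = 2, 6
    labels = []
    for i in range(total // N):
        labels += [i] * N
    print(labels)   # [0, 0, 1, 1, 2, 2]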
def collectJsonByClass(pe_path,
                       json_path,
                       dst_path,
                       report_path,
                       num_per_class,
                       selected_classes):
    reporter = Reporter()
    warn_errs = loadJson(report_path)

    def length_filter(x):
        return x not in warn_errs['warnings'] and x not in warn_errs['errors']

    for cls in tqdm(selected_classes):
        dst_dir = dst_path + cls + '/'
        if not os.path.exists(dst_dir):
            os.mkdir(dst_dir)

        # filter out the items not satisfying the scale requirement
        cand_items = os.listdir(pe_path + cls + '/')
        cand_items = list(filter(length_filter, cand_items))
        # for some PE items, the corresponding json item is missing
        cand_items = list(filter(lambda x: os.path.exists(json_path + x + '/'), cand_items))
        cand_items = random.sample(cand_items, num_per_class)

        for item in cand_items:
            try:
                shutil.copy(json_path + item + '/%s.json' % item,
                            dst_dir + '/%s.json' % item)
                reporter.logSuccess()
            except Exception as e:
                reporter.logError('%s/%s' % (cls, item), str(e))

    reporter.report()