def vtScan(folder_path, json_save_path, scan_num=20000, timeout=600):
    # apikey, scan_url and report_url are module-level globals
    scan_params = {'apikey': apikey}
    start_index = len(os.listdir(json_save_path))
    end_index = min(start_index + scan_num, len(os.listdir(folder_path)))
    print('Begin to scan...')

    samples_list = os.listdir(folder_path)
    last_stamp = time.time()

    while start_index < end_index:
        print(start_index + 1, '/', end_index)
        f = samples_list[start_index]

        # skip samples whose report has already been fetched
        if (os.path.exists(json_save_path + f + '.json')
                and os.path.getsize(json_save_path + f + '.json') != 0):
            start_index += 1
            last_stamp = time.time()
            continue

        try:
            print('scanning...')
            with open(folder_path + f, 'rb') as fp:   # close the handle once the upload finishes
                files_cfg = {'file': ('test', fp)}
                response = requests.post(scan_url, files=files_cfg,
                                         params=scan_params, timeout=timeout)
        except Exception as e:
            print(f, ': api request exceeds!', ' error:', str(e))
            print('waiting...')
            time.sleep(10)
            continue

        scan_info = response.json()
        report_params = {'apikey': apikey, 'resource': scan_info['md5']}

        try:
            print('fetching report...')
            report = requests.get(report_url, params=report_params, timeout=timeout)
            report = report.json()
        except BaseException as e:
            print(f, ': api request exceeds!', ' error:', str(e))
            print('waiting...')
            time.sleep(10)
            continue

        print(report['verbose_msg'])
        if report['response_code'] == 1:
            dumpJson(report, '%s.json' % (json_save_path + f), indent=None)
        else:
            sys.stderr.write('%s wrong response code %d\n' % (f, report['response_code']))

        print('time consuming: %.2f' % (time.time() - last_stamp))
        last_stamp = time.time()
        start_index += 1
        time.sleep(1)
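# Minimal usage sketch for vtScan. The endpoint URLs follow the public
# VirusTotal v2 API layout, but treat them, the key, and the paths below as
# illustrative assumptions rather than values from this repo:
#
#   apikey = 'YOUR_VT_API_KEY'
#   scan_url = 'https://www.virustotal.com/vtapi/v2/file/scan'
#   report_url = 'https://www.virustotal.com/vtapi/v2/file/report'
#   vtScan('samples/', 'reports/', scan_num=100)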
def extractAPISeqOnLog(pre_log_path, dst_path, log_dump_path=None):
    logs = loadJson(pre_log_path)
    reporter = Reporter()

    for i, item in enumerate(logs['valid_files']):
        print('#', i + 1, end=' ')
        try:
            report_ = loadJson(item['rawPath'])

            new_report = {}
            new_report['sha1'] = report_['target']['file']['sha1']
            new_report['name'] = report_['target']['file']['name']
            new_report['sha256'] = report_['target']['file']['sha256']
            new_report['sha512'] = report_['target']['file']['sha512']
            md5 = new_report['md5'] = report_['target']['file']['md5']

            apis = []
            for process in report_['behavior']['processes']:
                for call in process['calls']:
                    apis.append(call['api'])
            new_report['apis'] = apis

            dumpJson(new_report, dst_path + md5 + '.json')
            reporter.logSuccess()
            print("Success")
        except Exception as e:
            reporter.logError(entity=item['rawPath'], msg=str(e))
            print("Error:", str(e))

    reporter.report()
    if log_dump_path is not None:
        reporter.dump(log_dump_path)
def mappingApiNormalize(json_path, mapping, dump_mapping_path=None, is_class_dir=False):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        items = os.listdir(json_path + folder + '/') if is_class_dir else [folder + '.json']
        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)
                # rewrite every API token that has an entry in the mapping
                for i in range(len(report['apis'])):
                    if report['apis'][i] in mapping:
                        report['apis'][i] = mapping[report['apis'][i]]
                dumpJson(report, item_path)
                reporter.logSuccess()
            except Exception as e:
                reporter.logError(item, str(e))

    if dump_mapping_path is not None:
        dumpJson(mapping, dump_mapping_path)
    reporter.report()
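# Usage sketch: collapse API aliases onto one canonical name, in place.
# Both the path and the mapping below are hypothetical, not from the repo:
#
#   mappingApiNormalize('jsons/', {'NtCreateFile': 'CreateFile',
#                                  'NtOpenFile': 'OpenFile'})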
def statClassScaleOfUnorgranized(base, normalizer, save_path=None, scale_stairs=[]):
    table = {}

    # walk the tree and count how many samples fall into each class
    for folder in tqdm(os.listdir(base)):
        for item in tqdm(os.listdir(base + folder + '/')):
            can_name = normalizer(item)
            if can_name not in table:
                table[can_name] = [folder + '/' + item]
            else:
                table[can_name].append(folder + '/' + item)

    scale_table = {n: len(table[n]) for n in table}
    print("%d classes in total" % len(table))

    for stair in scale_stairs:
        counter = 0
        for k in scale_table:
            if scale_table[k] > stair:
                counter += 1
        print("*" * 50)
        print("Num of classes larger than %d: %d" % (stair, counter))

    if save_path is not None:   # save_path defaults to None, so guard the dump
        dumpJson(table, save_path)
    print('- Done -')
def dump(self, path, key='test_result', desc=[]):
    metric_names = self.TestStat.MetricNames
    metrics = self.TestStat.getAlltimeMetric()
    metric_intervals = self.TestStat.getAlltimeMetricInterval()
    loss = self.TestStat.getAlltimeLoss()
    loss_interval = self.TestStat.getAlltimeLossInterval()

    try:
        test_result = loadJson(path)
    except FileNotFoundError:
        test_result = {}

    if key not in test_result:
        test_result[key] = []

    result_obj = {
        'time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
        'metrics': {
            n: '%.5f' % v + '±' + '%.5f' % itv
            for n, v, itv in zip(metric_names, metrics, metric_intervals)
        },
        'loss': '%.5f' % loss + '±' + '%.5f' % loss_interval,
        'desc': desc,
        'test_time_per_episode': self.Timer.getTotalTimeStat(stat_type='avg'),
        'gpu_mem_used': self.GPUManager.getGPUUsedMem(unit='M')
    }

    test_result[key].append(result_obj)
    dumpJson(test_result, path)
def statSatifiedClasses(pe_path, json_path, report_path,
                        stat_stairs=[10, 15, 20], count_dump_path=None):
    # map each sample name to its class
    cls_mapping = {}
    cls_cnt = {}
    warn_err_report = loadJson(report_path)

    for cls in os.listdir(pe_path):
        cls_cnt[cls] = 0
        for item in os.listdir(pe_path + cls + '/'):
            cls_mapping[item] = cls

    for json_item in os.listdir(json_path):
        if json_item not in warn_err_report['errors'] and \
                json_item not in warn_err_report['warnings']:
            cls_cnt[cls_mapping[json_item]] += 1

    stair_cls_cnt = {}
    for stair in stat_stairs:
        stair_cls_cnt[stair] = []
        for cls_name, cnt in cls_cnt.items():
            if cnt >= stair:
                stair_cls_cnt[stair].append(cls_name)
        printBulletin('More than %d items (%d in total)' %
                      (stair, len(stair_cls_cnt[stair])))

    if count_dump_path is not None:
        dumpJson(stair_cls_cnt, count_dump_path, indent=None)
def generateConfigReport(dataset, include_result=False, dump_path=None):
    mng = PathManager(dataset)
    report = {}
    warnings = []

    for doc_dir in tqdm(os.listdir(mng.DocBase())):
        config_path = mng.DocBase() + doc_dir + '/'
        try:
            cfg = loadJson(config_path + 'config.json')
            report[int(cfg['version'])] = {
                '__model': cfg['modelName'],   # leading underscores make these keys sort first
                '_k-q-qk': '-'.join([str(cfg['k']), str(cfg['n']), str(cfg['qk'])]),
                'desc': cfg['description']
            }
            if include_result:
                res = loadJson(config_path + 'testResult.json')
                report[int(cfg['version'])]['results'] = res['results']
        except Exception as e:
            warnings.append('Error occurred when processing %s: %s' % (doc_dir, str(e)))

    for w in warnings:
        logging.warning(w)

    dump_path = mng.DatasetBase() + 'summary.json' if dump_path is None else dump_path
    dumpJson(report, dump_path, sort=True)
def getApiSeqFromCSV(csv_path, json_save_path):
    with open(csv_path) as f:
        f_csv = csv.reader(f)
        for i, row in tqdm(enumerate(f_csv)):
            api = row[2:]       # columns 2+: API tokens
            hash_val = row[1]   # column 1: sample hash
            report = {'apis': api}
            dumpJson(report, path=json_save_path + hash_val + '.json')
def makeDataFile(json_path, w2idx_path, seq_length_save_path, data_save_path,
                 num_per_class, idx2cls_mapping_save_path=None, max_seq_len=600):
    data_list = []
    folder_name_mapping = {}

    printState('Loading config data...')
    word2index = loadJson(w2idx_path)

    printState('Read main data...')
    for cls_idx, cls_dir in tqdm(enumerate(os.listdir(json_path))):
        class_path = json_path + cls_dir + '/'
        # the original format string was missing a placeholder for num_per_class
        assert num_per_class == len(os.listdir(class_path)), \
            'Class %s contains %d samples, but %d were expected!' % \
            (cls_dir, len(os.listdir(class_path)), num_per_class)

        for item in os.listdir(class_path):
            report = loadJson(class_path + item)
            apis = report['apis']
            data_list.append(apis)          # collect the API sequence

        folder_name_mapping[cls_idx] = cls_dir

    printState('Converting...')
    # map API tokens to embedding indices, truncated to max_seq_len
    data_list = convertApiSeq2DataSeq(data_list, word2index, max_seq_len)
    seq_length_list = {i: len(seq) for i, seq in enumerate(data_list)}      # per-sample lengths
    data_list = pad_sequence(data_list, batch_first=True, padding_value=0)  # zero-pad into one batch

    # pad_sequence only pads up to the longest sequence in the batch; if every
    # sequence is shorter than max_seq_len, the tail must be padded manually
    if data_list.size(1) < max_seq_len:
        padding_size = max_seq_len - data_list.size(1)
        zero_paddings = t.zeros((data_list.size(0), padding_size), dtype=data_list.dtype)
        data_list = t.cat((data_list, zero_paddings), dim=1)

    printState('Dumping...')
    dumpJson(seq_length_list, seq_length_save_path)     # sequence lengths as JSON
    if idx2cls_mapping_save_path is not None:
        dumpJson(folder_name_mapping, idx2cls_mapping_save_path)
    t.save(data_list, data_save_path)                   # padded tensor to disk
    printState('Done')
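# convertApiSeq2DataSeq is defined elsewhere in the repo; a minimal sketch of
# the behaviour makeDataFile relies on (token -> index lookup, truncation,
# LongTensor output). Treat this as an assumption, not the repo's actual
# implementation:
def _convertApiSeq2DataSeqSketch(seqs, word2index, max_seq_len):
    out = []
    for seq in seqs:
        idxs = [word2index[api] for api in seq if api in word2index][:max_seq_len]
        out.append(t.LongTensor(idxs))
    return out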
def apiCluster(dict_path, map_dump_path, cluster_num=26):
    api_mat = np.load(dict_path, allow_pickle=True)

    # pca = TSNE(n_components=2)
    # de_api_mat = pca.fit_transform(api_mat)
    # colors = getRandomColor(26, more=False)

    print("Clustering...")
    km = KMeans(n_clusters=cluster_num).fit(api_mat)
    km_wrapper = {i: int(c) for i, c in enumerate(km.labels_)}
    dumpJson(km_wrapper, map_dump_path)
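# Usage sketch (hypothetical paths): cluster the embedding matrix saved by
# trainW2Vmodel into 26 groups and dump the word-index -> cluster-id map:
#
#   apiCluster('wordMatrix.npy', 'clusterMap.json', cluster_num=26)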
def dump(self, path):
    # use the entity as the key and the message as the value
    dump_file = {'errors': {}, 'warnings': {}}

    for e in self.ErrorList:
        dump_file['errors'][e[0]] = e[1]
    for w in self.WarningList:
        dump_file['warnings'][w[0]] = w[1]

    dumpJson(dump_file, path)
def statExceptionReportFNcb(reporter_, list_, dict_):
    # dump_noexp_path is defined in the enclosing scope (this is a traversal callback)
    print('*' * 50)
    print("Total:", dict_['noexc'] + dict_['exc'] + dict_['err'])
    print("No Exception:", dict_['noexc'])
    print('Exception:', dict_['exc'])
    print('Error:', dict_['err'])
    print('*' * 50)

    if dump_noexp_path is not None:
        dumpJson({'has_exception': dict_['exc_list'],
                  'no_exception': dict_['noexc_list']},
                 dump_noexp_path)
def extractApiFromJson(path):
    reporter = Reporter()

    for i, item_dir in enumerate(os.listdir(path)):
        print(i, item_dir)
        # assumes the json file shares its name with the folder
        cur_json_path = path + item_dir + '/%s.json' % item_dir

        new_report = {}
        new_report['apis'] = []

        try:
            report = loadJson(cur_json_path)

            # handle both processed and raw reports
            if 'target' in report:
                new_report['name'] = report['target']['file']['name']
            else:
                new_report['name'] = report['name']

            # newer reports already carry an 'apis' field
            if 'apis' in report:
                new_report['apis'] = report['apis']
            # in full reports the APIs live under behavior -> processes -> calls -> api
            else:
                api_call_seq = []
                for process in report['behavior']['processes']:
                    for call in process['calls']:
                        api_call_seq.append(call['api'])
                new_report['apis'] = api_call_seq

            reporter.logSuccess()

        # a KeyError means the source file itself is malformed
        except KeyError as e:
            # if 'name' was already saved, keep it and leave 'apis' empty
            if 'name' in new_report:
                new_report['apis'] = []
                dumpJson(new_report, cur_json_path)
            # otherwise leave the file untouched
            reporter.logError(item_dir, str(e))

        # other errors: leave the file untouched
        except Exception as e:
            reporter.logError(item_dir, str(e))

    reporter.report()
def saveRunVersionConfig(cur_v, dataset, model, cfg):
    cfg_pack = {
        '__version': cur_v,
        '_model': model,
        '_dataset': dataset,
        'config': cfg,
        '_time': time.asctime()
    }

    ver_cfg = loadJson('version.json')
    ver_cfg[str(cur_v)] = cfg_pack
    dumpJson(ver_cfg, 'version.json', sort=True)
def dumpDatasetSplitStruct(base_path, dump_path):
    dump = {}
    for split in ['train', 'validate', 'test']:
        print(split)
        folders = []
        for folder in os.listdir(base_path + split + '/'):
            folders.append(folder)
        dump[split] = folders

    dumpJson(dump, dump_path)
    print('-- Done --')
def _setConfig(self, task_type, fields):
    '''
    Only set the given fields of the config; fields that are not given
    keep their original values.
    '''
    # Read from the config cache loaded when the machine was initialized,
    # so that a config modified at runtime cannot leak inconsistent default
    # values across multiple task runs.
    if task_type in self.ConfigCache:
        conf = deepcopy(self.ConfigCache.get(task_type))
    else:
        return None

    # conf = loadJson(self.RelativePathToConfig+task_type+'.json')
    self._setFields(conf, fields)
    dumpJson(conf, self.RelativePathToConfig + task_type + '.json')
def dumpStatHist(self):
    res = {
        'train': {
            'metrics': self.TrainStat.MetricHist,
            'loss': self.TrainStat.LossHist
        },
        'validate': {
            'metrics': self.ValStat.MetricHist,
            'loss': self.ValStat.LossHist
        }
    }
    if self.StatSavePath is not None:
        dumpJson(res, self.StatSavePath)
def removeApiRedundance(json_path, selected_apis=None, class_dir=True):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        if class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                redun_flag = False
                redun_api_token = None
                new_api_seq = []

                for api_token in report['apis']:
                    # only keep the selected APIs; selected_apis=None means no selection
                    if selected_apis is None or api_token in selected_apis:
                        if api_token != redun_api_token:
                            # a new api: remember it and reset the flag
                            redun_api_token = api_token
                            redun_flag = False
                        else:
                            if not redun_flag:
                                # same api, flag not set: second occurrence, set the flag
                                redun_flag = True
                            else:
                                # same api with the flag set: third or later, skip it
                                continue
                        new_api_seq.append(api_token)

                # overwrite the original api sequence with the new one
                report['apis'] = new_api_seq
                dumpJson(report, item_path)
                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()
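# Standalone sketch of the rule above: consecutive repeats are capped at two
# occurrences (unlike removeAPIRedundancyInner below, which collapses them to one).
# No file I/O involved; the helper name is ours, not the repo's:
def _capConsecutiveAtTwo(apis):
    out, prev, seen_twice = [], None, False
    for a in apis:
        if a != prev:
            prev, seen_twice = a, False
        elif seen_twice:
            continue
        else:
            seen_twice = True
        out.append(a)
    return out

# _capConsecutiveAtTwo(['A', 'A', 'A', 'B', 'A']) -> ['A', 'A', 'B', 'A']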
def saveResult(path, desc, acc, los, acc_i, los_i):
    try:
        results = loadJson(path)
    except Exception:   # first run: no result file yet
        results = {'results': []}

    results['results'].append({
        'acc': acc,
        'loss': los,
        'acc_interval': acc_i,
        'loss_interval': los_i,
        'desc': desc
    })
    dumpJson(results, path)
def statMalClassesOnNamesFNcb(reporter_, list_, dict_):
    # dump_log_path and scale_stairs are defined in the enclosing scope
    for f, c in dict_.items():
        print(f, len(c))

    if dump_log_path is not None:
        dumpJson(dict_, dump_log_path)

    counts = [0] * len(scale_stairs)
    for family, f_list in dict_.items():
        for i, s in enumerate(scale_stairs):
            if len(f_list) >= s:
                counts[i] += 1

    for s, c in zip(scale_stairs, counts):
        print("More than %d items:" % s, c)
def checkVersion(cur_v):
    ver_cfg = loadJson('version.json')
    last_ver = ver_cfg['lastRunVersion']

    if cur_v == last_ver:
        logging.warning(ver_check_warning_template % cur_v)
        opt = input('>>>')
        if opt == 'y' or opt == '1' or opt == '':
            return
        else:
            sys.exit(1)
    else:
        ver_cfg['lastRunVersion'] = cur_v
        dumpJson(ver_cfg, 'version.json')
def dumpDatasetSplitStruct(base_path, dump_path, desc: list, verbose=True):
    dump = {"desc": desc}
    for split in ['train', 'validate', 'test']:
        if verbose:
            print(f"[dumpDatasetSplitStruct] {split}")
        folders = []
        for folder in os.listdir(base_path + split + '/api'):  # the api folder is the reference
            folders.append(folder)
        dump[split] = folders

    dumpJson(dump, dump_path)
    if verbose:
        print('-- Done --')
def convertToNGramSeq(parent_path,
                      window=3,
                      ngram_dict=None,      # the sorted NGram statistics dict
                      ngram_max_num=None,   # keep only the top-n NGrams; obtainable from the stat function, or leave unset
                      class_dir=False):
    reporter = Reporter()

    if ngram_dict is not None and ngram_max_num is not None:
        # a set makes the membership test below O(1) instead of a list scan
        valid_ngrams = set(list(ngram_dict.keys())[:ngram_max_num])
    else:
        valid_ngrams = None

    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'
        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                ngram_seq = []
                report = loadJson(folder_path + item)
                api_seq = report['apis']

                for i in range(len(api_seq) - window):
                    ngram = strlistToStr(api_seq[i:i + window])
                    # keep the ngram if no selection was given, or it is among the selected ones
                    if valid_ngrams is None or ngram in valid_ngrams:
                        ngram_seq.append(ngram)

                # write back to the original file
                report['apis'] = ngram_seq
                dumpJson(report, folder_path + item)
                reporter.logSuccess()

            except Exception as e:
                reporter.logError(entity=folder + '/' + item, msg=str(e))
                continue

    reporter.report()
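# Sliding-window sketch of the conversion above on a toy sequence. strlistToStr
# is assumed to join the tokens (the actual separator lives in that helper);
# note that range(len(seq) - window) skips the final full window:
#
#   seq = ['A', 'B', 'C', 'D']
#   [seq[i:i + 3] for i in range(len(seq) - 3)]   # -> [['A', 'B', 'C']]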
def extractAPISequenceFromRawInner(count_, file_path_, report_, list_, dict_, **kwargs):
    # dst_path is defined in the enclosing scope
    print("# %d" % count_, end=' ')

    new_report = {}
    new_report['sha1'] = report_['target']['file']['sha1']
    new_report['name'] = report_['target']['file']['name']
    new_report['sha256'] = report_['target']['file']['sha256']
    new_report['sha512'] = report_['target']['file']['sha512']
    md5 = new_report['md5'] = report_['target']['file']['md5']

    apis = []
    for process in report_['behavior']['processes']:
        for call in process['calls']:
            apis.append(call['api'])
    new_report['apis'] = apis

    dumpJson(new_report, dst_path + md5 + '.json')
    return list_, dict_
def removeAPIRedundancyInner(count_, filep_, report_, list_, dict_, **kwargs):
    print('# %d' % count_, end=' ')

    new_report = {key: val for key, val in report_.items()}
    new_apis = []

    # collapse every run of identical consecutive APIs to a single occurrence
    base = 0
    apis = report_['apis']
    while base < len(apis):
        shift = 1
        while base + shift < len(apis) and apis[base + shift] == apis[base]:
            shift += 1
        new_apis.append(apis[base])
        base += shift

    new_report['apis'] = new_apis
    dumpJson(new_report, filep_)
    return list_, dict_
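# Standalone sketch of the collapsing loop above (no file I/O; the helper
# name is ours, not the repo's):
def _collapseConsecutive(apis):
    out = []
    for a in apis:
        if not out or out[-1] != a:
            out.append(a)
    return out

# _collapseConsecutive(['A', 'A', 'B', 'B', 'B', 'A']) -> ['A', 'B', 'A']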
def vtReportByHash(hash, report_save_path, timeout=300):
    report_params = {'apikey': apikey, 'resource': hash}
    try:
        start_time = time.time()
        print('fetching report...')
        report = requests.get(report_url, params=report_params, timeout=timeout)
        report = report.json()
        dumpJson(report, report_save_path, indent=None)
        end_time = time.time()
        print('time consuming: %.2f' % (end_time - start_time))
        print('-------------------------------------------')
        print('')
        return True
    except BaseException as e:
        print('Error when fetching report of %s, %s' % (hash, str(e)))
        print('waiting...')
        time.sleep(10)
        return False
def trainW2Vmodel(seqs, sg=0, size=64, min_count=1, cbow_mean=1,
                  save_matrix_path=None, save_word2index_path=None,
                  padding=True):    # whether to add a pad embedding to the W2V matrix
    printBulletin('Training Word2Vector...')
    model = Word2Vec(seqs, size=size, sg=sg, min_count=min_count, cbow_mean=cbow_mean)

    printBulletin('Saving...')
    matrix = model.wv.vectors
    word2index = {}

    if padding:
        # prepend an all-zero row so that index 0 is reserved for <PAD>
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    for i, w in enumerate(model.wv.index2word):
        # idx=0 is reserved for padding, so every index is shifted by 1
        word2index[w] = i + 1 if padding else i
    if padding:
        # only map <PAD> when a pad row exists; otherwise 0 would collide
        # with the first real word
        word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)
    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)
    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')
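# Usage sketch on toy sequences (paths are hypothetical). Note that size= and
# model.wv.index2word follow the gensim 3.x API; gensim 4.x renamed them to
# vector_size= and model.wv.index_to_key:
#
#   seqs = [['NtOpenFile', 'NtReadFile'], ['NtOpenFile', 'NtClose']]
#   trainW2Vmodel(seqs, size=64,
#                 save_matrix_path='wordMatrix.npy',
#                 save_word2index_path='word2index.json')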
def mapAndExtractTopKNgram(dir_path,
                           ngram_stat_log_path,
                           K, N,
                           class_dir=True,
                           map_dump_path=None):
    ngram_fre = loadJson(ngram_stat_log_path)
    sorted_ngrams = sorted(ngram_fre.items(), key=lambda x: x[1], reverse=True)[:K]
    # map each NGram to an index; 0 is reserved for padding
    topk_ngrams = {x[0]: i + 1 for i, x in enumerate(sorted_ngrams)}
    topk_ngrams['<PAD>'] = 0

    def mapAndExtractTopKNgramInner(count_, filep_, report_, list_, dict_, **kwargs):
        print('# %d' % count_, end=' ')
        new_seq = []
        apis = report_['apis']

        for i in range(len(apis)):
            if i + N >= len(apis):   # note: this also skips the final full window
                break
            ngram = '/'.join(apis[i:i + N])
            if ngram in topk_ngrams:
                new_seq.append(topk_ngrams[ngram])

        new_report = {k: v for k, v in report_.items()}
        new_report['apis'] = new_seq
        dumpJson(new_report, filep_)
        return list_, dict_

    datasetTraverse(dir_path=dir_path,
                    exec_kernel=mapAndExtractTopKNgramInner,
                    class_dir=class_dir)

    if map_dump_path is not None:
        dumpJson(topk_ngrams, map_dump_path)
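# Sketch of the index mapping built above for K=2 and N=3, assuming a
# hypothetical frequency log {'A/B/C': 10, 'B/C/D': 7, 'C/D/E': 1}:
#
#   topk_ngrams == {'A/B/C': 1, 'B/C/D': 2, '<PAD>': 0}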
def filterApiSequence(json_path, api_list,
                      keep_or_filter=True,   # True: filter out the listed APIs; False: keep only the listed APIs
                      is_class_dir=True):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        if is_class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                new_api_seq = []
                for api_token in report['apis']:
                    # when filtering, keep tokens not in the list;
                    # when keeping, keep tokens in the list
                    if (api_token in api_list) ^ keep_or_filter:
                        new_api_seq.append(api_token)

                # overwrite the original api sequence with the new one
                report['apis'] = new_api_seq
                dumpJson(report, item_path)
                reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    reporter.report()
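# Truth table of the XOR test above, on hypothetical tokens (standalone demo;
# the helper name is ours, not the repo's):
def _filterApiSequenceDemo():
    api_list = {'NtClose'}
    seq = ['NtClose', 'NtOpenFile']
    # keep_or_filter=True  -> drop the listed APIs:
    assert [a for a in seq if (a in api_list) ^ True] == ['NtOpenFile']
    # keep_or_filter=False -> keep only the listed APIs:
    assert [a for a in seq if (a in api_list) ^ False] == ['NtClose']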