Example #1
def generateConfigReport(dataset, include_result=False, dump_path=None):
    mng = PathManager(dataset)

    report = {}
    warnings = []

    for doc_dir in tqdm(os.listdir(mng.DocBase())):
        config_path = mng.DocBase() + doc_dir + '/'

        try:
            cfg = loadJson(config_path + 'config.json')

            report[int(cfg['version'])] = {
                '__model': cfg['modelName'],  # leading underscores make these keys sort first
                '_k-n-qk': '-'.join([str(cfg['k']),
                                     str(cfg['n']),
                                     str(cfg['qk'])]),
                'desc': cfg['description']
            }

            if include_result:
                res = loadJson(config_path + 'testResult.json')
                report[int(cfg['version'])]['results'] = res['results']

        except Exception as e:
            warnings.append('Error occurred when processing %s: %s' %
                            (doc_dir, str(e)))

    for w in warnings:
        logging.warning(w)

    dump_path = mng.DatasetBase() + 'summary.json' if dump_path is None else dump_path
    dumpJson(report, dump_path, sort=True)
Example #2
def extractAPISeqOnLog(pre_log_path, dst_path, log_dump_path=None):
    logs = loadJson(pre_log_path)
    reporter = Reporter()

    for i,item in enumerate(logs['valid_files']):
        print('#', i+1, end=' ')
        try:
            report_ = loadJson(item['rawPath'])

            new_report = {}
            new_report['sha1'] = report_['target']['file']['sha1']
            new_report['name'] = report_['target']['file']['name']
            new_report['sha256'] = report_['target']['file']['sha256']
            new_report['sha512'] = report_['target']['file']['sha512']
            md5 = new_report['md5'] = report_['target']['file']['md5']

            apis = []
            for process in report_['behavior']['processes']:
                for call in process['calls']:
                    apis.append(call['api'])
            new_report['apis'] = apis
            dumpJson(new_report, dst_path+md5+'.json')

            reporter.logSuccess()
            print("Success")
        except Exception as e:
            reporter.logError(entity=item['rawPath'],
                              msg=str(e))
            print("Error:", str(e))

    reporter.report()
    if log_dump_path is not None:
        reporter.dump(log_dump_path)
Example #3
    def __init__(self,
                 exe_bin='python',                      # name of the Python executable
                 relative_path_config='../config/',     # relative path from the working directory to the config directory
                 relative_path_run='../run/',           # relative path from the working directory to the run directory
                 check_verbose=True,                    # whether to run the verbose check (otherwise silent mode)
                 flags=None,                            # run flags: each (k, v) pair is rendered as "-(k) (v)"
                 param_type_config_sep_symbol='|'):     # level separator in parameter names when adding tasks by parameter
        self.TimeFormatter = TimeFormatter()
        self.ExecuteTaskLines = []
        self.ConfigUpdateLines = []
        self.RelativePathToConfig = relative_path_config
        self.RelativePathToRun = relative_path_run
        self.Flags = flags if flags is not None else {}  # avoid sharing a mutable default
        self.ExecuteBin = exe_bin
        self.CheckVerbose = check_verbose
        self.ParamTypeConfigSepSymbol = param_type_config_sep_symbol

        self.ExecuteSuccessCount = 0
        self.ExecuteFailCount = 0

        try:
            # Read and cache the configs when the machine is initialized, so that
            # later edits to the config files cannot change the defaults between queued tasks
            self.ConfigCache = {
                'train': loadJson(self.RelativePathToConfig + 'train.json'),
                'test': loadJson(self.RelativePathToConfig + 'test.json'),
            }
        except FileNotFoundError as e:
            raise FileNotFoundError(f'[ExecuteMachine] Config file not found: {e}')
Example #4
def makeDataFile(json_path,
                 w2idx_path,
                 seq_length_save_path,
                 data_save_path,
                 num_per_class,
                 idx2cls_mapping_save_path=None,
                 max_seq_len=600):

    data_list = []
    folder_name_mapping = {}

    printState('Loading config data...')
    word2index = loadJson(w2idx_path)

    printState('Read main data...')
    for cls_idx, cls_dir in tqdm(enumerate(os.listdir(json_path))):
        class_path = json_path + cls_dir + '/'

        assert num_per_class == len(os.listdir(class_path)), \
            'Class %s has %d samples, but %d were expected!' % \
            (cls_dir, len(os.listdir(class_path)), num_per_class)

        for item in os.listdir(class_path):
            report = loadJson(class_path + item)
            apis = report['apis']
            data_list.append(apis)          # append the API sequence

        folder_name_mapping[cls_idx] = cls_dir

        # label_list += [cls_idx] * num_per_class     # append one class's sample labels

    printState('Converting...')
    data_list = convertApiSeq2DataSeq(data_list,
                                      word2index,
                                      max_seq_len)      # convert to index sequences ready for embedding

    seq_length_list = {i: len(seq) for i, seq in enumerate(data_list)}   # each sequence's true length

    data_list = pad_sequence(data_list, batch_first=True, padding_value=0)  # zero-pad to build a batch tensor

    # pad_sequence only pads to the longest input sequence, so if every sequence is
    # shorter than max_seq_len the padded width would be inconsistent across files
    if data_list.size(1) < max_seq_len:
        padding_size = max_seq_len - data_list.size(1)
        zero_paddings = t.zeros((data_list.size(0), padding_size))
        data_list = t.cat((data_list, zero_paddings), dim=1)

    printState('Dumping...')
    dumpJson(seq_length_list, seq_length_save_path)     # dump sequence lengths to JSON
    if idx2cls_mapping_save_path is not None:
        dumpJson(folder_name_mapping, idx2cls_mapping_save_path)
    t.save(data_list, data_save_path)                   # save the padded data tensor

    printState('Done')
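The manual padding step above guards against a real pad_sequence subtlety: it only pads to the longest sequence actually present in the batch. A minimal standalone sketch, assuming torch and two toy tensors:

import torch as t
from torch.nn.utils.rnn import pad_sequence

seqs = [t.ones(3), t.ones(5)]
batch = pad_sequence(seqs, batch_first=True, padding_value=0)
print(batch.size())   # (2, 5): padded only to the longest input, not to max_seq_len

max_seq_len = 600
if batch.size(1) < max_seq_len:
    padding = t.zeros((batch.size(0), max_seq_len - batch.size(1)))
    batch = t.cat((batch, padding), dim=1)
print(batch.size())   # (2, 600)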
Example #5
def statSatifiedClasses(pe_path,
                        json_path,
                        report_path,
                        stat_stairs=[10, 15, 20],
                        count_dump_path=None):
    # map each sample name to its class
    cls_mapping = {}
    cls_cnt = {}

    warn_err_report = loadJson(report_path)

    for cls in os.listdir(pe_path):
        cls_cnt[cls] = 0
        for item in os.listdir(pe_path + cls + '/'):
            cls_mapping[item] = cls

    for json_item in os.listdir(json_path):
        if json_item not in warn_err_report['errors'] and \
            json_item not in warn_err_report['warnings']:

            cls_cnt[cls_mapping[json_item]] += 1

    stair_cls_cnt = {}
    for stair in stat_stairs:
        stair_cls_cnt[stair] = []

        for cls_name, cnt in cls_cnt.items():
            if cnt >= stair:
                stair_cls_cnt[stair].append(cls_name)

        printBulletin('Classes with at least %d items (%d in total)' %
                      (stair, len(stair_cls_cnt[stair])))

    if count_dump_path is not None:
        dumpJson(stair_cls_cnt, count_dump_path, indent=None)
Example #6
def renameItemsByMD5(
        json_path,  # path of the report files
        item_path,  # path of the data files
        ext_name=''):
    reporter = Reporter()

    md5s = []

    for json_item in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path + json_item)
            md5 = report['md5']

            if md5 in md5s:
                reporter.logWarning(entity=json_item, msg='duplicate MD5')
                success_flag = False

            else:
                md5s.append(md5)
                success_flag = True

            filename = '.'.join(json_item.split('.')[:-1])

            os.rename(json_path + json_item,
                      json_path + md5 + '.json')  # rename the JSON report
            os.rename(item_path + filename + ext_name,
                      item_path + md5 + ext_name)  # rename the data file

            if success_flag:
                reporter.logSuccess()
        except Exception as e:
            reporter.logError(entity=json_item, msg=str(e))

    reporter.report()
Example #7
def mappingApiNormalize(json_path,
                        mapping,
                        dump_mapping_path=None,
                        is_class_dir=False):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):

        items = os.listdir(json_path + folder +
                           '/') if is_class_dir else [folder + '.json']

        for item in items:
            item_path = json_path + folder + '/' + item
            try:
                report = loadJson(item_path)

                for i in range(len(report['apis'])):
                    if report['apis'][i] in mapping:
                        report['apis'][i] = mapping[report['apis'][i]]

                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(item, str(e))

    if dump_mapping_path is not None:
        dumpJson(mapping, dump_mapping_path)

    reporter.report()
Example #8
def parseAndSampleDataset(scale_report_path,
                          base_path,
                          dst_path,
                          num_per_class,
                          checked=True):

    scale_report = loadJson(scale_report_path)

    for family_name in tqdm(scale_report):
        # sample only the classes that meet the scale requirement
        if len(scale_report[family_name]) >= num_per_class:
            random.seed(magicSeed())
            candidates = random.sample(scale_report[family_name], num_per_class)

            if os.path.exists(dst_path + family_name + '/'):
                raise RuntimeError("Folder for class %s already exists in the destination path!" % family_name)
            else:
                os.mkdir(dst_path + family_name + '/')

            for item in candidates:
                folder_name, item_name = item.split("/")
                full_item_name = item_name + '.' + folder_name
                shutil.copy(base_path + item, dst_path + family_name + '/' + full_item_name)

    if checked:
        reporter = Reporter()
        for folder in os.listdir(dst_path):
            if len(os.listdir(dst_path+folder+'/')) != num_per_class:
                reporter.logError(entity=folder, msg="数量不足预期: %d/%d"%
                                                     (len(os.listdir(dst_path+folder+'/')), num_per_class))
            else:
                reporter.logSuccess()
        reporter.report()
Example #9
    def dump(self, path, key='test_result', desc=None):
        desc = desc if desc is not None else []   # avoid sharing a mutable default
        metric_names = self.TestStat.MetricNames
        metrics = self.TestStat.getAlltimeMetric()
        metric_intervals = self.TestStat.getAlltimeMetricInterval()
        loss = self.TestStat.getAlltimeLoss()
        loss_interval = self.TestStat.getAlltimeLossInterval()

        try:
            test_result = loadJson(path)
        except FileNotFoundError:
            test_result = {}

        if key not in test_result:
            test_result[key] = []

        result_obj = {
            'time': time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
            'metrics': {
                n: '%.5f' % v + '±' + '%.5f' % itv
                for n, v, itv in zip(metric_names, metrics, metric_intervals)
            },
            'loss': '%.5f' % loss + '±' + '%.5f' % loss_interval,
            'desc': desc,
            'test_time_per_episode': self.Timer.getTotalTimeStat(stat_type='avg'),
            'gpu_mem_used': self.GPUManager.getGPUUsedMem(unit='M')
        }

        test_result[key].append(result_obj)
        dumpJson(test_result, path)
Example #10
def multi_process_align(str_path,
                        epoch=1000,
                        log_path=None,
                        verbose=False,
                        acc_dump_path=None,
                        process_num=3):

    if acc_dump_path is not None:
        if not os.path.exists(acc_dump_path):
            dumpIterable([], "acc", acc_dump_path)
        acc_sum = loadJson(acc_dump_path)['acc']
    else:
        acc_sum = []

    matrix = loadJson(str_path)['strings']
    queue = Queue()

    tm = StepTimer()
    tm.begin()

    process_pool = []
    for i in range(process_num):
        # note: epoch should be divisible by process_num, otherwise fewer than
        # epoch results are produced and the collector loop below blocks forever
        p = Process(target=scoreEpisodeAlignment,
                    args=(matrix, queue, i + 1, epoch // process_num))
        process_pool.append(p)
        p.start()

    count = 0
    while count < epoch:
        cur_acc = queue.get(block=True)
        count += 1
        print("#", count, "acc=", cur_acc)
        acc_sum.append(cur_acc)
        if acc_dump_path is not None:
            dumpIterable(acc_sum, "acc", acc_dump_path)

    for p in process_pool:
        p.join()

    # while not queue.empty():
    #     acc_sum.append(queue.get())

    print("\n*********************************************")
    print("Avg acc: ", sum(acc_sum) / epoch)
    print("Total time:", tm.step(prt=False, end=True))
    print("95%% belief interval:", calBeliefeInterval(acc_sum))
Example #11
def renameItemFolder(json_path):

    for folder in tqdm(os.listdir(json_path)):

        report = loadJson(json_path + folder + '/report.json')
        name = report['target']['file']['name']

        os.rename(json_path + folder + '/report.json', json_path + folder + '/%s.json'%name)
        os.rename(json_path+folder+'/', json_path+name+'/')
Example #12
def _loadJsonConfig(file_name, err_msg):
    for rel_path in REL_CFG_PATHS:
        try:
            cfg = loadJson(rel_path + file_name)
            return cfg
        except FileNotFoundError:
            print(f"[ConfigInit] not found: {rel_path+file_name}")
            continue
    raise RuntimeError(f"[ConfigInit] pwd: {os.getcwd()}, {err_msg}")
Example #13
    def addRedoTrainTask(self, dataset, version, updated_configs=None, **kwargs):
        pm = PathManager(dataset=dataset,
                         version=version)
        # reload the config of an experiment that has already been run, and run it again
        pre_config = loadJson(joinPath(pm.doc(), 'train.json'))

        # after loading the previous version's parameters, apply any requested updates on top of them
        if updated_configs is not None:
            self._setFields(pre_config, updated_configs)

        self.addTask('train', pre_config, **kwargs)
Example #14
def convertApiCategory(clst_path,
                       word_map_path,
                       json_path,
                       str_dump_path,
                       max_len=300):
    word_map = loadJson(word_map_path)
    cluster_map = loadJson(clst_path)
    seqs = aggregateApiSequences(json_path, is_class_dir=True)

    str_mat = []
    for seq in seqs:
        seq = seq[:max_len]
        s = ""
        for idx in seq:
            api_idx = str(word_map[idx])
            s += chr(65 + cluster_map[api_idx])

        str_mat.append(s)

    dumpIterable(str_mat, title="strings", path=str_dump_path)
Example #15
def saveRunVersionConfig(cur_v, dataset, model, cfg):
    cfg_pack = {
        '__version': cur_v,
        '_model': model,
        '_dataset': dataset,
        'config': cfg,
        '_time': time.asctime()
    }
    ver_cfg = loadJson('version.json')
    ver_cfg[str(cur_v)] = cfg_pack

    dumpJson(ver_cfg, 'version.json', sort=True)
Example #16
def sampleClassWiseData(dst_path, log_file_path, num_per_class=20):

    family_report = loadJson(log_file_path)

    for fname, flist in tqdm(family_report.items()):
        if len(flist) >= num_per_class:
            os.mkdir(dst_path + fname)
            cans = sample(flist, num_per_class)

            for can in cans:
                can_fname = can.split('/')[-1]
                shutil.copy(can, dst_path + fname + '/' + can_fname)
Example #17
def makeClusteredData(json_path,
                      cluster_path,
                      word_map_path,
                      dump_path,
                      max_len=1000):
    word_map = loadJson(word_map_path)
    cluster_map = loadJson(cluster_path)
    seqs = aggregateApiSequences(json_path, is_class_dir=True)

    mat = []
    for seq in seqs:
        seq = seq[:max_len]
        s = []
        for idx in seq:
            s.append(cluster_map[str(word_map[idx])])
        while len(s) < max_len:
            s.append(-1)

        mat.append(s)

    np.save(dump_path, np.array(mat))
Example #18
def extractApiFromJson(path):

    reporter = Reporter()

    for i, item_dir in enumerate(os.listdir(path)):
        print(i, item_dir)

        cur_json_path = path + item_dir + '/%s.json' % item_dir

        new_report = {}
        new_report['apis'] = []

        # the JSON file is assumed to share its folder's name
        try:
            report = loadJson(cur_json_path)

            # handle both processed and raw reports
            if 'target' in report:
                new_report['name'] = report['target']['file']['name']
            else:
                new_report['name'] = report['name']

            # newer reports already contain an 'apis' field
            if 'apis' in report:
                new_report['apis'] = report['apis']

            # in a full report, the APIs live under behavior -> processes -> calls -> api
            else:
                # collect API call names process by process, call by call
                api_call_seq = []
                for process in report['behavior']['processes']:
                    for call in process['calls']:
                        api_call_seq.append(call['api'])

                new_report['apis'] = api_call_seq

            reporter.logSuccess()

        # a KeyError means the source file itself is malformed; leave the API list empty
        except KeyError as e:
            # if the name field was already captured, dump the report with an empty API list
            if 'name' in new_report:
                new_report['apis'] = []
                dumpJson(new_report, cur_json_path)

            # otherwise leave the file untouched
            reporter.logError(item_dir, str(e))

        # other errors are left unhandled
        except Exception as e:
            reporter.logError(item_dir, str(e))

    reporter.report()
Example #19
def removeApiRedundance(json_path, selected_apis=None, class_dir=True):

    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):

        if class_dir:
            items = os.listdir(json_path + folder + '/')
        else:
            items = [folder + '.json']

        for item in items:

            item_path = json_path + folder + '/' + item

            try:
                report = loadJson(item_path)

                redun_flag = False
                redun_api_token = None

                new_api_seq = []

                for api_token in report['apis']:
                    # only the selected APIs are considered;
                    # selected_apis=None means no selection at all
                    if selected_apis is None or \
                        api_token in selected_apis:
                        if api_token != redun_api_token:  # a new API: remember it and reset the flag
                            redun_api_token = api_token
                            redun_flag = False
                        else:
                            if not redun_flag:  # second consecutive occurrence: set the flag
                                redun_flag = True
                            else:
                                continue  # third or later consecutive occurrence: skip the redundant API

                        new_api_seq.append(api_token)

                # overwrite the original API sequence with the compressed one
                report['apis'] = new_api_seq
                dumpJson(report, item_path)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(folder, str(e))

    reporter.report()
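The flag logic above compresses every run of identical APIs down to at most two consecutive occurrences. A minimal standalone sketch of the same rule, assuming plain Python lists:

def compressRepeats(apis, max_repeat=2):
    # keep at most max_repeat consecutive occurrences of each API
    out = []
    for api in apis:
        if len(out) >= max_repeat and all(a == api for a in out[-max_repeat:]):
            continue
        out.append(api)
    return out

# compressRepeats(['open', 'read', 'read', 'read', 'close'])
# -> ['open', 'read', 'read', 'close']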
Example #20
    def saveResult(path, desc, acc, los, acc_i, los_i):
        try:
            results = loadJson(path)
        except FileNotFoundError:
            results = {'results': []}

        results['results'].append({
            'acc': acc,
            'loss': los,
            'acc_interval': acc_i,
            'loss_interval': los_i,
            'desc': desc
        })

        dumpJson(results, path)
Example #21
def collectPEwithAPI(
        api_dir_path,
        pe_dir_path,
        dst_path,
        class_dir=True,
        name_prefix=None,  # the suffix is always .json, since JSON reports are what gets read
        log_dump_path=None):

    print("[CollectPEwithAPI] Preparing...")
    pe_folder_map = {
        folder: os.listdir(pe_dir_path + folder)
        for folder in os.listdir(pe_dir_path)
    }
    reporter = Reporter()

    print("[CollectPEwithAPI] Starting...")
    for folder in tqdm(os.listdir(api_dir_path)):
        folder_path = api_dir_path + folder + '/'
        if class_dir:
            items = os.listdir(folder_path)
            os.mkdir(dst_path + folder + '/')
            dst_folder = dst_path + folder + '/'
        else:
            items = [name_prefix + '.json']  # for unclassified files, do not recreate class folders when copying
            dst_folder = dst_path

        for item in items:
            try:
                report = loadJson(folder_path + item)
                name = report['name']

                found_flag = False
                for pe_folder in pe_folder_map:
                    if name in pe_folder_map[pe_folder]:
                        shutil.copy(pe_dir_path + pe_folder + '/' + name,
                                    dst_folder + name)
                        found_flag = True
                        break
                if not found_flag:
                    reporter.logError(entity=name, msg="File not found")
                else:
                    reporter.logSuccess()
            except Exception as e:
                reporter.logError(entity=folder_path + item, msg=str(e))

    reporter.report()
    if log_dump_path is not None:
        reporter.dump(log_dump_path)
Example #22
def checkVersion(cur_v):
    ver_cfg = loadJson('version.json')
    last_ver = ver_cfg['lastRunVersion']

    if cur_v == last_ver:
        logging.warning(ver_check_warning_template % cur_v)
        opt = input('>>>')

        if opt in ('y', '1', ''):
            return
        else:
            sys.exit(1)

    else:
        ver_cfg['lastRunVersion'] = cur_v
        dumpJson(ver_cfg, 'version.json')
Example #23
def statValidJsonReport(dir_path, len_thresh=10,
                        class_dir=False,
                        name_prefix=None,
                        dump_valid_path=None):

    valid = invalid = too_short = total = 0
    valid_list = []

    for folder in os.listdir(dir_path):
        folder_path = dir_path+folder+'/'
        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [name_prefix+'.json']

        for item in items:
            total_length = 0
            total += 1
            print('#%d'%total, folder_path+item, end=': ')

            try:
                report = loadJson(folder_path+item)
                raw_file_name = report['target']['file']['name']
                for process in report['behavior']['processes']:
                    total_length += len(process['calls'])

                if total_length < len_thresh:
                    too_short += 1
                    print('too short:', total_length)
                else:
                    valid += 1
                    valid_list.append({'file':raw_file_name,
                                       'len':total_length,
                                       'rawPath':folder_path+item})
                    print('valid')
            except Exception as e:
                invalid += 1
                print('Error: ', str(e))

    print('Total:', total)
    print('Valid:', valid)
    print('Invalid:', invalid)
    print('Too Short:', too_short)

    if dump_valid_path is not None:
        dumpIterable(valid_list, title='valid_files', path=dump_valid_path)  # key read back as logs['valid_files'] in extractAPISeqOnLog
Example #24
def revertDatasetSplit(dataset, dump_path):
    man = PathManager(dataset)
    split_dump = loadJson(dump_path)

    deleteDatasetSplit(man.datasetBase())

    for typ in ['train', 'validate', 'test']:
        print(f"[revertDatasetSplit] {typ}")
        for folder in split_dump[typ]:
            shutil.copytree(src=man.datasetBase() + 'all/api/' + folder + '/',
                            dst=man.datasetBase() + typ + '/api/' + folder +
                            '/')
            shutil.copytree(src=man.datasetBase() + 'all/img/' + folder + '/',
                            dst=man.datasetBase() + typ + '/img/' + folder +
                            '/')

    print('-- Done --')
Example #25
def revertDatasetSplit(dataset, dump_path):
    man = PathManager(dataset)
    split_dump = loadJson(dump_path)

    deleteDatasetSplit(man.DatasetBase())

    for typ in ['train', 'validate', 'test']:

        # delete any existing split
        # os.system('rm -rf {path}/*'.format(path=man.DatasetBase()+typ))

        print(typ)
        for folder in split_dump[typ]:
            shutil.copytree(src=man.DatasetBase()+'all/'+folder+'/',
                            dst=man.DatasetBase()+typ+'/'+folder+'/')

    print('-- Done --')
Example #26
def convertToNGramSeq(
        parent_path,
        window=3,
        ngram_dict=None,       # sorted NGram dict from the statistics step
        ngram_max_num=None,    # number of top NGrams to keep; obtainable from the statistics function, or left unset
        class_dir=False):

    reporter = Reporter()

    if ngram_dict is not None and ngram_max_num is not None:
        # a set makes the membership test below O(1)
        valid_ngrams = set(list(ngram_dict.keys())[:ngram_max_num])
    else:
        valid_ngrams = None

    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                ngram_seq = []
                report = loadJson(folder_path + item)
                api_seq = report['apis']

                for i in range(len(api_seq) - window + 1):  # "+ 1" so the final window is included
                    ngram = strlistToStr(api_seq[i:i + window])

                    # add the ngram only when no filter is given or it is among the selected ngrams
                    if valid_ngrams is None or ngram in valid_ngrams:
                        ngram_seq.append(ngram)

                # write back to the original file
                report['apis'] = ngram_seq
                dumpJson(report, folder_path + item)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError(entity=folder + '/' + item, msg=str(e))
                continue

    reporter.report()
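The loop bound above is the standard sliding-window count: a sequence of length L yields L - window + 1 windows. A one-line check:

api_seq = ['a', 'b', 'c', 'd']
window = 3
print([api_seq[i:i + window] for i in range(len(api_seq) - window + 1)])
# [['a', 'b', 'c'], ['b', 'c', 'd']]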
Example #27
def renameCuckooFolders(json_path):
    reporter = Reporter()

    for folder in tqdm(os.listdir(json_path)):
        try:
            report = loadJson(json_path+folder+'/report.json')
            name = report['target']['file']['name']

            os.rename(json_path+folder+'/report.json', json_path+folder+'/%s.json'%name)
            os.rename(json_path+folder, json_path+name)

            reporter.logSuccess()

        except Exception as e:
            reporter.logError(entity=folder, msg=str(e))
            continue

    reporter.report()
Example #28
def statApiFrequency(json_path, is_class_dir=False, threshold=None):

    api_frequency = {}
    total = 0

    for dir_ in tqdm(os.listdir(json_path)):
        dir_path = json_path + dir_ + '/'

        if is_class_dir:
            items = os.listdir(dir_path)
        else:
            items = [dir_ + '.json']

        for item in items:
            apis = loadJson(dir_path + item)['apis']

            for api in apis:
                if api not in api_frequency:
                    api_frequency[api] = 0
                api_frequency[api] += 1
                total += 1

    printState('API frequency statistics')
    # sort by frequency, descending
    api_frequency = sorted(api_frequency.items(),
                           key=lambda x: x[1],
                           reverse=True)

    below_threshold = []

    for i, (api, f) in enumerate(api_frequency):
        print('#%d' % i, api, f / total)
        if threshold is not None:
            # a threshold below 1 is interpreted as a frequency threshold
            if 1 > threshold > f / total:
                below_threshold.append(api)
            # a threshold of 1 or above is interpreted as a rank threshold
            elif i >= threshold >= 1:
                below_threshold.append(api)

    if threshold is not None:
        printState('APIs below threshold %f (%d in total)' % (threshold, len(below_threshold)))
        print(below_threshold)
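Hypothetical calls illustrating the two threshold modes (the path is a placeholder):

# frequency mode: report APIs whose relative frequency is below 0.1%
statApiFrequency('./data/json/', is_class_dir=True, threshold=0.001)

# rank mode: report APIs ranked 50th or lower by frequency
statApiFrequency('./data/json/', is_class_dir=True, threshold=50)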
Example #29
    def __init__(self, data_path, seq_path, N):
        self.Data = t.load(data_path)           # no longer truncated by length: all sequences are stored at the specified length

        seqLength = loadJson(seq_path)
        self.SeqLength = [0] * len(self.Data)
        for i, l in seqLength.items():
            self.SeqLength[int(i)] = l          # record each sequence's length

        self.Label = []
        self.ClassNum = len(self.Data) // N

        assert len(self.Data) % N == 0, \
            'Total data length %d is not a multiple of the per-class sample count %d!' % (len(self.Data), N)

        assert len(self.Data) == len(self.SeqLength), \
            'Total data length %d differs from the sequence-length count %d' % (len(self.Data), len(self.SeqLength))

        for i in range(len(self.Data) // N):
            self.Label += [i] * N
Example #30
def collectJsonByClass(
    pe_path,
    json_path,
    dst_path,
    report_path,
    num_per_class,
    selected_classes,
):
    reporter = Reporter()

    warn_errs = loadJson(report_path)

    def length_filter(x):
        return x not in warn_errs['warnings'] and x not in warn_errs['errors']

    for cls in tqdm(selected_classes):
        dst_dir = dst_path + cls + '/'

        if not os.path.exists(dst_dir):
            os.mkdir(dst_dir)

        # filter those items not satisfying scale requirement
        cand_items = os.listdir(pe_path + cls + '/')
        cand_items = list(filter(length_filter, cand_items))

        # some PE items are missing their corresponding JSON report
        cand_items = list(
            filter(lambda x: os.path.exists(json_path + x + '/'), cand_items))

        cand_items = random.sample(cand_items, num_per_class)

        for item in cand_items:
            try:
                shutil.copy(json_path + item + '/%s.json' % item,
                            dst_dir + '/%s.json' % item)

                reporter.logSuccess()

            except Exception as e:
                reporter.logError('%s/%s' % (cls, item), str(e))

    reporter.report()