import os

import torch as t
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

# Project utilities such as printState, printBulletin, loadJson, dumpJson,
# convertApiSeq2DataSeq, strlistToStr and Reporter are assumed to be imported
# from the repository's own helper modules (their import paths are not shown here).


def makeDataFile(json_path, w2idx_path, seq_length_save_path, data_save_path,
                 num_per_class, idx2cls_mapping_save_path=None, max_seq_len=600):
    data_list = []
    folder_name_mapping = {}

    printState('Loading config data...')
    word2index = loadJson(w2idx_path)

    printState('Read main data...')
    for cls_idx, cls_dir in tqdm(enumerate(os.listdir(json_path))):
        class_path = json_path + cls_dir + '/'

        assert num_per_class == len(os.listdir(class_path)), \
            'Class %s has %d samples, which does not match the expected %d!' % \
            (cls_dir, len(os.listdir(class_path)), num_per_class)

        for item in os.listdir(class_path):
            report = loadJson(class_path + item)
            apis = report['apis']
            data_list.append(apis)              # collect the API call sequence of each sample

        folder_name_mapping[cls_idx] = cls_dir
        # label_list += [cls_idx] * num_per_class      # append the labels of one class

    printState('Converting...')
    # Map API names to index sequences using the word2index vocabulary
    data_list = convertApiSeq2DataSeq(data_list, word2index, max_seq_len)
    # Record the (un-padded) length of every sequence
    seq_length_list = {i: len(seq) for i, seq in enumerate(data_list)}
    # Pad with 0 so that all sequences share the same length within the batch
    data_list = pad_sequence(data_list, batch_first=True, padding_value=0)

    # pad_sequence only pads up to the longest sequence in the input; if every
    # sequence is shorter than max_seq_len, the result must be padded further
    # to avoid length mismatches later on.
    if data_list.size(1) < max_seq_len:
        padding_size = max_seq_len - data_list.size(1)
        zero_paddings = t.zeros((data_list.size(0), padding_size))
        data_list = t.cat((data_list, zero_paddings), dim=1)

    printState('Dumping...')
    dumpJson(seq_length_list, seq_length_save_path)                 # save sequence lengths as JSON
    if idx2cls_mapping_save_path is not None:
        dumpJson(folder_name_mapping, idx2cls_mapping_save_path)
    t.save(data_list, data_save_path)                               # save the padded data tensor
    printState('Done')
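# A minimal usage sketch (not part of the original module): all paths and the
# per-class sample count below are hypothetical placeholders; only the call
# signature follows the function defined above.
if __name__ == '__main__':
    makeDataFile(json_path='data/train/',                        # hypothetical folder of per-class JSON reports
                 w2idx_path='data/word2index.json',               # hypothetical vocabulary file
                 seq_length_save_path='data/train_seq_len.json',
                 data_save_path='data/train_data.pth',
                 num_per_class=20,                                # assumed number of samples per class
                 idx2cls_mapping_save_path='data/idx2cls.json',
                 max_seq_len=600)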
def makeDatasetDirStruct(base_path):
    if not os.path.exists(base_path):
        os.mkdir(base_path)

    os.mkdir(base_path + 'all/')
    os.mkdir(base_path + 'train/')
    os.mkdir(base_path + 'validate/')
    os.mkdir(base_path + 'test/')
    os.mkdir(base_path + 'models/')
    os.mkdir(base_path + 'data/')
    os.mkdir(base_path + 'data/train/')
    os.mkdir(base_path + 'data/validate/')
    os.mkdir(base_path + 'data/test/')
    os.mkdir(base_path + 'doc/')

    printState('Done')
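# Hedged usage sketch (not in the original file): because the sub-folders are
# built by plain string concatenation, base_path must end with '/', otherwise
# sibling folders such as 'datasetall/' would be created instead of
# 'dataset/all/'. The path below is a hypothetical placeholder.
if __name__ == '__main__':
    makeDatasetDirStruct('/path/to/new_dataset/')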
def statApiFrequency(json_path, is_class_dir=False, threshold=None):
    api_frequency = {}
    total = 0

    for dir_ in tqdm(os.listdir(json_path)):
        dir_path = json_path + dir_ + '/'

        if is_class_dir:
            items = os.listdir(dir_path)
        else:
            items = [dir_ + '.json']

        for item in items:
            apis = loadJson(dir_path + item)['apis']
            for api in apis:
                if api not in api_frequency:
                    api_frequency[api] = 0
                api_frequency[api] += 1
                total += 1

    printState('API frequency statistics')

    # Sort APIs by frequency in descending order
    api_frequency = sorted(api_frequency.items(), key=lambda x: x[1], reverse=True)

    below_threshold = []
    for i, (api, f) in enumerate(api_frequency):
        print('#%d' % i, api, f / total)
        if threshold is not None:
            # A threshold below 1 is interpreted as a relative-frequency threshold
            if 1 > threshold > f / total:
                below_threshold.append(api)
            # A threshold of 1 or above is interpreted as a rank threshold
            elif i >= threshold >= 1:
                below_threshold.append(api)

    if threshold is not None:
        printState('APIs below %f (%d in total)' % (threshold, len(below_threshold)))
        print(below_threshold)
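# Hedged usage sketch (not part of the original file). The two calls show the
# two threshold modes described in the comments above; the path is a
# hypothetical placeholder.
if __name__ == '__main__':
    # Frequency mode: report APIs whose relative frequency is below 1%
    statApiFrequency('data/reports/', is_class_dir=True, threshold=0.01)
    # Rank mode: report APIs ranked 500th or later by frequency
    statApiFrequency('data/reports/', is_class_dir=True, threshold=500)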
data_folder = cfg.dataset()         # 'virushare_20_image'
k, n, qk, N = cfg.taskParams()
model_type, model_name = cfg.model()
version = cfg.version()

TestingEpoch = cfg.epoch()
USED_SUB_DATASET = cfg.subDataset()

################################################
#----------------------define data--------------
################################################
printState('init managers...')
test_path_manager = PathManager(dataset=data_folder,
                                d_type=USED_SUB_DATASET,
                                model_name=model_name,
                                version=version)

################################################
#----------------------read model parameters----
################################################
model_cfg = TrainingConfigManager(test_path_manager.Doc() + 'config.json')

modelParams = model_cfg.modelParams()

LRDecayIters, LRDecayGamma, optimizer_type, \
weight_decay, loss_func, default_lr, lrs, taskBatchSize = model_cfg.trainingParams()
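# Hedged sketch (not in the original script): the test split can presumably be
# wrapped the same way the train/validate splits are in the training script
# below, reusing the PathManager accessors FileData() and FileSeqLen() and the
# N value read from cfg.taskParams() above. SeqFileDataset is assumed to be
# importable here just as it is there.
test_dataset = SeqFileDataset(test_path_manager.FileData(),
                              test_path_manager.FileSeqLen(),
                              N)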
def statNGram(parent_path,
              window=3,
              dict_save_path=None,      # where to save the NGram frequency table
              frequency_stairs=[],      # cumulative-frequency stairs, must be in ascending order;
                                        # reports the minimum number of NGrams needed to reach each stair
              class_dir=False):
    reporter = Reporter()
    ngram_dict = {}
    total_cnt = 0

    printState('Counting...')
    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                seq = loadJson(folder_path + item)['apis']
                # A sequence of length L yields L - window + 1 NGrams
                for i in range(len(seq) - window + 1):
                    ngram = strlistToStr(seq[i:i + window])
                    total_cnt += 1
                    if ngram not in ngram_dict:
                        ngram_dict[ngram] = 1
                    else:
                        ngram_dict[ngram] += 1
                reporter.logSuccess()
            except Exception as e:
                reporter.logError(entity=folder, msg=str(e))
                continue

    printState('Processing...')
    # Sort NGrams by frequency in descending order
    ngram_dict = dict(
        sorted(ngram_dict.items(), key=lambda x: x[1], reverse=True))

    # Normalize counts to relative frequencies
    for key in ngram_dict.keys():
        ngram_dict[key] = ngram_dict[key] / total_cnt

    if dict_save_path is not None:
        dumpJson(ngram_dict, dict_save_path)

    # Report the cumulative frequency distribution
    f_accum = 0.
    idx = 0
    keys = list(ngram_dict.keys())
    max_len = len(keys)
    for f_stair in frequency_stairs:
        while f_accum < f_stair and idx < max_len:
            f_accum += ngram_dict[keys[idx]]
            idx += 1
        # idx now equals the number of NGrams accumulated so far
        printBulletin('%f: %d NGrams' % (f_stair, idx))

    printBulletin('Total: %d NGrams' % len(ngram_dict))
    reporter.report()

    return ngram_dict
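# Hedged usage sketch (not part of the original file): the path and the stair
# values are hypothetical. With these stairs the function prints how many of
# the most frequent 3-grams are needed to cover 90%, 95% and 99% of all
# observed 3-grams, and saves the full frequency table as JSON.
if __name__ == '__main__':
    statNGram('data/reports/',
              window=3,
              dict_save_path='data/ngram_frequency.json',
              frequency_stairs=[0.9, 0.95, 0.99],
              class_dir=True)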
print('Version: %d' % version)
print(f"{k}-shot {n}-way")
print(f"device: {cfg.deviceId()}")
print('*' * 50)

################################################
#----------------------define data--------------
################################################
expand = (loss_func != 'nll')

loss = t.nn.NLLLoss().cuda() \
    if loss_func == 'nll' else \
    t.nn.MSELoss().cuda()

printState('init managers...')
train_path_manager = PathManager(dataset=data_folder,
                                 d_type='train',
                                 model_name=model_name,
                                 version=version)
val_path_manager = PathManager(dataset=data_folder,
                               d_type='validate',
                               model_name=model_name,
                               version=version)

train_dataset = SeqFileDataset(train_path_manager.FileData(),
                               train_path_manager.FileSeqLen(),
                               N)
val_dataset = SeqFileDataset(val_path_manager.FileData(),
                             val_path_manager.FileSeqLen(),
                             N)
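# Hedged sketch (not in the original script): the files written by makeDataFile
# above are a zero-padded index tensor plus a JSON map of true sequence lengths,
# so a downstream recurrent model could pack a padded batch as shown below.
# The helper name packBatch is illustrative only; how the repository's own
# models actually consume the data is not shown in this section.
from torch.nn.utils.rnn import pack_padded_sequence

def packBatch(padded_batch, lengths):
    # padded_batch: (batch, max_seq_len) tensor padded with 0
    # lengths: the original (un-padded) sequence lengths of each sample
    return pack_padded_sequence(padded_batch,
                                lengths,
                                batch_first=True,
                                enforce_sorted=False)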