示例#1
0
def makeDataFile(json_path,
                 w2idx_path,
                 seq_length_save_path,
                 data_save_path,
                 num_per_class,
                 idx2cls_mapping_save_path=None,
                 max_seq_len=600):
    """Build a padded tensor dataset from per-class JSON API-call reports.

    Reads one sub-directory per class under json_path, each holding JSON
    reports with an 'apis' list, converts API names to indices via the
    word2index mapping, pads every sequence to max_seq_len, and saves:
    per-sample sequence lengths (JSON), optional class-index -> class-name
    mapping (JSON), and the padded data tensor (torch.save).

    num_per_class: expected sample count per class directory (validated).
    """

    data_list = []
    folder_name_mapping = {}

    printState('Loading config data...')
    word2index = loadJson(w2idx_path)

    printState('Read main data...')
    for cls_idx, cls_dir in tqdm(enumerate(os.listdir(json_path))):
        class_path = json_path + cls_dir + '/'

        # BUGFIX: original format string had only two placeholders (%s, %d)
        # for three arguments, which raised a TypeError instead of showing
        # the intended assertion message.
        assert num_per_class == len(os.listdir(class_path)), \
            '数据集中类%s的样本数量%d与期望的样本数量%d不一致!'%\
            (cls_dir, len(os.listdir(class_path)), num_per_class)

        for item in os.listdir(class_path):
            report = loadJson(class_path + item)
            apis = report['apis']
            data_list.append(apis)          # collect the API sequence

        folder_name_mapping[cls_idx] = cls_dir

    printState('Converting...')
    # Convert API-name sequences into numeric index sequences
    data_list = convertApiSeq2DataSeq(data_list,
                                      word2index,
                                      max_seq_len)

    # Per-sample sequence lengths, keyed by sample index
    seq_length_list = {i: len(seq) for i, seq in enumerate(data_list)}

    data_list = pad_sequence(data_list, batch_first=True, padding_value=0)

    # pad_sequence only pads up to the longest sequence in the batch; if every
    # sequence is shorter than max_seq_len, pad the remainder explicitly so
    # all saved tensors have a consistent width.
    if data_list.size(1) < max_seq_len:
        padding_size = max_seq_len - data_list.size(1)
        # BUGFIX: match the data tensor's dtype so t.cat does not fail when
        # the sequences are integer-typed (t.zeros defaults to float32).
        zero_paddings = t.zeros((data_list.size(0), padding_size),
                                dtype=data_list.dtype)
        data_list = t.cat((data_list, zero_paddings), dim=1)

    printState('Dumping...')
    dumpJson(seq_length_list, seq_length_save_path)     # save sequence lengths
    if idx2cls_mapping_save_path is not None:
        dumpJson(folder_name_mapping, idx2cls_mapping_save_path)
    t.save(data_list, data_save_path)                   # save padded data tensor

    printState('Done')
示例#2
0
def makeDatasetDirStruct(base_path):
    """Create the standard dataset directory layout under base_path.

    Builds the split folders (all/train/validate/test), a models folder,
    the cached-data subtree and a doc folder. Raises FileExistsError if
    any sub-directory already exists (os.mkdir semantics).
    """
    if not os.path.exists(base_path):
        os.mkdir(base_path)

    # Ordered so that parent directories ('data/') are created before
    # their children ('data/train/' etc.).
    layout = ('all/', 'train/', 'validate/', 'test/', 'models/',
              'data/', 'data/train/', 'data/validate/', 'data/test/',
              'doc/')
    for sub_dir in layout:
        os.mkdir(base_path + sub_dir)

    printState('Done')
示例#3
0
def statApiFrequency(json_path, is_class_dir=False, threshold=None):
    """Print API call frequencies (descending) over a JSON report corpus.

    json_path: root directory of JSON reports. When is_class_dir is True,
        each sub-directory holds many report files; otherwise sub-directory
        <name>/ is expected to hold a single <name>.json.
    threshold: optional cutoff. A value in (0, 1) is treated as a relative
        frequency threshold; a value >= 1 as a rank threshold. APIs falling
        below the cutoff are collected and printed at the end.
    """
    # Counter replaces the hand-rolled "if key not in dict" counting pattern.
    from collections import Counter

    api_frequency = Counter()

    for dir_ in tqdm(os.listdir(json_path)):
        dir_path = json_path + dir_ + '/'

        if is_class_dir:
            items = os.listdir(dir_path)
        else:
            items = [dir_ + '.json']

        for item in items:
            api_frequency.update(loadJson(dir_path + item)['apis'])

    total = sum(api_frequency.values())

    printState('API频率统计')
    # most_common() yields (api, count) pairs sorted by descending count
    api_frequency = api_frequency.most_common()

    below_threshold = []

    for i, (api, f) in enumerate(api_frequency):
        print('#%d' % i, api, f / total)
        if threshold is not None:
            # threshold < 1: interpreted as a relative-frequency cutoff
            if 1 > threshold > f / total:
                below_threshold.append(api)
            # threshold >= 1: interpreted as a rank cutoff
            elif i >= threshold >= 1:
                below_threshold.append(api)

    if threshold is not None:
        printState('低于%f的API(%d个)' % (threshold, len(below_threshold)))
        print(below_threshold)
示例#4
0
# ---- Test-run configuration pulled from the global config object `cfg` ----
data_folder = cfg.dataset()  # dataset folder name, e.g. 'virushare_20_image'

# few-shot task parameters — presumably k-shot / n-way plus query count and
# sequence length N; NOTE(review): confirm qk/N semantics against cfg docs
k, n, qk, N = cfg.taskParams()
model_type, model_name = cfg.model()

version = cfg.version()

TestingEpoch = cfg.epoch()
USED_SUB_DATASET = cfg.subDataset()  # which split to test on

################################################
#---------------------- define data ------------
################################################

printState('init managers...')
test_path_manager = PathManager(dataset=data_folder,
                                d_type=USED_SUB_DATASET,
                                model_name=model_name,
                                version=version)

################################################
#------------------ load model parameters ------
################################################

# Training-time config is stored as config.json in the run's doc directory
model_cfg = TrainingConfigManager(test_path_manager.Doc() + 'config.json')

modelParams = model_cfg.modelParams()

LRDecayIters, LRDecayGamma, optimizer_type,\
weight_decay, loss_func, default_lr, lrs, taskBatchSize = model_cfg.trainingParams()
示例#5
0
def statNGram(
        parent_path,
        window=3,
        dict_save_path=None,    # optional JSON path for the NGram frequencies
        frequency_stairs=(),    # ascending cumulative-frequency stairs; for
                                # each, report the minimum #NGrams covering it
        class_dir=False):
    """Count sliding-window NGram frequencies over API-call sequences.

    parent_path: root directory of JSON reports. When class_dir is True each
        sub-directory holds many report files; otherwise sub-directory
        <name>/ is expected to hold a single <name>.json.
    window: NGram window size.

    Returns a dict mapping NGram string -> normalized frequency, in
    descending frequency order. Per-item failures are logged via Reporter
    and skipped (best-effort).
    """
    # BUGFIX: frequency_stairs previously defaulted to a mutable list ([]),
    # the shared-mutable-default pitfall; an empty tuple is equivalent here
    # since the parameter is only iterated.

    reporter = Reporter()

    ngram_dict = {}
    total_cnt = 0

    printState('Counting...')
    for folder in tqdm(os.listdir(parent_path)):
        folder_path = parent_path + folder + '/'

        if class_dir:
            items = os.listdir(folder_path)
        else:
            items = [folder + '.json']

        for item in items:
            try:
                seq = loadJson(folder_path + item)['apis']

                # NOTE(review): range(len(seq) - window) skips the final
                # window; kept as-is to preserve existing counts.
                for i in range(len(seq) - window):
                    ngram = strlistToStr(seq[i:i + window])

                    total_cnt += 1
                    ngram_dict[ngram] = ngram_dict.get(ngram, 0) + 1

                reporter.logSuccess()

            except Exception as e:
                # best-effort: record the failing folder and keep going
                reporter.logError(entity=folder, msg=str(e))
                continue

    printState('Processing...')

    # Sort descending by count, then normalize counts into frequencies
    ngram_dict = {ng: cnt / total_cnt
                  for ng, cnt in sorted(ngram_dict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)}

    if dict_save_path is not None:
        dumpJson(ngram_dict, dict_save_path)

    # Report how many of the top NGrams are needed to reach each cumulative
    # frequency stair.
    f_accum = 0.
    idx = 0
    keys = list(ngram_dict.keys())
    max_len = len(keys)
    for f_stair in frequency_stairs:
        while f_accum < f_stair and idx < max_len:
            f_accum += ngram_dict[keys[idx]]
            idx += 1
        # NOTE(review): prints idx + 1 although `idx` NGrams were consumed —
        # looks like an off-by-one; kept as-is to preserve output.
        printBulletin('%f:   %d NGrams' % (f_stair, idx + 1))

    printBulletin('Total: %d NGrams' % len(ngram_dict))

    reporter.report()
    return ngram_dict
示例#6
0
# ---- Run banner ----
print('Version: %d'%version)
print(f"{k}-shot {n}-way")
print(f"device: {cfg.deviceId()}")
print('*'*50)

################################################
#---------------------- define data ------------
################################################

# IDIOM FIX: was `False if loss_func=='nll' else True`.
# NOTE(review): 'expand' presumably flags one-hot/expanded targets for the
# MSE branch below — confirm against downstream use.
expand = loss_func != 'nll'

loss = t.nn.NLLLoss().cuda() \
    if loss_func=='nll' else \
    t.nn.MSELoss().cuda()

printState('init managers...')
train_path_manager = PathManager(dataset=data_folder,
                                 d_type='train',
                                 model_name=model_name,
                                 version=version)
val_path_manager = PathManager(dataset=data_folder,
                               d_type='validate',
                               model_name=model_name,
                               version=version)

# Sequence datasets backed by the padded data tensor + per-sample lengths
train_dataset = SeqFileDataset(train_path_manager.FileData(),
                               train_path_manager.FileSeqLen(),
                               N)
val_dataset = SeqFileDataset(val_path_manager.FileData(),
                             val_path_manager.FileSeqLen(),
                             N)