Example #1
import numpy as np
from pprint import pprint

import config
from builder import buildPlot
from utils.file import loadJson
from utils.manager import PathManager

dataset = 'virushare-20'
version = 32
model = 'ProtoNet'
report_iter = 100
val_episode = 50

pm = PathManager(dataset, version=version, model_name=model)

train_stat_data = loadJson(pm.doc() + 'train_stat.json')
train_config = loadJson(pm.doc() + 'train.json')
config.reloadArbitraryConfig(train_config, reload_config_list=['plot'])
config.plot.Enabled = True
vis = buildPlot(config.plot)

pprint(train_config)

train_metric = np.array(train_stat_data['train']['metrics']).reshape(
    (-1, report_iter))  # assumes metric_num == 1 here
train_loss = np.array(train_stat_data['train']['loss']).reshape(
    (-1, report_iter))
# the total number of validation records depends on val_episode
val_metric = np.array(train_stat_data['validate']['metrics']).reshape(
    (-1, val_episode))
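
# A hedged sketch (assumes metric_num == 1, as noted above): collapse each
# report block to its mean so every reporting interval gives one curve point.
train_metric_curve = train_metric.mean(axis=1)
train_loss_curve = train_loss.mean(axis=1)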
Example #2
print(f"device: {cfg.deviceId()}")
print('*'*50)

################################################
#---------------------- Data definition ------------------
################################################

expand = loss_func != 'nll'  # MSELoss needs one-hot targets; NLLLoss takes class indices

loss = t.nn.NLLLoss().cuda() \
    if loss_func=='nll' else \
    t.nn.MSELoss().cuda()
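
# A minimal sketch of what the expand flag implies downstream (hypothetical
# expansion site; torch's one_hot shown for illustration): MSELoss needs
# one-hot float targets, while NLLLoss consumes integer class indices.
# if expand:
#     labels = t.nn.functional.one_hot(labels, num_classes=n).float()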

printState('init managers...')
train_path_manager = PathManager(dataset=data_folder,
                                 d_type='train',
                                 model_name=model_name,
                                 version=version)
val_path_manager = PathManager(dataset=data_folder,
                               d_type='validate',
                               model_name=model_name,
                               version=version)

train_dataset = SeqFileDataset(train_path_manager.FileData(),
                               train_path_manager.FileSeqLen(),
                               N)
val_dataset = SeqFileDataset(val_path_manager.FileData(),
                             val_path_manager.FileSeqLen(),
                             N)
# train_dataset = ImageFileDataset(train_path_manager.FileData(), N, rd_crop_size=224)
# val_dataset = ImageFileDataset(val_path_manager.FileData(), N, rd_crop_size=224)
Example #3
print("Fine-tuning")
print("FineTuning Epoch:", ft_epoch)
print('Used dataset: %s'%data_folder)
print('Version: %d'%version)
print(f"{k}-shot {n}-way")
print(f"device: {cfg.deviceId()}")
print('*'*50)


################################################
#---------------------- Data definition ------------------
################################################

printState('init managers...')
test_path_manager = PathManager(dataset=data_folder,
                                d_type=USED_SUB_DATASET,
                                version=version)

################################################
#---------------------- Load model parameters ------------------
################################################

model_cfg = TrainingConfigManager('./runConfig.json')

modelParams = model_cfg.modelParams()

LRDecayIters, LRDecayGamma, optimizer_type, \
weight_decay, loss_func_name, default_lr, lrs, \
taskBatchSize, criteria = model_cfg.trainingParams()

test_dataset = SeqFileDataset(test_path_manager.FileData(),
                              test_path_manager.FileSeqLen(),
                              N)
Example #4
# splitDatas(src=man.DatasetBase()+'train/',
#            dest=man.DatasetBase()+'validate/',
#            ratio=30,
#            mode='x',
#            is_dir=True)
# splitDatas(src=man.DatasetBase()+'train/',
#            dest=man.DatasetBase()+'test/',
#            ratio=30,
#            mode='x',
#            is_dir=True)
################################################################

# Build the index-based data files
################################################################
for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset='virushare-20-3gram-tfidf', d_type=d_type)

    makeDataFile(json_path=manager.Folder(),
                 w2idx_path=manager.WordIndexMap(),
                 seq_length_save_path=manager.FileSeqLen(),
                 data_save_path=manager.FileData(),
                 idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                 num_per_class=20,
                 max_seq_len=700)
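
# Inferred from the parameter names: makeDataFile reads the per-class JSON
# folder plus the word-index map, then writes three artifacts per split: the
# index-encoded data tensor (data_save_path), the original sequence lengths
# (seq_length_save_path), and the index-to-class mapping (idx2cls_mapping_save_path).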
################################################################

# renameItemFolder('/home/asichurter/datasets/JSONs/LargePE-100-original/')

# Sequence-length distribution statistics
################################################################
# apiStat('/home/asichurter/datasets/JSONs/HKS/all/',
Example #5
    for ck, cv in re.items():
        for kk, kv in cv.items():
            if kv > best_acc:
                best_acc = kv
                best_c = ck
                best_k = kk

    return best_c, best_k


if __name__ == "__main__":
    epoch = 5000
    seq_len = 50
    n_cluster = 30
    n_range = (15, 30)
    mng = PathManager("HKS-api")

    # findOptK(mng.WordEmbedMatrix(), k_range=(2,100))
    # apiCluster(mng.WordEmbedMatrix(), mng.DataRoot()+"MarkovClusterMapping.json", cluster_num=n_cluster)
    # makeClusteredData(json_path=mng.Folder(),
    #                   cluster_path=mng.DataRoot()+"MarkovClusterMapping.json",
    #                   word_map_path=mng.WordIndexMap(),
    #                   dump_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                   max_len=seq_len)
    # scoreMarkovEpisode(clustered_data_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                    epoch=2000,
    #                    n_cluster=n_cluster,
    #                    maxlen=seq_len)

    # re = gridSearch(c_values=list(range(*n_range)),
    #                 k_values=[i*50 for i in range(1,11)],
Example #6
#                 continue
#
#             items = random.sample(candidates, num_constrain)
#
#             for item in items:
#                 shutil.copy(src_path+folder+'/'+item,
#                             dst_path+folder+'/'+item)
#
#             reporter.logSuccess()
#
#         except Exception as e:
#             reporter.logError(entity=folder+'/'+item,
#                               msg=str(e))

if __name__ == '__main__':
    manager = PathManager(dataset='virushare_20', d_type='all')
    '''
    Call order: extract -> mapping -> removeRedundance -> (ngram) -> apiStat
                -> stat_classes -> collect
    '''
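    # A minimal sketch of that call order (names from the docstring above; the
    # argument lists are assumptions, not the real signatures):
    # extract(...)           # pull API call sequences out of the raw reports
    # mapping(...)           # map raw API names to canonical ones
    # removeRedundance(...)  # strip redundant repeated calls
    # ngram(...)             # optional: convert sequences to N-grams
    # apiStat(...)           # sequence length / frequency statistics
    # stat_classes(...)      # per-class sample counts
    # collect(...)           # gather the selected files into dataset folders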

    # According to the PE split, collect the selected files from the JSON dataset into folders
    #----------------------------------------------------------------
    # src_path = '/home/asichurter/datasets/JSONs/jsons - 副本/'
    # json_list_path = '/home/asichurter/datasets/PEs/virushare-20-after-increm/all/'
    # dst_path = '/home/asichurter/datasets/JSONs/virushare-50-original/'
    #
    # item_list = []
    # for folder in os.listdir(json_list_path):
    #     item_list += os.listdir(json_list_path+folder+'/')
Example #7
    if padding:
        # index 0 is reserved for padding, so prepend a zero row to the matrix
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    # shift every index by 1 when padding, because idx=0 is kept for '<PAD>'
    for i, w in enumerate(model.wv.index2word):
        word2index[w] = i + 1 if padding else i
    if padding:
        word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)

    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)

    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')


if __name__ == '__main__':
    manager = PathManager(dataset='HKS-api', d_type='all')

    # print(manager.FileData())

    seqs = aggregateApiSequences(manager.Folder())
    trainW2Vmodel(seqs,
                  save_matrix_path=manager.WordEmbedMatrix(),
                  save_word2index_path=manager.WordIndexMap(),
                  size=128)
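
    # A hedged usage sketch: reload the two artifacts written above
    # (np.load for the matrix; loadJson is the repo's JSON helper):
    # matrix = np.load(manager.WordEmbedMatrix())
    # word2index = loadJson(manager.WordIndexMap())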
# -*- coding: utf-8 -*-

from preliminaries.dataset import makeDataFile
from utils.manager import PathManager

dataset_name = '...'
item_per_family = 20
max_seq_len = 300

for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset=dataset_name, d_type=d_type)

    makeDataFile(json_path=manager.Folder(),
                 w2idx_path=manager.WordIndexMap(),
                 seq_length_save_path=manager.FileSeqLen(),
                 data_save_path=manager.FileData(),
                 idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                 num_per_class=item_per_family,
                 max_seq_len=max_seq_len)

#                    log_dump_path='/home/omnisky/NewAsichurter/FusionData/reports/Per40_3gram_stat.json')
#
# mapAndExtractTopKNgram(dir_path='/home/omnisky/NewAsichurter/FusionData/api/LargePE-API-Per40/',
#                        ngram_stat_log_path='/home/omnisky/NewAsichurter/FusionData/reports/Per40_3gram_stat.json',
#                        K=5000,
#                        N=3,
#                        class_dir=True,
#                        map_dump_path='/home/omnisky/NewAsichurter/FusionData/reports/Per40_3gram_map.json')

# makeDatasetDirStruct(base_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per35/')
#
# sampleClassWiseData(dst_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per35/all/api/',
#                     log_file_path="/home/omnisky/NewAsichurter/FusionData/reports/class_stat_after_processing_log.json",
#                     num_per_class=35)
#
pm = PathManager(dataset="virushare-20")
# trainGloVe(base_path=pm.rootBase(),
#            dataset='virushare-20',
#            size=300,
#            type='all')

# renamePEbyMD5fromApi(api_dir_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per40/all/api/',
#                      pe_dir_path='/home/omnisky/NewAsichurter/FusionData/datasets/LargePE-Per40/all/pe/')

# convertDir2Image(dir_path='F:/FSL_mal_data/datasets/virushare-20/all/pe/',
#                  dst_path='F:/FSL_mal_data/datasets/virushare-20/all/img/')

# splitDataset(dataset_path=pm.datasetBase(),
#              validate_ratio=20,
#              test_ratio=20)
Example #10
# }

# for ver, conf in tasks.items():
#     machine.addTask('train', flatten_update_config=conf)
#     machine.addTask('test', flatten_update_config={
#         'task|version': ver,
#         'load_type': 'best',
#     })
#     machine.addTask('test', flatten_update_config={
#         'task|version': ver,
#         'load_type': 'last',
#     })

s_ver, e_ver = 318, 326
for ver in range(s_ver, e_ver + 1):
    pm = PathManager(dataset='virushare-20', version=ver)
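    # drop any stale result file so the 'best' and 'last' test tasks below start clean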
    if os.path.exists(pm.doc() + 'test_result.json'):
        os.remove(pm.doc() + 'test_result.json')
    machine.addTask('test',
                    flatten_update_config={
                        'task|version': ver,
                        'load_type': 'best',
                    })
    machine.addTask('test',
                    flatten_update_config={
                        'task|version': ver,
                        'load_type': 'last',
                    })

# machine.addTask('train',
#                 {
Example #11
print('*' * 50)
print('Model Name: %s' % model_type)
print('Used dataset: %s' % data_folder)
print('Version: %d' % version)
print(f"{k}-shot {n}-way")
print(f"device: {cfg.deviceId()}")
print("Is Random Test:", MODEL_RANDOM_STATE)
print('*' * 50)

################################################
#---------------------- Data definition ------------------
################################################

printState('init managers...')
test_path_manager = PathManager(dataset=data_folder,
                                d_type=USED_SUB_DATASET,
                                model_name=model_name,
                                version=version)

################################################
#---------------------- Load model parameters ------------------
################################################

param_config_path = 'runConfig.json' if MODEL_RANDOM_STATE \
    else test_path_manager.Doc() + 'config.json'
model_cfg = TrainingConfigManager(param_config_path)

modelParams = model_cfg.modelParams()

LRDecayIters, LRDecayGamma, optimizer_type,\
weight_decay, loss_func, default_lr, lrs, \
taskBatchSize, criteria = model_cfg.trainingParams()
Example #12
            for item in items:
                shutil.copy(pj(base_dataset_path, tp, folder, item),
                            pj(new_dataset_path, tp, folder, item))

    shutil.copy(pj(base_dataset_path, 'data', 'matrix.npy'),
                pj(new_dataset_path, 'data', 'matrix.npy'))
    shutil.copy(pj(base_dataset_path, 'data', 'wordMap.json'),
                pj(new_dataset_path, 'data', 'wordMap.json'))

if __name__ == '__main__':
    original_dataset_name = 'virushare-20-3gram-tfidf'
    new_dataset_name = 'virushare-20-3gram-tfidf-general'
    N = 10
    seq_len = 200

    pm = PathManager(dataset=original_dataset_name)
    makeGeneralTestDataset(base_dataset_path=pm.DatasetBase(),
                           new_dataset_path=pm.ParentBase()+new_dataset_name+'/',
                           train_num_per_class=N,
                           include_test=False)
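    # makeGeneralTestDataset re-splits the base dataset so each class keeps
    # train_num_per_class samples for training; the embedding matrix and word
    # map are copied over unchanged (see the shutil.copy calls above).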
    for d_type in ['train', 'validate', 'test']:
        manager = PathManager(dataset=new_dataset_name, d_type=d_type)

        makeDataFile(json_path=manager.Folder(),
                     w2idx_path=manager.WordIndexMap(),
                     seq_length_save_path=manager.FileSeqLen(),
                     data_save_path=manager.FileData(),
                     idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                     num_per_class=N,
                     max_seq_len=seq_len)