Example #1
def gridSearch(c_values, k_values, per_epoch=200):  # grid-search the cluster count and the truncation length
    results = {}
    for ci, c_num in enumerate(c_values):
        results[c_num] = {}
        for ki, k_num in enumerate(k_values):
            print(ci * len(k_values) + ki + 1, "/",
                  len(c_values) * len(k_values))
            mng = PathManager("virushare-20-original")
            # findOptK(mng.WordEmbedMatrix(), k_range=(2,100))
            apiCluster(mng.WordEmbedMatrix(),
                       mng.DataRoot() + "MarkovClusterMapping.json",
                       cluster_num=c_num)
            makeClusteredData(
                json_path=mng.Folder(),
                cluster_path=mng.DataRoot() + "MarkovClusterMapping.json",
                word_map_path=mng.WordIndexMap(),
                dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                max_len=k_num)
            score = scoreMarkovEpisode(clustered_data_path=mng.DataRoot() +
                                       "MarkovClusteredData.npy",
                                       epoch=per_epoch,
                                       n_cluster=c_num,
                                       maxlen=k_num,
                                       verbose=False)
            results[c_num][k_num] = score

    return results
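
The commented-out findOptK call above hints at a helper that sweeps candidate cluster counts before committing to one. A minimal sketch of such a helper, assuming the embedding matrix is stored as a .npy file and using scikit-learn's KMeans with silhouette scoring (the repo's actual selection criterion is not shown; findOptKSketch is a hypothetical name):

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def findOptKSketch(embed_matrix_path, k_range=(2, 100), step=5):
    # Load the word embedding matrix produced by trainW2Vmodel
    matrix = np.load(embed_matrix_path)
    best_k, best_score = None, -1.0
    for k in range(k_range[0], k_range[1], step):
        labels = KMeans(n_clusters=k, n_init=10).fit_predict(matrix)
        score = silhouette_score(matrix, labels)  # higher = tighter, better-separated clusters
        if score > best_score:
            best_k, best_score = k, score
    return best_k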
Example #2
#            mode='x',
#            is_dir=True)
# splitDatas(src=man.DatasetBase()+'train/',
#            dest=man.DatasetBase()+'test/',
#            ratio=30,
#            mode='x',
#            is_dir=True)
################################################################

# Build the index-based dataset
################################################################
for d_type in ['train', 'validate', 'test']:
    manager = PathManager(dataset='virushare-20-3gram-tfidf', d_type=d_type)

    makeDataFile(json_path=manager.Folder(),
                 w2idx_path=manager.WordIndexMap(),
                 seq_length_save_path=manager.FileSeqLen(),
                 data_save_path=manager.FileData(),
                 idx2cls_mapping_save_path=manager.FileIdx2Cls(),
                 num_per_class=20,
                 max_seq_len=700)
################################################################
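
The files written by makeDataFile are consumed elsewhere in the repo; a minimal loader sketch, assuming FileData() points to a NumPy array of padded index sequences and that the sequence-length and index-to-class mappings are JSON (the exact on-disk formats are an assumption here, and loadSplitSketch is a hypothetical helper):

import json
import numpy as np

def loadSplitSketch(manager):
    # Hypothetical loader for the artifacts written by makeDataFile
    data = np.load(manager.FileData())        # padded index sequences
    with open(manager.FileSeqLen()) as f:
        seq_lens = json.load(f)               # per-sample true lengths
    with open(manager.FileIdx2Cls()) as f:
        idx2cls = json.load(f)                # sample index -> class label
    return data, seq_lens, idx2cls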

# renameItemFolder('/home/asichurter/datasets/JSONs/LargePE-100-original/')

# Sequence-length distribution statistics
################################################################
# apiStat('/home/asichurter/datasets/JSONs/HKS/all/',
#         ratio_stairs=[50, 100, 200, 400, 500, 1000, 2000, 5000],
#         dump_report_path=None,#'/home/asichurter/datasets/reports/HKS_3gram_tfidf_api_report.json',#None,#
#         dump_apiset_path=None,#'/home/asichurter/datasets/reports/HKS_3gram_tfidf_api_set.json',#None
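
apiStat itself is only shown commented out. A minimal sketch of its length-distribution part, assuming each JSON file in the folder holds one sample's API-call sequence under an 'apis' key (a hypothetical layout; apiLenStatSketch is likewise a hypothetical name):

import json
import os

def apiLenStatSketch(folder, ratio_stairs=(50, 100, 200, 400, 500, 1000, 2000, 5000)):
    # Report what fraction of sequences is shorter than each threshold
    lengths = []
    for root, _, files in os.walk(folder):
        for name in files:
            if name.endswith('.json'):
                with open(os.path.join(root, name)) as f:
                    lengths.append(len(json.load(f)['apis']))  # hypothetical key
    for stair in ratio_stairs:
        ratio = sum(l < stair for l in lengths) / len(lengths)
        print('len < %4d: %.2f%%' % (stair, ratio * 100))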
Example #3
    #                   dump_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                   max_len=seq_len)
    # scoreMarkovEpisode(clustered_data_path=mng.DataRoot()+"MarkovClusteredData.npy",
    #                    epoch=2000,
    #                    n_cluster=n_cluster,
    #                    maxlen=seq_len)

    # re = gridSearch(c_values=list(range(*n_range)),
    #                 k_values=[i*50 for i in range(1,11)],
    #                 per_epoch=1000)
    # dumpJson(re, mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
    # re = loadJson(mng.DataRoot()+"GSs/GridSearchResult-%dshot-%dway-virushare20.json"%(k,n))
    # n_cluster, seq_len = extractBestParam(re)
    # n_cluster = int(n_cluster)
    # seq_len = int(seq_len)

    apiCluster(mng.WordEmbedMatrix(),
               mng.DataRoot() + "MarkovClusterMapping.json",
               cluster_num=n_cluster)
    makeClusteredData(json_path=mng.Folder(),
                      cluster_path=mng.DataRoot() +
                      "MarkovClusterMapping.json",
                      word_map_path=mng.WordIndexMap(),
                      dump_path=mng.DataRoot() + "MarkovClusteredData.npy",
                      max_len=seq_len)
    scoreMarkovEpisode(clustered_data_path=mng.DataRoot() +
                       "MarkovClusteredData.npy",
                       epoch=epoch,
                       n_cluster=n_cluster,
                       maxlen=seq_len)
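
extractBestParam, referenced in the commented block above, only has to walk the nested dict returned by gridSearch. A minimal sketch, assuming a higher score is better (the int() casts in the caller suggest the keys come back as strings after a JSON round trip):

def extractBestParamSketch(grid_result):
    # grid_result: {cluster_num: {max_len: score}} as returned by gridSearch
    best = max(((c, k, s)
                for c, ks in grid_result.items()
                for k, s in ks.items()),
               key=lambda t: t[2])
    return best[0], best[1]  # (n_cluster, seq_len)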
Example #4
    if padding:
        # Prepend an all-zero row so that index 0 can serve as the padding vector
        pad_matrix = np.zeros((1, model.wv.vectors.shape[1]))
        matrix = np.concatenate((pad_matrix, matrix), axis=0)

    for i, w in enumerate(model.wv.index2word):
        # idx=0 is reserved for padding, so all word indices are shifted by 1
        word2index[w] = i + 1 if padding else i
    if padding:
        word2index['<PAD>'] = 0

    if save_matrix_path:
        np.save(save_matrix_path, matrix)

    if save_word2index_path:
        dumpJson(word2index, save_word2index_path)

    if save_matrix_path is None and save_word2index_path is None:
        return matrix, word2index

    printBulletin('Done')


if __name__ == '__main__':
    manager = PathManager(dataset='HKS-api', d_type='all')

    # print(manager.FileData())

    seqs = aggregateApiSequences(manager.Folder())
    trainW2Vmodel(seqs,
                  save_matrix_path=manager.WordEmbedMatrix(),
                  save_word2index_path=manager.WordIndexMap(),
                  size=128)
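
Only the tail of trainW2Vmodel appears in Example #4; the part that produces matrix is presumably a gensim Word2Vec fit. A minimal sketch under that assumption, using the pre-4.0 gensim API to match the model.wv.index2word usage above (trainW2VmodelHeadSketch is a hypothetical name):

import numpy as np
from gensim.models import Word2Vec

def trainW2VmodelHeadSketch(seqs, size=128, min_count=1, epochs=10):
    # Fit word embeddings on the aggregated API-call sequences
    model = Word2Vec(seqs, size=size, min_count=min_count, iter=epochs)
    matrix = np.array(model.wv.vectors)  # one row per vocabulary word
    return model, matrix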