Example #1
    def save_data(self, res, out_dir=''):
        """ Save res to disk as a .dat file and a companion .csv file.

        Parameters
        ----------
        res : the result object to save
        out_dir : output directory (created if it does not exist)

        Returns
        -------
        out_file : path of the saved .dat file
        """
        if not pth.exists(out_dir):
            os.makedirs(out_dir)

        out_file = pth.join(out_dir, 'res.dat')
        dump_data(res, out_file=out_file)

        # (data_file, model_name), (_best_res, _middle_res) = res
        try:
            _dat2csv(res, out_file=out_file + '.csv', feat_set=self.data_cfg['feat'])
        except Exception as e:
            print(f"Error({e})")

        return out_file
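
dump_data is used throughout these examples but is not shown here; below is a minimal pickle-based sketch of what it is assumed to do (the real helper may serialize differently):

import pickle

def dump_data(data, out_file=''):
    # Assumed behaviour: serialize data to out_file with pickle.
    with open(out_file, 'wb') as f:
        pickle.dump(data, f)
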
Example #2
    def _merge_models(feat, is_header, is_before_proj, is_gs, ds, covariance_type, dataset_name, model_name):
        """ Merge the results of all models and datasets into one csv.

        Parameters
        ----------
        feat : feature set name (e.g., 'iat_size' or 'stats')
        is_header : whether packet headers are included
        is_before_proj : whether the data is taken before the projection
        is_gs : whether grid search is used
        ds : list of (name, dimension) tuples for the projection dimension
        covariance_type : GMM covariance type (e.g., 'diag')
        dataset_name : name of the dataset
        model_name : name of the model (expected to contain 'OCSVM' or 'GMM')

        Returns
        -------
        out_file_ : path of the merged csv file
        """
        res_ = {}
        vs = [] # store for csv
        for i, d_tup in enumerate(ds):
            _, d = d_tup
            if "OCSVM" in model_name:
                pth_cfg = (feat, is_header, is_before_proj, is_gs, d, None, model_name)
                data = _merge_datasets([dataset_name], pth_cfg)
                res_[d_tup] = data
            elif 'GMM' in model_name:
                pth_cfg = (feat, is_header, is_before_proj, is_gs, d, covariance_type, model_name)
                data = _merge_datasets([dataset_name], pth_cfg)
                res_[d_tup] = data
            else:
                msg = f'{model_name} is not implemented yet.'
                raise NotImplementedError(msg)

            # store for csv
            if i == 0:
                vs = copy.deepcopy(data)
            else:
                vs.extend(data)
        # print(vs)
        #  'feat-header_false-before_proj_False-gs_True-diag-std_False_center_False-d_5'
        out_file_ = pth.join(in_dir, feat + "-header_" + str(is_header),
                             "before_proj_" + str(is_before_proj) + "-gs_" + str(is_gs),
                             f"std_False_center_False-{str(covariance_type)}",
                             f'{dataset_name}-{model_name}.csv')
        print(f'data_models: {out_file_}')
        check_path(out_file_)
        out_file_dat = out_file_ + '.dat'
        dump_data(res_, out_file=out_file_dat)
        # save as csv
        pd.DataFrame(vs).to_csv(out_file_, index=False, encoding='utf-8-sig')
        # # save as xlsx
        # out_xlsx = dat2xlxs_new(out_file_dat, out_file=out_file_dat + '.xlsx', models=models)
        # # compute ratio OCSVM/GMM
        # out_xlsx_ratio = improvement(out_xlsx, feat_set=feat,
        #                              out_file=os.path.splitext(out_file_dat)[0] + '-ratio.xlsx')
        # print(out_xlsx)
        #
        # # for paper
        # out_latex = dat2latex(out_xlsx_ratio, out_file=os.path.splitext(out_file_dat)[0] + '-latex.xlsx')
        # print(out_latex)

        # show(in_file=out_file_)   # show model separately
        return out_file_
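
A hypothetical call to _merge_models, with illustrative argument values only (the real dataset and model names come from the surrounding project):

# Illustrative invocation; every value below is a placeholder.
out_csv = _merge_models(feat='iat_size', is_header=False, is_before_proj=False,
                        is_gs=True, ds=[('d_kjl', 5)], covariance_type='diag',
                        dataset_name='DATASET1', model_name='OCSVM(rbf)')
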
Example #3
def main1(directions=[
    ('direction', 'src_dst'),
],
          feats=[('feat', 'iat_size'), ('feat', 'stats')],
          headers=[('is_header', True), ('is_header', False)],
          gses=[('is_gs', True), ('is_gs', False)],
          before_projs=[
              ('before_proj', False),
          ],
          ds=[
              ('d_kjl', 5),
          ],
          out_dir='speedup/out',
          train_sizes=[('train_size', 5000)],
          is_parallel=True):
    # Store all the results
    res = []

    # Get all datasets
    datasets = [('data_name', v) for v in DATASETS]
    datasets_cfg = list(itertools.product(datasets, directions, feats,
                                          headers))
    # Get all models
    models = [('model_name', v) for v in MODELS]
    models_cfg = list(
        itertools.product(models, gses, before_projs, ds, train_sizes))

    # The total number of experiments
    n_tot = len(list(itertools.product(datasets_cfg, models_cfg)))
    lg.info(f'n_tot: {n_tot}')
    for i, (data_cfg, model_cfg) in enumerate(
            list(itertools.product(datasets_cfg, models_cfg))):
        lg.info(f'{i}/{n_tot}, {dict(data_cfg)}, {dict(model_cfg)}')
    n_cpus = os.cpu_count()
    lg.info(f'n_cpus: {n_cpus}')

    # If we execute all experiments in parallel
    if is_parallel:
        parallel = Parallel(n_jobs=5, verbose=30)
        with parallel:
            res = parallel(
                delayed(_main)(dict(data_cfg), dict(model_cfg), out_dir)
                for data_cfg, model_cfg, in list(
                    itertools.product(datasets_cfg, models_cfg)))
    else:
        # Run each combination in sequence.
        for i, (data_cfg, model_cfg) in enumerate(
                list(itertools.product(datasets_cfg, models_cfg))):
            res_, time_taken = _main(dict(data_cfg), dict(model_cfg), out_dir)
            res.append(res_)
            lg.info(f'{i + 1}/{n_tot}, it takes {time_taken:.5f}s')

    # Dump all results to disk
    dump_data(res, out_file=f'{out_dir}/res.dat')
    lg.info('\n\n***finish!')
def main(directions=[
    ('direction', 'src_dst'),
],
         feats=[('feat', 'iat_size'), ('feat', 'stats')],
         headers=[('is_header', True), ('is_header', False)],
         gses=[('is_gs', True), ('is_gs', False)],
         before_projs=[
             ('before_proj', False),
         ],
         ds=[
             ('d_kjl', 5),
         ],
         train_sizes=[('train_size', 5000)],
         k_qs=[('k_qs', 5000**(5 / 7)), ('k_qs', 5000**(2 / 3))],
         out_dir='speedup/out',
         is_parallel=True):
    # Store all the results
    res = []
    datasets = [('data_name', v) for v in DATASETS]
    datasets_cfg = list(itertools.product(datasets, directions, feats,
                                          headers))
    models = [('model_name', v) for v in MODELS]
    models_cfg = list(
        itertools.product(models, gses, before_projs, ds, train_sizes, k_qs))

    # Total number of experiments
    n_tot = len(list(itertools.product(datasets_cfg, models_cfg)))
    lg.info(f'n_tot: {n_tot}')
    for i, (data_cfg, model_cfg) in enumerate(
            list(itertools.product(datasets_cfg, models_cfg))):
        lg.info(f'{i}/{n_tot}, {dict(data_cfg)}, {dict(model_cfg)}')
    n_cpus = os.cpu_count()
    lg.info(f'n_cpus: {n_cpus}')
    if is_parallel:
        # This doesn't work well here (the job gets killed by the server); the exact cause is unclear.
        # n_jobs (=5) * _single_main(n_job=10) = 50 processes, which is too many.
        parallel = Parallel(n_jobs=5, verbose=30)
        with parallel:
            res = parallel(
                delayed(_main)(dict(data_cfg), dict(model_cfg), out_dir)
                for data_cfg, model_cfg, in list(
                    itertools.product(datasets_cfg, models_cfg)))
    else:
        # Run each combination in sequence. It's slow but it works.
        for i, (data_cfg, model_cfg) in enumerate(
                list(itertools.product(datasets_cfg, models_cfg))):
            res_, time_taken = _main(dict(data_cfg), dict(model_cfg), out_dir)
            res.append(res_)
            lg.info(f'{i + 1}/{n_tot}, it takes {time_taken:.5f}s')

    # dump all data to disk
    dump_data(res, out_file=f'{out_dir}/res.dat')
    lg.info('\n\n***finish!')
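
_main is defined elsewhere; a minimal stub consistent with how it is called above (one experiment per (dataset, model) pair, returning the result together with the elapsed time) might look like:

import time

def _main(data_cfg, model_cfg, out_dir):
    # Hypothetical stub: run a single experiment and time it.
    start = time.time()
    result = {'data': data_cfg, 'model': model_cfg, 'out_dir': out_dir}
    # ... build the dataset, train and evaluate the model here ...
    return result, time.time() - start
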
Example #5
def save_each_result(data, case_str, out_file=None):
    if not pt.exists(pt.dirname(out_file)):
        os.makedirs(pt.dirname(out_file))
    # dump first
    dump_data(data, pt.splitext(out_file)[0] + '.dat')

    with open(out_file, 'w') as f:
        aucs = data['aucs']
        train_times = data['train_times']
        test_times = data['test_times']
        params = data['params']

        _prefix, _line, _suffix = _get_line(data, feat_set='iat_size')
        aucs_str = "-".join([str(v) for v in aucs])
        train_times_str = "-".join([str(v) for v in train_times])
        test_times_str = "-".join([str(v) for v in test_times])

        line = f'{case_str}, {_prefix}, {_line}, => aucs:{aucs_str}, train_times:{train_times_str}, test_times:{test_times_str}, with params: {params}: {_suffix}'

        f.write(line + '\n')
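
The data argument is assumed to be a dict with per-repeat metrics; an illustrative call (field names taken from the code above, values hypothetical):

# Illustrative only; _get_line and dump_data must be importable for this to run.
data = {'aucs': [0.91, 0.93, 0.92],
        'train_times': [1.2, 1.1, 1.3],
        'test_times': [0.21, 0.20, 0.23],
        'params': {'n_components': 5}}
save_each_result(data, case_str='GMM-iat_size', out_file='out/results.txt')
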
Example #6
def main1(DATASETS=[], MODELS=[],
          directions=[('direction', 'src_dst'), ],
          feats=[('feat', 'iat_size'), ('feat', 'stats')],
          headers=[('is_header', True), ('is_header', False)],
          gses=[('is_gs', True), ('is_gs', False)],
          before_projs=[('before_proj', False), ],
          ds=[('kjl_d', 5), ],
          out_dir='speedup/out',
          train_sizes=[('train_size', 5000)],
          is_parallel=False):
    """ Get all results on all datasets and save the results to file.

    Parameters
    ----------
    DATASETS : list of dataset names
    MODELS : list of model names
    directions : list of ('direction', value) tuples
    feats : list of ('feat', value) tuples
    headers : list of ('is_header', value) tuples
    gses : list of ('is_gs', value) tuples
    before_projs : list of ('before_proj', value) tuples
    ds : list of ('kjl_d', value) tuples
    out_dir : output directory
    train_sizes : list of ('train_size', value) tuples
    is_parallel : whether to run the experiments in parallel

    Returns
    -------
    None
    """
    start = time.time()

    # Store all the results
    res = []

    ################################################################################################################
    # 1. Get all datasets
    datasets = [('data_name', v) for v in DATASETS]
    datasets_cfg = list(itertools.product(datasets, directions, feats, headers))

    ################################################################################################################
    # 2. Get all models
    models = [('model_name', v) for v in MODELS]
    models_cfg = list(itertools.product(models, gses, before_projs, ds, train_sizes))

    ################################################################################################################
    # 3. Get the total number of experiments and print each one
    n_tot = len(list(itertools.product(datasets_cfg, models_cfg)))
    lg.info(f'n_tot: {n_tot}')
    for i, (data_cfg, model_cfg) in enumerate(list(itertools.product(datasets_cfg, models_cfg))):
        lg.info(f'{i}/{n_tot}, {dict(data_cfg)}, {dict(model_cfg)}')
    n_cpus = os.cpu_count()
    lg.info(f'n_cpus: {n_cpus}')

    ################################################################################################################
    # 4. Run experiments in parallel or serial
    if is_parallel:
        # With backend='loky', the time taken is less than in serial mode, but with
        # backend='multiprocessing' the time cost is very similar to the serial run.
        # The reason may be that loky manages a pool of workers that can be re-used
        # across time: it provides a robust and dynamic implementation of the
        # ProcessPoolExecutor and a get_reusable_executor() function which hides the
        # pool management under the hood.
        with Parallel(n_jobs=-1, verbose=30, backend='multiprocessing') as parallel:
            res = parallel(
                delayed(single_main)(copy.deepcopy(dict(data_cfg_)), copy.deepcopy(dict(model_cfg_)),
                                     copy.deepcopy(out_dir))
                for data_cfg_, model_cfg_ in itertools.product(datasets_cfg, models_cfg))
    else:
        # Run each combination in serial to obtain accurate train and test times.
        for i, (data_cfg_, model_cfg_) in enumerate(list(itertools.product(datasets_cfg, models_cfg))):
            res_, time_taken = single_main(copy.deepcopy(dict(data_cfg_)), copy.deepcopy(dict(model_cfg_)),
                                           copy.deepcopy(out_dir))
            res.append((res_, time_taken))
            lg.info(f'{i + 1}/{n_tot}, it takes {time_taken:.5f}s')

    ################################################################################################################
    # 5. Dump all results to disk
    dump_data(res, out_file=f'{out_dir}/res.dat')

    ################################################################################################################
    end = time.time()
    lg.info(f'\n***It takes {end - start:.5f}s to finish {n_tot} experiments!')
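
An illustrative call (the DATASETS and MODELS values are placeholders; single_main is defined elsewhere and is expected to return a (result, time_taken) tuple, as in the serial branch above):

# Placeholder dataset/model names; serial mode gives accurate timings.
main1(DATASETS=['DATASET1'], MODELS=['OCSVM(rbf)', 'GMM(full)'],
      out_dir='speedup/out', is_parallel=False)
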
Example #7
def save_result2(result, out_file):
    dump_data(result, pth.splitext(out_file)[0] + '.dat')
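
An illustrative use; the extension of out_file is replaced, so the call below writes 'out/result.dat' (the path is a placeholder):

# Placeholder path; dump_data (sketched above) does the actual serialization.
save_result2({'auc': 0.95}, out_file='out/result.csv')
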