예제 #1
0
def leave_one_out(definition):
    """Run a leave-one-out evaluation of the max-memory prediction.

    The trace file named by ``definition`` is loaded once. Then every
    query tag in it is held out in turn: the instructions carrying that
    tag form the test set and all remaining instructions form the
    training set. For each held-out tag the relative error (in percent)
    between the predicted and the actual maximum memory is computed,
    written to the result file together with two timings, and finally
    all errors are plotted.

    Args:
        definition: experiment description. This function calls its
            ``blacklist_path()``, ``stats_path()``, ``query_num()``,
            ``data_file()``, ``result_file()`` and ``out_path(name)``
            methods.

    Side effects:
        Prints progress to stdout, (over)writes ``definition.result_file()``
        with one ``"<iteration time> <prediction time> <error>"`` line per
        held-out tag, and hands the errors to ``Utils.plotLine`` with the
        output path ``Q<query_num>_memerror.pdf``.
    """
    blacklist = Utils.init_blacklist(definition.blacklist_path())
    col_stats = ColumnStatsD.fromFile(definition.stats_path())
    query_num = definition.query_num()

    dataset_dict = None
    # Note: the commented code below does not work because
    # writeToFile, and loadFromFile are broken. When they are fixed
    # this should speed up the whole procedure a bit, because we will
    # not need to parse a big trace file.

    # if os.path.exists(definition['model_file']) and os.path.isfile(definition['model_file']):
    #     try:
    #         dataset_dict = MalDictionary.loadFromFile(definition['model_file'])
    #     except:
    #         logging.warning('Could not load model file: {}. Rebuilding.'.format(definition['model_file']))
    #         dataset_dict = None

    if dataset_dict is None:
        print('Loading traces for query: {:02}...'.format(query_num), end='')
        sys.stdout.flush()
        load_start = datetime.datetime.now()
        dataset_dict = MalDictionary.fromJsonFile(definition.data_file(),
                                                  blacklist, col_stats)
        load_end = datetime.datetime.now()
        print('Done: {}'.format(load_end - load_start))
        # dataset_dict.writeToFile(definition['model_file'])

    errors = []
    total = len(dataset_dict.query_tags)
    # The context manager guarantees the result file is flushed and closed
    # even when one of the iterations raises.
    with open(definition.result_file(), 'w') as pl:
        for done, leaveout_tag in enumerate(dataset_dict.query_tags):
            iter_start = datetime.datetime.now()
            # Backspaces erase the previously printed percentage so the
            # progress indicator is updated in place.
            print("\b\b\b\b", end='')
            print('{:03}%'.format(int(100 * done / total)), end='')
            sys.stdout.flush()

            test_dict = dataset_dict.filter(lambda x: x.tag == leaveout_tag)
            train_dict = dataset_dict.filter(lambda x: x.tag != leaveout_tag)

            graph = test_dict.buildApproxGraph(train_dict)

            predict_start = datetime.datetime.now()
            predicted_mem = test_dict.predictMaxMem(graph)
            actual_mem = test_dict.getMaxMem()
            iter_end = datetime.datetime.now()

            # Signed relative error in percent; positive means the
            # prediction overestimated the actual maximum.
            error = 100 * (predicted_mem - actual_mem) / actual_mem
            errors.append(error)
            pl.write("{} {} {}\n".format(iter_end - iter_start,
                                         iter_end - predict_start,
                                         error))

    print("")
    outfile = definition.out_path('Q{:02}_memerror.pdf'.format(query_num))
    print()
    # One x value per recorded error (1..N). numpy.arange excludes its stop
    # value, hence the "+ 1"; without it x would be one element shorter
    # than errors.
    Utils.plotLine(numpy.arange(1, len(errors) + 1), errors, outfile,
                   'Error percent', 'Leave out query')
예제 #2
0
def plot_select_error_air(db,
                          q,
                          trainq=None,
                          path="",
                          ntrain=1000,
                          step=25,
                          output=None):
    """Plot the select prediction error against the training-set size.

    The training trace and the test trace are loaded, the test trace is
    reduced to its ``select`` / ``thetaselect`` instructions, and the model
    is evaluated with a growing prefix of the (sorted) training query tags.
    For every prefix size the average absolute relative error (percent)
    between ``predict(...)[0].getMem()`` and ``ret_size`` of the select
    instructions is recorded and finally plotted.

    Args:
        db: database name, must be ``'tpch10'`` or ``'airtraffic'``. Used to
            build the stats file and trace file paths.
        q: query identifier of the test trace (``traces/<db>/<q>.json``).
        trainq: query identifier of the training trace; defaults to ``q``.
        path: prefix prepended (plain string concatenation) to the default
            output file name when ``output`` is not given.
        ntrain: number that appears in the training trace file name; also
            bounds the prefix sizes, which run over
            ``range(1, ntrain + 2, step)``.
        step: increment between consecutive prefix sizes.
        output: explicit output file; when ``None`` the name
            ``<path><db>_sel<q>_error.pdf`` is used.

    Raises:
        AssertionError: if ``db`` is not one of the supported names.
        ZeroDivisionError: if the test trace yields no select instructions,
            because the error is averaged over their count.

    Note:
        ``traind.query_tags`` is sorted in place (assuming the attribute
        returns the stored list rather than a copy).
    """
    assert db == 'tpch10' or db == 'airtraffic'
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))

    if trainq is None:
        trainq = q

    logging.info("Examining Query: {}".format(q))

    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=trainq,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    # filter only select instructions
    seld = testd.filter(lambda ins: ins.fname in ['select', 'thetaselect'])
    seli = seld.getInsList()

    train_tags = traind.query_tags
    train_tags.sort()
    e = []
    ind = []
    # kutsurak: This loop increases the queries we use to train the
    # model.
    for i in range(1, ntrain + 2, step):
        # Build the tag prefix once per iteration instead of re-slicing the
        # list and scanning it linearly for every instruction.
        # NOTE(review): assumes query tags are hashable (e.g. ints/strings).
        selected_tags = set(train_tags[:i])
        d12 = traind.filter(lambda ins: ins.tag in selected_tags)
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        error = 0
        for ins in seli:
            p = ins.predict(d12, pG)[0]
            cnt = ins.ret_size
            pc = p.getMem()
            # we use abs so that the errors do not cancel out
            if cnt > 0:
                error += 100 * abs((pc - cnt) / cnt)
        # Instructions with a non-positive ret_size contribute no error but
        # still count towards the average.
        e.append(error / len(seli))
        ind.append(i)

    print("error array:", e)
    if output is None:
        outpdf = path + '{}_sel{}_error.pdf'.format(db, q)
    else:
        outpdf = output
    Utils.plotLine(ind, e, outpdf, 'Error perc', 'Nof training queries')
예제 #3
0
def plot_mem_error_air(db,
                       q,
                       trainq=None,
                       path="",
                       output=None,
                       ntrain=1000,
                       step=25):
    """Plot the max-memory prediction error against the training-set size.

    The training trace and the test trace are loaded, and the model is
    evaluated with a growing prefix of the (sorted) training query tags.
    For every prefix size the signed relative error (percent) between
    ``testd.predictMaxMem(...)`` and ``testd.getMaxMem()`` is recorded and
    finally plotted.

    Args:
        db: database name, used to build the stats file and trace file
            paths.
        q: query identifier of the test trace (``traces/<db>/<q>.json``).
        trainq: query identifier of the training trace; defaults to ``q``.
        path: prefix prepended (plain string concatenation) to the default
            output file name when ``output`` is not given.
        output: explicit output file; when ``None`` the name
            ``<path><db>_q<q>_memerror.pdf`` is used.
        ntrain: number that appears in the training trace file name; also
            bounds the prefix sizes, which run over
            ``range(1, ntrain + 2, step)``.
        step: increment between consecutive prefix sizes.

    Note:
        ``traind.query_tags`` is sorted in place (assuming the attribute
        returns the stored list rather than a copy).
    """
    blacklist = Utils.init_blacklist("config/mal_blacklist.txt")

    col_stats = ColumnStatsD.fromFile('config/{}_stats.txt'.format(db))

    if trainq is None:
        trainq = q

    logging.info("Examining Query: {}".format(q))

    logging.info("loading training set...")
    trainf = "traces/random_{db}/ran_q{q}_n{n}_{db}.json".format(db=db,
                                                                 q=trainq,
                                                                 n=ntrain)
    traind = MalDictionary.fromJsonFile(trainf, blacklist, col_stats)

    logging.info("loading test set...")
    testf = "traces/{}/{}.json".format(db, q)
    testd = MalDictionary.fromJsonFile(testf, blacklist, col_stats)

    train_tags = traind.query_tags
    train_tags.sort()
    e = []
    ind = []
    for i in range(1, ntrain + 2, step):
        # Build the tag prefix once per iteration instead of re-slicing the
        # list and scanning it linearly for every instruction.
        # NOTE(review): assumes query tags are hashable (e.g. ints/strings).
        selected_tags = set(train_tags[:i])
        d12 = traind.filter(lambda ins: ins.tag in selected_tags)
        print(len(d12.query_tags))
        pG = testd.buildApproxGraph(d12)
        pmm = testd.predictMaxMem(pG)
        mm = testd.getMaxMem()
        # Signed relative error in percent; positive means the prediction
        # overestimated the actual maximum.
        e.append(100 * ((pmm - mm) / mm))
        ind.append(i)

    print(e)
    if output is None:
        outf = path + '{}_q{}_memerror.pdf'.format(db, q)
    else:
        outf = output
    Utils.plotLine(ind, e, outf, 'Error perc', 'Nof training queries')