def test_data_size_vs_diff(dm, given_dict, infer_dict):
    # Read all data from the data model
    dm.read_data(normalize_data=False)
    #attr_list = [U_UNIVERSITY_CODE, PROGRAM_CODE, UNIVERSITY, MAJOR_CODE, TERM]
    attr_list = [U_UNIVERSITY_CODE, PROGRAM_CODE, UNIVERSITY]
    #attr_list = [MAJOR_CODE, PROGRAM_CODE, TERM]
    
    # Size of data
    data_size = len(dm.data)

    # Step size: one tenth of the data, giving 10 steps
    step_size = data_size // 10

    # Collect data sizes and absolute estimation errors
    size = []
    error = []

    for i in range(step_size, data_size, step_size):
        dm_test = DataModel("")
        dm_test.set_data(dm.data[:i])
        exp_test = Experimenter(dm_test, attr_list)
        actual = exp_test.get_actual_result(given_dict, infer_dict)
        estimation = exp_test.generic_get_estimated_result(given_dict, infer_dict)
        size.append(i)
        error.append(abs(estimation - actual))
        print("Step:%d--->Actual:%f--->Estimate:%f" % (i, actual, estimation))
        print("-------------------------------------------------------------")
    plt.figure()
    plt.plot(size, error)
    plt.title("Data Size vs Estimation Error")
    plt.show()
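
A minimal driver sketch for the snippet above (hypothetical: the data file path and the contents of the two query dicts are assumptions for illustration, not from the original code; only the attribute constants come from the snippet itself):

# Hypothetical usage sketch; the path and dict values are assumptions.
dm = DataModel('data/admissions.csv')
given_dict = {UNIVERSITY: 'Some University'}  # conditioning attributes
infer_dict = {PROGRAM_CODE: 'CS'}             # attribute to estimate
test_data_size_vs_diff(dm, given_dict, infer_dict)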
Example #2
def main(args):
    dm = DataModel(args.gig_file, args.chat_file)
    dm.read_data()
    exp = Experimenter(dm)
    if args.classify:
        scores = exp.classify_gigs()
    if args.feature_values:
        scores = exp.evaluate_feature_values()
    return dm
Example #3
def main():
    debug = 0
    sample_size = 8192
    
    #database = MFPT(debug=debug)
    database = Paderborn(debug=debug)

    database_acq = database.load()
    #print(database_acq)

    database_exp = Experimenter(database_acq, sample_size)
    database_exp.perform(Classifiers(), Scoring())
Example #4
def __init__(self, trials=3):
    self.exp = Experimenter()
    self.mongo_coll_conn = Mongo(collection_name='optimisation')
    self.trials = trials
    self.config = load_config()
    self.task_type = self.config['Utils']['task_type']
    self.data_name = self.config['Utils'][self.task_type]['data_name']
Example #5
def main(args):
    dm = DataModel(args.data_file)
    dm.read_data(to_read_count=10000)
    exp = Experimenter(dm,
                       process_datamodel=True,
                       serialise=False)
    t1 = time.time()
    exp.perform_multiclass_experiment(
            pred_mode=INDEPENDENT,
            use_exclusion=True,
            need_to_extract_features=True,
            prediction_file='../results/predictions_multiclass_independent_englishonly_legibleonly_wordunibigram_chartrigram_10000.csv',
            result_file='../results/results_multiclass_independent_englishonly_legibleonly_wordunibigram_chartrigram_10000.txt',
            english_only=True,
            legible_only=True)
    t2 = time.time()
    timeused = t2 - t1
    logging.getLogger(LOGGER).info('Time used in experiment (hour:min:sec): %d:%d:%d' %
                                   (timeused // 3600, (timeused % 3600) // 60, timeused % 60))
    return exp
Example #6
def main(args):
    dm = DataModel(args.train_file)
    dm.read_train_data()
    exp = Experimenter(dm)
    t1 = time.time()
    # Summarise the distance distribution over the training data
    distances = [x.get_distance() for x in dm.data]
    print(max(distances))
    print(min(distances))
    print(stats.mean(distances))
    t2 = time.time()
    timeused = t2 - t1
    logging.getLogger(LOGGER).info('Time used in experiment (hour:min:sec): %d:%d:%d' %
                                   (timeused // 3600, (timeused % 3600) // 60, timeused % 60))
    return exp
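
Both timing snippets above format the elapsed time inline, which makes the hour/minute/second arithmetic easy to get wrong. A small helper like the following (a sketch, not part of the original code) keeps it in one place:

# Hypothetical helper: split a duration in seconds into (hours, minutes, seconds).
def split_hms(seconds):
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return hours, minutes, secs

# e.g. logging.getLogger(LOGGER).info('... %d:%d:%d' % split_hms(t2 - t1))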
Example #7
        if args.check_estimator:
            common_params['eval_metrics'] += \
                    '-CPrecIPSin0.0_10-CPrecIPSin0.0_100-CDCGIPSin0.0_100000-CARIPSin0.0' + \
                    '-CPrecIPSin0.001_10-CPrecIPSin0.001_100-CDCGIPSin0.001_100000-CARIPSin0.001' + \
                    '-CPrecIPSin0.003_10-CPrecIPSin0.003_100-CDCGIPSin0.003_100000-CARIPSin0.003' + \
                    '-CPrecIPSin0.01_10-CPrecIPSin0.01_100-CDCGIPSin0.01_100000-CARIPSin0.01' + \
                    '-CPrecIPSin0.03_10-CPrecIPSin0.03_100-CDCGIPSin0.03_100000-CARIPSin0.03' + \
                    '-CPrecIPSin0.1_10-CPrecIPSin0.1_100-CDCGIPSin0.1_100000-CARIPSin0.1' + \
                    '-CPrecIPSin0.3_10-CPrecIPSin0.3_100-CDCGIPSin0.3_100000-CARIPSin0.3'
    else:
        common_params['eval_metrics'] = 'Prec_10-Prec_100' + '-CPrec_10-CPrec_100-CDCG_100000-CAR'
        if args.check_estimator:
            common_params['eval_metrics'] += '-CPrecIPSin0.01_10-CPrecIPSin0.01_100-CDCGIPSin0.01_100000-CARIPSin0.01'

    # set up experimenter
    experimenter = Experimenter()
    list_params = experimenter.set_search_params(args.cond_search, args.type_search)
    list_params = experimenter.set_common_params(list_params, common_params)
    save_result_file = dir_data_prepared + "result/" + datetime.now().strftime(
        '%Y%m%d_%H%M%S') + "_" + args.type_model + "_" + args.name_experiment + '_tlt' + str(args.time_length_train) + ".csv"

    if phase == 'test_phase':
        save_result_file = save_result_file.replace('.csv', '_test.csv')

    print('save_result_file is {}'.format(save_result_file))
    save_result_dir = os.path.dirname(save_result_file)
    os.makedirs(save_result_dir, exist_ok=True)

    print('Start experiment.')
    t_init = datetime.now()
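
The estimator-check metric string above follows a regular pattern over a list of clipping thresholds. A hypothetical helper (not part of the original code) could build the same string programmatically:

# Hypothetical helper: generate the '-CPrecIPSin{eps}_10-...' metric string
# used above from its repeating per-threshold pattern.
def ips_metric_string(epsilons=(0.0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3)):
    return ''.join(
        f'-CPrecIPSin{e}_10-CPrecIPSin{e}_100-CDCGIPSin{e}_100000-CARIPSin{e}'
        for e in epsilons)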
Example #8
        "num_factor": 40
    }
    files["movietweetings-gte.csv"] = {
        "learning_rate": 0.0001,
        "reg_rate": 0.0,
        "batch_size": 256,
        "num_factor": 40
    }
    files["ml-100k-gte.csv"] = {
        "learning_rate": 0.0005,
        "reg_rate": 0.0,
        "batch_size": 256,
        "num_factor": 40
    }

    experimenter = Experimenter()
    experimenter.config_gpu()

    experimenter.addSamplingApproach(Cosine)
    experimenter.addSamplingApproach(TF_IDF)
    experimenter.addSamplingApproach(ARM)
    experimenter.addSamplingApproach(Random)

    experimenter.addMaxRejection(TotalLimit)
    experimenter.addMaxRejection(UniqueLimit)
    experimenter.addMaxRejection(Q3Total)
    experimenter.addMaxRejection(Q3Unique)

    experimenter.setModel(NeuMF)
    experimenter.setParameterFiles(files)
    experimenter.execute()
Example #9
import argparse

import numpy as np

from experiments import EXPERIMENTS
from experimenter import Experimenter
import utils

parser = argparse.ArgumentParser()
parser.add_argument('-n',
                    '--trials',
                    type=int,
                    default=1,
                    help="How many trials?")
parser.add_argument('-tb',
                    '--tensorboard',
                    action='store_true',
                    help="Should we write to TensorBoard?")

rng = np.random.default_rng()
experiments = [0]
# experiments = list(sorted(EXPERIMENTS.keys()))
# experiments = list(range(31, 51))

# Experiment 0 is a test experiment
# experiments.remove(0)

if __name__ == '__main__':
    args = parser.parse_args()
    print(f"Running {args.trials} trial(s) of experiments {experiments}.")
    exp_runner = Experimenter(write_to_tensorboard=args.tensorboard)
    for exp in experiments:
        for _ in range(args.trials):
            exp_runner.run_experiment(exp, rng)
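
Assuming the script above is saved as, say, run_experiments.py (the filename is an assumption), a typical invocation would be:

    python run_experiments.py --trials 3 --tensorboard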