def training_test_sets(dic, p_total=100, p_train=70, p_test=30,
                       name_train='train_set', name_test='test_set',
                       dir='', pop=None, typ='arff'):
    """
    From a dictionary of prepared and cleaned instances, prepare a training
    and a test set and write them out as arff (for weka) and/or csv files.

    Rows are taken in their existing order (no shuffling). Only the first
    ``p_total`` percent of the rows is considered; the training set is taken
    from the start of that range and the test set from its end. When
    ``p_train + p_test < 100`` the rows in between (the "void" part) are
    written to neither file.

    NOTE(review): a function with this same name is defined again later in
    this file; at import time the later definition replaces this one.

    :param dic: dictionary mapping column name -> indexable sequence of values
    :param p_total: percentage of the rows in the dictionary to use
    :param p_train: percentage of the used rows going into the training set
    :param p_test: percentage of the used rows going into the test set
    :param name_train: name of the produced training file
    :param name_test: name of the produced test file
    :param dir: directory argument forwarded to the file writers
    :param pop: iterable of keys to exclude from the output (default: none)
    :param typ: 'arff', 'csv' or 'all' (both formats)
    :return: 1-based position of the 't0' column in the output column order,
             or 0 when there is no 't0' column
    """
    import sys

    assert p_train + p_test <= 100

    # Row count is read from the first column of the unfiltered input.
    length = len(dic[list(dic.keys())[0]])

    # Dividing the domain into train, void and test parts. The arithmetic is
    # kept in the original left-to-right order so the split points are unchanged.
    used = length * float(p_total) / 100.0
    init_train = 0
    fin_train = int(used * float(p_train) / 100.0)
    fin_void = int(used * float(100.0 - p_test) / 100.0)
    init_test = fin_void + 1
    fin_test = int(used) - 1

    # Both ranges are inclusive of their upper bound. Clamp to the last valid
    # row so that e.g. p_total=100, p_train=100 no longer indexes past the end.
    last_row = length - 1
    train_rows = range(init_train, min(fin_train, last_row) + 1)
    test_rows = range(init_test, min(fin_test, last_row) + 1)

    # Eliminating some of the features (shallow copy: the input is not modified).
    new_dic = dic.copy()
    for k in ([] if pop is None else pop):
        new_dic.pop(k, None)

    # Move the target column to the end of the column order. Start from the
    # dictionary's own order so `keys` is defined even without a target column.
    keys = list(new_dic.keys())
    for target in ('Event', 'ElNino_tau'):
        if is_in_list(target, list(new_dic.keys())):
            # The order is rebuilt from the dictionary each time, so when both
            # targets are present only 'ElNino_tau' ends up last.
            keys = list(new_dic.keys())
            keys.remove(target)
            keys.append(target)

    # Writing the attributes: 'Event' is nominal (yes/no), all others are REAL.
    p = 0
    attr = []
    for position, k in enumerate(keys, 1):
        attr.append([k, ['yes', 'no'] if k == 'Event' else 'REAL'])
        if k == 't0':
            p = position

    # One np.append per column instead of one per cell: the dtype promotion
    # against the empty float array is the same, without the quadratic copies.
    dic_train = {}
    dic_test = {}
    for k in new_dic:
        column = new_dic[k]
        dic_train[k] = np.append(np.array([]), [column[i] for i in train_rows])
        dic_test[k] = np.append(np.array([]), [column[i] for i in test_rows])

    if typ not in ('csv', 'arff', 'all'):
        print('Not allowed file format. Exiting!')
        sys.exit(1)
    if typ in ('csv', 'all'):
        io.csv_file(dic_train, dir, name_train, order=keys)
        io.csv_file(dic_test, dir, name_test, order=keys)
    if typ in ('arff', 'all'):
        io.arff_file(dic_train, attr, 'ElNino_training', u'', dir, name_train)
        io.arff_file(dic_test, attr, 'ElNino_test', u'', dir, name_test)

    return p
def random_training_test_sets(dic, p_train=70, p_test=30,
                              name_train='train_set', name_test='test_set',
                              dir='', pop=None, typ='arff', seed=0):
    """
    From a dictionary of prepared and cleaned instances, prepare a randomly
    drawn training and test set and write them out as arff and/or csv files.

    The row indices are shuffled with the module-level ``random`` generator
    after seeding it with ``seed`` (this reseeds the global generator). The
    first part of the shuffled order becomes the training set and every
    remaining row becomes the test set.

    :param dic: dictionary mapping column name -> indexable sequence of values
    :param p_train: percentage of the rows going into the training set
    :param p_test: accepted for signature compatibility but not used; the test
                   set is always all rows that are not in the training set
    :param name_train: name of the produced training file
    :param name_test: name of the produced test file
    :param dir: directory argument forwarded to the file writers
    :param pop: iterable of keys to exclude from the output (default: none)
    :param typ: 'arff', 'csv' or 'all' (both formats)
    :param seed: seed passed to ``random.seed`` before shuffling
    :return: 1-based position of the 't0' column in the output column order,
             or 0 when there is no 't0' column
    """
    import random
    import sys

    # Row count is read from the first column of the unfiltered input.
    length = len(dic[list(dic.keys())[0]])

    # A real list is required: shuffle works in place.
    seq = list(range(length))
    random.seed(seed)
    random.shuffle(seq)

    # The training range is inclusive of fin_train, i.e. fin_train + 1 rows.
    # Clamp the count so that p_train=100 no longer indexes past the end.
    fin_train = int(length * float(p_train) / 100.0)
    n_train = max(0, min(fin_train + 1, length))
    train_rows = seq[:n_train]
    test_rows = seq[n_train:]

    # Eliminating some of the features (shallow copy: the input is not modified).
    new_dic = dic.copy()
    for k in ([] if pop is None else pop):
        new_dic.pop(k, None)

    # Move the target column to the end of the column order. Start from the
    # dictionary's own order so `keys` is defined even without a target column.
    keys = list(new_dic.keys())
    for target in ('Event', 'ElNino_tau'):
        if is_in_list(target, list(new_dic.keys())):
            # The order is rebuilt from the dictionary each time, so when both
            # targets are present only 'ElNino_tau' ends up last.
            keys = list(new_dic.keys())
            keys.remove(target)
            keys.append(target)

    # Writing the attributes: 'Event' is nominal (yes/no), all others are REAL.
    p = 0
    attr = []
    for position, k in enumerate(keys, 1):
        attr.append([k, ['yes', 'no'] if k == 'Event' else 'REAL'])
        if k == 't0':
            p = position

    # One np.append per column instead of one per cell: the dtype promotion
    # against the empty float array is the same, without the quadratic copies.
    dic_train = {}
    dic_test = {}
    for k in new_dic:
        column = new_dic[k]
        dic_train[k] = np.append(np.array([]), [column[i] for i in train_rows])
        dic_test[k] = np.append(np.array([]), [column[i] for i in test_rows])

    if typ not in ('csv', 'arff', 'all'):
        print('Not allowed file format. Exiting!')
        sys.exit(1)
    if typ in ('csv', 'all'):
        io.csv_file(dic_train, dir, name_train, order=keys)
        io.csv_file(dic_test, dir, name_test, order=keys)
    if typ in ('arff', 'all'):
        io.arff_file(dic_train, attr, 'ElNino_training', u'', dir, name_train)
        io.arff_file(dic_test, attr, 'ElNino_test', u'', dir, name_test)

    return p
def training_test_sets(dic, p_total=100, p_train=70, p_test=30,
                       name_train='train_set', name_test='test_set',
                       dir='', pop=None, typ='arff'):
    """
    From a dictionary of prepared and cleaned instances, prepare a training
    and a test set and write them out as arff (for weka) and/or csv files.

    Rows are taken in their existing order (no shuffling). Only the first
    ``p_total`` percent of the rows is considered; the training set is taken
    from the start of that range and the test set from its end. When
    ``p_train + p_test < 100`` the rows in between (the "void" part) are
    written to neither file.

    NOTE(review): a function with this same name is also defined earlier in
    this file; this later definition is the one in effect after import.

    :param dic: dictionary mapping column name -> indexable sequence of values
    :param p_total: percentage of the rows in the dictionary to use
    :param p_train: percentage of the used rows going into the training set
    :param p_test: percentage of the used rows going into the test set
    :param name_train: name of the produced training file
    :param name_test: name of the produced test file
    :param dir: directory argument forwarded to the file writers
    :param pop: iterable of keys to exclude from the output (default: none)
    :param typ: 'arff', 'csv' or 'all' (both formats)
    :return: 1-based position of the 't0' column in the output column order,
             or 0 when there is no 't0' column
    """
    import sys

    assert p_train + p_test <= 100

    # Row count is read from the first column of the unfiltered input.
    length = len(dic[list(dic.keys())[0]])

    # Dividing the domain into train, void and test parts. The arithmetic is
    # kept in the original left-to-right order so the split points are unchanged.
    used = length * float(p_total) / 100.0
    init_train = 0
    fin_train = int(used * float(p_train) / 100.0)
    fin_void = int(used * float(100.0 - p_test) / 100.0)
    init_test = fin_void + 1
    fin_test = int(used) - 1

    # Both ranges are inclusive of their upper bound. Clamp to the last valid
    # row so that e.g. p_total=100, p_train=100 no longer indexes past the end.
    last_row = length - 1
    train_rows = range(init_train, min(fin_train, last_row) + 1)
    test_rows = range(init_test, min(fin_test, last_row) + 1)

    # Eliminating some of the features (shallow copy: the input is not modified).
    new_dic = dic.copy()
    for k in ([] if pop is None else pop):
        new_dic.pop(k, None)

    # Move the target column to the end of the column order. Start from the
    # dictionary's own order so `keys` is defined even without a target column.
    keys = list(new_dic.keys())
    for target in ('Event', 'ElNino_tau'):
        if is_in_list(target, list(new_dic.keys())):
            # The order is rebuilt from the dictionary each time, so when both
            # targets are present only 'ElNino_tau' ends up last.
            keys = list(new_dic.keys())
            keys.remove(target)
            keys.append(target)

    # Writing the attributes: 'Event' is nominal (yes/no), all others are REAL.
    p = 0
    attr = []
    for position, k in enumerate(keys, 1):
        attr.append([k, ['yes', 'no'] if k == 'Event' else 'REAL'])
        if k == 't0':
            p = position

    # One np.append per column instead of one per cell: the dtype promotion
    # against the empty float array is the same, without the quadratic copies.
    dic_train = {}
    dic_test = {}
    for k in new_dic:
        column = new_dic[k]
        dic_train[k] = np.append(np.array([]), [column[i] for i in train_rows])
        dic_test[k] = np.append(np.array([]), [column[i] for i in test_rows])

    if typ not in ('csv', 'arff', 'all'):
        print('Not allowed file format. Exiting!')
        sys.exit(1)
    if typ in ('csv', 'all'):
        io.csv_file(dic_train, dir, name_train, order=keys)
        io.csv_file(dic_test, dir, name_test, order=keys)
    if typ in ('arff', 'all'):
        io.arff_file(dic_train, attr, 'ElNino_training', u'', dir, name_train)
        io.arff_file(dic_test, attr, 'ElNino_test', u'', dir, name_test)

    return p