import pickle
import subprocess
import time
from os.path import join

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tqdm import tqdm

# Project-local helpers (get_balancing_technique, get_class_weights, get_args,
# get_classifier, ensure_dir, load_folds, get_ids18_mappers, encode_label,
# get_cols4ml, get_dtype4normalized, balance_data, get_runs_dir,
# time_inference, get_optimizers) are assumed to be importable from the
# project's own modules.


def train(dataroot, classifier_name='cnn'):
    balance = get_balancing_technique()
    K = 10
    fold_prefix = str(K) + 'bal_fold_{}.csv' if balance == 'explicit' \
        else str(K) + 'r_fold_{}.csv'
    class_weight = get_class_weights(dataroot)
    classifier_args, config = get_args(classifier_name, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    print(fingerprint)

    folds_data = load_folds(dataroot, fold_prefix, K)
    for test_index in range(K):
        print('-----------{}----------'.format(test_index))
        # Concatenate every fold except the held-out test fold.
        X_train = np.concatenate(
            [fold[0] for i, fold in enumerate(folds_data) if i != test_index],
            axis=0)
        y_train = np.concatenate(
            [fold[1] for i, fold in enumerate(folds_data) if i != test_index],
            axis=0)
        logdir = join(fingerprint, 'log', '{}'.format(test_index))
        ensure_dir(logdir)
        classifier_args['runs_dir'] = logdir
        clf = get_classifier(classifier_args)
        clf.fit(X_train, y_train)
        modelname = join(classifier_args['runs_dir'], 'model.pkl')
        with open(modelname, 'wb') as f:
            pickle.dump(clf, f)
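# `load_folds` is not defined in this file. A minimal sketch, assuming each
# fold CSV carries a `Label` column and that `encode_label` maps label strings
# to integer ids (both assumptions; the real helper may differ):
def load_folds_sketch(dataroot, fold_prefix, K):
    folds = []
    for i in range(K):
        df = pd.read_csv(join(dataroot, fold_prefix.format(i)))
        X = df.drop(columns=['Label']).values
        y = encode_label(df.Label.values)
        folds.append((X, y))
    return folds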
def train(dataroot, classifier_name='cnn'):
    balance = get_balancing_technique()
    K = 10
    fold_prefix = '{}bal_fold_{}.csv' if balance == 'explicit' \
        else '{}r_fold_{}.csv'
    class_weight = get_class_weights(dataroot)
    classifier_args, config = get_args(classifier_name, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    print(fingerprint)

    num_epochs = 40
    for test_index in range(K):
        print('-----------{}----------'.format(test_index))
        # Hold out one fold for testing; use the first remaining fold for
        # validation and the rest for training.
        dev_indices = [i for i in range(K) if i != test_index]
        val_index = dev_indices[0]
        train_indices = dev_indices[1:]
        val_csv = join(dataroot, fold_prefix.format(K, val_index))
        list_of_train_csvs = [
            join(dataroot, fold_prefix.format(K, i)) for i in train_indices
        ]
        logdir = join(fingerprint, 'log', '{}'.format(test_index))
        ensure_dir(logdir)
        classifier_args['runs_dir'] = logdir
        clf = get_classifier(classifier_args)
        clf.fit(list_of_train_csvs, val_csv, num_epochs)
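# `ensure_dir` is imported from a project helper. A minimal sketch of the
# usual idiom (an assumption; the project's version may differ):
import os

def ensure_dir_sketch(path):
    # Create the directory (and any missing parents) if it does not exist.
    os.makedirs(path, exist_ok=True)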
def classify(dataroot, classifier_name):
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # A single fold holds ~29M records, so 4 folds hold ~120M records.
    # If 20M records require 5% of RAM, then 120M records require ~30%.
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds ")
    if balance in ('with_loss', 'no', 'with_loss_sub'):
        regex = 'r_fold_{}.csv'
    elif balance == 'explicit':
        regex = 'bal_fold_{}.csv'
    for fold_index in tqdm(range(K)):
        if fold_index == 0:  # fold 0 is held out
            continue
        # 10**6 rows are read in ~9 min; the total is ~29 * 10**6 rows.
        reader = pd.read_csv(join(dataroot, regex.format(fold_index)),
                             chunksize=10**6,
                             usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        # remove the extra header row
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            # Rough memory-footprint estimate in GiB, extrapolated from this
            # chunk by a dataset-specific row-count constant.
            print(df.memory_usage(deep=True).sum() * 799902 / (1024 * 1024 * 1024))
    tock = time.time()
    print("read data in {:.2f}".format(tock - tick))  # ~24 min

    classifier_args, config = get_args(classifier_name, num_class='dummy',
                                       class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    with open(fn, 'wb') as f:
        pickle.dump(clf, f)
    print("Done training {} flow records in {:.2f} sec".format(
        y_train.shape[0], time.time() - tick))
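# `encode_label` is a project helper; note it is called with one argument here
# but with an explicit mapping elsewhere in this codebase. A minimal sketch of
# the two-argument variant, assuming it looks labels up in the label_to_id
# mapping returned by get_ids18_mappers() (an assumption from call sites):
def encode_label_sketch(y_str, label_to_id):
    # Map each label string to its integer class id.
    return np.array([label_to_id[label] for label in y_str])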
def train_fold(X_train, y_train, args):
    classifier_name = args['classifier_name']
    runs_dir = args['runs_dir']  # directory for the classifier's training log
    clf = get_classifier(args)

    print('fitting the model')
    tick = time.time()
    clf.fit(X_train, y_train)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size {} in {:.0f} min, {:.0f} sec ".format(
        X_train.shape, duration // 60, duration % 60))

    if classifier_name in ['tree', 'forest']:
        fn = runs_dir + '.pkl'
        print("Saving to ", fn)
        with open(fn, 'wb') as f:
            pickle.dump(clf, f)
    return clf, duration
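# Example invocation (hypothetical values; get_classifier must know how to
# build a 'forest' classifier from these args):
#   demo_args = {'classifier_name': 'forest', 'runs_dir': '/tmp/runs/fold_0'}
#   clf, seconds = train_fold(X_train, y_train, demo_args)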
def classify(dataroot, classifier_name):
    K = 5
    fraction = 1
    # total_records = 6907705 in the fold fraction after removing classes
    # with fewer than K examples
    folds_df = []
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))
    print("Reading the data...")
    ds_list = []
    for fold_index in range(K):
        df = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(fold_index)))
        folds_df.append(df)
        ds_list.append(df.Label)
    total_df = pd.concat(folds_df)
    total_label_df = pd.concat(ds_list)
    labels = total_label_df.sort_values().unique()
    total_records = total_label_df.shape[0]
    # labels, labels_d = get_labels(total_label_df.unique())
    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = get_class_weights(
        encode_label(total_label_df.values, label_to_id))
    balance = get_balancing_technique()
    # The Label and FlowID columns are removed from X.
    input_dim = folds_df[0].shape[1] - 2
    gt_num_class = len(label_to_id)
    num_class = len(labels)
    assert gt_num_class == num_class, \
        'all classes should be observed, gt_classes != observed_classes ' \
        '{}!={}'.format(gt_num_class, num_class)

    classifier_args, config = get_args(classifier_name, total_records,
                                       gt_num_class, input_dim, class_weight,
                                       balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, str(K)))
    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')
    runs_dir = get_runs_dir(logdir)
    classifier_args['runs_dir'] = runs_dir
    clf = get_classifier(classifier_args)
    time_inference(classifier_name, clf, total_df, dataroot)
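# `get_class_weights` is called with encoded labels here but with `dataroot`
# in other snippets, so its signature varies across this codebase. A common
# inverse-frequency weighting, sketched for the label-array variant (an
# assumption, not the project's actual implementation):
def get_class_weights_sketch(y):
    counts = np.bincount(y)
    # Weight each class by inverse frequency; guard unobserved class ids
    # against division by zero.
    return np.where(counts > 0, y.shape[0] / np.maximum(counts, 1), 0.0)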
def classify(dataroot, classifier_name='cnn'):
    balance = get_balancing_technique()
    print('balancing technique ', balance)
    if balance == 'explicit':
        train_csv = join(dataroot, 'bal_train.csv')
        # no need to re-shuffle bal_fold_1.csv because it is already shuffled
        val_csv = join(dataroot, 'bal_fold_1.csv')
    else:
        train_csv = join(dataroot, 'r_train.csv')
        val_csv = join(dataroot, 'r_fold_1.csv')

    # Count records with `wc -l`, subtracting 1 for the header row.
    result_val = subprocess.run(['wc', '-l', val_csv], stdout=subprocess.PIPE)
    result_train = subprocess.run(['wc', '-l', train_csv],
                                  stdout=subprocess.PIPE)
    train_records = int(result_train.stdout.split()[0]) - 1
    val_records = int(result_val.stdout.split()[0]) - 1
    print("Number of train and val records ({},{})".format(
        train_records, val_records))

    num_epochs = 40
    label_to_id, id_to_label, _ = get_ids18_mappers()
    # class_weight = None
    class_weight = get_class_weights(dataroot)
    if balance == 'with_loss_inverse':
        class_weight = 1. / class_weight
    num_class = len(label_to_id)  # we assume all the categories are observed

    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)
    classifier_args['runs_dir'] = logdir
    clf = get_classifier(classifier_args)
    clf.fit(train_csv, val_csv, num_epochs, train_records, val_records)
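# Shelling out to `wc -l` is Unix-only. A portable alternative for counting
# data rows (a sketch; not what the code above uses):
def count_records(csv_path):
    with open(csv_path) as f:
        # Subtract 1 for the header row, as above.
        return sum(1 for _ in f) - 1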
def train_and_save_classifier(X_train, y_train, args):
    classifier_name = args['classifier_name']
    balance = args['balance']
    # args['runs_dir'] holds the classifier's training log.
    clf = get_classifier(args)

    if balance == 'explicit':
        tick = time.time()
        X_train, y_train = balance_data(X_train, y_train)
        tock = time.time()

    tick = time.time()
    print("Shuffling data")
    X_train, y_train = shuffle(X_train, y_train)
    print('fitting model')
    clf.fit(X_train, y_train)
    if classifier_name in ['tree', 'forest']:
        with open(join(args['runs_dir'], 'model.pkl'), 'wb') as f:
            pickle.dump(clf, f)
    tock = time.time()
    duration = tock - tick
    print("Trained data of size {} in {:.0f} min, {:.0f} sec ".format(
        X_train.shape, duration // 60, duration % 60))
    return
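# `balance_data` is a project helper used only for the 'explicit' balancing
# technique. A minimal oversampling sketch, assuming it equalizes class counts
# by resampling minority classes with replacement (an assumption about its
# behavior):
def balance_data_sketch(X, y, rng=np.random):
    classes, counts = np.unique(y, return_counts=True)
    target = counts.max()
    # Draw `target` samples per class so every class matches the majority.
    idx = np.concatenate([
        rng.choice(np.where(y == c)[0], size=target, replace=True)
        for c in classes
    ])
    return X[idx], y[idx]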
targets = f(dict(images=inputs), signature="reconstructions",
            as_dict=True)["images"]

if settings.dataname in ('faces', 'faces2', 'planes', 'cars', 'chairs',
                         'dlib_cars3d', 'dlib_faces3d'):
    input_shape = [64, 64, 3]
elif settings.dataname == 'dlib_smallnorb':
    input_shape = [64, 64, 1]
else:
    input_shape = list(tr_data_loader.inputs[0].shape)
print('--- Created Dataset ---')

#####################################################
###################### Models #######################
#####################################################
if not dlib:
    simvae_model = get_classifier(settings, 'simvae')
gen_model = get_classifier(settings, 'gen')  # decoder for the second VAE
residual_enc_model = get_classifier(settings, 'residual_enc')  # encoder for the second VAE
if settings.add_encoder:
    enc_model = get_classifier(settings, 'enc')
if settings.add_mi_penalty:
    mi_disc_model = get_classifier(settings, 'mi_disc')
print('--- Created Models ---')

######################################################
############### Learning Rate and Optimizer ##########
######################################################
optimizer_gen = get_optimizers(settings, 'gen')
if settings.add_encoder:
    optimizer_enc = get_optimizers(settings, 'enc')
if settings.add_mi_penalty:
    optimizer_mi_disc = get_optimizers(settings, 'mi_disc')
if settings.add_infogan_penalty: