import numpy

import helpers


def GENERATE_RANDOM_EXPANDERS(K, size_H, EPSILON, samples):
    NAME = '[RANDOM' + str(K) + ']'
    print NAME + " Generating " + str(samples) + " H (adjacency list matrices) of size " + str(size_H) + " x " + str(K) + " ... "
    print "\n"

    # Average the second-highest eigenvalue over all sampled matrices.
    eigenvalue = 0.0
    for sampling in range(samples):
        print NAME + " ## " + str(sampling) + " // " + str(samples) + " ## "
        H = generate_expander(K, size_H)
        eigenvalue_aux = helpers.generate_eigenvalue(H, size_H, K, EPSILON, NAME)
        eigenvalue += eigenvalue_aux
    eigenvalue = eigenvalue / samples

    print NAME + " Calculated average of second highest eigenvalue for " + str(samples) + " matrices H."
    helpers.write_result(NAME, size_H, K, eigenvalue)
    helpers.cleanup(".aux")
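# generate_expander is called above but not defined in this excerpt. A minimal
# sketch, assuming the K-permutation model for random K-regular graphs; the
# body below is an illustration, not the repository's actual implementation.
def generate_expander(K, size_H):
    # Row i of H lists the K neighbours of node i. Each column is an
    # independent uniform permutation of the nodes, so every node appears
    # exactly once per column and ends up with degree K. Self-loops and
    # parallel edges can occur, as is usual in this random model.
    H = numpy.empty(shape=(size_H, K), dtype=numpy.int32)
    for c in range(K):
        H[:, c] = numpy.random.permutation(size_H)
    return H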
# Assumed module-level constants: the Angluin construction below is 3-regular,
# and NAME follows the log-prefix convention used above.
K = 3
NAME = '[ANGLUIN]'


def GENERATE_ANGLUIN_EXPANDERS(size, A_indices, n, EPSILON):
    size_H = 2 * size
    print NAME + " Generating H (adjacency list matrix) of size " + str(size_H) + " x " + str(K) + " ... "

    # H is the empty adjacency list matrix: row i lists the K neighbours of node i.
    H = numpy.empty(shape=(size_H, K), dtype=numpy.int32)
    for row in A_indices:
        for element_index in row:
            # Recover the pair (x0, y0) from the flat index in the matrix of indices (A).
            x0 = element_index // n  # first coordinate
            y0 = element_index % n   # second coordinate
            i = element_index        # index of the (x0, y0) element

            # connect to (x, y) in B
            x = x0
            y = y0
            j = (x * n + y % n) + size  # shift into the B half of H's indexing
            H[i][0] = j  # node with index i is connected to node with index j
            H[j][0] = i  # and vice versa

            # connect to (x + y, y) in B
            x = (x0 + y0) % n
            y = y0
            j = (x * n + y % n) + size
            H[i][1] = j
            H[j][1] = i

            # connect to (y + 1, -x) in B
            x = (y0 + 1) % n
            y = (-x0) % n
            j = (x * n + y % n) + size
            H[i][2] = j
            H[j][2] = i
    print NAME + " Generated adjacency list matrix H."

    print NAME + " Calculating second highest eigenvalue of H ... "
    eigenvalue = helpers.generate_eigenvalue(H, size_H, K, EPSILON, NAME)
    print NAME + " Calculated second highest eigenvalue of H."

    helpers.write_result(NAME, size_H, K, eigenvalue)
    helpers.cleanup(".aux")
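# A hypothetical driver for the Angluin generator above: A is the n x n half
# of the bipartite construction, with nodes identified by flat indices
# 0 .. n*n - 1, so index i corresponds to the pair (i // n, i % n). The n and
# EPSILON values are illustrative only.
if __name__ == '__main__':
    n = 16
    size = n * n                                    # nodes on each side (A and B)
    A_indices = numpy.arange(size).reshape((n, n))  # flat indices of the (x, y) pairs
    GENERATE_ANGLUIN_EXPANDERS(size, A_indices, n, EPSILON=0.01)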
import os

import pandas as pd
from torchtext import data

import data_loaders
import helpers

os.makedirs(os.path.join(args.data_path, args.dataset, args.now))

# copying validation set to new path
val_df = pd.read_csv('{}/{}/val.csv'.format(args.data_path, args.dataset),
                     header=None, names=['text', 'label'])
val_df.to_csv('{}/{}/{}/val.csv'.format(args.data_path, args.dataset, args.now),
              header=False, index=False)
del val_df

for avg_iter in range(args.num_avg):
    print('\nRun {}\n'.format(avg_iter))

    # setting datapath to original datasets
    args.datapath = os.path.join(args.data_path, args.dataset)

    filename = os.path.join(args.result_path,
                            '{}_{}_{}.csv'.format(args.method, args.dataset, avg_iter))
    helpers.write_result(filename, 'w',
                         ['Train Size', 'loss', 'accuracy',
                          'total {}'.format(args.method), 'al time', 'train time'],
                         args)

    total = 0
    time_ = 0
    for al_iter in range(args.rounds):
        # defining text and label fields
        text_field = data.Field(lower=True)  # , init_token='<bos>', eos_token='<eos>', tokenize="spacy"
        label_field = data.Field(sequential=False)

        # load data
        print('\nLoading {} data ...\n'.format(args.dataset))
        train_set, val_set, test_set = data_loaders.ds_loader(text_field, label_field, args)
        text_field.build_vocab(train_set)
        label_field.build_vocab(train_set)
        args.train_size, args.val_size, args.test_size = len(train_set), len(val_set), len(test_set)  # sizes
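# data_loaders.ds_loader is not shown in this excerpt. A plausible sketch of
# what it might look like, using legacy torchtext's TabularDataset over the
# train/val/test CSV layout created above (the split file names are
# assumptions, not confirmed by the source):
from torchtext import data


def ds_loader(text_field, label_field, args):
    # map each CSV column to its torchtext Field
    fields = [('text', text_field), ('label', label_field)]
    return data.TabularDataset.splits(
        path=args.datapath, format='csv',
        train='train.csv', validation='val.csv', test='test.csv',
        fields=fields)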
import time

import pandas as pd
from fastai.text import TextClasDataBunch, TextLMDataBunch

import helpers
import methods


def al(init_model, avg_iter, args):
    train_df = pd.read_csv(args.path / 'train.csv', header=None, names=args.names)
    test_df = pd.read_csv(args.path / 'test.csv', header=None, names=args.names)

    print('\nEvaluating initial model ...')
    preds = init_model.validate()

    result_file = args.save_dir / '{}_{}_{}.csv'.format(args.dataset, args.method, avg_iter)
    helpers.write_result(result_file, 'w',
                         ['Train Size', 'loss', 'accuracy',
                          'total {}'.format(args.method), 'al time', 'train time'],
                         args)
    helpers.write_result(result_file, 'a',
                         [len(train_df['text']), preds[0], preds[1].numpy(), 0, 0, 0],
                         args)

    model = init_model
    print('\nStarting active learning loop ...')
    for al_loop in range(args.rounds):
        print('\nRound {}'.format(al_loop))
        start = time.time()

        # selecting new instances to add to train
        if args.method == 'random':
            if args.cluster:
                subset = methods.random_w_clustering(len(test_df['text']), test_df['text'], args)
            else:
                subset = methods.random(len(test_df['text']), args)
            total = 0
        elif args.method == 'entropy':
            subset, total = methods.entropy(model, args, df=test_df['text'])
        elif args.method == 'margin':
            subset, total = methods.margin(model, args, df=test_df['text'])
        elif args.method == 'variation_ratio':
            subset, total = methods.variation_ratio(model, args, df=test_df['text'])
        elif args.method == 'dropout_variability':
            subset, total = methods.dropout_variability(model, args, df=test_df['text'])
        elif args.method == 'dropout_entropy':
            subset, total = methods.dropout_entropy(model, args, df=test_df['text'])
        elif args.method == 'dropout_margin':
            subset, total = methods.dropout_margin(model, args, df=test_df['text'])
        elif args.method == 'dropout_variation_ratio':
            subset, total = methods.dropout_variation(model, args, df=test_df['text'])
        end = time.time()
        al_time = end - start
        print('Round {}: {} instances selected according to {}'.format(
            al_loop, len(subset), args.method))

        # updating datasets
        print('\nUpdating datasets ...')
        helpers.update_datasets(train_df, test_df, subset, args)

        # reload data as a DataBunch and retrain the model
        print('\nReloading data ...')
        train_df = pd.read_csv(args.path / args.now / 'train_up.csv', header=None, names=args.names)
        valid_df = pd.read_csv(args.path / args.now / 'val.csv', header=None, names=args.names)
        test_df = pd.read_csv(args.path / args.now / 'test_up.csv', header=None, names=args.names)
        if args.dropout:
            test_df = helpers.check_batch_size(test_df, len(test_df['text']), args)

        data_lm = TextLMDataBunch.from_df(args.path / args.now,
                                          train_df=train_df, valid_df=valid_df, test_df=test_df,
                                          text_cols=args.cols[0], label_cols=args.cols[1])
        data_clas = TextClasDataBunch.from_df(args.path / args.now,
                                              train_df=train_df, valid_df=valid_df, test_df=test_df,
                                              text_cols=args.cols[0], label_cols=args.cols[1],
                                              vocab=data_lm.train_ds.vocab, bs=args.bs)

        print('\nRetraining model ...')
        start = time.time()
        # fine-tune the language model, then build a classifier on top of it
        helpers.language_model(data_lm, args)
        model = helpers.classifier(data_clas, args)
        end = time.time()
        train_time = end - start

        print('\nEvaluating ...')
        preds = model.validate()
        helpers.write_result(result_file, 'a',
                             [len(train_df), preds[0], preds[1].numpy(),
                              total, al_time, train_time],
                             args)
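# The acquisition functions in `methods` are not shown here. A minimal sketch
# of entropy-based selection using fastai v1's Learner.get_preds; the name
# `entropy` matches the dispatch above, but `args.increment` (the number of
# examples pulled in per round) is a hypothetical parameter name, and the
# meaning of the returned `total` is an assumption.
import numpy as np
from fastai.basic_data import DatasetType


def entropy(model, args, df=None):
    # `df` is accepted for signature parity with the caller; this sketch reads
    # the unlabelled pool from the learner's DataBunch test set instead.
    probs, _ = model.get_preds(ds_type=DatasetType.Test)
    probs = probs.numpy()
    # predictive entropy per example: H(p) = -sum_c p_c * log(p_c)
    ent = -np.sum(probs * np.log(probs + 1e-12), axis=1)
    # pick the most uncertain examples; report their summed entropy as `total`
    subset = np.argsort(ent)[-args.increment:]
    return subset, float(ent[subset].sum())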