def get_cado_predictions():
    data_path = '../../datasets/cado/train.csv'
    test_path = '../../datasets/cado/test.csv'
    data = du.load_data(data_path)
    test = du.load_data(test_path)
    text_index = 6
    label_start_index = 7
    X = [d[text_index] for d in data]
    labels = [d[label_start_index:label_start_index + 12] for d in data]
    X_test = [d[text_index] for d in test]
    labels_test = [d[label_start_index:label_start_index + 12] for d in test]
    Y = np.array(labels, dtype='int')
    y_test = np.array(labels_test, dtype='int')
    #Y = np.array(binary_labels, dtype='int')

    # concatenate train and test so both share one tokenizer vocabulary,
    # remembering where the test portion starts
    test_index = len(X)
    X = X + X_test
    Y = np.vstack([Y, y_test])

    tokenizer = tokenize_data(X)
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(X)
    X = pad_sequences(sequences, maxlen=700, padding="post", truncating="post", value=0)

    # the embedding matrix is built here but not consumed by the MLkNN classifier below
    num_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((num_words, 1))
    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_matrix[i] = 1

    X_train = X[0:test_index, :]
    Y_train = Y[0:test_index, :]
    x_test = X[test_index:len(X), :]
    y_test = Y[test_index:len(Y), :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)
    predictions = classifier.predict(x_test)
    scores = classifier.predict_proba(x_test)
    y_pred = predictions.toarray()
    y_score = scores.toarray()
    return y_pred, y_score
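# Hedged usage sketch (added for illustration, not part of the original file): score
# the MLkNN output from get_cado_predictions() with scikit-learn. The helper name
# evaluate_cado_predictions and the micro/macro F1 choice are assumptions; the
# test-label loading simply mirrors the indices used in the function above, and
# `du` / `np` are expected to be imported at module level as elsewhere in this file.
def evaluate_cado_predictions():
    from sklearn.metrics import f1_score  # assumes scikit-learn is installed
    y_pred, _ = get_cado_predictions()
    test = du.load_data('../../datasets/cado/test.csv')
    y_true = np.array([d[7:7 + 12] for d in test], dtype='int')
    print('micro-F1: {:.3f}'.format(f1_score(y_true, y_pred, average='micro')))
    print('macro-F1: {:.3f}'.format(f1_score(y_true, y_pred, average='macro')))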
def _dkt_test_models_multistep_chunk(trainparams, mctsparams, runstartix, chunk_num_runs):
    '''
    Evaluate multistep error for the chunk of models.
    '''
    ms_losses = [[] for _ in six.moves.range(chunk_num_runs)]

    # load data
    data = dataset_utils.load_data(
        filename='{}{}'.format(dg.SYN_DATA_DIR, mctsparams.mserror_file))

    for offset in six.moves.range(chunk_num_runs):
        r = runstartix + offset
        for ep in trainparams.saved_epochs:
            print('=====================================')
            print('---------- Rep {:2d} Epoch {:2d} ----------'.format(r, ep))
            print('=====================================')

            # load model from checkpoint
            checkpoint_name = trainparams.checkpoint_pat.format(
                trainparams.run_name, r, ep)
            checkpoint_path = '{}/{}'.format(trainparams.dir_name, checkpoint_name)

            # compute the multistep error on the training data
            curr_loss = test_dkt_multistep(trainparams.model_id, data, chkpt=checkpoint_path)
            ms_losses[offset].append(curr_loss)
            six.print_(curr_loss)

    return ms_losses
def save_to_latex():
    n = 12
    results = []
    results_path = '../../results/multi-cado/labels_r/'
    classifier_names = ['LSTM1', 'LSTM2', 'MLkNN', 'MF1', 'MF2', 'MK3', 'RAND']

    # collect precision/recall/f1 per label for every classifier
    for l in range(n):
        tmp = []
        for clf in classifier_names:
            data = du.load_data(results_path + clf)
            l_r = data[l]
            tmp.append([clf] + l_r[0:3])
        results.append(tmp)

    #fmt = "%d, %d, %d, %s"
    #all_results = np.round(all_results, decimals=3)
    #all_results.astype('str')

    tex_out = ""
    for k in range(n):
        #np.savetxt(, np.array(all_results[k]))
        du.save_data(results[k], '../../results_out/prf' + str(k) + '.csv',
                     header=['classifier', 'precision', 'recall', 'f1-score'])
        # two minipages per table: open a table on even k, close it on odd k (or at the end)
        if k % 2 == 0:
            tex_out += r'''
\begin{table}[!htb]
'''
        tex_out += r'''
\begin{minipage}{.5\textwidth}
\centering
\caption{Caption ''' + str(k) + r'''}
\label{tab:prf_''' + str(k) + r'''}
\pgfplotstabletypeset[col sep=comma,
header=true,
precision=4,
columns/classifier/.style={string type, column type=r, column name=\ },
columns={classifier, precision, recall, f1-score},
highlight col max ={prf''' + str(k) + r'''.csv}{precision},
highlight col max ={prf''' + str(k) + r'''.csv}{recall},
highlight col max ={prf''' + str(k) + r'''.csv}{f1-score},
every head row/.style={before row=\\\toprule, after row=\bottomrule},
every even row/.style={before row={\rowcolor[gray]{0.92}}},
every last row/.style={after row=\bottomrule}
]{prf''' + str(k) + r'''.csv}
\end{minipage}'''
        if (k % 2 != 0 and k > 0) or k == n - 1:
            tex_out += r'''
\end{table}
'''

    text_file = open("../../results_out/tables.tex", "w")
    text_file.write(tex_out)
    text_file.close()
auc_results = []
fr = []
cfms = {}
java_aucs = []
net_aucs = []
for clf_name in classifier_names:
    precision_results[clf_name] = []
    recall_results[clf_name] = []
    fscore_results[clf_name] = []
    amount_results[clf_name] = []
    cfms[clf_name] = []

d = du.load_data(results_path + 'y_test')
pr_ids = du.load_data(results_path + 'test_pr_ids.csv')
t = np.array(d)
t = t.astype('float')
pr_ids = np.array([int(i[0]) for i in pr_ids])

# turn the project ids into boolean masks: java is True where the id is 8,
# net is True where the id is 7
java = np.array(pr_ids)
net = np.array(pr_ids)
java[java == 8] = 1
java[java == 7] = 0
java = java == 1
net[net == 8] = 0
net[net == 7] = 1
net = net == 1
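# Hedged sketch (added, not in the original script): the java/net masks above can be
# reused to score each project group separately. The helper name per_project_auc, the
# macro averaging, and the existence of a per-classifier score matrix aligned with `t`
# are assumptions for illustration only.
def per_project_auc(y_true, y_score, mask):
    from sklearn.metrics import roc_auc_score  # assumes scikit-learn is installed
    return roc_auc_score(y_true[mask], y_score[mask], average='macro')

# example (hypothetical y_score): java_aucs.append(per_project_auc(t, y_score, java))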
def main_worker(gpu, args, config, hyper):
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.enabled = True
    best_acc1 = 0
    args.writer = None
    start_epoch = 0

    distributed = args.gpu is None
    if distributed:
        dist.init_process_group(backend=args.dist_backend,
                                init_method="tcp://10.0.1.164:12345",
                                world_size=args.world_size,
                                rank=gpu)
        print("Process: {}, rank: {}, world_size: {}".format(
            gpu, dist.get_rank(), dist.get_world_size()))

    # Set the default device; any tensors created by cuda will use this device by default
    torch.cuda.set_device(gpu)

    train_loader = load_data(config.train_path, args, hyper, distributed)
    val_loader = load_val(config.val_path, args, hyper, distributed)
    assert train_loader.dataset.classes == val_loader.dataset.classes

    model = resnet18()
    model.cuda(gpu)
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = optim.SGD(model.parameters(),
                          lr=hyper.base_lr,
                          momentum=hyper.momentum,
                          weight_decay=hyper.weight_decay)

    # Nvidia documentation states -
    # "O2 exists mainly to support some internal use cases. Please prefer O1"
    # https://github.com/NVIDIA/apex/tree/master/examples/imagenet
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    if distributed:
        # By default, apex.parallel.DistributedDataParallel overlaps communication
        # with computation in the backward pass.
        # delay_allreduce delays all communication to the end of the backward pass.
        model = apex.parallel.DistributedDataParallel(model)

    if args.resume:
        checkpoint = torch.load(config.checkpoint_file, map_location='cpu')
        best_acc1 = checkpoint['best_acc1']
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        amp.load_state_dict(checkpoint["amp"])
        start_epoch = checkpoint["epoch"]
        del checkpoint

    start_epoch = args.start_epoch - 1 if "start_epoch_overr" in args.__dict__ else start_epoch

    if args.evaluate:
        train_or_eval(False, gpu, val_loader, model, criterion, None, args, hyper, 0)
        return

    if not distributed or gpu == 0:
        args.writer = SummaryWriter(filename_suffix="{}".format(gpu))

    end_epoch = start_epoch + args.epochs
    for epoch in range(start_epoch, end_epoch):
        if distributed:
            train_loader.sampler.set_epoch(epoch)

        train_or_eval(True, gpu, train_loader, model, criterion, optimizer, args, hyper, epoch)

        if not args.prof and (not distributed or gpu == 0):
            acc1 = train_or_eval(False, gpu, val_loader, model, criterion, None, args, hyper, 0)
            is_best = acc1 > best_acc1
            best_acc1 = max(acc1, best_acc1)
            print("Saving model state...\n")
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "base_lr": hyper.base_lr,
                    "max_lr": hyper.max_lr,
                    "stepsize": hyper.stepsize,
                    "lr_policy": hyper.lr_policy,
                    "batch_size": hyper.batch_size * args.world_size,
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "amp": amp.state_dict(),
                    "best_acc1": best_acc1,
                },
                is_best,
                filename=config.checkpoint_write)

    if args.writer:
        args.writer.close()
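# Hedged launch sketch (added for illustration): main_worker() expects to be spawned
# once per process when args.gpu is None. A single-node entry point could look roughly
# like this; launch_training is a hypothetical name and the one-process-per-GPU
# world-size handling is an assumption based on the function above.
def launch_training(args, config, hyper):
    import torch.multiprocessing as mp
    if args.gpu is None:
        # fn is called as fn(rank, *args), matching main_worker(gpu, args, config, hyper)
        mp.spawn(main_worker, nprocs=args.world_size, args=(args, config, hyper))
    else:
        main_worker(args.gpu, args, config, hyper)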
def main():
    n_concepts = 4
    use_student2 = True
    student2_str = '2' if use_student2 else ''
    learn_prob = 0.5
    lp_str = '-lp{}'.format(int(learn_prob * 100)) if not use_student2 else ''
    n_students = 100000
    seqlen = 7
    filter_mastery = True
    filter_str = '' if not filter_mastery else '-filtered'
    policy = 'random'
    filename = 'test{}-n{}-l{}{}-{}{}.pickle'.format(
        student2_str, n_students, seqlen, lp_str, policy, filter_str)

    #concept_tree = sm.create_custom_dependency()
    concept_tree = ConceptDependencyGraph()
    concept_tree.init_default_tree(n_concepts)
    if not use_student2:
        test_student = Student(n=n_concepts,
                               p_trans_satisfied=learn_prob,
                               p_trans_not_satisfied=0.0,
                               p_get_ex_correct_if_concepts_learned=1.0)
    else:
        test_student = Student2(n_concepts)
    print(filename)

    # load toy data
    data = dataset_utils.load_data(
        filename='{}{}'.format(dg.SYN_DATA_DIR, filename))
    print('Average posttest: {}'.format(sm.expected_reward(data)))
    print('Percent of full posttest score: {}'.format(
        sm.percent_complete(data)))
    print('Percent of all seen: {}'.format(sm.percent_all_seen(data)))

    input_data_, output_mask_, target_data_ = dataset_utils.preprocess_data_for_rnn(data)
    train_data = (input_data_[:, :, :], output_mask_[:, :, :], target_data_[:, :, :])
    print(input_data_.shape)
    print(output_mask_.shape)
    print(target_data_.shape)

    # test_model hidden=16
    # test_model_mid hidden=10
    # test_model_small hidden=5
    # test_model_tiny hidden=3
    model_id = "test2_model_small"
    dropouts = np.array([1.0])
    n_dropouts = dropouts.shape[0]
    total_epochs = 14
    reps = 20

    class ExtractCallback(tflearn.callbacks.Callback):
        def __init__(self):
            self.tstates = []

        def on_epoch_end(self, training_state):
            self.tstates.append(copy.copy(training_state))

    def test_dropout_losses():
        losses = np.zeros((n_dropouts, reps, total_epochs))
        val_losses = np.zeros((n_dropouts, reps, total_epochs))
        for d in range(n_dropouts):
            dropout = dropouts[d]
            for r in range(reps):
                print('----------------------------------------')
                print('---------- Dropout {:3.1f} Rep {:2d} ----------'.format(
                    dropout, r + 1))
                print('----------------------------------------')
                ecall = ExtractCallback()
                dmodel = dmc.DynamicsModel(model_id=model_id,
                                           timesteps=seqlen,
                                           dropout=dropout,
                                           load_checkpoint=False)
                dmodel.train(train_data, n_epoch=total_epochs, callbacks=ecall,
                             shuffle=False, load_checkpoint=False)
                losses[d, r, :] = np.array([s.global_loss for s in ecall.tstates])
                val_losses[d, r, :] = np.array([s.val_loss for s in ecall.tstates])
        return losses, val_losses

    losses, val_losses = test_dropout_losses()
    np.savez("dropoutput", dropouts=dropouts, losses=losses, vals=val_losses)
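# Hedged follow-up sketch (added, not in the original): np.savez above writes
# 'dropoutput.npz'; the saved arrays can be summarised per dropout setting like this.
# Only the array names match the save call; the mean-over-reps reporting is an
# illustrative assumption.
def summarize_dropout_losses(path='dropoutput.npz'):
    out = np.load(path)
    # losses and vals have shape (n_dropouts, reps, total_epochs)
    for di, dropout in enumerate(out['dropouts']):
        train_mean = out['losses'][di].mean(axis=0)
        val_mean = out['vals'][di].mean(axis=0)
        print('dropout {}: final train loss {:.4f}, final val loss {:.4f}'.format(
            dropout, train_mean[-1], val_mean[-1]))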
def _dkt_train_models_chunk(params, runstartix, chunk_num_runs):
    '''
    Loads data and trains a batch of models.
    A batch is a contiguous sequence of runs.
    '''
    #six.print_('startix {} nruns {}'.format(runstartix, chunk_num_runs))
    train_losses = [[] for _ in six.moves.range(chunk_num_runs)]
    val_losses = [[] for _ in six.moves.range(chunk_num_runs)]

    # load data
    data = dataset_utils.load_data(
        filename='{}{}'.format(dg.SYN_DATA_DIR, params.datafile))
    input_data_, output_mask_, target_data_ = dataset_utils.preprocess_data_for_rnn(data)

    for offset in six.moves.range(chunk_num_runs):
        r = runstartix + offset

        # new model instantiation
        dkt_model = dmc.DynamicsModel(model_id=params.model_id,
                                      timesteps=params.seqlen - 1,
                                      dropout=params.dropout,
                                      output_dropout=params.output_dropout,
                                      load_checkpoint=False)

        epochs_trained = 0
        for ep in params.saved_epochs:
            print('=====================================')
            print('---------- Rep {:2d} Epoch {:2d} ----------'.format(r, ep))
            print('=====================================')

            # remember the epochs are given as zero-based
            epochs_to_train = ep + 1 - epochs_trained
            assert epochs_to_train > 0

            # train
            ecall = ExtractCallback()
            for _ in six.moves.range(epochs_to_train):
                # add noise every epoch, so the noise is randomly different every epoch
                processed_input_data = input_data_ + (
                    params.noise * np.random.randn(*input_data_.shape))
                train_data = (processed_input_data[:, :, :],
                              output_mask_[:, :, :],
                              target_data_[:, :, :])
                dkt_model.train(train_data, n_epoch=1, callbacks=ecall,
                                shuffle=params.shuffle, load_checkpoint=False)

            # save the checkpoint
            checkpoint_name = params.checkpoint_pat.format(params.run_name, r, ep)
            checkpoint_path = '{}/{}'.format(params.dir_name, checkpoint_name)
            dkt_model.save(checkpoint_path)

            # update stats
            train_losses[offset].extend([
                np.mean([ts.global_loss for ts in batch])
                for batch in ecall.tstates
            ])
            val_losses[offset].extend(
                [batch[-1].val_loss for batch in ecall.tstates])

            # update epochs_trained
            epochs_trained = ep + 1

    return (train_losses, val_losses)
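# Hedged dispatch sketch (added for illustration): _dkt_train_models_chunk is shaped
# for chunked runs, so a driver could fan the chunks out with multiprocessing. The
# helper name, the chunk size, and the process count are assumptions; whether the
# underlying tflearn models tolerate forked worker processes is not verified here.
def train_models_parallel(params, num_runs, chunk_size=4, n_procs=2):
    import multiprocessing as mp
    starts = list(range(0, num_runs, chunk_size))
    jobs = [(params, s, min(chunk_size, num_runs - s)) for s in starts]
    with mp.Pool(n_procs) as pool:
        # one (train_losses, val_losses) tuple per chunk
        results = pool.starmap(_dkt_train_models_chunk, jobs)
    return results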
def load_toy_data():
    filename = "toy.pickle"
    data = dataset_utils.load_data(
        filename="{}{}".format(SYN_DATA_DIR, filename))
    print("Loaded data. # samples: {}".format(len(data)))
if len(sys.argv) > 4:
    test_path = str(sys.argv[4])
if len(sys.argv) > 5:
    result_path = str(sys.argv[5])
if len(sys.argv) > 6:
    train_embeddings = bool(str(sys.argv[6]))  # note: any non-empty argument evaluates to True

head = [[0, "documentText"], [1, "functionality"], [2, "concept"],
        [3, "directives"], [4, "purpose"], [5, "quality"], [6, "control"],
        [7, "structure"], [8, "patterns"], [9, "codeExamples"],
        [10, "environment"], [11, "reference"], [12, "nonInformation"]]

data = du.load_data(data_path)
test = du.load_data(test_path)

#prid_index = 1
#text_index = 0
#label_start_index = 1
prid_index = 3
text_index = 6
label_start_index = 7

X = [d[text_index] for d in data]
labels = [d[label_start_index:label_start_index + 12] for d in data]
pr_ids = np.array([d[prid_index] for d in data])
prid_index = 3
text_index = 6