def eval(self, write_flag=False):
    with torch.no_grad():
        self.n.eval()
        start_time = time.time()
        for i, (x, target) in enumerate(self.eval_data_loader):
            # measure data loading time
            # print("data time: " + str(time.time() - start_time))

            # compute output
            x = x.to(DEVICE)
            target = target.to(DEVICE)
            output = self.n(x)
            predictions = output.data.squeeze_(1).squeeze_().cpu().numpy()
            # predictions = output.data.max(1)[1].squeeze_(1).squeeze_(0).cpu().numpy()
            if i == 0:
                predictions_all = predictions
            else:
                predictions_all = np.concatenate((predictions_all, predictions))
            if i == 0:
                gts_all = target.data.squeeze_().cpu().numpy()
            else:
                gts_all = np.concatenate((gts_all, target.data.squeeze_().cpu().numpy()))

        acc = jaccard(predictions_all, gts_all)
        print('Validation set = Acc: ' + str(acc) + ' | time: ' + str(time.time() - start_time))
        if write_flag:
            ffname = opt.outd + 'UNet_val_accuracies.txt'
            with open(ffname, 'a') as f:
                f.write(str(acc) + '\n')
def calculate_jaccard_score(
        original_tweet,
        target_string,
        sentiment_val,
        idx_start,
        idx_end,
        offsets,
        verbose=False):
    if idx_end < idx_start:
        idx_end = idx_start

    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]:offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    if sentiment_val != "neutral" and verbose == True:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
def encode_argmax(self, target):
    if len(target) == 0:
        return torch.zeros(self.anchors.shape[0], 5)
    ious = jaccard(target[:, :4], corner_form(self.anchors))
    max_iou, iou_idxs = ious.max(dim=0)
    if (max_iou >= self.argmax_pos_thresh).sum() == 0:
        return torch.zeros(self.anchors.shape[0], 5)
    boxes = center_form(target[:, :4])[iou_idxs]
    xy = 10 * (boxes[:, :2] - self.anchors[:, :2]) / self.anchors[:, 2:]
    wh = 5 * torch.log(boxes[:, 2:] / self.anchors[:, 2:])
    target_boxes = torch.cat([xy, wh], dim=1)
    labels = torch.zeros(target_boxes.shape[0], 1)
    labels[max_iou >= self.argmax_pos_thresh, 0] = \
        target[:, -1][iou_idxs[max_iou >= self.argmax_pos_thresh]] + 1
    labels[(max_iou > self.argmax_neg_thresh) & (max_iou < self.argmax_pos_thresh)] = -1
    # If it doesn't have a high enough threshold, still give it a label if it is the nearest anchor
    _, idxs = ious.max(dim=1)
    labels[idxs, 0] = target[:, -1] + 1
    return torch.cat([target_boxes, labels], dim=1)
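# The `jaccard` used by the anchor-matching snippets above and below is applied to two
# sets of corner-form boxes and then indexed as a (num_targets, num_anchors) IoU matrix.
# The helper itself is not included in this collection; the following is only a minimal
# SSD-style sketch under that assumption (the names `box_a` / `box_b` are illustrative).
import torch

def jaccard(box_a, box_b):
    """Pairwise IoU between two sets of [x1, y1, x2, y2] boxes: (A, 4) x (B, 4) -> (A, B)."""
    # Overlap region, clamped to non-negative width/height.
    max_xy = torch.min(box_a[:, None, 2:], box_b[None, :, 2:])
    min_xy = torch.max(box_a[:, None, :2], box_b[None, :, :2])
    inter_wh = (max_xy - min_xy).clamp(min=0)
    inter = inter_wh[..., 0] * inter_wh[..., 1]
    # Box areas broadcast to (A, B), then intersection over union.
    area_a = ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]))[:, None]
    area_b = ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1]))[None, :]
    return inter / (area_a + area_b - inter)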
def eval_fn(model, dataset):
    """Evaluate the eval dataset and return the metric."""
    data = tf.data.Dataset.from_generator(
        dataset.TweetDataset(data, config.TOKENIZER, config.MAX_LEN).gen,
        output_types=dataset.gen_str).batch(config.VALID_BATCH_SIZE)

    def get_text(text, pred):
        pred_texts = []
        orig_texts = []
        text = text.numpy()
        pred = tf.argmax(pred, axis=1).numpy()
        for t, p in zip(text, pred):
            orig_texts.append(t.decode("utf-8"))
            t = config.TOKENIZER.encode(orig_texts[-1]).offsets
            i, j = p[0], p[1]
            pred_texts.append(orig_texts[-1][t[i][0]:t[j][1]])
        return orig_texts, pred_texts

    scores = []
    for i, (data, _) in tqdm(enumerate(data)):
        orig_text = data["orig"]
        ext_text = data["ext"]
        preds = model.predict(data)
        targets, pred_texts = get_text(orig_text, preds)
        # accumulate a per-example jaccard score for each (prediction, target) pair
        scores.extend(utils.jaccard(p, t) for p, t in zip(pred_texts, targets))

    score = sum(scores) / len(scores)
    print("Total jaccard score : ", score)
    return score
def calculate_jaccard_score(original_tweet,
                            target_string,
                            sentiment_val,
                            idx_start,
                            idx_end,
                            offsets,
                            verbose=False):
    """
    Calculate the jaccard score from the predicted span and the actual span
    for a batch of tweets
    """
    # A span's end index has to be greater than or equal to its start index.
    # If this doesn't hold, the end index is set to equal the start index (the span is a single token)
    if idx_end < idx_start:
        idx_end = idx_start

    # Combine into a string the tokens that belong to the predicted span
    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]:offsets[ix][1]]
        # If the token is not the last token in the tweet, and the ending offset of the current token
        # is less than the beginning offset of the following token, add a space.
        # Basically, add a space when the next token (word piece) corresponds to a new word
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    # Set the predicted output as the original tweet when the tweet's sentiment is "neutral",
    # or the tweet only contains one word
    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    # Calculate the jaccard score between the predicted span and the actual span
    # The IOU (intersection over union) approach is detailed in the utils module's `jaccard` function:
    # https://www.kaggle.com/abhishek/utils
    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
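# Several snippets in this collection call `utils.jaccard(target_string, predicted_string)`.
# The helper is not shown here; based on the intersection-over-union description and the
# Kaggle utils reference above, it is presumably the word-level set Jaccard, roughly:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings (sketch; the actual utils
    module may differ in details such as empty-string handling)."""
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))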
def cal_jaccard(tweet, target, idx_start, idx_end, input_offsets):
    if idx_end < idx_start:
        idx_end = idx_start
    output = ""
    for idx in range(idx_start, idx_end + 1):
        output += tweet[input_offsets[idx][0]:input_offsets[idx][1]]
    jac = utils.jaccard(target, output)
    return jac, output
def validation_step(self, batch, batch_idx):
    target_text = batch['selected_text']
    preds = self.test_step(batch, batch_idx)
    preds_text = preds["preds"]
    jaccard_score = [jaccard(p, t) for p, t in zip(preds_text, target_text)]
    return {"jaccard_score": jaccard_score}
def calculate_jaccard_score(original_context, target_string, question_val, idx_start, idx_end):
    if idx_end < idx_start:
        idx_end = idx_start
    filtered_output = original_context[idx_start:idx_end + 1]
    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
def calculate_jaccard_score(
        original_tweet,
        target_string,
        sentiment_val,
        idx_start,
        idx_end,
        offsets_start,
        offsets_end,
        verbose=False):
    offsets = list(zip(offsets_start, offsets_end))

    if idx_end < idx_start:
        idx_end = idx_start

    filtered_output = ""
    original_tweet_sp = " ".join(original_tweet.split())
    for ix in range(idx_start, idx_end + 1):
        if offsets[ix][0] == 0 and offsets[ix][1] == 0:
            continue
        filtered_output += original_tweet_sp[offsets[ix][0]:offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    filtered_output = filtered_output.replace(" .", ".")
    filtered_output = filtered_output.replace(" ?", "?")
    filtered_output = filtered_output.replace(" !", "!")
    filtered_output = filtered_output.replace(" ,", ",")
    filtered_output = filtered_output.replace(" ' ", "'")
    filtered_output = filtered_output.replace(" n't", "n't")
    filtered_output = filtered_output.replace(" 'm", "'m")
    filtered_output = filtered_output.replace(" do not", " don't")
    filtered_output = filtered_output.replace(" 's", "'s")
    filtered_output = filtered_output.replace(" 've", "'ve")
    filtered_output = filtered_output.replace(" 're", "'re")

    if sentiment_val == "neutral":
        filtered_output = original_tweet

    if sentiment_val != "neutral" and verbose == True:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac
def sup_forward(self, x, y, d_index, hyperparameters):
    self.sup.eval()

    # Encoding content image.
    content, _ = self.gen.encode(x)

    # Forwarding on supervised model.
    y_pred = self.sup(content, only_prediction=True)

    # Computing metrics.
    pred = y_pred.data.max(1)[1].squeeze_(1).squeeze_(0).cpu().numpy()
    jacc, jacc_cup = jaccard(pred, y.cpu().squeeze(0).numpy())

    return jacc, jacc_cup, pred, content
def sup_forward(self, x, y, d_index, hyperparameters):
    self.sup.eval()

    # Encoding content image.
    one_hot_x = torch.cat([x, self.one_hot_img[d_index, 0].unsqueeze(0)], 1)
    hidden, _ = self.gen.encode(one_hot_x)

    # Forwarding on supervised model.
    y_pred = self.sup(hidden, only_prediction=True)

    # Computing metrics.
    pred = y_pred.data.max(1)[1].squeeze_(1).squeeze_(0).cpu().numpy()
    jacc = jaccard(pred, y.cpu().squeeze(0).numpy())

    return jacc, pred, hidden
def test(X):
    jaccard_scores = []
    for article in X:
        scores = []
        for sentence1, sentence2 in zip(article, article[1:]):
            stopped_sentence1 = utils.remove_stop_words(sentence1)
            stemmed_sentence1 = utils.stem_tokens(stopped_sentence1)
            stopped_sentence2 = utils.remove_stop_words(sentence2)
            stemmed_sentence2 = utils.stem_tokens(stopped_sentence2)
            scores.append(utils.jaccard(stemmed_sentence1, stemmed_sentence2))
        if scores:
            jaccard_scores.append(np.average(scores))
        else:
            jaccard_scores.append(0.1)
    return np.array(jaccard_scores).reshape(len(jaccard_scores), 1)
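# In this snippet (and in the document- and feature-set snippets further down) `jaccard` is
# applied to two collections of tokens or indices rather than to raw strings or boxes.
# A minimal set-based sketch, assuming the various utils/tech helpers treat their arguments
# as sets; the actual helpers are not shown and may differ:
def jaccard(items_a, items_b):
    """Jaccard index, intersection over union, of two collections (illustrative sketch only)."""
    a, b = set(items_a), set(items_b)
    if not a and not b:
        return 1.0  # convention chosen here for two empty collections
    return len(a & b) / len(a | b)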
def encode_bipartite(self, target):
    if len(target) == 0:
        return torch.zeros(self.anchors.shape[0], 5)
    ious = jaccard(target[:, :4], corner_form(self.anchors))
    max_iou, iou_idxs = ious.max(dim=1)
    target_boxes = torch.zeros(self.anchors.shape[0], 5)
    anchors = self.anchors[iou_idxs]
    cf = center_form(target[:, :4])
    xy = 10 * (cf[:, :2] - anchors[:, :2]) / anchors[:, 2:]
    wh = 5 * torch.log(cf[:, 2:] / anchors[:, 2:])
    encoded = torch.cat([xy, wh], dim=1)
    target_boxes[iou_idxs] = torch.cat([encoded, target[:, -1].unsqueeze(1) + 1], dim=1)
    return target_boxes
def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
    docs = self.corpus.all_docs()
    # new_doc_ids is used to keep from recomputing already known similarities.
    # None is a special signal to compute on all doc pairs.
    if new_doc_ids is None:
        new_doc_ids = docs.keys()
    with get_similarity_writer(self.corpus.id) as writer:
        i = 0
        for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
            similarity = jaccard(docs[x], docs[y])
            if similarity >= min_similarity:
                writer.write(x, y, similarity)
            i += 1
            if i % 10000000 == 0:
                writer.flush()
                sys.stdout.write('.')
                sys.stdout.flush()
def calculate_jaccard_score(original_tweet,
                            target_string,
                            sentiment_val,
                            idx_start,
                            idx_end,
                            offsets,
                            verbose=False):
    if idx_end < idx_start:
        idx_end = idx_start

    filtered_output = ""
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]:offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            filtered_output += " "

    if sentiment_val == "neutral" or len(original_tweet.split()) < 2:
        filtered_output = original_tweet

    jac = utils.jaccard(target_string.strip(), filtered_output.strip())
    return jac, filtered_output
def encode(self, boxes_list):
    '''Convert ground truth boxes to loc_offset and labels.

    :param boxes_list: list of tensors [[num_boxes, 4], ...]; the list length is the batch size
    :return:
        ious_list: the length of the list is the same as the number of anchors,
                   each tensor is [batchsize, 1, h, w]
        offsets_list: each tensor is [batchsize, 4, h, w]
    '''
    ious_list = []
    offsets_list = []
    for anchor_idx in range(self.num_anchor):
        yxyx = self._buffers['yxyx_%d' % anchor_idx]
        yxhw = self._buffers['yxhw_%d' % anchor_idx]
        anchor_size = self.anchor_sizes[anchor_idx]
        ious = []
        offsets = []
        for boxes in boxes_list:
            flat_yxyx = th.reshape(yxyx, [-1, 4])
            iou = ut.jaccard(flat_yxyx, boxes)
            iou, idx = th.max(iou, -1)
            iou = th.reshape(iou, [yxyx.shape[0], yxyx.shape[1]])
            ious.append(iou)

            boxes_yxhw = ut.yxyx_to_yxhw(boxes)
            flat_yxhw = th.reshape(yxhw, [-1, 4])
            expand_yxhw = flat_yxhw.unsqueeze(1).expand(
                flat_yxhw.size(0), boxes_yxhw.size(0), 4)
            offset_array = boxes_yxhw - expand_yxhw
            offset = [offset_array[i, idx[i]] for i in range(idx.size(0))]
            offset = th.stack(offset) / anchor_size
            offset = th.reshape(offset, yxyx.shape)
            offsets.append(offset)
        ious = th.stack(ious, dim=0)
        offsets = th.stack(offsets, dim=0).permute([0, 3, 1, 2])
        ious_list.append(ious)
        offsets_list.append(offsets)
    return ious_list, offsets_list
def test(self, write_flag=False):
    with torch.no_grad():
        self.n.eval()
        acc = []
        start_time = time.time()
        for i, (x, target) in enumerate(self.test_data_loader):
            # measure data loading time
            # print("data time: " + str(time.time() - start_time))

            # compute output
            x = x.to(DEVICE)
            target = target.to(DEVICE)
            gt = target.data.squeeze_().cpu().numpy()
            output = self.n(x)
            output = nn.functional.interpolate(output, size=target.shape, mode='bilinear')
            prediction = output.data.squeeze_(1).squeeze_().cpu().numpy()
            acc.append(jaccard(prediction, gt))

        print('Test set = Acc: ' + str(np.mean(acc)) + ' | time: ' + str(time.time() - start_time))
        if write_flag:
            ffname = opt.outd + 'UNet_accuracies.txt'
            with open(ffname, 'a') as f:
                f.write(str(np.mean(acc)) + '\n')
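# In the segmentation-style snippets (the UNet eval/test methods above, the sup_forward
# methods earlier, and the reconstruction-evaluation scripts below) `jaccard` is applied
# to NumPy label maps. A minimal binary-mask sketch under that assumption; note that some
# of the helpers clearly differ (e.g. one returns separate disc and cup scores, others may
# be multi-class):
import numpy as np

def jaccard(pred, gt):
    """IoU between two binary masks given as NumPy arrays (illustrative sketch only)."""
    pred = np.asarray(pred) > 0.5
    gt = np.asarray(gt) > 0.5
    union = np.logical_or(pred, gt).sum()
    if union == 0:
        return 1.0  # both masks empty
    return np.logical_and(pred, gt).sum() / union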
import nltk
import codecs
import wikipedia
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import subprocess as sub  # Running BASH script within python???
import matplotlib.pyplot as plt
import numpy as np

lemma = nltk.WordNetLemmatizer()

relArticles = findRelevantArticles("Heart Attack")
articlefilelist = []
wordslist = ['../STEMI_words', '../NSTEMI_words', '../WIKI_words']

for article in relArticles:
    articlefilename = "content_" + str(article) + ".txt"
    with codecs.open(articlefilename, 'wb', 'utf-8') as outfile:
        content = wikipedia.page(article).content
        # Tokenize before lemmatizing so that words, not characters, are lemmatized.
        content = [lemma.lemmatize(word) for word in word_tokenize(content)]
        content = set(content)
        for word in content:
            print >> outfile, word
    articlefilelist.append(articlefilename)

for piece in wordslist:
    articlefilelist.append(piece)

matrix = np.matrix([[jaccard(i, j) for i in articlefilelist] for j in articlefilelist])
print matrix

with open('jaccardVals', 'wb') as outfile:
    print >> outfile, matrix
def main():
    # tf flag
    flags = tf.flags
    flags.DEFINE_string("test_data_txt",
                        'F:/data_info/VAE_liver/set_5/TFrecord/fold_1/test.txt',
                        "test data txt")
    flags.DEFINE_string("indir",
                        'G:/experiment_result/liver/VAE/set_5/down/64/alpha_0.1/fold_1/VAE/axis_5/beta_7',
                        "input dir")
    flags.DEFINE_string("outdir",
                        'G:/experiment_result/liver/VAE/set_5/down/64/alpha_0.1/fold_1/VAE/axis_5/beta_7/rec',
                        "outdir")
    flags.DEFINE_integer("model_index", 3300, "index of model")
    flags.DEFINE_string("gpu_index", "0", "GPU-index")
    flags.DEFINE_float("beta", 1.0, "hyperparameter beta")
    flags.DEFINE_integer("num_of_test", 75, "number of test data")
    flags.DEFINE_integer("batch_size", 1, "batch size")
    flags.DEFINE_integer("latent_dim", 5, "latent dim")
    flags.DEFINE_list("image_size", [56, 72, 88, 1], "image size")
    FLAGS = flags.FLAGS

    # check folder
    if not os.path.exists(FLAGS.outdir):
        os.makedirs(FLAGS.outdir)

    # read list
    test_data_list = io.load_list(FLAGS.test_data_txt)

    # test step
    test_step = FLAGS.num_of_test // FLAGS.batch_size
    if FLAGS.num_of_test % FLAGS.batch_size != 0:
        test_step += 1

    # load test data
    test_set = tf.data.TFRecordDataset(test_data_list, compression_type='GZIP')
    test_set = test_set.map(
        lambda x: utils._parse_function(x, image_size=FLAGS.image_size),
        num_parallel_calls=os.cpu_count())
    test_set = test_set.batch(FLAGS.batch_size)
    test_iter = test_set.make_one_shot_iterator()
    test_data = test_iter.get_next()

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config(index=FLAGS.gpu_index)) as sess:
        # set network
        kwargs = {
            'sess': sess,
            'outdir': FLAGS.outdir,
            'beta': FLAGS.beta,
            'latent_dim': FLAGS.latent_dim,
            'batch_size': FLAGS.batch_size,
            'image_size': FLAGS.image_size,
            'encoder': encoder_resblock_bn,
            'decoder': decoder_resblock_bn,
            'downsampling': down_sampling,
            'upsampling': up_sampling,
            'is_training': False,
            'is_down': False
        }
        VAE = Variational_Autoencoder(**kwargs)

        sess.run(init_op)

        # testing
        VAE.restore_model(
            os.path.join(FLAGS.indir, 'model', 'model_{}'.format(FLAGS.model_index)))
        tbar = tqdm(range(test_step), ascii=True)
        preds = []
        ori = []
        ji = []
        for k in tbar:
            test_data_batch = sess.run(test_data)
            ori_single = test_data_batch
            preds_single = VAE.reconstruction_image(ori_single)
            preds_single = preds_single[0, :, :, :, 0]
            ori_single = ori_single[0, :, :, :, 0]
            preds.append(preds_single)
            ori.append(ori_single)

        # label
        ji = []
        for j in range(len(preds)):
            # EUDT
            eudt_image = sitk.GetImageFromArray(preds[j])
            eudt_image.SetSpacing([1, 1, 1])
            eudt_image.SetOrigin([0, 0, 0])

            label = np.where(preds[j] > 0.5, 0, 1)
            # label = np.where(preds[j] > 0.5, 1, 0.5)
            label = label.astype(np.int16)
            label_image = sitk.GetImageFromArray(label)
            label_image.SetSpacing([1, 1, 1])
            label_image.SetOrigin([0, 0, 0])

            ori_label = np.where(ori[j] > 0.5, 0, 1)
            ori_label_image = sitk.GetImageFromArray(ori_label)
            ori_label_image.SetSpacing([1, 1, 1])
            ori_label_image.SetOrigin([0, 0, 0])

            # calculate ji
            ji.append([utils.jaccard(label, ori_label)])

            # output image
            io.write_mhd_and_raw(
                label_image,
                '{}.mhd'.format(os.path.join(FLAGS.outdir, 'label', 'recon_{}'.format(j))))

        generalization = np.mean(ji)
        print('generalization = %f' % generalization)

        # output csv file
        with open(os.path.join(FLAGS.outdir,
                               'generalization_{}.csv'.format(FLAGS.model_index)),
                  'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(ji)
            writer.writerow(['generalization= ', generalization])
def eval_fn(data_loader, model, device):
    model.eval()
    fin_outputs_start = []
    fin_outputs_end = []
    fin_tweet_tokens = []
    fin_padding_lens = []
    fin_orig_selected = []
    fin_orig_sentiment = []
    fin_orig_tweet = []
    fin_tweet_token_ids = []

    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        tweet_tokens = d['tweet_tokens']
        padding_len = d['padding_len']
        orig_sentiment = d['orig_sentiment']
        orig_selected = d['orig_selected']
        orig_tweet = d['orig_tweet']

        # move everything to the appropriate device
        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.float)
        targets_end = targets_end.to(device, dtype=torch.float)

        o1, o2 = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
        # we are not calculating loss for validation, so it was removed.
        # If you want to calculate loss, feel free to do so.
        fin_outputs_end.append(torch.sigmoid(o2).cpu().detach().numpy())
        fin_outputs_start.append(torch.sigmoid(o1).cpu().detach().numpy())
        fin_padding_lens.extend(padding_len.cpu().detach().numpy().tolist())
        fin_tweet_tokens.extend(tweet_tokens)
        fin_orig_sentiment.extend(orig_sentiment)
        fin_orig_selected.extend(orig_selected)
        fin_orig_tweet.extend(orig_tweet)

    # NOTE: it is important how you select the final selected text ----- now the fun begins
    fin_outputs_start = np.vstack(fin_outputs_start)
    fin_outputs_end = np.vstack(fin_outputs_end)

    threshold = 0.2
    jaccards = []
    # iterate over each prediction
    for j in range(len(fin_tweet_tokens)):
        target_string = fin_orig_selected[j]
        tweet_tokens = fin_tweet_tokens[j]
        padding_len = fin_padding_lens[j]
        original_tweet = fin_orig_tweet[j]
        sentiment = fin_orig_sentiment[j]

        if padding_len > 0:
            mask_start = fin_outputs_start[j, :][:-padding_len] >= threshold
            mask_end = fin_outputs_end[j, :][:-padding_len] >= threshold
        else:
            mask_start = fin_outputs_start[j, :] >= threshold
            mask_end = fin_outputs_end[j, :] >= threshold

        mask = [0] * len(mask_start)
        idx_start = np.nonzero(mask_start)[0]
        idx_end = np.nonzero(mask_end)[0]
        if len(idx_start) > 0:
            idx_start = idx_start[0]
            if len(idx_end) > 0:
                idx_end = idx_end[0]
            else:
                idx_end = idx_start
        else:
            idx_start = 0
            idx_end = 0

        for mj in range(idx_start, idx_end + 1):
            mask[mj] = 1

        # if the original tweet's word is present in the selected span,
        # it will be included in the output_tokens list
        output_tokens = [x for p, x in enumerate(tweet_tokens.split()) if mask[p] == 1]
        # [CLS] and [SEP] are also present in output_tokens, so remove them
        output_tokens = [x for x in output_tokens if x not in ("[CLS]", "[SEP]")]

        final_output = ""
        for ot in output_tokens:
            # make your own rules if you want
            # ----------------- rules start from here ----------------
            # if one word has been split into word pieces (identified by ##), add it back
            # to the previous word, e.g. "youtube" split into "you" and "##tube" gets
            # joined back into "youtube"
            if ot.startswith("##"):
                final_output = final_output + ot[2:]
            elif len(ot) == 1 and ot in string.punctuation:
                final_output = final_output + ot
            else:
                final_output = final_output + " " + ot
            # ---------------- rules end here --------------------

        final_output = final_output.strip()
        if sentiment == 'neutral' or len(original_tweet.split()) < 4:
            final_output = original_tweet

        jac = utils.jaccard(target_string.strip(), final_output.strip())
        jaccards.append(jac)

    mean_jac = np.mean(jaccards)
    return mean_jac
def main():
    parser = argparse.ArgumentParser(
        description='py, test_data_txt, ground_truth_txt, outdir')
    parser.add_argument('--ground_truth_txt', '-i1', default='')
    parser.add_argument('--model', '-i2', default='./model_{}'.format(50000))
    parser.add_argument('--outdir', '-i3', default='')
    args = parser.parse_args()

    # check folder
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # tf flag
    flags = tf.flags
    flags.DEFINE_float("beta", 0.1, "hyperparameter beta")
    flags.DEFINE_integer("num_of_generate", 1000, "number of generate data")
    flags.DEFINE_integer("batch_size", 1, "batch size")
    flags.DEFINE_integer("latent_dim", 2, "latent dim")
    flags.DEFINE_list("image_size", [512, 512, 1], "image size")
    FLAGS = flags.FLAGS

    # load ground truth
    ground_truth = io.load_matrix_data(args.ground_truth_txt, 'int32')
    print(ground_truth.shape)

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config) as sess:
        # set network
        kwargs = {
            'sess': sess,
            'outdir': args.outdir,
            'beta': FLAGS.beta,
            'latent_dim': FLAGS.latent_dim,
            'batch_size': FLAGS.batch_size,
            'image_size': FLAGS.image_size,
            'encoder': cnn_encoder,
            'decoder': cnn_decoder
        }
        VAE = Variational_Autoencoder(**kwargs)

        sess.run(init_op)

        # testing
        VAE.restore_model(args.model)
        tbar = tqdm(range(FLAGS.num_of_generate), ascii=True)
        specificity = []
        for i in tbar:
            sample_z = np.random.normal(0, 1.0, (1, FLAGS.latent_dim))
            generate_data = VAE.generate_sample(sample_z)
            generate_data = generate_data[0, :, :, 0]

            # EUDT
            eudt_image = sitk.GetImageFromArray(generate_data)
            eudt_image.SetSpacing([1, 1])
            eudt_image.SetOrigin([0, 0])

            # label
            label = np.where(generate_data > 0, 0, 1)
            label_image = sitk.GetImageFromArray(label)
            label_image.SetSpacing([1, 1])
            label_image.SetOrigin([0, 0])

            # calculate ji
            case_max_ji = 0.
            for image_index in range(ground_truth.shape[0]):
                ji = utils.jaccard(label, ground_truth[image_index])
                if ji > case_max_ji:
                    case_max_ji = ji
            specificity.append([case_max_ji])

            # output image
            io.write_mhd_and_raw(
                eudt_image,
                '{}.mhd'.format(os.path.join(args.outdir, 'EUDT', str(i + 1))))
            io.write_mhd_and_raw(
                label_image,
                '{}.mhd'.format(os.path.join(args.outdir, 'label', str(i + 1))))

        print('specificity = %f' % np.mean(specificity))

        # output csv file
        with open(os.path.join(args.outdir, 'specificity.csv'), 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(specificity)
            writer.writerow(['specificity:', np.mean(specificity)])
def __getitem__(self, index):
    """
    Return some image with its meta information and labeled annotations.

    Parameters
    ----------
    index : int
        The index of the image to be returned.

    Returns
    -------
    image : Tensor
        The image at self.images[index] after some optional transforms have been
        performed as an (w, h, 3) Tensor in the range [0., 1.].
    image_info : dict
        A dictionary object containing meta information about the image.
    target : Tensor
        A Tensor representing the target output of the YOLOv2 network which was
        used to initialise the dataset object.
    """
    dataset, img = self.data[index]
    data = np.load(os.path.join(self.raw_dir[dataset], img + '.npz'))
    signal = data['signal']
    samp_rate = data['samp_rate']
    N_fft = data['N_fft']
    N_overlap = data['N_overlap']
    signal = signal[0] + 1.j * signal[1]
    stft, _, _ = self.stft(signal, N_fft=N_fft, N_overlap=N_overlap, samp_rate=samp_rate)
    if self.mode == 'spectrogram':
        data = np.abs(stft)**2
    elif self.mode == 'spectrogram_db':
        data = 10. * np.log10(np.abs(stft)**2)
    elif self.mode == 'spectrogram_ap':
        data = [np.abs(stft)**2, np.angle(stft)]
    elif self.mode == 'spectrogram_ap_db':
        data = [10. * np.log10(np.abs(stft)**2), np.angle(stft)]
    elif self.mode == 'stft':
        data = np.abs(stft)
    elif self.mode == 'stft_iq':
        data = [stft.real, stft.imag]
    elif self.mode == 'stft_ap':
        data = [np.abs(stft), np.angle(stft)]
    else:
        raise ValueError(
            'Unknown mode. Use one of spectrogram, spectrogram_db, '
            'spectrogram_ap, spectrogram_ap_db, stft_iq or stft_ap.')
    data = torch.tensor(data, dtype=torch.float32)
    if data.ndim == 2:
        data = data[None]
    data = (data - torch.tensor(self.mu)[:, None, None]) / torch.tensor(self.sigma)[:, None, None]
    data_info = {
        'id': img,
        'width': data.shape[2],
        'height': data.shape[1],
        'dataset': self.dataset[dataset]
    }
    if self.do_transforms:
        pass
    data_info['padding'] = [0., 0., 0., 0.]
    data_info['scale'] = [1., 1.]
    assert (data.size()[1:] == self.image_size[index]).all()

    if self.return_targets:
        annotations = get_annotations(self.annotations_dir[dataset], img)
        random.shuffle(annotations)
        target = [
            np.zeros((self.grid_sizes[i][index, 1],
                      self.grid_sizes[i][index, 0],
                      self.num_anchors[i] * self.num_features),
                     dtype=np.float32)
            for i in range(self.num_detectors)
        ]
        cell_dims = np.array([[self.strides[i], self.strides[i]]
                              for i in range(self.num_detectors)])
        anchors = [torch.zeros((n, 4)) for n in self.num_anchors]
        for i, (a, t) in enumerate(zip(anchors, target)):
            a[:, 2:] = self.anchors[i].clone()
            t[:, np.arange(self.grid_sizes[i][index][0]), 0::self.num_features] = \
                np.arange(self.grid_sizes[i][index][0])[None, :, None] + 0.5
            t[:, :, 1::self.num_features] = np.arange(
                self.grid_sizes[i][index][1])[:, None, None] + 0.5
            t[:, :, 2::self.num_features] = a[:, 2]
            t[:, :, 3::self.num_features] = a[:, 3]

        # For each object in image.
        for annotation in annotations:
            name, height, width, xmin, ymin, xmax, ymax, truncated, difficult = annotation
            if (self.skip_truncated and truncated) or (self.skip_difficult and difficult):
                continue
            if name not in self.classes:
                continue
            if self.do_transforms:
                pass
            xmin = np.clip(xmin, a_min=1, a_max=self.image_size[index, 0])
            xmax = np.clip(xmax, a_min=1, a_max=self.image_size[index, 0])
            ymin = np.clip(ymin, a_min=1, a_max=self.image_size[index, 1])
            ymax = np.clip(ymax, a_min=1, a_max=self.image_size[index, 1])
            xmin, xmax, ymin, ymax = np.round(xmin), np.round(xmax), np.round(ymin), np.round(ymax)
            if xmax == xmin or ymax == ymin:
                continue
            xmin /= cell_dims[:, 0]
            xmax /= cell_dims[:, 0]
            ymin /= cell_dims[:, 1]
            ymax /= cell_dims[:, 1]
            if all(xmax - xmin < (SMALL_THRESHOLD * cell_dims[:, 0])):
                continue
            if all(ymax - ymin < (SMALL_THRESHOLD * cell_dims[:, 1])):
                continue
            idx = np.floor((xmax + xmin) / 2.), np.floor((ymax + ymin) / 2.)
            idx = np.array(idx, dtype=np.int).T
            ground_truth = torch.tensor([xmin, ymin, xmax, ymax], dtype=torch.float32).t()
            anchors = [torch.zeros((self.num_anchors[i], 4)) for i in range(self.num_detectors)]
            for i in range(self.num_detectors):
                anchors[i][:, 2:] = self.anchors[i].clone()
                anchors[i][:, 0::2] += xmin[i]
                anchors[i][:, 1::2] += ymin[i]
            anchors = torch.cat(anchors)
            ious = jaccard(ground_truth, anchors)
            if ious.max() < IOU_MATCH_THRESHOLD:
                continue
            max_iou = 0.
            cumsum_detectors = np.cumsum([0] + self.num_anchors)
            for i in range(self.num_detectors):
                if ious[i, cumsum_detectors[i]:cumsum_detectors[i + 1]].max() > max_iou:
                    l = i
                    d = ious[i, cumsum_detectors[i]:cumsum_detectors[i + 1]].argmax()
                    max_iou = ious[i, cumsum_detectors[i]:cumsum_detectors[i + 1]].max()
            target[l][idx[l][1], idx[l][0], d * self.num_features + 0] = (xmin[l] + xmax[l]) / 2.
            target[l][idx[l][1], idx[l][0], d * self.num_features + 1] = (ymin[l] + ymax[l]) / 2.
            target[l][idx[l][1], idx[l][0], d * self.num_features + 2] = xmax[l] - xmin[l]
            target[l][idx[l][1], idx[l][0], d * self.num_features + 3] = ymax[l] - ymin[l]
            target[l][idx[l][1], idx[l][0], d * self.num_features + 4] = 1.
            target[l][idx[l][1], idx[l][0], d * self.num_features + 5:(d + 1) * self.num_features] = \
                self.encode_categorical(name)

        target = [torch.tensor(target[i]).permute(2, 0, 1) for i in range(self.num_detectors)]
        return data, data_info, target
    else:
        return data, data_info
def find_best_anchors(classes,
                      root_dir,
                      dataset,
                      k=5,
                      max_iter=20,
                      skip_truncated=True,
                      init=(13, 13),
                      weighted=True,
                      multi_scale=False,
                      device='cuda'):
    annotations_dir = [os.path.join(r, 'Annotations') for r in root_dir]
    sets_dir = [os.path.join(r, 'ImageSets', 'Main') for r in root_dir]

    images = []
    for d in range(len(dataset)):
        for cls in classes:
            file = os.path.join(sets_dir[d], '{}_{}.txt'.format(cls, dataset[d]))
            with open(file) as f:
                for line in f:
                    image_desc = line.split()
                    if image_desc[1] == '1':
                        images.append((d, image_desc[0]))
    images = list(set(images))

    bboxes = []
    for image in images:
        annotations = get_annotations(annotations_dir[image[0]], image[1])
        for annotation in annotations:
            name, height, width, xmin, ymin, xmax, ymax, truncated, difficult = annotation
            if skip_truncated and truncated:
                continue
            width = (xmax - xmin) / width
            height = (ymax - ymin) / height
            if multi_scale:
                for i in [2. * d + 1 for d in range(4, 10)]:
                    bboxes.append([0., 0., i * width, i * height])
            else:
                bboxes.append([0., 0., 13. * width, 13. * height])
    bboxes = torch.tensor(bboxes, dtype=torch.float64, device=device)

    # anchors = [[0, 0, 3, 3],
    #            [0, 0, 4, 3],
    #            [0, 0, 5, 3],
    #            [0, 0, 4, 4],
    #            [0, 0, 5, 4],
    #            [0, 0, 5, 5],
    #            [0, 0, 6, 5],
    #            [0, 0, 10, 5],
    #            [0, 0, 13, 5]]
    anchors = torch.tensor(
        ([0., 0., init[0], init[1]] * np.random.random((k, 4))).astype(dtype=np.float64),
        device=device)
    # anchors = torch.tensor(anchors, dtype=torch.float64, device=device)

    for _ in range(max_iter):
        ious = jaccard(bboxes, anchors)
        iou_max, idx = torch.max(ious, dim=1)
        for i in range(k):
            if weighted:
                weights = (torch.tensor([1.], device=device) - iou_max[idx == i, None]) ** 10
                anchors[i] = torch.sum(bboxes[idx == i] * weights, dim=0) / torch.sum(weights)  # Weighted k-means
            else:
                anchors[i] = torch.mean(bboxes[idx == i], dim=0)  # Normal k-means

    sort = torch.argsort(anchors[:, 2], dim=0)
    anchors = anchors[sort]

    return anchors[:, 2:]
options, args = parser.parse_args()

command_line_arguments = [command.strip().lower() for command in options.pipeline.split()]
pipeline = [getattr(tech, command) for command in command_line_arguments]

if os.path.isdir(options.input):
    input_filenames = [os.path.join(options.input, filename.strip().lower())
                       for filename in os.listdir(options.input)]
    input_filenames = [filename for filename in input_filenames if filename.endswith('.txt')]
else:
    input_filenames = [filename.strip().lower() for filename in options.input.split()]

data = {basename(filename): tech.filestream_to_word_list(open(filename, 'rb'))
        for filename in input_filenames}

# Analysis methods are pairwise
# Calculate Jaccard similarity
keys = data.keys()
jaccard_similarity = np.zeros((len(keys), len(keys)))
for j in xrange(jaccard_similarity.shape[1]):
    for i in xrange(j):
        jaccard_similarity[i, j] = tech.jaccard(data[keys[i]], data[keys[j]])
jaccard_similarity += jaccard_similarity.transpose()
jaccard_similarity[np.diag_indices_from(jaccard_similarity)] = 1

np.savetxt('../data/jaccard_similarity.tsv', jaccard_similarity, fmt='%.04f', header=' '.join(keys))

fig, ax = plt.subplots()
ax = sns.heatmap(jaccard_similarity, annot=True, fmt='.02f', square=True,
                 xticklabels=keys, yticklabels=keys)
plt.tight_layout()
plt.savefig('../graphs/jaccard_similarity.png')
def experiment(data, box, cv, output):
    """
    Write the results of an experiment.

    This function will run an experiment for a specific dataset for a bounding box.
    There will be `cv` runs of randomized experiments and the outputs will be
    written to a file.

    Parameters
    ----------
    data : string
        Dataset name.
    box : string
        Bounding box on the file name.
    cv : int
        Number of cross validation runs.
    output : string
        Path of the .npz file the results are written to.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the percent poison exceeds the number of samples in the requested data.
    """
    # data, box, cv, output = 'conn-bench-sonar-mines-rocks', '1', 5, 'results/test.npz'

    # load normal and adversarial data
    path_adversarial_data = 'data/attacks/' + data + '_[xiao][' + box + '].csv'
    df_normal = pd.read_csv('data/clean/' + data + '.csv', header=None).values
    df_adversarial = pd.read_csv(path_adversarial_data, header=None).values

    # separate out the normal and adversarial data
    Xn, yn = df_normal[:, :-1], df_normal[:, -1]
    Xa, ya = df_adversarial[:, :-1], df_adversarial[:, -1]

    # change the labels from +/-1 to [0,1]
    ya[ya == -1], yn[yn == -1] = 0, 0

    # calculate the ratios of data that would be used for training and hold out
    p0, p1 = 1. / cv, (1. - 1. / cv)
    N = len(Xn)

    # calculate the total number of training and testing samples and set the number of
    # features that are going to be selected
    Ntr, Nte = int(p1 * N), int(p0 * N)
    n_selected_features = int(Xn.shape[1] * SEL_PERCENT) + 1

    # zero the results out: err_jaccard and err_kuncheva are 9x4 matrices
    err_jaccard, err_kuncheva = np.zeros((NPR, NALG)), np.zeros((NPR, NALG))
    # For M3 (KNN classification error) analysis: err_KNN_norm has just one row (1x4)
    # because it only contains normal data; err_KNN_pois is a 9x4 matrix
    err_KNN_norm, err_KNN_pois = np.zeros((1, NALG)), np.zeros((NPR, NALG))

    # Empty lists that will hold feature sets for all npr
    MIM_fset = []
    MIFS_fset = []
    MRMR_fset = []
    JMI_fset = []
    # creating a list of empty lists
    for n in range(NPR):
        MIM_fset.append([])
        MIFS_fset.append([])
        MRMR_fset.append([])
        JMI_fset.append([])

    # run `cv` randomized experiments. note this is not performing cross-validation, rather
    # we are going to use randomized splits of the data.
    for _ in range(cv):
        # shuffle up the data for the experiment then split the data into a training and
        # testing dataset
        i = np.random.permutation(N)
        Xtrk, ytrk, Xtek, ytek = Xn[i][:Ntr], yn[i][:Ntr], Xn[i][Nte:], yn[i][Nte:]

        # run feature selection on the baseline dataset without any adversarial data. this
        # will serve as the baseline. use a parallel assignment to speed things up.
        sf_base_jmi, sf_base_mim, sf_base_mrmr, sf_base_mifs = run_feature_selection(
            Xtrk, ytrk, n_selected_features)

        Xtr_mim = Xtrk[:, sf_base_mim]
        Xtr_mifs = Xtrk[:, sf_base_mifs]
        Xtr_mrmr = Xtrk[:, sf_base_mrmr]
        Xtr_jmi = Xtrk[:, sf_base_jmi]

        Xte_mim = Xtek[:, sf_base_mim]
        Xte_mifs = Xtek[:, sf_base_mifs]
        Xte_mrmr = Xtek[:, sf_base_mrmr]
        Xte_jmi = Xtek[:, sf_base_jmi]

        # the err_KNN_norm table gives us the classification accuracy score of the feature
        # selection algorithms performed on untainted data, which can be used for further analysis
        err_KNN_norm[0, 0] += err_KNN_classification(Xtr_mim, ytrk, Xte_mim, ytek)
        err_KNN_norm[0, 1] += err_KNN_classification(Xtr_mifs, ytrk, Xte_mifs, ytek)
        err_KNN_norm[0, 2] += err_KNN_classification(Xtr_mrmr, ytrk, Xte_mrmr, ytek)
        err_KNN_norm[0, 3] += err_KNN_classification(Xtr_jmi, ytrk, Xte_jmi, ytek)

        # loop over the number of poisoning ratios that we need to evaluate
        for n in range(NPR):
            # calculate the number of poisoned data that we are going to need to make sure
            # that the poisoning ratio is correct in the training data. e.g., if you have
            # N=100 samples and you want to poison by 20% then the 20% needs to be from
            # the training size. hence it is not 20.
            Np = int(len(ytrk) * POI_RNG[n] + 1)
            if Np >= len(ya):
                # shouldn't happen but catch the case where we are requesting more poison
                # data samples than are available. NEED TO BE CAREFUL WHEN WE ARE CREATING
                # THE ADVERSARIAL DATA
                raise ValueError(
                    'Number of poison data requested is larger than the available data.')

            # find the number of normal (i.e., not poisoned) samples in the training data.
            # then create the randomized data set that has Nn normal data samples and Np
            # adversarial samples in the training data
            Nn = len(ytrk) - Np
            idx_normal, idx_adversarial = np.random.permutation(len(ytrk))[:Nn], \
                np.random.permutation(len(ya))[:Np]
            Xtrk_poisoned, ytrk_poisoned = np.concatenate((Xtrk[idx_normal], Xa[idx_adversarial])), \
                np.concatenate((ytrk[idx_normal], ya[idx_adversarial]))

            # run feature selection with the training data that has adversarial samples
            sf_adv_jmi, sf_adv_mim, sf_adv_mrmr, sf_adv_mifs = run_feature_selection(
                Xtrk_poisoned, ytrk_poisoned, n_selected_features)

            Xtrk_poisoned_MIM = Xtrk_poisoned[:, sf_adv_mim]
            Xtrk_poisoned_MIFS = Xtrk_poisoned[:, sf_adv_mifs]
            Xtrk_poisoned_MRMR = Xtrk_poisoned[:, sf_adv_mrmr]
            Xtrk_poisoned_JMI = Xtrk_poisoned[:, sf_adv_jmi]

            Xtest_MIM = Xtek[:, sf_adv_mim]
            Xtest_MIFS = Xtek[:, sf_adv_mifs]
            Xtest_MRMR = Xtek[:, sf_adv_mrmr]
            Xtest_JMI = Xtek[:, sf_adv_jmi]

            # calculate the accumulated jaccard and kuncheva performances for each of the
            # feature selection algorithms
            err_jaccard[n, 0] += jaccard(sf_adv_mim, sf_base_mim)
            err_jaccard[n, 1] += jaccard(sf_adv_mifs, sf_base_mifs)
            err_jaccard[n, 2] += jaccard(sf_adv_mrmr, sf_base_mrmr)
            err_jaccard[n, 3] += jaccard(sf_adv_jmi, sf_base_jmi)

            err_kuncheva[n, 0] += kuncheva(sf_adv_mim, sf_base_mim, Xtrk.shape[1])
            err_kuncheva[n, 1] += kuncheva(sf_adv_mifs, sf_base_mifs, Xtrk.shape[1])
            err_kuncheva[n, 2] += kuncheva(sf_adv_mrmr, sf_base_mrmr, Xtrk.shape[1])
            err_kuncheva[n, 3] += kuncheva(sf_adv_jmi, sf_base_jmi, Xtrk.shape[1])

            # the err_KNN_pois table gives the classification accuracy score of the feature
            # selection algorithms performed on poisoned data
            err_KNN_pois[n, 0] += err_KNN_classification(Xtrk_poisoned_MIM, ytrk_poisoned,
                                                         Xtest_MIM, ytek)
            err_KNN_pois[n, 1] += err_KNN_classification(Xtrk_poisoned_MIFS, ytrk_poisoned,
                                                         Xtest_MIFS, ytek)
            err_KNN_pois[n, 2] += err_KNN_classification(Xtrk_poisoned_MRMR, ytrk_poisoned,
                                                         Xtest_MRMR, ytek)
            err_KNN_pois[n, 3] += err_KNN_classification(Xtrk_poisoned_JMI, ytrk_poisoned,
                                                         Xtest_JMI, ytek)

            # Storing all the features in the corresponding feature selection algo list
            MIM_fset[n].append(sf_adv_mim)
            MIFS_fset[n].append(sf_base_mifs)
            MRMR_fset[n].append(sf_base_mrmr)
            JMI_fset[n].append(sf_adv_jmi)

    MIM_stability_score = comb_kuncheva(MIM_fset, 2, cv, Xtrk.shape[1])
    MIFS_stability_score = comb_kuncheva(MIFS_fset, 2, cv, Xtrk.shape[1])
    MRMR_stability_score = comb_kuncheva(MRMR_fset, 2, cv, Xtrk.shape[1])
    JMI_stability_score = comb_kuncheva(JMI_fset, 2, cv, Xtrk.shape[1])

    feature_stability = np.column_stack(
        (MIM_stability_score, MIFS_stability_score, MRMR_stability_score, JMI_stability_score))

    # scale the kuncheva and jaccard statistics by 1.0/cv then write the output file
    err_jaccard, err_kuncheva = err_jaccard / cv, err_kuncheva / cv
    err_KNN_pois, err_KNN_norm = err_KNN_pois / cv, err_KNN_norm / cv

    np.savez(output,
             M1=feature_stability,
             err_jaccard=err_jaccard,
             M2=err_kuncheva,
             M3_pois=err_KNN_pois,
             M3_norm=err_KNN_norm)

    return None
def loss(self, predictions, targets, stats):
    assert type(predictions) == list
    loss = {}
    for i, (p, t) in enumerate(zip(predictions, targets)):
        assert p.shape == t.shape
        l = {}
        batch_size = t.shape[0]
        t = t.permute(0, 2, 3, 1)
        p = p.permute(0, 2, 3, 1)
        t = t.contiguous().view(batch_size, -1, self.num_features)
        p = p.contiguous().view(batch_size, -1, self.num_features)

        img_idx = torch.arange(batch_size, dtype=torch.float, device=self.device)
        img_idx = img_idx.reshape(-1, 1) * p.shape[2]
        t[:, :, 0] += 2. * img_idx
        p[:, :, 0] += 2. * img_idx
        img_idx = torch.arange(batch_size, dtype=torch.float, device=self.device)
        img_idx = img_idx.reshape(-1, 1) * p.shape[1]
        t[:, :, 1] += 2. * img_idx
        p[:, :, 1] += 2. * img_idx

        t = t.contiguous().view(-1, self.num_features)
        p = p.contiguous().view(-1, self.num_features)

        obj_mask = torch.nonzero(t[:, 4]).flatten()
        num_obj = len(obj_mask)

        if obj_mask.numel() > 0:
            p_xyxy = xywh2xyxy(p[:, :4].detach())
            t_xyxy = xywh2xyxy(t[obj_mask, :4])
            all_ious = jaccard(p_xyxy, t_xyxy)
            ious, _ = torch.max(all_ious, dim=1)
            stats['avg_obj_iou'].append(all_ious[obj_mask].diag().mean().item())
            mask = torch.nonzero(ious > self.noobj_iou_threshold).squeeze()
            t[mask, 4] = 1.
            noobj_mask = torch.nonzero(t[:, 4] == 0.).squeeze()

            l['coord'] = nn.MSELoss(reduction='sum')(p[obj_mask, 0], t[obj_mask, 0])
            l['coord'] += nn.MSELoss(reduction='sum')(p[obj_mask, 1], t[obj_mask, 1])
            l['coord'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[obj_mask, 2]),
                                                      torch.sqrt(t[obj_mask, 2]))
            l['coord'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[obj_mask, 3]),
                                                      torch.sqrt(t[obj_mask, 3]))
            l['coord'] *= LAMBDA_COORD / batch_size

            if self.iteration * self.batch_size < 12800:
                l['bias'] = nn.MSELoss(reduction='sum')(p[noobj_mask, 0], t[noobj_mask, 0])
                l['bias'] += nn.MSELoss(reduction='sum')(p[noobj_mask, 1], t[noobj_mask, 1])
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[noobj_mask, 2]),
                                                         torch.sqrt(t[noobj_mask, 2]))
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[noobj_mask, 3]),
                                                         torch.sqrt(t[noobj_mask, 3]))
                l['bias'] *= 0.1 / batch_size

            p[obj_mask, 5:] = F.log_softmax(p[obj_mask, 5:], dim=-1)
            t_long = torch.argmax(t[obj_mask, 5:], dim=1)
            if USE_CROSS_ENTROPY:
                l['class'] = nn.NLLLoss(reduction='sum')(p[obj_mask, 5:], t_long)
            else:
                l['class'] = nn.MSELoss(reduction='sum')(torch.exp(p[obj_mask, 5:]),
                                                         t[obj_mask, 5:])
            l['class'] *= LAMBDA_CLASS / batch_size
            stats['avg_class'].append(torch.exp(p[obj_mask, 5 + t_long]).mean().item())

            # l['object'] = nn.MSELoss(reduction='sum')(p[obj_mask, 4],
            #                                           all_ious[obj_mask, torch.arange(num_obj)].detach())
            l['object'] = nn.MSELoss(reduction='sum')(p[obj_mask, 4], t[obj_mask, 4])
            l['object'] *= LAMBDA_OBJ / batch_size
            stats['avg_pobj'].append(p[obj_mask, 4].mean().item())

            l['no_object'] = nn.MSELoss(reduction='sum')(p[noobj_mask, 4], t[noobj_mask, 4])
            l['no_object'] *= LAMBDA_NOOBJ / batch_size
            stats['avg_pnoobj'].append(p[noobj_mask, 4].mean().item())
        else:
            l['object'] = torch.tensor([0.], device=self.device)
            l['coord'] = torch.tensor([0.], device=self.device)
            l['class'] = torch.tensor([0.], device=self.device)
            l['no_object'] = LAMBDA_NOOBJ / batch_size * nn.MSELoss(reduction='sum')(p[:, 4], t[:, 4])
            if self.iteration * self.batch_size < 12800:
                l['bias'] = nn.MSELoss(reduction='sum')(p[:, 0], t[:, 0])
                l['bias'] += nn.MSELoss(reduction='sum')(p[:, 1], t[:, 1])
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[:, 2]), torch.sqrt(t[:, 2]))
                l['bias'] += nn.MSELoss(reduction='sum')(torch.sqrt(p[:, 3]), torch.sqrt(t[:, 3]))
                l['bias'] *= 0.1 / batch_size

        l['total'] = l['coord'] + l['class'] + l['object'] + l['no_object']
        for k, v in l.items():
            try:
                loss[k] = loss[k] + v
            except KeyError:
                loss[k] = v
    return loss, stats
def main(argv):
    # turn off log messages
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.FATAL)

    # check folder
    if not os.path.exists(FLAGS.dir):
        raise Exception("model directory does not exist!")
    if not os.path.exists(os.path.join(FLAGS.dir, 'generalization')):
        os.makedirs(os.path.join(FLAGS.dir, 'generalization'))

    # get tfrecord list
    test_data_list = glob.glob(FLAGS.indir + '/*')

    # test step
    test_step = FLAGS.num_of_test // FLAGS.batch_size
    if FLAGS.num_of_test % FLAGS.batch_size != 0:
        test_step += 1

    # load test data
    test_set = tf.data.TFRecordDataset(test_data_list)
    test_set = test_set.map(
        lambda x: utils._parse_function_val_test(x, image_size=FLAGS.image_size),
        num_parallel_calls=os.cpu_count())
    test_set = test_set.batch(FLAGS.batch_size)
    test_iter = test_set.make_one_shot_iterator()
    test_data = test_iter.get_next()

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config(index=FLAGS.gpu_index)) as sess:
        # set network
        kwargs = {
            'sess': sess,
            'latent_dim': FLAGS.latent_dim,
            'scale_lambda': FLAGS.scale_lambda,
            'scale_kappa': FLAGS.scale_kappa,
            'scale_psi': FLAGS.scale_psi,
            'image_size': FLAGS.image_size,
            'points_num': FLAGS.points_num,
            'k_size': FLAGS.k_size,
            'encoder_layer': encoder_layer,
            'points_encoder_layer': points_encoder_layer,
            'generator_layer': generator_layer,
            'discriminator_layer': discriminator_layer,
            'code_discriminator_layer': code_discriminator_layer,
            'is_training': False
        }
        Model = conditional_alphaGAN(**kwargs)

        sess.run(init_op)

        # print parameters
        utils.cal_parameter()

        # test
        Model.restore_model(FLAGS.dir + '/model/model_{}'.format(FLAGS.model_index))
        tbar = tqdm(range(test_step), ascii=True)
        for i in tbar:
            test_image_batch, test_points_batch, test_label_batch = sess.run(test_data)
            reconstruction_batch = Model.reconstruction(test_image_batch, test_points_batch)

            # dilation of points
            test_points_batch = tf.keras.layers.MaxPooling3D(
                pool_size=5, strides=1, padding='same')(test_points_batch)
            test_points_batch = test_points_batch.eval()
            test_points_batch = test_points_batch * 2  # scaling

            if i == 0:
                test_label = np.asarray(test_label_batch)
                reconstruction = np.asarray(reconstruction_batch)[0]
                points = np.asarray(test_points_batch)
            else:
                test_label = np.concatenate((test_label, np.asarray(test_label_batch)), axis=0)
                reconstruction = np.concatenate(
                    (reconstruction, np.asarray(reconstruction_batch)[0]), axis=0)
                points = np.concatenate((points, np.array(test_points_batch)), axis=0)

        # calculate Jaccard Index and output images
        generalization = []
        tbar = tqdm(range(reconstruction.shape[0]), ascii=True)
        for i in tbar:
            test_label_single = test_label[i][:, :, :, 0]
            reconstruction_single = reconstruction[i][:, :, :, 0]
            points_single = points[i][:, :, :, 0]

            # label
            rec_label = np.where(reconstruction_single > 0.5, 0, 1)
            rec_label = rec_label.astype(np.int8)

            # calculate ji
            generalization.append([utils.jaccard(rec_label, test_label_single)])

            # label and points
            label_and_points = rec_label + points_single
            rec_label = rec_label.astype(np.int8)
            label_and_points = label_and_points.astype(np.int8)

            # output image
            io.write_mhd_and_raw(
                reconstruction_single,
                '{}.mhd'.format(os.path.join(FLAGS.dir, 'generalization', 'logodds',
                                             'generate_{}'.format(i))),
                spacing=[1, 1, 1], origin=[0, 0, 0], compress=True)
            io.write_mhd_and_raw(
                rec_label,
                '{}.mhd'.format(os.path.join(FLAGS.dir, 'generalization', 'predict',
                                             'recon_{}'.format(i))),
                spacing=[1, 1, 1], origin=[0, 0, 0], compress=True)
            io.write_mhd_and_raw(
                label_and_points,
                '{}.mhd'.format(os.path.join(FLAGS.dir, 'generalization', 'label_and_points',
                                             'generate_{}'.format(i))),
                spacing=[1, 1, 1], origin=[0, 0, 0], compress=True)

        print('generalization = %f' % np.mean(generalization))

        # write csv
        io.write_csv(
            generalization,
            os.path.join(FLAGS.dir, 'generalization',
                         'generalization_val_{}.csv'.format(FLAGS.model_index)),
            'generalization')
def main():
    parser = argparse.ArgumentParser(description='py, test_data_txt, model, outdir')
    parser.add_argument('--test_data_txt', '-i1', default='')
    parser.add_argument('--model', '-i2', default='./model_{}'.format(50000))
    parser.add_argument('--outdir', '-i3', default='')
    args = parser.parse_args()

    # check folder
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # tf flag
    flags = tf.flags
    flags.DEFINE_float("beta", 0.1, "hyperparameter beta")
    flags.DEFINE_integer("num_of_test", 100, "number of test data")
    flags.DEFINE_integer("batch_size", 1, "batch size")
    flags.DEFINE_integer("latent_dim", 2, "latent dim")
    flags.DEFINE_list("image_size", [512, 512, 1], "image size")
    FLAGS = flags.FLAGS

    # read list
    test_data_list = io.load_list(args.test_data_txt)

    # test step
    test_step = FLAGS.num_of_test // FLAGS.batch_size
    if FLAGS.num_of_test % FLAGS.batch_size != 0:
        test_step += 1

    # load test data
    test_set = tf.data.TFRecordDataset(test_data_list)
    test_set = test_set.map(
        lambda x: _parse_function(x, image_size=FLAGS.image_size),
        num_parallel_calls=os.cpu_count())
    test_set = test_set.batch(FLAGS.batch_size)
    test_iter = test_set.make_one_shot_iterator()
    test_data = test_iter.get_next()

    # initializer
    init_op = tf.group(tf.initializers.global_variables(),
                       tf.initializers.local_variables())

    with tf.Session(config=utils.config) as sess:
        # set network
        kwargs = {
            'sess': sess,
            'outdir': args.outdir,
            'beta': FLAGS.beta,
            'latent_dim': FLAGS.latent_dim,
            'batch_size': FLAGS.batch_size,
            'image_size': FLAGS.image_size,
            'encoder': cnn_encoder,
            'decoder': cnn_decoder
        }
        VAE = Variational_Autoencoder(**kwargs)

        sess.run(init_op)

        # testing
        VAE.restore_model(args.model)
        tbar = tqdm(range(test_step), ascii=True)
        preds = []
        ori = []
        for k in tbar:
            test_data_batch = sess.run(test_data)
            ori_single = test_data_batch
            preds_single = VAE.reconstruction_image(ori_single)
            preds_single = preds_single[0, :, :, 0]
            ori_single = ori_single[0, :, :, 0]
            preds.append(preds_single)
            ori.append(ori_single)

        # label
        ji = []
        for j in range(len(preds)):
            # EUDT
            eudt_image = sitk.GetImageFromArray(preds[j])
            eudt_image.SetSpacing([1, 1])
            eudt_image.SetOrigin([0, 0])

            label = np.where(preds[j] > 0, 0, 1)
            label_image = sitk.GetImageFromArray(label)
            label_image.SetSpacing([1, 1])
            label_image.SetOrigin([0, 0])

            ori_label = np.where(ori[j] > 0, 0, 1)
            ori_label_image = sitk.GetImageFromArray(ori_label)
            ori_label_image.SetSpacing([1, 1])
            ori_label_image.SetOrigin([0, 0])

            # calculate ji (wrap in a list so csv.writer.writerows gets one row per case)
            ji.append([utils.jaccard(label, ori_label)])

            # output image
            io.write_mhd_and_raw(
                eudt_image,
                '{}.mhd'.format(os.path.join(args.outdir, 'EUDT', 'recon_{}'.format(j))))
            io.write_mhd_and_raw(
                label_image,
                '{}.mhd'.format(os.path.join(args.outdir, 'label', 'recon_{}'.format(j))))

        generalization = np.mean(ji)
        print('generalization = %f' % generalization)

        # output csv file
        with open(os.path.join(args.outdir, 'generalization.csv'), 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerows(ji)
            writer.writerow(['generalization= ', generalization])
def evaluate(epoch, dataloader, model, criterion, device, prefix=None):
    model.eval()
    with torch.no_grad():
        jaccard_sum, loss_sum, count = 0.0, 0.0, 0.0
        K = 5
        samples = {
            'positive': {'best': [], 'worst': []},
            'negative': {'best': [], 'worst': []},
            'neutral': {'best': [], 'worst': []},
        }
        pbar = tqdm(
            desc='{}Eval Batch'.format('' if prefix is None else prefix + ' '),
            total=len(dataloader),
            leave=False)
        for i, batch in enumerate(dataloader):
            tweet = batch['tweet'].to(device)
            selection = batch['selection'].long().to(device)
            raw_selection = batch['raw_selection']
            raw_tweet = batch['raw_tweet']
            sentiment = batch['sentiment']
            start = batch['start'].long().to(device)
            end = batch['end'].long().to(device)
            pos = batch['pos'].long().to(device)
            offsets = batch['offsets']

            non_pad_elements = selection.shape[1] - (selection == -1).sum(dim=1)

            y_hat_start, y_hat_end = model(tweet, pos)
            loss = criterion(y_hat_start, start, y_hat_end, end, selection)
            loss_sum += loss.data.item()

            y_hat_start = torch.argmax(y_hat_start, dim=1)
            y_hat_end = torch.argmax(y_hat_end, dim=1)

            final = []
            for j, t in enumerate(tweet):
                s = offsets[j][y_hat_start[j]][0]
                e = offsets[j][y_hat_end[j]][1]
                final.append(raw_tweet[j][s:e])

            for j, raw in enumerate(raw_selection):
                selection_output = final[j]
                jacc = jaccard(raw, selection_output)
                jaccard_sum += jacc / tweet.shape[0]

                if len(samples[sentiment[j]]['best']) < K:
                    samples[sentiment[j]]['best'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                elif jacc > samples[sentiment[j]]['best'][0][0]:
                    samples[sentiment[j]]['best'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                    samples[sentiment[j]]['best'].sort(key=lambda x: x[0])
                    samples[sentiment[j]]['best'].pop(0)

                if len(samples[sentiment[j]]['worst']) < K:
                    samples[sentiment[j]]['worst'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                elif jacc < samples[sentiment[j]]['worst'][-1][0]:
                    samples[sentiment[j]]['worst'].append(
                        (jacc, raw_tweet[j], raw, selection_output))
                    samples[sentiment[j]]['worst'].sort(key=lambda x: x[0])
                    samples[sentiment[j]]['worst'].pop(-1)

            count += 1
            pbar.update()
        pbar.clear()
        pbar.close()
        return loss_sum / count, jaccard_sum / count, samples
def train_fn(model, selected_model, dataloaders_dict, criterion, optimizer, num_epochs, filename):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0

            tk0 = tqdm(dataloaders_dict[phase], total=len(dataloaders_dict[phase]))
            for data in tk0:
                ids = data['ids'].to(device)
                masks = data['masks'].to(device)
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].to(device)
                end_idx = data['end_idx'].to(device)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):
                    if selected_model == 'LSTM':
                        start_logits, end_logits = model(ids)
                    elif selected_model == 'RoBERTa':
                        start_logits, end_logits = model(ids, masks)

                    loss = criterion(start_logits, end_logits, start_idx, end_idx)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() * len(ids)

                    start_idx = start_idx.cpu().detach().numpy()
                    end_idx = end_idx.cpu().detach().numpy()
                    start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                    end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()

                    for i in range(len(ids)):
                        start_pred = np.argmax(start_logits[i])
                        end_pred = np.argmax(end_logits[i])
                        pred = utils.get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
                        true = utils.get_selected_text(tweet[i], start_idx[i], end_idx[i], offsets[i])
                        jaccard_score = utils.jaccard(pred, true)
                        epoch_jaccard += jaccard_score

            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_jaccard = epoch_jaccard / len(dataloaders_dict[phase].dataset)
            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))

    torch.save(model.state_dict(), filename)
def eval_fn(data_loader, model, device):
    model.eval()
    fin_output_start = []
    fin_output_end = []
    fin_padding_lens = []
    fin_tweet_tokens = []
    fin_orig_sentiment = []
    fin_orig_selected = []
    fin_orig_tweet = []

    for bi, d in enumerate(data_loader):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        tweet_tokens = d["tweet_tokens"]
        padding_len = d["padding_len"]
        orig_sentiment = d["orig_sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)

        o1, o2 = model(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )

        fin_output_start.append(torch.sigmoid(o1).cpu().detach().numpy())
        fin_output_end.append(torch.sigmoid(o2).cpu().detach().numpy())
        fin_padding_lens.extend(padding_len.cpu().detach().numpy().tolist())
        fin_tweet_tokens.extend(tweet_tokens)
        fin_orig_sentiment.extend(orig_sentiment)
        fin_orig_selected.extend(orig_selected)
        fin_orig_tweet.extend(orig_tweet)

    fin_output_start = np.vstack(fin_output_start)
    fin_output_end = np.vstack(fin_output_end)

    threshold = 0.2
    jaccards = []
    for j in range(len(fin_tweet_tokens)):
        target_string = fin_orig_selected[j]
        tweet_tokens = fin_tweet_tokens[j]
        padding_len = fin_padding_lens[j]
        original_tweet = fin_orig_tweet[j]
        sentiment = fin_orig_sentiment[j]

        if padding_len > 0:
            mask_start = fin_output_start[j, :][:-padding_len] >= threshold
            mask_end = fin_output_end[j, :][:-padding_len] >= threshold
        else:
            mask_start = fin_output_start[j, :] >= threshold
            mask_end = fin_output_end[j, :] >= threshold

        mask = [0] * len(mask_start)
        idx_start = np.nonzero(mask_start)[0]
        idx_end = np.nonzero(mask_end)[0]
        if len(idx_start) > 0:
            idx_start = idx_start[0]
            if len(idx_end) > 0:
                idx_end = idx_end[0]
            else:
                idx_end = idx_start
        else:
            idx_start = 0
            idx_end = 0

        for mj in range(idx_start, idx_end + 1):
            mask[mj] = 1

        output_tokens = [x for p, x in enumerate(tweet_tokens.split()) if mask[p] == 1]
        output_tokens = [x for x in output_tokens if x not in ("[CLS]", "[SEP]")]

        final_output = ""
        for ot in output_tokens:
            if ot.startswith("##"):
                final_output = final_output + ot[2:]
            elif len(ot) == 1 and ot in string.punctuation:
                final_output = final_output + ot
            else:
                final_output = final_output + " " + ot
        final_output = final_output.strip()

        if sentiment == "neutral" or len(original_tweet.split()) < 4:
            final_output = original_tweet

        jac = utils.jaccard(target_string.strip(), final_output.strip())
        jaccards.append(jac)

    mean_jac = np.mean(jaccards)
    return mean_jac
# --- CALCULATE JACCARD SIMILARITY
source_rubric = [[source for source in sources] for source in sources]
filenames = ['jaccard-similarity-%s' % disease for disease in keywords]
filenames += ['jaccard-similarities.json']

if not all([os.path.isfile(filename) for filename in filenames]):
    jaccard_matrices = {disease: np.zeros((len(sources), len(sources))) for disease in keywords}
    for disease in keywords:
        jaccard_matrices[disease] = np.array(
            [[tech.jaccard(corpus[sources[i]][disease], corpus[sources[j]][disease])
              for i in xrange(len(sources))]
             for j in xrange(len(sources))])

        fig = plt.figure()
        ax = fig.add_subplot(111)
        cax = ax.imshow(jaccard_matrices[disease], interpolation='nearest',
                        aspect='equal', vmin=0, vmax=1)
        ax.set_xticks(range(len(sources)))
        ax.set_yticks(range(len(sources)))
        ax.set_xticklabels(map(tech.format, sources))
        ax.set_yticklabels(map(tech.format, sources))
        cbar = plt.colorbar(cax)
        cbar.set_label(tech.format('Jaccard Similarity'))