def process_projects(src_directory, glossary_description, glossary_file):
    corpus = Corpus(src_directory)
    corpus.process()

    reference_sources = ReferenceSources()
    reference_sources.read_sources()

    metrics = Metrics()
    metrics.create(corpus)

    # Select terms
    MAX_TERMS = 5000
    sorted_terms_by_tfxdf = sorted(metrics.tfxdf, key=metrics.tfxdf.get,
                                   reverse=True)

    # Developer report
    glossary_entries = OrderedDict()
    translations = Translations()
    selected_terms = sorted_terms_by_tfxdf[:MAX_TERMS]  # Sorted by frequency
    for term in selected_terms:
        glossary_entries[term] = translations.create_for_word_sorted_by_frequency(
            corpus.documents, term, reference_sources)

    dev_glossary_serializer = DevGlossarySerializer()
    dev_glossary_serializer.create(u"dev-" + glossary_file + ".html",
                                   glossary_description, corpus,
                                   glossary_entries, reference_sources)

    # User report
    glossary_entries = []
    selected_terms = sorted(sorted_terms_by_tfxdf[:MAX_TERMS])  # Sorted by term
    glossary = Glossary(glossary_description)
    for term in selected_terms:
        glossary_entry = GlossaryEntry(
            term,
            translations.create_for_word_sorted_by_frequency(
                corpus.documents, term, reference_sources))
        glossary.entries.append(glossary_entry)

    glossary_entries = glossary.get_dict()
    process_template('templates/userglossary-html.mustache',
                     glossary_file + ".html", glossary_entries)
    process_template('templates/userglossary-csv.mustache',
                     glossary_file + ".csv", glossary_entries)
    generate_database(glossary, glossary_file)
def update_res(config_results, mydir, latest):
    if os.path.exists(latest):
        shutil.rmtree(latest)

    text = 'var configs = ['
    for config in config_results[:-1]:
        text += str(config) + ','
    text += str(config_results[-1]) + '];'

    if os.path.exists('config.js'):
        os.remove('config.js')
    Metrics.saveConfig('config.js', text)
    Metrics.copyDirectory(mydir, latest)
def get_average_throughput_for_node(self, node_id=0):
    data_set = []
    data = self.f_receive_events_at(self._all_events, node_id)

    if self._sim_mode == MODE_WIRELESS:
        assert(I_TIMESTAMP_TOKEN == 1)
        data = self.f_events_at_level(data, 'MAC')
        data = self.f_events_with_data_pkts(data)

    if self._sim_mode == MODE_WIRELESS or \
            self._sim_mode == MODE_WIRED:
        data_set = izip(
            self.f_get_cols(data, col_num=I_TIMESTAMP),
            self.f_get_cols(data, col_num=I_PKT_LEN)
        )
    else:
        c_ts = []
        c_len = []
        for e in data:
            #print e
            if is_wired(e):
                c_ts.append(e[I_W_TIMESTAMP])
                c_len.append(e[I_W_PKT_LEN])
            else:
                c_ts.append(e[I_WL_TIMESTAMP])
                c_len.append(e[I_WL_PKT_LEN])
        data_set = izip(c_ts, c_len)

    #for e in data_set: print e
    return Metrics.average_throughput(data_set, -1)
def get_packet_retransmissions(self, src_node=0, dst_node=0):
    data = self.f_send_events(self._all_events)
    data = self.f_events_at_node(data, src_node)

    if self._sim_mode == MODE_WIRELESS:
        assert(I_TIMESTAMP_TOKEN == 1)
        data = self.f_events_at_level(data, L_AGENT)
        data = self.f_events_with_data_pkts(data)

    # Check for destination node
    data = self.f_events_with_dst_node(data, dst_node)

    pkt_seq_num = []
    if self._sim_mode == MODE_WIRELESS:
        for event in data:
            try:
                if event[I_SEQ_NUM_TOKEN] == S_SEQ_NUM_TOKEN:
                    # Wireless, from AGT
                    pkt_seq_num.append(event[I_SEQ_NUM])
            except IndexError:
                continue
    elif self._sim_mode == MODE_WIRED:
        assert(I_TIMESTAMP_TOKEN == -1)
        for event in data:
            try:
                pkt_seq_num.append(event[I_SEQ_NUM])
            except IndexError:
                continue

    #pkt_seq_num = self.f_get_cols(data, col_num=I_SEQ_NUM)
    #print pkt_seq_num[:40]
    return Metrics.packet_retransmissions(pkt_seq_num)
def get_end2end_delay(self, src_node=0, dst_node=0):
    data = self.f_send_events(self._all_events)
    data = self.f_events_at_node(data, src_node)

    if self._sim_mode == MODE_WIRELESS:
        assert(I_TIMESTAMP_TOKEN == 1)
        data = self.f_events_at_level(data, L_AGENT)
        data = self.f_events_with_data_pkts(data)

    pkt_seq_num = []
    pkt_timestamp = []
    #print data[:10]
    if self._sim_mode == MODE_WIRELESS:
        assert(I_TIMESTAMP_TOKEN == 1)
        for event in data:
            try:
                if event[I_SEQ_NUM_TOKEN] == S_SEQ_NUM_TOKEN:
                    # Wireless, from AGT
                    pkt_seq_num.append(event[I_SEQ_NUM])
                    pkt_timestamp.append(event[I_TIMESTAMP])
            except IndexError:
                continue
    else:
        assert(I_TIMESTAMP_TOKEN == -1)
        for event in data:
            try:
                pkt_seq_num.append(event[I_SEQ_NUM])
                pkt_timestamp.append(event[I_TIMESTAMP])
            except IndexError:
                continue
    send_pkts = izip(pkt_seq_num, pkt_timestamp)

    data = self.__common_filters__(dst_node)
    pkt_seq_num = []
    pkt_timestamp = []
    #print data[:10]
    if self._sim_mode == MODE_WIRELESS:
        for event in data:
            try:
                if event[I_SEQ_NUM_TOKEN] == S_SEQ_NUM_TOKEN:
                    # Wireless, from AGT
                    pkt_seq_num.append(event[I_SEQ_NUM])
                    pkt_timestamp.append(event[I_TIMESTAMP])
            except IndexError:
                continue
    else:
        for event in data:
            try:
                pkt_seq_num.append(event[I_SEQ_NUM])
                pkt_timestamp.append(event[I_TIMESTAMP])
            except IndexError:
                continue
    rcvd_pkts = izip(pkt_seq_num, pkt_timestamp)

    return Metrics.end2end_delay(send_pkts, rcvd_pkts)
def process_projects():
    global glossary_file
    global glossary_description

    corpus = Corpus(src_directory)
    corpus.process()

    reference_sources = ReferenceSources()
    reference_sources.read_sources()

    metrics = Metrics()
    metrics.create(corpus)

    # Select terms
    MAX_TERMS = 1000
    sorted_terms_by_tfxdf = sorted(metrics.tfxdf, key=metrics.tfxdf.get,
                                   reverse=True)

    # Developer report
    glossary_entries = OrderedDict()
    translations = Translations()
    selected_terms = sorted_terms_by_tfxdf[:MAX_TERMS]  # Sorted by frequency
    for term in selected_terms:
        glossary_entries[term] = translations.create_for_word_sorted_by_frequency(
            corpus.documents, term, reference_sources)

    dev_glossary_serializer = DevGlossarySerializer()
    dev_glossary_serializer.create(u"dev-" + glossary_file + ".html",
                                   glossary_description, corpus,
                                   glossary_entries, reference_sources)

    # User report
    glossary_entries = []
    selected_terms = sorted(sorted_terms_by_tfxdf[:MAX_TERMS])  # Sorted by term
    glossary = Glossary()
    glossary.description = glossary_description
    for term in selected_terms:
        glossary_entry = GlossaryEntry()
        glossary_entry.source_term = term
        glossary_entry.translations = translations.create_for_word_sorted_by_frequency(
            corpus.documents, term, reference_sources)
        glossary.entries.append(glossary_entry)

    user_glossary_serializer = UserGlossarySerializer()
    user_glossary_serializer.create(glossary_file, glossary.get_dict(),
                                    reference_sources)
def get_cumulative_bytes_received_for_node_at_layers(self, node_id=0, layers=[]):
    data = []
    data_set = []
    if layers == []:
        layers = [L_AGENT, ]
    #print 'layers:', layers

    if self._sim_mode == MODE_WIRELESS or \
            self._sim_mode == MODE_WIRED:
        data = self.f_receive_events_at(self._all_events, node_id)
        #print 'get_cumulative_bytes_received_for_node_at_layers:', len(data)

        if self._sim_mode == MODE_WIRELESS:
            assert(I_TIMESTAMP_TOKEN == 1)
            data = self.f_events_at_levels(data, layers)
            data = self.f_events_with_data_pkts(data)  # All trace types are taken care of

        data_set = izip(
            self.f_get_cols(data, col_num=I_TIMESTAMP),
            self.f_get_cols(data, col_num=I_PKT_LEN)
        )
        #print 'get_cumulative_bytes_received_for_node_at_layers:', len(data)
    else:
        # Mixed mode
        wired_events = []
        wireless_events = []
        rcv_events = self.f_receive_events(self._all_events)  # All receive events
        #print 'len(rcv_events):', len(rcv_events)
        for e in rcv_events:
            if is_wired(e):
                if e[I_W_NXT_NODE_ID] == node_id:
                    wired_events.append(e)
            else:
                if e[I_WL_NXT_NODE_ID] == node_id:
                    wireless_events.append(e)

        wireless_events = self.f_events_at_levels(wireless_events, layers)
        data = wired_events + wireless_events
        #print 'len(data):', len(data)
        #data = self.f_events_with_data_pkts(data) # All trace types are taken care of

        c_ts = self.f_get_cols(wired_events, col_num=I_W_TIMESTAMP) + \
            self.f_get_cols(wireless_events, col_num=I_WL_TIMESTAMP)
        c_len = self.f_get_cols(wired_events, col_num=I_W_PKT_LEN) + \
            self.f_get_cols(wireless_events, col_num=I_WL_PKT_LEN)
        data_set = izip(c_ts, c_len)
        #print c_ts[:10]

    return Metrics.cumulative_bytes_received(data_set)
def get_instantaneous_throughput_for_node(self, node_id=0):
    data = self.f_receive_events_at(self._all_events, node_id)
    #data = self.f_events_at_node(data, node_id)

    if self._sim_mode == MODE_WIRELESS:
        assert(I_TIMESTAMP_TOKEN == 1)
        data = self.f_events_at_level(data, 'AGT')
        data = self.f_events_with_data_pkts(data)

    data_set = []
    data_set = izip(
        self.f_get_cols(data, col_num=I_TIMESTAMP),
        self.f_get_cols(data, col_num=I_PKT_LEN)
    )
    #for e in data_set: print e
    return Metrics.instantaneous_throughput(data_set)
def setUp(self):
    self.labels_pred = {
        0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0, 8: 0,
        9: 0, 10: 0, 11: 2, 12: 1, 13: 2, 14: 1, 15: 2, 16: 2}
    self.labels_true = {
        0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1,
        9: 1, 10: 1, 11: 1, 12: 2, 13: 2, 14: 2, 15: 2, 16: 2}
    self._n = len(self.labels_pred)
    self.metrics = Metrics(self.labels_true, self.labels_pred)
#!venv/bin/python
from metrics import Metrics

m = Metrics()
files = ['2010-13-080', '2010-00-072', '2010-00-094', '2010-26-075',
         '2010-58-011', '2010-08-078', '2010-94-034', '2010-71-034',
         '2010-40-008', '2010-08-069', '2010-92-061', '2010-70-013']

# tests the cutRecall
recall = m.cutRecall(sorted(files), '2010-001')
print(recall)

# tests the cutPrecision
precision = m.cutPrecision(sorted(files), '2010-001')
print(precision)

# tests the FMeasure
FMeasure = m.FMeasure(precision, recall)
print(FMeasure)

# tests the RRank1
RRank1 = m.RRank1(files, '2010-001')
print(RRank1)

# tests the RRank2
RRank2 = m.RRank2(files, '2010-001')
print(RRank2)

# tests the APrecision
Aprecision = m.APrecision(files, '2010-001')
print(Aprecision)

# tests the nDCG
nDCG = m.nDCG(files, '2010-001', 10)
print(nDCG)
if not os.path.exists(DUMPS_FOLDER):
    os.makedirs(DUMPS_FOLDER)

if not os.environ.get(WEBSITE_HOSTNAME):
    os.environ[WEBSITE_HOSTNAME] = f'localhost-main-{now()}'

if 'email' in watch_types:
    store = CredentialsStore()

    def start_mail_checker_thread(email):
        mail_checker = MailChecker(store, email, config, tg_bot)
        return mail_checker.start_loop(args.nb_attempts)

    mail_checkers = list(map(lambda data: start_mail_checker_thread(data['email']),
                             form_data))

    logger.info(f"Waiting for {len(mail_checkers)} email checking threads to finish")
    for th in mail_checkers:
        th.join()

if 'website' in watch_types:
    http_client = HttpClient()
    #browsers = list(map(lambda data: Browser(config, data, tg_bot, http_client), form_data))
    metrics = Metrics(export_metrics=False)
    watcher = WatcherMultislot(tg_bot, http_client, metrics, config, args.parallelism)
    watcher.start_loop(max_attempts=args.nb_attempts)

    logger.info(f"Waiting for {len(watcher.form_submit_threads)} submit actions to finish")
    for th in watcher.form_submit_threads:
        th.join()

logger.info("Done. Exiting")
data_loader = DataLoader(data_set, batch_size=BATCH_SIZE, shuffle=True,
                         num_workers=NUM_WORKERS)

# TRAIN
def lr_func_exp(step):
    return 0.95 ** step

optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
if store.optims:
    optimizer.load_state_dict(store.optims)
scheduler = LambdaLR(optimizer, lr_lambda=lr_func_exp,
                     last_epoch=epoch if store.optims else -1)
# criterion = nn.BCELoss()
# criterion = nn.BCEWithLogitsLoss()
criterion = CrossEntropyLoss2d()
metrics = Metrics()
if store.metrics:
    metrics.load_state_dict(store.metrics)

if FAKE:
    print('STOP TRAINING')
    exit(0)

# LOOP
print(f'Starting ({now_str()})')
iter_count = len(data_set) // BATCH_SIZE
while epoch < first_epoch + EPOCH_COUNT:
    iter_metrics = Metrics()
    lr = scheduler.get_lr()[0]
    for i, (inputs, labels) in enumerate(data_loader):
        inputs = inputs.to(device)
from config import Configuration
from reddit import Reddit
from data import Data
from metrics import Metrics
from logger import Logger

configuration = Configuration()
loggerInstance = Logger(configuration.logLocation)
metrics = Metrics(loggerInstance)
metrics.start()
r = Reddit(configuration.clientId, configuration.clientSecret,
           configuration.password, configuration.userAgent,
           configuration.username, configuration.call, metrics,
           loggerInstance)
data = Data(configuration.apiKey, loggerInstance)
r.parseUnreadItems(data)
metrics.end()
metrics.buildInboxReport()
def evaluate(
    args,
    model,
    iterator,
    vocab,
    optimizers,
    step=0,
    epoch=0,
    save_checkpoint=True,
    save_predictions=True,
    save_csv=True,
    sampled_evaluation=False,
    metrics=Metrics(),
):
    print()
    logging.info(
        f"Start evaluation on split {'test' if args.eval_on_test_only else 'valid'}"
    )
    model.eval()
    model.to(args.device, args.eval_device)

    all_words, all_tags, all_y, all_y_hat, all_predicted, all_token_ids = [], [], [], [], [], []
    with torch.no_grad():
        for iter, batch in enumerate(tqdm.tqdm(iterator)):
            (
                batch_token_ids,
                label_ids,
                label_probs,
                eval_mask,
                label_id_to_entity_id_dict,
                batch_entity_ids,
                orig_batch,
                _,
            ) = batch

            logits, y, y_hat, probs, _, _ = model(
                batch_token_ids, None, None)  # logits: (N, T, VOCAB), y: (N, T)

            tags = list()
            predtags = list()
            y_resolved_list = list()
            y_hat_resolved_list = list()
            token_list = list()

            chunk_len = args.create_integerized_training_instance_text_length
            chunk_overlap = args.create_integerized_training_instance_text_overlap

            for batch_id, seq in enumerate(label_probs.max(-1)[1]):
                for tok_id, label_id in enumerate(
                        seq[chunk_overlap:-chunk_overlap]):
                    y_resolved = (vocab.PAD_ID
                                  if eval_mask[batch_id][tok_id + chunk_overlap] == 0
                                  else label_ids[label_id].item())
                    y_resolved_list.append(y_resolved)
                    tags.append(vocab.idx2tag[y_resolved])
                    if sampled_evaluation:
                        y_hat_resolved = (
                            vocab.PAD_ID
                            if eval_mask[batch_id][tok_id + chunk_overlap] == 0
                            else label_ids[y_hat[batch_id][
                                tok_id + chunk_overlap]].item())
                    else:
                        y_hat_resolved = y_hat[batch_id][
                            tok_id + chunk_overlap].item()
                    y_hat_resolved_list.append(y_hat_resolved)
                    predtags.append(vocab.idx2tag[y_hat_resolved])
                    token_list.append(
                        batch_token_ids[batch_id][tok_id + chunk_overlap].item())

            all_y.append(y_resolved_list)
            all_y_hat.append(y_hat_resolved_list)
            all_tags.append(tags)
            all_predicted.append(predtags)
            all_words.append(
                vocab.tokenizer.convert_ids_to_tokens(token_list))
            all_token_ids.append(token_list)

    ## calc metric
    y_true = numpy.array(list(chain(*all_y)))
    y_pred = numpy.array(list(chain(*all_y_hat)))
    all_token_ids = numpy.array(list(chain(*all_token_ids)))

    num_proposed = len(y_pred[(vocab.OUTSIDE_ID > y_pred)
                              & (all_token_ids > 0)])
    # numpy.int was removed in NumPy >= 1.24; plain int is equivalent here
    num_correct = ((y_true == y_pred) & (vocab.OUTSIDE_ID > y_true)
                   & (all_token_ids > 0)).astype(int).sum()
    num_gold = len(y_true[(vocab.OUTSIDE_ID > y_true)
                          & (all_token_ids > 0)])

    new_metrics = Metrics(
        epoch=epoch,
        step=step,
        num_correct=num_correct,
        num_proposed=num_proposed,
        num_gold=num_gold,
    )

    if save_predictions:
        final = args.logdir + "/%s.P%.2f_R%.2f_F%.2f" % (
            "{}-{}".format(str(epoch), str(step)),
            new_metrics.precision,
            new_metrics.recall,
            new_metrics.f1,
        )
        with open(final, "w") as fout:
            for words, tags, y_hat, preds in zip(all_words, all_tags,
                                                 all_y_hat, all_predicted):
                assert len(preds) == len(words) == len(tags)
                for w, t, p in zip(words, tags, preds):
                    fout.write(f"{w}\t{t}\t{p}\n")
                fout.write("\n")

            fout.write(f"num_proposed:{num_proposed}\n")
            fout.write(f"num_correct:{num_correct}\n")
            fout.write(f"num_gold:{num_gold}\n")
            fout.write(f"precision={new_metrics.precision}\n")
            fout.write(f"recall={new_metrics.recall}\n")
            fout.write(f"f1={new_metrics.f1}\n")

    if not args.dont_save_checkpoints:
        if save_checkpoint and metrics.was_improved(new_metrics):
            config = {
                "args": args,
                "optimizer_dense": optimizers[0].state_dict(),
                "optimizer_sparse": optimizers[1].state_dict(),
                "model": model.state_dict(),
                "epoch": epoch,
                "step": step,
                "performance": new_metrics.dict(),
            }
            fname = os.path.join(args.logdir,
                                 "{}-{}".format(str(epoch), str(step)))
            torch.save(config, f"{fname}.pt")
            fname = os.path.join(
                args.logdir, new_metrics.get_best_checkpoint_filename())
            torch.save(config, f"{fname}.pt")
            logging.info(f"weights were saved to {fname}.pt")

    if save_csv:
        new_metrics.to_csv(epoch=epoch, step=step, args=args)

    if metrics.was_improved(new_metrics):
        metrics.update(new_metrics)

    logging.info("Finished evaluation")
    return metrics
from PIL import Image, ImageDraw, ImageFont
import codecs
from metrics import Metrics, A4_LANDSCAPE_IN_MM
import sys

msg = str(sys.argv[1]).upper()
file_name = str(sys.argv[2])

m = Metrics(300)
font = ImageFont.truetype("arial.ttf", m.mm2pt(10))
im = Image.new("RGB", m.mmpoint2px(A4_LANDSCAPE_IN_MM), "#ffffff")
draw = ImageDraw.Draw(im)


def draw_letter_rect(x, y, letter):
    draw.rectangle((m.mmpoint2px((x, y)), m.mmpoint2px((x + 12, y + 10))),
                   outline="#000000", fill=None)
    textsize = draw.textsize(letter)
    tx = m.mm2px(x + 2)
    ty = m.mm2px(y)
    draw.text((tx, ty), letter, font=font, fill="#000000")


def draw_empty_rect(x, y):
    draw.rectangle((m.mmpoint2px((x, y)), m.mmpoint2px((x + 12, y + 10))),
                   outline="#000000", fill=None)


xpos = 10
for c in "ABCDEFGHIJKLM":
    draw_letter_rect(xpos, 10, c)
    draw_letter_rect(xpos, 20, codecs.encode(c, "rot_13"))
def evaluate(self, dataloader):
    """
    Evaluate a model on a validation dataloader.
    """
    print("Running Validation...")
    t0 = time.time()
    self.model.eval()

    # Tracking variables
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    y_true = []
    y_pred = []

    # Evaluate data for one epoch
    for batch in dataloader:
        # `batch` contains three (optionally four) pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        #   [3]: output_mask (optional)
        input_ids = batch[0].to(self.device)
        attention_mask = batch[1].to(self.device)
        label_ids = batch[2].to(self.device)
        output_mask = None
        if self.use_output_mask:
            output_mask = batch[3].to(self.device)

        with torch.no_grad():
            # The documentation for the BERT `models` are here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html
            outputs = self.model(input_ids,
                                 attention_mask=attention_mask,
                                 labels=label_ids)
            loss = outputs[0]
            logits = outputs[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        if self.use_output_mask:
            output_mask = output_mask.to('cpu').numpy()
            active_loss = (output_mask == 1)
        else:
            active_loss = np.ones(label_ids.shape)
            active_loss = (active_loss == 1)

        pred_flat = np.argmax(logits, axis=-1)[active_loss].flatten()
        labels_flat = label_ids[active_loss].flatten()
        y_true.append(labels_flat)
        y_pred.append(pred_flat)

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += Metrics.flat_accuracy(label_ids, logits)

    # Report results
    report = Metrics.report(self.metric_name,
                            [item for sublist in y_true for item in sublist],
                            [item for sublist in y_pred for item in sublist])
    print(report)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    return avg_val_accuracy, avg_val_loss, validation_time, report
class GMVAE:
    def __init__(self, args):
        self.num_epochs = args.epochs
        self.cuda = args.cuda
        self.verbose = args.verbose

        self.batch_size = args.batch_size
        self.batch_size_val = args.batch_size_val
        self.learning_rate = args.learning_rate
        self.decay_epoch = args.decay_epoch
        self.lr_decay = args.lr_decay
        self.w_cat = args.w_categ
        self.w_gauss = args.w_gauss
        self.w_rec = args.w_rec
        self.rec_type = args.rec_type

        self.num_classes = args.num_classes
        self.gaussian_size = args.gaussian_size
        self.input_size = args.input_size

        # gumbel
        self.init_temp = args.init_temp
        self.decay_temp = args.decay_temp
        self.hard_gumbel = args.hard_gumbel
        self.min_temp = args.min_temp
        self.decay_temp_rate = args.decay_temp_rate
        self.gumbel_temp = self.init_temp

        self.network = GMVAENet(self.input_size, self.gaussian_size,
                                self.num_classes)
        self.losses = LossFunctions()
        self.metrics = Metrics()

        if self.cuda:
            self.network = self.network.cuda()

    def unlabeled_loss(self, data, out_net):
        """Method defining the loss functions derived from the variational lower bound

        Args:
            data: (array) corresponding array containing the input data
            out_net: (dict) contains the graph operations or nodes of the network output

        Returns:
            loss_dic: (dict) contains the values of each loss function and predictions
        """
        # obtain network variables
        z, data_recon = out_net['gaussian'], out_net['x_rec']
        logits, prob_cat = out_net['logits'], out_net['prob_cat']
        y_mu, y_var = out_net['y_mean'], out_net['y_var']
        mu, var = out_net['mean'], out_net['var']

        # reconstruction loss
        loss_rec = self.losses.reconstruction_loss(data, data_recon,
                                                   self.rec_type)

        # gaussian loss
        loss_gauss = self.losses.gaussian_loss(z, mu, var, y_mu, y_var)

        # categorical loss
        loss_cat = -self.losses.entropy(logits, prob_cat) - np.log(0.1)

        # total loss
        loss_total = self.w_rec * loss_rec + self.w_gauss * loss_gauss + \
            self.w_cat * loss_cat

        # obtain predictions
        _, predicted_labels = torch.max(logits, dim=1)

        loss_dic = {
            'total': loss_total,
            'predicted_labels': predicted_labels,
            'reconstruction': loss_rec,
            'gaussian': loss_gauss,
            'categorical': loss_cat
        }
        return loss_dic

    def train_epoch(self, optimizer, data_loader):
        """Train the model for one epoch

        Args:
            optimizer: (Optim) optimizer to use in backpropagation
            data_loader: (DataLoader) corresponding loader containing the training data

        Returns:
            average of all loss values, accuracy, nmi
        """
        self.network.train()
        total_loss = 0.
        recon_loss = 0.
        cat_loss = 0.
        gauss_loss = 0.

        accuracy = 0.
        nmi = 0.
        num_batches = 0.

        true_labels_list = []
        predicted_labels_list = []

        # iterate over the dataset
        for (data, labels) in data_loader:
            if self.cuda == 1:
                data = data.cuda()

            optimizer.zero_grad()

            # flatten data
            data = data.view(data.size(0), -1)

            # forward call
            out_net = self.network(data, self.gumbel_temp, self.hard_gumbel)
            unlab_loss_dic = self.unlabeled_loss(data, out_net)
            total = unlab_loss_dic['total']

            # accumulate values
            total_loss += total.item()
            recon_loss += unlab_loss_dic['reconstruction'].item()
            gauss_loss += unlab_loss_dic['gaussian'].item()
            cat_loss += unlab_loss_dic['categorical'].item()

            # perform backpropagation
            total.backward()
            optimizer.step()

            # save predicted and true labels
            predicted = unlab_loss_dic['predicted_labels']
            true_labels_list.append(labels)
            predicted_labels_list.append(predicted)

            num_batches += 1.

        # average per batch
        total_loss /= num_batches
        recon_loss /= num_batches
        gauss_loss /= num_batches
        cat_loss /= num_batches

        # concat all true and predicted labels
        true_labels = torch.cat(true_labels_list, dim=0).cpu().numpy()
        predicted_labels = torch.cat(predicted_labels_list, dim=0).cpu().numpy()

        # compute metrics
        accuracy = 100.0 * self.metrics.cluster_acc(predicted_labels, true_labels)
        nmi = 100.0 * self.metrics.nmi(predicted_labels, true_labels)

        return total_loss, recon_loss, gauss_loss, cat_loss, accuracy, nmi

    def test(self, data_loader, return_loss=False):
        """Test the model with new data

        Args:
            data_loader: (DataLoader) corresponding loader containing the test/validation data
            return_loss: (boolean) whether to return the average loss values

        Return:
            accuracy and nmi for the given test data
        """
        self.network.eval()
        total_loss = 0.
        recon_loss = 0.
        cat_loss = 0.
        gauss_loss = 0.

        accuracy = 0.
        nmi = 0.
        num_batches = 0.

        true_labels_list = []
        predicted_labels_list = []

        with torch.no_grad():
            for data, labels in data_loader:
                if self.cuda == 1:
                    data = data.cuda()

                # flatten data
                data = data.view(data.size(0), -1)

                # forward call
                out_net = self.network(data, self.gumbel_temp, self.hard_gumbel)
                unlab_loss_dic = self.unlabeled_loss(data, out_net)

                # accumulate values
                total_loss += unlab_loss_dic['total'].item()
                recon_loss += unlab_loss_dic['reconstruction'].item()
                gauss_loss += unlab_loss_dic['gaussian'].item()
                cat_loss += unlab_loss_dic['categorical'].item()

                # save predicted and true labels
                predicted = unlab_loss_dic['predicted_labels']
                true_labels_list.append(labels)
                predicted_labels_list.append(predicted)

                num_batches += 1.

        # average per batch
        if return_loss:
            total_loss /= num_batches
            recon_loss /= num_batches
            gauss_loss /= num_batches
            cat_loss /= num_batches

        # concat all true and predicted labels
        true_labels = torch.cat(true_labels_list, dim=0).cpu().numpy()
        predicted_labels = torch.cat(predicted_labels_list, dim=0).cpu().numpy()

        # compute metrics
        accuracy = 100.0 * self.metrics.cluster_acc(predicted_labels, true_labels)
        nmi = 100.0 * self.metrics.nmi(predicted_labels, true_labels)

        if return_loss:
            return total_loss, recon_loss, gauss_loss, cat_loss, accuracy, nmi
        else:
            return accuracy, nmi

    def train(self, train_loader, val_loader):
        """Train the model

        Args:
            train_loader: (DataLoader) corresponding loader containing the training data
            val_loader: (DataLoader) corresponding loader containing the validation data

        Returns:
            output: (dict) contains the history of train/val loss
        """
        optimizer = optim.Adam(self.network.parameters(), lr=self.learning_rate)
        train_history_acc, val_history_acc = [], []
        train_history_nmi, val_history_nmi = [], []

        for epoch in range(1, self.num_epochs + 1):
            train_loss, train_rec, train_gauss, train_cat, train_acc, train_nmi = \
                self.train_epoch(optimizer, train_loader)
            val_loss, val_rec, val_gauss, val_cat, val_acc, val_nmi = \
                self.test(val_loader, True)

            # if verbose then print specific information about training
            if self.verbose == 1:
                print("(Epoch %d / %d)" % (epoch, self.num_epochs))
                print("Train - REC: %.5lf;  Gauss: %.5lf;  Cat: %.5lf;"
                      % (train_rec, train_gauss, train_cat))
                print("Valid - REC: %.5lf;  Gauss: %.5lf;  Cat: %.5lf;"
                      % (val_rec, val_gauss, val_cat))
                print("Accuracy=Train: %.5lf; Val: %.5lf  NMI=Train: %.5lf; Val: %.5lf  Total Loss=Train: %.5lf; Val: %.5lf"
                      % (train_acc, val_acc, train_nmi, val_nmi,
                         train_loss, val_loss))
            else:
                print('(Epoch %d / %d) Train_Loss: %.3lf; Val_Loss: %.3lf  Train_ACC: %.3lf; Val_ACC: %.3lf  Train_NMI: %.3lf; Val_NMI: %.3lf'
                      % (epoch, self.num_epochs, train_loss, val_loss,
                         train_acc, val_acc, train_nmi, val_nmi))

            # decay gumbel temperature
            if self.decay_temp == 1:
                self.gumbel_temp = np.maximum(
                    self.init_temp * np.exp(-self.decay_temp_rate * epoch),
                    self.min_temp)
                if self.verbose == 1:
                    print("Gumbel Temperature: %.3lf" % self.gumbel_temp)

            train_history_acc.append(train_acc)
            val_history_acc.append(val_acc)
            train_history_nmi.append(train_nmi)
            val_history_nmi.append(val_nmi)

        return {
            'train_history_nmi': train_history_nmi,
            'val_history_nmi': val_history_nmi,
            'train_history_acc': train_history_acc,
            'val_history_acc': val_history_acc
        }

    def latent_features(self, data_loader, return_labels=False):
        """Obtain latent features learnt by the model

        Args:
            data_loader: (DataLoader) loader containing the data
            return_labels: (boolean) whether to return true labels or not

        Returns:
            features: (array) array containing the features from the data
        """
        self.network.eval()
        N = len(data_loader.dataset)
        features = np.zeros((N, self.gaussian_size))
        if return_labels:
            true_labels = np.zeros(N, dtype=np.int64)
        start_ind = 0
        with torch.no_grad():
            for (data, labels) in data_loader:
                if self.cuda == 1:
                    data = data.cuda()
                # flatten data
                data = data.view(data.size(0), -1)
                out = self.network.inference(data, self.gumbel_temp,
                                             self.hard_gumbel)
                latent_feat = out['mean']
                end_ind = min(start_ind + data.size(0), N + 1)

                # return true labels
                if return_labels:
                    true_labels[start_ind:end_ind] = labels.cpu().numpy()
                features[start_ind:end_ind] = latent_feat.cpu().detach().numpy()
                start_ind += data.size(0)

        if return_labels:
            return features, true_labels
        return features

    def reconstruct_data(self, data_loader, sample_size=-1):
        """Reconstruct Data

        Args:
            data_loader: (DataLoader) loader containing the data
            sample_size: (int) size of random data to consider from data_loader

        Returns:
            reconstructed: (array) array containing the reconstructed data
        """
        self.network.eval()

        # sample random data from loader
        indices = np.random.randint(0, len(data_loader.dataset),
                                    size=sample_size)
        test_random_loader = torch.utils.data.DataLoader(
            data_loader.dataset,
            batch_size=sample_size,
            sampler=SubsetRandomSampler(indices))

        # obtain values
        it = iter(test_random_loader)
        test_batch_data, _ = next(it)  # Python 3 iterator protocol (was it.next())
        original = test_batch_data.data.numpy()
        if self.cuda:
            test_batch_data = test_batch_data.cuda()

        # obtain reconstructed data
        out = self.network(test_batch_data, self.gumbel_temp, self.hard_gumbel)
        reconstructed = out['x_rec']
        return original, reconstructed.data.cpu().numpy()

    def plot_latent_space(self, data_loader, save=False):
        """Plot the latent space learnt by the model

        Args:
            data_loader: (DataLoader) loader containing the data and labels
            save: (bool) whether to save the latent space plot

        Returns:
            fig: (figure) plot of the latent space
        """
        # obtain the latent features together with the true labels used to
        # color the scatter plot (the original referenced an undefined `labels`)
        features, labels = self.latent_features(data_loader, return_labels=True)

        # plot only the first 2 dimensions
        fig = plt.figure(figsize=(8, 6))
        plt.scatter(features[:, 0],
                    features[:, 1],
                    c=labels,
                    marker='o',
                    edgecolor='none',
                    cmap=plt.cm.get_cmap('jet', 10),
                    s=10)
        plt.colorbar()
        if save:
            fig.savefig('latent_space.png')
        return fig

    def random_generation(self, num_elements=1):
        """Random generation for each category

        Args:
            num_elements: (int) number of elements to generate

        Returns:
            generated data according to num_elements
        """
        # categories for each element
        arr = np.array([])
        for i in range(self.num_classes):
            arr = np.hstack([arr, np.ones(num_elements) * i])
        indices = arr.astype(int).tolist()

        categorical = F.one_hot(torch.tensor(indices),
                                self.num_classes).float()

        if self.cuda:
            categorical = categorical.cuda()

        # infer the gaussian distribution according to the category
        mean, var = self.network.generative.pzy(categorical)

        # gaussian random sample by using the mean and variance
        noise = torch.randn_like(var)
        std = torch.sqrt(var)
        gaussian = mean + noise * std

        # generate new samples with the given gaussian
        generated = self.network.generative.pxz(gaussian)

        return generated.cpu().detach().numpy()
def setUp(self):
    self.metrics = Metrics()
    self.tile = Tile()
def main(args):
    assert args.arch_gcn in ['firstchebnet'], '[ERROR] Architecture not implemented!'
    assert args.dataset == 'mit67', '[ERROR] Dataset not supported yet!'

    obj = args.storage + '/graph.bin'

    # load and preprocess dataset
    if not os.path.isfile(obj):
        print('Graph not found!')
        print('Creating graph...')
        gh = GNNHandler('dataset', args.pretrained_cnn)
        graph = gh.build_graph()
        gh.save_graph(obj, graph)
        obj = graph

    g, features, labels, train_mask, val_mask, test_mask, in_feats, n_classes = \
        GNNHandler.get_info_from_graph(obj)
    n_edges = g.number_of_edges()
    print("""----Data statistics------
      #Edges %d
      #Classes %d
      #Train samples %d
      #Val samples %d
      #Test samples %d""" %
          (n_edges, n_classes,
           train_mask.sum().item(),
           val_mask.sum().item(),
           test_mask.sum().item()))

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
        val_mask = val_mask.cuda()
        test_mask = test_mask.cuda()

    # graph preprocess and calculate normalization factor
    # add self loop
    # if args.self_loop:
    #     g.remove_edges_from(g.selfloop_edges())
    #     g.add_edges_from(zip(g.nodes(), g.nodes()))
    n_edges = g.number_of_edges()

    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    if args.arch_gcn == 'firstchebnet':
        model = FirstChebNet(g, in_feats, args.n_hidden, n_classes,
                             args.n_layers, F.relu, args.dropout)
    else:
        print('ARCHITECTURE NOT IMPLEMENTED! EXITING...')
        exit(1)

    if cuda:
        model.cuda()

    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.weight_decay)

    mtc = Metrics(features, labels, val_mask, backend='pytorch')

    # initialize graph
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        if epoch >= 3:
            t0 = time.time()

        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch >= 3:
            dur.append(time.time() - t0)

        acc = mtc.evaluate(model)
        print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | "
              "ETputs(KTEPS) {:.2f}".format(epoch, np.mean(dur), loss.item(),
                                            acc, n_edges / np.mean(dur) / 1000))

    print("Test Accuracy {:.2%}".format(mtc.evaluate(model)))
    mtc.save_metrics(args.save_path, model)
    torch.save(model.state_dict(), args.save_path + '/cp.pt')

    return mtc, model
def validate(localizer, adversarial, dataloader, experiment_directory, labels,
             segmentation_map_threshold, num_classes, evaluate=False,
             save_results=False):
    """ Loop over the validation set (in batches) to acquire relevant metrics """
    print('Validating...')
    if evaluate:
        metrics = Metrics(20)

    localizer_criterion = torch.nn.BCELoss()
    adversarial_criterion = torch.nn.BCELoss()
    localizer_loss_meter = AverageMeter()
    adversarial_loss_meter = AverageMeter()

    for i, (inputs, targets) in enumerate(dataloader):
        if evaluate:
            # Segmentation maps are included in the targets
            targets, segmentation_maps = targets
        else:
            segmentation_maps = None
        if torch.cuda.is_available():
            inputs, targets = inputs.cuda(), targets.cuda()

        output, gcams = localizer(inputs, labels=targets)
        loss = localizer_criterion(output, targets)
        localizer_loss_meter.update(loss.item())
        gcams, new_images, new_targets, original_targets = gcams

        if adversarial is not None or save_results:
            new_batch_size = gcams.size(0)
            masks = gcam_to_mask(gcams)
            masked_image = erase_mask(new_images, masks)

            if adversarial is not None:
                adversarial_output = adversarial(masked_image)
                adversarial_output = torch.sigmoid(adversarial_output)
                adversarial_loss = adversarial_criterion(adversarial_output,
                                                         original_targets)
                adversarial_loss_meter.update(adversarial_loss.item())

            if save_results:
                for k in range(new_batch_size):
                    number = f'{i * new_batch_size + k}'  # TODO: fix
                    label_string = labels[new_targets[k]]
                    file_postfix = f'{number}_{label_string}'
                    save_location = os.path.join(
                        experiment_directory, f'heatmap_{file_postfix}.png')
                    save_gradcam(filename=save_location,
                                 gcam=gcams[k, 0].detach(),
                                 raw_image=new_images[k].clone())
                    save_location = os.path.join(
                        experiment_directory, f'raw_heatmap_{file_postfix}.png')
                    save_gradcam(filename=save_location,
                                 gcam=gcams[k, 0].detach())
                    save_location = os.path.join(
                        experiment_directory, f'erased_{file_postfix}.png')
                    tensor2imwrite(save_location, denormalize(masked_image[k]))

        if evaluate:
            # Generate and visualize predicted segmentation map
            predicted_segmentation_maps = generate_segmentation_map(
                gcams, num_classes, segmentation_maps.shape[1:], new_targets,
                threshold=segmentation_map_threshold)
            metrics.update(predicted_segmentation_maps, segmentation_maps)
            if save_results:
                predicted_indices = predicted_segmentation_maps.unique()
                all_labels = ['background', *labels]
                predicted_labels = [
                    all_labels[idx] for idx in predicted_indices
                ]
                labels_string = '_'.join(predicted_labels)
                filename = f'map_{i:04d}_{labels_string}.png'
                save_location = os.path.join(experiment_directory, filename)
                save_segmentation_map(save_location,
                                      predicted_segmentation_maps,
                                      denormalize(new_images[k]).clone())
                filename = f'map_raw_{i:04d}_{labels_string}.png'
                save_location = os.path.join(experiment_directory, filename)
                save_segmentation_map(save_location,
                                      predicted_segmentation_maps)

    print('Validation localizer loss:', localizer_loss_meter.avg)
    print('Validation adversarial loss:', adversarial_loss_meter.avg)
    if evaluate:
        miou = metrics.miou().item()
        precision = metrics.precision(skip_background=True).item()
        recall = metrics.recall(skip_background=True).item()
        metrics.print_scores_per_class()
        print('mIoU:', miou)
        print('precision:', precision)
        print('recall:', recall)
def __init__(self, interface):
    Thread.__init__(self)
    self.stop_event = Event()
    self.interface = interface
    self.metrics = Metrics()
# Assumed imports: Flask/request are clearly used below; Connection is
# expected from the project's MongoDB driver (the register() API matches
# mongokit), which the original snippet imported elsewhere.
from flask import Flask, request

import json
import mongo_structure as mdb
from metrics_util import *
from metrics import Metrics
from alldayplay import AllDayPlay

# Create the application object
app = Flask(__name__)
app.config.from_pyfile('yr_metrics_api.cfg', silent=False)

# Connect to the database
db = Connection(app.config["MONGODB_HOST"], app.config["MONGODB_PORT"])

# Set up API methods
metrics = Metrics(request=request, database_connection=db)
adp = AllDayPlay(request=request, database_connection=db)

# Mongo Schema. These objects all live in the mongo_structure import.
db.register(mdb.RootDocument)
db.register(mdb.Event)
db.register(mdb.Count)

# Add an event to the logging table.
app.add_url_rule('/event/<func>', 'event_add_or_touch',
                 lambda func: metrics.addOrTouchEvent(func),
                 methods=["GET", "POST"])

# AllDayPlay Metrics.
app.add_url_rule('/adp/songs/played', 'adp_last_songs_played',
                 adp.lastSongsPlayed, methods=["GET"])
app.add_url_rule('/adp/songs/total', 'adp_total_songs_played',
                 adp.totalSongsPlayed, methods=["GET"])
app.add_url_rule('/adp/sessions/current', 'adp_current_num_sessions',
                 adp.currentNumberOfListeningSessions, methods=["GET"])
app.add_url_rule('/adp/sessions/bounced', 'adp_total_sessions_bounced',
                 adp.totalSessionsBounced, methods=["GET"])
def get_instantaneous_throughput(self):
    # Presumably meant to return the computed value; the original discarded it.
    return Metrics.instantaneousThroughput()
# Remove first row
targetByClass = np.delete(targetByClass, (0), axis=0)

test_mse = test_mse / float(len(predictions))

""" PLOT AND CALCULATE METRICS """
pos_len = len(base['testing']['data'][base['testing']['target'] == 1])
neg_len = len(base['testing']['data'][base['testing']['target'] == 0])
confusion_matrix_percentage = calc_confusion_matrix(vp, fp, fn, vn,
                                                    pos_len, neg_len)

# Confusion Matrix
Metrics.plot_confusion_matrix(confusion_matrix_percentage, configDir)

# MSE (Training and Validation)
Metrics.plot_mse_curve(np.array(error_train), np.array(error_valid), configDir)

# Area Under ROC Curve
roc_area = Metrics.plot_roc_curve(targetByClass, prob_predictions, configDir)

# accuracy
accuracy = ((len(base['testing']['data']) - errors_total) /
            len(base['testing']['data'])) * 100
print("accuracy:", accuracy, '%')
print('errors', errors_total, 'of', len(base['testing']['data']))

configDesc = {'opt_samp': opt_samp.name,
              'opt_learning': opt_learning,
              'activation_function_options': opt_actvfunc,
              'topology_options': opt_top}
#                  os.path.join('.', 'results_cifar10_pot_conv', 'checkpoints', 'trained-pot-1.meta'))
                 os.path.join(ckpt_dir, 'trained-pot-126480.meta'))
saver.restore(sess, os.path.join(ckpt_dir, 'trained-pot-126480'))
# saver.restore(sess, os.path.join('.', 'results_cifar10_pot_conv', 'checkpoints', 'trained-pot-1'))

noise_ph = tf.get_collection('noise_ph')[0]
bn_ph = tf.get_collection('is_training_ph')[0]
decoder = tf.get_collection('decoder')[0]

mean = np.zeros(z_dim)
cov = np.identity(z_dim)
noise = pz_std * np.random.multivariate_normal(
    mean, cov, 16 * num_cols).astype(np.float32)

# 1. Random samples
res = sess.run(decoder, feed_dict={noise_ph: noise, bn_ph: False})
metrics = Metrics()
opts = {}
opts['dataset'] = dataset
opts['input_normalize_sym'] = normalyze
opts['work_dir'] = output_dir
metrics.make_plots(opts, 0, None, res, prefix='samples')

# #2. Interpolations
# ids = np.random.choice(16 * num_cols, num_pairs, replace=False)
# for i in range(len(ids)):
#     for j in range(i + 1, len(ids)):
#         id1, id2 = ids[i], ids[j]
#         a = np.reshape(noise[id1, :], (1, z_dim))
#         b = np.reshape(noise[id2, :], (1, z_dim))
#         _lambda = np.linspace(0., 1., 60)
#         _lambda = np.reshape(_lambda, (60, 1))
    model.add(LSTM(units=rnn_units, activation=rnn_activation))
elif layer == "GRU":
    model.add(GRU(units=rnn_units, activation=rnn_activation))
else:
    print("ERROR: Invalid layer", layer)
    exit(1)

model.add(Dense(y.shape[1], activation=dense_activation))
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
print(model.summary())

# Filename: [layer]_1-layer_A-timesteps_B-[activation]-units_[optimizer]_[loss]_D-batch-size_[epoch]_[loss]_[accuracy]_[vLoss]_[vAccuracy].hdf5
filename = weights_dir + "/" + layer + "_1-layer_" + str(timesteps) + "-timesteps_" + \
    str(rnn_units) + "-" + rnn_activation + "-units_" + \
    optimizer + "_" + loss + "_" + str(batch_size) + \
    "-batch-size_{epoch:03d}_{loss:.4f}_{acc:.4f}_{val_loss:.4f}_{val_acc:.4f}.hdf5"

callback_metric = Metrics(trainX, trainY)
checkpoint = ModelCheckpoint(filename, monitor='loss', verbose=1,
                             save_best_only=True, mode='min')
checkpoint = [checkpoint, callback_metric]
if "dataset1" in dataset:
    checkpoint = [callback_metric]

history = model.fit(x=trainX, y=trainY, batch_size=batch_size, epochs=epochs,
                    verbose=verbosity,
                    validation_data=(validationX, validationY),
def get_average_throughput(self):
    # Presumably meant to return the computed value; the original discarded it.
    return Metrics.average_throughput()
def main():
    global args
    args = parse_args()

    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    print(args)

    train_dir = glob.glob(os.path.join(args.data, 'train/holistic/*.pt'))
    dev_dir = glob.glob(os.path.join(args.data, 'val/holistic/*.pt'))
    test_dir = glob.glob(os.path.join(args.data, 'test/holistic/*.pt'))

    train_dataset = Dataset(os.path.join(args.data, 'train'), train_dir)
    dev_dataset = Dataset(os.path.join(args.data, 'val'), dev_dir)
    test_dataset = Dataset(os.path.join(args.data, 'test'), test_dir)

    print('==> Size of train data : %d ' % len(train_dataset))
    print('==> Size of val data   : %d ' % len(dev_dataset))
    print('==> Size of test data  : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    if args.pretrained_model == 'vgg16':
        pretrained_vgg16 = models.vgg16(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_vgg16.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_vgg16.DocClassificationHolistic(
                args, pretrained_vgg16)
        elif args.pretrained_holistic == 1:
            pretrained_orig_vgg16 = model_vgg16.DocClassificationHolistic(
                args, pretrained_vgg16)
            pretrained_holistic = model_vgg16.DocClassificationHolistic(
                args, pretrained_orig_vgg16.pretrained_model)
            checkpoint = torch.load('./checkpoints/vgg16.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_vgg16.DocClassificationRest(args,
                                                      pretrained_orig_vgg16,
                                                      pretrained_holistic)

    elif args.pretrained_model == 'vgg19':
        pretrained_vgg19 = models.vgg19(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_vgg19.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_vgg19.DocClassificationHolistic(
                args, pretrained_vgg19)
        elif args.pretrained_holistic == 1:
            pretrained_orig_vgg19 = model_vgg19.DocClassificationHolistic(
                args, pretrained_vgg19)
            pretrained_holistic = model_vgg19.DocClassificationHolistic(
                args, pretrained_orig_vgg19.pretrained_model)
            checkpoint = torch.load('./checkpoints/vgg19.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_vgg19.DocClassificationRest(args,
                                                      pretrained_orig_vgg19,
                                                      pretrained_holistic)

    elif args.pretrained_model == 'resnet50':
        pretrained_resnet50 = models.resnet50(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_resnet50.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_resnet50.DocClassificationHolistic(
                args, pretrained_resnet50)
        elif args.pretrained_holistic == 1:
            pretrained_orig_resnet50 = model_resnet50.DocClassificationHolistic(
                args, pretrained_resnet50)
            pretrained_holistic = model_resnet50.DocClassificationHolistic(
                args, pretrained_orig_resnet50.pretrained_model)
            checkpoint = torch.load('./checkpoints/resnet50.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_resnet50.DocClassificationRest(
                args, pretrained_orig_resnet50, pretrained_holistic)

    elif args.pretrained_model == 'densenet121':
        pretrained_densenet121 = models.densenet121(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_densenet121.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_densenet121.DocClassificationHolistic(
                args, pretrained_densenet121)
        elif args.pretrained_holistic == 1:
            pretrained_orig_densenet121 = model_densenet121.DocClassificationHolistic(
                args, pretrained_densenet121)
            pretrained_holistic = model_densenet121.DocClassificationHolistic(
                args, pretrained_orig_densenet121.pretrained_model)
            checkpoint = torch.load('./checkpoints/densenet121.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_densenet121.DocClassificationRest(
                args, pretrained_orig_densenet121, pretrained_holistic)

    elif args.pretrained_model == 'inceptionv3':
        pretrained_inceptionv3 = models.inception_v3(pretrained=True)

        # Freeze training for all layers
        for child in pretrained_inceptionv3.children():
            for param in child.parameters():
                param.requires_grad = False

        if args.pretrained_holistic == 0:
            model = model_inceptionv3.DocClassificationHolistic(
                args, pretrained_inceptionv3)
        elif args.pretrained_holistic == 1:
            pretrained_orig_inceptionv3 = model_inceptionv3.DocClassificationHolistic(
                args, pretrained_inceptionv3)
            pretrained_holistic = model_inceptionv3.DocClassificationHolistic(
                args, pretrained_orig_inceptionv3.pretrained_model)
            checkpoint = torch.load('./checkpoints/inceptionv3.pt')
            pretrained_holistic.load_state_dict(checkpoint['model'])

            model = model_inceptionv3.DocClassificationRest(
                args, pretrained_orig_inceptionv3, pretrained_holistic)

    criterion = nn.CrossEntropyLoss(reduction='sum')
    parameters = filter(lambda p: p.requires_grad, model.parameters())

    if args.cuda:
        model.cuda(), criterion.cuda()

    if args.optim == 'adam':
        optimizer = optim.Adam(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(parameters, lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adadelta':
        optimizer = optim.Adadelta(parameters, lr=args.lr, weight_decay=args.wd)

    metrics = Metrics(args.num_classes)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    train_idx = list(np.arange(len(train_dataset)))
    dev_idx = list(np.arange(len(dev_dataset)))
    test_idx = list(np.arange(len(test_dataset)))

    best = float('inf')
    columns = ['ExpName', 'ExpNo', 'Epoch', 'Loss', 'Accuracy']
    results = []
    early_stop_count = 0

    for epoch in range(args.epochs):
        train_loss = 0.0
        dev_loss = 0.0
        test_loss = 0.0

        train_predictions = []
        train_labels = []

        dev_predictions = []
        dev_labels = []

        test_predictions = []
        test_labels = []

        random.shuffle(train_idx)
        random.shuffle(dev_idx)
        random.shuffle(test_idx)

        batch_train_data = [
            train_idx[i:i + args.batchsize]
            for i in range(0, len(train_idx), args.batchsize)
        ]
        batch_dev_data = [
            dev_idx[i:i + args.batchsize]
            for i in range(0, len(dev_idx), args.batchsize)
        ]
        batch_test_data = [
            test_idx[i:i + args.batchsize]
            for i in range(0, len(test_idx), args.batchsize)
        ]

        for batch in tqdm(batch_train_data, desc='Training batches..'):
            train_batch_holistic, \
                train_batch_header, \
                train_batch_footer, \
                train_batch_left_body, \
                train_batch_right_body, \
                train_batch_labels = train_dataset[batch]

            if args.pretrained_holistic == 0:
                _ = trainer.train_holistic(train_batch_holistic,
                                           train_batch_labels)
            elif args.pretrained_holistic == 1:
                _ = trainer.train_rest(train_batch_holistic,
                                       train_batch_header,
                                       train_batch_footer,
                                       train_batch_left_body,
                                       train_batch_right_body,
                                       train_batch_labels)

        for batch in tqdm(batch_train_data, desc='Training batches..'):
            train_batch_holistic, \
                train_batch_header, \
                train_batch_footer, \
                train_batch_left_body, \
                train_batch_right_body, \
                train_batch_labels = train_dataset[batch]

            if args.pretrained_holistic == 0:
                train_batch_loss, train_batch_predictions, train_batch_labels = \
                    trainer.test_holistic(train_batch_holistic,
                                          train_batch_labels)
            elif args.pretrained_holistic == 1:
                train_batch_loss, train_batch_predictions, train_batch_labels = \
                    trainer.test_rest(train_batch_holistic,
                                      train_batch_header,
                                      train_batch_footer,
                                      train_batch_left_body,
                                      train_batch_right_body,
                                      train_batch_labels)

            train_predictions.append(train_batch_predictions)
            train_labels.append(train_batch_labels)
            train_loss = train_loss + train_batch_loss

        train_accuracy = metrics.accuracy(np.concatenate(train_predictions),
                                          np.concatenate(train_labels))

        for batch in tqdm(batch_dev_data, desc='Dev batches..'):
            dev_batch_holistic, \
                dev_batch_header, \
                dev_batch_footer, \
                dev_batch_left_body, \
                dev_batch_right_body, \
                dev_batch_labels = dev_dataset[batch]

            if args.pretrained_holistic == 0:
                dev_batch_loss, dev_batch_predictions, dev_batch_labels = \
                    trainer.test_holistic(dev_batch_holistic, dev_batch_labels)
            elif args.pretrained_holistic == 1:
                dev_batch_loss, dev_batch_predictions, dev_batch_labels = \
                    trainer.test_rest(dev_batch_holistic,
                                      dev_batch_header,
                                      dev_batch_footer,
                                      dev_batch_left_body,
                                      dev_batch_right_body,
                                      dev_batch_labels)

            dev_predictions.append(dev_batch_predictions)
            dev_labels.append(dev_batch_labels)
            dev_loss = dev_loss + dev_batch_loss

        dev_accuracy = metrics.accuracy(np.concatenate(dev_predictions),
                                        np.concatenate(dev_labels))

        for batch in tqdm(batch_test_data, desc='Test batches..'):
            test_batch_holistic, \
                test_batch_header, \
                test_batch_footer, \
                test_batch_left_body, \
                test_batch_right_body, \
                test_batch_labels = test_dataset[batch]

            if args.pretrained_holistic == 0:
                test_batch_loss, test_batch_predictions, test_batch_labels = \
                    trainer.test_holistic(test_batch_holistic,
                                          test_batch_labels)
            elif args.pretrained_holistic == 1:
                test_batch_loss, test_batch_predictions, test_batch_labels = \
                    trainer.test_rest(test_batch_holistic,
                                      test_batch_header,
                                      test_batch_footer,
                                      test_batch_left_body,
                                      test_batch_right_body,
                                      test_batch_labels)

            test_predictions.append(test_batch_predictions)
            test_labels.append(test_batch_labels)
            test_loss = test_loss + test_batch_loss

        test_accuracy = metrics.accuracy(np.concatenate(test_predictions),
                                         np.concatenate(test_labels))

        print('==> Training Epoch: %d, \nLoss: %f, \nAccuracy: %f'
              % (epoch + 1,
                 train_loss / (len(batch_train_data) * args.batchsize),
                 train_accuracy))

        print('==> Dev Epoch: %d, \nLoss: %f, \nAccuracy: %f'
              % (epoch + 1,
                 dev_loss / (len(batch_dev_data) * args.batchsize),
                 dev_accuracy))

        print('==> Test Epoch: %d, \nLoss: %f, \nAccuracy: %f'
              % (epoch + 1,
                 test_loss / (len(batch_test_data) * args.batchsize),
                 test_accuracy))
        #quit()

        results.append((args.expname,
                        args.expno,
                        epoch + 1,
                        test_loss / (len(batch_test_data) * args.batchsize),
                        test_accuracy))

        if best > test_loss:
            best = test_loss
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'loss': test_loss,
                'accuracy': test_accuracy,
                'args': args,
                'epoch': epoch
            }
            print('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint,
                       '%s.pt' % os.path.join(args.save, args.expname))
            #np.savetxt("test_pred.csv", test_pred.numpy(), delimiter=",")
        else:
            early_stop_count = early_stop_count + 1
            if early_stop_count == 20:
                quit()
def main():
    global args
    args = parse_args()

    # global logger
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s:%(name)s:%(message)s")

    # file logger
    fh = logging.FileHandler(os.path.join(args.save, args.expname) + '.log',
                             mode='w')
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # console logger
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    # argument validation
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        logger.error('Sparsity and weight decay are incompatible, pick one!')
        exit()
    logger.debug(args)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_b = [os.path.join(split, 'b.toks')
                         for split in [train_dir, dev_dir, test_dir]]
        token_files_a = [os.path.join(split, 'a.toks')
                         for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        sick_vocab_file = os.path.join(args.data, 'sick.vocab')
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[Constants.PAD_WORD, Constants.UNK_WORD,
                        Constants.BOS_WORD, Constants.EOS_WORD])
    logger.debug('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    logger.debug('==> Size of train data : %d ' % len(train_dataset))

    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    logger.debug('==> Size of dev data   : %d ' % len(dev_dataset))

    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    logger.debug('==> Size of test data  : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(
        vocab.size(),
        args.input_dim,
        args.mem_dim,
        args.hidden_dim,
        args.num_classes,
        args.sparse,
        args.freeze_embed)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()

    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad,
                                         model.parameters()),
                                  lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad,
                                     model.parameters()),
                              lr=args.lr, weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        logger.debug('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words
        # if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD,
                                    Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.emb.weight.data.copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        logger.info('==> Epoch {}, Train \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        logger.info('==> Epoch {}, Dev \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        logger.info('==> Epoch {}, Test \tLoss: {}\tPearson: {}\tMSE: {}'.format(
            epoch, test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson,
                'mse': test_mse,
                'args': args,
                'epoch': epoch
            }
            logger.debug('==> New optimum found, checkpointing everything now...')
            torch.save(checkpoint,
                       '%s.pt' % os.path.join(args.save, args.expname))
def get_metrics(self):
    """Return the metrics instance of this node."""
    if self.__metrics is None:
        self.__metrics = Metrics(self)
    return self.__metrics
class MyTestCase(unittest.TestCase): def setUp(self): self.labels_pred = { 0: 1, 1: 0, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 0, 8: 0, 9: 0, 10: 0, 11: 2, 12: 1, 13: 2, 14: 1, 15: 2, 16: 2} self.labels_true = { 0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 1, 7: 1, 8: 1, 9: 1, 10: 1, 11: 1, 12: 2, 13: 2, 14: 2, 15: 2, 16: 2} self._n = len(self.labels_pred) self.metrics = Metrics(self.labels_true, self.labels_pred) def test_pairwise_precision_recall_f1(self): precision, recall, f1 = self.metrics._pairwise_precision_recall_f1() self.assertEqual(precision, 20.0/44) self.assertEqual(recall, 20.0/40) self.assertEqual(f1, 10.0/21) def test_cluster_precision_recall_f1(self): precision, recall, f1 = self.metrics._cluster_precision_recall_f1() self.assertEqual(precision, 0) self.assertEqual(recall, 0) self.assertEqual(f1, 0) def test_closest_cluster_precision_recall_f1(self): precision, recall, f1 = self.metrics._closest_cluster_precision_recall_f1() x = (4.0/7.0 + 5.0/9.0 + 3.0/6.0)/3.0 self.assertEqual(precision, x) self.assertEqual(recall, x) self.assertEqual(f1, 2*x*x/(x+x)) def test_average_author_cluster_purity(self): aap = 149.0/255 acp = 193.0/340 average_author_purity, average_cluster_purity, k = self.metrics._average_author_cluster_purity() self.assertEqual(average_author_purity, aap) self.assertEqual(average_cluster_purity, acp) self.assertEqual(k, (aap*acp)**0.5) def test_homogeneity_completeness_vmeasure(self): labels_true, labels_pred = _linearize(self.labels_true, self.labels_pred) sk_homogeneity, sk_completeness, sk_vmeasure = skmetrics.homogeneity_completeness_v_measure(labels_true, labels_pred) homogeneity, completeness, vmeasure = self.metrics._homogeneity_completeness_vmeasure(1) self.assertEqual(homogeneity, sk_homogeneity) self.assertEqual(completeness, sk_completeness) self.assertEqual(sk_vmeasure, vmeasure) def test_cluster(self): clusters = frozenset({frozenset({0, 1, 2, 3, 4, 5}), frozenset({6, 7, 8, 9, 10, 11}), frozenset({12, 13, 14, 15, 16})}) self.assertSetEqual(_cluster(self.labels_true), clusters) def test_intersection_size(self): self.assertEqual(_intersection_size(_cluster(self.labels_true), _cluster(self.labels_pred)), 20) def test_number_pairs(self): self.assertEqual(_number_pairs(_cluster(self.labels_true)), 40) def test_jaccard(self): set1 = {1, 2, 3} set2 = {3, 4, 5} set3 = {6} self.assertEqual(_jaccard(set1, set2), 1.0/5) self.assertEqual(_jaccard(set1, set3), 0) def test_global_merge_distance(self): """ This tests GMD using the relationship to other properties specified in the original paper """ fs = lambda x, y: x*y fm = lambda x, y: 0 independent = frozenset({frozenset({0}), frozenset({1}), frozenset({2}), frozenset({3}), frozenset({4}), frozenset({5}), frozenset({6}), frozenset({7}), frozenset({8}), frozenset({9}), frozenset({10}), frozenset({11}), frozenset({12}), frozenset({13}), frozenset({14}), frozenset({15}), frozenset({16})}) gmd_pairwise_precision = 1 - self.metrics.global_merge_distance(fs, fm)/self.metrics.global_merge_distance(fs, fm, S=independent) fs = lambda x, y: 0 fm = lambda x, y: x*y gmd_pairwise_recall = 1 - self.metrics.global_merge_distance(fs, fm)/self.metrics.global_merge_distance(fs, fm, R=independent) pairwise_precision, pairwise_recall, f1 = self.metrics._pairwise_precision_recall_f1() self.assertAlmostEqual(gmd_pairwise_precision, pairwise_precision) self.assertAlmostEqual(gmd_pairwise_recall, pairwise_recall) def test_mutual_information(self): labels_true, labels_pred = _linearize(self.labels_true, self.labels_pred) mi = 
self.metrics._mutual_information(self.metrics._clusters_pred, self.metrics._clusters_true) self.assertAlmostEqual(mi, skmetrics.mutual_info_score(labels_true, labels_pred)) def test_variation_of_information(self): vi = self.metrics._variation_of_information() h = lambda x: float(x)/self._n*math.log(float(x)/self._n) fs = lambda x, y: h(x+y) - h(x) - h(y) fm = fs gmd_vi = self.metrics.global_merge_distance(fs, fm) self.assertEqual(vi, gmd_vi) def test_purity(self): purity = self.metrics._purity() self.assertEqual(purity, 12.0/17) def test_entity_sizes(self): sizes_true = np.array([[5, 1], [6, 2]]) sizes_pred = np.array([[4, 1], [5, 1], [8, 1]]) self.assertTrue(np.array_equal(sizes_true, self.metrics._entity_sizes_true)) self.assertTrue(np.array_equal(sizes_pred, self.metrics._entity_sizes_pred)) def test_plot(self): self.metrics.display()
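The expected values in test_pairwise_precision_recall_f1 (20/44, 20/40, 10/21) follow from counting co-clustered item pairs. A minimal sketch of that computation, assuming the same item-to-cluster-id dict format used in setUp (an illustration, not the implementation under test):

from itertools import combinations

def pairwise_precision_recall_f1(labels_true, labels_pred):
    items = list(labels_true)
    # pairs of items placed in the same cluster, per labeling
    same_true = {p for p in combinations(items, 2)
                 if labels_true[p[0]] == labels_true[p[1]]}
    same_pred = {p for p in combinations(items, 2)
                 if labels_pred[p[0]] == labels_pred[p[1]]}
    tp = len(same_true & same_pred)
    precision = tp / len(same_pred)   # 20/44 for the fixture above
    recall = tp / len(same_true)      # 20/40
    f1 = 2 * precision * recall / (precision + recall)  # 10/21
    return precision, recall, f1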
class CTTTrainer(TensorboardMixin, WandBMixin, IOMixin, BaseExperiment): WANDB_PROJECT = "ctt" def __init__(self): super(CTTTrainer, self).__init__() self.auto_setup() self._build() def _build(self): self._build_loaders() self._build_model() self._build_criteria_and_optim() self._build_scheduler() def _build_model(self): self.model: nn.Module = to_device( ContactTracingTransformer(**self.get("model/kwargs", {})), self.device) def _build_loaders(self): train_path = self.get("data/paths/train", ensure_exists=True) validate_path = self.get("data/paths/validate", ensure_exists=True) self.train_loader = get_dataloader(path=train_path, **self.get("data/loader_kwargs", ensure_exists=True)) self.validate_loader = get_dataloader(path=validate_path, **self.get("data/loader_kwargs", ensure_exists=True)) def _build_criteria_and_optim(self): # noinspection PyArgumentList self.loss = WeightedSum.from_config( self.get("losses", ensure_exists=True)) optim_cls = getattr(opts, self.get("optim/name", "Adam")) self.optim = optim_cls(self.model.parameters(), **self.get("optim/kwargs")) self.metrics = Metrics() def _build_scheduler(self): # Set up an epoch-wise scheduler here if you want to, but the # recommendation is to use the one defined in opts. self.scheduler = None @property def device(self): return self.get("device", "cuda" if torch.cuda.is_available() else "cpu") @register_default_dispatch def train(self): if self.get("wandb/use", True): self.initialize_wandb() for epoch in self.progress(range( self.get("training/num_epochs", ensure_exists=True)), tag="epochs"): self.train_epoch() validation_stats = self.validate_epoch() self.checkpoint() self.log_progress("epochs", **validation_stats) self.step_scheduler(epoch) self.next_epoch() def train_epoch(self): self.clear_moving_averages() self.model.train() for model_input in self.progress(self.train_loader, tag="train"): # Evaluate model model_input = to_device(model_input, self.device) model_output = Dict(self.model(model_input)) # Compute loss losses = self.loss(model_input, model_output) loss = losses.loss self.optim.zero_grad() loss.backward() self.optim.step() # Log to wandb (if required) self.log_training_losses(losses) self.log_learning_rates() # Log to pbar self.accumulate_in_cache("moving_loss", loss.item(), momentum_accumulator(0.9)) self.log_progress( "train", loss=self.read_from_cache("moving_loss"), ) self.next_step() def validate_epoch(self): all_losses_and_metrics = defaultdict(list) self.metrics.reset() self.model.eval() for model_input in self.progress(self.validate_loader, tag="validation"): with torch.no_grad(): model_input = to_device(model_input, self.device) model_output = Dict(self.model(model_input)) losses = self.loss(model_input, model_output) self.metrics.update(model_input, model_output) all_losses_and_metrics["loss"].append(losses.loss.item()) for key in losses.unweighted_losses: all_losses_and_metrics[key].append( losses.unweighted_losses[key].item()) # Compute mean for all losses all_losses_and_metrics = Dict( {key: np.mean(val) for key, val in all_losses_and_metrics.items()}) all_losses_and_metrics.update(Dict(self.metrics.evaluate())) self.log_validation_losses_and_metrics(all_losses_and_metrics) # Store the validation loss in cache. This will be used for checkpointing. 
        self.write_to_cache("current_validation_metrics",
                            all_losses_and_metrics)
        # Also store the scalar validation loss under the key that
        # checkpoint_if_best_validation_loss reads; without this the
        # best-model checkpoint below would never trigger.
        self.write_to_cache("current_validation_loss",
                            all_losses_and_metrics.loss)
        return all_losses_and_metrics

    def log_training_losses(self, losses):
        if self.log_wandb_now and self.get("wandb/use", False):
            metrics = Dict({"training_loss": losses.loss})
            metrics.update({
                f"training_{k}": v
                for k, v in losses.unweighted_losses.items()
            })
            self.wandb_log(**metrics)
        if self.log_scalars_now:
            for key, value in losses.unweighted_losses.items():
                self.log_scalar(f"training/{key}", value)
        return self

    def checkpoint(self, force=False):
        # Checkpoint as required
        if force or self.epoch % self.get("training/checkpoint/every", 1) == 0:
            info_dict = {
                "model": self.model.state_dict(),
                "optim": self.optim.state_dict(),
            }
            torch.save(info_dict, self.checkpoint_path)
        if self.get("training/checkpoint/if_best", True):
            # Save a checkpoint if the validation loss is better than best
            self.checkpoint_if_best_validation_loss()
        return self

    def checkpoint_if_best_validation_loss(self):
        current_validation_loss = self.read_from_cache(
            "current_validation_loss", float("inf"))
        best_validation_loss = self.read_from_cache("best_validation_loss",
                                                    float("inf"))
        if current_validation_loss < best_validation_loss:
            self.write_to_cache("best_validation_loss",
                                current_validation_loss)
            ckpt_path = os.path.join(self.checkpoint_directory, "best.ckpt")
        else:
            ckpt_path = None
        if ckpt_path is not None:
            info_dict = {
                "model": self.model.state_dict(),
                "optim": self.optim.state_dict(),
            }
            torch.save(info_dict, ckpt_path)
        return self

    def load(self, device=None):
        ckpt_path = os.path.join(self.checkpoint_directory, "best.ckpt")
        if not os.path.exists(ckpt_path):
            raise FileNotFoundError(ckpt_path)
        info_dict = torch.load(
            ckpt_path,
            map_location=torch.device(
                self.device if device is None else device),
        )
        self.model.load_state_dict(info_dict["model"])
        self.optim.load_state_dict(info_dict["optim"])
        return self

    def log_validation_losses_and_metrics(self, losses):
        if self.get("wandb/use", False):
            metrics = {f"validation_{k}": v for k, v in losses.items()}
            self.wandb_log(**metrics)
        for key, value in losses.items():
            self.log_scalar(f"validation/{key}", value)
        return self

    def clear_moving_averages(self):
        return self.clear_in_cache("moving_loss")

    def step_scheduler(self, epoch):
        if self.scheduler is not None:
            self.scheduler.step(epoch)
        return self

    def log_learning_rates(self):
        lrs = {
            f"lr_{i}": param_group["lr"]
            for i, param_group in enumerate(self.optim.param_groups)
        }
        if self.get("wandb/use", False):
            self.wandb_log(**lrs)
        for key, value in lrs.items():
            self.log_scalar(f"training/{key}", value)
        return self
# Load data
u = Utils()
train_facile = u.load_matrix('data/data_train_facile.mat')

# generate pairs
pairs_idx, pairs_label = u.generate_pairs(train_facile['label'], 1000, 0.1)
newX, newY = u.select_pairs_data(pairs_idx, train_facile['X'], train_facile['label'], c=700)
feat_idx = u._feat_idx

# test gradient
g = Gradient()
M_ini = g.generate_I(newX.shape[1])
M = g.sgd_metric_learning(newX, newY, 0.002, 50000, 0, M_ini)

# Calculate distance
m = Metrics()
X = u.select_features(train_facile['X'], feat_idx)
X -= X.mean(axis=0)
X /= X.std(axis=0)
X[np.isnan(X)] = 0.
dist = m.mahalanobis_dist(X, pairs_idx, M)
#dist[np.isnan(dist)] = 50.

## Evaluate model
e = Evaluate()
e.evaluation(pairs_label, dist)

## display results
e.display_roc()
e.easy_score()

# Evaluate test dataset and save it
test_facile = u.load_matrix('data/data_test_facile.mat')
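For context, the m.mahalanobis_dist(X, pairs_idx, M) call above amounts to one quadratic form per pair. A minimal sketch, assuming pairs_idx is an (n, 2) integer array and M is the learned matrix (an illustration, not the library's implementation):

import numpy as np

def mahalanobis_dist(X, pairs_idx, M):
    diff = X[pairs_idx[:, 0]] - X[pairs_idx[:, 1]]        # per-pair difference
    # sqrt(diff_i^T M diff_i) for every pair i
    return np.sqrt(np.einsum('ij,jk,ik->i', diff, M, diff))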
class Metric_Tests(unittest.TestCase): def setUp(self): self.metrics = Metrics() self.tile = Tile() def test_evaluations_exist(self): self.assertNotEqual(Metrics, None) # When the tile is one step away def test_displaced_tiles_yields_one(self): tile = Mover.move_left(Tile.duplicate(self.tile)) self.assertEqual(self.metrics.displaced(tile), 1) # When the tile is two steps away def test_displaced_tiles_yields_two(self): tile = Mover.move_left(Tile.duplicate(self.tile)) tile = Mover.move_up(tile) self.assertEqual(self.metrics.displaced(tile), 2) # When no corners are solved def test_corner_yields_two(self): results = Tile() results.layout = [[2, 5, 6], [1, 7, 8], [3, 4, 0]] self.assertEqual(self.metrics.subset(results), 2) # When upper left corner is solved def test_corner_yields_one(self): results = Tile() results.layout = [[1, 2, 3], [8, 5, 6], [7, 4, 0]] self.assertEqual(self.metrics.subset(results), 1) # When bottom right corner is solved def test_corner_yields_one_again(self): results = Tile() results.layout = [[2, 0, 3], [8, 1, 4], [7, 6, 5]] self.assertEqual(self.metrics.subset(results), 1) # When in the goal state def test_corner_yields_zero_again(self): results = Tile() self.assertEqual(self.metrics.subset(results), 0) # When one tile is out of place def test_manhattan_distance_yields_one(self): tile = Mover.move_down(Tile()) self.assertEqual(self.metrics.manhattan(tile), 1) def test_manhattan_distance_yields_zero(self): self.assertEqual(self.metrics.manhattan(Tile()), 0) # When 14 tiles are out of place def test_manhattan_distance_yields_fourteen(self): results = Tile() results.layout = [ [2, 4, 0], [8, 6, 7], [5, 1, 3], ] self.assertEqual(self.metrics.manhattan(results), 14) # When two tiles are out of place def test_manhattan_distance_yields_two(self): results = Tile() results.layout = [ [1, 3, 0], [8, 2, 4], [7, 6, 5], ] self.assertEqual(self.metrics.manhattan(results), 2)
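The expected distances in these tests imply a spiral goal layout [[1, 2, 3], [8, 0, 4], [7, 6, 5]] with 0 as the blank. A minimal sketch of a Manhattan heuristic consistent with them (an inference from the test values, not the actual Metrics.manhattan source):

GOAL = [[1, 2, 3], [8, 0, 4], [7, 6, 5]]

def manhattan(layout, goal=GOAL):
    # goal position of every tile, keyed by tile value
    pos = {tile: (r, c) for r, row in enumerate(goal)
           for c, tile in enumerate(row)}
    # sum |dr| + |dc| over all non-blank tiles
    return sum(abs(r - pos[tile][0]) + abs(c - pos[tile][1])
               for r, row in enumerate(layout)
               for c, tile in enumerate(row) if tile != 0)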
class TestMetrics(MockTestCase): def setUp(self): self.metrics = Metrics() self.metrics.factory = self.mock() self.fileMetrics = self.mock() self.metrics.factory.expects(once()).create().will(return_value(self.fileMetrics)) def node(self, name="test.cpp"): newNode = self.mock() newNode.expects(at_least_once()).file().will(return_value(name)) return newNode def testAddFile(self): node = self.node() self.fileMetrics.expects(once()).addNode(same(node)) self.metrics.addFile(node) assert(self.metrics.file("test.cpp") is self.fileMetrics) self.assertEqual(len(self.metrics.files()), 1) assert(self.metrics.files()[0] is self.fileMetrics) def testAddSameFileTwice(self): node = self.node() self.fileMetrics.expects(once()).addNode(same(node)) self.fileMetrics.expects(once()).addNode(same(node)) self.metrics.addFile(node) self.metrics.addFile(node) assert(self.metrics.file("test.cpp") is self.fileMetrics) self.assertEqual(len(self.metrics.files()), 1) def testTwoDifferentFiles(self): node1 = self.node("test1.h") node2 = self.node("test2.h") self.fileMetrics.expects(once()).addNode(same(node1)) self.fileMetrics.expects(once()).addNode(same(node2)) self.metrics.factory.expects(once()).create().will(return_value(self.fileMetrics)) self.metrics.addFile(node1) self.metrics.addFile(node2) assert(self.metrics.file("test1.h") is self.fileMetrics) assert(self.metrics.file("test2.h") is self.fileMetrics) self.assertEqual(len(self.metrics.files()), 2)
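The mock expectations above pin down the behaviour under test: one file-metrics object per distinct file name, created via the injected factory and reused on repeated adds. A minimal sketch of a class that would satisfy them (an inference from the tests, not the actual source):

class Metrics:
    def __init__(self):
        self.factory = None      # injected by the tests
        self._by_file = {}       # file name -> file-metrics object

    def addFile(self, node):
        name = node.file()
        if name not in self._by_file:
            self._by_file[name] = self.factory.create()
        self._by_file[name].addNode(node)

    def file(self, name):
        return self._by_file[name]

    def files(self):
        return list(self._by_file.values())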
def main(argv): # Allow running multiple at once set_gpu_memory(FLAGS.gpumem) # Figure out the log and model directory filenames assert FLAGS.uid != "", "uid cannot be an empty string" model_dir, log_dir = get_directory_names() if not os.path.exists(model_dir): os.makedirs(model_dir) if not os.path.exists(log_dir): os.makedirs(log_dir) # Write config file about what dataset we're using, sources, target, etc. file_utils.write_config_from_args(log_dir) # Load datasets source_datasets, target_dataset = load_datasets.load_da(FLAGS.dataset, FLAGS.sources, FLAGS.target, test=FLAGS.test) # for x in source_datasets: # print (x) # source_train_iterators = [iter(x.train) for x in source_datasets] # print (len(source_train_iterators)) # for x in source_train_iterators: # a = next(x) # print (a) # data_sources = [next(x) for x in source_train_iterators] # data_sources = [next(x) for x in source_train_iterators] # data_sources = [next(x) for x in source_train_iterators] # Need to know which iteration for learning rate schedule global_step = tf.Variable(0, name="global_step", trainable=False) # Load the method, model, etc. method = methods.get_method(FLAGS.method, source_datasets=source_datasets, target_dataset=target_dataset, model_name=FLAGS.model, global_step=global_step, total_steps=FLAGS.steps, ensemble_size=FLAGS.ensemble, moving_average=FLAGS.moving_average, share_most_weights=FLAGS.share_most_weights) # Check that this method is supposed to be trainable. If not, we're done. # (Basically, we just wanted to write the config file for non-trainable # models.) if not method.trainable: print("Method not trainable. Exiting now.") return # Checkpoints checkpoint = tf.train.Checkpoint( global_step=global_step, **method.checkpoint_variables) checkpoint_manager = CheckpointManager(checkpoint, model_dir, log_dir) checkpoint_manager.restore_latest() # Metrics has_target_domain = target_dataset is not None metrics = Metrics(log_dir, method, source_datasets, target_dataset, has_target_domain) # Start training # # TODO maybe eventually rewrite this in the more-standard Keras way # See: https://www.tensorflow.org/guide/keras/train_and_evaluate for i in range(int(global_step), FLAGS.steps+1): t = time.time() data_sources, data_target = method.train_step() global_step.assign_add(1) t = time.time() - t if FLAGS.time_training: print(int(global_step), t, sep=",") continue # skip evaluation, checkpointing, etc. when timing if i%1000 == 0: print("step %d took %f seconds"%(int(global_step), t)) sys.stdout.flush() # otherwise waits till the end to flush on Kamiak # Metrics on training/validation data if FLAGS.log_train_steps != 0 and i%FLAGS.log_train_steps == 0: metrics.train(data_sources, data_target, global_step, t) # Evaluate every log_val_steps but also at the last step validation_accuracy_source = None validation_accuracy_target = None if (FLAGS.log_val_steps != 0 and i%FLAGS.log_val_steps == 0) \ or i == FLAGS.steps: validation_accuracy_source, validation_accuracy_target \ = metrics.test(global_step) print(validation_accuracy_source,validation_accuracy_target) # Checkpoints -- Save either if at the right model step or if we found # a new validation accuracy. If this is better than the previous best # model, we need to make a new checkpoint so we can restore from this # step with the best accuracy. 
if (FLAGS.model_steps != 0 and i%FLAGS.model_steps == 0) \ or validation_accuracy_source is not None: checkpoint_manager.save(int(global_step-1), validation_accuracy_source, validation_accuracy_target) # Plots if FLAGS.log_plots_steps != 0 and i%FLAGS.log_plots_steps == 0: metrics.plots(global_step) # We're done -- used for hyperparameter tuning file_utils.write_finished(log_dir)
def train(model, train_loader, val_loader, num_epochs, optimizer, criterion, args, start_epoch=0, best_val_score=0, best_val_epoch=0): """ This is the main training loop. It trains the model, evaluates the model and saves the metrics and predictions. """ metrics_stats_list = [] val_per_type_metric_list = [] if args.apply_rubi: val_per_type_metric_list_rubi, val_per_type_metric_list_q = [], [] lr_decay_step = 2 lr_decay_rate = .25 if optimizer is None: # lr_decay_epochs = range(10, 25, lr_decay_step) # gradual_warmup_steps = [0.5 * args.lr, 1.0 * args.lr, 1.5 * args.lr, 2.0 * args.lr] # if args.apply_rubi: lr_decay_epochs = range(14, 100, lr_decay_step) gradual_warmup_steps = [ i * args.lr for i in torch.linspace(0.5, 2.0, 7) ] print(gradual_warmup_steps) # else: # lr_decay_epochs = range(10, 25, lr_decay_step) # gradual_warmup_steps = [0.5 * args.lr, 1.0 * args.lr, 1.5 * args.lr, 2.0 * args.lr] optimizer = getattr(torch.optim, args.optimizer)(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr) else: gradual_warmup_steps = [] lr_decay_epochs = range(14, 100, lr_decay_step) iter_num = 0 if args.test and start_epoch == num_epochs: start_epoch = num_epochs - 1 for epoch in range(start_epoch, num_epochs): if epoch < len(gradual_warmup_steps): optimizer.param_groups[0]['lr'] = gradual_warmup_steps[epoch] elif epoch in lr_decay_epochs: optimizer.param_groups[0]['lr'] *= lr_decay_rate else: optimizer.param_groups[0]['lr'] = args.lr print("lr {}".format(optimizer.param_groups[0]['lr'])) is_best = False train_metrics, val_metrics = Metrics(), Metrics() if args.apply_rubi: train_metrics_rubi, val_metrics_rubi = Metrics(), Metrics() train_metrics_q, val_metrics_q = Metrics(), Metrics() else: val_metrics_rubi, val_metrics_q = None, None if not args.test: tqdm_train_loader = tqdm(train_loader, position=0, leave=True) for i, (visual_features, boxes, question_features, answers, question_types, question_ids, question_lengths) in enumerate(tqdm_train_loader): tqdm_train_loader.set_description( f'Loss : {train_metrics.get_loss()} | Score {train_metrics.get_score()}' ) visual_features = Variable(visual_features.float()) boxes = Variable(boxes.float()) question_features = Variable(question_features) answers = Variable(answers) if torch.cuda.is_available(): visual_features = visual_features.cuda() boxes = boxes.cuda() question_features = question_features.cuda() answers = answers.cuda() pred = model(visual_features, boxes, question_features, answers, question_lengths) loss = criterion(pred, answers)['loss'] loss.backward() train_metrics.update_per_batch(model, answers, loss, pred, visual_features.shape[0]) if args.apply_rubi: train_metrics_rubi.update_per_batch( model, answers, loss, pred, visual_features.shape[0], logits_key='logits_rubi') train_metrics_q.update_per_batch(model, answers, loss, pred, visual_features.shape[0], logits_key='logits_q') nn.utils.clip_grad_norm_(model.parameters(), 50) optimizer.step() optimizer.zero_grad() iter_num += 1 #if i % 10 == 0: #train_metrics.print(epoch) # if args.apply_rubi: # print("\n\n### logits_rubi ###") # train_metrics_rubi.print(epoch) # print("\n\n### logits_q ###") # train_metrics_q.print(epoch) train_metrics.update_per_epoch() if args.apply_rubi: train_metrics_rubi.update_per_epoch() train_metrics_q.update_per_epoch() if None != val_loader: # TODO: "val_loader is not None' was not working for some reason print("Starting the test ... 
") model.eval() with torch.no_grad(): val_results = evaluate_by_logits_key(model, val_loader, epoch, criterion, args, val_metrics, logits_key='logits') if args.apply_rubi: val_results_rubi = evaluate_by_logits_key( model, val_loader, epoch, criterion, args, val_metrics_rubi, logits_key='logits_rubi') val_results_q = evaluate_by_logits_key( model, val_loader, epoch, criterion, args, val_metrics_q, logits_key='logits_q') # eval_results = evaluate(model, val_loader, epoch, criterion, args, val_metrics, val_metrics_rubi, # val_metrics_q) # TODO: FIX, use a loop to do this model.train() if val_metrics.score > best_val_score: best_val_score = val_metrics.score best_val_epoch = epoch is_best = True save_val_metrics = not args.test or not args.test_does_not_have_answers if save_val_metrics: print("Best val score {} at epoch {}".format( best_val_score, best_val_epoch)) print(f"### Val from Logits {val_metrics.score}") if args.apply_rubi: print(f"### Val from Logits_rubi {val_metrics_rubi.score}") print(f"### Val from Logits_q {val_metrics_q.score}") # print( # f"##### by logits key {val_metrics_by_logits_key.score} " # f"val_metrics_by_logits_key_rubi {val_metrics_by_logits_key_rubi.score} " # f"Logits score: {val_metrics.score} " # f"Logits_rubi score: {val_metrics_rubi.score} " # f"Logits_q score: {val_metrics_q.score} ####") val_per_type_metric_list.append( val_results['per_type_metric'].get_json()) if args.apply_rubi: val_per_type_metric_list_rubi.append( val_results_rubi['per_type_metric'].get_json()) val_per_type_metric_list_q.append( val_results_q['per_type_metric'].get_json()) metrics = accumulate_metrics(epoch, train_metrics, val_metrics, val_results['per_type_metric'], best_val_score, best_val_epoch, save_val_metrics) metrics_stats_list.append(metrics) # Add metrics + parameters of the model and optimizer metrics_n_model = save_metrics_n_model(metrics, model, optimizer, args, is_best) VqaUtils.save_stats(metrics_stats_list, val_per_type_metric_list, val_results['all_preds'], args.expt_save_dir, split=args.test_split, epoch=epoch) # if args.apply_rubi: # VqaUtils.save_stats(metrics_stats_list, val_per_type_metric_list_rubi, val_results_rubi['all_preds'], # args.expt_save_dir, # split=args.test_split, epoch=epoch, suffix='rubi') # VqaUtils.save_stats(metrics_stats_list, val_per_type_metric_list_q, val_results_q['all_preds'], # args.expt_save_dir, # split=args.test_split, epoch=epoch, suffix='q') if args.test: VqaUtils.save_preds(val_results['all_preds'], args.expt_save_dir, args.test_split, epoch) print("Test completed!") break
"""
Created on Tue Jun 16 17:57:09 2015

@author: Paco
"""
from utils import Utils
from evaluate import Evaluate
from metrics import Metrics

# Load data
u = Utils()
train_hard = u.load_matrix('data/data_train_difficile.mat')

# generate pairs
pairs_idx, pairs_label = u.generate_pairs(train_hard['label'], 1000, 0.1)

# Calculate distance
m = Metrics()
dist = m.braycurtis_dist(train_hard['X'], pairs_idx)

# Evaluate model
e = Evaluate()
e.evaluation(pairs_label, dist)

# display results
e.display_roc()
e.hard_score()

# Evaluate test dataset and save it
test_hard = u.load_matrix('data/data_test_difficile.mat')
dist_test = m.braycurtis_dist(test_hard['X'], test_hard['pairs'])
u.save_test(dist_test, filetxt='soumission_dur.txt')
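Metrics.braycurtis_dist above plausibly reduces to SciPy's Bray-Curtis distance evaluated once per candidate pair. A sketch under the assumption that pairs_idx is an iterable of (i, j) row-index pairs:

import numpy as np
from scipy.spatial.distance import braycurtis

def braycurtis_dist(X, pairs_idx):
    # braycurtis(u, v) = sum(|u - v|) / sum(|u + v|)
    return np.array([braycurtis(X[i], X[j]) for i, j in pairs_idx])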
df_binary = pd.DataFrame(data_binary)
df_nominal_full = pd.DataFrame(data_nominal_full)
df_nominal_missing = pd.DataFrame(data_nominal_missing)
df_cohens = pd.DataFrame(data_cohens)
df_fleiss = pd.DataFrame(data_fleiss)

kripp_binary = Krippendorff(df_binary)
kripp_nominal_full = Krippendorff(df_nominal_full)
kripp_nominal_missing = Krippendorff(df_nominal_missing)
kripp_test = Krippendorff(df_test)

mets = Metrics(df_test)
mets_cohens = Metrics(df_cohens)
mets_fleiss = Metrics(df_fleiss)


class TestMetrics(unittest.TestCase):
    """
    Tests for Krippendorff's alpha computations from
    disagree.metrics.Krippendorff
    """

    def test_kripps_alpha_value_with_binary_data(self):
        # Test the final value of Krippendorff's alpha against the
        # value in Krippendorff's paper, page 3.
        alpha = kripp_binary.alpha(data_type="nominal")
        alpha = float("{:.3f}".format(alpha))
        self.assertTrue(alpha == 0.095)
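For orientation, nominal Krippendorff's alpha is 1 - D_o/D_e computed over a coincidence matrix of paired ratings. A minimal sketch for an annotators-by-items array with NaN marking missing labels (a standalone illustration, not disagree's implementation):

import numpy as np
from collections import Counter

def nominal_alpha(table):
    table = np.asarray(table, dtype=float)   # annotators x items, NaN = missing
    coincidence = Counter()
    for col in table.T:                      # one unit (item) per column
        vals = [v for v in col if not np.isnan(v)]
        m = len(vals)
        if m < 2:
            continue                         # units with <2 ratings are dropped
        for i, c in enumerate(vals):
            for j, k in enumerate(vals):
                if i != j:
                    coincidence[(c, k)] += 1.0 / (m - 1)
    n_c = Counter()
    for (c, _), w in coincidence.items():
        n_c[c] += w                          # per-value totals
    n = sum(n_c.values())
    d_o = sum(w for (c, k), w in coincidence.items() if c != k)   # observed
    d_e = sum(n_c[c] * n_c[k] for c in n_c for k in n_c if c != k) / (n - 1)
    return 1.0 - d_o / d_e                   # expected disagreement in d_e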
def execute(): metrics = Metrics() document_vectors_list = [] document_id = 0 initial_centroids = [] tempo_final = 0 tempo_inicial = 0 new_closests = [] new_clusters_mpi = [] temp_dist_mpi = 1 initial_centroids_mpi = [] document_mpi = [] comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() comm.Barrier() if rank == 0: files = [] fileNames = sorted(glob2.glob("longo/*.txt")) for filename in fileNames: files.append(open(filename, "r+").read()) serviceTextMining = ServiceTextMining() terms = serviceTextMining.select_terms(files) matriz_tf = serviceTextMining.create_matriz_itf_terms(terms) matriz_df = serviceTextMining.create_matriz_idf_terms(terms, files) matriz_tf_df = serviceTextMining.create_matriz_tf_df_terms( matriz_tf, matriz_df) for line in range(len(matriz_tf_df)): document_vector = [] for column in matriz_tf_df[line]: document_vector.append(column) document_vectors_list.append((document_id, document_vector)) document_id += 1 initial_centroids_mpi = random.sample(document_vectors_list, k=3) else: document_vectors_list = [] initial_centroids_mpi = [] comm.Barrier() initial_centroids_mpi = comm.bcast(initial_centroids_mpi, root=0) document_mpi = comm.scatter(document_vectors_list, root=0) tempo_inicial = MPI.Wtime() print("RUN MPI") while temp_dist_mpi > 0.01: best_centroid = {} reduce_closest_mpi = [] closests = [] reduce_closest = [] new_clusters = [] if rank == 0: num_workers = len(document_vectors_list) - 1 closed_workers = 0 while closed_workers < num_workers: status = MPI.Status() best = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status) if best['max_value'] != 0: closests.append( (best['best_index'], (best['best_vc_doc'], 1))) else: closests.append(('erro', (best['best_vc_doc'], 1))) closed_workers += 1 new_closests = [ d for d in [d for d in closests if d[0] != 'erro'] if d[0] != [] ] for nc in new_closests: total_doc = 0 document_sum = [] closest = [d for d in reduce_closest if d[0] == nc[0]] if not closest: reduce_closest.append((nc[0], (0, nc[1][0][1]), nc[1][1])) continue for k, rc in enumerate(reduce_closest): if rc[0] == nc[0]: total_doc = closest[0][2] + nc[1][1] document_sum = [ sum(x) for x in zip(nc[1][0][1], closest[0][1][1]) ] reduce_closest[k] = (nc[0], (0, document_sum), total_doc) break for rc in reduce_closest: new_clusters.append(get_new_clusters(rc)) else: max_value = 0 var = 0 best = 0 for c in initial_centroids_mpi: temp = metrics.get_cosine_distance(c[1], document_mpi[1]) if temp != 0: if temp > max_value: max_value = temp best = c[0] best_centroid = { 'best_index': best, 'max_value': max_value, 'best_vc_doc': document_mpi } comm.send(best_centroid, dest=0) new_clusters_mpi = comm.bcast(new_clusters, root=0) comm.Barrier() if rank == 0: results = [] for index_cluster in range(len(new_clusters_mpi)): results.append( metrics.get_eculedian_distance( initial_centroids_mpi[index_cluster][1], new_clusters_mpi[index_cluster][1])) temp_dist_mpi = sum(results) for iK in range(len(new_clusters_mpi)): initial_centroids_mpi[iK] = (new_clusters_mpi[iK][0], new_clusters_mpi[iK][1]) initial_centroids_mpi = comm.bcast(initial_centroids_mpi, root=0) temp_dist_mpi = comm.bcast(temp_dist_mpi, root=0) if rank == 0: tempo_final = MPI.Wtime() print("Tempo ", tempo_final - tempo_inicial) return new_closests
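Note that the worker branch keeps the centroid with the largest get_cosine_distance value, so the quantity behaves as a similarity rather than a distance. A minimal sketch consistent with that use (an assumption about the helper, not its source):

import numpy as np

def get_cosine_distance(u, v):
    u, v = np.asarray(u, dtype=float), np.asarray(v, dtype=float)
    denom = np.linalg.norm(u) * np.linalg.norm(v)
    # cosine similarity: 0 for orthogonal or empty vectors, 1 for same direction
    return float(np.dot(u, v) / denom) if denom else 0.0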
def train_one_epoch( args, model, train_iter, optimizers, criterion, eval_iter, vocab, epoch, metrics=Metrics(), loss_aggr=None, ): labels_with_high_model_score = None with trange(len(train_iter)) as t: for iter, batch in enumerate(train_iter): model.to( args.device, args.out_device, ) model.train() batch_token_ids, label_ids, label_probs, eval_mask, _, _, orig_batch, loaded_batch = batch enc = None if (args.collect_most_popular_labels_steps is not None and args.collect_most_popular_labels_steps > 0 and iter > 0 and iter % args.collect_most_popular_labels_steps == 0): model.to(args.device, args.eval_device) with torch.no_grad(): logits_, _, _, _, _, enc = model( batch_token_ids, None, None, ) # logits: (N, T, VOCAB), y: (N, T) labels_with_high_model_score = get_topk_ids_aggregated_from_seq_prediction( logits_, topk_from_batch=args.label_size, topk_per_token=args.topk_neg_examples) batch_token_ids, label_ids, label_probs, eval_mask, _, _, _, _ = EDLDataset_collate_func( args=args, labels_with_high_model_score= labels_with_high_model_score, batch=orig_batch, return_labels=True, vocab=vocab, is_training=False, loaded_batch=loaded_batch, ) # if args.label_size is not None: logits, y, y_hat, label_probs, sparse_params, _ = model( batch_token_ids, label_ids, label_probs, enc=enc) # logits: (N, T, VOCAB), y: (N, T) logits = logits.view(-1) # (N*T, VOCAB) label_probs = label_probs.view(-1) # (N*T,) loss = criterion(logits, label_probs) loss.backward() if (iter + 1) % args.accumulate_batch_gradients == 0: for optimizer in optimizers: optimizer.step() optimizer.zero_grad() if iter == 0: logging.debug(f"Sanity check") logging.debug("x:", batch_token_ids.cpu().numpy()[0]) logging.debug( "tokens:", vocab.tokenizer.convert_ids_to_tokens( batch_token_ids.cpu().numpy()[0])) logging.debug("y:", label_probs.cpu().numpy()[0]) loss_aggr = running_mean(loss.detach().item(), loss_aggr) if iter > 0 and iter % args.checkpoint_eval_steps == 0: metrics = Net.evaluate( args=args, model=model, iterator=eval_iter, optimizers=optimizers, step=iter, epoch=epoch, save_checkpoint=iter % args.checkpoint_save_steps == 0, sampled_evaluation=False, metrics=metrics, vocab=vocab, ) t.set_postfix( loss=loss_aggr, nr_labels=len(label_ids), aggr_labels=len(labels_with_high_model_score) if labels_with_high_model_score else 0, last_eval=metrics.report( filter={"f1", "num_proposed", "epoch", "step"}), ) t.update() for optimizer in optimizers: optimizer.step() optimizer.zero_grad() return metrics
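The running_mean(loss.detach().item(), loss_aggr) call above presumably maintains an exponential moving average seeded by the first loss. A minimal sketch of such a helper (assumed behaviour, including the momentum default):

def running_mean(new_value, aggregate, momentum=0.99):
    if aggregate is None:          # first observation seeds the average
        return new_value
    return momentum * aggregate + (1.0 - momentum) * new_value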
# net = torch.load('/home/intern1/qiuzhen/Works/result_for_structure_segment/yin_U_Net_LRS_256_cv2_newdata.pkl')
# net = UNet256_kernel(4, BatchNorm=True)
# net = torch.load('/home/intern1/qiuzhen/Works/result_for_structure_segment/M_Net_LBO_256_cv2.pkl')
print('model : yin_unet_for_newdata')
# net = models.resnet34(pretrained=True)
# fc_features = net.fc.in_features
# net.fc = nn.Linear(fc_features, 2)
net.cuda()
loss = nn.CrossEntropyLoss(size_average=True).cuda()
optimizer = optim.Adam(net.parameters(), lr=0.01)  # optimization method
image_list, iterper_epo = loadImageList(path, batchsize=batchsize, flag='train')
total = len(image_list)
print('train_data_len:' + str(total))
metric = Metrics(4)
epochs = 500
max = 1.28982
for i in range(epochs):
    net.train()
    metric.reset()
    acc_list = []
    random.shuffle(image_list)
    running_loss = []
    for j in range(iterper_epo):
        if j == (iterper_epo - 1):
            iterlist = image_list[j * batchsize:]
        else:
            iterlist = image_list[j * batchsize:(j + 1) * batchsize]
        img_data, img_label = loaddata(path, iterlist)
        r_loss, correct = train(net, loss, optimizer, img_data, img_label,
def main(args: APNamespace): root_path = Path(args.root).expanduser() config_path = root_path / Path(args.config).expanduser() data_path = root_path / Path(args.data).expanduser() output_path = root_path / Path(args.output).expanduser() global checkpoint_path checkpoint_path = root_path / Path(args.checkpoint).expanduser() if not config_path.exists(): # logging.critical(f"AdaS: Config path {config_path} does not exist") print(f"AdaS: Config path {config_path} does not exist") raise ValueError if not data_path.exists(): print(f"AdaS: Data dir {data_path} does not exists, building") data_path.mkdir(exist_ok=True, parents=True) if not output_path.exists(): print(f"AdaS: Output dir {output_path} does not exists, building") output_path.mkdir(exist_ok=True, parents=True) if not checkpoint_path.exists(): if args.resume: print(f"AdaS: Cannot resume from checkpoint without specifying " + "checkpoint dir") raise ValueError if checkpoint_path.is_dir(): print(f"AdaS: Checkpoint dir {checkpoint_path} does not exists, " + "building") checkpoint_path.mkdir(exist_ok=True, parents=True) else: print(f"AdaS: Checkpoint path {checkpoint_path} doesn't exist " + "building directory to store checkpoints: .adas-checkpoint") checkpoint_path.cwd().mkdir(exist_ok=True, parents=True) with config_path.open() as f: config = yaml.load(f) device = 'cuda' if torch.cuda.is_available() else 'cpu' global best_acc best_acc = 0 # best test accuracy start_epoch = 0 # start from epoch 0 or last checkpoint epoch print("Adas: Argument Parser Options") print("-"*45) print(f" {'config':<20}: {args.config:<20}") print(f" {'data':<20}: {args.data:<20}") print(f" {'output':<20}: {args.output:<20}") print(f" {'checkpoint':<20}: {args.checkpoint:<20}") print(f" {'resume':<20}: {args.resume:<20}") print("\nAdas: Train: Config") print(f" {'Key':<20} {'Value':<20}") print("-"*45) for k, v in config.items(): print(f" {k:<20} {v:<20}") for trial in range(config['n_trials']): device # Data # logging.info("Adas: Preparing Data") train_loader, test_loader = get_data( root=data_path, dataset=config['dataset'], mini_batch_size=config['mini_batch_size']) global performance_statistics, net, metrics, adas performance_statistics = {} # logging.info("AdaS: Building Model") net = get_net(config['network'], num_classes=10 if config['dataset'] == 'CIFAR10' else 100 if config['dataset'] == 'CIFAR100' else 1000 if config['dataset'] == 'ImageNet' else 10) metrics = Metrics(list(net.parameters()), p=config['p']) if config['lr_scheduler'] == 'AdaS': adas = AdaS(parameters=list(net.parameters()), beta=config['beta'], zeta=config['zeta'], init_lr=float(config['init_lr']), min_lr=float(config['min_lr']), p=config['p']) net = net.to(device) global criterion criterion = get_loss(config['loss']) # TODO config optimizer, scheduler = get_optimizer_scheduler( init_lr=float(config['init_lr']), optim_method=config['optim_method'], lr_scheduler=config['lr_scheduler']) if device == 'cuda': net = torch.nn.DataParallel(net) cudnn.benchmark = True if args.resume: # Load checkpoint. 
print("Adas: Resuming from checkpoint...") if checkpoint_path.is_dir(): checkpoint = torch.load(str(checkpoint_path / 'ckpt.pth')) else: checkpoint = torch.load(str(checkpoint_path)) net.load_state_dict(checkpoint['net']) best_acc = checkpoint['acc'] start_epoch = checkpoint['epoch'] if adas is not None: adas.historical_io_metrics = \ checkpoint['historical_io_metrics'] # model_parameters = filter(lambda p: p.requires_grad, # net.parameters()) # params = sum([np.prod(p.size()) for p in model_parameters]) # print(params) epochs = range(start_epoch, start_epoch + config['max_epoch']) for epoch in epochs: start_time = time.time() print(f"AdaS: Epoch {epoch} Started.") train_loss, train_accuracy = epoch_iteration( train_loader, epoch, device, optimizer) end_time = time.time() if config['lr_scheduler'] == 'StepLR': scheduler.step() test_loss, test_accuracy = test_main(test_loader, epoch, device) total_time = time.time() print( f"AdaS: Epoch {epoch}/{epochs[-1]} Ended | " + "Total Time: {:.3f}s | ".format(total_time - start_time) + "Epoch Time: {:.3f}s | ".format(end_time - start_time) + "Est. Time Remaining: {:.3f}s | ".format( (total_time - start_time) * (epochs[-1] - epoch)), "Train Loss: {:.4f}% | Train Acc. {:.4f}% | ".format( train_loss, train_accuracy) + "Test Loss: {:.4f}% | Test Acc. {:.4f}%".format(test_loss, test_accuracy)) df = pd.DataFrame(data=performance_statistics) if config['lr_scheduler'] == 'AdaS': xlsx_name = \ f"config['optim_method']_AdaS_trial={trial}_" +\ f"beta={config['beta']}_initlr=config['init_lr']_" +\ f"net={config['network']}_dataset={config['dataset']}.xlsx" else: xlsx_name = \ f"config['optim_method']_config['lr_scheduler']_" +\ f"trial={trial}_initlr=config['init_lr']" +\ f"net={config['network']}_dataset={config['dataset']}.xlsx" df.to_excel(str(output_path / xlsx_name))
def validate(self): self.decoder.eval() # eval mode (no dropout or batchnorm) if self.encoder is not None: self.encoder.eval() batch_time = AverageMeter() losses = AverageMeter() top5accs = AverageMeter() start = time.time() ground_truth = list( ) # ground_truth (true captions) for calculating BLEU-4 score prediction = list() # prediction (predicted captions) # explicitly disable gradient calculation to avoid CUDA memory error # solves the issue #57 with torch.no_grad(): # Batches for i, (imgs, caps, caplens, allcaps) in enumerate(self.val_loader): # move to device, if available imgs = imgs.to(self.device) caps = caps.to(self.device) caplens = caplens.to(self.device) # forward encoder if self.encoder is not None: imgs = self.encoder(imgs) # forward decoder if self.caption_model == 'att2all': scores, caps_sorted, decode_lengths, alphas, sort_ind = self.decoder( imgs, caps, caplens) else: scores, caps_sorted, decode_lengths, sort_ind = self.decoder( imgs, caps, caplens) # since we decoded starting with <start>, the targets are all words after <start>, up to <end> targets = caps_sorted[:, 1:] # remove timesteps that we didn't decode at, or are pads # pack_padded_sequence is an easy trick to do this scores_copy = scores.clone() scores = pack_padded_sequence(scores, decode_lengths, batch_first=True)[0] targets = pack_padded_sequence(targets, decode_lengths, batch_first=True)[0] # calc loss loss = self.loss_function(scores, targets) # doubly stochastic attention regularization (in paper: show, attend and tell) if self.caption_model == 'att2all': loss += self.tau * ((1. - alphas.sum(dim=1))**2).mean() # keep track of metrics losses.update(loss.item(), sum(decode_lengths)) top5 = accuracy(scores, targets, 5) top5accs.update(top5, sum(decode_lengths)) batch_time.update(time.time() - start) start = time.time() if i % self.print_freq == 0: print( 'Validation: [{0}/{1}]\t' 'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' 'Loss {loss.val:.4f} ({loss.avg:.4f})\t' 'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})\t'. format(i, len(self.val_loader), batch_time=batch_time, loss=losses, top5=top5accs)) # store ground truth captions and predicted captions of each image # for n images, each of them has one prediction and multiple ground truths (a, b, c...): # prediction = [ [hyp1], [hyp2], ..., [hypn] ] # ground_truth = [ [ [ref1a], [ref1b], [ref1c] ], ..., [ [refna], [refnb] ] ] # ground truth allcaps = allcaps[ sort_ind] # because images were sorted in the decoder for j in range(allcaps.shape[0]): img_caps = allcaps[j].tolist() img_captions = list( map( lambda c: [ w for w in c if w not in { self.word_map['<start>'], self.word_map[ '<pad>'] } ], img_caps)) # remove <start> and pads ground_truth.append(img_captions) # prediction _, preds = torch.max(scores_copy, dim=2) preds = preds.tolist() temp_preds = list() for j, p in enumerate(preds): temp_preds.append( preds[j][:decode_lengths[j]]) # remove pads preds = temp_preds prediction.extend(preds) assert len(ground_truth) == len(prediction) # calc BLEU-4 and CIDEr score metrics = Metrics(ground_truth, prediction, self.rev_word_map) bleu4 = metrics.belu()[3] # BLEU-4 cider = metrics.cider() # CIDEr print( '\n * LOSS - {loss.avg:.3f}, TOP-5 ACCURACY - {top5.avg:.3f}, BLEU-4 - {bleu}, CIDEr - {cider}\n' .format(loss=losses, top5=top5accs, bleu=bleu4, cider=cider)) return bleu4
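For reference, the metrics.belu()[3] score above corresponds to corpus-level BLEU-4 over these nested lists. NLTK computes the same quantity; a hedged equivalent, not necessarily what Metrics calls internally:

from nltk.translate.bleu_score import corpus_bleu

# ground_truth: one list of reference token sequences per image
# prediction:   one hypothesis token sequence per image
bleu4 = corpus_bleu(ground_truth, prediction,
                    weights=(0.25, 0.25, 0.25, 0.25))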
def eval(**args): """ Evaluate selected model Args: seed (Int): Integer indicating set seed for random state save_dir (String): Top level directory to generate results folder model (String): Name of selected model dataset (String): Name of selected dataset exp (String): Name of experiment load_type (String): Keyword indicator to evaluate the testing or validation set pretrained (Int/String): Int/String indicating loading of random, pretrained or saved weights Return: None """ print("\n############################################################################\n") print("Experimental Setup: ", args) print("\n############################################################################\n") d = datetime.datetime.today() date = d.strftime('%Y%m%d-%H%M%S') result_dir = os.path.join(args['save_dir'], args['model'], '_'.join((args['dataset'],args['exp'],date))) log_dir = os.path.join(result_dir, 'logs') save_dir = os.path.join(result_dir, 'checkpoints') if not args['debug']: os.makedirs(result_dir, exist_ok=True) os.makedirs(log_dir, exist_ok=True) os.makedirs(save_dir, exist_ok=True) # Save copy of config file with open(os.path.join(result_dir, 'config.yaml'),'w') as outfile: yaml.dump(args, outfile, default_flow_style=False) # Tensorboard Element writer = SummaryWriter(log_dir) # Check if GPU is available (CUDA) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Load Network model = create_model_object(**args).to(device) # Load Data loader = data_loader(**args, model_obj=model) if args['load_type'] == 'train_val': eval_loader = loader['valid'] elif args['load_type'] == 'train': eval_loader = loader['train'] elif args['load_type'] == 'test': eval_loader = loader['test'] else: sys.exit('load_type must be valid or test for eval, exiting') # END IF if isinstance(args['pretrained'], str): ckpt = load_checkpoint(args['pretrained']) model.load_state_dict(ckpt) # Training Setup params = [p for p in model.parameters() if p.requires_grad] acc_metric = Metrics(**args, result_dir=result_dir, ndata=len(eval_loader.dataset)) acc = 0.0 # Setup Model To Evaluate model.eval() with torch.no_grad(): for step, data in enumerate(eval_loader): x_input = data['data'] annotations = data['annots'] if isinstance(x_input, torch.Tensor): outputs = model(x_input.to(device)) else: for i, item in enumerate(x_input): if isinstance(item, torch.Tensor): x_input[i] = item.to(device) outputs = model(*x_input) # END IF acc = acc_metric.get_accuracy(outputs, annotations) if step % 100 == 0: print('Step: {}/{} | {} acc: {:.4f}'.format(step, len(eval_loader), args['load_type'], acc)) print('Accuracy of the network on the {} set: {:.3f} %\n'.format(args['load_type'], 100.*acc)) if not args['debug']: writer.add_scalar(args['dataset']+'/'+args['model']+'/'+args['load_type']+'_accuracy', 100.*acc) # Close Tensorboard Element writer.close()
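For plain classification, the running accuracy returned by acc_metric.get_accuracy reduces to top-1 agreement. A minimal sketch of that reduction for (N, C) logits and (N,) integer labels (an illustration only; the snippet's Metrics is built from **args and may compute task-specific accuracy):

import torch

def top1_accuracy(outputs, labels):
    preds = outputs.argmax(dim=1)              # predicted class per sample
    return (preds == labels).float().mean().item()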
def image_builder(buildspec): FORMATTER = OutputFormatter(constants.PADDING) BUILDSPEC = Buildspec() BUILDSPEC.load(buildspec) IMAGES = [] for image in BUILDSPEC["images"].items(): ARTIFACTS = deepcopy(BUILDSPEC["context"]) image_name = image[0] image_config = image[1] extra_build_args = {} labels = {} if image_config.get("version") is not None: if BUILDSPEC["version"] != image_config.get("version"): continue if image_config.get("context") is not None: ARTIFACTS.update(image_config["context"]) build_context = os.getenv("BUILD_CONTEXT") image_tag = (tag_image_with_pr_number(image_config["tag"]) if build_context == "PR" else image_config["tag"]) if not build_config.DISABLE_DATETIME_TAG or build_context != "PR": image_tag = tag_image_with_datetime(image_tag) image_repo_uri = (image_config["repository"] if build_context == "PR" else modify_repository_name_for_context( str(image_config["repository"]), build_context)) base_image_uri = None if image_config.get("base_image_name") is not None: base_image_object = _find_image_object( IMAGES, image_config["base_image_name"]) base_image_uri = base_image_object.ecr_url if image_config.get("download_artifacts") is not None: for artifact_name, artifact in image_config.get( "download_artifacts").items(): type = artifact["type"] uri = artifact["URI"] var = artifact["VAR_IN_DOCKERFILE"] try: file_name = utils.download_file(uri, type).strip() except ValueError: FORMATTER.print( f"Artifact download failed: {uri} of type {type}.") ARTIFACTS.update({ f"{artifact_name}": { "source": f"{os.path.join(os.sep, os.path.abspath(os.getcwd()), file_name)}", "target": file_name } }) extra_build_args[var] = file_name labels[var] = file_name labels[f"{var}_URI"] = uri ARTIFACTS.update({ "dockerfile": { "source": image_config["docker_file"], "target": "Dockerfile", } }) context = Context(ARTIFACTS, f"build/{image_name}.tar.gz", image_config["root"]) """ Override parameters from parent in child. """ info = { "account_id": str(BUILDSPEC["account_id"]), "region": str(BUILDSPEC["region"]), "framework": str(BUILDSPEC["framework"]), "version": str(BUILDSPEC["version"]), "root": str(image_config["root"]), "name": str(image_name), "device_type": str(image_config["device_type"]), "python_version": str(image_config["python_version"]), "image_type": str(image_config["image_type"]), "image_size_baseline": int(image_config["image_size_baseline"]), "base_image_uri": base_image_uri, "labels": labels, "extra_build_args": extra_build_args } image_object = DockerImage( info=info, dockerfile=image_config["docker_file"], repository=image_repo_uri, tag=image_tag, to_build=image_config["build"], context=context, ) IMAGES.append(image_object) FORMATTER.banner("DLC") FORMATTER.title("Status") THREADS = {} # In the context of the ThreadPoolExecutor each instance of image.build submitted # to it is executed concurrently in a separate thread. 
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Standard images must be built before example images
        # Example images will use standard images as base
        standard_images = [
            image for image in IMAGES if "example" not in image.name.lower()
        ]
        example_images = [
            image for image in IMAGES if "example" in image.name.lower()
        ]

        for image in standard_images:
            THREADS[image.name] = executor.submit(image.build)
        # the FORMATTER.progress(THREADS) function call also waits until all threads have completed
        FORMATTER.progress(THREADS)

        for image in example_images:
            THREADS[image.name] = executor.submit(image.build)
        # the FORMATTER.progress(THREADS) function call also waits until all threads have completed
        FORMATTER.progress(THREADS)

    FORMATTER.title("Build Logs")
    if not os.path.isdir("logs"):
        os.makedirs("logs")
    for image in IMAGES:
        FORMATTER.title(image.name)
        FORMATTER.table(image.info.items())
        FORMATTER.separator()
        FORMATTER.print_lines(image.log)
        with open(f"logs/{image.name}", "w") as fp:
            fp.write("\n".join(image.log))  # "\n", not the typo "/n"
        image.summary["log"] = f"logs/{image.name}"

    FORMATTER.title("Summary")
    for image in IMAGES:
        FORMATTER.title(image.name)
        FORMATTER.table(image.summary.items())

    FORMATTER.title("Errors")
    ANY_FAIL = False
    for image in IMAGES:
        if image.build_status == constants.FAIL:
            FORMATTER.title(image.name)
            FORMATTER.print_lines(image.log[-10:])
            ANY_FAIL = True
    if ANY_FAIL:
        raise Exception("Build failed")
    else:
        FORMATTER.print("No errors")

    FORMATTER.title("Uploading Metrics")
    metrics = Metrics(
        context=constants.BUILD_CONTEXT,
        region=BUILDSPEC["region"],
        namespace=constants.METRICS_NAMESPACE,
    )
    for image in IMAGES:
        try:
            metrics.push_image_metrics(image)
        except Exception as e:
            if ANY_FAIL:
                raise Exception(f"Build failed. {e}")
            else:
                raise Exception(f"Build passed. {e}")

    FORMATTER.separator()

    # Set environment variables to be consumed by test jobs
    test_trigger_job = utils.get_codebuild_project_name()
    utils.set_test_env(
        IMAGES,
        BUILD_CONTEXT=os.getenv("BUILD_CONTEXT"),
        TEST_TRIGGER=test_trigger_job,
    )
def train(args): timestamp=datetime.now().strftime('%Y%m%d%H%M') # LOG # logger = logging.getLogger(__name__) logging.basicConfig(level=logging.DEBUG, format="%(message)s")#,format="%(asctime)s: %(name)s: %(levelname)s: %(message)s") tb_writer=None if args.visual: # make output directory if it doesn't already exist os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/models', exist_ok=True) os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/temp_results', exist_ok=True) fh = logging.FileHandler(f"./output/{args.model}/{args.expname}/{timestamp}/logs.txt") # create file handler which logs even debug messages logger.addHandler(fh)# add the handlers to the logger tb_writer = SummaryWriter(f"./output/{args.model}/{args.expname}/{timestamp}/logs/") # save arguments json.dump(vars(args), open(f'./output/{args.model}/{args.expname}/{timestamp}/args.json', 'w')) # Device # if args.gpu_id<0: device = torch.device("cuda") else: device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() and args.gpu_id>-1 else "cpu") print(device) n_gpu = torch.cuda.device_count() if args.gpu_id<0 else 1 print(f"num of gpus:{n_gpu}") # Set the random seed manually for reproducibility. random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) def save_model(model, epoch, timestamp): """Save model parameters to checkpoint""" os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/models', exist_ok=True) ckpt_path=f'./output/{args.model}/{args.expname}/{timestamp}/models/model_epo{epoch}.pkl' print(f'Saving model parameters to {ckpt_path}') torch.save(model.state_dict(), ckpt_path) def load_model(model, epoch, timestamp): """Load parameters from checkpoint""" ckpt_path=f'./output/{args.model}/{args.expname}/{timestamp}/models/model_epo{epoch}.pkl' print(f'Loading model parameters from {ckpt_path}') model.load_state_dict(torch.load(checkpoint)) config = getattr(configs, 'config_'+args.model)() ############################################################################### # Load dataset ############################################################################### train_set=APIDataset(args.data_path+'train.desc.h5', args.data_path+'train.apiseq.h5', config['max_sent_len']) valid_set=APIDataset(args.data_path+'test.desc.h5', args.data_path+'test.apiseq.h5', config['max_sent_len']) train_loader=torch.utils.data.DataLoader(dataset=train_set, batch_size=config['batch_size'], shuffle=True, num_workers=1) valid_loader=torch.utils.data.DataLoader(dataset=valid_set, batch_size=config['batch_size'], shuffle=True, num_workers=1) print("Loaded dataset!") ############################################################################### # Define the models ############################################################################### model = getattr(models, args.model)(config) if args.reload_from>=0: load_model(model, args.reload_from) model=model.to(device) ############################################################################### # Prepare the Optimizer ############################################################################### no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=config['lr'], eps=config['adam_epsilon']) 
scheduler = get_cosine_schedule_with_warmup( optimizer, num_warmup_steps=config['warmup_steps'], num_training_steps=len(train_loader)*config['epochs']) # do not foget to modify the number when dataset is changed ############################################################################### # Training ############################################################################### logger.info("Training...") itr_global=1 start_epoch=1 if args.reload_from==-1 else args.reload_from+1 for epoch in range(start_epoch, config['epochs']+1): epoch_start_time = time.time() itr_start_time = time.time() # shuffle (re-define) dataset between epochs for batch in train_loader:# loop through all batches in training dataset model.train() batch_gpu = [tensor.to(device) for tensor in batch] loss = model(*batch_gpu) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), config['clip']) optimizer.step() scheduler.step() model.zero_grad() if itr_global % args.log_every == 0: elapsed = time.time() - itr_start_time log = '%s-%s|@gpu%d epo:[%d/%d] iter:%d step_time:%ds loss:%f'\ %(args.model, args.expname, args.gpu_id, epoch, config['epochs'],itr_global, elapsed, loss) if args.visual: tb_writer.add_scalar('loss', loss, itr_global) logger.info(log) itr_start_time = time.time() if itr_global % args.valid_every == 0: model.eval() loss_records={} for batch in valid_loader: batch_gpu = [tensor.to(device) for tensor in batch] with torch.no_grad(): valid_loss = model.valid(*batch_gpu) for loss_name, loss_value in valid_loss.items(): v=loss_records.get(loss_name, []) v.append(loss_value) loss_records[loss_name]=v log = 'Validation ' for loss_name, loss_values in loss_records.items(): log = log + loss_name + ':%.4f '%(np.mean(loss_values)) if args.visual: tb_writer.add_scalar(loss_name, np.mean(loss_values), itr_global) logger.info(log) itr_global+=1 if itr_global % args.eval_every == 0: # evaluate the model in the develop set model.eval() save_model(model, itr_global, timestamp) # save model after each epoch valid_loader=torch.utils.data.DataLoader(dataset=valid_set, batch_size=1, shuffle=False, num_workers=1) vocab_api = load_dict(args.data_path+'vocab.apiseq.json') vocab_desc = load_dict(args.data_path+'vocab.desc.json') metrics=Metrics() os.makedirs(f'./output/{args.model}/{args.expname}/{timestamp}/temp_results', exist_ok=True) f_eval = open(f"./output/{args.model}/{args.expname}/{timestamp}/temp_results/iter{itr_global}.txt", "w") repeat = 1 decode_mode = 'sample' recall_bleu, prec_bleu = evaluate(model, metrics, valid_loader, vocab_desc, vocab_api, repeat, decode_mode, f_eval) if args.visual: tb_writer.add_scalar('recall_bleu', recall_bleu, itr_global) tb_writer.add_scalar('prec_bleu', prec_bleu, itr_global) # end of epoch ---------------------------- model.adjust_lr()
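get_cosine_schedule_with_warmup appears to have the same name and signature as the HuggingFace transformers helper: linear warmup from 0 to the base LR over num_warmup_steps, then cosine decay towards 0 by num_training_steps. A minimal usage sketch, assuming the transformers implementation:

import torch
from transformers import get_cosine_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=1e-3)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=1000)

for step in range(1000):
    optimizer.step()
    scheduler.step()   # called once per batch, as in the loop above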
def test(config: Dict, vgpmil_model: vgpmil = None, rf_model: RandomForestClassifier = None, svm_model: SVC = None): print('Testing..') test_df = pd.read_csv(config['path_test_df']) print('Loaded test dataframe. Number of instances: ' + str(len(test_df))) features, bag_labels_per_instance, bag_names_per_instance, instance_labels = load_dataframe( test_df, config) bag_features, bag_labels, bag_names = get_bag_level_information( features, bag_labels_per_instance, bag_names_per_instance) metrics_calculator = Metrics(instance_labels, bag_labels, bag_names, bag_names_per_instance) if vgpmil_model is not None: print('Test VGPMIL') start = timeit.timeit() instance_predictions, bag_predictions = vgpmil_model.predict( features, bag_names_per_instance, bag_names) end = timeit.timeit() print('Average runtime per bag: ', str((end - start) / bag_predictions.size)) metrics_calculator.calc_metrics(instance_predictions, bag_predictions, 'vgpmil') if rf_model is not None: print('Test Random Forest') bag_predictions = rf_model.predict(bag_features) metrics_calculator.calc_metrics(np.array([]), bag_predictions, 'random_forest') if svm_model is not None: print('Test SVM') bag_predictions = svm_model.predict(bag_features) metrics_calculator.calc_metrics(np.array([]), bag_predictions, 'svm') if config['use_models']['cnn'] == True: cnn_predictions, bag_cnn_predictions, bag_cnn_probability = load_cnn_predictions( test_df, config) metrics_calculator.calc_metrics(cnn_predictions, bag_cnn_probability, 'cnn') metrics_calculator.write_to_file(config)
def main():
    global args
    args = parse_args()
    args.input_dim, args.mem_dim = 300, 150
    args.hidden_dim, args.num_classes = 50, 5
    args.cuda = args.cuda and torch.cuda.is_available()
    if args.sparse and args.wd != 0:
        print('Sparsity and weight decay are incompatible, pick one!')
        exit()
    print(args)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True
    if not os.path.exists(args.save):
        os.makedirs(args.save)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    sick_vocab_file = os.path.join(args.data, 'sick.vocab')
    if not os.path.isfile(sick_vocab_file):
        token_files_a = [os.path.join(split, 'a.toks')
                         for split in [train_dir, dev_dir, test_dir]]
        token_files_b = [os.path.join(split, 'b.toks')
                         for split in [train_dir, dev_dir, test_dir]]
        token_files = token_files_a + token_files_b
        build_vocab(token_files, sick_vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=sick_vocab_file,
                  data=[Constants.PAD_WORD, Constants.UNK_WORD,
                        Constants.BOS_WORD, Constants.EOS_WORD])
    print('==> SICK vocabulary size : %d ' % vocab.size())

    # load SICK dataset splits
    train_file = os.path.join(args.data, 'sick_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SICKDataset(train_dir, vocab, args.num_classes)
        torch.save(train_dataset, train_file)
    print('==> Size of train data : %d ' % len(train_dataset))

    dev_file = os.path.join(args.data, 'sick_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SICKDataset(dev_dir, vocab, args.num_classes)
        torch.save(dev_dataset, dev_file)
    print('==> Size of dev data : %d ' % len(dev_dataset))

    test_file = os.path.join(args.data, 'sick_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SICKDataset(test_dir, vocab, args.num_classes)
        torch.save(test_dataset, test_file)
    print('==> Size of test data : %d ' % len(test_dataset))

    # initialize model, criterion/loss_function, optimizer
    model = SimilarityTreeLSTM(args.cuda, vocab.size(), args.input_dim, args.mem_dim,
                               args.hidden_dim, args.num_classes, args.sparse)
    criterion = nn.KLDivLoss()
    if args.cuda:
        model.cuda()
        criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=args.lr, weight_decay=args.wd)
    elif args.optim == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.wd)
    else:
        raise ValueError('Unknown optimizer: ' + args.optim)
    metrics = Metrics(args.num_classes)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sick_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(), glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([Constants.PAD_WORD, Constants.UNK_WORD,
                                    Constants.BOS_WORD, Constants.EOS_WORD]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            # compare against None explicitly: index 0 is a valid GLOVE index but falsy
            if glove_vocab.getIndex(word) is not None:
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(word)]
        torch.save(emb, emb_file)
    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = Trainer(args, model, criterion, optimizer)

    best = -float('inf')
    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        train_loss, train_pred = trainer.test(train_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)

        train_pearson = metrics.pearson(train_pred, train_dataset.labels)
        train_mse = metrics.mse(train_pred, train_dataset.labels)
        print('==> Train Loss: {}\tPearson: {}\tMSE: {}'.format(
            train_loss, train_pearson, train_mse))
        dev_pearson = metrics.pearson(dev_pred, dev_dataset.labels)
        dev_mse = metrics.mse(dev_pred, dev_dataset.labels)
        print('==> Dev Loss: {}\tPearson: {}\tMSE: {}'.format(
            dev_loss, dev_pearson, dev_mse))
        test_pearson = metrics.pearson(test_pred, test_dataset.labels)
        test_mse = metrics.mse(test_pred, test_dataset.labels)
        print('==> Test Loss: {}\tPearson: {}\tMSE: {}'.format(
            test_loss, test_pearson, test_mse))

        if best < test_pearson:
            best = test_pearson
            checkpoint = {
                'model': trainer.model.state_dict(),
                'optim': trainer.optimizer,
                'pearson': test_pearson, 'mse': test_mse,
                'args': args, 'epoch': epoch
            }
            print('==> New optimum found, checkpointing everything now...')
            # the original path appended '.pt' to an '.pth' name, yielding a
            # double extension ('expname.pth.pt'); use a single extension
            torch.save(checkpoint, os.path.join(args.save, args.expname + '.pt'))
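# ---------------------------------------------------------------------------
# Metrics(args.num_classes) is imported from the project's utilities and is not
# defined in this file. Below is a minimal sketch of the two methods the loop
# above relies on (pearson, mse), assuming predictions and labels arrive as
# 1-D float tensors; the real class may differ.
import torch

class MetricsSketch(object):
    def __init__(self, num_classes):
        self.num_classes = num_classes

    def pearson(self, predictions, labels):
        # Pearson correlation: centered dot product normalized by the norms
        x = predictions - predictions.mean()
        y = labels - labels.mean()
        return torch.sum(x * y) / (torch.norm(x) * torch.norm(y))

    def mse(self, predictions, labels):
        # mean squared error between predicted and gold similarity scores
        return torch.mean((predictions - labels) ** 2)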
def main(): """ Runs a single entity resolution on data (real or synthetic) using a match function (logistic regression, decision tree, or random forest) """ data_type = 'real' decision_threshold = 0.7 train_class_balance = 0.5 max_block_size = 1000 cores = 2 if data_type == 'synthetic': database_train = SyntheticDatabase(100, 10, 10) corruption = 0.1 corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000, database_train.database.feature_descriptor.number]) database_train.corrupt(corruption_array) database_validation = SyntheticDatabase(100, 10, 10) corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000, database_validation.database.feature_descriptor.number]) database_validation.corrupt(corruption_array) database_test = SyntheticDatabase(10, 10, 10) corruption_array = corruption*np.random.normal(loc=0.0, scale=1.0, size=[1000, database_test.database.feature_descriptor.number]) database_test.corrupt(corruption_array) labels_train = database_train.labels labels_validation = database_validation.labels labels_test = database_test.labels database_train = database_train.database database_validation = database_validation.database database_test = database_test.database single_block = True elif data_type == 'real': # Uncomment to use all features (annotations and LM) #database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv') #database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv') #database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_all.csv') # Uncomment to only use annotation features #database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv') #database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv') #database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_annotations.csv') # Uncomment to only use LM features database_train = Database('../data/trafficking/cluster_subsample0_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv') database_validation = Database('../data/trafficking/cluster_subsample1_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv') database_test = Database('../data/trafficking/cluster_subsample2_10000.csv', header_path='../data/trafficking/cluster_subsample_header_LM.csv') labels_train = fast_strong_cluster(database_train) labels_validation = fast_strong_cluster(database_validation) labels_test = fast_strong_cluster(database_test) single_block = False else: Exception('Invalid experiment type'+data_type) entities = deepcopy(database_test) blocking_scheme = BlockingScheme(entities, max_block_size, single_block=single_block) train_seed = generate_pair_seed(database_train, labels_train, train_class_balance, require_direct_match=True, max_minor_class=5000) validation_seed = generate_pair_seed(database_validation, labels_validation, 0.5, require_direct_match=True, max_minor_class=5000) # forest_all = ForestMatchFunction(database_all_train, labels_train, train_seed, decision_threshold) # forest_all.test(database_all_validation, labels_validation, 
validation_seed) # tree_all = TreeMatchFunction(database_all_train, labels_train, train_seed, decision_threshold) # tree_all.test(database_all_validation, labels_validation, validation_seed) # logistic_all = LogisticMatchFunction(database_all_train, labels_train, train_seed, decision_threshold) # logistic_all.test(database_all_validation, labels_validation, validation_seed) forest_annotations = ForestMatchFunction(database_train, labels_train, train_seed, decision_threshold) roc = forest_annotations.test(database_validation, labels_validation, validation_seed) #roc.make_plot() #plt.show() # tree_annotations = TreeMatchFunction(database_annotations_train, labels_train, train_seed, decision_threshold) # tree_annotations.test(database_annotations_validation, labels_validation, validation_seed) # logistic_annotations = LogisticMatchFunction(database_annotations_train, labels_train, train_seed, decision_threshold) # logistic_annotations.test(database_annotations_validation, labels_validation, validation_seed) # forest_LM = ForestMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold) # forest_LM.test(database_LM_validation, labels_validation, validation_seed) # tree_LM = TreeMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold) # tree_LM.test(database_LM_validation, labels_validation, validation_seed) # logistic_LM = LogisticMatchFunction(database_LM_train, labels_train, train_seed, decision_threshold) # logistic_LM.test(database_LM_validation, labels_validation, validation_seed) # forest_all.roc.write_rates('match_forest_all.csv') # tree_all.roc.write_rates('match_tree_all.csv') # logistic_all.roc.write_rates('match_logistic_all.csv') # # forest_annotations.roc.write_rates('match_forest_annotations.csv') # tree_annotations.roc.write_rates('match_tree_annotations.csv') # logistic_annotations.roc.write_rates('match_logistic_annotations.csv') # # forest_LM.roc.write_rates('match_forest_LM.csv') # tree_LM.roc.write_rates('match_tree_LM.csv') # logistic_LM.roc.write_rates('match_logistic_LM.csv') # ax = forest_all.roc.make_plot() # _ = tree_all.roc.make_plot(ax=ax) # _ = logistic_all.roc.make_plot(ax=ax) # plt.show() #forest_annotations.roc.make_plot() #plt.show() #entities.merge(strong_labels) #er = EntityResolution() #weak_labels = er.run(entities, match_function, blocking_scheme, cores=cores) weak_labels = weak_connected_components(database_test, forest_annotations, blocking_scheme) entities.merge(weak_labels) #strong_labels = fast_strong_cluster(entities) #entities.merge(strong_labels) # out = open('ER.csv', 'w') # out.write('phone,cluster_id\n') # for cluster_counter, (entity_id, entity) in enumerate(entities.records.iteritems()): # phone_index = 21 # for phone in entity.features[phone_index]: # out.write(str(phone)+','+str(cluster_counter)+'\n') # out.close() print 'Metrics using strong features as surrogate label. Entity resolution run using weak and strong features' metrics = Metrics(labels_test, weak_labels) # estimated_test_class_balance = count_pairwise_class_balance(labels_test) # new_metrics = NewMetrics(database_all_test, weak_labels, forest_all, estimated_test_class_balance) metrics.display()
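# ---------------------------------------------------------------------------
# Metrics(labels_test, weak_labels) compares the strong-feature surrogate
# clustering against the weak-feature resolution. The real class is defined
# elsewhere; below is a minimal sketch of the pairwise precision/recall such a
# comparison typically computes, assuming both arguments map record ids to
# cluster ids. This is an illustrative assumption, not the project's API.
from itertools import combinations

def pairwise_precision_recall(true_labels, pred_labels):
    """Precision/recall over all record pairs (sketch; O(n^2) in records)."""
    ids = list(true_labels.keys())
    tp = fp = fn = 0
    for a, b in combinations(ids, 2):
        same_true = true_labels[a] == true_labels[b]
        same_pred = pred_labels[a] == pred_labels[b]
        if same_pred and same_true:
            tp += 1
        elif same_pred and not same_true:
            fp += 1
        elif same_true and not same_pred:
            fn += 1
    precision = tp / float(tp + fp) if tp + fp else 0.0
    recall = tp / float(tp + fn) if tp + fn else 0.0
    return precision, recall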